From ecff1dbf9cbad1bdf424d27d0e2cbcf8c3b0b10a Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Sun, 31 May 2026 03:47:08 +0000
Subject: [PATCH 1/4] cc: M6+M7 chordal coloring + Belady splitting (x86_64)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace x86_64's linear-scan allocator with MCS-ordered greedy
coloring on the SSA interference graph (Hack/Braun 2009) plus
Belady next-use eviction when greedy fails. Aarch64 still on M5
linear-scan; port follows.

Shared infrastructure in cc/arch/regalloc.rs:

- InterferenceGraph using BTreeMap/BTreeSet for deterministic
  iteration (the design rule established in the determinism work)
- build_interference_graph with per-instruction backward walk,
  def-before-remove + def-vs-live edges; an add_def_src_edges
  parameter so GP gets the cmov pattern edge and XMM doesn't
  (forcing def/src to differ over-constrains FP and the linker
  loses information)
- mcs_ordering + greedy_color + ColoringResult
- compute_use_positions + next_use_distance for Belady

x86_64 backend (cc/arch/x86_64/regalloc.rs):

- run_chordal_color replaces run_linear_scan
- color_gp_bank: def_src_edges=true, hard-forbid caller-saved
  for cross-call ranges, soft-prefer callee-saved in loops
- color_xmm_bank: def_src_edges=false, spill-on-fail (Belady
  GP-only for now — XMM splitting is a follow-up)
- expire_old_intervals deleted

Codegen fixes — chordal coloring exposed every codegen helper
that used Xmm0 as an implicit scratch. Linear scan happened to
keep Xmm0 cold; chordal coloring assigns it freely, so the
clobbers became real miscompilations. Symptom: _Py_dg_strtod
infinite loop on subnormal floats (e.g. float('9e-324')) — the
correction loop's intermediate doubles got silently destroyed.

Switched implicit-scratch sites from Xmm0 (allocatable) to
Xmm15/Xmm14 (reserved scratch, same pattern as R10/R11 on the
GP side):

- emit_fp_binop, emit_fp_neg, emit_fp_compare, emit_int_to_float,
  emit_float_to_int, emit_float_to_float, emit_fp_const_load
  (cc/arch/x86_64/float.rs)
- emit_select_fp (cc/arch/x86_64/codegen.rs — earlier fix)
- regular FP copy fallback in codegen.rs

This reserves 2 of 16 XMM regs from the allocator's palette. A
follow-up milestone (per-instruction scratch-clobber constraints,
shared with inline asm) can free them back.

Verified:
- cc test suite 914+948+204 pass
- clippy clean
- CPython 3.12.9 -O2 449/449 (40,817 individual tests, 30m23s)
- float('9e-324') returns 1e-323 correctly

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cc/arch/regalloc.rs        | 322 ++++++++++++++++++++++++++++
 cc/arch/x86_64/codegen.rs  |  34 ++-
 cc/arch/x86_64/float.rs    |  96 +++++----
 cc/arch/x86_64/regalloc.rs | 419 +++++++++++++++++++++++++++----------
 4 files changed, 716 insertions(+), 155 deletions(-)

diff --git a/cc/arch/regalloc.rs b/cc/arch/regalloc.rs
index a64eb9b4..61315b3e 100644
--- a/cc/arch/regalloc.rs
+++ b/cc/arch/regalloc.rs
@@ -796,6 +796,328 @@ pub fn spill_gp_args_across_calls<L, R, IsArg, ExtractReg, MkStackLoc, RecordSpi
     }
 }
 
+// ============================================================================
+// Chordal coloring on the SSA interference graph (M6)
+// ============================================================================
+//
+// SSA interference graphs are chordal (Pereira/Palsberg 2005, Hack
+// 2005): a perfect elimination ordering exists and greedy coloring on
+// that ordering achieves the chromatic number. The two-step recipe:
+//   1. Maximum Cardinality Search (MCS) on the interference graph
+//      yields a perfect elimination ordering.
+//   2. Greedy color in MCS order: pick the lowest-index register not
+//      used by any already-colored neighbor.
+//
+// pcc's IR is not strictly SSA at the allocator boundary —
+// `lower::eliminate_phi_nodes` introduces multi-def Copy patterns. The
+// interference graph is still well-defined: vertex u interferes with
+// vertex v iff u and v are simultaneously live somewhere in the
+// function. We construct it precisely with a per-instruction backward
+// walk (the textbook "for each def, add edges to currently-live"
+// pattern) rather than the coarser block-level all-pairs that would
+// over-constrain coloring with spurious edges.
+//
+// Constraint pre-coloring (e.g. x86_64 `idiv` clobbers RAX/RDX) is
+// expressed via per-pseudo `forbidden` colors passed to greedy_color.
+// ABI-pinned arg pseudos arrive via `pre_colored`. Physical registers
+// don't need to be modeled as graph vertices.
+//
+// Use BTreeMap / BTreeSet throughout so iteration is deterministic —
+// the project's determinism invariant (cc/CLAUDE.md) covers this.
+
+/// Per-pseudo set of interfering pseudos for one register bank.
+#[derive(Debug, Default)]
+pub struct InterferenceGraph {
+    /// Adjacency map. `edges[p]` is the set of pseudos that interfere
+    /// with `p` and belong to the same register bank.
+    pub edges: BTreeMap<PseudoId, std::collections::BTreeSet<PseudoId>>,
+    /// All vertices, including isolated ones (no edges).
+    pub vertices: std::collections::BTreeSet<PseudoId>,
+}
+
+impl InterferenceGraph {
+    pub fn new() -> Self {
+        Self {
+            edges: BTreeMap::new(),
+            vertices: std::collections::BTreeSet::new(),
+        }
+    }
+
+    pub fn add_vertex(&mut self, p: PseudoId) {
+        self.vertices.insert(p);
+        self.edges.entry(p).or_default();
+    }
+
+    pub fn add_edge(&mut self, a: PseudoId, b: PseudoId) {
+        if a == b {
+            return;
+        }
+        self.add_vertex(a);
+        self.add_vertex(b);
+        self.edges.entry(a).or_default().insert(b);
+        self.edges.entry(b).or_default().insert(a);
+    }
+
+    pub fn neighbors(&self, p: PseudoId) -> impl Iterator<Item = PseudoId> + '_ {
+        self.edges
+            .get(&p)
+            .into_iter()
+            .flat_map(|s| s.iter().copied())
+    }
+}
+
+/// Build the interference graph for pseudos in `candidates`.
+///
+/// Walks each basic block backward maintaining the current live set.
+/// For each instruction, the new defs interfere with everything that
+/// is live just after the instruction (i.e. before the backward step
+/// removes them); the def-vs-live edges must be added BEFORE removing
+/// the def from `live`, or edges between same-block adjacent def/use
+/// pairs are silently dropped.
+///
+/// Additionally, each def interferes with the instruction's own
+/// sources. Some codegen lowerings (notably x86_64 `cmov` for ternary
+/// `(cond) ? a : b`) materialize the target into a register and then
+/// read source values, which would clobber a source that shares a
+/// register with the target. The def-vs-src edge enforces disjoint
+/// registers in that case at the cost of an occasional extra move
+/// when the codegen would have allowed sharing. Net effect on .text
+/// size is small; without these edges, register allocation silently
+/// corrupts cmov/conditional-store patterns.
+pub fn build_interference_graph(
+    candidates: &std::collections::BTreeSet<PseudoId>,
+    func: &Function,
+    live_out: &[HashSet<PseudoId>],
+    add_def_src_edges: bool,
+) -> InterferenceGraph {
+    let mut graph = InterferenceGraph::new();
+    for &c in candidates {
+        graph.add_vertex(c);
+    }
+    for (idx, block) in func.blocks.iter().enumerate() {
+        let mut live: std::collections::BTreeSet<PseudoId> = live_out[idx]
+            .iter()
+            .copied()
+            .filter(|p| candidates.contains(p))
+            .collect();
+        for insn in block.insns.iter().rev() {
+            // Collect defs of this instruction.
+            let mut defs: Vec<PseudoId> = Vec::new();
+            if let Some(target) = insn.target {
+                if candidates.contains(&target) {
+                    defs.push(target);
+                }
+            }
+            if insn.op == Opcode::Asm {
+                if let Some(asm) = &insn.asm_data {
+                    for output in &asm.outputs {
+                        if candidates.contains(&output.pseudo) {
+                            defs.push(output.pseudo);
+                        }
+                    }
+                }
+            }
+            // Each def interferes with everything currently live AND
+            // with the instruction's other defs AND with each src
+            // (for the lowering-correctness reason described above).
+            for &d in &defs {
+                for &l in live.iter() {
+                    if l != d {
+                        graph.add_edge(d, l);
+                    }
+                }
+                for &d2 in &defs {
+                    if d != d2 {
+                        graph.add_edge(d, d2);
+                    }
+                }
+                if add_def_src_edges {
+                    for &src in &insn.src {
+                        if src != d && candidates.contains(&src) {
+                            graph.add_edge(d, src);
+                        }
+                    }
+                }
+            }
+            // Remove defs from live (they were born here; not live before).
+            for &d in &defs {
+                live.remove(&d);
+            }
+            // Add uses (srcs are live coming INTO this instruction).
+            for &src in &insn.src {
+                if candidates.contains(&src) {
+                    live.insert(src);
+                }
+            }
+            if let Some(indirect) = insn.indirect_target {
+                if candidates.contains(&indirect) {
+                    live.insert(indirect);
+                }
+            }
+            if insn.op == Opcode::Asm {
+                if let Some(asm) = &insn.asm_data {
+                    for input in &asm.inputs {
+                        if candidates.contains(&input.pseudo) {
+                            live.insert(input.pseudo);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    graph
+}
+
+/// Maximum Cardinality Search (MCS) ordering. The reverse of this
+/// ordering is a perfect elimination ordering when the graph is
+/// chordal; greedy coloring in MCS order yields an optimal coloring.
+///
+/// Algorithm: start with all weights at 0. Repeatedly pick an
+/// unprocessed vertex with maximum weight (ties broken by smallest
+/// pseudo id for determinism), add it to the ordering, increment
+/// weight of each unprocessed neighbor.
+pub fn mcs_ordering(graph: &InterferenceGraph) -> Vec<PseudoId> {
+    let mut weight: BTreeMap<PseudoId, usize> = BTreeMap::new();
+    for &v in &graph.vertices {
+        weight.insert(v, 0);
+    }
+    let mut order: Vec<PseudoId> = Vec::with_capacity(graph.vertices.len());
+    let mut remaining: std::collections::BTreeSet<PseudoId> = graph.vertices.clone();
+    while !remaining.is_empty() {
+        // Pick max-weight (tie-break: smaller PseudoId wins).
+        let pick = *remaining
+            .iter()
+            .max_by(|a, b| {
+                let wa = weight.get(*a).copied().unwrap_or(0);
+                let wb = weight.get(*b).copied().unwrap_or(0);
+                wa.cmp(&wb).then(b.0.cmp(&a.0))
+            })
+            .unwrap();
+        remaining.remove(&pick);
+        order.push(pick);
+        for n in graph.neighbors(pick) {
+            if remaining.contains(&n) {
+                *weight.entry(n).or_insert(0) += 1;
+            }
+        }
+    }
+    order
+}
+
+/// Result of greedy coloring for one register bank.
+pub struct ColoringResult<R> {
+    pub colors: BTreeMap<PseudoId, R>,
+    pub spilled: Vec<PseudoId>,
+}
+
+/// Greedy color pseudos in `order` using registers from `palette`.
+///
+/// * `pre_colored` — forced color assignments (e.g. ABI-pinned args).
+///   These are honored unchanged.
+/// * `forbidden` — per-vertex sets of colors that MUST NOT be assigned
+///   (correctness constraint, e.g. x86_64 cross-call pseudos forbidden
+///   from caller-saved, or pseudos live across `idiv` forbidden from
+///   RAX/RDX).
+/// * `preferred_palette` — biases the search order for each vertex
+///   (e.g. prefer callee-saved for in-loop pseudos). Returns the
+///   subset to try first; the full `palette` is searched as a
+///   fallback.
+///
+/// Vertices that cannot be colored land in `spilled`.
+pub fn greedy_color<R, PrefFn>(
+    graph: &InterferenceGraph,
+    order: &[PseudoId],
+    palette: &[R],
+    pre_colored: &BTreeMap<PseudoId, R>,
+    forbidden: &BTreeMap<PseudoId, std::collections::BTreeSet<R>>,
+    preferred_palette: PrefFn,
+) -> ColoringResult<R>
+where
+    R: Copy + Eq + Ord + Hash,
+    PrefFn: Fn(PseudoId) -> Option<Vec<R>>,
+{
+    let mut colors: BTreeMap<PseudoId, R> = pre_colored.clone();
+    let mut spilled: Vec<PseudoId> = Vec::new();
+    let empty: std::collections::BTreeSet<R> = std::collections::BTreeSet::new();
+    for &v in order {
+        if colors.contains_key(&v) {
+            continue;
+        }
+        let used: std::collections::BTreeSet<R> = graph
+            .neighbors(v)
+            .filter_map(|n| colors.get(&n).copied())
+            .collect();
+        let forbid = forbidden.get(&v).unwrap_or(&empty);
+        let prefer = preferred_palette(v);
+        let pick = prefer
+            .as_ref()
+            .and_then(|p| {
+                p.iter()
+                    .find(|r| !used.contains(r) && !forbid.contains(r))
+                    .copied()
+            })
+            .or_else(|| {
+                palette
+                    .iter()
+                    .find(|r| !used.contains(r) && !forbid.contains(r))
+                    .copied()
+            });
+        if let Some(r) = pick {
+            colors.insert(v, r);
+        } else {
+            spilled.push(v);
+        }
+    }
+    ColoringResult { colors, spilled }
+}
+
+/// For each pseudo, the sorted positions at which it is used (read).
+/// M7 Belady uses this to compute "next-use distance" — when register
+/// pressure forces a spill, the pseudo with the furthest next use is
+/// the cheapest to evict (reloading later costs less than reloading
+/// sooner).
+pub fn compute_use_positions(func: &Function) -> BTreeMap<PseudoId, Vec<usize>> {
+    let mut uses: BTreeMap<PseudoId, Vec<usize>> = BTreeMap::new();
+    let mut pos = 0usize;
+    for block in &func.blocks {
+        for insn in &block.insns {
+            for &src in &insn.src {
+                uses.entry(src).or_default().push(pos);
+            }
+            if let Some(indirect) = insn.indirect_target {
+                uses.entry(indirect).or_default().push(pos);
+            }
+            if insn.op == Opcode::Asm {
+                if let Some(asm) = &insn.asm_data {
+                    for input in &asm.inputs {
+                        uses.entry(input.pseudo).or_default().push(pos);
+                    }
+                }
+            }
+            pos += 1;
+        }
+    }
+    uses
+}
+
+/// Distance from position `p` to the next use of `pseudo`, or
+/// `usize::MAX` if there is no remaining use.
+pub fn next_use_distance(
+    uses: &BTreeMap<PseudoId, Vec<usize>>,
+    pseudo: PseudoId,
+    p: usize,
+) -> usize {
+    let Some(positions) = uses.get(&pseudo) else {
+        return usize::MAX;
+    };
+    let idx = positions.partition_point(|&q| q <= p);
+    if idx < positions.len() {
+        positions[idx] - p
+    } else {
+        usize::MAX
+    }
+}
+
 /// Try to reuse a previously freed stack slot of the given size and alignment.
 /// Uses block-level interference check to verify the candidate pseudo doesn't
 /// overlap with the slot's previous owner.
diff --git a/cc/arch/x86_64/codegen.rs b/cc/arch/x86_64/codegen.rs
index e58d82fc..c74795d9 100644
--- a/cc/arch/x86_64/codegen.rs
+++ b/cc/arch/x86_64/codegen.rs
@@ -3115,7 +3115,17 @@ impl X86_64CodeGen {
         }
     }
 
-    /// Emit FP select using conditional branch (CMov doesn't work on XMM regs)
+    /// Emit FP select using conditional branch (CMov doesn't work on XMM regs).
+    ///
+    /// Uses XMM15 as scratch for the merged value. XMM15 is documented as
+    /// codegen-reserved (see `XmmReg::allocatable` — it returns xmm0–xmm13,
+    /// leaving xmm14 and xmm15 out of the allocator's palette specifically
+    /// so codegen helpers like this one can use them without coordinating
+    /// with the allocator. Using xmm0 here clobbers any live xmm0-allocated
+    /// pseudo (chordal coloring may legitimately assign xmm0 to a pseudo
+    /// whose interval doesn't cross a call) and is the classic
+    /// silent-corruption case: the value-loss only manifests in
+    /// downstream computations, often as infinite loops or wrong results.
     fn emit_select_fp(
         &mut self,
         cond: PseudoId,
@@ -3133,8 +3143,8 @@ impl X86_64CodeGen {
         match &cond_loc {
             Loc::Imm(v) => {
                 let val = if *v != 0 { then_val } else { else_val };
-                self.emit_fp_move(val, XmmReg::Xmm0, size);
-                self.emit_fp_move_from_xmm(XmmReg::Xmm0, &dst_loc, size);
+                self.emit_fp_move(val, XmmReg::Xmm15, size);
+                self.emit_fp_move_from_xmm(XmmReg::Xmm15, &dst_loc, size);
                 return;
             }
             Loc::Stack(offset) => {
@@ -3167,17 +3177,17 @@ impl X86_64CodeGen {
             cc: CondCode::Ne,
             target: then_label.clone(),
         });
-        // Else branch: load else_val
-        self.emit_fp_move(else_val, XmmReg::Xmm0, size);
+        // Else branch: load else_val into the reserved scratch xmm15.
+        self.emit_fp_move(else_val, XmmReg::Xmm15, size);
         self.push_lir(X86Inst::Jmp {
             target: done_label.clone(),
         });
-        // Then branch: load then_val
+        // Then branch: load then_val into xmm15.
         self.push_lir(X86Inst::Directive(Directive::BlockLabel(then_label)));
-        self.emit_fp_move(then_val, XmmReg::Xmm0, size);
-        // Done
+        self.emit_fp_move(then_val, XmmReg::Xmm15, size);
+        // Done: move xmm15 → dst.
         self.push_lir(X86Inst::Directive(Directive::BlockLabel(done_label)));
-        self.emit_fp_move_from_xmm(XmmReg::Xmm0, &dst_loc, size);
+        self.emit_fp_move_from_xmm(XmmReg::Xmm15, &dst_loc, size);
     }
 
     /// Emit integer select using CMOVcc
@@ -3306,10 +3316,12 @@ impl X86_64CodeGen {
                 let dst_addr = self.get_x87_mem_addr(dst);
                 self.push_lir(X86Inst::X87Store { addr: dst_addr });
             } else {
-                // Handle regular FP copy (float/double)
+                // Handle regular FP copy (float/double).
+                // Reserved scratch when target lives on the stack (see
+                // emit_fp_binop in float.rs for the full rationale).
                 let dst_xmm = match &dst_loc {
                     Loc::Xmm(x) => *x,
-                    _ => XmmReg::Xmm0,
+                    _ => XmmReg::Xmm15,
                 };
 
                 // Use type-aware size for FP operations
diff --git a/cc/arch/x86_64/float.rs b/cc/arch/x86_64/float.rs
index a7aa41d3..ca224bea 100644
--- a/cc/arch/x86_64/float.rs
+++ b/cc/arch/x86_64/float.rs
@@ -287,9 +287,20 @@ impl X86_64CodeGen {
         };
 
         let dst_loc = self.get_location(target);
+        // When the target lives on the stack, we need an XMM scratch
+        // register to perform the binop. Must be one outside the
+        // allocator's palette (xmm0-xmm13) so we don't clobber a live
+        // pseudo. Use xmm15 — xmm14 is used as the fallback scratch
+        // elsewhere in this file when dst_xmm == xmm15.
+        //
+        // Pre-fix: this used Xmm0, which silently clobbered chordal-
+        // allocated FP pseudos whose intervals didn't cross a call
+        // (e.g., short-lived temporaries in _Py_dg_strtod's
+        // correction loop). Manifested as infinite loops where the
+        // computation went off the rails.
         let dst_xmm = match &dst_loc {
             Loc::Xmm(x) => *x,
-            _ => XmmReg::Xmm0, // Use XMM0 as work register
+            _ => XmmReg::Xmm15,
         };
         // Use type-aware FP size determination
         let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types);
@@ -422,9 +433,12 @@ impl X86_64CodeGen {
         };
 
         let dst_loc = self.get_location(target);
+        // See emit_fp_binop above: fallback must be a reserved XMM (xmm14
+        // or xmm15) so we don't clobber any chordal-allocated pseudo when
+        // the target lives on the stack.
         let dst_xmm = match &dst_loc {
             Loc::Xmm(x) => *x,
-            _ => XmmReg::Xmm0,
+            _ => XmmReg::Xmm15,
         };
         // Use type-aware FP size determination
         let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types);
@@ -503,30 +517,27 @@ impl X86_64CodeGen {
         let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types);
         let move_size = Self::size_from_type(insn.typ, insn.size, types);
 
-        // Check if src2 is in Xmm0 before we clobber it with src1.
-        // If so, move src2 to Xmm15 first to avoid losing its value.
-        let src2_loc = self.get_location(src2);
-        if matches!(src2_loc, Loc::Xmm(XmmReg::Xmm0)) {
-            self.push_lir(X86Inst::MovFp {
-                size: fp_size,
-                src: XmmOperand::Reg(XmmReg::Xmm0),
-                dst: XmmOperand::Reg(XmmReg::Xmm15),
-            });
-        }
+        // Use Xmm15 as the work register for src1 (Xmm15/Xmm14 are
+        // reserved scratch — not in the allocator palette). src2 cannot
+        // be in Xmm15 (allocator never picks it), so no aliasing check
+        // is needed.
+        //
+        // Pre-fix: this used Xmm0 as the work register and Xmm15 as the
+        // src2-aliasing escape hatch. Under chordal coloring, any FP
+        // pseudo could be allocated to Xmm0, so loading src1 into Xmm0
+        // silently clobbered live values.
 
-        // Load first operand to XMM0
-        self.emit_fp_move(src1, XmmReg::Xmm0, move_size);
+        // Load first operand to XMM15
+        self.emit_fp_move(src1, XmmReg::Xmm15, move_size);
 
         // Compare with second operand using ucomiss/ucomisd
-        // Re-read src2 location; if it was Xmm0, it's now in Xmm15.
         let src2_loc = self.get_location(src2);
         match src2_loc {
             Loc::Xmm(x) => {
-                let actual_reg = if x == XmmReg::Xmm0 { XmmReg::Xmm15 } else { x };
                 self.push_lir(X86Inst::UComiFp {
                     size: fp_size,
-                    src: XmmOperand::Reg(actual_reg),
-                    dst: XmmReg::Xmm0,
+                    src: XmmOperand::Reg(x),
+                    dst: XmmReg::Xmm15,
                 });
             }
             Loc::Stack(offset) => {
@@ -537,23 +548,23 @@ impl X86_64CodeGen {
                         base: Reg::Rbp,
                         offset: -adjusted,
                     }),
-                    dst: XmmReg::Xmm0,
+                    dst: XmmReg::Xmm15,
                 });
             }
             Loc::FImm(v, _) => {
-                self.emit_fp_imm_to_xmm(v, XmmReg::Xmm15, move_size);
+                self.emit_fp_imm_to_xmm(v, XmmReg::Xmm14, move_size);
                 self.push_lir(X86Inst::UComiFp {
                     size: fp_size,
-                    src: XmmOperand::Reg(XmmReg::Xmm15),
-                    dst: XmmReg::Xmm0,
+                    src: XmmOperand::Reg(XmmReg::Xmm14),
+                    dst: XmmReg::Xmm15,
                 });
             }
             _ => {
-                self.emit_fp_move(src2, XmmReg::Xmm15, move_size);
+                self.emit_fp_move(src2, XmmReg::Xmm14, move_size);
                 self.push_lir(X86Inst::UComiFp {
                     size: fp_size,
-                    src: XmmOperand::Reg(XmmReg::Xmm15),
-                    dst: XmmReg::Xmm0,
+                    src: XmmOperand::Reg(XmmReg::Xmm14),
+                    dst: XmmReg::Xmm15,
                 });
             }
         }
@@ -660,9 +671,10 @@ impl X86_64CodeGen {
         };
 
         let dst_loc = self.get_location(target);
+        // Reserved scratch when target lives on the stack (see emit_fp_binop).
         let dst_xmm = match &dst_loc {
             Loc::Xmm(x) => *x,
-            _ => XmmReg::Xmm0,
+            _ => XmmReg::Xmm15,
         };
 
         let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types);
@@ -793,11 +805,11 @@ impl X86_64CodeGen {
             None => return,
         };
 
-        // Move float to XMM0 using type-aware source FP size
+        // Move float to XMM15 (reserved scratch — see emit_fp_binop).
         let fp_size = FpSize::from_type_or_bits(insn.src_typ, insn.src_size, types);
         self.emit_fp_move(
             src,
-            XmmReg::Xmm0,
+            XmmReg::Xmm15,
             Self::size_from_type(insn.src_typ, insn.src_size, types),
         );
 
@@ -821,7 +833,7 @@ impl X86_64CodeGen {
         self.push_lir(X86Inst::CvtFpToInt {
             fp_size,
             int_size,
-            src: XmmOperand::Reg(XmmReg::Xmm0),
+            src: XmmOperand::Reg(XmmReg::Xmm15),
             dst: dst_reg,
         });
 
@@ -842,15 +854,24 @@ impl X86_64CodeGen {
         };
 
         let dst_loc = self.get_location(target);
+        // Reserved scratch when target lives on the stack (see emit_fp_binop).
         let dst_xmm = match &dst_loc {
             Loc::Xmm(x) => *x,
-            _ => XmmReg::Xmm0,
+            _ => XmmReg::Xmm15,
         };
 
-        // Move source to XMM0
+        // Scratch for holding the source value before the conversion.
+        // Must not collide with dst_xmm; use Xmm14 when dst is Xmm15.
+        let src_xmm = if dst_xmm == XmmReg::Xmm15 {
+            XmmReg::Xmm14
+        } else {
+            XmmReg::Xmm15
+        };
+
+        // Move source to scratch
         self.emit_fp_move(
             src,
-            XmmReg::Xmm0,
+            src_xmm,
             Self::size_from_type(insn.src_typ, insn.src_size, types),
         );
 
@@ -864,7 +885,7 @@ impl X86_64CodeGen {
                 self.push_lir(X86Inst::CvtFpFp {
                     src_size: FpSize::Single,
                     dst_size: FpSize::Double,
-                    src: XmmReg::Xmm0,
+                    src: src_xmm,
                     dst: dst_xmm,
                 });
             }
@@ -873,17 +894,17 @@ impl X86_64CodeGen {
                 self.push_lir(X86Inst::CvtFpFp {
                     src_size: FpSize::Double,
                     dst_size: FpSize::Single,
-                    src: XmmReg::Xmm0,
+                    src: src_xmm,
                     dst: dst_xmm,
                 });
             }
             _ => {
                 // Same type or types unknown, just move if needed
-                if dst_xmm != XmmReg::Xmm0 {
+                if dst_xmm != src_xmm {
                     let dst_fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types);
                     self.push_lir(X86Inst::MovFp {
                         size: dst_fp_size,
-                        src: XmmOperand::Reg(XmmReg::Xmm0),
+                        src: XmmOperand::Reg(src_xmm),
                         dst: XmmOperand::Reg(dst_xmm),
                     });
                 }
@@ -902,9 +923,10 @@ impl X86_64CodeGen {
     /// Load a floating-point constant into an XMM register
     pub(super) fn emit_fp_const_load(&mut self, target: PseudoId, value: f64, size: u32) {
         let dst_loc = self.get_location(target);
+        // Reserved scratch when target lives on the stack (see emit_fp_binop).
         let dst_xmm = match &dst_loc {
             Loc::Xmm(x) => *x,
-            _ => XmmReg::Xmm0,
+            _ => XmmReg::Xmm15,
         };
 
         self.emit_fp_imm_to_xmm(value, dst_xmm, size);
diff --git a/cc/arch/x86_64/regalloc.rs b/cc/arch/x86_64/regalloc.rs
index da4e8fdf..8bd0f6c0 100644
--- a/cc/arch/x86_64/regalloc.rs
+++ b/cc/arch/x86_64/regalloc.rs
@@ -39,8 +39,7 @@
 // ============================================================================
 
 use crate::arch::regalloc::{
-    compute_live_intervals, expire_intervals, expire_stack_intervals, find_call_positions,
-    find_conflicting_registers, identify_addr_taken_syms, identify_fp_pseudos,
+    compute_live_intervals, find_call_positions, identify_addr_taken_syms, identify_fp_pseudos,
     interval_crosses_call, ConstraintPoint, FreeSlot, LiveInterval, LivenessResult,
 };
 use crate::ir::{Function, Instruction, Opcode, PseudoId, PseudoKind};
@@ -52,7 +51,7 @@ use std::collections::{BTreeMap, HashMap, HashSet};
 // ============================================================================
 
 /// x86-64 physical registers
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum Reg {
     // 64-bit general purpose registers
     Rax,
@@ -300,7 +299,7 @@ pub fn get_constraint_info(insn: &Instruction) -> Option<(Vec<Reg>, Vec<PseudoId
 // ============================================================================
 
 /// x86-64 XMM registers for floating-point operations
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum XmmReg {
     Xmm0,
     Xmm1,
@@ -523,7 +522,7 @@ impl RegAlloc {
         self.spill_args_across_calls(func, &intervals, &call_positions);
         self.spill_args_across_constraints(func, &intervals, &constraint_points);
         self.allocate_alloca_to_stack(func);
-        self.run_linear_scan(func, types, intervals, &call_positions, &constraint_points);
+        self.run_chordal_color(func, types, intervals, &call_positions, &constraint_points);
 
         crate::arch::regalloc::LocationMap::from(self.locations.clone())
     }
@@ -1001,8 +1000,31 @@ impl RegAlloc {
         }
     }
 
-    /// Run the linear scan allocation algorithm
-    fn run_linear_scan(
+    /// Look up an interval by pseudo id (linear scan over the small
+    /// intervals vector; vec is sorted by start position, not pseudo,
+    /// so a linear find is fine).
+    fn interval_by_pseudo(intervals: &[LiveInterval], p: PseudoId) -> Option<&LiveInterval> {
+        intervals.iter().find(|i| i.pseudo == p)
+    }
+
+    /// M6 chordal coloring with stub M7 (spill-on-fail).
+    ///
+    /// Three phases:
+    ///   1. Pre-pass: route non-register-allocated pseudos to their
+    ///      proper Locs (constants → Imm/FImm, Sym → stack slot,
+    ///      int128 → 16-byte stack, long-double → x87 stack, FP
+    ///      crossing call/block boundary → spill XMM caller-saved).
+    ///      The remaining pseudos go into per-bank candidate sets.
+    ///   2. Color: per-bank chordal coloring. ABI-pinned args become
+    ///      pre-colored vertices; constraint clobbers (idiv RAX/RDX,
+    ///      shift Rcx, varargs RAX) and cross-call → caller-saved
+    ///      become per-vertex forbidden colors. In-loop pseudos get a
+    ///      callee-first preferred palette (soft, not forbidden).
+    ///   3. Commit: write Loc::Reg / Loc::Xmm for colored vertices,
+    ///      allocate stack slots for spilled vertices, track
+    ///      `used_callee_saved` for the prologue.
+    #[allow(clippy::too_many_arguments)]
+    fn run_chordal_color(
         &mut self,
         func: &Function,
         types: &TypeTable,
@@ -1010,14 +1032,18 @@ impl RegAlloc {
         call_positions: &[usize],
         constraint_points: &[ConstraintPoint<Reg>],
     ) {
-        for interval in intervals {
-            self.expire_old_intervals(interval.start);
-
+        // -------- Phase 1: pre-pass --------
+        let mut gp_candidates: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        let mut xmm_candidates: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        for interval in &intervals {
             if self.locations.contains_key(&interval.pseudo) {
+                // Already assigned by allocate_arguments / spill_args
+                // / alloca passes; arg pseudos become pre-colored
+                // graph vertices in Phase 2.
                 continue;
             }
-
-            // Handle constants and symbols
             if let Some(pseudo) = func.get_pseudo(interval.pseudo) {
                 match &pseudo.kind {
                     PseudoKind::Val(v) => {
@@ -1042,7 +1068,6 @@ impl RegAlloc {
                         if let Some(local_var) = func.locals.get(name) {
                             let size = (types.size_bits(local_var.typ) / 8) as i32;
                             let size = size.max(8);
-                            // Determine alignment: explicit _Alignas takes precedence
                             let natural_align = types.alignment(local_var.typ) as i32;
                             let alignment = if let Some(explicit) = local_var.explicit_align {
                                 explicit as i32
@@ -1050,15 +1075,8 @@ impl RegAlloc {
                                 natural_align.max(8)
                             };
                             let aligned_size = (size + alignment - 1) & !(alignment - 1);
-
-                            // Stack slot reuse uses block-level interference checks
-                            // (live_in/live_out from dataflow fixpoint) which are correct
-                            // for all CFG shapes. Addr-taken syms need stable addresses
-                            // because symaddr-derived pointers may escape to callees.
                             let reusable = !self.addr_taken_syms.contains(&interval.pseudo);
-
-                            self.alloc_stack_slot(&interval, aligned_size, alignment, reusable);
-
+                            self.alloc_stack_slot(interval, aligned_size, alignment, reusable);
                             if types.is_float(local_var.typ) {
                                 self.fp_pseudos.insert(interval.pseudo);
                             }
@@ -1071,110 +1089,297 @@ impl RegAlloc {
                     _ => {}
                 }
             }
-
-            // 128-bit integers always go on the stack (16 bytes, 16-byte aligned)
             if self.int128_pseudos.contains(&interval.pseudo) {
-                self.alloc_stack_slot(&interval, 16, 16, true);
+                self.alloc_stack_slot(interval, 16, 16, true);
                 continue;
             }
-
-            // Allocate register based on type
             let needs_fp = self.fp_pseudos.contains(&interval.pseudo);
-            let crosses_call = interval_crosses_call(&interval, call_positions);
-
-            // Find registers that conflict due to constraints (e.g., Rax/Rdx for division)
-            let conflicting_regs = find_conflicting_registers(&interval, constraint_points);
-
             if needs_fp {
-                // Long double uses x87 FPU (stack-based), not XMM registers.
-                // Always allocate to stack with 16-byte slots for 80-bit extended precision.
                 let is_longdouble = self.ld_pseudos.contains(&interval.pseudo);
-                // FP pseudos that live across basic block boundaries must be
-                // spilled to stack. Phase D's interval collapse is unreliable
-                // for multi-block lifetimes (same issue as stack coloring),
-                // so XMM registers would be prematurely freed and reused.
+                let crosses_call = interval_crosses_call(interval, call_positions);
                 let crosses_block = self.live_out.iter().any(|lo| lo.contains(&interval.pseudo));
                 if is_longdouble {
-                    // Long double needs 16 bytes (80-bit padded to 128-bit)
-                    self.alloc_stack_slot(&interval, 16, 16, false);
-                } else if crosses_call || crosses_block {
-                    // All XMM registers are caller-saved on x86-64 SysV ABI.
-                    // Values live across function calls or block boundaries
-                    // must be spilled to stack.
-                    self.alloc_stack_slot(&interval, 8, 8, true);
-                } else if let Some(xmm) = self.free_xmm_regs.pop() {
-                    self.locations.insert(interval.pseudo, Loc::Xmm(xmm));
-                    let pos = self
-                        .active_xmm
-                        .partition_point(|(i, _)| i.end <= interval.end);
-                    self.active_xmm.insert(pos, (interval.clone(), xmm));
-                } else {
-                    self.alloc_stack_slot(&interval, 8, 8, true);
+                    self.alloc_stack_slot(interval, 16, 16, false);
+                    continue;
                 }
-            } else if crosses_call || interval.in_loop {
-                // Interval crosses a call or is inside a loop - must use callee-saved register or spill
-                // For loops without calls, this prevents values from being clobbered by register pressure
-                // Also exclude registers that conflict with constraints
-                if let Some(idx) = self
-                    .free_regs
-                    .iter()
-                    .position(|r| r.is_callee_saved() && !conflicting_regs.contains(r))
-                {
-                    let reg = self.free_regs.remove(idx);
-                    if !self.used_callee_saved.contains(&reg) {
-                        self.used_callee_saved.push(reg);
-                    }
-                    self.locations.insert(interval.pseudo, Loc::Reg(reg));
-                    let pos = self.active.partition_point(|(i, _)| i.end <= interval.end);
-                    self.active.insert(pos, (interval.clone(), reg));
-                } else {
-                    // No callee-saved registers available - spill to stack
-                    self.alloc_stack_slot(&interval, 8, 8, true);
+                if crosses_call || crosses_block {
+                    self.alloc_stack_slot(interval, 8, 8, true);
+                    continue;
                 }
+                xmm_candidates.insert(interval.pseudo);
             } else {
-                // Interval doesn't appear to cross a call. In functions WITH
-                // calls, prefer callee-saved registers to guard against
-                // under-estimated live intervals from complex control flow
-                // (e.g., switch/goto dispatch). In functions WITHOUT calls,
-                // prefer caller-saved to minimize callee-saved usage.
-                let reg_opt = if let Some(idx) = self
-                    .free_regs
-                    .iter()
-                    .position(|r| !r.is_callee_saved() && !conflicting_regs.contains(r))
-                {
-                    Some(self.free_regs.remove(idx))
-                } else if let Some(idx) = self
-                    .free_regs
-                    .iter()
-                    .position(|r| !conflicting_regs.contains(r))
-                {
-                    Some(self.free_regs.remove(idx))
-                } else {
-                    None
-                };
+                gp_candidates.insert(interval.pseudo);
+            }
+        }
 
-                if let Some(reg) = reg_opt {
-                    if reg.is_callee_saved() && !self.used_callee_saved.contains(&reg) {
-                        self.used_callee_saved.push(reg);
-                    }
-                    self.locations.insert(interval.pseudo, Loc::Reg(reg));
-                    let pos = self.active.partition_point(|(i, _)| i.end <= interval.end);
-                    self.active.insert(pos, (interval.clone(), reg));
+        // -------- Phase 2: per-bank chordal coloring --------
+        self.color_gp_bank(
+            func,
+            &intervals,
+            call_positions,
+            constraint_points,
+            &gp_candidates,
+        );
+        self.color_xmm_bank(func, &intervals, &xmm_candidates);
+    }
+
+    fn color_gp_bank(
+        &mut self,
+        func: &Function,
+        intervals: &[LiveInterval],
+        call_positions: &[usize],
+        constraint_points: &[ConstraintPoint<Reg>],
+        gp_candidates: &std::collections::BTreeSet<PseudoId>,
+    ) {
+        use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering};
+        if gp_candidates.is_empty() {
+            return;
+        }
+
+        // Pre-colored vertices: any pseudo already mapped to a GP reg
+        // (ABI-pinned args from allocate_arguments). Add them to the
+        // graph so live conflicts are respected.
+        let mut pre_colored: BTreeMap<PseudoId, Reg> = BTreeMap::new();
+        let mut all_vertices: std::collections::BTreeSet<PseudoId> = gp_candidates.clone();
+        for (&pid, loc) in self.locations.iter() {
+            if let Loc::Reg(r) = loc {
+                pre_colored.insert(pid, *r);
+                all_vertices.insert(pid);
+            }
+        }
+
+        // GP coloring needs def-vs-src edges: some codegen lowerings
+        // (notably x86_64 `cmov` for ternary `(cond)?a:b`) materialize
+        // the target into a register and then read source values,
+        // which would clobber a source sharing the target's register.
+        let graph = build_interference_graph(&all_vertices, func, &self.live_out, true);
+
+        // Forbidden colors. Three sources:
+        //   (a) constraint clobbers (idiv RAX/RDX, etc.) — for pseudos
+        //       live across the constraint that are NOT operands.
+        //   (b) cross-call → all caller-saved registers (HARD
+        //       constraint, NOT a preference: the call clobbers them
+        //       and the value would be lost).
+        //   (c) NB: in-loop is NOT a forbidden constraint, only a
+        //       preference (soft) — see preferred_palette below.
+        let caller_saved: Vec<Reg> = Reg::allocatable()
+            .iter()
+            .copied()
+            .filter(|r| !r.is_callee_saved())
+            .collect();
+        let mut forbidden: BTreeMap<PseudoId, std::collections::BTreeSet<Reg>> = BTreeMap::new();
+        for cp in constraint_points {
+            for interval in intervals {
+                if !gp_candidates.contains(&interval.pseudo) {
+                    continue;
+                }
+                if interval.start > cp.position || cp.position > interval.end {
+                    continue;
+                }
+                if cp.involved_pseudos.contains(&interval.pseudo) {
+                    continue;
+                }
+                let entry = forbidden.entry(interval.pseudo).or_default();
+                for &c in &cp.clobbers {
+                    entry.insert(c);
+                }
+            }
+        }
+        let mut in_loop_set: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        for interval in intervals {
+            if !gp_candidates.contains(&interval.pseudo) {
+                continue;
+            }
+            if interval_crosses_call(interval, call_positions) {
+                let entry = forbidden.entry(interval.pseudo).or_default();
+                for &r in &caller_saved {
+                    entry.insert(r);
+                }
+            } else if interval.in_loop {
+                in_loop_set.insert(interval.pseudo);
+            }
+        }
+
+        // Preferred palettes: in-loop → callee-saved first, else
+        // caller-saved first (to avoid touching callee-saved slots in
+        // the prologue/epilogue when not needed).
+        let caller_first: Vec<Reg> = {
+            let mut v = caller_saved.clone();
+            for &r in Reg::allocatable() {
+                if r.is_callee_saved() {
+                    v.push(r);
+                }
+            }
+            v
+        };
+        let callee_first: Vec<Reg> = {
+            let mut v: Vec<Reg> = Reg::allocatable()
+                .iter()
+                .copied()
+                .filter(|r| r.is_callee_saved())
+                .collect();
+            for &r in &caller_saved {
+                v.push(r);
+            }
+            v
+        };
+        let caller_first_c = caller_first.clone();
+        let callee_first_c = callee_first.clone();
+
+        let order = mcs_ordering(&graph);
+        let result = greedy_color(
+            &graph,
+            &order,
+            Reg::allocatable(),
+            &pre_colored,
+            &forbidden,
+            |v| {
+                if in_loop_set.contains(&v) {
+                    Some(callee_first_c.clone())
                 } else {
-                    // Reusable: short-lived, no call/loop crossing
-                    self.alloc_stack_slot(&interval, 8, 8, true);
+                    Some(caller_first_c.clone())
+                }
+            },
+        );
+
+        // -------- M7 Belady eviction --------
+        // For each pseudo the greedy pass couldn't place, look at its
+        // colored neighbors. If a neighbor's next-use is STRICTLY
+        // further than the failing pseudo's, hand the neighbor's
+        // register to the failing pseudo and spill the neighbor —
+        // Belady's "evict the value used furthest in the future" rule
+        // applied at color-failure time. Eviction only succeeds when
+        // the neighbor's color is also legal for the failing pseudo
+        // (not in its forbidden set, not held by any OTHER neighbor).
+        let uses = crate::arch::regalloc::compute_use_positions(func);
+        let mut colors = result.colors;
+        let mut final_spilled: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        for spilled in result.spilled {
+            if colors.contains_key(&spilled) {
+                continue;
+            }
+            let interval = match Self::interval_by_pseudo(intervals, spilled) {
+                Some(i) => i,
+                None => {
+                    final_spilled.insert(spilled);
+                    continue;
+                }
+            };
+            let empty = std::collections::BTreeSet::new();
+            let forbid = forbidden.get(&spilled).unwrap_or(&empty);
+            let spilled_next =
+                crate::arch::regalloc::next_use_distance(&uses, spilled, interval.start);
+            // Snapshot the neighbor list before scanning; we mutate
+            // `colors` inside the inner search so the iteration set
+            // needs to be stable.
+            let neighbors: Vec<PseudoId> = graph.neighbors(spilled).collect();
+            let mut best_evict: Option<(PseudoId, Reg, usize)> = None;
+            for &n in &neighbors {
+                if pre_colored.contains_key(&n) {
+                    // Don't evict ABI-pinned args.
+                    continue;
+                }
+                let Some(&color) = colors.get(&n) else {
+                    continue;
+                };
+                if forbid.contains(&color) {
+                    continue;
+                }
+                // Color is legal for `spilled` only if no OTHER
+                // neighbor already holds it.
+                let conflict = neighbors
+                    .iter()
+                    .any(|&m| m != n && colors.get(&m).copied() == Some(color));
+                if conflict {
+                    continue;
                 }
+                let nd = crate::arch::regalloc::next_use_distance(&uses, n, interval.start);
+                if nd > spilled_next && best_evict.is_none_or(|(_, _, d)| nd > d) {
+                    best_evict = Some((n, color, nd));
+                }
+            }
+            if let Some((evicted, color, _)) = best_evict {
+                colors.remove(&evicted);
+                colors.insert(spilled, color);
+                final_spilled.insert(evicted);
+            } else {
+                final_spilled.insert(spilled);
+            }
+        }
+
+        // -------- Phase 3: commit --------
+        for (&pid, &reg) in &colors {
+            if pre_colored.contains_key(&pid) {
+                continue;
+            }
+            self.locations.insert(pid, Loc::Reg(reg));
+            if reg.is_callee_saved() && !self.used_callee_saved.contains(&reg) {
+                self.used_callee_saved.push(reg);
+            }
+        }
+        for &spilled in &final_spilled {
+            if self.locations.contains_key(&spilled) {
+                continue;
+            }
+            if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) {
+                self.alloc_stack_slot(interval, 8, 8, true);
             }
         }
     }
 
-    fn expire_old_intervals(&mut self, point: usize) {
-        // Expire integer register intervals
-        expire_intervals(&mut self.active, &mut self.free_regs, point);
-        // Expire XMM register intervals
-        expire_intervals(&mut self.active_xmm, &mut self.free_xmm_regs, point);
-        // Expire stack slot intervals for reuse
-        expire_stack_intervals(&mut self.active_stack, &mut self.free_stack_slots, point);
+    fn color_xmm_bank(
+        &mut self,
+        func: &Function,
+        intervals: &[LiveInterval],
+        xmm_candidates: &std::collections::BTreeSet<PseudoId>,
+    ) {
+        use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering};
+        if xmm_candidates.is_empty() {
+            return;
+        }
+        let mut pre_colored: BTreeMap<PseudoId, XmmReg> = BTreeMap::new();
+        let mut all_vertices: std::collections::BTreeSet<PseudoId> = xmm_candidates.clone();
+        for (&pid, loc) in self.locations.iter() {
+            if let Loc::Xmm(r) = loc {
+                pre_colored.insert(pid, *r);
+                all_vertices.insert(pid);
+            }
+        }
+        // XMM coloring does NOT need def-vs-src edges: SSE FP ops are
+        // three-operand at the IR level (target, src1, src2) and the
+        // codegen lowers them to either `movsd src1, dst; opsd src2,
+        // dst` (which is safe regardless of register sharing) or to
+        // AVX/SSE3-style three-operand forms. Adding def-vs-src edges
+        // here over-constrains coloring and corrupts FP-heavy code
+        // paths (e.g. _Py_dg_strtod's correction loop) in non-obvious
+        // ways.
+        let graph = build_interference_graph(&all_vertices, func, &self.live_out, false);
+        let forbidden: BTreeMap<PseudoId, std::collections::BTreeSet<XmmReg>> = BTreeMap::new();
+        let order = mcs_ordering(&graph);
+        let result = greedy_color(
+            &graph,
+            &order,
+            XmmReg::allocatable(),
+            &pre_colored,
+            &forbidden,
+            |_| None::<Vec<XmmReg>>,
+        );
+        for (&pid, &reg) in &result.colors {
+            if pre_colored.contains_key(&pid) {
+                continue;
+            }
+            self.locations.insert(pid, Loc::Xmm(reg));
+        }
+        for &spilled in &result.spilled {
+            if self.locations.contains_key(&spilled) {
+                continue;
+            }
+            if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) {
+                self.alloc_stack_slot(interval, 8, 8, true);
+            }
+        }
     }
 
     /// Compute live intervals, constraint points, and per-block liveness sets.

From 0e2c3b202f1c3392648a99f5489acfe6dd8b7288 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Sun, 31 May 2026 03:57:50 +0000
Subject: [PATCH 2/4] cc: M6+M7 chordal coloring + Belady splitting (aarch64)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port the x86_64 chordal coloring to aarch64. Same shape as the
x86_64 implementation:

- run_chordal_color replaces run_linear_scan
- color_gp_bank: def_src_edges=true (aarch64 csel has the same
  source-vs-target clobber concern as x86_64 cmov), hard-forbid
  caller-saved for cross-call ranges, soft-prefer callee-saved
  in loops
- color_vreg_bank: def_src_edges=false, spill-on-fail (Belady
  GP-only; V-bank splitting follows the same plan as XMM)
- Ord/PartialOrd added to Reg and VReg for BTreeMap keys
  (deterministic iteration — the design rule)

aarch64 lacks x86_64's idiv/shift register constraints, so the
constraint-point machinery sees no input today. The plumbing is
kept identical to x86_64 so when the constraint system grows
(M7.5 — per-instruction operand/clobber constraints, shared with
inline asm), it lights up here without further structural change.

Multi-reg call returns (complex, struct, union) and __int128
remain forced to stack slots in Phase 1, matching prior aarch64
behavior.

cc/arch/regalloc.rs — delete now-unused expire_intervals,
expire_stack_intervals, find_conflicting_registers. Both backends
moved off the linear-scan core; these had no remaining callers.

Known latent risk (not blocking, mirrors x86_64): inline libc
calls emitted by cc/arch/aarch64/features.rs (__signbitf, fabs,
signbit) bypass the allocator's call-position walk. Linear scan
hid this by always pop'ing from the end of the palette (V31
first); chordal coloring will use V0 freely. CPython exercises
this rarely enough that x86_64 tests pass; if aarch64 CI flags
an FP-builtin failure, this is where to look. The constraint
system milestone resolves it cleanly.

Verified:
- cc test suite 914+948+204 pass on x86_64 host
- cargo build --release clean
- cargo clippy --all-targets clean
- cargo fmt --check clean
- aarch64 behavior verification deferred to macOS CI (no local
  cross-compiler)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cc/arch/aarch64/regalloc.rs | 426 +++++++++++++++++++++++++-----------
 cc/arch/regalloc.rs         |  96 --------
 2 files changed, 299 insertions(+), 223 deletions(-)

diff --git a/cc/arch/aarch64/regalloc.rs b/cc/arch/aarch64/regalloc.rs
index 818f152f..edf0fb26 100644
--- a/cc/arch/aarch64/regalloc.rs
+++ b/cc/arch/aarch64/regalloc.rs
@@ -36,8 +36,7 @@
 // ============================================================================
 
 use crate::arch::regalloc::{
-    compute_live_intervals, expire_intervals, expire_stack_intervals, find_call_positions,
-    find_conflicting_registers, identify_addr_taken_syms, identify_fp_pseudos,
+    compute_live_intervals, find_call_positions, identify_addr_taken_syms, identify_fp_pseudos,
     interval_crosses_call, ConstraintPoint, FreeSlot, LiveInterval, LivenessResult,
 };
 use crate::ir::{Function, Instruction, Opcode, PseudoId, PseudoKind};
@@ -49,7 +48,7 @@ use std::collections::{BTreeMap, HashMap, HashSet};
 // ============================================================================
 
 /// AArch64 physical registers
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum Reg {
     // General purpose registers x0-x28
     X0,
@@ -270,7 +269,7 @@ impl Reg {
 // ============================================================================
 
 /// AArch64 SIMD/FP registers (V0-V31, accessed as D0-D31 for double, S0-S31 for float)
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum VReg {
     V0,
     V1,
@@ -655,14 +654,11 @@ pub struct SpilledArg {
 pub struct RegAlloc {
     /// Mapping from pseudo to location
     locations: HashMap<PseudoId, Loc>,
-    /// Free registers
+    /// Free GP registers (used by argument pre-allocation and the
+    /// spill-args helper; the chordal coloring core ignores it).
     free_regs: Vec<Reg>,
-    /// Free FP registers
+    /// Free FP registers (same role as `free_regs` for the V bank).
     free_fp_regs: Vec<VReg>,
-    /// Active intervals (sorted by end point)
-    active: Vec<(LiveInterval, Reg)>,
-    /// Active FP intervals (sorted by end point)
-    active_fp: Vec<(LiveInterval, VReg)>,
     /// Next stack slot offset
     stack_offset: i32,
     /// Callee-saved registers that were used
@@ -693,8 +689,6 @@ impl RegAlloc {
             locations: HashMap::new(),
             free_regs: Reg::allocatable().to_vec(),
             free_fp_regs: VReg::allocatable().to_vec(),
-            active: Vec::new(),
-            active_fp: Vec::new(),
             stack_offset: 0,
             used_callee_saved: Vec::new(),
             used_callee_saved_fp: Vec::new(),
@@ -730,7 +724,7 @@ impl RegAlloc {
 
         self.spill_args_across_calls(func, &intervals, &call_positions);
         self.allocate_alloca_to_stack(func);
-        self.run_linear_scan(func, types, intervals, &call_positions, &constraint_points);
+        self.run_chordal_color(func, types, intervals, &call_positions, &constraint_points);
 
         crate::arch::regalloc::LocationMap::from(self.locations.clone())
     }
@@ -740,8 +734,6 @@ impl RegAlloc {
         self.locations.clear();
         self.free_regs = Reg::allocatable().to_vec();
         self.free_fp_regs = VReg::allocatable().to_vec();
-        self.active.clear();
-        self.active_fp.clear();
         self.stack_offset = 0;
         self.used_callee_saved.clear();
         self.used_callee_saved_fp.clear();
@@ -982,8 +974,24 @@ impl RegAlloc {
         }
     }
 
-    /// Run the linear scan allocation algorithm
-    fn run_linear_scan(
+    /// M6 chordal coloring + M7 Belady eviction, AArch64 mirror of the
+    /// x86_64 `run_chordal_color` in `cc/arch/x86_64/regalloc.rs`.
+    ///
+    /// Phases:
+    ///   1. Pre-pass: route non-register-allocated pseudos to their
+    ///      proper Locs (constants → Imm/FImm, Sym → stack slot,
+    ///      __int128 → 16-byte stack, multi-reg call returns →
+    ///      stack). The remaining pseudos go into per-bank candidate
+    ///      sets.
+    ///   2. Color: per-bank chordal coloring. ABI-pinned args become
+    ///      pre-colored vertices; cross-call → caller-saved becomes a
+    ///      per-vertex forbidden set. In-loop pseudos get a
+    ///      callee-first preferred palette (soft, not forbidden).
+    ///   3. Commit: write Loc::Reg / Loc::VReg for colored vertices,
+    ///      allocate stack slots for spilled vertices, track
+    ///      `used_callee_saved` for the prologue.
+    #[allow(clippy::too_many_arguments)]
+    fn run_chordal_color(
         &mut self,
         func: &Function,
         types: &TypeTable,
@@ -991,17 +999,18 @@ impl RegAlloc {
         call_positions: &[usize],
         constraint_points: &[ConstraintPoint<Reg>],
     ) {
-        for interval in intervals {
-            self.expire_old_intervals(interval.start);
-
+        // -------- Phase 1: pre-pass --------
+        let mut gp_candidates: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        let mut vreg_candidates: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        for interval in &intervals {
             if self.locations.contains_key(&interval.pseudo) {
+                // Already assigned by allocate_arguments / spill_args
+                // / alloca passes; arg pseudos become pre-colored
+                // graph vertices in Phase 2.
                 continue;
             }
-
-            // Find registers that would conflict with constraints at this interval
-            let conflicting = find_conflicting_registers(&interval, constraint_points);
-
-            // Handle constants and symbols
             if let Some(pseudo) = func.get_pseudo(interval.pseudo) {
                 match &pseudo.kind {
                     PseudoKind::Val(v) => {
@@ -1026,23 +1035,14 @@ impl RegAlloc {
                         if let Some(local) = func.locals.get(name) {
                             let size = (types.size_bits(local.typ) / 8) as i32;
                             let size = size.max(8);
-                            // Determine alignment: explicit _Alignas takes precedence,
-                            // otherwise use natural type alignment (16 for __int128, etc.)
                             let natural_align = types.alignment(local.typ) as i32;
                             let alignment = local
                                 .explicit_align
                                 .map(|a| a as i32)
                                 .unwrap_or(natural_align.max(8));
                             let aligned_size = (size + alignment - 1) & !(alignment - 1);
-
-                            // Stack slot reuse uses block-level interference checks
-                            // (live_in/live_out from dataflow fixpoint) which are correct
-                            // for all CFG shapes. Addr-taken syms need stable addresses
-                            // because symaddr-derived pointers may escape to callees.
                             let reusable = !self.addr_taken_syms.contains(&interval.pseudo);
-
-                            self.alloc_stack_slot(&interval, aligned_size, alignment, reusable);
-
+                            self.alloc_stack_slot(interval, aligned_size, alignment, reusable);
                             if types.is_float(local.typ) {
                                 self.fp_pseudos.insert(interval.pseudo);
                             }
@@ -1056,7 +1056,7 @@ impl RegAlloc {
                 }
             }
 
-            // Force 128-bit integers to 16-byte aligned stack slots (never in GP regs)
+            // __int128 → 16-byte stack slot, never in registers.
             let is_int128 = func.blocks.iter().any(|b| {
                 b.insns.iter().any(|insn| {
                     insn.typ
@@ -1066,18 +1066,17 @@ impl RegAlloc {
                 })
             });
             if is_int128 {
-                self.alloc_stack_slot(&interval, 16, 16, true);
+                self.alloc_stack_slot(interval, 16, 16, true);
                 continue;
             }
 
-            // Force stack allocation for complex and struct return values from calls
-            // These types span multiple FP/GP registers and cannot fit in a single register
+            // Multi-register call returns (complex, struct, union) →
+            // stack. These span V0+V1 or X0+X1 and can't live in a
+            // single allocator vertex.
             let multi_reg_return_typ: Option<TypeId> = func.blocks.iter().find_map(|b| {
                 b.insns.iter().find_map(|insn| {
                     if insn.op == Opcode::Call && insn.target == Some(interval.pseudo) {
                         if let Some(typ) = insn.typ {
-                            // Complex types use V0+V1 (2 FP regs)
-                            // Struct/union types may use multiple regs depending on size
                             let kind = types.kind(typ);
                             if types.is_complex(typ)
                                 || matches!(kind, TypeKind::Struct | TypeKind::Union)
@@ -1091,112 +1090,285 @@ impl RegAlloc {
                     }
                 })
             });
-
             if let Some(typ) = multi_reg_return_typ {
                 let size = (types.size_bits(typ) / 8) as i32;
                 let size = size.max(8);
                 let alignment = types.alignment(typ) as i32;
                 let aligned_size = (size + (alignment - 1)) & !(alignment - 1);
-                self.alloc_stack_slot(&interval, aligned_size, alignment, false);
+                self.alloc_stack_slot(interval, aligned_size, alignment, false);
                 continue;
             }
 
-            // Allocate register based on type
             let needs_fp = self.fp_pseudos.contains(&interval.pseudo);
-            let crosses_call = interval_crosses_call(&interval, call_positions);
-
             if needs_fp {
-                if crosses_call {
-                    // FP interval crosses a call - must use callee-saved FP register or spill
-                    if let Some(idx) = self.free_fp_regs.iter().position(|r| r.is_callee_saved()) {
-                        let reg = self.free_fp_regs.remove(idx);
-                        if !self.used_callee_saved_fp.contains(&reg) {
-                            self.used_callee_saved_fp.push(reg);
-                        }
-                        self.locations.insert(interval.pseudo, Loc::VReg(reg));
-                        self.active_fp.push((interval.clone(), reg));
-                        self.active_fp.sort_by_key(|(i, _)| i.end);
-                    } else {
-                        self.alloc_stack_slot(&interval, 8, 8, true);
-                    }
-                } else if let Some(reg) = self.free_fp_regs.pop() {
-                    if reg.is_callee_saved() && !self.used_callee_saved_fp.contains(&reg) {
-                        self.used_callee_saved_fp.push(reg);
-                    }
-                    self.locations.insert(interval.pseudo, Loc::VReg(reg));
-                    let pos = self
-                        .active_fp
-                        .partition_point(|(i, _)| i.end <= interval.end);
-                    self.active_fp.insert(pos, (interval.clone(), reg));
+                // FP cross-call/block → stack (avoids the chordal pass
+                // having to model V-bank cross-call eviction, which is
+                // not implemented yet; matches the x86_64 XMM policy).
+                let crosses_call = interval_crosses_call(interval, call_positions);
+                let crosses_block = self.live_out.iter().any(|lo| lo.contains(&interval.pseudo));
+                if crosses_call || crosses_block {
+                    self.alloc_stack_slot(interval, 8, 8, true);
+                    continue;
+                }
+                vreg_candidates.insert(interval.pseudo);
+            } else {
+                gp_candidates.insert(interval.pseudo);
+            }
+        }
+
+        // -------- Phase 2: per-bank chordal coloring --------
+        self.color_gp_bank(
+            func,
+            &intervals,
+            call_positions,
+            constraint_points,
+            &gp_candidates,
+        );
+        self.color_vreg_bank(func, &intervals, &vreg_candidates);
+    }
+
+    fn color_gp_bank(
+        &mut self,
+        func: &Function,
+        intervals: &[LiveInterval],
+        call_positions: &[usize],
+        constraint_points: &[ConstraintPoint<Reg>],
+        gp_candidates: &std::collections::BTreeSet<PseudoId>,
+    ) {
+        use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering};
+        if gp_candidates.is_empty() {
+            return;
+        }
+
+        // Pre-colored vertices: any pseudo already mapped to a GP reg
+        // (ABI-pinned args from allocate_arguments). Add them to the
+        // graph so live conflicts are respected.
+        let mut pre_colored: BTreeMap<PseudoId, Reg> = BTreeMap::new();
+        let mut all_vertices: std::collections::BTreeSet<PseudoId> = gp_candidates.clone();
+        for (&pid, loc) in self.locations.iter() {
+            if let Loc::Reg(r) = loc {
+                pre_colored.insert(pid, *r);
+                all_vertices.insert(pid);
+            }
+        }
+
+        // GP coloring needs def-vs-src edges: aarch64 `csel` for
+        // ternary `(cond)?a:b` materializes the target then reads
+        // source values, which would clobber a source sharing the
+        // target's register. Same pattern as x86_64's cmov.
+        let graph = build_interference_graph(&all_vertices, func, &self.live_out, true);
+
+        // Forbidden colors:
+        //   (a) constraint clobbers — for pseudos live across the
+        //       constraint that are NOT operands. (AAPCS64 has no
+        //       implicit clobbers like x86 idiv/shift, so this is
+        //       a no-op today, but the plumbing matches x86_64.)
+        //   (b) cross-call → all caller-saved (HARD; the call clobbers
+        //       them and the value would be lost).
+        let caller_saved: Vec<Reg> = Reg::allocatable()
+            .iter()
+            .copied()
+            .filter(|r| !r.is_callee_saved())
+            .collect();
+        let mut forbidden: BTreeMap<PseudoId, std::collections::BTreeSet<Reg>> = BTreeMap::new();
+        for cp in constraint_points {
+            for interval in intervals {
+                if !gp_candidates.contains(&interval.pseudo) {
+                    continue;
+                }
+                if interval.start > cp.position || cp.position > interval.end {
+                    continue;
+                }
+                if cp.involved_pseudos.contains(&interval.pseudo) {
+                    continue;
+                }
+                let entry = forbidden.entry(interval.pseudo).or_default();
+                for &c in &cp.clobbers {
+                    entry.insert(c);
+                }
+            }
+        }
+        let mut in_loop_set: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        for interval in intervals {
+            if !gp_candidates.contains(&interval.pseudo) {
+                continue;
+            }
+            if interval_crosses_call(interval, call_positions) {
+                let entry = forbidden.entry(interval.pseudo).or_default();
+                for &r in &caller_saved {
+                    entry.insert(r);
+                }
+            } else if interval.in_loop {
+                in_loop_set.insert(interval.pseudo);
+            }
+        }
+
+        let caller_first: Vec<Reg> = {
+            let mut v = caller_saved.clone();
+            for &r in Reg::allocatable() {
+                if r.is_callee_saved() {
+                    v.push(r);
+                }
+            }
+            v
+        };
+        let callee_first: Vec<Reg> = {
+            let mut v: Vec<Reg> = Reg::allocatable()
+                .iter()
+                .copied()
+                .filter(|r| r.is_callee_saved())
+                .collect();
+            for &r in &caller_saved {
+                v.push(r);
+            }
+            v
+        };
+        let caller_first_c = caller_first.clone();
+        let callee_first_c = callee_first.clone();
+
+        let order = mcs_ordering(&graph);
+        let result = greedy_color(
+            &graph,
+            &order,
+            Reg::allocatable(),
+            &pre_colored,
+            &forbidden,
+            |v| {
+                if in_loop_set.contains(&v) {
+                    Some(callee_first_c.clone())
                 } else {
-                    self.alloc_stack_slot(&interval, 8, 8, true);
+                    Some(caller_first_c.clone())
                 }
-            } else if crosses_call {
-                // GP interval crosses a call - must use callee-saved register or spill
-                // Also exclude registers that conflict with constraints
-                if let Some(idx) = self
-                    .free_regs
+            },
+        );
+
+        // -------- M7 Belady eviction --------
+        let uses = crate::arch::regalloc::compute_use_positions(func);
+        let mut colors = result.colors;
+        let mut final_spilled: std::collections::BTreeSet<PseudoId> =
+            std::collections::BTreeSet::new();
+        for spilled in result.spilled {
+            if colors.contains_key(&spilled) {
+                continue;
+            }
+            let interval = match Self::interval_by_pseudo(intervals, spilled) {
+                Some(i) => i,
+                None => {
+                    final_spilled.insert(spilled);
+                    continue;
+                }
+            };
+            let empty = std::collections::BTreeSet::new();
+            let forbid = forbidden.get(&spilled).unwrap_or(&empty);
+            let spilled_next =
+                crate::arch::regalloc::next_use_distance(&uses, spilled, interval.start);
+            let neighbors: Vec<PseudoId> = graph.neighbors(spilled).collect();
+            let mut best_evict: Option<(PseudoId, Reg, usize)> = None;
+            for &n in &neighbors {
+                if pre_colored.contains_key(&n) {
+                    continue;
+                }
+                let Some(&color) = colors.get(&n) else {
+                    continue;
+                };
+                if forbid.contains(&color) {
+                    continue;
+                }
+                let conflict = neighbors
                     .iter()
-                    .position(|r| r.is_callee_saved() && !conflicting.contains(r))
-                {
-                    let reg = self.free_regs.remove(idx);
-                    if !self.used_callee_saved.contains(&reg) {
-                        self.used_callee_saved.push(reg);
-                    }
-                    self.locations.insert(interval.pseudo, Loc::Reg(reg));
-                    let pos = self.active.partition_point(|(i, _)| i.end <= interval.end);
-                    self.active.insert(pos, (interval.clone(), reg));
-                } else {
-                    // No callee-saved registers available - spill to stack
-                    self.alloc_stack_slot(&interval, 8, 8, true);
+                    .any(|&m| m != n && colors.get(&m).copied() == Some(color));
+                if conflict {
+                    continue;
+                }
+                let nd = crate::arch::regalloc::next_use_distance(&uses, n, interval.start);
+                if nd > spilled_next && best_evict.is_none_or(|(_, _, d)| nd > d) {
+                    best_evict = Some((n, color, nd));
                 }
+            }
+            if let Some((evicted, color, _)) = best_evict {
+                colors.remove(&evicted);
+                colors.insert(spilled, color);
+                final_spilled.insert(evicted);
             } else {
-                // Interval doesn't cross a call - prefer caller-saved registers
-                // to minimize callee-saved register usage and stack conflicts
-                // Also exclude registers that conflict with constraints
-                let reg_opt = if let Some(idx) = self
-                    .free_regs
-                    .iter()
-                    .position(|r| !r.is_callee_saved() && !conflicting.contains(r))
-                {
-                    Some(self.free_regs.remove(idx))
-                } else if conflicting.is_empty() {
-                    // No constraints - use pop() to maintain original allocation order
-                    // (takes from end of list, which preserves existing behavior)
-                    self.free_regs.pop()
-                } else if let Some(idx) = self
-                    .free_regs
-                    .iter()
-                    .rposition(|r| !conflicting.contains(r))
-                {
-                    // With constraints - use rposition to prefer registers from end
-                    // (like pop() but filtering out conflicting registers)
-                    Some(self.free_regs.remove(idx))
-                } else {
-                    None
-                };
+                final_spilled.insert(spilled);
+            }
+        }
 
-                if let Some(reg) = reg_opt {
-                    if reg.is_callee_saved() && !self.used_callee_saved.contains(&reg) {
-                        self.used_callee_saved.push(reg);
-                    }
-                    self.locations.insert(interval.pseudo, Loc::Reg(reg));
-                    let pos = self.active.partition_point(|(i, _)| i.end <= interval.end);
-                    self.active.insert(pos, (interval.clone(), reg));
-                } else {
-                    self.alloc_stack_slot(&interval, 8, 8, true);
-                }
+        // -------- Phase 3: commit --------
+        for (&pid, &reg) in &colors {
+            if pre_colored.contains_key(&pid) {
+                continue;
+            }
+            self.locations.insert(pid, Loc::Reg(reg));
+            if reg.is_callee_saved() && !self.used_callee_saved.contains(&reg) {
+                self.used_callee_saved.push(reg);
+            }
+        }
+        for &spilled in &final_spilled {
+            if self.locations.contains_key(&spilled) {
+                continue;
+            }
+            if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) {
+                self.alloc_stack_slot(interval, 8, 8, true);
+            }
+        }
+    }
+
+    fn color_vreg_bank(
+        &mut self,
+        func: &Function,
+        intervals: &[LiveInterval],
+        vreg_candidates: &std::collections::BTreeSet<PseudoId>,
+    ) {
+        use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering};
+        if vreg_candidates.is_empty() {
+            return;
+        }
+        let mut pre_colored: BTreeMap<PseudoId, VReg> = BTreeMap::new();
+        let mut all_vertices: std::collections::BTreeSet<PseudoId> = vreg_candidates.clone();
+        for (&pid, loc) in self.locations.iter() {
+            if let Loc::VReg(r) = loc {
+                pre_colored.insert(pid, *r);
+                all_vertices.insert(pid);
+            }
+        }
+        // V-bank coloring does NOT need def-vs-src edges: aarch64 FP
+        // ops are three-operand at the architectural level (fadd d0,
+        // d1, d2), so target and source can share a register without
+        // codegen workarounds. Same rationale as x86_64's XMM bank.
+        let graph = build_interference_graph(&all_vertices, func, &self.live_out, false);
+        let forbidden: BTreeMap<PseudoId, std::collections::BTreeSet<VReg>> = BTreeMap::new();
+        let order = mcs_ordering(&graph);
+        let result = greedy_color(
+            &graph,
+            &order,
+            VReg::allocatable(),
+            &pre_colored,
+            &forbidden,
+            |_| None::<Vec<VReg>>,
+        );
+        for (&pid, &reg) in &result.colors {
+            if pre_colored.contains_key(&pid) {
+                continue;
+            }
+            self.locations.insert(pid, Loc::VReg(reg));
+            if reg.is_callee_saved() && !self.used_callee_saved_fp.contains(&reg) {
+                self.used_callee_saved_fp.push(reg);
+            }
+        }
+        for &spilled in &result.spilled {
+            if self.locations.contains_key(&spilled) {
+                continue;
+            }
+            if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) {
+                self.alloc_stack_slot(interval, 8, 8, true);
             }
         }
     }
 
-    fn expire_old_intervals(&mut self, point: usize) {
-        // Expire GP register intervals
-        expire_intervals(&mut self.active, &mut self.free_regs, point);
-        // Expire FP register intervals
-        expire_intervals(&mut self.active_fp, &mut self.free_fp_regs, point);
-        // Expire stack slot intervals for reuse
-        expire_stack_intervals(&mut self.active_stack, &mut self.free_stack_slots, point);
+    fn interval_by_pseudo(intervals: &[LiveInterval], p: PseudoId) -> Option<&LiveInterval> {
+        intervals.iter().find(|i| i.pseudo == p)
     }
 
     fn compute_live_intervals(&self, func: &Function) -> LivenessResult<Reg> {
diff --git a/cc/arch/regalloc.rs b/cc/arch/regalloc.rs
index 61315b3e..a8512668 100644
--- a/cc/arch/regalloc.rs
+++ b/cc/arch/regalloc.rs
@@ -118,47 +118,6 @@ pub struct ConstraintPoint<R> {
 // Common Functions
 // ============================================================================
 
-/// Expire old intervals from the active list, returning freed registers to the free list.
-/// Generic over register type R (works with both GP and FP register types).
-pub fn expire_intervals<R: Copy>(
-    active: &mut Vec<(LiveInterval, R)>,
-    free_regs: &mut Vec<R>,
-    point: usize,
-) {
-    let mut to_remove = Vec::with_capacity(DEFAULT_SMALL_VEC_CAPACITY);
-    for (i, (interval, reg)) in active.iter().enumerate() {
-        if interval.end < point {
-            free_regs.push(*reg);
-            to_remove.push(i);
-        }
-    }
-    for i in to_remove.into_iter().rev() {
-        active.remove(i);
-    }
-}
-
-/// Expire stack intervals whose live range ended before `point`,
-/// returning their slots to the free list.
-pub fn expire_stack_intervals(
-    active_stack: &mut Vec<(LiveInterval, i32, i32)>,
-    free_slots: &mut BTreeMap<i32, Vec<FreeSlot>>,
-    point: usize,
-) {
-    active_stack.retain(|(interval, offset, size)| {
-        if interval.end < point {
-            let alignment = if *size >= 16 { 16 } else { 8 };
-            free_slots.entry(*size).or_default().push(FreeSlot {
-                offset: *offset,
-                alignment,
-                owner: interval.pseudo,
-            });
-            false
-        } else {
-            true
-        }
-    });
-}
-
 /// Find all positions of call instructions in a function.
 /// Used by spill_args_across_calls to identify where arguments may be clobbered.
 /// Includes Call, Longjmp, and Setjmp opcodes since they all invoke external functions
@@ -187,61 +146,6 @@ pub fn interval_crosses_call(interval: &LiveInterval, call_positions: &[usize])
         .any(|&call_pos| interval.start <= call_pos && call_pos <= interval.end)
 }
 
-/// Find registers that would conflict with this interval due to constraints.
-///
-/// A constraint point models an instruction that clobbers one or more
-/// physical registers (e.g. `idivl` clobbers RAX and RDX). For any live
-/// interval that overlaps such a point, we must keep that pseudo out of the
-/// clobbered registers — otherwise its value would be silently destroyed.
-///
-/// There is one exception: pseudos that are direct *operands* of the
-/// constraining instruction (`involved_pseudos`) MAY occupy a clobbered
-/// register, because the constraining instruction itself reads or writes
-/// those registers as part of its semantics. The classic case is the
-/// integer dividend, which must be in RAX/EAX for `idiv` to execute.
-///
-/// Crucially, that "operand exemption" only holds when the operand's value
-/// dies at the constraint point (`interval.end == cp.position`). When the
-/// interval extends *past* the clobber (`interval.end > cp.position`), the
-/// operand's value is still needed afterward, but the clobbering
-/// instruction will have destroyed it. In that case the allocator must
-/// pick a NON-clobbered register; codegen will materialize the value into
-/// the operand-required register (e.g. RAX) with a move just before the
-/// constraint, preserving the original pseudo's contents for later uses.
-///
-/// Without this distinction, `Copy`/CSE/SCCP-class passes that extend a
-/// pseudo's live range across a `mods.32`/`idivl` boundary silently
-/// miscompile (see git history: prior copyprop attempts hung do-while-
-/// continue tests by allocating the dividend to RAX with a use after the
-/// idiv).
-///
-/// Generic over register type R.
-pub fn find_conflicting_registers<R: Copy + Eq + Hash>(
-    interval: &LiveInterval,
-    constraint_points: &[ConstraintPoint<R>],
-) -> HashSet<R> {
-    let mut conflicts = HashSet::with_capacity(DEFAULT_SMALL_VEC_CAPACITY);
-
-    for cp in constraint_points {
-        // Use <= on both ends so an interval that exactly meets the
-        // constraint point still counts as overlapping.
-        if interval.start <= cp.position && cp.position <= interval.end {
-            let is_involved = cp.involved_pseudos.contains(&interval.pseudo);
-            let dies_at_point = interval.end == cp.position;
-            // The operand exemption only applies when the value is
-            // consumed AT the constraint point. If it must survive past
-            // the clobber, it has to live in a non-clobbered register.
-            if !is_involved || !dies_at_point {
-                for &reg in &cp.clobbers {
-                    conflicts.insert(reg);
-                }
-            }
-        }
-    }
-
-    conflicts
-}
-
 /// Result of liveness analysis: intervals, constraint points, and per-block liveness sets.
 pub struct LivenessResult<R> {
     pub intervals: Vec<LiveInterval>,

From 27b8d51387df98d83b49a1efc7151f272878fe76 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Sun, 31 May 2026 06:18:15 +0000
Subject: [PATCH 3/4] cc: restore stack slot reuse lost in M6+M7 chordal port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

M6+M7 deleted expire_stack_intervals when it deleted the linear
scan core, since neither chordal backend called it. That removed
stack slot reuse: every alloc_stack_slot request now created a
fresh slot because the free_stack_slots pool was never populated.

On x86_64 this is wasteful but harmless — 32-bit addressing
absorbs huge frames. On aarch64 it broke 16 int128 codegen tests
on CI: stp/ldp accept signed 7-bit immediate offsets (range
[-512, 504] in multiples of 8), and int128-heavy tests pushed
offsets to #3472 (217 * 16 = 217 unique slots where 20-40 would
have sufficed with reuse).

Fix: re-introduce expire_stack_intervals in cc/arch/regalloc.rs
and call it from both chordal allocators:

- Phase 1: at the top of each iteration with interval.start.
  Intervals come pre-sorted by start position, so this monotonic
  sweep matches the previous linear-scan-era expiration shape.
- Phase 3: with usize::MAX just before spill commits, draining
  all remaining slots so the chordal-spilled pseudos can reuse
  any non-interfering slot.

The existing try_reuse_stack_slot uses interference checks (via
pseudos_interfere over live_in/live_out from the dataflow
fixpoint), so reuse is correct for any iteration order.

Latent issue noted (not fixed here, separate concern): aarch64
codegen helpers like emit_int128_move_to_stack push raw stp/ldp
without large-offset fallback. Even with reuse, frames close to
the [-512, 504] threshold will still hit this. The fix is an
add-scratch-then-stp pattern in the affected emitters; tracked
for a follow-up rather than bundled into this regression fix.

Verified:
- cc test suite 914+948+204 pass on x86_64 host
- cargo build / clippy / fmt clean
- aarch64 verification deferred to macOS CI

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cc/arch/aarch64/regalloc.rs | 25 +++++++++++++++++++++++++
 cc/arch/regalloc.rs         | 34 ++++++++++++++++++++++++++++++++++
 cc/arch/x86_64/regalloc.rs  | 23 +++++++++++++++++++++++
 3 files changed, 82 insertions(+)

diff --git a/cc/arch/aarch64/regalloc.rs b/cc/arch/aarch64/regalloc.rs
index edf0fb26..b27fd1d1 100644
--- a/cc/arch/aarch64/regalloc.rs
+++ b/cc/arch/aarch64/regalloc.rs
@@ -1005,6 +1005,17 @@ impl RegAlloc {
         let mut vreg_candidates: std::collections::BTreeSet<PseudoId> =
             std::collections::BTreeSet::new();
         for interval in &intervals {
+            // Intervals come pre-sorted by start position from
+            // compute_live_intervals, so this monotonic expiration
+            // recovers the linear-scan-era slot-reuse behavior the
+            // chordal sweep would otherwise lose. Without this,
+            // int128-heavy functions push stp/ldp offsets past
+            // aarch64's [-512, 504] immediate range.
+            crate::arch::regalloc::expire_stack_intervals(
+                &mut self.active_stack,
+                &mut self.free_stack_slots,
+                interval.start,
+            );
             if self.locations.contains_key(&interval.pseudo) {
                 // Already assigned by allocate_arguments / spill_args
                 // / alloca passes; arg pseudos become pre-colored
@@ -1305,6 +1316,13 @@ impl RegAlloc {
                 self.used_callee_saved.push(reg);
             }
         }
+        // Drain any remaining active slots into the free pool so spill
+        // commits below can reuse them via interference checks.
+        crate::arch::regalloc::expire_stack_intervals(
+            &mut self.active_stack,
+            &mut self.free_stack_slots,
+            usize::MAX,
+        );
         for &spilled in &final_spilled {
             if self.locations.contains_key(&spilled) {
                 continue;
@@ -1357,6 +1375,13 @@ impl RegAlloc {
                 self.used_callee_saved_fp.push(reg);
             }
         }
+        // Same drain-then-reuse pattern as the GP commit; see comment
+        // on expire_stack_intervals in cc/arch/regalloc.rs.
+        crate::arch::regalloc::expire_stack_intervals(
+            &mut self.active_stack,
+            &mut self.free_stack_slots,
+            usize::MAX,
+        );
         for &spilled in &result.spilled {
             if self.locations.contains_key(&spilled) {
                 continue;
diff --git a/cc/arch/regalloc.rs b/cc/arch/regalloc.rs
index a8512668..426b4992 100644
--- a/cc/arch/regalloc.rs
+++ b/cc/arch/regalloc.rs
@@ -118,6 +118,40 @@ pub struct ConstraintPoint<R> {
 // Common Functions
 // ============================================================================
 
+/// Release stack slots whose owning interval ended before `point` back
+/// to the free-slot pool, where future `try_reuse_stack_slot` calls
+/// can find them.
+///
+/// Linear scan called this on every iteration with the next interval's
+/// `start`. The chordal allocator (M6) calls it twice per bank: once
+/// in Phase 1 sweeping monotonically by interval start, then once with
+/// `usize::MAX` just before Phase 3 spill commits to drain everything
+/// so spilled pseudos can reuse any non-interfering slot.
+///
+/// Without this, every `alloc_stack_slot` request creates a fresh
+/// slot. On x86_64 that's wasteful but harmless. On aarch64 it pushes
+/// stp/ldp offsets past the architectural [-512, 504] immediate range
+/// and the assembler rejects the output.
+pub fn expire_stack_intervals(
+    active_stack: &mut Vec<(LiveInterval, i32, i32)>,
+    free_slots: &mut BTreeMap<i32, Vec<FreeSlot>>,
+    point: usize,
+) {
+    active_stack.retain(|(interval, offset, size)| {
+        if interval.end < point {
+            let alignment = if *size >= 16 { 16 } else { 8 };
+            free_slots.entry(*size).or_default().push(FreeSlot {
+                offset: *offset,
+                alignment,
+                owner: interval.pseudo,
+            });
+            false
+        } else {
+            true
+        }
+    });
+}
+
 /// Find all positions of call instructions in a function.
 /// Used by spill_args_across_calls to identify where arguments may be clobbered.
 /// Includes Call, Longjmp, and Setjmp opcodes since they all invoke external functions
diff --git a/cc/arch/x86_64/regalloc.rs b/cc/arch/x86_64/regalloc.rs
index 8bd0f6c0..1b279fba 100644
--- a/cc/arch/x86_64/regalloc.rs
+++ b/cc/arch/x86_64/regalloc.rs
@@ -1038,6 +1038,15 @@ impl RegAlloc {
         let mut xmm_candidates: std::collections::BTreeSet<PseudoId> =
             std::collections::BTreeSet::new();
         for interval in &intervals {
+            // Intervals come pre-sorted by start position from
+            // compute_live_intervals, so this monotonic expiration
+            // recovers the linear-scan-era slot-reuse behavior the
+            // chordal sweep would otherwise lose.
+            crate::arch::regalloc::expire_stack_intervals(
+                &mut self.active_stack,
+                &mut self.free_stack_slots,
+                interval.start,
+            );
             if self.locations.contains_key(&interval.pseudo) {
                 // Already assigned by allocate_arguments / spill_args
                 // / alloca passes; arg pseudos become pre-colored
@@ -1319,6 +1328,13 @@ impl RegAlloc {
                 self.used_callee_saved.push(reg);
             }
         }
+        // Drain any remaining active slots into the free pool so spill
+        // commits below can reuse them via interference checks.
+        crate::arch::regalloc::expire_stack_intervals(
+            &mut self.active_stack,
+            &mut self.free_stack_slots,
+            usize::MAX,
+        );
         for &spilled in &final_spilled {
             if self.locations.contains_key(&spilled) {
                 continue;
@@ -1372,6 +1388,13 @@ impl RegAlloc {
             }
             self.locations.insert(pid, Loc::Xmm(reg));
         }
+        // Same drain-then-reuse pattern as the GP commit; see comment
+        // on expire_stack_intervals in cc/arch/regalloc.rs.
+        crate::arch::regalloc::expire_stack_intervals(
+            &mut self.active_stack,
+            &mut self.free_stack_slots,
+            usize::MAX,
+        );
         for &spilled in &result.spilled {
             if self.locations.contains_key(&spilled) {
                 continue;

From 6af088ebcf91ea279f2ca7eea76a0ff5c7b0cf7d Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Sun, 31 May 2026 06:32:47 +0000
Subject: [PATCH 4/4] cc: legalize out-of-range stp/ldp offsets in aarch64
 codegen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aarch64 stp/ldp accept signed 7-bit immediate offsets scaled by
element size: [-512, +504] step 8 for B64, [-256, +252] step 4
for B32. Body-emitted pair instructions (int128 moves, spilled
arg stores, int128 call args) bypassed this constraint and
emitted raw stp/ldp with whatever offset stack_mem returned. On
deep frames the assembler rejected them ("index must be a
multiple of 8 in range [-512, 504]"). The prologue/epilogue
already had hand-rolled large-frame splits; the body sites did
not.

Centralized fix: a pair-address legalization helper on the
Aarch64CodeGen. Three new internal methods, two new public
emitters:

  fn pair_offset_fits(offset, size) -> bool
  fn emit_add_offset(dst, base, offset)
  fn legalize_pair_addr(size, addr) -> MemAddr
  pub fn emit_stp_legalized(size, src1, src2, addr)
  pub fn emit_ldp_legalized(size, addr, dst1, dst2)

`legalize_pair_addr` is a no-op for in-range BaseOffset and for
PreIndex / PostIndex (the latter is exclusive to the prologue /
epilogue, which retain their own split logic). For out-of-range
BaseOffset it emits `add X16, base, #offset` and rewrites the
addr to `[X16]`.

Scratch register choice: X16 (AAPCS64 IP0). Never in the
allocator palette, never used by other codegen helpers as a
*data* shuttle (they use x9–x11). The legalization convention is
documented next to the helper: X16 is clobbered iff the
legalizer fires; callers must not rely on X16 being alive past
the emit call. In practice every site follows the pattern
"compute source addr (may use X16) → load into X9/X10 →
legalize destination addr (may reuse X16) → store" — by the
time the destination's legalization runs, the source's use of
X16 is dead.

Migrated body emit sites:
  cc/arch/aarch64/codegen.rs:
    - emit_int128_move_to_stack (3 sites — the int128 store
      pattern that actually triggered the CI failure)
    - emit_int128_imm_store (1 site)
    - emit_load int128 path (2 sites)
    - emit_cbr int128 path (1 site)
    - emit_return int128 lowering (1 site)
    - int128 GP-pair arg storage (1 site)
  cc/arch/aarch64/expression.rs:
    - load_int128, store_int128 (2 sites)
  cc/arch/aarch64/call.rs:
    - int128 call-arg setup (2 sites)

Left as raw push_lir, all justified:
  - Prologue/epilogue PreIndex / PostIndex stp/ldp — already
    handle their own large-frame splits via emit_sub_sp_imm /
    emit_add_sp_imm.
  - zero_stack_frame — only enters the stp branch when offsets
    fit (lines guard `max_stp_offset <= 504`).
  - Callee-saved save/restore — offsets bounded by the callee-
    saved set (≤288 bytes max).
  - Sites where the address is `MemAddr::Base(X16)` after
    emit_load_addr — no offset to legalize.

FP pair helpers (emit_stp_fp_legalized / emit_ldp_fp_legalized)
intentionally not introduced — every current StpFp / LdpFp site
is bounded. A code comment marks where to add them when an FP
pair instruction grows a body emission with possibly-large
offset.

Verified:
- cc test suite 914+948+204 pass on x86_64 host
- cargo build / clippy / fmt clean
- aarch64 verification deferred to macOS CI

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cc/arch/aarch64/call.rs       |  19 +--
 cc/arch/aarch64/codegen.rs    | 286 ++++++++++++++++++++++++++--------
 cc/arch/aarch64/expression.rs |  14 +-
 3 files changed, 229 insertions(+), 90 deletions(-)

diff --git a/cc/arch/aarch64/call.rs b/cc/arch/aarch64/call.rs
index 0e7b73cd..1bf22998 100644
--- a/cc/arch/aarch64/call.rs
+++ b/cc/arch/aarch64/call.rs
@@ -245,12 +245,12 @@ impl Aarch64CodeGen {
                         Loc::Stack(offset) => {
                             // Load lo/hi from int128 stack slot into two consecutive regs
                             let mem = self.stack_mem(offset);
-                            self.push_lir(Aarch64Inst::Ldp {
-                                size: OperandSize::B64,
-                                addr: mem,
-                                dst1: int_arg_regs[int_arg_idx],
-                                dst2: int_arg_regs[int_arg_idx + 1],
-                            });
+                            self.emit_ldp_legalized(
+                                OperandSize::B64,
+                                mem,
+                                int_arg_regs[int_arg_idx],
+                                int_arg_regs[int_arg_idx + 1],
+                            );
                         }
                         Loc::Imm(v) => {
                             let lo = v as u64 as i64;
@@ -321,12 +321,7 @@ impl Aarch64CodeGen {
                 match loc {
                     Loc::Stack(src_off) => {
                         let mem = self.stack_mem(src_off);
-                        self.push_lir(Aarch64Inst::Ldp {
-                            size: OperandSize::B64,
-                            addr: mem,
-                            dst1: Reg::X9,
-                            dst2: Reg::X10,
-                        });
+                        self.emit_ldp_legalized(OperandSize::B64, mem, Reg::X9, Reg::X10);
                     }
                     Loc::Imm(v) => {
                         let lo = v as u64 as i64;
diff --git a/cc/arch/aarch64/codegen.rs b/cc/arch/aarch64/codegen.rs
index b78e27cf..4261142d 100644
--- a/cc/arch/aarch64/codegen.rs
+++ b/cc/arch/aarch64/codegen.rs
@@ -515,6 +515,195 @@ impl Aarch64CodeGen {
         }
     }
 
+    // ========================================================================
+    // Pair-addressing legalization (stp / ldp / stpfp / ldpfp)
+    // ========================================================================
+    //
+    // stp/ldp accept signed 7-bit immediate offsets scaled by element
+    // size:
+    //   B64 / Double : [-512,  504] step 8
+    //   B32 / Single : [-256,  252] step 4
+    //   Quad (128b)  : [-1024, 1008] step 16
+    //
+    // A deep stack frame (large alloca, int128-heavy locals, many
+    // spills) routinely overflows these. The pre-fix codegen blindly
+    // emitted out-of-range offsets and the assembler rejected them
+    // ("index must be a multiple of 8 in range [-512, 504]").
+    //
+    // Every body-emitted pair instruction that takes a `BaseOffset`
+    // routes through `emit_{stp,ldp,stp_fp,ldp_fp}_legalized`. The
+    // legalizer:
+    //   * leaves in-range offsets untouched (zero overhead);
+    //   * materializes out-of-range addresses into the `X16` scratch
+    //     register and rewrites the addr to `[X16]`.
+    //
+    // `X16` is AAPCS64 IP0 — linker scratch, never in the allocator
+    // palette, and never used by other codegen helpers as a *data*
+    // shuttle (they use x9–x11). Reserving it specifically for
+    // address materialization keeps the scratch convention clean.
+    //
+    // `PreIndex` / `PostIndex` addresses are NOT legalized here. They
+    // appear only in the prologue/epilogue, which already handles its
+    // own large-frame split (see `emit_prologue` / `emit_epilogue`).
+
+    /// stp/ldp signed-7-bit-scaled immediate range for `size`.
+    /// Returns `(min, max, step)` in bytes.
+    #[inline]
+    fn pair_offset_range(size: OperandSize) -> (i32, i32, i32) {
+        match size {
+            OperandSize::B32 => (-256, 252, 4),
+            _ => (-512, 504, 8),
+        }
+    }
+
+    /// True if `offset` fits the stp/ldp encoding for `size`.
+    #[inline]
+    fn pair_offset_fits(offset: i32, size: OperandSize) -> bool {
+        let (min, max, step) = Self::pair_offset_range(size);
+        offset >= min && offset <= max && offset % step == 0
+    }
+
+    /// Emit `dst = base + offset`, picking the cheapest encoding:
+    ///   * add/sub with 12-bit immediate (single instruction);
+    ///   * two add/sub when the offset fits within 2 × 12-bit;
+    ///   * fall back to `mov dst, imm; add dst, base, dst` for the
+    ///     extreme tail.
+    ///
+    /// `dst` must be a register the caller is free to clobber — the
+    /// pair-legalization path passes X16.
+    fn emit_add_offset(&mut self, dst: Reg, base: Reg, offset: i32) {
+        const MAX_IMM12: i32 = 4095;
+        if offset == 0 {
+            self.push_lir(Aarch64Inst::Mov {
+                size: OperandSize::B64,
+                src: GpOperand::Reg(base),
+                dst,
+            });
+            return;
+        }
+        let positive = offset >= 0;
+        let abs = offset.unsigned_abs() as i64;
+        if abs <= MAX_IMM12 as i64 {
+            self.push_lir(if positive {
+                Aarch64Inst::Add {
+                    size: OperandSize::B64,
+                    src1: base,
+                    src2: GpOperand::Imm(abs),
+                    dst,
+                }
+            } else {
+                Aarch64Inst::Sub {
+                    size: OperandSize::B64,
+                    src1: base,
+                    src2: GpOperand::Imm(abs),
+                    dst,
+                }
+            });
+        } else if abs <= 2 * MAX_IMM12 as i64 {
+            // Two-step: first to `dst`, then chain to `dst`.
+            self.push_lir(if positive {
+                Aarch64Inst::Add {
+                    size: OperandSize::B64,
+                    src1: base,
+                    src2: GpOperand::Imm(MAX_IMM12 as i64),
+                    dst,
+                }
+            } else {
+                Aarch64Inst::Sub {
+                    size: OperandSize::B64,
+                    src1: base,
+                    src2: GpOperand::Imm(MAX_IMM12 as i64),
+                    dst,
+                }
+            });
+            self.push_lir(if positive {
+                Aarch64Inst::Add {
+                    size: OperandSize::B64,
+                    src1: dst,
+                    src2: GpOperand::Imm(abs - MAX_IMM12 as i64),
+                    dst,
+                }
+            } else {
+                Aarch64Inst::Sub {
+                    size: OperandSize::B64,
+                    src1: dst,
+                    src2: GpOperand::Imm(abs - MAX_IMM12 as i64),
+                    dst,
+                }
+            });
+        } else {
+            self.emit_mov_imm(dst, offset as i64, 64);
+            self.push_lir(Aarch64Inst::Add {
+                size: OperandSize::B64,
+                src1: base,
+                src2: GpOperand::Reg(dst),
+                dst,
+            });
+        }
+    }
+
+    /// If `addr` is a `BaseOffset` with an out-of-range offset,
+    /// materialize `base + offset` into X16 and rewrite the address
+    /// to `[X16]`. Otherwise returns `addr` unchanged.
+    ///
+    /// Convention: X16 is clobbered iff legalization fires. Callers
+    /// must not rely on X16 being alive past the emit_*_legalized
+    /// call. In practice this is fine — the pattern is always
+    /// "compute source addr (may use X16) → load into X9/X10 →
+    /// legalize destination addr (may reuse X16) → store" — the
+    /// source's use of X16 is dead by the time the destination's
+    /// legalization runs.
+    fn legalize_pair_addr(&mut self, size: OperandSize, addr: MemAddr) -> MemAddr {
+        if let MemAddr::BaseOffset { base, offset } = addr {
+            if !Self::pair_offset_fits(offset, size) {
+                self.emit_add_offset(Reg::X16, base, offset);
+                return MemAddr::Base(Reg::X16);
+            }
+        }
+        addr
+    }
+
+    /// Emit `stp src1, src2, addr` with pair-address legalization.
+    pub(super) fn emit_stp_legalized(
+        &mut self,
+        size: OperandSize,
+        src1: Reg,
+        src2: Reg,
+        addr: MemAddr,
+    ) {
+        let addr = self.legalize_pair_addr(size, addr);
+        self.push_lir(Aarch64Inst::Stp {
+            size,
+            src1,
+            src2,
+            addr,
+        });
+    }
+
+    /// Emit `ldp dst1, dst2, addr` with pair-address legalization.
+    pub(super) fn emit_ldp_legalized(
+        &mut self,
+        size: OperandSize,
+        addr: MemAddr,
+        dst1: Reg,
+        dst2: Reg,
+    ) {
+        let addr = self.legalize_pair_addr(size, addr);
+        self.push_lir(Aarch64Inst::Ldp {
+            size,
+            addr,
+            dst1,
+            dst2,
+        });
+    }
+
+    // FP pair legalization helpers (emit_{stp,ldp}_fp_legalized) are
+    // intentionally absent — every current StpFp/LdpFp site emits
+    // either callee-saved save/restore (offset bounded by the small
+    // callee-saved set: ≤288 bytes) or prologue PreIndex (handled by
+    // its own large-frame split). Add them the moment an FP pair
+    // instruction needs body-emission with a possibly-large offset.
+
     /// Zero-initialize the local variable area of the stack frame.
     /// This ensures all stack slots start as zero, so narrow writes (8/16/32-bit)
     /// leave zero in the unwritten upper bytes.
@@ -898,12 +1087,12 @@ impl Aarch64CodeGen {
                                 if let Some(Loc::Stack(offset)) = self.locations.get_ref(pseudo.id)
                                 {
                                     if *offset < 0 {
-                                        self.push_lir(Aarch64Inst::Stp {
-                                            size: OperandSize::B64,
-                                            src1: arg_regs[int_arg_idx],
-                                            src2: arg_regs[int_arg_idx + 1],
-                                            addr: self.stack_mem(*offset),
-                                        });
+                                        self.emit_stp_legalized(
+                                            OperandSize::B64,
+                                            arg_regs[int_arg_idx],
+                                            arg_regs[int_arg_idx + 1],
+                                            self.stack_mem(*offset),
+                                        );
                                     }
                                 }
                             }
@@ -1119,12 +1308,7 @@ impl Aarch64CodeGen {
                 let loc = self.get_location(src);
                 if let Loc::Stack(offset) = loc {
                     let mem = self.stack_mem(offset);
-                    self.push_lir(Aarch64Inst::Ldp {
-                        size: OperandSize::B64,
-                        addr: mem,
-                        dst1: Reg::X0,
-                        dst2: Reg::X1,
-                    });
+                    self.emit_ldp_legalized(OperandSize::B64, mem, Reg::X0, Reg::X1);
                 } else {
                     // Fallback: load lo half to X0, zero X1
                     self.emit_move(src, Reg::X0, 64);
@@ -1264,12 +1448,12 @@ impl Aarch64CodeGen {
                 if insn.size >= 128 {
                     // 128-bit: load both halves and ORR them to check for non-zero
                     let (_, scratch1, _) = Reg::scratch_regs();
-                    self.push_lir(Aarch64Inst::Ldp {
-                        size: OperandSize::B64,
-                        addr: self.stack_mem(*offset),
-                        dst1: scratch0,
-                        dst2: scratch1,
-                    });
+                    self.emit_ldp_legalized(
+                        OperandSize::B64,
+                        self.stack_mem(*offset),
+                        scratch0,
+                        scratch1,
+                    );
                     self.push_lir(Aarch64Inst::Orr {
                         size: OperandSize::B64,
                         src1: scratch0,
@@ -2064,18 +2248,8 @@ impl Aarch64CodeGen {
                 // Stack-to-stack copy: load lo/hi via LDP, store via STP
                 let src_mem = self.stack_mem(src_offset);
                 let dst_mem = self.stack_mem(dst_offset);
-                self.push_lir(Aarch64Inst::Ldp {
-                    size: OperandSize::B64,
-                    addr: src_mem,
-                    dst1: Reg::X9,
-                    dst2: Reg::X10,
-                });
-                self.push_lir(Aarch64Inst::Stp {
-                    size: OperandSize::B64,
-                    src1: Reg::X9,
-                    src2: Reg::X10,
-                    addr: dst_mem,
-                });
+                self.emit_ldp_legalized(OperandSize::B64, src_mem, Reg::X9, Reg::X10);
+                self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::X10, dst_mem);
             }
             Loc::Imm(v) => {
                 let lo = v as u64 as i64;
@@ -2083,23 +2257,13 @@ impl Aarch64CodeGen {
                 self.emit_mov_imm(Reg::X9, lo, 64);
                 self.emit_mov_imm(Reg::X10, hi, 64);
                 let dst_mem = self.stack_mem(dst_offset);
-                self.push_lir(Aarch64Inst::Stp {
-                    size: OperandSize::B64,
-                    src1: Reg::X9,
-                    src2: Reg::X10,
-                    addr: dst_mem,
-                });
+                self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::X10, dst_mem);
             }
             _ => {
                 // For other locations, load as 64-bit and zero-extend
                 self.emit_move(src, Reg::X9, 64);
                 let dst_mem = self.stack_mem(dst_offset);
-                self.push_lir(Aarch64Inst::Stp {
-                    size: OperandSize::B64,
-                    src1: Reg::X9,
-                    src2: Reg::Xzr,
-                    addr: dst_mem,
-                });
+                self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::Xzr, dst_mem);
             }
         }
     }
@@ -2208,18 +2372,13 @@ impl Aarch64CodeGen {
             if let Loc::Stack(dst_offset) = dst_loc {
                 match self.compute_mem_addr(addr, insn.offset, Reg::X16) {
                     ComputedAddr::Direct(mem_addr) | ComputedAddr::WithSetup(mem_addr) => {
-                        self.push_lir(Aarch64Inst::Ldp {
-                            size: OperandSize::B64,
-                            addr: mem_addr,
-                            dst1: Reg::X9,
-                            dst2: Reg::X10,
-                        });
-                        self.push_lir(Aarch64Inst::Stp {
-                            size: OperandSize::B64,
-                            src1: Reg::X9,
-                            src2: Reg::X10,
-                            addr: self.stack_mem(dst_offset),
-                        });
+                        self.emit_ldp_legalized(OperandSize::B64, mem_addr, Reg::X9, Reg::X10);
+                        self.emit_stp_legalized(
+                            OperandSize::B64,
+                            Reg::X9,
+                            Reg::X10,
+                            self.stack_mem(dst_offset),
+                        );
                     }
                     ComputedAddr::Global(name) => {
                         self.emit_load_addr(&name, Reg::X16);
@@ -2229,12 +2388,12 @@ impl Aarch64CodeGen {
                             dst1: Reg::X9,
                             dst2: Reg::X10,
                         });
-                        self.push_lir(Aarch64Inst::Stp {
-                            size: OperandSize::B64,
-                            src1: Reg::X9,
-                            src2: Reg::X10,
-                            addr: self.stack_mem(dst_offset),
-                        });
+                        self.emit_stp_legalized(
+                            OperandSize::B64,
+                            Reg::X9,
+                            Reg::X10,
+                            self.stack_mem(dst_offset),
+                        );
                     }
                 }
             }
@@ -2585,12 +2744,7 @@ impl Aarch64CodeGen {
 
         match self.compute_mem_addr(addr, insn.offset, Reg::X16) {
             ComputedAddr::Direct(mem_addr) | ComputedAddr::WithSetup(mem_addr) => {
-                self.push_lir(Aarch64Inst::Stp {
-                    size: OperandSize::B64,
-                    src1: Reg::X9,
-                    src2: Reg::X10,
-                    addr: mem_addr,
-                });
+                self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::X10, mem_addr);
             }
             ComputedAddr::Global(name) => {
                 self.emit_load_addr(&name, Reg::X16);
diff --git a/cc/arch/aarch64/expression.rs b/cc/arch/aarch64/expression.rs
index 216554a2..46cb8440 100644
--- a/cc/arch/aarch64/expression.rs
+++ b/cc/arch/aarch64/expression.rs
@@ -448,12 +448,7 @@ impl Aarch64CodeGen {
         match loc {
             Loc::Stack(offset) => {
                 let mem = self.stack_mem(offset);
-                self.push_lir(Aarch64Inst::Ldp {
-                    size: OperandSize::B64,
-                    addr: mem,
-                    dst1: lo_reg,
-                    dst2: hi_reg,
-                });
+                self.emit_ldp_legalized(OperandSize::B64, mem, lo_reg, hi_reg);
             }
             Loc::Imm(v) => {
                 let lo = v as u64 as i64;
@@ -478,12 +473,7 @@ impl Aarch64CodeGen {
         let dst_loc = self.get_location(target);
         if let Loc::Stack(offset) = dst_loc {
             let mem = self.stack_mem(offset);
-            self.push_lir(Aarch64Inst::Stp {
-                size: OperandSize::B64,
-                src1: lo_reg,
-                src2: hi_reg,
-                addr: mem,
-            });
+            self.emit_stp_legalized(OperandSize::B64, lo_reg, hi_reg, mem);
         }
     }