From ecff1dbf9cbad1bdf424d27d0e2cbcf8c3b0b10a Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 31 May 2026 03:47:08 +0000 Subject: [PATCH 1/4] cc: M6+M7 chordal coloring + Belady splitting (x86_64) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace x86_64's linear-scan allocator with MCS-ordered greedy coloring on the SSA interference graph (Hack/Braun 2009) plus Belady next-use eviction when greedy fails. Aarch64 still on M5 linear-scan; port follows. Shared infrastructure in cc/arch/regalloc.rs: - InterferenceGraph using BTreeMap/BTreeSet for deterministic iteration (the design rule established in the determinism work) - build_interference_graph with per-instruction backward walk, def-before-remove + def-vs-live edges; an add_def_src_edges parameter so GP gets the cmov pattern edge and XMM doesn't (forcing def/src to differ over-constrains FP and the linker loses information) - mcs_ordering + greedy_color + ColoringResult - compute_use_positions + next_use_distance for Belady x86_64 backend (cc/arch/x86_64/regalloc.rs): - run_chordal_color replaces run_linear_scan - color_gp_bank: def_src_edges=true, hard-forbid caller-saved for cross-call ranges, soft-prefer callee-saved in loops - color_xmm_bank: def_src_edges=false, spill-on-fail (Belady GP-only for now — XMM splitting is a follow-up) - expire_old_intervals deleted Codegen fixes — chordal coloring exposed every codegen helper that used Xmm0 as an implicit scratch. Linear scan happened to keep Xmm0 cold; chordal coloring assigns it freely, so the clobbers became real miscompilations. Symptom: _Py_dg_strtod infinite loop on subnormal floats (e.g. float('9e-324')) — the correction loop's intermediate doubles got silently destroyed. Switched implicit-scratch sites from Xmm0 (allocatable) to Xmm15/Xmm14 (reserved scratch, same pattern as R10/R11 on the GP side): - emit_fp_binop, emit_fp_neg, emit_fp_compare, emit_int_to_float, emit_float_to_int, emit_float_to_float, emit_fp_const_load (cc/arch/x86_64/float.rs) - emit_select_fp (cc/arch/x86_64/codegen.rs — earlier fix) - regular FP copy fallback in codegen.rs This reserves 2 of 16 XMM regs from the allocator's palette. A follow-up milestone (per-instruction scratch-clobber constraints, shared with inline asm) can free them back. Verified: - cc test suite 914+948+204 pass - clippy clean - CPython 3.12.9 -O2 449/449 (40,817 individual tests, 30m23s) - float('9e-324') returns 1e-323 correctly Co-Authored-By: Claude Opus 4.7 (1M context) --- cc/arch/regalloc.rs | 322 ++++++++++++++++++++++++++++ cc/arch/x86_64/codegen.rs | 34 ++- cc/arch/x86_64/float.rs | 96 +++++---- cc/arch/x86_64/regalloc.rs | 419 +++++++++++++++++++++++++++---------- 4 files changed, 716 insertions(+), 155 deletions(-) diff --git a/cc/arch/regalloc.rs b/cc/arch/regalloc.rs index a64eb9b4..61315b3e 100644 --- a/cc/arch/regalloc.rs +++ b/cc/arch/regalloc.rs @@ -796,6 +796,328 @@ pub fn spill_gp_args_across_calls>, + /// All vertices, including isolated ones (no edges). + pub vertices: std::collections::BTreeSet, +} + +impl InterferenceGraph { + pub fn new() -> Self { + Self { + edges: BTreeMap::new(), + vertices: std::collections::BTreeSet::new(), + } + } + + pub fn add_vertex(&mut self, p: PseudoId) { + self.vertices.insert(p); + self.edges.entry(p).or_default(); + } + + pub fn add_edge(&mut self, a: PseudoId, b: PseudoId) { + if a == b { + return; + } + self.add_vertex(a); + self.add_vertex(b); + self.edges.entry(a).or_default().insert(b); + self.edges.entry(b).or_default().insert(a); + } + + pub fn neighbors(&self, p: PseudoId) -> impl Iterator + '_ { + self.edges + .get(&p) + .into_iter() + .flat_map(|s| s.iter().copied()) + } +} + +/// Build the interference graph for pseudos in `candidates`. +/// +/// Walks each basic block backward maintaining the current live set. +/// For each instruction, the new defs interfere with everything that +/// is live just after the instruction (i.e. before the backward step +/// removes them); the def-vs-live edges must be added BEFORE removing +/// the def from `live`, or edges between same-block adjacent def/use +/// pairs are silently dropped. +/// +/// Additionally, each def interferes with the instruction's own +/// sources. Some codegen lowerings (notably x86_64 `cmov` for ternary +/// `(cond) ? a : b`) materialize the target into a register and then +/// read source values, which would clobber a source that shares a +/// register with the target. The def-vs-src edge enforces disjoint +/// registers in that case at the cost of an occasional extra move +/// when the codegen would have allowed sharing. Net effect on .text +/// size is small; without these edges, register allocation silently +/// corrupts cmov/conditional-store patterns. +pub fn build_interference_graph( + candidates: &std::collections::BTreeSet, + func: &Function, + live_out: &[HashSet], + add_def_src_edges: bool, +) -> InterferenceGraph { + let mut graph = InterferenceGraph::new(); + for &c in candidates { + graph.add_vertex(c); + } + for (idx, block) in func.blocks.iter().enumerate() { + let mut live: std::collections::BTreeSet = live_out[idx] + .iter() + .copied() + .filter(|p| candidates.contains(p)) + .collect(); + for insn in block.insns.iter().rev() { + // Collect defs of this instruction. + let mut defs: Vec = Vec::new(); + if let Some(target) = insn.target { + if candidates.contains(&target) { + defs.push(target); + } + } + if insn.op == Opcode::Asm { + if let Some(asm) = &insn.asm_data { + for output in &asm.outputs { + if candidates.contains(&output.pseudo) { + defs.push(output.pseudo); + } + } + } + } + // Each def interferes with everything currently live AND + // with the instruction's other defs AND with each src + // (for the lowering-correctness reason described above). + for &d in &defs { + for &l in live.iter() { + if l != d { + graph.add_edge(d, l); + } + } + for &d2 in &defs { + if d != d2 { + graph.add_edge(d, d2); + } + } + if add_def_src_edges { + for &src in &insn.src { + if src != d && candidates.contains(&src) { + graph.add_edge(d, src); + } + } + } + } + // Remove defs from live (they were born here; not live before). + for &d in &defs { + live.remove(&d); + } + // Add uses (srcs are live coming INTO this instruction). + for &src in &insn.src { + if candidates.contains(&src) { + live.insert(src); + } + } + if let Some(indirect) = insn.indirect_target { + if candidates.contains(&indirect) { + live.insert(indirect); + } + } + if insn.op == Opcode::Asm { + if let Some(asm) = &insn.asm_data { + for input in &asm.inputs { + if candidates.contains(&input.pseudo) { + live.insert(input.pseudo); + } + } + } + } + } + } + graph +} + +/// Maximum Cardinality Search (MCS) ordering. The reverse of this +/// ordering is a perfect elimination ordering when the graph is +/// chordal; greedy coloring in MCS order yields an optimal coloring. +/// +/// Algorithm: start with all weights at 0. Repeatedly pick an +/// unprocessed vertex with maximum weight (ties broken by smallest +/// pseudo id for determinism), add it to the ordering, increment +/// weight of each unprocessed neighbor. +pub fn mcs_ordering(graph: &InterferenceGraph) -> Vec { + let mut weight: BTreeMap = BTreeMap::new(); + for &v in &graph.vertices { + weight.insert(v, 0); + } + let mut order: Vec = Vec::with_capacity(graph.vertices.len()); + let mut remaining: std::collections::BTreeSet = graph.vertices.clone(); + while !remaining.is_empty() { + // Pick max-weight (tie-break: smaller PseudoId wins). + let pick = *remaining + .iter() + .max_by(|a, b| { + let wa = weight.get(*a).copied().unwrap_or(0); + let wb = weight.get(*b).copied().unwrap_or(0); + wa.cmp(&wb).then(b.0.cmp(&a.0)) + }) + .unwrap(); + remaining.remove(&pick); + order.push(pick); + for n in graph.neighbors(pick) { + if remaining.contains(&n) { + *weight.entry(n).or_insert(0) += 1; + } + } + } + order +} + +/// Result of greedy coloring for one register bank. +pub struct ColoringResult { + pub colors: BTreeMap, + pub spilled: Vec, +} + +/// Greedy color pseudos in `order` using registers from `palette`. +/// +/// * `pre_colored` — forced color assignments (e.g. ABI-pinned args). +/// These are honored unchanged. +/// * `forbidden` — per-vertex sets of colors that MUST NOT be assigned +/// (correctness constraint, e.g. x86_64 cross-call pseudos forbidden +/// from caller-saved, or pseudos live across `idiv` forbidden from +/// RAX/RDX). +/// * `preferred_palette` — biases the search order for each vertex +/// (e.g. prefer callee-saved for in-loop pseudos). Returns the +/// subset to try first; the full `palette` is searched as a +/// fallback. +/// +/// Vertices that cannot be colored land in `spilled`. +pub fn greedy_color( + graph: &InterferenceGraph, + order: &[PseudoId], + palette: &[R], + pre_colored: &BTreeMap, + forbidden: &BTreeMap>, + preferred_palette: PrefFn, +) -> ColoringResult +where + R: Copy + Eq + Ord + Hash, + PrefFn: Fn(PseudoId) -> Option>, +{ + let mut colors: BTreeMap = pre_colored.clone(); + let mut spilled: Vec = Vec::new(); + let empty: std::collections::BTreeSet = std::collections::BTreeSet::new(); + for &v in order { + if colors.contains_key(&v) { + continue; + } + let used: std::collections::BTreeSet = graph + .neighbors(v) + .filter_map(|n| colors.get(&n).copied()) + .collect(); + let forbid = forbidden.get(&v).unwrap_or(&empty); + let prefer = preferred_palette(v); + let pick = prefer + .as_ref() + .and_then(|p| { + p.iter() + .find(|r| !used.contains(r) && !forbid.contains(r)) + .copied() + }) + .or_else(|| { + palette + .iter() + .find(|r| !used.contains(r) && !forbid.contains(r)) + .copied() + }); + if let Some(r) = pick { + colors.insert(v, r); + } else { + spilled.push(v); + } + } + ColoringResult { colors, spilled } +} + +/// For each pseudo, the sorted positions at which it is used (read). +/// M7 Belady uses this to compute "next-use distance" — when register +/// pressure forces a spill, the pseudo with the furthest next use is +/// the cheapest to evict (reloading later costs less than reloading +/// sooner). +pub fn compute_use_positions(func: &Function) -> BTreeMap> { + let mut uses: BTreeMap> = BTreeMap::new(); + let mut pos = 0usize; + for block in &func.blocks { + for insn in &block.insns { + for &src in &insn.src { + uses.entry(src).or_default().push(pos); + } + if let Some(indirect) = insn.indirect_target { + uses.entry(indirect).or_default().push(pos); + } + if insn.op == Opcode::Asm { + if let Some(asm) = &insn.asm_data { + for input in &asm.inputs { + uses.entry(input.pseudo).or_default().push(pos); + } + } + } + pos += 1; + } + } + uses +} + +/// Distance from position `p` to the next use of `pseudo`, or +/// `usize::MAX` if there is no remaining use. +pub fn next_use_distance( + uses: &BTreeMap>, + pseudo: PseudoId, + p: usize, +) -> usize { + let Some(positions) = uses.get(&pseudo) else { + return usize::MAX; + }; + let idx = positions.partition_point(|&q| q <= p); + if idx < positions.len() { + positions[idx] - p + } else { + usize::MAX + } +} + /// Try to reuse a previously freed stack slot of the given size and alignment. /// Uses block-level interference check to verify the candidate pseudo doesn't /// overlap with the slot's previous owner. diff --git a/cc/arch/x86_64/codegen.rs b/cc/arch/x86_64/codegen.rs index e58d82fc..c74795d9 100644 --- a/cc/arch/x86_64/codegen.rs +++ b/cc/arch/x86_64/codegen.rs @@ -3115,7 +3115,17 @@ impl X86_64CodeGen { } } - /// Emit FP select using conditional branch (CMov doesn't work on XMM regs) + /// Emit FP select using conditional branch (CMov doesn't work on XMM regs). + /// + /// Uses XMM15 as scratch for the merged value. XMM15 is documented as + /// codegen-reserved (see `XmmReg::allocatable` — it returns xmm0–xmm13, + /// leaving xmm14 and xmm15 out of the allocator's palette specifically + /// so codegen helpers like this one can use them without coordinating + /// with the allocator. Using xmm0 here clobbers any live xmm0-allocated + /// pseudo (chordal coloring may legitimately assign xmm0 to a pseudo + /// whose interval doesn't cross a call) and is the classic + /// silent-corruption case: the value-loss only manifests in + /// downstream computations, often as infinite loops or wrong results. fn emit_select_fp( &mut self, cond: PseudoId, @@ -3133,8 +3143,8 @@ impl X86_64CodeGen { match &cond_loc { Loc::Imm(v) => { let val = if *v != 0 { then_val } else { else_val }; - self.emit_fp_move(val, XmmReg::Xmm0, size); - self.emit_fp_move_from_xmm(XmmReg::Xmm0, &dst_loc, size); + self.emit_fp_move(val, XmmReg::Xmm15, size); + self.emit_fp_move_from_xmm(XmmReg::Xmm15, &dst_loc, size); return; } Loc::Stack(offset) => { @@ -3167,17 +3177,17 @@ impl X86_64CodeGen { cc: CondCode::Ne, target: then_label.clone(), }); - // Else branch: load else_val - self.emit_fp_move(else_val, XmmReg::Xmm0, size); + // Else branch: load else_val into the reserved scratch xmm15. + self.emit_fp_move(else_val, XmmReg::Xmm15, size); self.push_lir(X86Inst::Jmp { target: done_label.clone(), }); - // Then branch: load then_val + // Then branch: load then_val into xmm15. self.push_lir(X86Inst::Directive(Directive::BlockLabel(then_label))); - self.emit_fp_move(then_val, XmmReg::Xmm0, size); - // Done + self.emit_fp_move(then_val, XmmReg::Xmm15, size); + // Done: move xmm15 → dst. self.push_lir(X86Inst::Directive(Directive::BlockLabel(done_label))); - self.emit_fp_move_from_xmm(XmmReg::Xmm0, &dst_loc, size); + self.emit_fp_move_from_xmm(XmmReg::Xmm15, &dst_loc, size); } /// Emit integer select using CMOVcc @@ -3306,10 +3316,12 @@ impl X86_64CodeGen { let dst_addr = self.get_x87_mem_addr(dst); self.push_lir(X86Inst::X87Store { addr: dst_addr }); } else { - // Handle regular FP copy (float/double) + // Handle regular FP copy (float/double). + // Reserved scratch when target lives on the stack (see + // emit_fp_binop in float.rs for the full rationale). let dst_xmm = match &dst_loc { Loc::Xmm(x) => *x, - _ => XmmReg::Xmm0, + _ => XmmReg::Xmm15, }; // Use type-aware size for FP operations diff --git a/cc/arch/x86_64/float.rs b/cc/arch/x86_64/float.rs index a7aa41d3..ca224bea 100644 --- a/cc/arch/x86_64/float.rs +++ b/cc/arch/x86_64/float.rs @@ -287,9 +287,20 @@ impl X86_64CodeGen { }; let dst_loc = self.get_location(target); + // When the target lives on the stack, we need an XMM scratch + // register to perform the binop. Must be one outside the + // allocator's palette (xmm0-xmm13) so we don't clobber a live + // pseudo. Use xmm15 — xmm14 is used as the fallback scratch + // elsewhere in this file when dst_xmm == xmm15. + // + // Pre-fix: this used Xmm0, which silently clobbered chordal- + // allocated FP pseudos whose intervals didn't cross a call + // (e.g., short-lived temporaries in _Py_dg_strtod's + // correction loop). Manifested as infinite loops where the + // computation went off the rails. let dst_xmm = match &dst_loc { Loc::Xmm(x) => *x, - _ => XmmReg::Xmm0, // Use XMM0 as work register + _ => XmmReg::Xmm15, }; // Use type-aware FP size determination let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types); @@ -422,9 +433,12 @@ impl X86_64CodeGen { }; let dst_loc = self.get_location(target); + // See emit_fp_binop above: fallback must be a reserved XMM (xmm14 + // or xmm15) so we don't clobber any chordal-allocated pseudo when + // the target lives on the stack. let dst_xmm = match &dst_loc { Loc::Xmm(x) => *x, - _ => XmmReg::Xmm0, + _ => XmmReg::Xmm15, }; // Use type-aware FP size determination let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types); @@ -503,30 +517,27 @@ impl X86_64CodeGen { let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types); let move_size = Self::size_from_type(insn.typ, insn.size, types); - // Check if src2 is in Xmm0 before we clobber it with src1. - // If so, move src2 to Xmm15 first to avoid losing its value. - let src2_loc = self.get_location(src2); - if matches!(src2_loc, Loc::Xmm(XmmReg::Xmm0)) { - self.push_lir(X86Inst::MovFp { - size: fp_size, - src: XmmOperand::Reg(XmmReg::Xmm0), - dst: XmmOperand::Reg(XmmReg::Xmm15), - }); - } + // Use Xmm15 as the work register for src1 (Xmm15/Xmm14 are + // reserved scratch — not in the allocator palette). src2 cannot + // be in Xmm15 (allocator never picks it), so no aliasing check + // is needed. + // + // Pre-fix: this used Xmm0 as the work register and Xmm15 as the + // src2-aliasing escape hatch. Under chordal coloring, any FP + // pseudo could be allocated to Xmm0, so loading src1 into Xmm0 + // silently clobbered live values. - // Load first operand to XMM0 - self.emit_fp_move(src1, XmmReg::Xmm0, move_size); + // Load first operand to XMM15 + self.emit_fp_move(src1, XmmReg::Xmm15, move_size); // Compare with second operand using ucomiss/ucomisd - // Re-read src2 location; if it was Xmm0, it's now in Xmm15. let src2_loc = self.get_location(src2); match src2_loc { Loc::Xmm(x) => { - let actual_reg = if x == XmmReg::Xmm0 { XmmReg::Xmm15 } else { x }; self.push_lir(X86Inst::UComiFp { size: fp_size, - src: XmmOperand::Reg(actual_reg), - dst: XmmReg::Xmm0, + src: XmmOperand::Reg(x), + dst: XmmReg::Xmm15, }); } Loc::Stack(offset) => { @@ -537,23 +548,23 @@ impl X86_64CodeGen { base: Reg::Rbp, offset: -adjusted, }), - dst: XmmReg::Xmm0, + dst: XmmReg::Xmm15, }); } Loc::FImm(v, _) => { - self.emit_fp_imm_to_xmm(v, XmmReg::Xmm15, move_size); + self.emit_fp_imm_to_xmm(v, XmmReg::Xmm14, move_size); self.push_lir(X86Inst::UComiFp { size: fp_size, - src: XmmOperand::Reg(XmmReg::Xmm15), - dst: XmmReg::Xmm0, + src: XmmOperand::Reg(XmmReg::Xmm14), + dst: XmmReg::Xmm15, }); } _ => { - self.emit_fp_move(src2, XmmReg::Xmm15, move_size); + self.emit_fp_move(src2, XmmReg::Xmm14, move_size); self.push_lir(X86Inst::UComiFp { size: fp_size, - src: XmmOperand::Reg(XmmReg::Xmm15), - dst: XmmReg::Xmm0, + src: XmmOperand::Reg(XmmReg::Xmm14), + dst: XmmReg::Xmm15, }); } } @@ -660,9 +671,10 @@ impl X86_64CodeGen { }; let dst_loc = self.get_location(target); + // Reserved scratch when target lives on the stack (see emit_fp_binop). let dst_xmm = match &dst_loc { Loc::Xmm(x) => *x, - _ => XmmReg::Xmm0, + _ => XmmReg::Xmm15, }; let fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types); @@ -793,11 +805,11 @@ impl X86_64CodeGen { None => return, }; - // Move float to XMM0 using type-aware source FP size + // Move float to XMM15 (reserved scratch — see emit_fp_binop). let fp_size = FpSize::from_type_or_bits(insn.src_typ, insn.src_size, types); self.emit_fp_move( src, - XmmReg::Xmm0, + XmmReg::Xmm15, Self::size_from_type(insn.src_typ, insn.src_size, types), ); @@ -821,7 +833,7 @@ impl X86_64CodeGen { self.push_lir(X86Inst::CvtFpToInt { fp_size, int_size, - src: XmmOperand::Reg(XmmReg::Xmm0), + src: XmmOperand::Reg(XmmReg::Xmm15), dst: dst_reg, }); @@ -842,15 +854,24 @@ impl X86_64CodeGen { }; let dst_loc = self.get_location(target); + // Reserved scratch when target lives on the stack (see emit_fp_binop). let dst_xmm = match &dst_loc { Loc::Xmm(x) => *x, - _ => XmmReg::Xmm0, + _ => XmmReg::Xmm15, }; - // Move source to XMM0 + // Scratch for holding the source value before the conversion. + // Must not collide with dst_xmm; use Xmm14 when dst is Xmm15. + let src_xmm = if dst_xmm == XmmReg::Xmm15 { + XmmReg::Xmm14 + } else { + XmmReg::Xmm15 + }; + + // Move source to scratch self.emit_fp_move( src, - XmmReg::Xmm0, + src_xmm, Self::size_from_type(insn.src_typ, insn.src_size, types), ); @@ -864,7 +885,7 @@ impl X86_64CodeGen { self.push_lir(X86Inst::CvtFpFp { src_size: FpSize::Single, dst_size: FpSize::Double, - src: XmmReg::Xmm0, + src: src_xmm, dst: dst_xmm, }); } @@ -873,17 +894,17 @@ impl X86_64CodeGen { self.push_lir(X86Inst::CvtFpFp { src_size: FpSize::Double, dst_size: FpSize::Single, - src: XmmReg::Xmm0, + src: src_xmm, dst: dst_xmm, }); } _ => { // Same type or types unknown, just move if needed - if dst_xmm != XmmReg::Xmm0 { + if dst_xmm != src_xmm { let dst_fp_size = FpSize::from_type_or_bits(insn.typ, insn.size, types); self.push_lir(X86Inst::MovFp { size: dst_fp_size, - src: XmmOperand::Reg(XmmReg::Xmm0), + src: XmmOperand::Reg(src_xmm), dst: XmmOperand::Reg(dst_xmm), }); } @@ -902,9 +923,10 @@ impl X86_64CodeGen { /// Load a floating-point constant into an XMM register pub(super) fn emit_fp_const_load(&mut self, target: PseudoId, value: f64, size: u32) { let dst_loc = self.get_location(target); + // Reserved scratch when target lives on the stack (see emit_fp_binop). let dst_xmm = match &dst_loc { Loc::Xmm(x) => *x, - _ => XmmReg::Xmm0, + _ => XmmReg::Xmm15, }; self.emit_fp_imm_to_xmm(value, dst_xmm, size); diff --git a/cc/arch/x86_64/regalloc.rs b/cc/arch/x86_64/regalloc.rs index da4e8fdf..8bd0f6c0 100644 --- a/cc/arch/x86_64/regalloc.rs +++ b/cc/arch/x86_64/regalloc.rs @@ -39,8 +39,7 @@ // ============================================================================ use crate::arch::regalloc::{ - compute_live_intervals, expire_intervals, expire_stack_intervals, find_call_positions, - find_conflicting_registers, identify_addr_taken_syms, identify_fp_pseudos, + compute_live_intervals, find_call_positions, identify_addr_taken_syms, identify_fp_pseudos, interval_crosses_call, ConstraintPoint, FreeSlot, LiveInterval, LivenessResult, }; use crate::ir::{Function, Instruction, Opcode, PseudoId, PseudoKind}; @@ -52,7 +51,7 @@ use std::collections::{BTreeMap, HashMap, HashSet}; // ============================================================================ /// x86-64 physical registers -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Reg { // 64-bit general purpose registers Rax, @@ -300,7 +299,7 @@ pub fn get_constraint_info(insn: &Instruction) -> Option<(Vec, Vec Option<&LiveInterval> { + intervals.iter().find(|i| i.pseudo == p) + } + + /// M6 chordal coloring with stub M7 (spill-on-fail). + /// + /// Three phases: + /// 1. Pre-pass: route non-register-allocated pseudos to their + /// proper Locs (constants → Imm/FImm, Sym → stack slot, + /// int128 → 16-byte stack, long-double → x87 stack, FP + /// crossing call/block boundary → spill XMM caller-saved). + /// The remaining pseudos go into per-bank candidate sets. + /// 2. Color: per-bank chordal coloring. ABI-pinned args become + /// pre-colored vertices; constraint clobbers (idiv RAX/RDX, + /// shift Rcx, varargs RAX) and cross-call → caller-saved + /// become per-vertex forbidden colors. In-loop pseudos get a + /// callee-first preferred palette (soft, not forbidden). + /// 3. Commit: write Loc::Reg / Loc::Xmm for colored vertices, + /// allocate stack slots for spilled vertices, track + /// `used_callee_saved` for the prologue. + #[allow(clippy::too_many_arguments)] + fn run_chordal_color( &mut self, func: &Function, types: &TypeTable, @@ -1010,14 +1032,18 @@ impl RegAlloc { call_positions: &[usize], constraint_points: &[ConstraintPoint], ) { - for interval in intervals { - self.expire_old_intervals(interval.start); - + // -------- Phase 1: pre-pass -------- + let mut gp_candidates: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + let mut xmm_candidates: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + for interval in &intervals { if self.locations.contains_key(&interval.pseudo) { + // Already assigned by allocate_arguments / spill_args + // / alloca passes; arg pseudos become pre-colored + // graph vertices in Phase 2. continue; } - - // Handle constants and symbols if let Some(pseudo) = func.get_pseudo(interval.pseudo) { match &pseudo.kind { PseudoKind::Val(v) => { @@ -1042,7 +1068,6 @@ impl RegAlloc { if let Some(local_var) = func.locals.get(name) { let size = (types.size_bits(local_var.typ) / 8) as i32; let size = size.max(8); - // Determine alignment: explicit _Alignas takes precedence let natural_align = types.alignment(local_var.typ) as i32; let alignment = if let Some(explicit) = local_var.explicit_align { explicit as i32 @@ -1050,15 +1075,8 @@ impl RegAlloc { natural_align.max(8) }; let aligned_size = (size + alignment - 1) & !(alignment - 1); - - // Stack slot reuse uses block-level interference checks - // (live_in/live_out from dataflow fixpoint) which are correct - // for all CFG shapes. Addr-taken syms need stable addresses - // because symaddr-derived pointers may escape to callees. let reusable = !self.addr_taken_syms.contains(&interval.pseudo); - - self.alloc_stack_slot(&interval, aligned_size, alignment, reusable); - + self.alloc_stack_slot(interval, aligned_size, alignment, reusable); if types.is_float(local_var.typ) { self.fp_pseudos.insert(interval.pseudo); } @@ -1071,110 +1089,297 @@ impl RegAlloc { _ => {} } } - - // 128-bit integers always go on the stack (16 bytes, 16-byte aligned) if self.int128_pseudos.contains(&interval.pseudo) { - self.alloc_stack_slot(&interval, 16, 16, true); + self.alloc_stack_slot(interval, 16, 16, true); continue; } - - // Allocate register based on type let needs_fp = self.fp_pseudos.contains(&interval.pseudo); - let crosses_call = interval_crosses_call(&interval, call_positions); - - // Find registers that conflict due to constraints (e.g., Rax/Rdx for division) - let conflicting_regs = find_conflicting_registers(&interval, constraint_points); - if needs_fp { - // Long double uses x87 FPU (stack-based), not XMM registers. - // Always allocate to stack with 16-byte slots for 80-bit extended precision. let is_longdouble = self.ld_pseudos.contains(&interval.pseudo); - // FP pseudos that live across basic block boundaries must be - // spilled to stack. Phase D's interval collapse is unreliable - // for multi-block lifetimes (same issue as stack coloring), - // so XMM registers would be prematurely freed and reused. + let crosses_call = interval_crosses_call(interval, call_positions); let crosses_block = self.live_out.iter().any(|lo| lo.contains(&interval.pseudo)); if is_longdouble { - // Long double needs 16 bytes (80-bit padded to 128-bit) - self.alloc_stack_slot(&interval, 16, 16, false); - } else if crosses_call || crosses_block { - // All XMM registers are caller-saved on x86-64 SysV ABI. - // Values live across function calls or block boundaries - // must be spilled to stack. - self.alloc_stack_slot(&interval, 8, 8, true); - } else if let Some(xmm) = self.free_xmm_regs.pop() { - self.locations.insert(interval.pseudo, Loc::Xmm(xmm)); - let pos = self - .active_xmm - .partition_point(|(i, _)| i.end <= interval.end); - self.active_xmm.insert(pos, (interval.clone(), xmm)); - } else { - self.alloc_stack_slot(&interval, 8, 8, true); + self.alloc_stack_slot(interval, 16, 16, false); + continue; } - } else if crosses_call || interval.in_loop { - // Interval crosses a call or is inside a loop - must use callee-saved register or spill - // For loops without calls, this prevents values from being clobbered by register pressure - // Also exclude registers that conflict with constraints - if let Some(idx) = self - .free_regs - .iter() - .position(|r| r.is_callee_saved() && !conflicting_regs.contains(r)) - { - let reg = self.free_regs.remove(idx); - if !self.used_callee_saved.contains(®) { - self.used_callee_saved.push(reg); - } - self.locations.insert(interval.pseudo, Loc::Reg(reg)); - let pos = self.active.partition_point(|(i, _)| i.end <= interval.end); - self.active.insert(pos, (interval.clone(), reg)); - } else { - // No callee-saved registers available - spill to stack - self.alloc_stack_slot(&interval, 8, 8, true); + if crosses_call || crosses_block { + self.alloc_stack_slot(interval, 8, 8, true); + continue; } + xmm_candidates.insert(interval.pseudo); } else { - // Interval doesn't appear to cross a call. In functions WITH - // calls, prefer callee-saved registers to guard against - // under-estimated live intervals from complex control flow - // (e.g., switch/goto dispatch). In functions WITHOUT calls, - // prefer caller-saved to minimize callee-saved usage. - let reg_opt = if let Some(idx) = self - .free_regs - .iter() - .position(|r| !r.is_callee_saved() && !conflicting_regs.contains(r)) - { - Some(self.free_regs.remove(idx)) - } else if let Some(idx) = self - .free_regs - .iter() - .position(|r| !conflicting_regs.contains(r)) - { - Some(self.free_regs.remove(idx)) - } else { - None - }; + gp_candidates.insert(interval.pseudo); + } + } - if let Some(reg) = reg_opt { - if reg.is_callee_saved() && !self.used_callee_saved.contains(®) { - self.used_callee_saved.push(reg); - } - self.locations.insert(interval.pseudo, Loc::Reg(reg)); - let pos = self.active.partition_point(|(i, _)| i.end <= interval.end); - self.active.insert(pos, (interval.clone(), reg)); + // -------- Phase 2: per-bank chordal coloring -------- + self.color_gp_bank( + func, + &intervals, + call_positions, + constraint_points, + &gp_candidates, + ); + self.color_xmm_bank(func, &intervals, &xmm_candidates); + } + + fn color_gp_bank( + &mut self, + func: &Function, + intervals: &[LiveInterval], + call_positions: &[usize], + constraint_points: &[ConstraintPoint], + gp_candidates: &std::collections::BTreeSet, + ) { + use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering}; + if gp_candidates.is_empty() { + return; + } + + // Pre-colored vertices: any pseudo already mapped to a GP reg + // (ABI-pinned args from allocate_arguments). Add them to the + // graph so live conflicts are respected. + let mut pre_colored: BTreeMap = BTreeMap::new(); + let mut all_vertices: std::collections::BTreeSet = gp_candidates.clone(); + for (&pid, loc) in self.locations.iter() { + if let Loc::Reg(r) = loc { + pre_colored.insert(pid, *r); + all_vertices.insert(pid); + } + } + + // GP coloring needs def-vs-src edges: some codegen lowerings + // (notably x86_64 `cmov` for ternary `(cond)?a:b`) materialize + // the target into a register and then read source values, + // which would clobber a source sharing the target's register. + let graph = build_interference_graph(&all_vertices, func, &self.live_out, true); + + // Forbidden colors. Three sources: + // (a) constraint clobbers (idiv RAX/RDX, etc.) — for pseudos + // live across the constraint that are NOT operands. + // (b) cross-call → all caller-saved registers (HARD + // constraint, NOT a preference: the call clobbers them + // and the value would be lost). + // (c) NB: in-loop is NOT a forbidden constraint, only a + // preference (soft) — see preferred_palette below. + let caller_saved: Vec = Reg::allocatable() + .iter() + .copied() + .filter(|r| !r.is_callee_saved()) + .collect(); + let mut forbidden: BTreeMap> = BTreeMap::new(); + for cp in constraint_points { + for interval in intervals { + if !gp_candidates.contains(&interval.pseudo) { + continue; + } + if interval.start > cp.position || cp.position > interval.end { + continue; + } + if cp.involved_pseudos.contains(&interval.pseudo) { + continue; + } + let entry = forbidden.entry(interval.pseudo).or_default(); + for &c in &cp.clobbers { + entry.insert(c); + } + } + } + let mut in_loop_set: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + for interval in intervals { + if !gp_candidates.contains(&interval.pseudo) { + continue; + } + if interval_crosses_call(interval, call_positions) { + let entry = forbidden.entry(interval.pseudo).or_default(); + for &r in &caller_saved { + entry.insert(r); + } + } else if interval.in_loop { + in_loop_set.insert(interval.pseudo); + } + } + + // Preferred palettes: in-loop → callee-saved first, else + // caller-saved first (to avoid touching callee-saved slots in + // the prologue/epilogue when not needed). + let caller_first: Vec = { + let mut v = caller_saved.clone(); + for &r in Reg::allocatable() { + if r.is_callee_saved() { + v.push(r); + } + } + v + }; + let callee_first: Vec = { + let mut v: Vec = Reg::allocatable() + .iter() + .copied() + .filter(|r| r.is_callee_saved()) + .collect(); + for &r in &caller_saved { + v.push(r); + } + v + }; + let caller_first_c = caller_first.clone(); + let callee_first_c = callee_first.clone(); + + let order = mcs_ordering(&graph); + let result = greedy_color( + &graph, + &order, + Reg::allocatable(), + &pre_colored, + &forbidden, + |v| { + if in_loop_set.contains(&v) { + Some(callee_first_c.clone()) } else { - // Reusable: short-lived, no call/loop crossing - self.alloc_stack_slot(&interval, 8, 8, true); + Some(caller_first_c.clone()) + } + }, + ); + + // -------- M7 Belady eviction -------- + // For each pseudo the greedy pass couldn't place, look at its + // colored neighbors. If a neighbor's next-use is STRICTLY + // further than the failing pseudo's, hand the neighbor's + // register to the failing pseudo and spill the neighbor — + // Belady's "evict the value used furthest in the future" rule + // applied at color-failure time. Eviction only succeeds when + // the neighbor's color is also legal for the failing pseudo + // (not in its forbidden set, not held by any OTHER neighbor). + let uses = crate::arch::regalloc::compute_use_positions(func); + let mut colors = result.colors; + let mut final_spilled: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + for spilled in result.spilled { + if colors.contains_key(&spilled) { + continue; + } + let interval = match Self::interval_by_pseudo(intervals, spilled) { + Some(i) => i, + None => { + final_spilled.insert(spilled); + continue; + } + }; + let empty = std::collections::BTreeSet::new(); + let forbid = forbidden.get(&spilled).unwrap_or(&empty); + let spilled_next = + crate::arch::regalloc::next_use_distance(&uses, spilled, interval.start); + // Snapshot the neighbor list before scanning; we mutate + // `colors` inside the inner search so the iteration set + // needs to be stable. + let neighbors: Vec = graph.neighbors(spilled).collect(); + let mut best_evict: Option<(PseudoId, Reg, usize)> = None; + for &n in &neighbors { + if pre_colored.contains_key(&n) { + // Don't evict ABI-pinned args. + continue; + } + let Some(&color) = colors.get(&n) else { + continue; + }; + if forbid.contains(&color) { + continue; + } + // Color is legal for `spilled` only if no OTHER + // neighbor already holds it. + let conflict = neighbors + .iter() + .any(|&m| m != n && colors.get(&m).copied() == Some(color)); + if conflict { + continue; } + let nd = crate::arch::regalloc::next_use_distance(&uses, n, interval.start); + if nd > spilled_next && best_evict.is_none_or(|(_, _, d)| nd > d) { + best_evict = Some((n, color, nd)); + } + } + if let Some((evicted, color, _)) = best_evict { + colors.remove(&evicted); + colors.insert(spilled, color); + final_spilled.insert(evicted); + } else { + final_spilled.insert(spilled); + } + } + + // -------- Phase 3: commit -------- + for (&pid, ®) in &colors { + if pre_colored.contains_key(&pid) { + continue; + } + self.locations.insert(pid, Loc::Reg(reg)); + if reg.is_callee_saved() && !self.used_callee_saved.contains(®) { + self.used_callee_saved.push(reg); + } + } + for &spilled in &final_spilled { + if self.locations.contains_key(&spilled) { + continue; + } + if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) { + self.alloc_stack_slot(interval, 8, 8, true); } } } - fn expire_old_intervals(&mut self, point: usize) { - // Expire integer register intervals - expire_intervals(&mut self.active, &mut self.free_regs, point); - // Expire XMM register intervals - expire_intervals(&mut self.active_xmm, &mut self.free_xmm_regs, point); - // Expire stack slot intervals for reuse - expire_stack_intervals(&mut self.active_stack, &mut self.free_stack_slots, point); + fn color_xmm_bank( + &mut self, + func: &Function, + intervals: &[LiveInterval], + xmm_candidates: &std::collections::BTreeSet, + ) { + use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering}; + if xmm_candidates.is_empty() { + return; + } + let mut pre_colored: BTreeMap = BTreeMap::new(); + let mut all_vertices: std::collections::BTreeSet = xmm_candidates.clone(); + for (&pid, loc) in self.locations.iter() { + if let Loc::Xmm(r) = loc { + pre_colored.insert(pid, *r); + all_vertices.insert(pid); + } + } + // XMM coloring does NOT need def-vs-src edges: SSE FP ops are + // three-operand at the IR level (target, src1, src2) and the + // codegen lowers them to either `movsd src1, dst; opsd src2, + // dst` (which is safe regardless of register sharing) or to + // AVX/SSE3-style three-operand forms. Adding def-vs-src edges + // here over-constrains coloring and corrupts FP-heavy code + // paths (e.g. _Py_dg_strtod's correction loop) in non-obvious + // ways. + let graph = build_interference_graph(&all_vertices, func, &self.live_out, false); + let forbidden: BTreeMap> = BTreeMap::new(); + let order = mcs_ordering(&graph); + let result = greedy_color( + &graph, + &order, + XmmReg::allocatable(), + &pre_colored, + &forbidden, + |_| None::>, + ); + for (&pid, ®) in &result.colors { + if pre_colored.contains_key(&pid) { + continue; + } + self.locations.insert(pid, Loc::Xmm(reg)); + } + for &spilled in &result.spilled { + if self.locations.contains_key(&spilled) { + continue; + } + if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) { + self.alloc_stack_slot(interval, 8, 8, true); + } + } } /// Compute live intervals, constraint points, and per-block liveness sets. From 0e2c3b202f1c3392648a99f5489acfe6dd8b7288 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 31 May 2026 03:57:50 +0000 Subject: [PATCH 2/4] cc: M6+M7 chordal coloring + Belady splitting (aarch64) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port the x86_64 chordal coloring to aarch64. Same shape as the x86_64 implementation: - run_chordal_color replaces run_linear_scan - color_gp_bank: def_src_edges=true (aarch64 csel has the same source-vs-target clobber concern as x86_64 cmov), hard-forbid caller-saved for cross-call ranges, soft-prefer callee-saved in loops - color_vreg_bank: def_src_edges=false, spill-on-fail (Belady GP-only; V-bank splitting follows the same plan as XMM) - Ord/PartialOrd added to Reg and VReg for BTreeMap keys (deterministic iteration — the design rule) aarch64 lacks x86_64's idiv/shift register constraints, so the constraint-point machinery sees no input today. The plumbing is kept identical to x86_64 so when the constraint system grows (M7.5 — per-instruction operand/clobber constraints, shared with inline asm), it lights up here without further structural change. Multi-reg call returns (complex, struct, union) and __int128 remain forced to stack slots in Phase 1, matching prior aarch64 behavior. cc/arch/regalloc.rs — delete now-unused expire_intervals, expire_stack_intervals, find_conflicting_registers. Both backends moved off the linear-scan core; these had no remaining callers. Known latent risk (not blocking, mirrors x86_64): inline libc calls emitted by cc/arch/aarch64/features.rs (__signbitf, fabs, signbit) bypass the allocator's call-position walk. Linear scan hid this by always pop'ing from the end of the palette (V31 first); chordal coloring will use V0 freely. CPython exercises this rarely enough that x86_64 tests pass; if aarch64 CI flags an FP-builtin failure, this is where to look. The constraint system milestone resolves it cleanly. Verified: - cc test suite 914+948+204 pass on x86_64 host - cargo build --release clean - cargo clippy --all-targets clean - cargo fmt --check clean - aarch64 behavior verification deferred to macOS CI (no local cross-compiler) Co-Authored-By: Claude Opus 4.7 (1M context) --- cc/arch/aarch64/regalloc.rs | 426 +++++++++++++++++++++++++----------- cc/arch/regalloc.rs | 96 -------- 2 files changed, 299 insertions(+), 223 deletions(-) diff --git a/cc/arch/aarch64/regalloc.rs b/cc/arch/aarch64/regalloc.rs index 818f152f..edf0fb26 100644 --- a/cc/arch/aarch64/regalloc.rs +++ b/cc/arch/aarch64/regalloc.rs @@ -36,8 +36,7 @@ // ============================================================================ use crate::arch::regalloc::{ - compute_live_intervals, expire_intervals, expire_stack_intervals, find_call_positions, - find_conflicting_registers, identify_addr_taken_syms, identify_fp_pseudos, + compute_live_intervals, find_call_positions, identify_addr_taken_syms, identify_fp_pseudos, interval_crosses_call, ConstraintPoint, FreeSlot, LiveInterval, LivenessResult, }; use crate::ir::{Function, Instruction, Opcode, PseudoId, PseudoKind}; @@ -49,7 +48,7 @@ use std::collections::{BTreeMap, HashMap, HashSet}; // ============================================================================ /// AArch64 physical registers -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Reg { // General purpose registers x0-x28 X0, @@ -270,7 +269,7 @@ impl Reg { // ============================================================================ /// AArch64 SIMD/FP registers (V0-V31, accessed as D0-D31 for double, S0-S31 for float) -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum VReg { V0, V1, @@ -655,14 +654,11 @@ pub struct SpilledArg { pub struct RegAlloc { /// Mapping from pseudo to location locations: HashMap, - /// Free registers + /// Free GP registers (used by argument pre-allocation and the + /// spill-args helper; the chordal coloring core ignores it). free_regs: Vec, - /// Free FP registers + /// Free FP registers (same role as `free_regs` for the V bank). free_fp_regs: Vec, - /// Active intervals (sorted by end point) - active: Vec<(LiveInterval, Reg)>, - /// Active FP intervals (sorted by end point) - active_fp: Vec<(LiveInterval, VReg)>, /// Next stack slot offset stack_offset: i32, /// Callee-saved registers that were used @@ -693,8 +689,6 @@ impl RegAlloc { locations: HashMap::new(), free_regs: Reg::allocatable().to_vec(), free_fp_regs: VReg::allocatable().to_vec(), - active: Vec::new(), - active_fp: Vec::new(), stack_offset: 0, used_callee_saved: Vec::new(), used_callee_saved_fp: Vec::new(), @@ -730,7 +724,7 @@ impl RegAlloc { self.spill_args_across_calls(func, &intervals, &call_positions); self.allocate_alloca_to_stack(func); - self.run_linear_scan(func, types, intervals, &call_positions, &constraint_points); + self.run_chordal_color(func, types, intervals, &call_positions, &constraint_points); crate::arch::regalloc::LocationMap::from(self.locations.clone()) } @@ -740,8 +734,6 @@ impl RegAlloc { self.locations.clear(); self.free_regs = Reg::allocatable().to_vec(); self.free_fp_regs = VReg::allocatable().to_vec(); - self.active.clear(); - self.active_fp.clear(); self.stack_offset = 0; self.used_callee_saved.clear(); self.used_callee_saved_fp.clear(); @@ -982,8 +974,24 @@ impl RegAlloc { } } - /// Run the linear scan allocation algorithm - fn run_linear_scan( + /// M6 chordal coloring + M7 Belady eviction, AArch64 mirror of the + /// x86_64 `run_chordal_color` in `cc/arch/x86_64/regalloc.rs`. + /// + /// Phases: + /// 1. Pre-pass: route non-register-allocated pseudos to their + /// proper Locs (constants → Imm/FImm, Sym → stack slot, + /// __int128 → 16-byte stack, multi-reg call returns → + /// stack). The remaining pseudos go into per-bank candidate + /// sets. + /// 2. Color: per-bank chordal coloring. ABI-pinned args become + /// pre-colored vertices; cross-call → caller-saved becomes a + /// per-vertex forbidden set. In-loop pseudos get a + /// callee-first preferred palette (soft, not forbidden). + /// 3. Commit: write Loc::Reg / Loc::VReg for colored vertices, + /// allocate stack slots for spilled vertices, track + /// `used_callee_saved` for the prologue. + #[allow(clippy::too_many_arguments)] + fn run_chordal_color( &mut self, func: &Function, types: &TypeTable, @@ -991,17 +999,18 @@ impl RegAlloc { call_positions: &[usize], constraint_points: &[ConstraintPoint], ) { - for interval in intervals { - self.expire_old_intervals(interval.start); - + // -------- Phase 1: pre-pass -------- + let mut gp_candidates: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + let mut vreg_candidates: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + for interval in &intervals { if self.locations.contains_key(&interval.pseudo) { + // Already assigned by allocate_arguments / spill_args + // / alloca passes; arg pseudos become pre-colored + // graph vertices in Phase 2. continue; } - - // Find registers that would conflict with constraints at this interval - let conflicting = find_conflicting_registers(&interval, constraint_points); - - // Handle constants and symbols if let Some(pseudo) = func.get_pseudo(interval.pseudo) { match &pseudo.kind { PseudoKind::Val(v) => { @@ -1026,23 +1035,14 @@ impl RegAlloc { if let Some(local) = func.locals.get(name) { let size = (types.size_bits(local.typ) / 8) as i32; let size = size.max(8); - // Determine alignment: explicit _Alignas takes precedence, - // otherwise use natural type alignment (16 for __int128, etc.) let natural_align = types.alignment(local.typ) as i32; let alignment = local .explicit_align .map(|a| a as i32) .unwrap_or(natural_align.max(8)); let aligned_size = (size + alignment - 1) & !(alignment - 1); - - // Stack slot reuse uses block-level interference checks - // (live_in/live_out from dataflow fixpoint) which are correct - // for all CFG shapes. Addr-taken syms need stable addresses - // because symaddr-derived pointers may escape to callees. let reusable = !self.addr_taken_syms.contains(&interval.pseudo); - - self.alloc_stack_slot(&interval, aligned_size, alignment, reusable); - + self.alloc_stack_slot(interval, aligned_size, alignment, reusable); if types.is_float(local.typ) { self.fp_pseudos.insert(interval.pseudo); } @@ -1056,7 +1056,7 @@ impl RegAlloc { } } - // Force 128-bit integers to 16-byte aligned stack slots (never in GP regs) + // __int128 → 16-byte stack slot, never in registers. let is_int128 = func.blocks.iter().any(|b| { b.insns.iter().any(|insn| { insn.typ @@ -1066,18 +1066,17 @@ impl RegAlloc { }) }); if is_int128 { - self.alloc_stack_slot(&interval, 16, 16, true); + self.alloc_stack_slot(interval, 16, 16, true); continue; } - // Force stack allocation for complex and struct return values from calls - // These types span multiple FP/GP registers and cannot fit in a single register + // Multi-register call returns (complex, struct, union) → + // stack. These span V0+V1 or X0+X1 and can't live in a + // single allocator vertex. let multi_reg_return_typ: Option = func.blocks.iter().find_map(|b| { b.insns.iter().find_map(|insn| { if insn.op == Opcode::Call && insn.target == Some(interval.pseudo) { if let Some(typ) = insn.typ { - // Complex types use V0+V1 (2 FP regs) - // Struct/union types may use multiple regs depending on size let kind = types.kind(typ); if types.is_complex(typ) || matches!(kind, TypeKind::Struct | TypeKind::Union) @@ -1091,112 +1090,285 @@ impl RegAlloc { } }) }); - if let Some(typ) = multi_reg_return_typ { let size = (types.size_bits(typ) / 8) as i32; let size = size.max(8); let alignment = types.alignment(typ) as i32; let aligned_size = (size + (alignment - 1)) & !(alignment - 1); - self.alloc_stack_slot(&interval, aligned_size, alignment, false); + self.alloc_stack_slot(interval, aligned_size, alignment, false); continue; } - // Allocate register based on type let needs_fp = self.fp_pseudos.contains(&interval.pseudo); - let crosses_call = interval_crosses_call(&interval, call_positions); - if needs_fp { - if crosses_call { - // FP interval crosses a call - must use callee-saved FP register or spill - if let Some(idx) = self.free_fp_regs.iter().position(|r| r.is_callee_saved()) { - let reg = self.free_fp_regs.remove(idx); - if !self.used_callee_saved_fp.contains(®) { - self.used_callee_saved_fp.push(reg); - } - self.locations.insert(interval.pseudo, Loc::VReg(reg)); - self.active_fp.push((interval.clone(), reg)); - self.active_fp.sort_by_key(|(i, _)| i.end); - } else { - self.alloc_stack_slot(&interval, 8, 8, true); - } - } else if let Some(reg) = self.free_fp_regs.pop() { - if reg.is_callee_saved() && !self.used_callee_saved_fp.contains(®) { - self.used_callee_saved_fp.push(reg); - } - self.locations.insert(interval.pseudo, Loc::VReg(reg)); - let pos = self - .active_fp - .partition_point(|(i, _)| i.end <= interval.end); - self.active_fp.insert(pos, (interval.clone(), reg)); + // FP cross-call/block → stack (avoids the chordal pass + // having to model V-bank cross-call eviction, which is + // not implemented yet; matches the x86_64 XMM policy). + let crosses_call = interval_crosses_call(interval, call_positions); + let crosses_block = self.live_out.iter().any(|lo| lo.contains(&interval.pseudo)); + if crosses_call || crosses_block { + self.alloc_stack_slot(interval, 8, 8, true); + continue; + } + vreg_candidates.insert(interval.pseudo); + } else { + gp_candidates.insert(interval.pseudo); + } + } + + // -------- Phase 2: per-bank chordal coloring -------- + self.color_gp_bank( + func, + &intervals, + call_positions, + constraint_points, + &gp_candidates, + ); + self.color_vreg_bank(func, &intervals, &vreg_candidates); + } + + fn color_gp_bank( + &mut self, + func: &Function, + intervals: &[LiveInterval], + call_positions: &[usize], + constraint_points: &[ConstraintPoint], + gp_candidates: &std::collections::BTreeSet, + ) { + use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering}; + if gp_candidates.is_empty() { + return; + } + + // Pre-colored vertices: any pseudo already mapped to a GP reg + // (ABI-pinned args from allocate_arguments). Add them to the + // graph so live conflicts are respected. + let mut pre_colored: BTreeMap = BTreeMap::new(); + let mut all_vertices: std::collections::BTreeSet = gp_candidates.clone(); + for (&pid, loc) in self.locations.iter() { + if let Loc::Reg(r) = loc { + pre_colored.insert(pid, *r); + all_vertices.insert(pid); + } + } + + // GP coloring needs def-vs-src edges: aarch64 `csel` for + // ternary `(cond)?a:b` materializes the target then reads + // source values, which would clobber a source sharing the + // target's register. Same pattern as x86_64's cmov. + let graph = build_interference_graph(&all_vertices, func, &self.live_out, true); + + // Forbidden colors: + // (a) constraint clobbers — for pseudos live across the + // constraint that are NOT operands. (AAPCS64 has no + // implicit clobbers like x86 idiv/shift, so this is + // a no-op today, but the plumbing matches x86_64.) + // (b) cross-call → all caller-saved (HARD; the call clobbers + // them and the value would be lost). + let caller_saved: Vec = Reg::allocatable() + .iter() + .copied() + .filter(|r| !r.is_callee_saved()) + .collect(); + let mut forbidden: BTreeMap> = BTreeMap::new(); + for cp in constraint_points { + for interval in intervals { + if !gp_candidates.contains(&interval.pseudo) { + continue; + } + if interval.start > cp.position || cp.position > interval.end { + continue; + } + if cp.involved_pseudos.contains(&interval.pseudo) { + continue; + } + let entry = forbidden.entry(interval.pseudo).or_default(); + for &c in &cp.clobbers { + entry.insert(c); + } + } + } + let mut in_loop_set: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + for interval in intervals { + if !gp_candidates.contains(&interval.pseudo) { + continue; + } + if interval_crosses_call(interval, call_positions) { + let entry = forbidden.entry(interval.pseudo).or_default(); + for &r in &caller_saved { + entry.insert(r); + } + } else if interval.in_loop { + in_loop_set.insert(interval.pseudo); + } + } + + let caller_first: Vec = { + let mut v = caller_saved.clone(); + for &r in Reg::allocatable() { + if r.is_callee_saved() { + v.push(r); + } + } + v + }; + let callee_first: Vec = { + let mut v: Vec = Reg::allocatable() + .iter() + .copied() + .filter(|r| r.is_callee_saved()) + .collect(); + for &r in &caller_saved { + v.push(r); + } + v + }; + let caller_first_c = caller_first.clone(); + let callee_first_c = callee_first.clone(); + + let order = mcs_ordering(&graph); + let result = greedy_color( + &graph, + &order, + Reg::allocatable(), + &pre_colored, + &forbidden, + |v| { + if in_loop_set.contains(&v) { + Some(callee_first_c.clone()) } else { - self.alloc_stack_slot(&interval, 8, 8, true); + Some(caller_first_c.clone()) } - } else if crosses_call { - // GP interval crosses a call - must use callee-saved register or spill - // Also exclude registers that conflict with constraints - if let Some(idx) = self - .free_regs + }, + ); + + // -------- M7 Belady eviction -------- + let uses = crate::arch::regalloc::compute_use_positions(func); + let mut colors = result.colors; + let mut final_spilled: std::collections::BTreeSet = + std::collections::BTreeSet::new(); + for spilled in result.spilled { + if colors.contains_key(&spilled) { + continue; + } + let interval = match Self::interval_by_pseudo(intervals, spilled) { + Some(i) => i, + None => { + final_spilled.insert(spilled); + continue; + } + }; + let empty = std::collections::BTreeSet::new(); + let forbid = forbidden.get(&spilled).unwrap_or(&empty); + let spilled_next = + crate::arch::regalloc::next_use_distance(&uses, spilled, interval.start); + let neighbors: Vec = graph.neighbors(spilled).collect(); + let mut best_evict: Option<(PseudoId, Reg, usize)> = None; + for &n in &neighbors { + if pre_colored.contains_key(&n) { + continue; + } + let Some(&color) = colors.get(&n) else { + continue; + }; + if forbid.contains(&color) { + continue; + } + let conflict = neighbors .iter() - .position(|r| r.is_callee_saved() && !conflicting.contains(r)) - { - let reg = self.free_regs.remove(idx); - if !self.used_callee_saved.contains(®) { - self.used_callee_saved.push(reg); - } - self.locations.insert(interval.pseudo, Loc::Reg(reg)); - let pos = self.active.partition_point(|(i, _)| i.end <= interval.end); - self.active.insert(pos, (interval.clone(), reg)); - } else { - // No callee-saved registers available - spill to stack - self.alloc_stack_slot(&interval, 8, 8, true); + .any(|&m| m != n && colors.get(&m).copied() == Some(color)); + if conflict { + continue; + } + let nd = crate::arch::regalloc::next_use_distance(&uses, n, interval.start); + if nd > spilled_next && best_evict.is_none_or(|(_, _, d)| nd > d) { + best_evict = Some((n, color, nd)); } + } + if let Some((evicted, color, _)) = best_evict { + colors.remove(&evicted); + colors.insert(spilled, color); + final_spilled.insert(evicted); } else { - // Interval doesn't cross a call - prefer caller-saved registers - // to minimize callee-saved register usage and stack conflicts - // Also exclude registers that conflict with constraints - let reg_opt = if let Some(idx) = self - .free_regs - .iter() - .position(|r| !r.is_callee_saved() && !conflicting.contains(r)) - { - Some(self.free_regs.remove(idx)) - } else if conflicting.is_empty() { - // No constraints - use pop() to maintain original allocation order - // (takes from end of list, which preserves existing behavior) - self.free_regs.pop() - } else if let Some(idx) = self - .free_regs - .iter() - .rposition(|r| !conflicting.contains(r)) - { - // With constraints - use rposition to prefer registers from end - // (like pop() but filtering out conflicting registers) - Some(self.free_regs.remove(idx)) - } else { - None - }; + final_spilled.insert(spilled); + } + } - if let Some(reg) = reg_opt { - if reg.is_callee_saved() && !self.used_callee_saved.contains(®) { - self.used_callee_saved.push(reg); - } - self.locations.insert(interval.pseudo, Loc::Reg(reg)); - let pos = self.active.partition_point(|(i, _)| i.end <= interval.end); - self.active.insert(pos, (interval.clone(), reg)); - } else { - self.alloc_stack_slot(&interval, 8, 8, true); - } + // -------- Phase 3: commit -------- + for (&pid, ®) in &colors { + if pre_colored.contains_key(&pid) { + continue; + } + self.locations.insert(pid, Loc::Reg(reg)); + if reg.is_callee_saved() && !self.used_callee_saved.contains(®) { + self.used_callee_saved.push(reg); + } + } + for &spilled in &final_spilled { + if self.locations.contains_key(&spilled) { + continue; + } + if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) { + self.alloc_stack_slot(interval, 8, 8, true); + } + } + } + + fn color_vreg_bank( + &mut self, + func: &Function, + intervals: &[LiveInterval], + vreg_candidates: &std::collections::BTreeSet, + ) { + use crate::arch::regalloc::{build_interference_graph, greedy_color, mcs_ordering}; + if vreg_candidates.is_empty() { + return; + } + let mut pre_colored: BTreeMap = BTreeMap::new(); + let mut all_vertices: std::collections::BTreeSet = vreg_candidates.clone(); + for (&pid, loc) in self.locations.iter() { + if let Loc::VReg(r) = loc { + pre_colored.insert(pid, *r); + all_vertices.insert(pid); + } + } + // V-bank coloring does NOT need def-vs-src edges: aarch64 FP + // ops are three-operand at the architectural level (fadd d0, + // d1, d2), so target and source can share a register without + // codegen workarounds. Same rationale as x86_64's XMM bank. + let graph = build_interference_graph(&all_vertices, func, &self.live_out, false); + let forbidden: BTreeMap> = BTreeMap::new(); + let order = mcs_ordering(&graph); + let result = greedy_color( + &graph, + &order, + VReg::allocatable(), + &pre_colored, + &forbidden, + |_| None::>, + ); + for (&pid, ®) in &result.colors { + if pre_colored.contains_key(&pid) { + continue; + } + self.locations.insert(pid, Loc::VReg(reg)); + if reg.is_callee_saved() && !self.used_callee_saved_fp.contains(®) { + self.used_callee_saved_fp.push(reg); + } + } + for &spilled in &result.spilled { + if self.locations.contains_key(&spilled) { + continue; + } + if let Some(interval) = Self::interval_by_pseudo(intervals, spilled) { + self.alloc_stack_slot(interval, 8, 8, true); } } } - fn expire_old_intervals(&mut self, point: usize) { - // Expire GP register intervals - expire_intervals(&mut self.active, &mut self.free_regs, point); - // Expire FP register intervals - expire_intervals(&mut self.active_fp, &mut self.free_fp_regs, point); - // Expire stack slot intervals for reuse - expire_stack_intervals(&mut self.active_stack, &mut self.free_stack_slots, point); + fn interval_by_pseudo(intervals: &[LiveInterval], p: PseudoId) -> Option<&LiveInterval> { + intervals.iter().find(|i| i.pseudo == p) } fn compute_live_intervals(&self, func: &Function) -> LivenessResult { diff --git a/cc/arch/regalloc.rs b/cc/arch/regalloc.rs index 61315b3e..a8512668 100644 --- a/cc/arch/regalloc.rs +++ b/cc/arch/regalloc.rs @@ -118,47 +118,6 @@ pub struct ConstraintPoint { // Common Functions // ============================================================================ -/// Expire old intervals from the active list, returning freed registers to the free list. -/// Generic over register type R (works with both GP and FP register types). -pub fn expire_intervals( - active: &mut Vec<(LiveInterval, R)>, - free_regs: &mut Vec, - point: usize, -) { - let mut to_remove = Vec::with_capacity(DEFAULT_SMALL_VEC_CAPACITY); - for (i, (interval, reg)) in active.iter().enumerate() { - if interval.end < point { - free_regs.push(*reg); - to_remove.push(i); - } - } - for i in to_remove.into_iter().rev() { - active.remove(i); - } -} - -/// Expire stack intervals whose live range ended before `point`, -/// returning their slots to the free list. -pub fn expire_stack_intervals( - active_stack: &mut Vec<(LiveInterval, i32, i32)>, - free_slots: &mut BTreeMap>, - point: usize, -) { - active_stack.retain(|(interval, offset, size)| { - if interval.end < point { - let alignment = if *size >= 16 { 16 } else { 8 }; - free_slots.entry(*size).or_default().push(FreeSlot { - offset: *offset, - alignment, - owner: interval.pseudo, - }); - false - } else { - true - } - }); -} - /// Find all positions of call instructions in a function. /// Used by spill_args_across_calls to identify where arguments may be clobbered. /// Includes Call, Longjmp, and Setjmp opcodes since they all invoke external functions @@ -187,61 +146,6 @@ pub fn interval_crosses_call(interval: &LiveInterval, call_positions: &[usize]) .any(|&call_pos| interval.start <= call_pos && call_pos <= interval.end) } -/// Find registers that would conflict with this interval due to constraints. -/// -/// A constraint point models an instruction that clobbers one or more -/// physical registers (e.g. `idivl` clobbers RAX and RDX). For any live -/// interval that overlaps such a point, we must keep that pseudo out of the -/// clobbered registers — otherwise its value would be silently destroyed. -/// -/// There is one exception: pseudos that are direct *operands* of the -/// constraining instruction (`involved_pseudos`) MAY occupy a clobbered -/// register, because the constraining instruction itself reads or writes -/// those registers as part of its semantics. The classic case is the -/// integer dividend, which must be in RAX/EAX for `idiv` to execute. -/// -/// Crucially, that "operand exemption" only holds when the operand's value -/// dies at the constraint point (`interval.end == cp.position`). When the -/// interval extends *past* the clobber (`interval.end > cp.position`), the -/// operand's value is still needed afterward, but the clobbering -/// instruction will have destroyed it. In that case the allocator must -/// pick a NON-clobbered register; codegen will materialize the value into -/// the operand-required register (e.g. RAX) with a move just before the -/// constraint, preserving the original pseudo's contents for later uses. -/// -/// Without this distinction, `Copy`/CSE/SCCP-class passes that extend a -/// pseudo's live range across a `mods.32`/`idivl` boundary silently -/// miscompile (see git history: prior copyprop attempts hung do-while- -/// continue tests by allocating the dividend to RAX with a use after the -/// idiv). -/// -/// Generic over register type R. -pub fn find_conflicting_registers( - interval: &LiveInterval, - constraint_points: &[ConstraintPoint], -) -> HashSet { - let mut conflicts = HashSet::with_capacity(DEFAULT_SMALL_VEC_CAPACITY); - - for cp in constraint_points { - // Use <= on both ends so an interval that exactly meets the - // constraint point still counts as overlapping. - if interval.start <= cp.position && cp.position <= interval.end { - let is_involved = cp.involved_pseudos.contains(&interval.pseudo); - let dies_at_point = interval.end == cp.position; - // The operand exemption only applies when the value is - // consumed AT the constraint point. If it must survive past - // the clobber, it has to live in a non-clobbered register. - if !is_involved || !dies_at_point { - for ® in &cp.clobbers { - conflicts.insert(reg); - } - } - } - } - - conflicts -} - /// Result of liveness analysis: intervals, constraint points, and per-block liveness sets. pub struct LivenessResult { pub intervals: Vec, From 27b8d51387df98d83b49a1efc7151f272878fe76 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 31 May 2026 06:18:15 +0000 Subject: [PATCH 3/4] cc: restore stack slot reuse lost in M6+M7 chordal port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit M6+M7 deleted expire_stack_intervals when it deleted the linear scan core, since neither chordal backend called it. That removed stack slot reuse: every alloc_stack_slot request now created a fresh slot because the free_stack_slots pool was never populated. On x86_64 this is wasteful but harmless — 32-bit addressing absorbs huge frames. On aarch64 it broke 16 int128 codegen tests on CI: stp/ldp accept signed 7-bit immediate offsets (range [-512, 504] in multiples of 8), and int128-heavy tests pushed offsets to #3472 (217 * 16 = 217 unique slots where 20-40 would have sufficed with reuse). Fix: re-introduce expire_stack_intervals in cc/arch/regalloc.rs and call it from both chordal allocators: - Phase 1: at the top of each iteration with interval.start. Intervals come pre-sorted by start position, so this monotonic sweep matches the previous linear-scan-era expiration shape. - Phase 3: with usize::MAX just before spill commits, draining all remaining slots so the chordal-spilled pseudos can reuse any non-interfering slot. The existing try_reuse_stack_slot uses interference checks (via pseudos_interfere over live_in/live_out from the dataflow fixpoint), so reuse is correct for any iteration order. Latent issue noted (not fixed here, separate concern): aarch64 codegen helpers like emit_int128_move_to_stack push raw stp/ldp without large-offset fallback. Even with reuse, frames close to the [-512, 504] threshold will still hit this. The fix is an add-scratch-then-stp pattern in the affected emitters; tracked for a follow-up rather than bundled into this regression fix. Verified: - cc test suite 914+948+204 pass on x86_64 host - cargo build / clippy / fmt clean - aarch64 verification deferred to macOS CI Co-Authored-By: Claude Opus 4.7 (1M context) --- cc/arch/aarch64/regalloc.rs | 25 +++++++++++++++++++++++++ cc/arch/regalloc.rs | 34 ++++++++++++++++++++++++++++++++++ cc/arch/x86_64/regalloc.rs | 23 +++++++++++++++++++++++ 3 files changed, 82 insertions(+) diff --git a/cc/arch/aarch64/regalloc.rs b/cc/arch/aarch64/regalloc.rs index edf0fb26..b27fd1d1 100644 --- a/cc/arch/aarch64/regalloc.rs +++ b/cc/arch/aarch64/regalloc.rs @@ -1005,6 +1005,17 @@ impl RegAlloc { let mut vreg_candidates: std::collections::BTreeSet = std::collections::BTreeSet::new(); for interval in &intervals { + // Intervals come pre-sorted by start position from + // compute_live_intervals, so this monotonic expiration + // recovers the linear-scan-era slot-reuse behavior the + // chordal sweep would otherwise lose. Without this, + // int128-heavy functions push stp/ldp offsets past + // aarch64's [-512, 504] immediate range. + crate::arch::regalloc::expire_stack_intervals( + &mut self.active_stack, + &mut self.free_stack_slots, + interval.start, + ); if self.locations.contains_key(&interval.pseudo) { // Already assigned by allocate_arguments / spill_args // / alloca passes; arg pseudos become pre-colored @@ -1305,6 +1316,13 @@ impl RegAlloc { self.used_callee_saved.push(reg); } } + // Drain any remaining active slots into the free pool so spill + // commits below can reuse them via interference checks. + crate::arch::regalloc::expire_stack_intervals( + &mut self.active_stack, + &mut self.free_stack_slots, + usize::MAX, + ); for &spilled in &final_spilled { if self.locations.contains_key(&spilled) { continue; @@ -1357,6 +1375,13 @@ impl RegAlloc { self.used_callee_saved_fp.push(reg); } } + // Same drain-then-reuse pattern as the GP commit; see comment + // on expire_stack_intervals in cc/arch/regalloc.rs. + crate::arch::regalloc::expire_stack_intervals( + &mut self.active_stack, + &mut self.free_stack_slots, + usize::MAX, + ); for &spilled in &result.spilled { if self.locations.contains_key(&spilled) { continue; diff --git a/cc/arch/regalloc.rs b/cc/arch/regalloc.rs index a8512668..426b4992 100644 --- a/cc/arch/regalloc.rs +++ b/cc/arch/regalloc.rs @@ -118,6 +118,40 @@ pub struct ConstraintPoint { // Common Functions // ============================================================================ +/// Release stack slots whose owning interval ended before `point` back +/// to the free-slot pool, where future `try_reuse_stack_slot` calls +/// can find them. +/// +/// Linear scan called this on every iteration with the next interval's +/// `start`. The chordal allocator (M6) calls it twice per bank: once +/// in Phase 1 sweeping monotonically by interval start, then once with +/// `usize::MAX` just before Phase 3 spill commits to drain everything +/// so spilled pseudos can reuse any non-interfering slot. +/// +/// Without this, every `alloc_stack_slot` request creates a fresh +/// slot. On x86_64 that's wasteful but harmless. On aarch64 it pushes +/// stp/ldp offsets past the architectural [-512, 504] immediate range +/// and the assembler rejects the output. +pub fn expire_stack_intervals( + active_stack: &mut Vec<(LiveInterval, i32, i32)>, + free_slots: &mut BTreeMap>, + point: usize, +) { + active_stack.retain(|(interval, offset, size)| { + if interval.end < point { + let alignment = if *size >= 16 { 16 } else { 8 }; + free_slots.entry(*size).or_default().push(FreeSlot { + offset: *offset, + alignment, + owner: interval.pseudo, + }); + false + } else { + true + } + }); +} + /// Find all positions of call instructions in a function. /// Used by spill_args_across_calls to identify where arguments may be clobbered. /// Includes Call, Longjmp, and Setjmp opcodes since they all invoke external functions diff --git a/cc/arch/x86_64/regalloc.rs b/cc/arch/x86_64/regalloc.rs index 8bd0f6c0..1b279fba 100644 --- a/cc/arch/x86_64/regalloc.rs +++ b/cc/arch/x86_64/regalloc.rs @@ -1038,6 +1038,15 @@ impl RegAlloc { let mut xmm_candidates: std::collections::BTreeSet = std::collections::BTreeSet::new(); for interval in &intervals { + // Intervals come pre-sorted by start position from + // compute_live_intervals, so this monotonic expiration + // recovers the linear-scan-era slot-reuse behavior the + // chordal sweep would otherwise lose. + crate::arch::regalloc::expire_stack_intervals( + &mut self.active_stack, + &mut self.free_stack_slots, + interval.start, + ); if self.locations.contains_key(&interval.pseudo) { // Already assigned by allocate_arguments / spill_args // / alloca passes; arg pseudos become pre-colored @@ -1319,6 +1328,13 @@ impl RegAlloc { self.used_callee_saved.push(reg); } } + // Drain any remaining active slots into the free pool so spill + // commits below can reuse them via interference checks. + crate::arch::regalloc::expire_stack_intervals( + &mut self.active_stack, + &mut self.free_stack_slots, + usize::MAX, + ); for &spilled in &final_spilled { if self.locations.contains_key(&spilled) { continue; @@ -1372,6 +1388,13 @@ impl RegAlloc { } self.locations.insert(pid, Loc::Xmm(reg)); } + // Same drain-then-reuse pattern as the GP commit; see comment + // on expire_stack_intervals in cc/arch/regalloc.rs. + crate::arch::regalloc::expire_stack_intervals( + &mut self.active_stack, + &mut self.free_stack_slots, + usize::MAX, + ); for &spilled in &result.spilled { if self.locations.contains_key(&spilled) { continue; From 6af088ebcf91ea279f2ca7eea76a0ff5c7b0cf7d Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 31 May 2026 06:32:47 +0000 Subject: [PATCH 4/4] cc: legalize out-of-range stp/ldp offsets in aarch64 codegen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aarch64 stp/ldp accept signed 7-bit immediate offsets scaled by element size: [-512, +504] step 8 for B64, [-256, +252] step 4 for B32. Body-emitted pair instructions (int128 moves, spilled arg stores, int128 call args) bypassed this constraint and emitted raw stp/ldp with whatever offset stack_mem returned. On deep frames the assembler rejected them ("index must be a multiple of 8 in range [-512, 504]"). The prologue/epilogue already had hand-rolled large-frame splits; the body sites did not. Centralized fix: a pair-address legalization helper on the Aarch64CodeGen. Three new internal methods, two new public emitters: fn pair_offset_fits(offset, size) -> bool fn emit_add_offset(dst, base, offset) fn legalize_pair_addr(size, addr) -> MemAddr pub fn emit_stp_legalized(size, src1, src2, addr) pub fn emit_ldp_legalized(size, addr, dst1, dst2) `legalize_pair_addr` is a no-op for in-range BaseOffset and for PreIndex / PostIndex (the latter is exclusive to the prologue / epilogue, which retain their own split logic). For out-of-range BaseOffset it emits `add X16, base, #offset` and rewrites the addr to `[X16]`. Scratch register choice: X16 (AAPCS64 IP0). Never in the allocator palette, never used by other codegen helpers as a *data* shuttle (they use x9–x11). The legalization convention is documented next to the helper: X16 is clobbered iff the legalizer fires; callers must not rely on X16 being alive past the emit call. In practice every site follows the pattern "compute source addr (may use X16) → load into X9/X10 → legalize destination addr (may reuse X16) → store" — by the time the destination's legalization runs, the source's use of X16 is dead. Migrated body emit sites: cc/arch/aarch64/codegen.rs: - emit_int128_move_to_stack (3 sites — the int128 store pattern that actually triggered the CI failure) - emit_int128_imm_store (1 site) - emit_load int128 path (2 sites) - emit_cbr int128 path (1 site) - emit_return int128 lowering (1 site) - int128 GP-pair arg storage (1 site) cc/arch/aarch64/expression.rs: - load_int128, store_int128 (2 sites) cc/arch/aarch64/call.rs: - int128 call-arg setup (2 sites) Left as raw push_lir, all justified: - Prologue/epilogue PreIndex / PostIndex stp/ldp — already handle their own large-frame splits via emit_sub_sp_imm / emit_add_sp_imm. - zero_stack_frame — only enters the stp branch when offsets fit (lines guard `max_stp_offset <= 504`). - Callee-saved save/restore — offsets bounded by the callee- saved set (≤288 bytes max). - Sites where the address is `MemAddr::Base(X16)` after emit_load_addr — no offset to legalize. FP pair helpers (emit_stp_fp_legalized / emit_ldp_fp_legalized) intentionally not introduced — every current StpFp / LdpFp site is bounded. A code comment marks where to add them when an FP pair instruction grows a body emission with possibly-large offset. Verified: - cc test suite 914+948+204 pass on x86_64 host - cargo build / clippy / fmt clean - aarch64 verification deferred to macOS CI Co-Authored-By: Claude Opus 4.7 (1M context) --- cc/arch/aarch64/call.rs | 19 +-- cc/arch/aarch64/codegen.rs | 286 ++++++++++++++++++++++++++-------- cc/arch/aarch64/expression.rs | 14 +- 3 files changed, 229 insertions(+), 90 deletions(-) diff --git a/cc/arch/aarch64/call.rs b/cc/arch/aarch64/call.rs index 0e7b73cd..1bf22998 100644 --- a/cc/arch/aarch64/call.rs +++ b/cc/arch/aarch64/call.rs @@ -245,12 +245,12 @@ impl Aarch64CodeGen { Loc::Stack(offset) => { // Load lo/hi from int128 stack slot into two consecutive regs let mem = self.stack_mem(offset); - self.push_lir(Aarch64Inst::Ldp { - size: OperandSize::B64, - addr: mem, - dst1: int_arg_regs[int_arg_idx], - dst2: int_arg_regs[int_arg_idx + 1], - }); + self.emit_ldp_legalized( + OperandSize::B64, + mem, + int_arg_regs[int_arg_idx], + int_arg_regs[int_arg_idx + 1], + ); } Loc::Imm(v) => { let lo = v as u64 as i64; @@ -321,12 +321,7 @@ impl Aarch64CodeGen { match loc { Loc::Stack(src_off) => { let mem = self.stack_mem(src_off); - self.push_lir(Aarch64Inst::Ldp { - size: OperandSize::B64, - addr: mem, - dst1: Reg::X9, - dst2: Reg::X10, - }); + self.emit_ldp_legalized(OperandSize::B64, mem, Reg::X9, Reg::X10); } Loc::Imm(v) => { let lo = v as u64 as i64; diff --git a/cc/arch/aarch64/codegen.rs b/cc/arch/aarch64/codegen.rs index b78e27cf..4261142d 100644 --- a/cc/arch/aarch64/codegen.rs +++ b/cc/arch/aarch64/codegen.rs @@ -515,6 +515,195 @@ impl Aarch64CodeGen { } } + // ======================================================================== + // Pair-addressing legalization (stp / ldp / stpfp / ldpfp) + // ======================================================================== + // + // stp/ldp accept signed 7-bit immediate offsets scaled by element + // size: + // B64 / Double : [-512, 504] step 8 + // B32 / Single : [-256, 252] step 4 + // Quad (128b) : [-1024, 1008] step 16 + // + // A deep stack frame (large alloca, int128-heavy locals, many + // spills) routinely overflows these. The pre-fix codegen blindly + // emitted out-of-range offsets and the assembler rejected them + // ("index must be a multiple of 8 in range [-512, 504]"). + // + // Every body-emitted pair instruction that takes a `BaseOffset` + // routes through `emit_{stp,ldp,stp_fp,ldp_fp}_legalized`. The + // legalizer: + // * leaves in-range offsets untouched (zero overhead); + // * materializes out-of-range addresses into the `X16` scratch + // register and rewrites the addr to `[X16]`. + // + // `X16` is AAPCS64 IP0 — linker scratch, never in the allocator + // palette, and never used by other codegen helpers as a *data* + // shuttle (they use x9–x11). Reserving it specifically for + // address materialization keeps the scratch convention clean. + // + // `PreIndex` / `PostIndex` addresses are NOT legalized here. They + // appear only in the prologue/epilogue, which already handles its + // own large-frame split (see `emit_prologue` / `emit_epilogue`). + + /// stp/ldp signed-7-bit-scaled immediate range for `size`. + /// Returns `(min, max, step)` in bytes. + #[inline] + fn pair_offset_range(size: OperandSize) -> (i32, i32, i32) { + match size { + OperandSize::B32 => (-256, 252, 4), + _ => (-512, 504, 8), + } + } + + /// True if `offset` fits the stp/ldp encoding for `size`. + #[inline] + fn pair_offset_fits(offset: i32, size: OperandSize) -> bool { + let (min, max, step) = Self::pair_offset_range(size); + offset >= min && offset <= max && offset % step == 0 + } + + /// Emit `dst = base + offset`, picking the cheapest encoding: + /// * add/sub with 12-bit immediate (single instruction); + /// * two add/sub when the offset fits within 2 × 12-bit; + /// * fall back to `mov dst, imm; add dst, base, dst` for the + /// extreme tail. + /// + /// `dst` must be a register the caller is free to clobber — the + /// pair-legalization path passes X16. + fn emit_add_offset(&mut self, dst: Reg, base: Reg, offset: i32) { + const MAX_IMM12: i32 = 4095; + if offset == 0 { + self.push_lir(Aarch64Inst::Mov { + size: OperandSize::B64, + src: GpOperand::Reg(base), + dst, + }); + return; + } + let positive = offset >= 0; + let abs = offset.unsigned_abs() as i64; + if abs <= MAX_IMM12 as i64 { + self.push_lir(if positive { + Aarch64Inst::Add { + size: OperandSize::B64, + src1: base, + src2: GpOperand::Imm(abs), + dst, + } + } else { + Aarch64Inst::Sub { + size: OperandSize::B64, + src1: base, + src2: GpOperand::Imm(abs), + dst, + } + }); + } else if abs <= 2 * MAX_IMM12 as i64 { + // Two-step: first to `dst`, then chain to `dst`. + self.push_lir(if positive { + Aarch64Inst::Add { + size: OperandSize::B64, + src1: base, + src2: GpOperand::Imm(MAX_IMM12 as i64), + dst, + } + } else { + Aarch64Inst::Sub { + size: OperandSize::B64, + src1: base, + src2: GpOperand::Imm(MAX_IMM12 as i64), + dst, + } + }); + self.push_lir(if positive { + Aarch64Inst::Add { + size: OperandSize::B64, + src1: dst, + src2: GpOperand::Imm(abs - MAX_IMM12 as i64), + dst, + } + } else { + Aarch64Inst::Sub { + size: OperandSize::B64, + src1: dst, + src2: GpOperand::Imm(abs - MAX_IMM12 as i64), + dst, + } + }); + } else { + self.emit_mov_imm(dst, offset as i64, 64); + self.push_lir(Aarch64Inst::Add { + size: OperandSize::B64, + src1: base, + src2: GpOperand::Reg(dst), + dst, + }); + } + } + + /// If `addr` is a `BaseOffset` with an out-of-range offset, + /// materialize `base + offset` into X16 and rewrite the address + /// to `[X16]`. Otherwise returns `addr` unchanged. + /// + /// Convention: X16 is clobbered iff legalization fires. Callers + /// must not rely on X16 being alive past the emit_*_legalized + /// call. In practice this is fine — the pattern is always + /// "compute source addr (may use X16) → load into X9/X10 → + /// legalize destination addr (may reuse X16) → store" — the + /// source's use of X16 is dead by the time the destination's + /// legalization runs. + fn legalize_pair_addr(&mut self, size: OperandSize, addr: MemAddr) -> MemAddr { + if let MemAddr::BaseOffset { base, offset } = addr { + if !Self::pair_offset_fits(offset, size) { + self.emit_add_offset(Reg::X16, base, offset); + return MemAddr::Base(Reg::X16); + } + } + addr + } + + /// Emit `stp src1, src2, addr` with pair-address legalization. + pub(super) fn emit_stp_legalized( + &mut self, + size: OperandSize, + src1: Reg, + src2: Reg, + addr: MemAddr, + ) { + let addr = self.legalize_pair_addr(size, addr); + self.push_lir(Aarch64Inst::Stp { + size, + src1, + src2, + addr, + }); + } + + /// Emit `ldp dst1, dst2, addr` with pair-address legalization. + pub(super) fn emit_ldp_legalized( + &mut self, + size: OperandSize, + addr: MemAddr, + dst1: Reg, + dst2: Reg, + ) { + let addr = self.legalize_pair_addr(size, addr); + self.push_lir(Aarch64Inst::Ldp { + size, + addr, + dst1, + dst2, + }); + } + + // FP pair legalization helpers (emit_{stp,ldp}_fp_legalized) are + // intentionally absent — every current StpFp/LdpFp site emits + // either callee-saved save/restore (offset bounded by the small + // callee-saved set: ≤288 bytes) or prologue PreIndex (handled by + // its own large-frame split). Add them the moment an FP pair + // instruction needs body-emission with a possibly-large offset. + /// Zero-initialize the local variable area of the stack frame. /// This ensures all stack slots start as zero, so narrow writes (8/16/32-bit) /// leave zero in the unwritten upper bytes. @@ -898,12 +1087,12 @@ impl Aarch64CodeGen { if let Some(Loc::Stack(offset)) = self.locations.get_ref(pseudo.id) { if *offset < 0 { - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: arg_regs[int_arg_idx], - src2: arg_regs[int_arg_idx + 1], - addr: self.stack_mem(*offset), - }); + self.emit_stp_legalized( + OperandSize::B64, + arg_regs[int_arg_idx], + arg_regs[int_arg_idx + 1], + self.stack_mem(*offset), + ); } } } @@ -1119,12 +1308,7 @@ impl Aarch64CodeGen { let loc = self.get_location(src); if let Loc::Stack(offset) = loc { let mem = self.stack_mem(offset); - self.push_lir(Aarch64Inst::Ldp { - size: OperandSize::B64, - addr: mem, - dst1: Reg::X0, - dst2: Reg::X1, - }); + self.emit_ldp_legalized(OperandSize::B64, mem, Reg::X0, Reg::X1); } else { // Fallback: load lo half to X0, zero X1 self.emit_move(src, Reg::X0, 64); @@ -1264,12 +1448,12 @@ impl Aarch64CodeGen { if insn.size >= 128 { // 128-bit: load both halves and ORR them to check for non-zero let (_, scratch1, _) = Reg::scratch_regs(); - self.push_lir(Aarch64Inst::Ldp { - size: OperandSize::B64, - addr: self.stack_mem(*offset), - dst1: scratch0, - dst2: scratch1, - }); + self.emit_ldp_legalized( + OperandSize::B64, + self.stack_mem(*offset), + scratch0, + scratch1, + ); self.push_lir(Aarch64Inst::Orr { size: OperandSize::B64, src1: scratch0, @@ -2064,18 +2248,8 @@ impl Aarch64CodeGen { // Stack-to-stack copy: load lo/hi via LDP, store via STP let src_mem = self.stack_mem(src_offset); let dst_mem = self.stack_mem(dst_offset); - self.push_lir(Aarch64Inst::Ldp { - size: OperandSize::B64, - addr: src_mem, - dst1: Reg::X9, - dst2: Reg::X10, - }); - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: Reg::X9, - src2: Reg::X10, - addr: dst_mem, - }); + self.emit_ldp_legalized(OperandSize::B64, src_mem, Reg::X9, Reg::X10); + self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::X10, dst_mem); } Loc::Imm(v) => { let lo = v as u64 as i64; @@ -2083,23 +2257,13 @@ impl Aarch64CodeGen { self.emit_mov_imm(Reg::X9, lo, 64); self.emit_mov_imm(Reg::X10, hi, 64); let dst_mem = self.stack_mem(dst_offset); - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: Reg::X9, - src2: Reg::X10, - addr: dst_mem, - }); + self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::X10, dst_mem); } _ => { // For other locations, load as 64-bit and zero-extend self.emit_move(src, Reg::X9, 64); let dst_mem = self.stack_mem(dst_offset); - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: Reg::X9, - src2: Reg::Xzr, - addr: dst_mem, - }); + self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::Xzr, dst_mem); } } } @@ -2208,18 +2372,13 @@ impl Aarch64CodeGen { if let Loc::Stack(dst_offset) = dst_loc { match self.compute_mem_addr(addr, insn.offset, Reg::X16) { ComputedAddr::Direct(mem_addr) | ComputedAddr::WithSetup(mem_addr) => { - self.push_lir(Aarch64Inst::Ldp { - size: OperandSize::B64, - addr: mem_addr, - dst1: Reg::X9, - dst2: Reg::X10, - }); - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: Reg::X9, - src2: Reg::X10, - addr: self.stack_mem(dst_offset), - }); + self.emit_ldp_legalized(OperandSize::B64, mem_addr, Reg::X9, Reg::X10); + self.emit_stp_legalized( + OperandSize::B64, + Reg::X9, + Reg::X10, + self.stack_mem(dst_offset), + ); } ComputedAddr::Global(name) => { self.emit_load_addr(&name, Reg::X16); @@ -2229,12 +2388,12 @@ impl Aarch64CodeGen { dst1: Reg::X9, dst2: Reg::X10, }); - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: Reg::X9, - src2: Reg::X10, - addr: self.stack_mem(dst_offset), - }); + self.emit_stp_legalized( + OperandSize::B64, + Reg::X9, + Reg::X10, + self.stack_mem(dst_offset), + ); } } } @@ -2585,12 +2744,7 @@ impl Aarch64CodeGen { match self.compute_mem_addr(addr, insn.offset, Reg::X16) { ComputedAddr::Direct(mem_addr) | ComputedAddr::WithSetup(mem_addr) => { - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: Reg::X9, - src2: Reg::X10, - addr: mem_addr, - }); + self.emit_stp_legalized(OperandSize::B64, Reg::X9, Reg::X10, mem_addr); } ComputedAddr::Global(name) => { self.emit_load_addr(&name, Reg::X16); diff --git a/cc/arch/aarch64/expression.rs b/cc/arch/aarch64/expression.rs index 216554a2..46cb8440 100644 --- a/cc/arch/aarch64/expression.rs +++ b/cc/arch/aarch64/expression.rs @@ -448,12 +448,7 @@ impl Aarch64CodeGen { match loc { Loc::Stack(offset) => { let mem = self.stack_mem(offset); - self.push_lir(Aarch64Inst::Ldp { - size: OperandSize::B64, - addr: mem, - dst1: lo_reg, - dst2: hi_reg, - }); + self.emit_ldp_legalized(OperandSize::B64, mem, lo_reg, hi_reg); } Loc::Imm(v) => { let lo = v as u64 as i64; @@ -478,12 +473,7 @@ impl Aarch64CodeGen { let dst_loc = self.get_location(target); if let Loc::Stack(offset) = dst_loc { let mem = self.stack_mem(offset); - self.push_lir(Aarch64Inst::Stp { - size: OperandSize::B64, - src1: lo_reg, - src2: hi_reg, - addr: mem, - }); + self.emit_stp_legalized(OperandSize::B64, lo_reg, hi_reg, mem); } }