sgl-project · jiapingW · Jun 13, 2026 · Jun 11, 2026
@@ -7,3 +7,11 @@ bash examples/<script-name>.sh [NUM_GPUS] [TP_SIZE]
 ```
 
 We use the ShareGPT dataset for all the examples for now, but you can replace it with more robust datasets such as perfectblend, magpie-qwen2.5-pro-1m-v0.1, etc.
+
+## D-PACE
+
+DFlash training also supports the D-PACE loss (Dynamic Position-Aware Cross-Entropy). Add `--loss-type dpace` (and optionally `--dpace-alpha`, a value in [0, 1] that defaults to 0.5) to any `scripts/train_dflash.py` command. `--loss-decay-gamma` only applies to the default `--loss-type dflash` and is ignored by every D-PACE variant. Two ablation variants are also available: `--loss-type dpace-cumulative-confidence-only` and `--loss-type dpace-continuation-value-only`. A ready-to-run example:
+
+```bash
+NUM_GPUS=8 DPACE_ALPHA=0.5 bash examples/run_qwen3_8b_dpace_online.sh
+```
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+ROOT_DIR=$(dirname "$SCRIPT_DIR")
+
+NUM_GPUS=${NUM_GPUS:-${1:-8}}
+ATTENTION_BACKEND=${ATTENTION_BACKEND:-${2:-flex_attention}}
+TARGET_MODEL_PATH=${TARGET_MODEL_PATH:-Qwen/Qwen3-8B}
+DRAFT_CONFIG_PATH=${DRAFT_CONFIG_PATH:-$ROOT_DIR/configs/qwen3-8b-dflash.json}
+TRAIN_DATA_PATH=${TRAIN_DATA_PATH:-$ROOT_DIR/cache/dataset/perfectblend_qwen3-8b_regen.jsonl}
+OUTPUT_DIR=${OUTPUT_DIR:-$ROOT_DIR/outputs/qwen3-8b-dpace}
+CACHE_DIR=${CACHE_DIR:-$ROOT_DIR/cache}
+DPACE_ALPHA=${DPACE_ALPHA:-0.5}
+
+export TORCHINDUCTOR_CACHE_DIR=${TORCHINDUCTOR_CACHE_DIR:-$ROOT_DIR/cache/compiled_kernels}
+export SPECFORGE_DATA_NUM_PROC=${SPECFORGE_DATA_NUM_PROC:-32}
+export PYTHONPATH="$ROOT_DIR:${PYTHONPATH:-}"
+
+torchrun \
+    --standalone \
+    --nproc_per_node "$NUM_GPUS" \
+    "$ROOT_DIR/scripts/train_dflash.py" \
+    --target-model-path "$TARGET_MODEL_PATH" \
+    --target-model-backend sglang \
+    --draft-config-path "$DRAFT_CONFIG_PATH" \
+    --train-data-path "$TRAIN_DATA_PATH" \
+    --output-dir "$OUTPUT_DIR" \
+    --cache-dir "$CACHE_DIR" \
+    --num-epochs 6 \
+    --batch-size 4 \
+    --learning-rate 6e-4 \
+    --warmup-ratio 0.04 \
+    --max-grad-norm 1.0 \
+    --max-length 3072 \
+    --chat-template qwen \
+    --attention-backend "$ATTENTION_BACKEND" \
+    --block-size 16 \
+    --num-draft-layers 1 \
+    --num-anchors 512 \
+    --loss-type dpace \
+    --dpace-alpha "$DPACE_ALPHA" \
+    --log-interval 50 \
+    --save-interval 1000 \
+    --report-to wandb \
+    --wandb-project "${WANDB_PROJECT:-dpace-qwen3-8b}" \
+    --wandb-name "${WANDB_NAME:-qwen3-8b-dpace}"
@@ -81,7 +81,26 @@ def parse_args():
         type=float,
         default=None,
         help="Gamma for exponential loss decay weighting (paper Eq.4). "
-        "Suggested: 7 for block_size=16, 5 for 10, 4 for 8. None disables.",
+        "Suggested: 7 for block_size=16, 5 for 10, 4 for 8. None disables. "
+        "Only applies when --loss-type dflash.",
+    )
+    model_group.add_argument(
+        "--loss-type",
+        type=str,
+        default="dflash",
+        choices=[
+            "dflash",
+            "dpace",
+            "dpace-cumulative-confidence-only",
+            "dpace-continuation-value-only",
+        ],
+        help=("Loss variant. Use dpace for Dynamic Position-Aware Cross-Entropy."),
+    )
+    model_group.add_argument(
+        "--dpace-alpha",
+        type=float,
+        default=0.5,
+        help="Smoothing alpha for D-PACE position weights.",
     )
     model_group.add_argument(
         "--embedding-key",
@@ -439,6 +458,8 @@ def main():
         attention_backend=args.attention_backend,
         num_anchors=args.num_anchors,
         loss_decay_gamma=args.loss_decay_gamma,
+        loss_type=args.loss_type,
+        dpace_alpha=args.dpace_alpha,
     )
 
     # Wrap each transformer block as its own FSDP unit so that all-gather /

@@ -19,6 +19,15 @@
     create_block_mask = None
 
 
+_VALID_LOSS_TYPES = {
+    "dflash",
+    "dpace",
+    "dpace-cumulative-confidence-only",
+    "dpace-continuation-value-only",
+}
+_DPACE_LOSS_TYPES = _VALID_LOSS_TYPES - {"dflash"}
+
+
 def create_dflash_sdpa_mask(anchor_positions, block_keep_mask, S, block_size, device):
     B, N = anchor_positions.shape
     Q_LEN = N * block_size
@@ -94,7 +103,7 @@ def dflash_mask_mod(b, h, q_idx, kv_idx):
 
 
 class OnlineDFlashModel(nn.Module):
-    """DFlash online training wrapper with block-wise CE loss."""
+    """DFlash online training wrapper with DFlash and D-PACE losses."""
 
     def __init__(
         self,
@@ -106,8 +115,17 @@ def __init__(
         attention_backend: str = "flex_attention",
         num_anchors: int = 512,
         loss_decay_gamma: Optional[float] = None,
+        loss_type: str = "dflash",
+        dpace_alpha: float = 0.5,
     ):
         super().__init__()
+        if loss_type not in _VALID_LOSS_TYPES:
+            raise ValueError(
+                f"loss_type={loss_type!r}; must be one of {sorted(_VALID_LOSS_TYPES)}"
+            )
+        if not 0.0 <= dpace_alpha <= 1.0:
+            raise ValueError(f"dpace_alpha must be in [0, 1], got {dpace_alpha}")
+
         self.draft_model = draft_model
         self.lm_head = target_lm_head
         self.embed_tokens = target_embed_tokens
@@ -116,6 +134,8 @@ def __init__(
         self.attention_backend = attention_backend
         self.num_anchors = num_anchors
         self.loss_decay_gamma = loss_decay_gamma
+        self.loss_type = loss_type
+        self.dpace_alpha = dpace_alpha
 
         self._cached_block_mask: Optional[BlockMask] = None
         self._cached_seq_len: Optional[int] = None
@@ -211,6 +231,38 @@ def _create_noise_embed(self, input_ids, anchor_positions, block_keep_mask):
 
         return self.embed_tokens(noise_ids)
 
+    def _dpace_weight(
+        self,
+        prob: torch.Tensor,
+        binary_mask: torch.Tensor,
+        binary_mask_b: torch.Tensor,
+        loss_type: str,
+    ) -> torch.Tensor:
+        """Compute detached D-PACE position weights.
+
+        ``prob`` is the draft probability on the target token at each draft
+        position. Invalid positions are treated as multiplicative no-ops inside
+        prefix products and excluded from suffix sums; the caller still
+        multiplies the returned weights by ``binary_mask`` before reduction.
+        """
+        smooth = (1.0 - self.dpace_alpha) * prob + self.dpace_alpha
+        smooth = torch.where(binary_mask_b, smooth, torch.ones_like(smooth))
+        prefix = torch.cumprod(smooth, dim=-1)
+
+        if loss_type == "dpace-cumulative-confidence-only":
+            return prefix
+
+        suffix = torch.flip(
+            torch.cumsum(torch.flip(prefix * binary_mask, dims=[-1]), dim=-1),
+            dims=[-1],
+        )
+
+        if loss_type == "dpace":
+            return suffix
+        if loss_type == "dpace-continuation-value-only":
+            return suffix / prefix.clamp_min(torch.finfo(prefix.dtype).tiny)
+        raise ValueError(f"unknown D-PACE loss_type {loss_type!r}")
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -291,22 +343,39 @@ def forward(
 
         binary_eval_mask = weight_mask.view(-1)
 
-        # --- Loss decay: exp(-(k-1)/γ) so k=1 (1st prediction) gets weight 1.0 ---
-        if self.loss_decay_gamma is not None and self.loss_decay_gamma > 0:
-            k = torch.arange(self.block_size, device=device).view(1, 1, -1)
-            decay_weights = torch.exp(
-                -(k - 1).clamp(min=0).float() / self.loss_decay_gamma
-            )
-            weight_mask = weight_mask * decay_weights
-
         # --- Cross entropy ---
         flat_logits = logits.view(-1, logits.size(-1))
         flat_targets = target_ids.view(-1)
-        flat_weights = weight_mask.view(-1)
 
         loss_per_token = F.cross_entropy(flat_logits, flat_targets, reduction="none")
-        valid_token_count = flat_weights.sum() + 1e-6
-        loss = (loss_per_token * flat_weights).sum() / valid_token_count
+
+        if self.loss_type == "dflash":
+            # Preserve the existing DFlash weighted-mean behavior.
+            loss_weights = weight_mask
+            if self.loss_decay_gamma is not None and self.loss_decay_gamma > 0:
+                k = torch.arange(self.block_size, device=device).view(1, 1, -1)
+                decay_weights = torch.exp(
+                    -(k - 1).clamp(min=0).float() / self.loss_decay_gamma
+                )
+                loss_weights = loss_weights * decay_weights
+
+            flat_weights = loss_weights.view(-1)
+            valid_token_count = flat_weights.sum() + 1e-6
+            loss = (loss_per_token * flat_weights).sum() / valid_token_count
+        elif self.loss_type in _DPACE_LOSS_TYPES:
+            neg_log_q = loss_per_token.view_as(target_ids)
+            with torch.no_grad():
+                q = torch.exp(-neg_log_q)
+                dpace_weights = self._dpace_weight(
+                    q,
+                    weight_mask,
+                    weight_mask > 0,
+                    self.loss_type,
+                )
+            loss_weights = weight_mask * dpace_weights
+            loss = (neg_log_q * loss_weights).sum() / float(bsz)
+        else:
+            raise ValueError(f"unknown loss_type {self.loss_type!r}")
 
         # --- Accuracy ---
         with torch.no_grad():