sgl-project · menogrey · Jun 1, 2026 · May 25, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/docs/get_started/installation.md b/docs/get_started/installation.md
@@ -24,3 +24,18 @@ uv pip install -v . --prerelease=allow
 ```bash
 pip install specforge
 ```
+
+- **Install on Ascend NPU**
+
+1. Pull a compatible SGLang image for Ascend NPU: currently `quay.io/ascend/sglang:v0.5.9-cann8.5.0-a3` for A3 devices, or `quay.io/ascend/sglang:v0.5.9-cann8.5.0-910b` for A2 devices.
+2. Install SpecForge.
+
+```bash
+# git clone the source code
+git clone https://github.com/sgl-project/SpecForge.git
+cd SpecForge
+
+# install specforge
+pip install -r requirements-npu.txt
+pip install . --no-deps
+```
@@ -0,0 +1,30 @@
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT_DIR=$(dirname -- "$SCRIPT_DIR")  # quoted so a checkout path with spaces survives word splitting
+
+export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
+# Train EAGLE3 for Llama-3.1-8B. Usage: <this script> [NUM_GPUS] [TP_SIZE]
+NUM_GPUS=${1:-1}  # arg 1: value passed to torchrun --nproc_per_node (default 1)
+TP_SIZE=${2:-1}  # arg 2: value passed to --tp-size (default 1)
+BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}  # env override; default 64
+
+# Currently we only train with --max-length 2048 due to an OOM issue on A3 (64GB)
+torchrun \
+    --standalone \
+    --nproc_per_node "$NUM_GPUS" \
+    "$ROOT_DIR/scripts/train_eagle3.py" \
+    --target-model-path meta-llama/Llama-3.1-8B-Instruct \
+    --draft-model-config "$ROOT_DIR/configs/llama3-8B-eagle3.json" \
+    --train-data-path "$ROOT_DIR/cache/dataset/sharegpt_train.jsonl" \
+    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
+    --output-dir "$ROOT_DIR/outputs/llama3-8b-eagle3-sharegpt" \
+    --num-epochs 10 \
+    --batch-size 1 \
+    --tp-size "$TP_SIZE" \
+    --learning-rate 1e-4 \
+    --max-length 2048 \
+    --chat-template llama3 \
+    --cache-dir "$ROOT_DIR/cache" \
+    --attention-backend sdpa \
+    --target-model-backend sglang \
+    --log-interval 10 \
+    --sglang-mem-fraction-static 0.3
diff --git a/requirements-npu.txt b/requirements-npu.txt
@@ -0,0 +1,24 @@
+# Use the PyTorch CPU wheel index because torch_npu depends on a CPU version of PyTorch
+--extra-index-url https://download.pytorch.org/whl/cpu
+
+pre-commit
+torch==2.8.0+cpu
+torch_npu==2.8.0.post2
+torchaudio==2.8.0
+torchvision==0.23.0
+transformers==4.57.1
+qwen-vl-utils==0.0.11
+datasets
+setuptools
+tqdm
+wandb
+psutil
+numpy
+accelerate
+pydantic
+sglang==0.5.9
+openai-harmony
+ninja
+packaging
+yunchang
+tensorboard
@@ -59,6 +59,9 @@
 )
 from specforge.modeling.target import Eagle3TargetModel, get_eagle3_target_model
 from specforge.utils import (
+    empty_cache,
+    get_device_type,
+    get_local_device,
     print_args_with_dots,
     print_with_rank,
     rank_0_priority,
@@ -183,7 +186,7 @@ def build_target_model(
                 ),
             )
             .eval()
-            .cuda()
+            .to(device=get_local_device())
         )
     else:
         target_model_kwargs = SGLangBackendArgs.from_args(args).to_kwargs()
@@ -195,7 +198,7 @@ def build_target_model(
                 if hasattr(model_config, "dtype")
                 else model_config.torch_dtype
             ),
-            device="cuda",
+            device=get_device_type(),
             cache_dir=args.model_download_dir,
             trust_remote_code=args.trust_remote_code,
             **target_model_kwargs,
@@ -463,11 +466,11 @@ def generate(
                     output_path, current_batch_indices
                 )
                 exists_tensor = torch.tensor(
-                    exists_list, dtype=torch.bool, device="cuda"
+                    exists_list, dtype=torch.bool, device=get_local_device()
                 )
             else:
                 exists_tensor = torch.tensor(
-                    [False] * batch_size, dtype=torch.bool, device="cuda"
+                    [False] * batch_size, dtype=torch.bool, device=get_local_device()
                 )
             dist.broadcast(exists_tensor, src=tp_rank_0_global, group=tp_group)
 
@@ -504,7 +507,8 @@ def generate(
                 continue
 
             filtered_batch_gpu = {
-                k: v.cuda(non_blocking=True) for k, v in filtered_batch.items()
+                k: v.to(get_local_device(), non_blocking=True)
+                for k, v in filtered_batch.items()
             }
             _, _, aux_hidden_states_list, last_hidden_states_list = self.model.extend(
                 **filtered_batch_gpu,
@@ -559,7 +563,7 @@ def generate(
             del aux_hidden_states_list, last_hidden_states_list, filtered_batch
 
             if batch_idx % 5 == 0:  # Make GC and cache clearing more frequent
-                torch.cuda.empty_cache()
+                empty_cache()
                 gc.collect()
 
             if self.show_progress:

@@ -36,7 +36,12 @@
 from specforge.modeling.target.target_utils import TargetEmbeddingsAndHead
 from specforge.optimizer import BF16Optimizer
 from specforge.tracker import create_tracker
-from specforge.utils import get_last_checkpoint, print_on_rank0, print_with_rank
+from specforge.utils import (
+    get_last_checkpoint,
+    get_local_device,
+    print_on_rank0,
+    print_with_rank,
+)
 
 
 def parse_args():
@@ -159,11 +164,14 @@ def build_models(args) -> Tuple[DFlashTargetModel, DFlashDraftModel]:
     if args.target_model_backend == "sglang":
         target_model_kwargs = SGLangBackendArgs.from_args(args).to_kwargs()
 
+    device = get_local_device()
+    device_type = device.type
+
     target_model = get_dflash_target_model(
         pretrained_model_name_or_path=args.target_model_path,
         backend=args.target_model_backend,
         torch_dtype=torch.bfloat16,
-        device="cuda" if args.target_model_backend == "hf" else None,
+        device=device_type if args.target_model_backend == "hf" else None,
         trust_remote_code=args.trust_remote_code,
         **target_model_kwargs,
     )
@@ -194,7 +202,7 @@ def build_models(args) -> Tuple[DFlashTargetModel, DFlashDraftModel]:
     draft_config._attn_implementation = args.attention_backend
     print_on_rank0(f"Using attention backend: {args.attention_backend}")
 
-    draft_model = DFlashDraftModel(draft_config).cuda().to(torch.bfloat16)
+    draft_model = DFlashDraftModel(draft_config).to(device=device, dtype=torch.bfloat16)
 
     target_model.set_capture_layers(draft_model.target_layer_ids)
 
@@ -426,7 +434,7 @@ def main():
         args.target_model_path,
         embed_key=args.embedding_key,
         lm_head_key=args.lm_head_key,
-        device="cuda",
+        device=device_type,
         trust_remote_code=args.trust_remote_code,
     )
 
@@ -522,13 +530,13 @@ def main():
                 continue
             global_step += 1
 
-            input_ids = data["input_ids"].cuda()
-            attention_mask = data["attention_mask"].cuda()
-            loss_mask = data["loss_mask"].cuda()
+            input_ids = data["input_ids"].to(device, non_blocking=True)
+            attention_mask = data["attention_mask"].to(device, non_blocking=True)
+            loss_mask = data["loss_mask"].to(device, non_blocking=True)
             target_output = target_model.generate_dflash_data(
                 input_ids, attention_mask, loss_mask
             )
-            hidden_states = target_output.hidden_states.cuda()  # Ensure on GPU
+            hidden_states = target_output.hidden_states.to(device, non_blocking=True)
 
             loss, accuracy = dflash_model(
                 input_ids=input_ids,

@@ -47,26 +47,36 @@
 from specforge.tracker import Tracker, create_tracker, get_tracker_class
 from specforge.utils import (
     create_draft_config_from_target,
+    current_device,
+    get_device_module,
+    get_device_type,
     get_last_checkpoint,
+    get_local_device,
     print_args_with_dots,
     print_on_rank0,
     print_with_rank,
     rank_0_priority,
     safe_conversations_generator,
+    synchronize,
 )
 
 
 def print_cuda_memory_debug(label: str) -> None:
-    if os.getenv("SPECFORGE_CI_MEMORY_DEBUG") != "1" or not torch.cuda.is_available():
+    device_type = get_device_type()
+    if os.getenv("SPECFORGE_CI_MEMORY_DEBUG") != "1" or device_type == "cpu":
         return
 
     try:
-        torch.cuda.synchronize()
-        free_bytes, total_bytes = torch.cuda.mem_get_info()
-        allocated_bytes = torch.cuda.memory_allocated()
-        reserved_bytes = torch.cuda.memory_reserved()
+        synchronize()
+        device_module = get_device_module()
+        free_bytes, total_bytes = device_module.mem_get_info()
+        allocated_bytes = device_module.memory_allocated()
+        reserved_bytes = device_module.memory_reserved()
     except Exception as exc:
-        print(f"[memory-debug] {label}: failed to query CUDA memory: {exc}", flush=True)
+        print(
+            f"[memory-debug] {label}: failed to query {device_type} memory: {exc}",
+            flush=True,
+        )
         return
 
     rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else "NA"
@@ -333,7 +343,7 @@ def build_target_model(
                     torch_dtype=torch.bfloat16,
                 )
                 .eval()
-                .cuda()
+                .to(get_local_device())
             )
         else:
             if args.target_model_backend == "sglang":
@@ -344,7 +354,7 @@ def build_target_model(
                 pretrained_model_name_or_path=args.target_model_path,
                 backend=args.target_model_backend,
                 torch_dtype=torch.bfloat16,
-                device="cuda",
+                device=get_device_type(),
                 cache_dir=args.model_download_dir,
                 **target_model_kwargs,
                 trust_remote_code=args.trust_remote_code,
@@ -472,13 +482,13 @@ def build_draft_model(args: Namespace) -> Tuple[AutoDraftModelConfig, nn.Module]
             draft_model_last_checkpoint,
             attention_backend=args.attention_backend,
             torch_dtype=torch.bfloat16,
-        ).cuda()
+        ).to(get_local_device())
     else:
         draft_model = AutoEagle3DraftModel.from_config(
             draft_model_config,
             attention_backend=args.attention_backend,
             torch_dtype=torch.bfloat16,
-        ).cuda()
+        ).to(get_local_device())
 
     # Load training state (optimizer, scheduler, epoch, step) for true resume
     resume_state = None
@@ -680,38 +690,41 @@ def run_forward(
             metric_losses,
             metric_loss_denoms,
         ) = eagle3_model(
-            input_ids=data["input_ids"].cuda(),
-            attention_mask=data["attention_mask"].cuda(),
-            loss_mask=data["loss_mask"].cuda(),
-            pixel_values=data["pixel_values"].cuda(),
-            image_grid_thw=data["image_grid_thw"].cuda(),
+            input_ids=data["input_ids"].to(get_local_device()),
+            attention_mask=data["attention_mask"].to(get_local_device()),
+            loss_mask=data["loss_mask"].to(get_local_device()),
+            pixel_values=data["pixel_values"].to(get_local_device()),
+            image_grid_thw=data["image_grid_thw"].to(get_local_device()),
         )
     else:
         image_grid_thw = None
         if is_online:
             # we generate the eagle3 using the target model in an online fashion
             # Handle VLM data: pixel_values and image_grid_thw are lists
-            # pixel_values = [pv.cuda() for pv in data["pixel_values"]] if args.is_vlm else None
+            # pixel_values = [pv.to(get_local_device()) for pv in data["pixel_values"]] if args.is_vlm else None
             if args.is_vlm:
                 image_grid_thw = (
-                    [thw.cuda().squeeze() for thw in data["image_grid_thw"]]
+                    [
+                        thw.to(get_local_device()).squeeze()
+                        for thw in data["image_grid_thw"]
+                    ]
                     if args.is_vlm
                     else None
                 )
-                pixel_values = data["pixel_values"].cuda()
+                pixel_values = data["pixel_values"].to(get_local_device())
                 eagle3_data = target_model.generate_eagle3_data(
-                    input_ids=data["input_ids"].cuda(),
-                    attention_mask=data["attention_mask"].cuda(),
-                    loss_mask=data["loss_mask"].cuda(),
+                    input_ids=data["input_ids"].to(get_local_device()),
+                    attention_mask=data["attention_mask"].to(get_local_device()),
+                    loss_mask=data["loss_mask"].to(get_local_device()),
                     is_vlm=args.is_vlm,
                     pixel_values=pixel_values,
                     image_grid_thw=image_grid_thw,
                 )
             else:
                 eagle3_data = target_model.generate_eagle3_data(
-                    input_ids=data["input_ids"].cuda(),
-                    attention_mask=data["attention_mask"].cuda(),
-                    loss_mask=data["loss_mask"].cuda(),
+                    input_ids=data["input_ids"].to(get_local_device()),
+                    attention_mask=data["attention_mask"].to(get_local_device()),
+                    loss_mask=data["loss_mask"].to(get_local_device()),
                     shard_returns=args.shard_target_output,
                 )
 
@@ -732,16 +745,16 @@ def run_forward(
             )
         else:
             # we generate the logits using the hidden states loaded from disk
-            attention_mask = data["attention_mask"].cuda()
-            hidden_states = data["hidden_state"].cuda()
+            attention_mask = data["attention_mask"].to(get_local_device())
+            hidden_states = data["hidden_state"].to(get_local_device())
             input_ids, target, loss_mask = target_model.preprocess(
                 data["input_ids"], data["target"], data["loss_mask"]
             )
-            input_ids = input_ids.cuda()
+            input_ids = input_ids.to(get_local_device())
             target = target_model(
-                target.cuda()
+                target.to(get_local_device())
             )  # The `data['target']` value occupies a large amount of GPU memory, with a shape of [seqlen, vocab_size]. It needs to be processed before being loaded into the GPU.
-            loss_mask = loss_mask.cuda()
+            loss_mask = loss_mask.to(get_local_device())
         (
             plosses,
             acceptance_rates,
@@ -757,7 +770,9 @@ def run_forward(
             target=target,
             hidden_states=hidden_states,
             position_ids=(
-                data["position_ids"].cuda() if "position_ids" in data else None
+                data["position_ids"].to(get_local_device())
+                if "position_ids" in data
+                else None
             ),
             image_grid_thw=image_grid_thw,
             is_vlm=args.is_vlm,
@@ -787,8 +802,8 @@ def run_backward_and_update(
         grad_norm = optimizer.step()
         if dist.is_initialized():
             grad_norm = grad_norm.detach().float()
-            if torch.cuda.is_available():
-                grad_norm = grad_norm.to(torch.cuda.current_device())
+            if get_device_type() != "cpu":
+                grad_norm = grad_norm.to(current_device())
             grad_norm = grad_norm.pow(2)
             dist.all_reduce(grad_norm, op=dist.ReduceOp.SUM)
             grad_norm = grad_norm.sqrt()