NovaSky-AI · SumanthRH · Jun 23, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 23, 2026
diff --git a/examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh b/examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh
@@ -37,7 +37,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \

diff --git a/examples/train/algorithms/dapo/run_dapo_gsm8k.sh b/examples/train/algorithms/dapo/run_dapo_gsm8k.sh
@@ -29,6 +29,7 @@ TOP_P=1.0
 EVAL_TOP_P=0.7
 CLIP_RATIO_C=10.0
 MAX_RESPONSE_LENGTH=1024
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 
 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
   data.train_data="['$DATA_DIR/train.parquet']" \
@@ -75,6 +76,7 @@ uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
   generator.inference_engine.run_engines_locally=true \
   generator.inference_engine.weight_sync_backend=nccl \
   generator.inference_engine.async_engine=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   generator.batched=true \
   environment.env_class=gsm8k \
   generator.n_samples_per_prompt=5 \

diff --git a/examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh
@@ -42,7 +42,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 MICRO_FORWARD_BATCH_SIZE_PER_GPU=2
 MICRO_TRAIN_BATCH_SIZE_PER_GPU=2
 

diff --git a/examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh
@@ -42,7 +42,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 
 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
   data.train_data="['$TRAIN_FILE']" \

diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh
@@ -37,7 +37,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \

diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh
@@ -35,7 +35,7 @@ MAX_RESPONSE_LENGTH=$((1024 * 8))
 # repro run parameters
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 # Fully async specific configuration knobs:

diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async_onestep.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async_onestep.sh
@@ -36,7 +36,7 @@ MAX_RESPONSE_LENGTH=$((1024 * 8))
 # repro run parameters
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true
+ENFORCE_EAGER=false
 LR=1e-6
 
 # Fully async specific configuration knobs:

diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh
@@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-5
 
 # megatron config

diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh
@@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 # megatron config

diff --git a/examples/train/algorithms/sapo/run_sapo_qwen3_4b_aime.sh b/examples/train/algorithms/sapo/run_sapo_qwen3_4b_aime.sh
@@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \

diff --git a/examples/train/fully_async/fully_async_run_gsm8k.sh b/examples/train/fully_async/fully_async_run_gsm8k.sh
@@ -30,6 +30,7 @@ set -x
 SEQUENCE_MASK_METRIC=geometric
 GEO_MASK_HIGH=1.01
 GEO_MASK_LOW=0.99
+ENFORCE_EAGER=false
 
 RUN_NAME=gsm8k-fully-async-qwen2.5_1.5B-geoMask${GEO_MASK_LOW}_${GEO_MASK_HIGH}-maxStale${MAX_STALENESS_STEPS}-numCon${NUM_PARALLEL_GENERATION_WORKERS}-${NUM_POLICY_GPUS}train${NUM_INFERENCE_GPUS}gen
 
@@ -78,5 +79,5 @@ uv run --isolated --extra fsdp -m examples.train.fully_async.main_fully_async \
   trainer.run_name=${RUN_NAME} \
   trainer.resume_mode=latest \
   trainer.ckpt_path="$HOME/ckpts/${RUN_NAME}" \
-  generator.inference_engine.enforce_eager=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   $@
diff --git a/examples/train/fully_async/fully_async_run_gsm8k_megatron_lora.sh b/examples/train/fully_async/fully_async_run_gsm8k_megatron_lora.sh
@@ -31,6 +31,7 @@ LR=1e-5
 SEQUENCE_MASK_METRIC=geometric
 GEO_MASK_HIGH=1.01
 GEO_MASK_LOW=0.99
+ENFORCE_EAGER=false
 
 RUN_NAME=gsm8k-fully-async-qwen3-0.6B_lora_${LORA_RANK}_${LORA_ALPHA}
 
@@ -82,5 +83,5 @@ uv run --isolated --extra megatron -m examples.train.fully_async.main_fully_asyn
   trainer.run_name=${RUN_NAME} \
   trainer.resume_mode=latest \
   trainer.ckpt_path="$HOME/ckpts/${RUN_NAME}" \
-  generator.inference_engine.enforce_eager=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   $@
diff --git a/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_e2e.sh b/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_e2e.sh
@@ -21,6 +21,7 @@ set -x
 : "${N_SAMPLES_PER_PROMPT:=2}"
 
 RUN_NAME="${RUN_NAME:-gsm8k-sim-qwen0.5b}"
+ENFORCE_EAGER=false
 
 uv run --isolated --extra fsdp \
   -m examples.train.fully_async.main_fully_async_sim \
@@ -55,7 +56,7 @@ uv run --isolated --extra fsdp \
   generator.inference_engine.weight_sync_backend=nccl \
   generator.inference_engine.async_engine=true \
   generator.inference_engine.gpu_memory_utilization=0.8 \
-  generator.inference_engine.enforce_eager=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   generator.batched=false \
   environment.env_class=gsm8k \
   generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \

diff --git a/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_external.sh b/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_external.sh
@@ -30,6 +30,7 @@ set -x
 : "${N_SAMPLES_PER_PROMPT:=2}"
 
 RUN_NAME="${RUN_NAME:-gsm8k-sim-qwen0.5b}"
+ENFORCE_EAGER=false
 
 uv run --isolated --extra fsdp \
   -m examples.train.fully_async.main_fully_async_sim \
@@ -66,7 +67,7 @@ uv run --isolated --extra fsdp \
   generator.inference_engine.weight_sync_backend=nccl \
   generator.inference_engine.async_engine=true \
   generator.inference_engine.gpu_memory_utilization=0.8 \
-  generator.inference_engine.enforce_eager=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   generator.batched=false \
   environment.env_class=gsm8k \
   generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \

diff --git a/examples/train/gptoss/run_gsm8k_gptoss.sh b/examples/train/gptoss/run_gsm8k_gptoss.sh
@@ -16,6 +16,7 @@ NUM_GPUS=8
 LOGGER="wandb"  # change to "console" to print to stdout
 
 INFERENCE_BACKEND="vllm"
+ENFORCE_EAGER=false
 
 uv run --isolated --extra fsdp -m skyrl.train.entrypoints.main_base \
   data.train_data="['$DATA_DIR/train.parquet']" \
@@ -30,7 +31,7 @@ uv run --isolated --extra fsdp -m skyrl.train.entrypoints.main_base \
   trainer.flash_attn=false \
   trainer.remove_microbatch_padding=false \
   generator.inference_engine.tensor_parallel_size=4 \
-  generator.inference_engine.enforce_eager=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   trainer.epochs=20 \
   trainer.eval_batch_size=32 \
   trainer.eval_before_train=false \

diff --git a/examples/train/megatron/run_megatron_dapo_qwen3.5_35b_a3b.sh b/examples/train/megatron/run_megatron_dapo_qwen3.5_35b_a3b.sh
@@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=128
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 # megatron config

diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b.sh b/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b.sh
@@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 # megatron config

diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b_lora.sh b/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b_lora.sh
@@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-5 # 10x compared to full finetuning
 
 # megatron config

diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_4b.sh b/examples/train/megatron/run_megatron_dapo_qwen3_4b.sh
@@ -38,7 +38,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 # megatron config

diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_4b_lora.sh b/examples/train/megatron/run_megatron_dapo_qwen3_4b_lora.sh
@@ -38,7 +38,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=3e-5
 
 # megatron config

diff --git a/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh b/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh
@@ -57,6 +57,8 @@ MOE_ROUTER_EXPERT_BIAS=true
 OPTIMIZER_CPU_OFFLOAD=true
 OPTIMIZER_OFFLOAD_FRACTION=1.0
 
+ENFORCE_EAGER=false
+
 uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \
   data.train_data="['$DATA_DIR/train.parquet']" \
   data.val_data="['$DATA_DIR/validation.parquet']" \
@@ -68,7 +70,7 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \
   trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \
   generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \
   generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TP \
-  generator.inference_engine.enforce_eager=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \
   trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
   trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \

diff --git a/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh b/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh
@@ -34,7 +34,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=512
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true
+ENFORCE_EAGER=false
 LR=1e-5
 
 uv run --isolated --extra fsdp -m examples.train.on_policy_distillation.main_on_policy_distill \

diff --git a/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh b/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh
@@ -34,7 +34,7 @@ TRAIN_BATCH_SIZE=512
 MINI_BATCH_SIZE=512
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true
+ENFORCE_EAGER=false
 LR=1e-5
 
 uv run --isolated --extra fsdp -m examples.train.on_policy_distillation.main_on_policy_distill \

diff --git a/examples/train/router_replay/run_dapo_moonlight_16b_a3b.sh b/examples/train/router_replay/run_dapo_moonlight_16b_a3b.sh
@@ -43,7 +43,7 @@ TRAIN_BATCH_SIZE=128
 MINI_BATCH_SIZE=32
 N_SAMPLES_PER_PROMPT=16
 EVAL_N_SAMPLES_PER_PROMPT=32
-ENFORCE_EAGER=true # cuda graphs can cause some instability
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 LR=1e-6
 
 # megatron config

diff --git a/examples/train/tis_correction/run_dapo_tis.sh b/examples/train/tis_correction/run_dapo_tis.sh
@@ -35,6 +35,7 @@ TOP_P=1.0
 EVAL_TOP_P=0.7
 CLIP_RATIO_C=10.0
 MAX_RESPONSE_LENGTH=1024
+ENFORCE_EAGER=true # the original DAPO recipe enforced eager mode because vLLM was unstable with CUDA graphs at the time. TODO: reproduce DAPO with enforce_eager=false
 
 uv run --isolated --extra fsdp -m examples.train.tis_correction.main_tis_dapo \
   data.train_data="['$DATA_DIR/train.parquet']" \
@@ -84,6 +85,7 @@ uv run --isolated --extra fsdp -m examples.train.tis_correction.main_tis_dapo \
   generator.inference_engine.run_engines_locally=true \
   generator.inference_engine.weight_sync_backend=nccl \
   generator.inference_engine.async_engine=true \
+  generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
   generator.batched=true \
   environment.env_class=gsm8k \
   generator.n_samples_per_prompt=5 \

diff --git a/skyrl/train/config/config.py b/skyrl/train/config/config.py
@@ -579,7 +579,7 @@ class InferenceEngineConfig(BaseConfig):
     enable_chunked_prefill: bool = True
     enable_return_routed_experts: bool = False
     max_num_batched_tokens: int = 8192
-    enforce_eager: bool = True
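+    enforce_eager: bool = False
-    """Disable CUDA graphs for stability. Set to ``False`` for higher performance,
-    but this may affect convergence for long-running or long-context training jobs."""
+    """Set to ``True`` to disable CUDA graphs for stability. The default ``False`` is faster,
+    but may affect convergence for long-running or long-context training jobs."""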
     fully_sharded_loras: bool = False