diff --git a/examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh b/examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh index 8d86bb797e..d297b99981 100644 --- a/examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh +++ b/examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh @@ -37,7 +37,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \ diff --git a/examples/train/algorithms/dapo/run_dapo_gsm8k.sh b/examples/train/algorithms/dapo/run_dapo_gsm8k.sh index a30a25d290..9222d80a06 100644 --- a/examples/train/algorithms/dapo/run_dapo_gsm8k.sh +++ b/examples/train/algorithms/dapo/run_dapo_gsm8k.sh @@ -29,6 +29,7 @@ TOP_P=1.0 EVAL_TOP_P=0.7 CLIP_RATIO_C=10.0 MAX_RESPONSE_LENGTH=1024 +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \ data.train_data="['$DATA_DIR/train.parquet']" \ @@ -75,6 +76,7 @@ uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \ generator.inference_engine.run_engines_locally=true \ generator.inference_engine.weight_sync_backend=nccl \ generator.inference_engine.async_engine=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ generator.batched=true \ environment.env_class=gsm8k \ generator.n_samples_per_prompt=5 \ diff --git a/examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh index 3e94c0ac86..b02d568dc6 100644 --- a/examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh +++ b/examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh @@ -42,7 +42,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` MICRO_FORWARD_BATCH_SIZE_PER_GPU=2 MICRO_TRAIN_BATCH_SIZE_PER_GPU=2 diff --git a/examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh index a3d892c739..7ac1a06abc 100644 --- a/examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh +++ b/examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh @@ -42,7 +42,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \ data.train_data="['$TRAIN_FILE']" \ diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh index e4c35d6734..b9004ecb72 100644 --- a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh +++ b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh @@ -37,7 +37,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \ diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh index 6974edcb93..4694a6a7d8 100644 --- a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh +++ b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh @@ -35,7 +35,7 @@ MAX_RESPONSE_LENGTH=$((1024 * 8)) # repro run parameters N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 # Fully async specific configuration knobs: diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async_onestep.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async_onestep.sh index f645d878f5..9d741fd89a 100644 --- a/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async_onestep.sh +++ b/examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async_onestep.sh @@ -36,7 +36,7 @@ MAX_RESPONSE_LENGTH=$((1024 * 8)) # repro run parameters N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true +ENFORCE_EAGER=false LR=1e-6 # Fully async specific configuration knobs: diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh index 9cd5075573..56acd16595 100644 --- a/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh +++ b/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh @@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-5 # megatron config diff --git a/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh b/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh index 3d3cf5c0f6..b2241f5cfd 100644 --- a/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh +++ b/examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh @@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 # megatron config diff --git a/examples/train/algorithms/sapo/run_sapo_qwen3_4b_aime.sh b/examples/train/algorithms/sapo/run_sapo_qwen3_4b_aime.sh index 953d0415b5..cfc6b6ad1e 100644 --- a/examples/train/algorithms/sapo/run_sapo_qwen3_4b_aime.sh +++ b/examples/train/algorithms/sapo/run_sapo_qwen3_4b_aime.sh @@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \ diff --git a/examples/train/fully_async/fully_async_run_gsm8k.sh b/examples/train/fully_async/fully_async_run_gsm8k.sh index 4aa4948e1f..853261f248 100755 --- a/examples/train/fully_async/fully_async_run_gsm8k.sh +++ b/examples/train/fully_async/fully_async_run_gsm8k.sh @@ -30,6 +30,7 @@ set -x SEQUENCE_MASK_METRIC=geometric GEO_MASK_HIGH=1.01 GEO_MASK_LOW=0.99 +ENFORCE_EAGER=false RUN_NAME=gsm8k-fully-async-qwen2.5_1.5B-geoMask${GEO_MASK_LOW}_${GEO_MASK_HIGH}-maxStale${MAX_STALENESS_STEPS}-numCon${NUM_PARALLEL_GENERATION_WORKERS}-${NUM_POLICY_GPUS}train${NUM_INFERENCE_GPUS}gen @@ -78,5 +79,5 @@ uv run --isolated --extra fsdp -m examples.train.fully_async.main_fully_async \ trainer.run_name=${RUN_NAME} \ trainer.resume_mode=latest \ trainer.ckpt_path="$HOME/ckpts/${RUN_NAME}" \ - generator.inference_engine.enforce_eager=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ $@ \ No newline at end of file diff --git a/examples/train/fully_async/fully_async_run_gsm8k_megatron_lora.sh b/examples/train/fully_async/fully_async_run_gsm8k_megatron_lora.sh index 9a542e30c8..62e61df950 100644 --- a/examples/train/fully_async/fully_async_run_gsm8k_megatron_lora.sh +++ b/examples/train/fully_async/fully_async_run_gsm8k_megatron_lora.sh @@ -31,6 +31,7 @@ LR=1e-5 SEQUENCE_MASK_METRIC=geometric GEO_MASK_HIGH=1.01 GEO_MASK_LOW=0.99 +ENFORCE_EAGER=false RUN_NAME=gsm8k-fully-async-qwen3-0.6B_lora_${LORA_RANK}_${LORA_ALPHA} @@ -82,5 +83,5 @@ uv run --isolated --extra megatron -m examples.train.fully_async.main_fully_asyn trainer.run_name=${RUN_NAME} \ trainer.resume_mode=latest \ trainer.ckpt_path="$HOME/ckpts/${RUN_NAME}" \ - generator.inference_engine.enforce_eager=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ $@ diff --git a/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_e2e.sh b/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_e2e.sh index 276bc45025..b90d41e3eb 100644 --- a/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_e2e.sh +++ b/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_e2e.sh @@ -21,6 +21,7 @@ set -x : "${N_SAMPLES_PER_PROMPT:=2}" RUN_NAME="${RUN_NAME:-gsm8k-sim-qwen0.5b}" +ENFORCE_EAGER=false uv run --isolated --extra fsdp \ -m examples.train.fully_async.main_fully_async_sim \ @@ -55,7 +56,7 @@ uv run --isolated --extra fsdp \ generator.inference_engine.weight_sync_backend=nccl \ generator.inference_engine.async_engine=true \ generator.inference_engine.gpu_memory_utilization=0.8 \ - generator.inference_engine.enforce_eager=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ generator.batched=false \ environment.env_class=gsm8k \ generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ diff --git a/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_external.sh b/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_external.sh index 046817b857..7441d9905f 100644 --- a/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_external.sh +++ b/examples/train/fully_async/sim_trainer/run_fully_async_sim_gsm8k_external.sh @@ -30,6 +30,7 @@ set -x : "${N_SAMPLES_PER_PROMPT:=2}" RUN_NAME="${RUN_NAME:-gsm8k-sim-qwen0.5b}" +ENFORCE_EAGER=false uv run --isolated --extra fsdp \ -m examples.train.fully_async.main_fully_async_sim \ @@ -66,7 +67,7 @@ uv run --isolated --extra fsdp \ generator.inference_engine.weight_sync_backend=nccl \ generator.inference_engine.async_engine=true \ generator.inference_engine.gpu_memory_utilization=0.8 \ - generator.inference_engine.enforce_eager=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ generator.batched=false \ environment.env_class=gsm8k \ generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ diff --git a/examples/train/gptoss/run_gsm8k_gptoss.sh b/examples/train/gptoss/run_gsm8k_gptoss.sh index 819ebc38ab..befb264292 100755 --- a/examples/train/gptoss/run_gsm8k_gptoss.sh +++ b/examples/train/gptoss/run_gsm8k_gptoss.sh @@ -16,6 +16,7 @@ NUM_GPUS=8 LOGGER="wandb" # change to "console" to print to stdout INFERENCE_BACKEND="vllm" +ENFORCE_EAGER=false uv run --isolated --extra fsdp -m skyrl.train.entrypoints.main_base \ data.train_data="['$DATA_DIR/train.parquet']" \ @@ -30,7 +31,7 @@ uv run --isolated --extra fsdp -m skyrl.train.entrypoints.main_base \ trainer.flash_attn=false \ trainer.remove_microbatch_padding=false \ generator.inference_engine.tensor_parallel_size=4 \ - generator.inference_engine.enforce_eager=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ trainer.epochs=20 \ trainer.eval_batch_size=32 \ trainer.eval_before_train=false \ diff --git a/examples/train/megatron/run_megatron_dapo_qwen3.5_35b_a3b.sh b/examples/train/megatron/run_megatron_dapo_qwen3.5_35b_a3b.sh index 532ec6bf85..c617f5a50a 100644 --- a/examples/train/megatron/run_megatron_dapo_qwen3.5_35b_a3b.sh +++ b/examples/train/megatron/run_megatron_dapo_qwen3.5_35b_a3b.sh @@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=128 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 # megatron config diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b.sh b/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b.sh index 53965561b7..d198844072 100644 --- a/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b.sh +++ b/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b.sh @@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 # megatron config diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b_lora.sh b/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b_lora.sh index 152f6b4442..a866d7cadf 100644 --- a/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b_lora.sh +++ b/examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b_lora.sh @@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-5 # 10x compared to full finetuning # megatron config diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_4b.sh b/examples/train/megatron/run_megatron_dapo_qwen3_4b.sh index da0b8a1921..3b0745b6e8 100644 --- a/examples/train/megatron/run_megatron_dapo_qwen3_4b.sh +++ b/examples/train/megatron/run_megatron_dapo_qwen3_4b.sh @@ -38,7 +38,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 # megatron config diff --git a/examples/train/megatron/run_megatron_dapo_qwen3_4b_lora.sh b/examples/train/megatron/run_megatron_dapo_qwen3_4b_lora.sh index 4d68761e93..5aea42f68e 100644 --- a/examples/train/megatron/run_megatron_dapo_qwen3_4b_lora.sh +++ b/examples/train/megatron/run_megatron_dapo_qwen3_4b_lora.sh @@ -38,7 +38,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=3e-5 # megatron config diff --git a/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh b/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh index 249b317360..b0d10951c6 100644 --- a/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh +++ b/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh @@ -57,6 +57,8 @@ MOE_ROUTER_EXPERT_BIAS=true OPTIMIZER_CPU_OFFLOAD=true OPTIMIZER_OFFLOAD_FRACTION=1.0 +ENFORCE_EAGER=false + uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ data.train_data="['$DATA_DIR/train.parquet']" \ data.val_data="['$DATA_DIR/validation.parquet']" \ @@ -68,7 +70,7 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TP \ - generator.inference_engine.enforce_eager=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \ trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ diff --git a/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh b/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh index 7dffe70c9c..d926ef8a7f 100644 --- a/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh +++ b/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh @@ -34,7 +34,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=512 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true +ENFORCE_EAGER=false LR=1e-5 uv run --isolated --extra fsdp -m examples.train.on_policy_distillation.main_on_policy_distill \ diff --git a/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh b/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh index 77adc9b769..2c8c5f5fb1 100644 --- a/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh +++ b/examples/train/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh @@ -34,7 +34,7 @@ TRAIN_BATCH_SIZE=512 MINI_BATCH_SIZE=512 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true +ENFORCE_EAGER=false LR=1e-5 uv run --isolated --extra fsdp -m examples.train.on_policy_distillation.main_on_policy_distill \ diff --git a/examples/train/router_replay/run_dapo_moonlight_16b_a3b.sh b/examples/train/router_replay/run_dapo_moonlight_16b_a3b.sh index 5a480569bd..59a79001e8 100644 --- a/examples/train/router_replay/run_dapo_moonlight_16b_a3b.sh +++ b/examples/train/router_replay/run_dapo_moonlight_16b_a3b.sh @@ -43,7 +43,7 @@ TRAIN_BATCH_SIZE=128 MINI_BATCH_SIZE=32 N_SAMPLES_PER_PROMPT=16 EVAL_N_SAMPLES_PER_PROMPT=32 -ENFORCE_EAGER=true # cuda graphs can cause some instability +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` LR=1e-6 # megatron config diff --git a/examples/train/tis_correction/run_dapo_tis.sh b/examples/train/tis_correction/run_dapo_tis.sh index 7b2b6686d5..e298d8c65e 100644 --- a/examples/train/tis_correction/run_dapo_tis.sh +++ b/examples/train/tis_correction/run_dapo_tis.sh @@ -35,6 +35,7 @@ TOP_P=1.0 EVAL_TOP_P=0.7 CLIP_RATIO_C=10.0 MAX_RESPONSE_LENGTH=1024 +ENFORCE_EAGER=true # original DAPO recipe used enforce eager due to instability with vLLM then. TODO: reproduce DAPO with enforce eager `False` uv run --isolated --extra fsdp -m examples.train.tis_correction.main_tis_dapo \ data.train_data="['$DATA_DIR/train.parquet']" \ @@ -84,6 +85,7 @@ uv run --isolated --extra fsdp -m examples.train.tis_correction.main_tis_dapo \ generator.inference_engine.run_engines_locally=true \ generator.inference_engine.weight_sync_backend=nccl \ generator.inference_engine.async_engine=true \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ generator.batched=true \ environment.env_class=gsm8k \ generator.n_samples_per_prompt=5 \ diff --git a/skyrl/train/config/config.py b/skyrl/train/config/config.py index 659357a026..d369d53288 100644 --- a/skyrl/train/config/config.py +++ b/skyrl/train/config/config.py @@ -579,7 +579,7 @@ class InferenceEngineConfig(BaseConfig): enable_chunked_prefill: bool = True enable_return_routed_experts: bool = False max_num_batched_tokens: int = 8192 - enforce_eager: bool = True + enforce_eager: bool = False """Disable CUDA graphs for stability. Set to ``False`` for higher performance, but this may affect convergence for long-running or long-context training jobs.""" fully_sharded_loras: bool = False