Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
Expand Down
2 changes: 2 additions & 0 deletions examples/train/algorithms/dapo/run_dapo_gsm8k.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ TOP_P=1.0
EVAL_TOP_P=0.7
CLIP_RATIO_C=10.0
MAX_RESPONSE_LENGTH=1024
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false

uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
data.train_data="['$DATA_DIR/train.parquet']" \
Expand Down Expand Up @@ -75,6 +76,7 @@ uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
generator.inference_engine.run_engines_locally=true \
generator.inference_engine.weight_sync_backend=nccl \
generator.inference_engine.async_engine=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
generator.batched=true \
environment.env_class=gsm8k \
generator.n_samples_per_prompt=5 \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
MICRO_FORWARD_BATCH_SIZE_PER_GPU=2
MICRO_TRAIN_BATCH_SIZE_PER_GPU=2

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false

uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
data.train_data="['$TRAIN_FILE']" \
Expand Down
2 changes: 1 addition & 1 deletion examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ MAX_RESPONSE_LENGTH=$((1024 * 8))
# repro run parameters
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

# Fully async specific configuration knobs:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ MAX_RESPONSE_LENGTH=$((1024 * 8))
# repro run parameters
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true
ENFORCE_EAGER=false
LR=1e-6

# Fully async specific configuration knobs:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-5

# megatron config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

# megatron config
Expand Down
2 changes: 1 addition & 1 deletion examples/train/algorithms/sapo/run_sapo_qwen3_4b_aime.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

uv run --isolated --extra fsdp -m examples.train.algorithms.dapo.main_dapo \
Expand Down
3 changes: 2 additions & 1 deletion examples/train/fully_async/fully_async_run_gsm8k.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ set -x
SEQUENCE_MASK_METRIC=geometric
GEO_MASK_HIGH=1.01
GEO_MASK_LOW=0.99
ENFORCE_EAGER=false

RUN_NAME=gsm8k-fully-async-qwen2.5_1.5B-geoMask${GEO_MASK_LOW}_${GEO_MASK_HIGH}-maxStale${MAX_STALENESS_STEPS}-numCon${NUM_PARALLEL_GENERATION_WORKERS}-${NUM_POLICY_GPUS}train${NUM_INFERENCE_GPUS}gen

Expand Down Expand Up @@ -78,5 +79,5 @@ uv run --isolated --extra fsdp -m examples.train.fully_async.main_fully_async \
trainer.run_name=${RUN_NAME} \
trainer.resume_mode=latest \
trainer.ckpt_path="$HOME/ckpts/${RUN_NAME}" \
generator.inference_engine.enforce_eager=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
$@
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ LR=1e-5
SEQUENCE_MASK_METRIC=geometric
GEO_MASK_HIGH=1.01
GEO_MASK_LOW=0.99
ENFORCE_EAGER=false

RUN_NAME=gsm8k-fully-async-qwen3-0.6B_lora_${LORA_RANK}_${LORA_ALPHA}

Expand Down Expand Up @@ -82,5 +83,5 @@ uv run --isolated --extra megatron -m examples.train.fully_async.main_fully_asyn
trainer.run_name=${RUN_NAME} \
trainer.resume_mode=latest \
trainer.ckpt_path="$HOME/ckpts/${RUN_NAME}" \
generator.inference_engine.enforce_eager=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
$@
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set -x
: "${N_SAMPLES_PER_PROMPT:=2}"

RUN_NAME="${RUN_NAME:-gsm8k-sim-qwen0.5b}"
ENFORCE_EAGER=false

uv run --isolated --extra fsdp \
-m examples.train.fully_async.main_fully_async_sim \
Expand Down Expand Up @@ -55,7 +56,7 @@ uv run --isolated --extra fsdp \
generator.inference_engine.weight_sync_backend=nccl \
generator.inference_engine.async_engine=true \
generator.inference_engine.gpu_memory_utilization=0.8 \
generator.inference_engine.enforce_eager=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
generator.batched=false \
environment.env_class=gsm8k \
generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ set -x
: "${N_SAMPLES_PER_PROMPT:=2}"

RUN_NAME="${RUN_NAME:-gsm8k-sim-qwen0.5b}"
ENFORCE_EAGER=false

uv run --isolated --extra fsdp \
-m examples.train.fully_async.main_fully_async_sim \
Expand Down Expand Up @@ -66,7 +67,7 @@ uv run --isolated --extra fsdp \
generator.inference_engine.weight_sync_backend=nccl \
generator.inference_engine.async_engine=true \
generator.inference_engine.gpu_memory_utilization=0.8 \
generator.inference_engine.enforce_eager=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
generator.batched=false \
environment.env_class=gsm8k \
generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
Expand Down
3 changes: 2 additions & 1 deletion examples/train/gptoss/run_gsm8k_gptoss.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ NUM_GPUS=8
LOGGER="wandb" # change to "console" to print to stdout

INFERENCE_BACKEND="vllm"
ENFORCE_EAGER=false

uv run --isolated --extra fsdp -m skyrl.train.entrypoints.main_base \
data.train_data="['$DATA_DIR/train.parquet']" \
Expand All @@ -30,7 +31,7 @@ uv run --isolated --extra fsdp -m skyrl.train.entrypoints.main_base \
trainer.flash_attn=false \
trainer.remove_microbatch_padding=false \
generator.inference_engine.tensor_parallel_size=4 \
generator.inference_engine.enforce_eager=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
trainer.epochs=20 \
trainer.eval_batch_size=32 \
trainer.eval_before_train=false \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=128
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

# megatron config
Expand Down
2 changes: 1 addition & 1 deletion examples/train/megatron/run_megatron_dapo_qwen3_30b_a3b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

# megatron config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-5 # 10x compared to full finetuning

# megatron config
Expand Down
2 changes: 1 addition & 1 deletion examples/train/megatron/run_megatron_dapo_qwen3_4b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

# megatron config
Expand Down
2 changes: 1 addition & 1 deletion examples/train/megatron/run_megatron_dapo_qwen3_4b_lora.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=3e-5

# megatron config
Expand Down
4 changes: 3 additions & 1 deletion examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ MOE_ROUTER_EXPERT_BIAS=true
OPTIMIZER_CPU_OFFLOAD=true
OPTIMIZER_OFFLOAD_FRACTION=1.0

ENFORCE_EAGER=false

uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \
data.train_data="['$DATA_DIR/train.parquet']" \
data.val_data="['$DATA_DIR/validation.parquet']" \
Expand All @@ -68,7 +70,7 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \
trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \
generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \
generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TP \
generator.inference_engine.enforce_eager=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \
trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=512
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true
ENFORCE_EAGER=false
LR=1e-5

uv run --isolated --extra fsdp -m examples.train.on_policy_distillation.main_on_policy_distill \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=512
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true
ENFORCE_EAGER=false
LR=1e-5

uv run --isolated --extra fsdp -m examples.train.on_policy_distillation.main_on_policy_distill \
Expand Down
2 changes: 1 addition & 1 deletion examples/train/router_replay/run_dapo_moonlight_16b_a3b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ TRAIN_BATCH_SIZE=128
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true # cuda graphs can cause some instability
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false
LR=1e-6

# megatron config
Expand Down
2 changes: 2 additions & 0 deletions examples/train/tis_correction/run_dapo_tis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ TOP_P=1.0
EVAL_TOP_P=0.7
CLIP_RATIO_C=10.0
MAX_RESPONSE_LENGTH=1024
ENFORCE_EAGER=true # The original DAPO recipe enforced eager mode because of instability with vLLM at the time. TODO: reproduce DAPO with ENFORCE_EAGER=false

uv run --isolated --extra fsdp -m examples.train.tis_correction.main_tis_dapo \
data.train_data="['$DATA_DIR/train.parquet']" \
Expand Down Expand Up @@ -84,6 +85,7 @@ uv run --isolated --extra fsdp -m examples.train.tis_correction.main_tis_dapo \
generator.inference_engine.run_engines_locally=true \
generator.inference_engine.weight_sync_backend=nccl \
generator.inference_engine.async_engine=true \
generator.inference_engine.enforce_eager=$ENFORCE_EAGER \
generator.batched=true \
environment.env_class=gsm8k \
generator.n_samples_per_prompt=5 \
Expand Down
2 changes: 1 addition & 1 deletion skyrl/train/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ class InferenceEngineConfig(BaseConfig):
enable_chunked_prefill: bool = True
enable_return_routed_experts: bool = False
max_num_batched_tokens: int = 8192
enforce_eager: bool = True
enforce_eager: bool = False
"""If ``True``, disable CUDA graphs for stability. The default ``False`` gives higher performance,
but this may affect convergence for long-running or long-context training jobs."""
fully_sharded_loras: bool = False
Expand Down
Loading