From 98e4be326b28ebdcaa9e493aab52da74d394b424 Mon Sep 17 00:00:00 2001 From: Vu Dinh Date: Sat, 13 Jun 2026 21:42:10 -0400 Subject: [PATCH 1/2] [train] Save HF processor on checkpoint export for VLMs VLM checkpoints exported by save_hf_configs were missing preprocessor_config.json (and image/video processor configs), so AutoProcessor.from_pretrained() and vLLM fail to load the exported checkpoint. Only the model config, tokenizer, and generation config were saved. Resolve and save the processor from model_config.name_or_path when the model is a VLM, mirroring the existing generation_config handling. No-op for text-only models (no vision_config). Adds check_is_vlm/get_processor helpers to skyrl/utils/tok.py. Signed-off-by: Vu Dinh --- .../skyrl_train/distributed/strategy.py | 12 +++ skyrl/utils/tok.py | 25 ++++- .../test_save_hf_configs_processor.py | 96 +++++++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py diff --git a/skyrl/backends/skyrl_train/distributed/strategy.py b/skyrl/backends/skyrl_train/distributed/strategy.py index d606be66db..61f9445683 100644 --- a/skyrl/backends/skyrl_train/distributed/strategy.py +++ b/skyrl/backends/skyrl_train/distributed/strategy.py @@ -11,6 +11,7 @@ from transformers import GenerationConfig, PretrainedConfig, PreTrainedTokenizer from skyrl.backends.skyrl_train.utils.io import io +from skyrl.utils.tok import check_is_vlm, get_processor DataT = TypeVar("DataT", bound=Union[Dict[str, Any], torch.Tensor]) @@ -138,6 +139,17 @@ def save_hf_configs(self, model_config: PretrainedConfig, hf_dir: str, tokenizer # if the generation config isn't available, we don't save it logger.warning(f"Could not save generation config for '{model_config.name_or_path}'. Error: {e}") + # VLMs need preprocessor_config.json (+ image/video processor configs) to be + # reloadable by AutoProcessor; the tokenizer alone is insufficient and vLLM + # crashes on load without it. Resolve from the original base model, same as + # generation_config above. No-op for text-only models (no vision_config). + if check_is_vlm(model_config.name_or_path): + try: + processor = get_processor(model_config.name_or_path) + processor.save_pretrained(work_dir) + except Exception as e: + logger.warning(f"Could not save processor for '{model_config.name_or_path}'. Error: {e}") + @staticmethod def get_rng_state(): """Get current RNG state for reproducibility""" diff --git a/skyrl/utils/tok.py b/skyrl/utils/tok.py index a503b12888..dcc057bf6d 100644 --- a/skyrl/utils/tok.py +++ b/skyrl/utils/tok.py @@ -1,6 +1,29 @@ """Tokenization related utilities""" -from transformers import AutoTokenizer, PreTrainedTokenizerFast +from transformers import ( + AutoConfig, + AutoProcessor, + AutoTokenizer, + PreTrainedTokenizerFast, +) + + +def check_is_vlm(model_name_or_path) -> bool: + """Returns True if the model config declares a non-null ``vision_config``.""" + model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) + return hasattr(model_config, "vision_config") and getattr(model_config, "vision_config") is not None + + +def get_processor(model_name_or_path, **tokenizer_kwargs) -> AutoProcessor: + """Gets processor for the given base model with the given parameters + + Sets the pad token ID to EOS token ID if `None`""" + tokenizer_kwargs.setdefault("trust_remote_code", True) + processor = AutoProcessor.from_pretrained(model_name_or_path, **tokenizer_kwargs) + if processor.tokenizer.pad_token_id is None: + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + processor.tokenizer.pad_token = processor.tokenizer.eos_token + return processor def get_tokenizer(model_name_or_path, **tokenizer_kwargs) -> AutoTokenizer: diff --git a/tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py b/tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py new file mode 100644 index 0000000000..7645ef8e32 --- /dev/null +++ b/tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py @@ -0,0 +1,96 @@ +"""CPU tests for DistributedStrategy.save_hf_configs processor handling. + +Verifies that a vision-language checkpoint export writes the HF processor +(``preprocessor_config.json`` etc.) and that text-only exports do not, without +requiring GPUs, distributed init, or network access. +""" + +from unittest.mock import MagicMock, patch + +from skyrl.backends.skyrl_train.distributed.strategy import DistributedStrategy + + +class _StubStrategy(DistributedStrategy): + """Concrete strategy that stubs every abstractmethod so the CPU-only + ``save_hf_configs`` path can be exercised in isolation.""" + + def setup_distributed(self): # pragma: no cover - stub + pass + + def backward(self, loss, model, optimizer, **kwargs): # pragma: no cover - stub + pass + + def optimizer_step(self, optimizer, model, scheduler, name="model", **kwargs): # pragma: no cover - stub + pass + + def save_checkpoint(self, model, ckpt_dir, node_local_rank, optimizer, scheduler, tokenizer): # pragma: no cover + pass + + def load_checkpoint( + self, model, ckpt_dir, optimizer, scheduler, load_module_strict, load_optimizer_states, load_lr_scheduler_states + ): # pragma: no cover - stub + pass + + def save_hf_model(self, model, output_dir, tokenizer=None, **kwargs): # pragma: no cover - stub + pass + + +def _make_model_config(name_or_path="some/base-model"): + model_config = MagicMock() + model_config.name_or_path = name_or_path + # save_pretrained is a no-op for the test; we only care about the processor. + model_config.save_pretrained = MagicMock() + return model_config + + +_STRATEGY_MOD = "skyrl.backends.skyrl_train.distributed.strategy" + + +class TestSaveHfConfigsProcessor: + def test_saves_processor_for_vlm(self, tmp_path): + """A VLM export resolves the processor and saves it into the HF dir.""" + processor = MagicMock() + with ( + patch(f"{_STRATEGY_MOD}.check_is_vlm", return_value=True) as check_is_vlm, + patch(f"{_STRATEGY_MOD}.get_processor", return_value=processor) as get_processor, + patch(f"{_STRATEGY_MOD}.GenerationConfig"), + ): + _StubStrategy().save_hf_configs(_make_model_config(), str(tmp_path)) + + check_is_vlm.assert_called_once_with("some/base-model") + get_processor.assert_called_once_with("some/base-model") + processor.save_pretrained.assert_called_once_with(str(tmp_path)) + + def test_no_processor_for_text_only(self, tmp_path): + """A text-only export never resolves or saves a processor.""" + with ( + patch(f"{_STRATEGY_MOD}.check_is_vlm", return_value=False) as check_is_vlm, + patch(f"{_STRATEGY_MOD}.get_processor") as get_processor, + patch(f"{_STRATEGY_MOD}.GenerationConfig"), + ): + _StubStrategy().save_hf_configs(_make_model_config(), str(tmp_path)) + + check_is_vlm.assert_called_once_with("some/base-model") + get_processor.assert_not_called() + + def test_processor_failure_does_not_raise(self, tmp_path): + """A processor resolution error is swallowed (export must not crash).""" + with ( + patch(f"{_STRATEGY_MOD}.check_is_vlm", return_value=True), + patch(f"{_STRATEGY_MOD}.get_processor", side_effect=RuntimeError("no processor")), + patch(f"{_STRATEGY_MOD}.GenerationConfig"), + ): + # Should not raise. + _StubStrategy().save_hf_configs(_make_model_config(), str(tmp_path)) + + def test_skipped_when_no_name_or_path(self, tmp_path): + """Models initialized without a base path skip processor + gen config.""" + with ( + patch(f"{_STRATEGY_MOD}.check_is_vlm") as check_is_vlm, + patch(f"{_STRATEGY_MOD}.get_processor") as get_processor, + patch(f"{_STRATEGY_MOD}.GenerationConfig"), + ): + _StubStrategy().save_hf_configs(_make_model_config(name_or_path=""), str(tmp_path)) + + check_is_vlm.assert_not_called() + get_processor.assert_not_called() From fb4de2b1e9baaf1404128b3bf724f79900f6fe95 Mon Sep 17 00:00:00 2001 From: Vu Dinh Date: Sun, 14 Jun 2026 23:20:28 -0400 Subject: [PATCH 2/2] [train] Address review: guard VLM check, reuse loaded config - Move the VLM check + processor resolve/save into a single try/except so a processor-resolution failure (offline/transient node) cannot abort the checkpoint save. - check_is_vlm now accepts a PretrainedConfig or a path; save_hf_configs passes the already-loaded model_config, avoiding a redundant AutoConfig.from_pretrained. - Update tests to assert the object is passed, plus a test that a VLM-check failure is swallowed. Signed-off-by: Vu Dinh --- .../skyrl_train/distributed/strategy.py | 10 +++++---- skyrl/utils/tok.py | 13 +++++++++--- .../test_save_hf_configs_processor.py | 21 ++++++++++++++++--- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/skyrl/backends/skyrl_train/distributed/strategy.py b/skyrl/backends/skyrl_train/distributed/strategy.py index 61f9445683..a6b1bb711b 100644 --- a/skyrl/backends/skyrl_train/distributed/strategy.py +++ b/skyrl/backends/skyrl_train/distributed/strategy.py @@ -143,12 +143,14 @@ def save_hf_configs(self, model_config: PretrainedConfig, hf_dir: str, tokenizer # reloadable by AutoProcessor; the tokenizer alone is insufficient and vLLM # crashes on load without it. Resolve from the original base model, same as # generation_config above. No-op for text-only models (no vision_config). - if check_is_vlm(model_config.name_or_path): - try: + # The VLM check reuses the already-loaded model_config (no extra I/O) and the + # whole block is guarded so a processor-resolution failure can't abort the save. + try: + if check_is_vlm(model_config): processor = get_processor(model_config.name_or_path) processor.save_pretrained(work_dir) - except Exception as e: - logger.warning(f"Could not save processor for '{model_config.name_or_path}'. Error: {e}") + except Exception as e: + logger.warning(f"Could not save processor for '{model_config.name_or_path}'. Error: {e}") @staticmethod def get_rng_state(): diff --git a/skyrl/utils/tok.py b/skyrl/utils/tok.py index dcc057bf6d..41298d3111 100644 --- a/skyrl/utils/tok.py +++ b/skyrl/utils/tok.py @@ -8,9 +8,16 @@ ) -def check_is_vlm(model_name_or_path) -> bool: - """Returns True if the model config declares a non-null ``vision_config``.""" - model_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) +def check_is_vlm(model_config_or_path) -> bool: + """Returns True if the model config declares a non-null ``vision_config``. + + Accepts either an already-loaded ``PretrainedConfig`` or a model name/path. + Passing the config avoids a redundant ``AutoConfig.from_pretrained`` round-trip + when the caller already holds it.""" + if isinstance(model_config_or_path, str): + model_config = AutoConfig.from_pretrained(model_config_or_path, trust_remote_code=True) + else: + model_config = model_config_or_path return hasattr(model_config, "vision_config") and getattr(model_config, "vision_config") is not None diff --git a/tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py b/tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py index 7645ef8e32..e3cfeea020 100644 --- a/tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py +++ b/tests/backends/skyrl_train/distributed/test_save_hf_configs_processor.py @@ -50,27 +50,42 @@ class TestSaveHfConfigsProcessor: def test_saves_processor_for_vlm(self, tmp_path): """A VLM export resolves the processor and saves it into the HF dir.""" processor = MagicMock() + model_config = _make_model_config() with ( patch(f"{_STRATEGY_MOD}.check_is_vlm", return_value=True) as check_is_vlm, patch(f"{_STRATEGY_MOD}.get_processor", return_value=processor) as get_processor, patch(f"{_STRATEGY_MOD}.GenerationConfig"), ): - _StubStrategy().save_hf_configs(_make_model_config(), str(tmp_path)) + _StubStrategy().save_hf_configs(model_config, str(tmp_path)) - check_is_vlm.assert_called_once_with("some/base-model") + # The check reuses the loaded config object (no redundant AutoConfig I/O). + check_is_vlm.assert_called_once_with(model_config) get_processor.assert_called_once_with("some/base-model") processor.save_pretrained.assert_called_once_with(str(tmp_path)) def test_no_processor_for_text_only(self, tmp_path): """A text-only export never resolves or saves a processor.""" + model_config = _make_model_config() with ( patch(f"{_STRATEGY_MOD}.check_is_vlm", return_value=False) as check_is_vlm, patch(f"{_STRATEGY_MOD}.get_processor") as get_processor, patch(f"{_STRATEGY_MOD}.GenerationConfig"), ): + _StubStrategy().save_hf_configs(model_config, str(tmp_path)) + + check_is_vlm.assert_called_once_with(model_config) + get_processor.assert_not_called() + + def test_vlm_check_failure_does_not_raise(self, tmp_path): + """A VLM-detection error is swallowed (export must not crash).""" + with ( + patch(f"{_STRATEGY_MOD}.check_is_vlm", side_effect=RuntimeError("config unavailable")), + patch(f"{_STRATEGY_MOD}.get_processor") as get_processor, + patch(f"{_STRATEGY_MOD}.GenerationConfig"), + ): + # Should not raise even though the VLM check blew up. _StubStrategy().save_hf_configs(_make_model_config(), str(tmp_path)) - check_is_vlm.assert_called_once_with("some/base-model") get_processor.assert_not_called() def test_processor_failure_does_not_raise(self, tmp_path):