diff --git a/skyrl/backends/skyrl_train/utils/ppo_utils.py b/skyrl/backends/skyrl_train/utils/ppo_utils.py index c2d1386d33..970a911418 100644 --- a/skyrl/backends/skyrl_train/utils/ppo_utils.py +++ b/skyrl/backends/skyrl_train/utils/ppo_utils.py @@ -1286,7 +1286,7 @@ def compute_advantages_and_returns( token_level_rewards: torch.Tensor, response_mask: torch.Tensor, index: np.ndarray, - adv_estimator: AdvantageEstimator, + advantage_estimator: AdvantageEstimator, config: AlgorithmConfig, values: Optional[torch.Tensor] = None, grpo_norm_by_std: bool = True, @@ -1294,7 +1294,7 @@ def compute_advantages_and_returns( lambd=1.0, **kwargs, ) -> tuple[torch.Tensor, torch.Tensor]: - estimator_func = AdvantageEstimatorRegistry.get(adv_estimator) + estimator_func = AdvantageEstimatorRegistry.get(advantage_estimator) return estimator_func( token_level_rewards=token_level_rewards, diff --git a/skyrl/train/trainer.py b/skyrl/train/trainer.py index b968597c91..0e3c9c029a 100644 --- a/skyrl/train/trainer.py +++ b/skyrl/train/trainer.py @@ -1022,7 +1022,7 @@ def compute_advantages_and_returns(self, data: TrainingInputBatch) -> TrainingIn token_level_rewards=token_level_rewards[is_last_step], response_mask=torch.ones_like(last_step_response_mask, dtype=torch.float), index=index[is_last_step.cpu().numpy()], - adv_estimator=self.cfg.trainer.algorithm.advantage_estimator, + advantage_estimator=self.cfg.trainer.algorithm.advantage_estimator, values=values[is_last_step] if values is not None else None, config=self.cfg.trainer.algorithm, gamma=self.cfg.trainer.algorithm.gamma, @@ -1044,7 +1044,7 @@ def compute_advantages_and_returns(self, data: TrainingInputBatch) -> TrainingIn token_level_rewards=token_level_rewards, response_mask=data["response_mask"], index=data.metadata["uids"], - adv_estimator=self.cfg.trainer.algorithm.advantage_estimator, + advantage_estimator=self.cfg.trainer.algorithm.advantage_estimator, config=self.cfg.trainer.algorithm, values=data["values"], gamma=self.cfg.trainer.algorithm.gamma,