Skip to content

Commit afc5dbf

Browse files
fix(checkpointing): Using global step count for checkpoint retention.
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
1 parent 1aa9b56 commit afc5dbf

1 file changed

Lines changed: 2 additions & 2 deletions

File tree

src/modalities/checkpointing/checkpoint_saving_strategies.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,7 @@ def get_checkpoint_instruction(
172 172
if len(self._saved_recent_checkpoints) > self._num_recent_checkpoints_to_keep
173 173
else ([], self._saved_recent_checkpoints)
174 174
)
175-
# Do not delete checkpoints that are divisible by k.
176-
checkpoints_to_delete = [cp for cp in checkpoints_to_delete if cp.num_seen_steps_current_run % self._k != 0]
175+
# Do not delete checkpoints that are divisible by k in total training steps.
176+
checkpoints_to_delete = [cp for cp in checkpoints_to_delete if cp.num_seen_steps_total % self._k != 0]
177 177

178 178
return CheckpointingInstruction(save_current=True, checkpoints_to_delete=checkpoints_to_delete)

0 commit comments

Comments
 (0)