Fix layerwise GaLore optimizer failing to converge with warmup scheduler (#30372)

Update optimization.py
2025-07-31 02:02:21 +06:00 · 2024-04-23 00:00:26 +08:00 · 2024-04-23 00:00:26 +08:00 · f3b3533e19
commit f3b3533e19
parent 0d84901cb7
1 changed files with 2 additions and 3 deletions
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -444,9 +444,8 @@ def get_scheduler(

        def scheduler_hook(param):
            # Since the optimizer hook has been already attached we only need to
-            # attach the scheduler hook
-            if param.grad is not None:
-                scheduler_dict[param].step()
+            # attach the scheduler hook, the gradients have been zeroed here
+            scheduler_dict[param].step()

        for param in optimizer_dict.keys():
            if param.requires_grad: