Commit a3034c7

Add inverse sqrt learning rate scheduler (#21495)

* Added inverse sqrt lr scheduler
* Updated get_scheduler in src/transformers/optimization.py
* Updated src/transformers/__init__.py
* Added inverse sqrt lr scheduler test
* Updated docs/source/en/main_classes/optimizer_schedules.mdx
* Ran style and quality scripts
* Fix get_inverse_sqrt_schedule docstring
* Comment implementation URL

1 parent b9af152 commit a3034c7

File tree: 6 files changed, +54 -0 lines changed

docs/source/en/main_classes/optimizer_schedules.mdx

Lines changed: 2 additions & 0 deletions

@@ -60,6 +60,8 @@ The `.optimization` module provides:
 
 [[autodoc]] get_polynomial_decay_schedule_with_warmup
 
+[[autodoc]] get_inverse_sqrt_schedule
+
 ### Warmup (TensorFlow)
 
 [[autodoc]] WarmUp

src/transformers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -2588,6 +2588,7 @@
         "get_constant_schedule_with_warmup",
         "get_cosine_schedule_with_warmup",
         "get_cosine_with_hard_restarts_schedule_with_warmup",
+        "get_inverse_sqrt_schedule",
         "get_linear_schedule_with_warmup",
         "get_polynomial_decay_schedule_with_warmup",
         "get_scheduler",
@@ -5659,6 +5660,7 @@
         get_constant_schedule_with_warmup,
         get_cosine_schedule_with_warmup,
         get_cosine_with_hard_restarts_schedule_with_warmup,
+        get_inverse_sqrt_schedule,
         get_linear_schedule_with_warmup,
         get_polynomial_decay_schedule_with_warmup,
         get_scheduler,

src/transformers/optimization.py

Lines changed: 40 additions & 0 deletions

@@ -220,13 +220,50 @@ def lr_lambda(current_step: int):
     return LambdaLR(optimizer, lr_lambda, last_epoch)
 
 
+def get_inverse_sqrt_schedule(
+    optimizer: Optimizer, num_warmup_steps: int, timescale: int = None, last_epoch: int = -1
+):
+    """
+    Create a schedule with an inverse square-root learning rate, from the initial lr set in the optimizer, after a
+    warmup period during which the lr increases linearly from 0 to the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        timescale (`int`, *optional*, defaults to `num_warmup_steps`):
+            Time scale.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    # Note: this implementation is adapted from
+    # https://github.com/google-research/big_vision/blob/f071ce68852d56099437004fd70057597a95f6ef/big_vision/utils.py#L930
+
+    if timescale is None:
+        timescale = num_warmup_steps
+
+    def lr_lambda(current_step: int):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        shift = timescale - num_warmup_steps
+        decay = 1.0 / math.sqrt((current_step + shift) / timescale)
+        return decay
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
 TYPE_TO_SCHEDULER_FUNCTION = {
     SchedulerType.LINEAR: get_linear_schedule_with_warmup,
     SchedulerType.COSINE: get_cosine_schedule_with_warmup,
     SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
     SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
     SchedulerType.CONSTANT: get_constant_schedule,
     SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
+    SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
 }
 
 
@@ -263,6 +300,9 @@ def get_scheduler(
     if name == SchedulerType.CONSTANT_WITH_WARMUP:
         return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
 
+    if name == SchedulerType.INVERSE_SQRT:
+        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
+
     # All other schedulers require `num_training_steps`
     if num_training_steps is None:
         raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
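
For reference, a minimal sketch of how the new scheduler behaves once merged. The toy model and base lr of 10.0 are illustrative choices made to match the test vector added below, not anything prescribed by the diff:

    import torch
    from transformers import get_inverse_sqrt_schedule

    # Any parameters will do for illustration; lr=10.0 mirrors the test below.
    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=10.0)
    scheduler = get_inverse_sqrt_schedule(optimizer, num_warmup_steps=2)

    lrs = []
    for _ in range(10):
        lrs.append(round(scheduler.get_last_lr()[0], 3))
        optimizer.step()
        scheduler.step()

    print(lrs)
    # Expected: [0.0, 5.0, 10.0, 8.165, 7.071, 6.325, 5.774, 5.345, 5.0, 4.714]

Note that with the default timescale = num_warmup_steps, the post-warmup factor 1 / sqrt((step + timescale - num_warmup_steps) / timescale) equals exactly 1.0 at step == num_warmup_steps, so the schedule is continuous at the warmup boundary.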

src/transformers/trainer_utils.py

Lines changed: 1 addition & 0 deletions

@@ -363,6 +363,7 @@ class SchedulerType(ExplicitEnum):
     POLYNOMIAL = "polynomial"
     CONSTANT = "constant"
     CONSTANT_WITH_WARMUP = "constant_with_warmup"
+    INVERSE_SQRT = "inverse_sqrt"
 
 
 class TrainerMemoryTracker:
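
Because the enum now maps the string "inverse_sqrt" to SchedulerType.INVERSE_SQRT, and get_scheduler special-cases it alongside the constant schedules (only num_warmup_steps is required, not num_training_steps), the scheduler can also be requested by name. A sketch, assuming get_scheduler accepts the enum's string value as it does for the other scheduler types:

    import torch
    from transformers import get_scheduler

    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    # "inverse_sqrt" resolves to SchedulerType.INVERSE_SQRT; like the constant
    # schedules, it needs num_warmup_steps but no num_training_steps.
    scheduler = get_scheduler("inverse_sqrt", optimizer, num_warmup_steps=500)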

src/transformers/utils/dummy_pt_objects.py

Lines changed: 4 additions & 0 deletions

@@ -7019,6 +7019,10 @@ def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs):
     requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["torch"])
 
 
+def get_inverse_sqrt_schedule(*args, **kwargs):
+    requires_backends(get_inverse_sqrt_schedule, ["torch"])
+
+
 def get_linear_schedule_with_warmup(*args, **kwargs):
     requires_backends(get_linear_schedule_with_warmup, ["torch"])

tests/optimization/test_optimization.py

Lines changed: 5 additions & 0 deletions

@@ -33,6 +33,7 @@
     get_constant_schedule_with_warmup,
     get_cosine_schedule_with_warmup,
     get_cosine_with_hard_restarts_schedule_with_warmup,
+    get_inverse_sqrt_schedule,
     get_linear_schedule_with_warmup,
     get_polynomial_decay_schedule_with_warmup,
 )
@@ -145,6 +146,10 @@ def test_schedulers(self):
                 {**common_kwargs, "power": 2.0, "lr_end": 1e-7},
                 [0.0, 5.0, 10.0, 7.656, 5.625, 3.906, 2.5, 1.406, 0.625, 0.156],
             ),
+            get_inverse_sqrt_schedule: (
+                {"num_warmup_steps": 2},
+                [0.0, 5.0, 10.0, 8.165, 7.071, 6.325, 5.774, 5.345, 5.0, 4.714],
+            ),
         }
 
         for scheduler_func, data in scheds.items():
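
The expected values in the new test case follow directly from the formula in lr_lambda. A standalone re-derivation (not library code) for base lr 10.0 and num_warmup_steps=2:

    import math

    def expected_lr(step, base_lr=10.0, num_warmup_steps=2, timescale=None):
        # Mirrors lr_lambda from get_inverse_sqrt_schedule, scaled by base_lr.
        timescale = timescale if timescale is not None else num_warmup_steps
        if step < num_warmup_steps:
            return base_lr * step / max(1, num_warmup_steps)
        shift = timescale - num_warmup_steps
        return base_lr / math.sqrt((step + shift) / timescale)

    print([round(expected_lr(step), 3) for step in range(10)])
    # [0.0, 5.0, 10.0, 8.165, 7.071, 6.325, 5.774, 5.345, 5.0, 4.714]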
