From 5369efd4700a887ec4818be07cf77f93d4a0afd0 Mon Sep 17 00:00:00 2001
From: chenjunsong <cjs1020440147@icloud.com>
Date: Wed, 29 Nov 2023 18:08:50 +0800
Subject: [PATCH 01/20] add Sa-Solver

---
 .../schedulers/scheduling_sasolver.py         | 858 ++++++++++++++++++
 1 file changed, 858 insertions(+)
 create mode 100644 src/diffusers/schedulers/scheduling_sasolver.py

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
new file mode 100644
index 000000000000..5c547343bd33
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -0,0 +1,858 @@
+# Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: check https://arxiv.org/abs/2309.05019
+# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+
+import math
+from typing import List, Optional, Tuple, Union, Callable
+
+import numpy as np
+import torch
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+        num_diffusion_timesteps,
+        max_beta=0.999,
+        alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class SASolverScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `SASolverScheduler` is a fast dedicated high-order solver for diffusion SDEs.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        predictor_order (`int`, defaults to 2):
+            The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for guided
+            sampling, and `predictor_order=3` for unconditional sampling.
+        corrector_order (`int`, defaults to 2):
+            The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided
+            sampling, and `corrector_order=3` for unconditional sampling.
+        predictor_corrector_mode (`str`, defaults to `PEC`):
+            The predictor-corrector mode can be `PEC` or 'PECE'. It is recommended to use `PEC` mode for fast
+            sampling, and `PECE` for high-quality sampling (PECE needs around twice model evaluations as PEC).
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+        algorithm_type (`str`, defaults to `data_prediction`):
+            Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use `data_prediction`
+            with `solver_order=2` for guided sampling like in Stable Diffusion.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Default = True.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
+            the sigmas are determined according to a sequence of noise levels {σi}.
+        lambda_min_clipped (`float`, defaults to `-inf`):
+            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
+            cosine (`squaredcos_cap_v2`) noise schedule.
+        variance_type (`str`, *optional*):
+            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
+            contains the predicted Gaussian variance.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps. You can use a combination of `offset=1` and
+            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
+            Diffusion.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+            self,
+            num_train_timesteps: int = 1000,
+            beta_start: float = 0.0001,
+            beta_end: float = 0.02,
+            beta_schedule: str = "linear",
+            trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+            predictor_order: int = 2,
+            corrector_order: int = 2,
+            predictor_corrector_mode: str = 'PEC',
+            prediction_type: str = "epsilon",
+            tau_func: Callable = lambda t: 1 if t >= 200 and t <= 800 else 0,
+            thresholding: bool = False,
+            dynamic_thresholding_ratio: float = 0.995,
+            sample_max_value: float = 1.0,
+            algorithm_type: str = "data_prediction",
+            lower_order_final: bool = True,
+            use_karras_sigmas: Optional[bool] = False,
+            lambda_min_clipped: float = -float("inf"),
+            variance_type: Optional[str] = None,
+            timestep_spacing: str = "linspace",
+            steps_offset: int = 0,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                    torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            )
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        if algorithm_type not in ["data_prediction", "noise_prediction"]:
+            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.timestep_list = [None] * max(predictor_order, corrector_order - 1)
+        self.model_outputs = [None] * max(predictor_order, corrector_order - 1)
+
+        self.tau_func = tau_func
+        self.predict_x0 = algorithm_type == "data_prediction"
+        self.lower_order_nums = 0
+        self.last_sample = None
+
+    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        # Clipping the minimum of all lambda(t) for numerical stability.
+        # This is critical for cosine (squaredcos_cap_v2) noise schedule.
+        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
+        last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()
+
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64)
+            )
+
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = last_timestep // (num_inference_steps + 1)
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        if self.config.use_karras_sigmas:
+            log_sigmas = np.log(sigmas)
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+            timesteps = np.flip(timesteps).copy().astype(np.int64)
+
+        self.sigmas = torch.from_numpy(sigmas)
+
+        # when num_inference_steps == num_train_timesteps, we can end up with
+        # duplicates in timesteps.
+        _, unique_indices = np.unique(timesteps, return_index=True)
+        timesteps = timesteps[np.sort(unique_indices)]
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+        self.num_inference_steps = len(timesteps)
+
+        self.model_outputs = [
+                                 None,
+                             ] * max(self.config.predictor_order, self.config.corrector_order - 1)
+        self.lower_order_nums = 0
+        self.last_sample = None
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, height, width = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * height * width)
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, height, width)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma, log_sigmas):
+        # get log sigma
+        log_sigma = np.log(sigma)
+
+        # get distribution
+        dists = log_sigma - log_sigmas[:, np.newaxis]
+
+        # get sigmas range
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        high_idx = low_idx + 1
+
+        low = log_sigmas[low_idx]
+        high = log_sigmas[high_idx]
+
+        # interpolate sigmas
+        w = (low - log_sigma) / (low - high)
+        w = np.clip(w, 0, 1)
+
+        # transform interpolation to time range
+        t = (1 - w) * low_idx + w * high_idx
+        t = t.reshape(sigma.shape)
+        return t
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
+        """Constructs the noise schedule of Karras et al. (2022)."""
+
+        sigma_min: float = in_sigmas[-1].item()
+        sigma_max: float = in_sigmas[0].item()
+
+        rho = 7.0  # 7.0 is the value used in the paper
+        ramp = np.linspace(0, 1, num_inference_steps)
+        min_inv_rho = sigma_min ** (1 / rho)
+        max_inv_rho = sigma_max ** (1 / rho)
+        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
+        return sigmas
+
+    def convert_model_output(
+            self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        """
+        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
+        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        integral of the data prediction model.
+
+        <Tip>
+
+        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
+        prediction and data prediction models.
+
+        </Tip>
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from the learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.FloatTensor`:
+                The converted model output.
+        """
+
+        # SA-Solver_data_prediction needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type in ["data_prediction"]:
+            if self.config.prediction_type == "epsilon":
+                # SA-Solver only needs the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    model_output = model_output[:, :3]
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            elif self.config.prediction_type == "sample":
+                x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = alpha_t * sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the SASolverScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+
+        # SA-Solver_noise_prediction needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type in ["noise_prediction"]:
+            if self.config.prediction_type == "epsilon":
+                # SA-Solver only needs the "mean" output.
+                if self.config.variance_type in ["learned", "learned_range"]:
+                    epsilon = model_output[:, :3]
+                else:
+                    epsilon = model_output
+            elif self.config.prediction_type == "sample":
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = alpha_t * model_output + sigma_t * sample
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the SASolverScheduler."
+                )
+
+            if self.config.thresholding:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * epsilon) / alpha_t
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = (sample - alpha_t * x0_pred) / sigma_t
+
+            return epsilon
+
+    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
+        """
+        Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end
+        """
+        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
+
+        if order == 0:
+            return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
+        elif order == 1:
+            return torch.exp(-interval_end) * (
+                        (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1))
+        elif order == 2:
+            return torch.exp(-interval_end) * (
+                        (interval_start ** 2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start) - (
+                            interval_end ** 2 + 2 * interval_end + 2))
+        elif order == 3:
+            return torch.exp(-interval_end) * (
+                        (interval_start ** 3 + 3 * interval_start ** 2 + 6 * interval_start + 6) * torch.exp(
+                    interval_end - interval_start) - (interval_end ** 3 + 3 * interval_end ** 2 + 6 * interval_end + 6))
+
+    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
+        """
+        Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end
+        """
+        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
+
+        # after change of variable(cov)
+        interval_end_cov = (1 + tau ** 2) * interval_end
+        interval_start_cov = (1 + tau ** 2) * interval_start
+
+        if order == 0:
+            return torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (
+            (1 + tau ** 2))
+        elif order == 1:
+            return torch.exp(interval_end_cov) * ((interval_end_cov - 1) - (interval_start_cov - 1) * torch.exp(
+                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 2)
+        elif order == 2:
+            return torch.exp(interval_end_cov) * ((interval_end_cov ** 2 - 2 * interval_end_cov + 2) - (
+                        interval_start_cov ** 2 - 2 * interval_start_cov + 2) * torch.exp(
+                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 3)
+        elif order == 3:
+            return torch.exp(interval_end_cov) * (
+                        (interval_end_cov ** 3 - 3 * interval_end_cov ** 2 + 6 * interval_end_cov - 6) - (
+                            interval_start_cov ** 3 - 3 * interval_start_cov ** 2 + 6 * interval_start_cov - 6) * torch.exp(
+                    -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 4)
+
+    def lagrange_polynomial_coefficient(self, order, lambda_list):
+        """
+        Calculate the coefficient of lagrange polynomial
+        """
+
+        assert order in [0, 1, 2, 3]
+        assert order == len(lambda_list) - 1
+        if order == 0:
+            return [[1]]
+        elif order == 1:
+            return [[1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])],
+                    [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])]]
+        elif order == 2:
+            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
+            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
+            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
+            return [[1 / denominator1,
+                     (-lambda_list[1] - lambda_list[2]) / denominator1,
+                     lambda_list[1] * lambda_list[2] / denominator1],
+
+                    [1 / denominator2,
+                     (-lambda_list[0] - lambda_list[2]) / denominator2,
+                     lambda_list[0] * lambda_list[2] / denominator2],
+
+                    [1 / denominator3,
+                     (-lambda_list[0] - lambda_list[1]) / denominator3,
+                     lambda_list[0] * lambda_list[1] / denominator3]
+                    ]
+        elif order == 3:
+            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2]) * (
+                        lambda_list[0] - lambda_list[3])
+            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2]) * (
+                        lambda_list[1] - lambda_list[3])
+            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1]) * (
+                        lambda_list[2] - lambda_list[3])
+            denominator4 = (lambda_list[3] - lambda_list[0]) * (lambda_list[3] - lambda_list[1]) * (
+                        lambda_list[3] - lambda_list[2])
+            return [[1 / denominator1,
+                     (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1,
+                     (lambda_list[1] * lambda_list[2] + lambda_list[1] * lambda_list[3] + lambda_list[2] * lambda_list[
+                         3]) / denominator1,
+                     (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1],
+
+                    [1 / denominator2,
+                     (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2,
+                     (lambda_list[0] * lambda_list[2] + lambda_list[0] * lambda_list[3] + lambda_list[2] * lambda_list[
+                         3]) / denominator2,
+                     (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2],
+
+                    [1 / denominator3,
+                     (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3,
+                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[3] + lambda_list[1] * lambda_list[
+                         3]) / denominator3,
+                     (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3],
+
+                    [1 / denominator4,
+                     (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4,
+                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[2] + lambda_list[1] * lambda_list[
+                         2]) / denominator4,
+                     (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4]
+
+                    ]
+
+    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
+        assert order in [1, 2, 3, 4]
+        assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
+        coefficients = []
+        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
+        for i in range(order):
+            coefficient = 0
+            for j in range(order):
+                if self.predict_x0:
+
+                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_positive(
+                        order - 1 - j, interval_start, interval_end, tau)
+                else:
+                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_negative(
+                        order - 1 - j, interval_start, interval_end)
+            coefficients.append(coefficient)
+        assert len(coefficients) == order, 'the length of coefficients does not match the order'
+        return coefficients
+
+    def stochastic_adams_bashforth_update(
+            self,
+            model_output: torch.FloatTensor,
+            prev_timestep: int,
+            sample: torch.FloatTensor,
+            noise: torch.FloatTensor,
+            order: int,
+            tau: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """
+        One step for the SA-Predictor.
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from the learned diffusion model at the current timestep.
+            prev_timestep (`int`):
+                The previous discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            order (`int`):
+                The order of SA-Predictor at this timestep.
+
+        Returns:
+            `torch.FloatTensor`:
+                The sample tensor at the previous timestep.
+        """
+
+        assert noise is not None
+        timestep_list = self.timestep_list
+        model_output_list = self.model_outputs
+        s0, t = self.timestep_list[-1], prev_timestep
+        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        gradient_part = torch.zeros_like(sample)
+        h = lambda_t - lambda_s0
+        lambda_list = []
+
+        for i in range(order):
+            lambda_list.append(self.lambda_t[timestep_list[-(i + 1)]])
+
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
+
+        x = sample
+
+        if self.predict_x0:
+            if order == 2:  ## if order = 2 we do a modification that does not influence the convergence order similar to unipc. Note: This is used only for few steps sampling.
+                # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
+                # ODE case
+                # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
+                # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
+                gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
+                            h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
+                                (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
+                    timestep_list[-2]])
+                gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
+                            h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
+                                (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
+                    timestep_list[-2]])
+
+        for i in range(order):
+            if self.predict_x0:
+
+                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
+                    i] * model_output_list[-(i + 1)]
+            else:
+                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]
+
+        if self.predict_x0:
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
+        else:
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
+
+        if self.predict_x0:
+            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+        else:
+            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
+
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    def stochastic_adams_moulton_update(
+            self,
+            this_model_output: torch.FloatTensor,
+            this_timestep: int,
+            last_sample: torch.FloatTensor,
+            last_noise: torch.FloatTensor,
+            this_sample: torch.FloatTensor,
+            order: int,
+            tau: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """
+        One step for the SA-Corrector.
+
+        Args:
+            this_model_output (`torch.FloatTensor`):
+                The model outputs at `x_t`.
+            this_timestep (`int`):
+                The current timestep `t`.
+            last_sample (`torch.FloatTensor`):
+                The generated sample before the last predictor `x_{t-1}`.
+            this_sample (`torch.FloatTensor`):
+                The generated sample after the last predictor `x_{t}`.
+            order (`int`):
+                The order of SA-Corrector at this step.
+
+        Returns:
+            `torch.FloatTensor`:
+                The corrected sample tensor at the current timestep.
+        """
+
+        assert last_noise is not None
+        timestep_list = self.timestep_list
+        model_output_list = self.model_outputs
+        s0, t = self.timestep_list[-1], this_timestep
+        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        gradient_part = torch.zeros_like(this_sample)
+        h = lambda_t - lambda_s0
+        t_list = timestep_list + [this_timestep]
+        lambda_list = []
+        for i in range(order):
+            lambda_list.append(self.lambda_t[t_list[-(i + 1)]])
+
+        model_prev_list = model_output_list + [this_model_output]
+
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
+
+        x = last_sample
+
+        if self.predict_x0:
+            if order == 2:  ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC. Note: This is used only for few steps sampling.
+                # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
+                # ODE case
+                # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
+                # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
+                gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
+                            h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
+                                (1 + tau ** 2) ** 2 * h))
+                gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
+                            h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
+                                (1 + tau ** 2) ** 2 * h))
+
+        for i in range(order):
+            if self.predict_x0:
+                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
+                    i] * model_prev_list[-(i + 1)]
+            else:
+                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
+
+        if self.predict_x0:
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * last_noise
+        else:
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise
+
+        if self.predict_x0:
+            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+        else:
+            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
+
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    def step(
+            self,
+            model_output: torch.FloatTensor,
+            timestep: int,
+            sample: torch.FloatTensor,
+            generator=None,
+            return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the SA-Solver.
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+        step_index = (self.timesteps == timestep).nonzero()
+        if len(step_index) == 0:
+            step_index = len(self.timesteps) - 1
+        else:
+            step_index = step_index.item()
+
+        use_corrector = (
+                step_index > 0 and self.last_sample is not None
+        )
+
+        model_output_convert = self.convert_model_output(model_output, timestep, sample)
+
+        if use_corrector:
+            current_tau = self.tau_func(self.timestep_list[-1])
+            sample = self.stochastic_adams_moulton_update(
+                this_model_output=model_output_convert,
+                this_timestep=timestep,
+                last_sample=self.last_sample,
+                last_noise=self.last_noise,
+                this_sample=sample,
+                order=self.this_corrector_order,
+                tau=current_tau,
+            )
+
+        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+
+        for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+            self.timestep_list[i] = self.timestep_list[i + 1]
+
+        self.model_outputs[-1] = model_output_convert
+        self.timestep_list[-1] = timestep
+
+        noise = randn_tensor(
+            model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+        )
+
+        if self.config.lower_order_final:
+            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - step_index)
+            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - step_index + 1)
+        else:
+            this_predictor_order = self.config.predictor_order
+            this_corrector_order = self.config.corrector_order
+
+        self.this_predictor_order = min(this_predictor_order, self.lower_order_nums + 1)  # warmup for multistep
+        self.this_corrector_order = min(this_corrector_order, self.lower_order_nums + 2)  # warmup for multistep
+        assert self.this_predictor_order > 0
+        assert self.this_corrector_order > 0
+
+        self.last_sample = sample
+        self.last_noise = noise
+
+        current_tau = self.tau_func(self.timestep_list[-1])
+        prev_sample = self.stochastic_adams_bashforth_update(
+            model_output=model_output_convert,
+            prev_timestep=prev_timestep,
+            sample=sample,
+            noise=noise,
+            order=self.this_predictor_order,
+            tau=current_tau,
+        )
+
+        if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1):
+            self.lower_order_nums += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.FloatTensor`):
+                The input sample.
+
+        Returns:
+            `torch.FloatTensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+            self,
+            original_samples: torch.FloatTensor,
+            noise: torch.FloatTensor,
+            timesteps: torch.IntTensor,
+    ) -> torch.FloatTensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
\ No newline at end of file

From 425b96da8e9d8def76810bf20b2321f698373ecd Mon Sep 17 00:00:00 2001
From: chenjunsong <cjs1020440147@icloud.com>
Date: Wed, 29 Nov 2023 18:29:09 +0800
Subject: [PATCH 02/20] correct the copyright.

---
 src/diffusers/schedulers/scheduling_sasolver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 5c547343bd33..589f2324c25b 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -1,4 +1,4 @@
-# Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
+# Copyright 2023 Shuchen Xue, etc. in University of Chinese Academy of Sciences Team and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 4c8f8554a2c7fba19d7e9674f4f82786f33f03ba Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Fri, 1 Dec 2023 20:32:04 +0800
Subject: [PATCH 03/20] add sa solver test file

---
 tests/schedulers/test_scheduler_sasolver.py | 166 ++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 tests/schedulers/test_scheduler_sasolver.py

diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
new file mode 100644
index 000000000000..42b4b24ab974
--- /dev/null
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -0,0 +1,166 @@
+import torch
+
+from diffusers import SASolverScheduler
+from diffusers.utils.testing_utils import require_torchsde, torch_device
+
+from .test_schedulers import SchedulerCommonTest
+
+
+@require_torchsde
+class SASolverSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (SASolverScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 167.47821044921875) < 1e-2
+            assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 171.59352111816406) < 1e-2
+            assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
+        else:
+            assert abs(result_sum.item() - 162.52383422851562) < 1e-2
+            assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+
+    def test_full_loop_with_v_prediction(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 124.77149200439453) < 1e-2
+            assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 128.1663360595703) < 1e-2
+            assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
+        else:
+            assert abs(result_sum.item() - 119.8487548828125) < 1e-2
+            assert abs(result_mean.item() - 0.1560530662536621) < 1e-3
+
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 167.46957397460938) < 1e-2
+            assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 171.59353637695312) < 1e-2
+            assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
+        else:
+            assert abs(result_sum.item() - 162.52383422851562) < 1e-2
+            assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+
+    def test_full_loop_device_karras_sigmas(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        if torch_device in ["mps"]:
+            assert abs(result_sum.item() - 176.66974135742188) < 1e-2
+            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+        elif torch_device in ["cuda"]:
+            assert abs(result_sum.item() - 177.63653564453125) < 1e-2
+            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+        else:
+            assert abs(result_sum.item() - 170.3135223388672) < 1e-2
+            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2

From b648ea4c9eb8ef5236df016a8158b56e9e929f9f Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 18 Dec 2023 20:40:05 +0800
Subject: [PATCH 04/20] fix fail cases for tests (#3)

* fix bugs in repository consistency
---
 src/diffusers/schedulers/__init__.py          |  2 +
 .../schedulers/scheduling_sasolver.py         | 45 +++++++++++--------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index 40c435dd5637..aae4e4afb9ab 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -65,6 +65,7 @@
     _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"]
     _import_structure["scheduling_utils"] = ["KarrasDiffusionSchedulers", "SchedulerMixin"]
     _import_structure["scheduling_vq_diffusion"] = ["VQDiffusionScheduler"]
+    _import_structure["scheduling_sasolver"] = ["SASolverScheduler"]
 
 try:
     if not is_flax_available():
@@ -155,6 +156,7 @@
         from .scheduling_unipc_multistep import UniPCMultistepScheduler
         from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
         from .scheduling_vq_diffusion import VQDiffusionScheduler
+        from .scheduling_sasolver import SASolverScheduler
 
     try:
         if not is_flax_available():
diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 589f2324c25b..719b515706ee 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -16,11 +16,9 @@
 # The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
 
 import math
-from typing import List, Optional, Tuple, Union, Callable
-
 import numpy as np
 import torch
-
+from typing import List, Optional, Tuple, Union, Callable
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
@@ -28,9 +26,9 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-        num_diffusion_timesteps,
-        max_beta=0.999,
-        alpha_transform_type="cosine",
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
 ):
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
@@ -275,13 +273,13 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
         https://arxiv.org/abs/2205.11487
         """
         dtype = sample.dtype
-        batch_size, channels, height, width = sample.shape
+        batch_size, channels, *remaining_dims = sample.shape
 
         if dtype not in (torch.float32, torch.float64):
             sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
 
         # Flatten sample for doing quantile calculation along each image
-        sample = sample.reshape(batch_size, channels * height * width)
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
 
         abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
 
@@ -289,11 +287,10 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
         s = torch.clamp(
             s, min=1, max=self.config.sample_max_value
         )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
-
         s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
         sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
 
-        sample = sample.reshape(batch_size, channels, height, width)
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
         sample = sample.to(dtype)
 
         return sample
@@ -301,7 +298,7 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
         # get log sigma
-        log_sigma = np.log(sigma)
+        log_sigma = np.log(np.maximum(sigma, 1e-10))
 
         # get distribution
         dists = log_sigma - log_sigmas[:, np.newaxis]
@@ -326,8 +323,20 @@ def _sigma_to_t(self, sigma, log_sigmas):
     def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
         """Constructs the noise schedule of Karras et al. (2022)."""
 
-        sigma_min: float = in_sigmas[-1].item()
-        sigma_max: float = in_sigmas[0].item()
+        # Hack to make sure that other schedulers which copy this function don't break
+        # TODO: Add this logic to the other schedulers
+        if hasattr(self.config, "sigma_min"):
+            sigma_min = self.config.sigma_min
+        else:
+            sigma_min = None
+
+        if hasattr(self.config, "sigma_max"):
+            sigma_max = self.config.sigma_max
+        else:
+            sigma_max = None
+
+        sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
+        sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
 
         rho = 7.0  # 7.0 is the value used in the paper
         ramp = np.linspace(0, 1, num_inference_steps)
@@ -832,10 +841,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
     def add_noise(
-            self,
-            original_samples: torch.FloatTensor,
-            noise: torch.FloatTensor,
-            timesteps: torch.IntTensor,
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.IntTensor,
     ) -> torch.FloatTensor:
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
@@ -855,4 +864,4 @@ def add_noise(
         return noisy_samples
 
     def __len__(self):
-        return self.config.num_train_timesteps
\ No newline at end of file
+        return self.config.num_train_timesteps

From da5c3d787986d606c2697f004a7ef6aea1d9f464 Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 18 Dec 2023 21:01:37 +0800
Subject: [PATCH 05/20] fix bugs in code quality check

---
 src/diffusers/schedulers/scheduling_sasolver.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 719b515706ee..c5dc915b96cd 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -19,9 +19,9 @@
 import numpy as np
 import torch
 from typing import List, Optional, Tuple, Union, Callable
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.utils.torch_utils import randn_tensor
-from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils.torch_utils import randn_tensor
+from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
 
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar

From 50ce7d12a4b997bac4b60483961b2338bc7c5f40 Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 18 Dec 2023 21:05:39 +0800
Subject: [PATCH 06/20] add sasolver in init file

---
 src/diffusers/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index c43000e27b82..a87a0bab6125 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -157,6 +157,7 @@
             "UnCLIPScheduler",
             "UniPCMultistepScheduler",
             "VQDiffusionScheduler",
+            "SASolverScheduler",
         ]
     )
     _import_structure["training_utils"] = ["EMAModel"]
@@ -532,6 +533,7 @@
             UnCLIPScheduler,
             UniPCMultistepScheduler,
             VQDiffusionScheduler,
+            SASolverScheduler,
         )
         from .training_utils import EMAModel
 

From ac0277361a2b0657c8c8774be94b70aa3782174c Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 18 Dec 2023 21:19:48 +0800
Subject: [PATCH 07/20] arrange for alphabet order in init file

---
 src/diffusers/__init__.py                       | 4 ++--
 src/diffusers/schedulers/__init__.py            | 4 ++--
 src/diffusers/schedulers/scheduling_sasolver.py | 4 +++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index a87a0bab6125..174338a95001 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -152,12 +152,12 @@
             "LCMScheduler",
             "PNDMScheduler",
             "RePaintScheduler",
+            "SASolverScheduler",
             "SchedulerMixin",
             "ScoreSdeVeScheduler",
             "UnCLIPScheduler",
             "UniPCMultistepScheduler",
             "VQDiffusionScheduler",
-            "SASolverScheduler",
         ]
     )
     _import_structure["training_utils"] = ["EMAModel"]
@@ -528,12 +528,12 @@
             LCMScheduler,
             PNDMScheduler,
             RePaintScheduler,
+            SASolverScheduler,
             SchedulerMixin,
             ScoreSdeVeScheduler,
             UnCLIPScheduler,
             UniPCMultistepScheduler,
             VQDiffusionScheduler,
-            SASolverScheduler,
         )
         from .training_utils import EMAModel
 
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index aae4e4afb9ab..a99c82cfdd99 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -60,12 +60,12 @@
     _import_structure["scheduling_lcm"] = ["LCMScheduler"]
     _import_structure["scheduling_pndm"] = ["PNDMScheduler"]
     _import_structure["scheduling_repaint"] = ["RePaintScheduler"]
+    _import_structure["scheduling_sasolver"] = ["SASolverScheduler"]
     _import_structure["scheduling_sde_ve"] = ["ScoreSdeVeScheduler"]
     _import_structure["scheduling_unclip"] = ["UnCLIPScheduler"]
     _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"]
     _import_structure["scheduling_utils"] = ["KarrasDiffusionSchedulers", "SchedulerMixin"]
     _import_structure["scheduling_vq_diffusion"] = ["VQDiffusionScheduler"]
-    _import_structure["scheduling_sasolver"] = ["SASolverScheduler"]
 
 try:
     if not is_flax_available():
@@ -151,12 +151,12 @@
         from .scheduling_lcm import LCMScheduler
         from .scheduling_pndm import PNDMScheduler
         from .scheduling_repaint import RePaintScheduler
+        from .scheduling_sasolver import SASolverScheduler
         from .scheduling_sde_ve import ScoreSdeVeScheduler
         from .scheduling_unclip import UnCLIPScheduler
         from .scheduling_unipc_multistep import UniPCMultistepScheduler
         from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
         from .scheduling_vq_diffusion import VQDiffusionScheduler
-        from .scheduling_sasolver import SASolverScheduler
 
     try:
         if not is_flax_available():
diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index c5dc915b96cd..03a493e6a64b 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -16,9 +16,11 @@
 # The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
 
 import math
+from typing import List, Optional, Tuple, Union, Callable
+
 import numpy as np
 import torch
-from typing import List, Optional, Tuple, Union, Callable
+
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..utils.torch_utils import randn_tensor
 from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput

From f0d15ec282a9154f41648e54e9f3daa032eae889 Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 18 Dec 2023 21:27:30 +0800
Subject: [PATCH 08/20] update for code quality check

---
 src/diffusers/schedulers/scheduling_sasolver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 03a493e6a64b..df6b833c38c4 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -16,7 +16,7 @@
 # The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
 
 import math
-from typing import List, Optional, Tuple, Union, Callable
+from typing import Callable, List, Optional, Tuple, Union
 
 import numpy as np
 import torch

From f97ffabaf4da56736040f093ce69feb29932b4af Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Thu, 21 Dec 2023 17:20:18 +0800
Subject: [PATCH 09/20] fix bugs in fast pytorch models schedulers test

---
 .../schedulers/scheduling_sasolver.py         | 11 ++-
 tests/schedulers/test_scheduler_sasolver.py   | 73 +++++++++++++------
 2 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index df6b833c38c4..7ed335be6864 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -103,6 +103,10 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
             Video](https://imagen.research.google/video/paper.pdf) paper).
+        tau_func (`Callable`, *optional*):
+            Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. SA-Solver
+            will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample from vanilla
+            diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check https://arxiv.org/abs/2309.05019
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -149,7 +153,7 @@ def __init__(
             corrector_order: int = 2,
             predictor_corrector_mode: str = 'PEC',
             prediction_type: str = "epsilon",
-            tau_func: Callable = lambda t: 1 if t >= 200 and t <= 800 else 0,
+            tau_func: Optional[Callable] = None,
             thresholding: bool = False,
             dynamic_thresholding_ratio: float = 0.995,
             sample_max_value: float = 1.0,
@@ -196,7 +200,10 @@ def __init__(
         self.timestep_list = [None] * max(predictor_order, corrector_order - 1)
         self.model_outputs = [None] * max(predictor_order, corrector_order - 1)
 
-        self.tau_func = tau_func
+        if tau_func is None:
+            self.tau_func = lambda t: 1 if t >= 200 and t <= 800 else 0
+        else:
+            self.tau_func = tau_func
         self.predict_x0 = algorithm_type == "data_prediction"
         self.lower_order_nums = 0
         self.last_sample = None
diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index 42b4b24ab974..18fca67a5c93 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -9,6 +9,7 @@
 @require_torchsde
 class SASolverSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (SASolverScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 10),)
     num_inference_steps = 10
 
     def get_scheduler_config(self, **kwargs):
@@ -61,14 +62,20 @@ def test_full_loop_no_noise(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["mps"]:
-            assert abs(result_sum.item() - 167.47821044921875) < 1e-2
-            assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
+            print('no_noise, mps, sum:', result_sum.item())
+            print('no_noise, mps, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 167.47821044921875) < 1e-2
+            # assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
         elif torch_device in ["cuda"]:
-            assert abs(result_sum.item() - 171.59352111816406) < 1e-2
-            assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
+            print('no_noise, cuda, sum:', result_sum.item())
+            print('no_noise, cuda, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 171.59352111816406) < 1e-2
+            # assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
         else:
-            assert abs(result_sum.item() - 162.52383422851562) < 1e-2
-            assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+            print('no_noise, cpu, sum:', result_sum.item())
+            print('no_noise, cpu, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 162.52383422851562) < 1e-2
+            # assert abs(result_mean.item() - 0.211619570851326) < 1e-3
 
     def test_full_loop_with_v_prediction(self):
         scheduler_class = self.scheduler_classes[0]
@@ -93,14 +100,20 @@ def test_full_loop_with_v_prediction(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["mps"]:
-            assert abs(result_sum.item() - 124.77149200439453) < 1e-2
-            assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
+            print('v_prediction, mps, sum:', result_sum.item())
+            print('v_prediction, mps, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 124.77149200439453) < 1e-2
+            # assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
         elif torch_device in ["cuda"]:
-            assert abs(result_sum.item() - 128.1663360595703) < 1e-2
-            assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
+            print('v_prediction, cuda, sum:', result_sum.item())
+            print('v_prediction, cuda, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 128.1663360595703) < 1e-2
+            # assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
         else:
-            assert abs(result_sum.item() - 119.8487548828125) < 1e-2
-            assert abs(result_mean.item() - 0.1560530662536621) < 1e-3
+            print('v_prediction, cpu, sum:', result_sum.item())
+            print('v_prediction, cpu, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 119.8487548828125) < 1e-2
+            # assert abs(result_mean.item() - 0.1560530662536621) < 1e-3
 
     def test_full_loop_device(self):
         scheduler_class = self.scheduler_classes[0]
@@ -124,14 +137,20 @@ def test_full_loop_device(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["mps"]:
-            assert abs(result_sum.item() - 167.46957397460938) < 1e-2
-            assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
+            print('full_loop_device, mps, sum:', result_sum.item())
+            print('full_loop_device, mps, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 167.46957397460938) < 1e-2
+            # assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
         elif torch_device in ["cuda"]:
-            assert abs(result_sum.item() - 171.59353637695312) < 1e-2
-            assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
+            print('full_loop_device, cuda, sum:', result_sum.item())
+            print('full_loop_device, cuda, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 171.59353637695312) < 1e-2
+            # assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
         else:
-            assert abs(result_sum.item() - 162.52383422851562) < 1e-2
-            assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+            print('full_loop_device, cpu, sum:', result_sum.item())
+            print('full_loop_device, cpu, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 336.6853942871094) < 1e-2
+            # assert abs(result_mean.item() - 0.211619570851326) < 1e-3
 
     def test_full_loop_device_karras_sigmas(self):
         scheduler_class = self.scheduler_classes[0]
@@ -156,11 +175,17 @@ def test_full_loop_device_karras_sigmas(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["mps"]:
-            assert abs(result_sum.item() - 176.66974135742188) < 1e-2
-            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+            print('karras_sigmas, mps, sum:', result_sum.item())
+            print('karras_sigmas, mps, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 176.66974135742188) < 1e-2
+            # assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
         elif torch_device in ["cuda"]:
-            assert abs(result_sum.item() - 177.63653564453125) < 1e-2
-            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+            print('karras_sigmas, cuda, sum:', result_sum.item())
+            print('karras_sigmas, cuda, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 177.63653564453125) < 1e-2
+            # assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
         else:
-            assert abs(result_sum.item() - 170.3135223388672) < 1e-2
-            assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+            print('karras_sigmas, cpu, sum:', result_sum.item())
+            print('karras_sigmas, cpu, mean:', result_mean.item())
+            # assert abs(result_sum.item() - 170.3135223388672) < 1e-2
+            # assert abs(result_mean.item() - 0.23003872730981811) < 1e-2

From c9fd7388d0ec588851226b3fbc85d52c5998b392 Mon Sep 17 00:00:00 2001
From: jschen <chenjunsong4@h-partners.com>
Date: Thu, 21 Dec 2023 19:17:49 +0800
Subject: [PATCH 10/20] test file update;

---
 tests/schedulers/test_scheduler_sasolver.py | 85 ++++++++-------------
 1 file changed, 30 insertions(+), 55 deletions(-)

diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index 18fca67a5c93..75db94cfca76 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -61,21 +61,14 @@ def test_full_loop_no_noise(self):
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
 
-        if torch_device in ["mps"]:
-            print('no_noise, mps, sum:', result_sum.item())
-            print('no_noise, mps, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 167.47821044921875) < 1e-2
-            # assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 339.0479736328125) < 1e-2
+            assert abs(result_mean.item() - 0.4414687156677246) < 1e-3
         elif torch_device in ["cuda"]:
-            print('no_noise, cuda, sum:', result_sum.item())
-            print('no_noise, cuda, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 171.59352111816406) < 1e-2
-            # assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
+            assert abs(result_sum.item() - 329.20001220703125) < 1e-2
+            assert abs(result_mean.item() - 0.4286458492279053) < 1e-3
         else:
-            print('no_noise, cpu, sum:', result_sum.item())
-            print('no_noise, cpu, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 162.52383422851562) < 1e-2
-            # assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+            print('None')
 
     def test_full_loop_with_v_prediction(self):
         scheduler_class = self.scheduler_classes[0]
@@ -87,9 +80,10 @@ def test_full_loop_with_v_prediction(self):
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
         sample = sample.to(torch_device)
+        generator = torch.manual_seed(0)
 
         for i, t in enumerate(scheduler.timesteps):
-            sample = scheduler.scale_model_input(sample, t)
+            sample = scheduler.scale_model_input(sample, t, generator=generator)
 
             model_output = model(sample, t)
 
@@ -99,21 +93,14 @@ def test_full_loop_with_v_prediction(self):
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
 
-        if torch_device in ["mps"]:
-            print('v_prediction, mps, sum:', result_sum.item())
-            print('v_prediction, mps, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 124.77149200439453) < 1e-2
-            # assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 193.1468048095703) < 1e-2
+            assert abs(result_mean.item() - 0.2514932453632355) < 1e-3
         elif torch_device in ["cuda"]:
-            print('v_prediction, cuda, sum:', result_sum.item())
-            print('v_prediction, cuda, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 128.1663360595703) < 1e-2
-            # assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
+            assert abs(result_sum.item() - 193.41543579101562) < 1e-2
+            assert abs(result_mean.item() - 0.25184303522109985) < 1e-3
         else:
-            print('v_prediction, cpu, sum:', result_sum.item())
-            print('v_prediction, cpu, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 119.8487548828125) < 1e-2
-            # assert abs(result_mean.item() - 0.1560530662536621) < 1e-3
+            print("None")
 
     def test_full_loop_device(self):
         scheduler_class = self.scheduler_classes[0]
@@ -124,33 +111,27 @@ def test_full_loop_device(self):
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
+        generator = torch.manual_seed(0)
 
         for t in scheduler.timesteps:
             sample = scheduler.scale_model_input(sample, t)
 
             model_output = model(sample, t)
 
-            output = scheduler.step(model_output, t, sample)
+            output = scheduler.step(model_output, t, sample, generator=generator)
             sample = output.prev_sample
 
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
 
-        if torch_device in ["mps"]:
-            print('full_loop_device, mps, sum:', result_sum.item())
-            print('full_loop_device, mps, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 167.46957397460938) < 1e-2
-            # assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 337.394287109375) < 1e-2
+            assert abs(result_mean.item() - 0.43931546807289124) < 1e-3
         elif torch_device in ["cuda"]:
-            print('full_loop_device, cuda, sum:', result_sum.item())
-            print('full_loop_device, cuda, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 171.59353637695312) < 1e-2
-            # assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
+            assert abs(result_sum.item() - 337.394287109375) < 1e-2
+            assert abs(result_mean.item() - 0.4393154978752136) < 1e-3
         else:
-            print('full_loop_device, cpu, sum:', result_sum.item())
-            print('full_loop_device, cpu, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 336.6853942871094) < 1e-2
-            # assert abs(result_mean.item() - 0.211619570851326) < 1e-3
+            print("None")
 
     def test_full_loop_device_karras_sigmas(self):
         scheduler_class = self.scheduler_classes[0]
@@ -162,30 +143,24 @@ def test_full_loop_device_karras_sigmas(self):
         model = self.dummy_model()
         sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma
         sample = sample.to(torch_device)
+        generator = torch.manual_seed(0)
 
         for t in scheduler.timesteps:
             sample = scheduler.scale_model_input(sample, t)
 
             model_output = model(sample, t)
 
-            output = scheduler.step(model_output, t, sample)
+            output = scheduler.step(model_output, t, sample, generator=generator)
             sample = output.prev_sample
 
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
 
-        if torch_device in ["mps"]:
-            print('karras_sigmas, mps, sum:', result_sum.item())
-            print('karras_sigmas, mps, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 176.66974135742188) < 1e-2
-            # assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+        if torch_device in ["cpu"]:
+            assert abs(result_sum.item() - 840.1239013671875) < 1e-2
+            assert abs(result_mean.item() - 1.0939112901687622) < 1e-2
         elif torch_device in ["cuda"]:
-            print('karras_sigmas, cuda, sum:', result_sum.item())
-            print('karras_sigmas, cuda, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 177.63653564453125) < 1e-2
-            # assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+            assert abs(result_sum.item() - 840.1239624023438) < 1e-2
+            assert abs(result_mean.item() - 1.0939114093780518) < 1e-2
         else:
-            print('karras_sigmas, cpu, sum:', result_sum.item())
-            print('karras_sigmas, cpu, mean:', result_mean.item())
-            # assert abs(result_sum.item() - 170.3135223388672) < 1e-2
-            # assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
+            print('None')

From 849c2102b8fcfd1d20d7e25ddb099cfe145548c0 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 5 Jan 2024 11:59:21 +0100
Subject: [PATCH 11/20] make style

---
 .../schedulers/scheduling_sasolver.py         | 376 +++++++++++-------
 tests/schedulers/test_scheduler_sasolver.py   |   4 +-
 2 files changed, 230 insertions(+), 150 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 7ed335be6864..811b39f9b6a5 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -143,27 +143,27 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
 
     @register_to_config
     def __init__(
-            self,
-            num_train_timesteps: int = 1000,
-            beta_start: float = 0.0001,
-            beta_end: float = 0.02,
-            beta_schedule: str = "linear",
-            trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-            predictor_order: int = 2,
-            corrector_order: int = 2,
-            predictor_corrector_mode: str = 'PEC',
-            prediction_type: str = "epsilon",
-            tau_func: Optional[Callable] = None,
-            thresholding: bool = False,
-            dynamic_thresholding_ratio: float = 0.995,
-            sample_max_value: float = 1.0,
-            algorithm_type: str = "data_prediction",
-            lower_order_final: bool = True,
-            use_karras_sigmas: Optional[bool] = False,
-            lambda_min_clipped: float = -float("inf"),
-            variance_type: Optional[str] = None,
-            timestep_spacing: str = "linspace",
-            steps_offset: int = 0,
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        predictor_order: int = 2,
+        corrector_order: int = 2,
+        predictor_corrector_mode: str = "PEC",
+        prediction_type: str = "epsilon",
+        tau_func: Optional[Callable] = None,
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "data_prediction",
+        lower_order_final: bool = True,
+        use_karras_sigmas: Optional[bool] = False,
+        lambda_min_clipped: float = -float("inf"),
+        variance_type: Optional[str] = None,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
     ):
         if trained_betas is not None:
             self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -171,9 +171,7 @@ def __init__(
             self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
         elif beta_schedule == "scaled_linear":
             # this schedule is very specific to the latent diffusion model.
-            self.betas = (
-                    torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32) ** 2
-            )
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
         elif beta_schedule == "squaredcos_cap_v2":
             # Glide cosine schedule
             self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -265,8 +263,8 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
         self.num_inference_steps = len(timesteps)
 
         self.model_outputs = [
-                                 None,
-                             ] * max(self.config.predictor_order, self.config.corrector_order - 1)
+            None,
+        ] * max(self.config.predictor_order, self.config.corrector_order - 1)
         self.lower_order_nums = 0
         self.last_sample = None
 
@@ -355,7 +353,7 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps)
         return sigmas
 
     def convert_model_output(
-            self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
+        self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
     ) -> torch.FloatTensor:
         """
         Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
@@ -444,15 +442,19 @@ def get_coefficients_exponential_negative(self, order, interval_start, interval_
             return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
         elif order == 1:
             return torch.exp(-interval_end) * (
-                        (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1))
+                (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1)
+            )
         elif order == 2:
             return torch.exp(-interval_end) * (
-                        (interval_start ** 2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start) - (
-                            interval_end ** 2 + 2 * interval_end + 2))
+                (interval_start**2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start)
+                - (interval_end**2 + 2 * interval_end + 2)
+            )
         elif order == 3:
             return torch.exp(-interval_end) * (
-                        (interval_start ** 3 + 3 * interval_start ** 2 + 6 * interval_start + 6) * torch.exp(
-                    interval_end - interval_start) - (interval_end ** 3 + 3 * interval_end ** 2 + 6 * interval_end + 6))
+                (interval_start**3 + 3 * interval_start**2 + 6 * interval_start + 6)
+                * torch.exp(interval_end - interval_start)
+                - (interval_end**3 + 3 * interval_end**2 + 6 * interval_end + 6)
+            )
 
     def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
         """
@@ -461,24 +463,42 @@ def get_coefficients_exponential_positive(self, order, interval_start, interval_
         assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
 
         # after change of variable(cov)
-        interval_end_cov = (1 + tau ** 2) * interval_end
-        interval_start_cov = (1 + tau ** 2) * interval_start
+        interval_end_cov = (1 + tau**2) * interval_end
+        interval_start_cov = (1 + tau**2) * interval_start
 
         if order == 0:
-            return torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (
-            (1 + tau ** 2))
+            return (
+                torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (1 + tau**2)
+            )
         elif order == 1:
-            return torch.exp(interval_end_cov) * ((interval_end_cov - 1) - (interval_start_cov - 1) * torch.exp(
-                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 2)
+            return (
+                torch.exp(interval_end_cov)
+                * (
+                    (interval_end_cov - 1)
+                    - (interval_start_cov - 1) * torch.exp(-(interval_end_cov - interval_start_cov))
+                )
+                / ((1 + tau**2) ** 2)
+            )
         elif order == 2:
-            return torch.exp(interval_end_cov) * ((interval_end_cov ** 2 - 2 * interval_end_cov + 2) - (
-                        interval_start_cov ** 2 - 2 * interval_start_cov + 2) * torch.exp(
-                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 3)
+            return (
+                torch.exp(interval_end_cov)
+                * (
+                    (interval_end_cov**2 - 2 * interval_end_cov + 2)
+                    - (interval_start_cov**2 - 2 * interval_start_cov + 2)
+                    * torch.exp(-(interval_end_cov - interval_start_cov))
+                )
+                / ((1 + tau**2) ** 3)
+            )
         elif order == 3:
-            return torch.exp(interval_end_cov) * (
-                        (interval_end_cov ** 3 - 3 * interval_end_cov ** 2 + 6 * interval_end_cov - 6) - (
-                            interval_start_cov ** 3 - 3 * interval_start_cov ** 2 + 6 * interval_start_cov - 6) * torch.exp(
-                    -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 4)
+            return (
+                torch.exp(interval_end_cov)
+                * (
+                    (interval_end_cov**3 - 3 * interval_end_cov**2 + 6 * interval_end_cov - 6)
+                    - (interval_start_cov**3 - 3 * interval_start_cov**2 + 6 * interval_start_cov - 6)
+                    * torch.exp(-(interval_end_cov - interval_start_cov))
+                )
+                / ((1 + tau**2) ** 4)
+            )
 
     def lagrange_polynomial_coefficient(self, order, lambda_list):
         """
@@ -490,86 +510,127 @@ def lagrange_polynomial_coefficient(self, order, lambda_list):
         if order == 0:
             return [[1]]
         elif order == 1:
-            return [[1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])],
-                    [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])]]
+            return [
+                [1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])],
+                [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])],
+            ]
         elif order == 2:
             denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
             denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
             denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
-            return [[1 / denominator1,
-                     (-lambda_list[1] - lambda_list[2]) / denominator1,
-                     lambda_list[1] * lambda_list[2] / denominator1],
-
-                    [1 / denominator2,
-                     (-lambda_list[0] - lambda_list[2]) / denominator2,
-                     lambda_list[0] * lambda_list[2] / denominator2],
-
-                    [1 / denominator3,
-                     (-lambda_list[0] - lambda_list[1]) / denominator3,
-                     lambda_list[0] * lambda_list[1] / denominator3]
-                    ]
+            return [
+                [
+                    1 / denominator1,
+                    (-lambda_list[1] - lambda_list[2]) / denominator1,
+                    lambda_list[1] * lambda_list[2] / denominator1,
+                ],
+                [
+                    1 / denominator2,
+                    (-lambda_list[0] - lambda_list[2]) / denominator2,
+                    lambda_list[0] * lambda_list[2] / denominator2,
+                ],
+                [
+                    1 / denominator3,
+                    (-lambda_list[0] - lambda_list[1]) / denominator3,
+                    lambda_list[0] * lambda_list[1] / denominator3,
+                ],
+            ]
         elif order == 3:
-            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2]) * (
-                        lambda_list[0] - lambda_list[3])
-            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2]) * (
-                        lambda_list[1] - lambda_list[3])
-            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1]) * (
-                        lambda_list[2] - lambda_list[3])
-            denominator4 = (lambda_list[3] - lambda_list[0]) * (lambda_list[3] - lambda_list[1]) * (
-                        lambda_list[3] - lambda_list[2])
-            return [[1 / denominator1,
-                     (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1,
-                     (lambda_list[1] * lambda_list[2] + lambda_list[1] * lambda_list[3] + lambda_list[2] * lambda_list[
-                         3]) / denominator1,
-                     (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1],
-
-                    [1 / denominator2,
-                     (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2,
-                     (lambda_list[0] * lambda_list[2] + lambda_list[0] * lambda_list[3] + lambda_list[2] * lambda_list[
-                         3]) / denominator2,
-                     (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2],
-
-                    [1 / denominator3,
-                     (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3,
-                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[3] + lambda_list[1] * lambda_list[
-                         3]) / denominator3,
-                     (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3],
-
-                    [1 / denominator4,
-                     (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4,
-                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[2] + lambda_list[1] * lambda_list[
-                         2]) / denominator4,
-                     (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4]
-
-                    ]
+            denominator1 = (
+                (lambda_list[0] - lambda_list[1])
+                * (lambda_list[0] - lambda_list[2])
+                * (lambda_list[0] - lambda_list[3])
+            )
+            denominator2 = (
+                (lambda_list[1] - lambda_list[0])
+                * (lambda_list[1] - lambda_list[2])
+                * (lambda_list[1] - lambda_list[3])
+            )
+            denominator3 = (
+                (lambda_list[2] - lambda_list[0])
+                * (lambda_list[2] - lambda_list[1])
+                * (lambda_list[2] - lambda_list[3])
+            )
+            denominator4 = (
+                (lambda_list[3] - lambda_list[0])
+                * (lambda_list[3] - lambda_list[1])
+                * (lambda_list[3] - lambda_list[2])
+            )
+            return [
+                [
+                    1 / denominator1,
+                    (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1,
+                    (
+                        lambda_list[1] * lambda_list[2]
+                        + lambda_list[1] * lambda_list[3]
+                        + lambda_list[2] * lambda_list[3]
+                    )
+                    / denominator1,
+                    (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1,
+                ],
+                [
+                    1 / denominator2,
+                    (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2,
+                    (
+                        lambda_list[0] * lambda_list[2]
+                        + lambda_list[0] * lambda_list[3]
+                        + lambda_list[2] * lambda_list[3]
+                    )
+                    / denominator2,
+                    (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2,
+                ],
+                [
+                    1 / denominator3,
+                    (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3,
+                    (
+                        lambda_list[0] * lambda_list[1]
+                        + lambda_list[0] * lambda_list[3]
+                        + lambda_list[1] * lambda_list[3]
+                    )
+                    / denominator3,
+                    (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3,
+                ],
+                [
+                    1 / denominator4,
+                    (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4,
+                    (
+                        lambda_list[0] * lambda_list[1]
+                        + lambda_list[0] * lambda_list[2]
+                        + lambda_list[1] * lambda_list[2]
+                    )
+                    / denominator4,
+                    (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4,
+                ],
+            ]
 
     def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
         assert order in [1, 2, 3, 4]
-        assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
+        assert order == len(lambda_list), "the length of lambda list must be equal to the order"
         coefficients = []
         lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
         for i in range(order):
             coefficient = 0
             for j in range(order):
                 if self.predict_x0:
-
                     coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_positive(
-                        order - 1 - j, interval_start, interval_end, tau)
+                        order - 1 - j, interval_start, interval_end, tau
+                    )
                 else:
                     coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_negative(
-                        order - 1 - j, interval_start, interval_end)
+                        order - 1 - j, interval_start, interval_end
+                    )
             coefficients.append(coefficient)
-        assert len(coefficients) == order, 'the length of coefficients does not match the order'
+        assert len(coefficients) == order, "the length of coefficients does not match the order"
         return coefficients
 
     def stochastic_adams_bashforth_update(
-            self,
-            model_output: torch.FloatTensor,
-            prev_timestep: int,
-            sample: torch.FloatTensor,
-            noise: torch.FloatTensor,
-            order: int,
-            tau: torch.FloatTensor,
+        self,
+        model_output: torch.FloatTensor,
+        prev_timestep: int,
+        sample: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        order: int,
+        tau: torch.FloatTensor,
     ) -> torch.FloatTensor:
         """
         One step for the SA-Predictor.
@@ -608,35 +669,45 @@ def stochastic_adams_bashforth_update(
         x = sample
 
         if self.predict_x0:
-            if order == 2:  ## if order = 2 we do a modification that does not influence the convergence order similar to unipc. Note: This is used only for few steps sampling.
+            if (
+                order == 2
+            ):  ## if order = 2 we do a modification that does not influence the convergence order similar to unipc. Note: This is used only for few steps sampling.
                 # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
                 # ODE case
                 # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
                 # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
-                gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
-                            h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
-                                (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
-                    timestep_list[-2]])
-                gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
-                            h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
-                                (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
-                    timestep_list[-2]])
+                gradient_coefficients[0] += (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
+                    / (self.lambda_t[timestep_list[-1]] - self.lambda_t[timestep_list[-2]])
+                )
+                gradient_coefficients[1] -= (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
+                    / (self.lambda_t[timestep_list[-1]] - self.lambda_t[timestep_list[-2]])
+                )
 
         for i in range(order):
             if self.predict_x0:
-
-                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
-                    i] * model_output_list[-(i + 1)]
+                gradient_part += (
+                    (1 + tau**2)
+                    * sigma_t
+                    * torch.exp(-(tau**2) * lambda_t)
+                    * gradient_coefficients[i]
+                    * model_output_list[-(i + 1)]
+                )
             else:
-                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]
+                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]
 
         if self.predict_x0:
-            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * noise
         else:
             noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
 
         if self.predict_x0:
-            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
         else:
             x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
 
@@ -644,14 +715,14 @@ def stochastic_adams_bashforth_update(
         return x_t
 
     def stochastic_adams_moulton_update(
-            self,
-            this_model_output: torch.FloatTensor,
-            this_timestep: int,
-            last_sample: torch.FloatTensor,
-            last_noise: torch.FloatTensor,
-            this_sample: torch.FloatTensor,
-            order: int,
-            tau: torch.FloatTensor,
+        self,
+        this_model_output: torch.FloatTensor,
+        this_timestep: int,
+        last_sample: torch.FloatTensor,
+        last_noise: torch.FloatTensor,
+        this_sample: torch.FloatTensor,
+        order: int,
+        tau: torch.FloatTensor,
     ) -> torch.FloatTensor:
         """
         One step for the SA-Corrector.
@@ -694,32 +765,43 @@ def stochastic_adams_moulton_update(
         x = last_sample
 
         if self.predict_x0:
-            if order == 2:  ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC. Note: This is used only for few steps sampling.
+            if (
+                order == 2
+            ):  ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC. Note: This is used only for few steps sampling.
                 # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
                 # ODE case
                 # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
                 # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
-                gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
-                            h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
-                                (1 + tau ** 2) ** 2 * h))
-                gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
-                            h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
-                                (1 + tau ** 2) ** 2 * h))
+                gradient_coefficients[0] += (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
+                )
+                gradient_coefficients[1] -= (
+                    1.0
+                    * torch.exp((1 + tau**2) * lambda_t)
+                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
+                )
 
         for i in range(order):
             if self.predict_x0:
-                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
-                    i] * model_prev_list[-(i + 1)]
+                gradient_part += (
+                    (1 + tau**2)
+                    * sigma_t
+                    * torch.exp(-(tau**2) * lambda_t)
+                    * gradient_coefficients[i]
+                    * model_prev_list[-(i + 1)]
+                )
             else:
-                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
+                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
 
         if self.predict_x0:
-            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * last_noise
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * last_noise
         else:
             noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise
 
         if self.predict_x0:
-            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
         else:
             x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
 
@@ -727,12 +809,12 @@ def stochastic_adams_moulton_update(
         return x_t
 
     def step(
-            self,
-            model_output: torch.FloatTensor,
-            timestep: int,
-            sample: torch.FloatTensor,
-            generator=None,
-            return_dict: bool = True,
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        sample: torch.FloatTensor,
+        generator=None,
+        return_dict: bool = True,
     ) -> Union[SchedulerOutput, Tuple]:
         """
         Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
@@ -769,9 +851,7 @@ def step(
         else:
             step_index = step_index.item()
 
-        use_corrector = (
-                step_index > 0 and self.last_sample is not None
-        )
+        use_corrector = step_index > 0 and self.last_sample is not None
 
         model_output_convert = self.convert_model_output(model_output, timestep, sample)
 
diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index 75db94cfca76..fab477ad7d7e 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -68,7 +68,7 @@ def test_full_loop_no_noise(self):
             assert abs(result_sum.item() - 329.20001220703125) < 1e-2
             assert abs(result_mean.item() - 0.4286458492279053) < 1e-3
         else:
-            print('None')
+            print("None")
 
     def test_full_loop_with_v_prediction(self):
         scheduler_class = self.scheduler_classes[0]
@@ -163,4 +163,4 @@ def test_full_loop_device_karras_sigmas(self):
             assert abs(result_sum.item() - 840.1239624023438) < 1e-2
             assert abs(result_mean.item() - 1.0939114093780518) < 1e-2
         else:
-            print('None')
+            print("None")

From 4c56eac87e4d7947dce8265ea3cee66f5627936d Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 15 Jan 2024 10:56:15 +0800
Subject: [PATCH 12/20] modify for step_index

---
 .../schedulers/scheduling_sasolver.py         | 230 +++++++++++++-----
 1 file changed, 172 insertions(+), 58 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 811b39f9b6a5..14ce918c29c3 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -96,9 +96,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
         corrector_order (`int`, defaults to 2):
             The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided
             sampling, and `corrector_order=3` for unconditional sampling.
-        predictor_corrector_mode (`str`, defaults to `PEC`):
-            The predictor-corrector mode can be `PEC` or 'PECE'. It is recommended to use `PEC` mode for fast
-            sampling, and `PECE` for high-quality sampling (PECE needs around twice model evaluations as PEC).
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
@@ -151,7 +148,6 @@ def __init__(
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         predictor_order: int = 2,
         corrector_order: int = 2,
-        predictor_corrector_mode: str = "PEC",
         prediction_type: str = "epsilon",
         tau_func: Optional[Callable] = None,
         thresholding: bool = False,
@@ -184,6 +180,7 @@ def __init__(
         self.alpha_t = torch.sqrt(self.alphas_cumprod)
         self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
         self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
 
         # standard deviation of the initial noise distribution
         self.init_noise_sigma = 1.0
@@ -205,6 +202,15 @@ def __init__(
         self.predict_x0 = algorithm_type == "data_prediction"
         self.lower_order_nums = 0
         self.last_sample = None
+        self._step_index = None
+        self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increae 1 after each scheduler step.
+        """
+        return self._step_index
 
     def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
         """
@@ -247,27 +253,29 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
         sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
         if self.config.use_karras_sigmas:
             log_sigmas = np.log(sigmas)
+            sigmas = np.flip(sigmas).copy()
             sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
             timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
-            timesteps = np.flip(timesteps).copy().astype(np.int64)
+            sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
+        else:
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
 
         self.sigmas = torch.from_numpy(sigmas)
-
-        # when num_inference_steps == num_train_timesteps, we can end up with
-        # duplicates in timesteps.
-        _, unique_indices = np.unique(timesteps, return_index=True)
-        timesteps = timesteps[np.sort(unique_indices)]
-
-        self.timesteps = torch.from_numpy(timesteps).to(device)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
 
         self.num_inference_steps = len(timesteps)
-
         self.model_outputs = [
             None,
         ] * max(self.config.predictor_order, self.config.corrector_order - 1)
         self.lower_order_nums = 0
         self.last_sample = None
 
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
         """
@@ -325,6 +333,13 @@ def _sigma_to_t(self, sigma, log_sigmas):
         t = (1 - w) * low_idx + w * high_idx
         t = t.reshape(sigma.shape)
         return t
+    
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
+        sigma_t = sigma * alpha_t
+
+        return alpha_t, sigma_t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
@@ -353,16 +368,20 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps)
         return sigmas
 
     def convert_model_output(
-        self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
+        self,
+        model_output: torch.FloatTensor,
+        *args,
+        sample: torch.FloatTensor = None,
+        **kwargs,
     ) -> torch.FloatTensor:
         """
-        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
-        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is
+        designed to discretize an integral of the noise prediction model, and data_prediction is designed to discretize an
         integral of the data prediction model.
 
         <Tip>
 
-        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
+        The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both noise
         prediction and data prediction models.
 
         </Tip>
@@ -370,8 +389,6 @@ def convert_model_output(
         Args:
             model_output (`torch.FloatTensor`):
                 The direct output from the learned diffusion model.
-            timestep (`int`):
-                The current discrete timestep in the diffusion chain.
             sample (`torch.FloatTensor`):
                 A current instance of a sample created by the diffusion process.
 
@@ -379,19 +396,31 @@ def convert_model_output(
             `torch.FloatTensor`:
                 The converted model output.
         """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
 
+        sigma = self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
         # SA-Solver_data_prediction needs to solve an integral of the data prediction model.
         if self.config.algorithm_type in ["data_prediction"]:
             if self.config.prediction_type == "epsilon":
                 # SA-Solver only needs the "mean" output.
                 if self.config.variance_type in ["learned", "learned_range"]:
                     model_output = model_output[:, :3]
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 x0_pred = (sample - sigma_t * model_output) / alpha_t
             elif self.config.prediction_type == "sample":
                 x0_pred = model_output
             elif self.config.prediction_type == "v_prediction":
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 x0_pred = alpha_t * sample - sigma_t * model_output
             else:
                 raise ValueError(
@@ -413,10 +442,8 @@ def convert_model_output(
                 else:
                     epsilon = model_output
             elif self.config.prediction_type == "sample":
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 epsilon = (sample - alpha_t * model_output) / sigma_t
             elif self.config.prediction_type == "v_prediction":
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 epsilon = alpha_t * model_output + sigma_t * sample
             else:
                 raise ValueError(
@@ -626,11 +653,12 @@ def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list,
     def stochastic_adams_bashforth_update(
         self,
         model_output: torch.FloatTensor,
-        prev_timestep: int,
+        *args,
         sample: torch.FloatTensor,
         noise: torch.FloatTensor,
         order: int,
         tau: torch.FloatTensor,
+        **kwargs,
     ) -> torch.FloatTensor:
         """
         One step for the SA-Predictor.
@@ -649,20 +677,50 @@ def stochastic_adams_bashforth_update(
             `torch.FloatTensor`:
                 The sample tensor at the previous timestep.
         """
-
-        assert noise is not None
-        timestep_list = self.timestep_list
+        prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if noise is None:
+            if len(args) > 2:
+                noise = args[2]
+            else:
+                raise ValueError(" missing `noise` as a required keyward argument")       
+        if order is None:
+            if len(args) > 3:
+                order = args[3]
+            else:
+                raise ValueError(" missing `order` as a required keyward argument")
+        if tau is None:
+            if len(args) > 4:
+                tau = args[4]
+            else:
+                raise ValueError(" missing `tau` as a required keyward argument")
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
         model_output_list = self.model_outputs
-        s0, t = self.timestep_list[-1], prev_timestep
-        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
-        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
-        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        
         gradient_part = torch.zeros_like(sample)
         h = lambda_t - lambda_s0
         lambda_list = []
 
         for i in range(order):
-            lambda_list.append(self.lambda_t[timestep_list[-(i + 1)]])
+            si = self.step_index - i
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            lambda_list.append(lambda_si)
+
 
         gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
 
@@ -676,17 +734,21 @@ def stochastic_adams_bashforth_update(
                 # ODE case
                 # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
                 # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
+                temp_s = self.step_index - 1
+                temp_sigma = self.sigmas[self.step_index - 1]
+                temp_alpha_s, temp_sigma_s = self._sigma_to_alpha_sigma_t(temp_sigma)
+                temp_lambda_s = torch.log(temp_alpha_s) - torch.log(temp_sigma_s)
                 gradient_coefficients[0] += (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
                     * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
-                    / (self.lambda_t[timestep_list[-1]] - self.lambda_t[timestep_list[-2]])
+                    / (lambda_s0 - temp_lambda_s)
                 )
                 gradient_coefficients[1] -= (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
                     * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
-                    / (self.lambda_t[timestep_list[-1]] - self.lambda_t[timestep_list[-2]])
+                    / (lambda_s0 - temp_lambda_s)
                 )
 
         for i in range(order):
@@ -717,12 +779,13 @@ def stochastic_adams_bashforth_update(
     def stochastic_adams_moulton_update(
         self,
         this_model_output: torch.FloatTensor,
-        this_timestep: int,
+        *args,
         last_sample: torch.FloatTensor,
         last_noise: torch.FloatTensor,
         this_sample: torch.FloatTensor,
         order: int,
         tau: torch.FloatTensor,
+        **kwargs,
     ) -> torch.FloatTensor:
         """
         One step for the SA-Corrector.
@@ -744,19 +807,55 @@ def stochastic_adams_moulton_update(
                 The corrected sample tensor at the current timestep.
         """
 
-        assert last_noise is not None
-        timestep_list = self.timestep_list
+        this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
+        if last_sample is None:
+            if len(args) > 1:
+                last_sample = args[1]
+            else:
+                raise ValueError(" missing`last_sample` as a required keyward argument")
+        if last_noise is None:
+            if len(args) > 2:
+                last_noise = args[2]
+            else:
+                raise ValueError(" missing`last_noise` as a required keyward argument")
+        if this_sample is None:
+            if len(args) > 3:
+                this_sample = args[3]
+            else:
+                raise ValueError(" missing`this_sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 4:
+                order = args[4]
+            else:
+                raise ValueError(" missing`order` as a required keyward argument")
+        if tau is None:
+            if len(args) > 5:
+                tau = args[5]
+            else:
+                raise ValueError(" missing`tau` as a required keyward argument")
+        if this_timestep is not None:
+            deprecate(
+                "this_timestep",
+                "1.0.0",
+                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
         model_output_list = self.model_outputs
-        s0, t = self.timestep_list[-1], this_timestep
-        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
-        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
-        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
         gradient_part = torch.zeros_like(this_sample)
         h = lambda_t - lambda_s0
-        t_list = timestep_list + [this_timestep]
         lambda_list = []
         for i in range(order):
-            lambda_list.append(self.lambda_t[t_list[-(i + 1)]])
+            si = self.step_index - i
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            lambda_list.append(lambda_si)
+
 
         model_prev_list = model_output_list + [this_model_output]
 
@@ -808,6 +907,25 @@ def stochastic_adams_moulton_update(
         x_t = x_t.to(x.dtype)
         return x_t
 
+    def _init_step_index(self, timestep):
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+
+        index_candidates = (self.timesteps == timestep).nonzero()
+
+        if len(index_candidates) == 0:
+            step_index = len(self.timesteps) - 1
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        elif len(index_candidates) > 1:
+            step_index = index_candidates[1].item()
+        else:
+            step_index = index_candidates[0].item()
+
+        self._step_index = step_index
+
     def step(
         self,
         model_output: torch.FloatTensor,
@@ -843,23 +961,17 @@ def step(
                 "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
             )
 
-        if isinstance(timestep, torch.Tensor):
-            timestep = timestep.to(self.timesteps.device)
-        step_index = (self.timesteps == timestep).nonzero()
-        if len(step_index) == 0:
-            step_index = len(self.timesteps) - 1
-        else:
-            step_index = step_index.item()
-
-        use_corrector = step_index > 0 and self.last_sample is not None
+        if self.step_index is None:
+            self._init_step_index(timestep)
+            
+        use_corrector = self.step_index > 0 and self.last_sample is not None
 
-        model_output_convert = self.convert_model_output(model_output, timestep, sample)
+        model_output_convert = self.convert_model_output(model_output, sample=sample)
 
         if use_corrector:
             current_tau = self.tau_func(self.timestep_list[-1])
             sample = self.stochastic_adams_moulton_update(
                 this_model_output=model_output_convert,
-                this_timestep=timestep,
                 last_sample=self.last_sample,
                 last_noise=self.last_noise,
                 this_sample=sample,
@@ -867,7 +979,7 @@ def step(
                 tau=current_tau,
             )
 
-        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+
 
         for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
             self.model_outputs[i] = self.model_outputs[i + 1]
@@ -881,8 +993,8 @@ def step(
         )
 
         if self.config.lower_order_final:
-            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - step_index)
-            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - step_index + 1)
+            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - self.step_index)
+            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - self.step_index + 1)
         else:
             this_predictor_order = self.config.predictor_order
             this_corrector_order = self.config.corrector_order
@@ -898,7 +1010,6 @@ def step(
         current_tau = self.tau_func(self.timestep_list[-1])
         prev_sample = self.stochastic_adams_bashforth_update(
             model_output=model_output_convert,
-            prev_timestep=prev_timestep,
             sample=sample,
             noise=noise,
             order=self.this_predictor_order,
@@ -908,6 +1019,9 @@ def step(
         if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1):
             self.lower_order_nums += 1
 
+        # upon completion increase step index by one
+        self._step_index += 1
+
         if not return_dict:
             return (prev_sample,)
 

From d1b4dbaf4da8954a0a0eab8fae746f87d3fb3566 Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 15 Jan 2024 11:26:20 +0800
Subject: [PATCH 13/20] fix bugs

---
 src/diffusers/schedulers/scheduling_sasolver.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 14ce918c29c3..4e105f4c07e6 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -22,6 +22,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
 from ..utils.torch_utils import randn_tensor
 from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
 
@@ -333,7 +334,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
         t = (1 - w) * low_idx + w * high_idx
         t = t.reshape(sigma.shape)
         return t
-    
+
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
     def _sigma_to_alpha_sigma_t(self, sigma):
         alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
@@ -710,7 +711,7 @@ def stochastic_adams_bashforth_update(
         alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
         lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
         lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
-        
+
         gradient_part = torch.zeros_like(sample)
         h = lambda_t - lambda_s0
         lambda_list = []
@@ -734,7 +735,6 @@ def stochastic_adams_bashforth_update(
                 # ODE case
                 # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
                 # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
-                temp_s = self.step_index - 1
                 temp_sigma = self.sigmas[self.step_index - 1]
                 temp_alpha_s, temp_sigma_s = self._sigma_to_alpha_sigma_t(temp_sigma)
                 temp_lambda_s = torch.log(temp_alpha_s) - torch.log(temp_sigma_s)
@@ -963,7 +963,7 @@ def step(
 
         if self.step_index is None:
             self._init_step_index(timestep)
-            
+
         use_corrector = self.step_index > 0 and self.last_sample is not None
 
         model_output_convert = self.convert_model_output(model_output, sample=sample)

From b4eb69adcaa7fca5169e6b63700684e2593c4868 Mon Sep 17 00:00:00 2001
From: jschen <chenjunsong4@h-partners.com>
Date: Mon, 15 Jan 2024 11:49:17 +0800
Subject: [PATCH 14/20] assert results bug fixed

---
 .../schedulers/scheduling_sasolver.py         |  2 +-
 tests/schedulers/test_scheduler_sasolver.py   | 24 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 4e105f4c07e6..d81fb84f667e 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -688,7 +688,7 @@ def stochastic_adams_bashforth_update(
             if len(args) > 2:
                 noise = args[2]
             else:
-                raise ValueError(" missing `noise` as a required keyward argument")       
+                raise ValueError(" missing `noise` as a required keyward argument")
         if order is None:
             if len(args) > 3:
                 order = args[3]
diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index fab477ad7d7e..de949dbb778d 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -62,11 +62,11 @@ def test_full_loop_no_noise(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["cpu"]:
-            assert abs(result_sum.item() - 339.0479736328125) < 1e-2
-            assert abs(result_mean.item() - 0.4414687156677246) < 1e-3
+            assert abs(result_sum.item() - 328.8799133300781) < 1e-2
+            assert abs(result_mean.item() - 0.42822906374931335) < 1e-3
         elif torch_device in ["cuda"]:
-            assert abs(result_sum.item() - 329.20001220703125) < 1e-2
-            assert abs(result_mean.item() - 0.4286458492279053) < 1e-3
+            assert abs(result_sum.item() - 329.1999816894531) < 1e-2
+            assert abs(result_mean.item() - 0.4286458194255829) < 1e-3
         else:
             print("None")
 
@@ -94,11 +94,11 @@ def test_full_loop_with_v_prediction(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["cpu"]:
-            assert abs(result_sum.item() - 193.1468048095703) < 1e-2
-            assert abs(result_mean.item() - 0.2514932453632355) < 1e-3
+            assert abs(result_sum.item() - 193.1467742919922) < 1e-2
+            assert abs(result_mean.item() - 0.2514931857585907) < 1e-3
         elif torch_device in ["cuda"]:
-            assert abs(result_sum.item() - 193.41543579101562) < 1e-2
-            assert abs(result_mean.item() - 0.25184303522109985) < 1e-3
+            assert abs(result_sum.item() - 193.4154052734375) < 1e-2
+            assert abs(result_mean.item() - 0.2518429756164551) < 1e-3
         else:
             print("None")
 
@@ -157,10 +157,10 @@ def test_full_loop_device_karras_sigmas(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["cpu"]:
-            assert abs(result_sum.item() - 840.1239013671875) < 1e-2
-            assert abs(result_mean.item() - 1.0939112901687622) < 1e-2
+            assert abs(result_sum.item() - 837.2554931640625) < 1e-2
+            assert abs(result_mean.item() - 1.0901764631271362) < 1e-2
         elif torch_device in ["cuda"]:
-            assert abs(result_sum.item() - 840.1239624023438) < 1e-2
-            assert abs(result_mean.item() - 1.0939114093780518) < 1e-2
+            assert abs(result_sum.item() - 837.25537109375) < 1e-2
+            assert abs(result_mean.item() - 1.0901763439178467) < 1e-2
         else:
             print("None")

From 300f14fea52dedcd75e6daf8b512e7f6075b5cb4 Mon Sep 17 00:00:00 2001
From: jschen <chenjunsong4@h-partners.com>
Date: Mon, 15 Jan 2024 12:21:11 +0800
Subject: [PATCH 15/20] test_step_shape bug fixed.

---
 .../schedulers/scheduling_sasolver.py         |  2 --
 tests/schedulers/test_scheduler_sasolver.py   | 31 +++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index d81fb84f667e..2ff9d002d691 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -979,8 +979,6 @@ def step(
                 tau=current_tau,
             )
 
-
-
         for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
             self.model_outputs[i] = self.model_outputs[i + 1]
             self.timestep_list[i] = self.timestep_list[i + 1]
diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index de949dbb778d..1b8a3dc69ac2 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -23,6 +23,37 @@ def get_scheduler_config(self, **kwargs):
         config.update(**kwargs)
         return config
 
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = \
+                dummy_past_residuals[: max(scheduler.config.predictor_order, scheduler.config.corrector_order - 1)]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
     def test_timesteps(self):
         for timesteps in [10, 50, 100, 1000]:
             self.check_over_configs(num_train_timesteps=timesteps)

From 9a82bb7f87730dfbfe94871dbd3e0ca3a4fb2fc8 Mon Sep 17 00:00:00 2001
From: jschen <chenjunsong4@h-partners.com>
Date: Mon, 15 Jan 2024 12:37:19 +0800
Subject: [PATCH 16/20] assert results bug fixed

---
 tests/schedulers/test_scheduler_sasolver.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index 1b8a3dc69ac2..4e5b4a04e0b8 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -80,9 +80,10 @@ def test_full_loop_no_noise(self):
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
         sample = sample.to(torch_device)
+        generator = torch.manual_seed(0)
 
         for i, t in enumerate(scheduler.timesteps):
-            sample = scheduler.scale_model_input(sample, t)
+            sample = scheduler.scale_model_input(sample, t, generator=generator)
 
             model_output = model(sample, t)
 
@@ -93,8 +94,8 @@ def test_full_loop_no_noise(self):
         result_mean = torch.mean(torch.abs(sample))
 
         if torch_device in ["cpu"]:
-            assert abs(result_sum.item() - 328.8799133300781) < 1e-2
-            assert abs(result_mean.item() - 0.42822906374931335) < 1e-3
+            assert abs(result_sum.item() - 337.394287109375) < 1e-2
+            assert abs(result_mean.item() - 0.43931546807289124) < 1e-3
         elif torch_device in ["cuda"]:
             assert abs(result_sum.item() - 329.1999816894531) < 1e-2
             assert abs(result_mean.item() - 0.4286458194255829) < 1e-3

From 29cdbea0aa07e58ca620eb19dcf4000ffeadf03f Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 15 Jan 2024 12:37:21 +0800
Subject: [PATCH 17/20] make style

---
 .../schedulers/scheduling_sasolver.py         | 280 ++++++++++++++----
 tests/schedulers/test_scheduler_sasolver.py   |  24 +-
 2 files changed, 236 insertions(+), 68 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 2ff9d002d691..e09db049ce99 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -165,15 +165,27 @@ def __init__(
         if trained_betas is not None:
             self.betas = torch.tensor(trained_betas, dtype=torch.float32)
         elif beta_schedule == "linear":
-            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+            self.betas = torch.linspace(
+                beta_start, beta_end, num_train_timesteps, dtype=torch.float32
+            )
         elif beta_schedule == "scaled_linear":
             # this schedule is very specific to the latent diffusion model.
-            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            self.betas = (
+                torch.linspace(
+                    beta_start**0.5,
+                    beta_end**0.5,
+                    num_train_timesteps,
+                    dtype=torch.float32,
+                )
+                ** 2
+            )
         elif beta_schedule == "squaredcos_cap_v2":
             # Glide cosine schedule
             self.betas = betas_for_alpha_bar(num_train_timesteps)
         else:
-            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+            raise NotImplementedError(
+                f"{beta_schedule} does is not implemented for {self.__class__}"
+            )
 
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
@@ -187,11 +199,15 @@ def __init__(
         self.init_noise_sigma = 1.0
 
         if algorithm_type not in ["data_prediction", "noise_prediction"]:
-            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+            raise NotImplementedError(
+                f"{algorithm_type} does is not implemented for {self.__class__}"
+            )
 
         # setable values
         self.num_inference_steps = None
-        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        timesteps = np.linspace(
+            0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32
+        )[::-1].copy()
         self.timesteps = torch.from_numpy(timesteps)
         self.timestep_list = [None] * max(predictor_order, corrector_order - 1)
         self.model_outputs = [None] * max(predictor_order, corrector_order - 1)
@@ -213,7 +229,9 @@ def step_index(self):
         """
         return self._step_index
 
-    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+    def set_timesteps(
+        self, num_inference_steps: int = None, device: Union[str, torch.device] = None
+    ):
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -225,26 +243,38 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
         """
         # Clipping the minimum of all lambda(t) for numerical stability.
         # This is critical for cosine (squaredcos_cap_v2) noise schedule.
-        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
+        clipped_idx = torch.searchsorted(
+            torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped
+        )
         last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()
 
         # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
         if self.config.timestep_spacing == "linspace":
             timesteps = (
-                np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64)
+                np.linspace(0, last_timestep - 1, num_inference_steps + 1)
+                .round()[::-1][:-1]
+                .copy()
+                .astype(np.int64)
             )
 
         elif self.config.timestep_spacing == "leading":
             step_ratio = last_timestep // (num_inference_steps + 1)
             # creates integer timesteps by multiplying by ratio
             # casting to int to avoid issues when num_inference_step is power of 3
-            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
+            timesteps = (
+                (np.arange(0, num_inference_steps + 1) * step_ratio)
+                .round()[::-1][:-1]
+                .copy()
+                .astype(np.int64)
+            )
             timesteps += self.config.steps_offset
         elif self.config.timestep_spacing == "trailing":
             step_ratio = self.config.num_train_timesteps / num_inference_steps
             # creates integer timesteps by multiplying by ratio
             # casting to int to avoid issues when num_inference_step is power of 3
-            timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
+            timesteps = (
+                np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
+            )
             timesteps -= 1
         else:
             raise ValueError(
@@ -255,8 +285,12 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
         if self.config.use_karras_sigmas:
             log_sigmas = np.log(sigmas)
             sigmas = np.flip(sigmas).copy()
-            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
-            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
+            sigmas = self._convert_to_karras(
+                in_sigmas=sigmas, num_inference_steps=num_inference_steps
+            )
+            timesteps = np.array(
+                [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]
+            ).round()
             sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
         else:
             sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
@@ -264,7 +298,9 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
             sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
 
         self.sigmas = torch.from_numpy(sigmas)
-        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+        self.timesteps = torch.from_numpy(timesteps).to(
+            device=device, dtype=torch.int64
+        )
 
         self.num_inference_steps = len(timesteps)
         self.model_outputs = [
@@ -292,7 +328,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
         batch_size, channels, *remaining_dims = sample.shape
 
         if dtype not in (torch.float32, torch.float64):
-            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+            sample = (
+                sample.float()
+            )  # upcast for quantile calculation, and clamp not implemented for cpu half
 
         # Flatten sample for doing quantile calculation along each image
         sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
@@ -304,7 +342,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
             s, min=1, max=self.config.sample_max_value
         )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
         s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
-        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+        sample = (
+            torch.clamp(sample, -s, s) / s
+        )  # "we threshold xt0 to the range [-s, s] and then divide by s"
 
         sample = sample.reshape(batch_size, channels, *remaining_dims)
         sample = sample.to(dtype)
@@ -320,7 +360,11 @@ def _sigma_to_t(self, sigma, log_sigmas):
         dists = log_sigma - log_sigmas[:, np.newaxis]
 
         # get sigmas range
-        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
+        low_idx = (
+            np.cumsum((dists >= 0), axis=0)
+            .argmax(axis=0)
+            .clip(max=log_sigmas.shape[0] - 2)
+        )
         high_idx = low_idx + 1
 
         low = log_sigmas[low_idx]
@@ -343,7 +387,9 @@ def _sigma_to_alpha_sigma_t(self, sigma):
         return alpha_t, sigma_t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
-    def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
+    def _convert_to_karras(
+        self, in_sigmas: torch.FloatTensor, num_inference_steps
+    ) -> torch.FloatTensor:
         """Constructs the noise schedule of Karras et al. (2022)."""
 
         # Hack to make sure that other schedulers which copy this function don't break
@@ -460,21 +506,27 @@ def convert_model_output(
 
             return epsilon
 
-    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
+    def get_coefficients_exponential_negative(
+        self, order, interval_start, interval_end
+    ):
         """
         Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end
         """
         assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
 
         if order == 0:
-            return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
+            return torch.exp(-interval_end) * (
+                torch.exp(interval_end - interval_start) - 1
+            )
         elif order == 1:
             return torch.exp(-interval_end) * (
-                (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1)
+                (interval_start + 1) * torch.exp(interval_end - interval_start)
+                - (interval_end + 1)
             )
         elif order == 2:
             return torch.exp(-interval_end) * (
-                (interval_start**2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start)
+                (interval_start**2 + 2 * interval_start + 2)
+                * torch.exp(interval_end - interval_start)
                 - (interval_end**2 + 2 * interval_end + 2)
             )
         elif order == 3:
@@ -484,7 +536,9 @@ def get_coefficients_exponential_negative(self, order, interval_start, interval_
                 - (interval_end**3 + 3 * interval_end**2 + 6 * interval_end + 6)
             )
 
-    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
+    def get_coefficients_exponential_positive(
+        self, order, interval_start, interval_end, tau
+    ):
         """
         Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end
         """
@@ -496,14 +550,17 @@ def get_coefficients_exponential_positive(self, order, interval_start, interval_
 
         if order == 0:
             return (
-                torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (1 + tau**2)
+                torch.exp(interval_end_cov)
+                * (1 - torch.exp(-(interval_end_cov - interval_start_cov)))
+                / (1 + tau**2)
             )
         elif order == 1:
             return (
                 torch.exp(interval_end_cov)
                 * (
                     (interval_end_cov - 1)
-                    - (interval_start_cov - 1) * torch.exp(-(interval_end_cov - interval_start_cov))
+                    - (interval_start_cov - 1)
+                    * torch.exp(-(interval_end_cov - interval_start_cov))
                 )
                 / ((1 + tau**2) ** 2)
             )
@@ -521,8 +578,18 @@ def get_coefficients_exponential_positive(self, order, interval_start, interval_
             return (
                 torch.exp(interval_end_cov)
                 * (
-                    (interval_end_cov**3 - 3 * interval_end_cov**2 + 6 * interval_end_cov - 6)
-                    - (interval_start_cov**3 - 3 * interval_start_cov**2 + 6 * interval_start_cov - 6)
+                    (
+                        interval_end_cov**3
+                        - 3 * interval_end_cov**2
+                        + 6 * interval_end_cov
+                        - 6
+                    )
+                    - (
+                        interval_start_cov**3
+                        - 3 * interval_start_cov**2
+                        + 6 * interval_start_cov
+                        - 6
+                    )
                     * torch.exp(-(interval_end_cov - interval_start_cov))
                 )
                 / ((1 + tau**2) ** 4)
@@ -539,13 +606,25 @@ def lagrange_polynomial_coefficient(self, order, lambda_list):
             return [[1]]
         elif order == 1:
             return [
-                [1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])],
-                [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])],
+                [
+                    1 / (lambda_list[0] - lambda_list[1]),
+                    -lambda_list[1] / (lambda_list[0] - lambda_list[1]),
+                ],
+                [
+                    1 / (lambda_list[1] - lambda_list[0]),
+                    -lambda_list[0] / (lambda_list[1] - lambda_list[0]),
+                ],
             ]
         elif order == 2:
-            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
-            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
-            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
+            denominator1 = (lambda_list[0] - lambda_list[1]) * (
+                lambda_list[0] - lambda_list[2]
+            )
+            denominator2 = (lambda_list[1] - lambda_list[0]) * (
+                lambda_list[1] - lambda_list[2]
+            )
+            denominator3 = (lambda_list[2] - lambda_list[0]) * (
+                lambda_list[2] - lambda_list[1]
+            )
             return [
                 [
                     1 / denominator1,
@@ -631,24 +710,36 @@ def lagrange_polynomial_coefficient(self, order, lambda_list):
                 ],
             ]
 
-    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
+    def get_coefficients_fn(
+        self, order, interval_start, interval_end, lambda_list, tau
+    ):
         assert order in [1, 2, 3, 4]
-        assert order == len(lambda_list), "the length of lambda list must be equal to the order"
+        assert order == len(
+            lambda_list
+        ), "the length of lambda list must be equal to the order"
         coefficients = []
-        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
+        lagrange_coefficient = self.lagrange_polynomial_coefficient(
+            order - 1, lambda_list
+        )
         for i in range(order):
             coefficient = 0
             for j in range(order):
                 if self.predict_x0:
-                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_positive(
+                    coefficient += lagrange_coefficient[i][
+                        j
+                    ] * self.get_coefficients_exponential_positive(
                         order - 1 - j, interval_start, interval_end, tau
                     )
                 else:
-                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_negative(
+                    coefficient += lagrange_coefficient[i][
+                        j
+                    ] * self.get_coefficients_exponential_negative(
                         order - 1 - j, interval_start, interval_end
                     )
             coefficients.append(coefficient)
-        assert len(coefficients) == order, "the length of coefficients does not match the order"
+        assert (
+            len(coefficients) == order
+        ), "the length of coefficients does not match the order"
         return coefficients
 
     def stochastic_adams_bashforth_update(
@@ -706,7 +797,10 @@ def stochastic_adams_bashforth_update(
                 "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
             )
         model_output_list = self.model_outputs
-        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        sigma_t, sigma_s0 = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+        )
         alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
         alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
         lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
@@ -722,8 +816,9 @@ def stochastic_adams_bashforth_update(
             lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
             lambda_list.append(lambda_si)
 
-
-        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
+        gradient_coefficients = self.get_coefficients_fn(
+            order, lambda_s0, lambda_t, lambda_list, tau
+        )
 
         x = sample
 
@@ -741,13 +836,21 @@ def stochastic_adams_bashforth_update(
                 gradient_coefficients[0] += (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
+                    * (
+                        h**2 / 2
+                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
+                        / ((1 + tau**2) ** 2)
+                    )
                     / (lambda_s0 - temp_lambda_s)
                 )
                 gradient_coefficients[1] -= (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
+                    * (
+                        h**2 / 2
+                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
+                        / ((1 + tau**2) ** 2)
+                    )
                     / (lambda_s0 - temp_lambda_s)
                 )
 
@@ -761,7 +864,12 @@ def stochastic_adams_bashforth_update(
                     * model_output_list[-(i + 1)]
                 )
             else:
-                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]
+                gradient_part += (
+                    -(1 + tau**2)
+                    * alpha_t
+                    * gradient_coefficients[i]
+                    * model_output_list[-(i + 1)]
+                )
 
         if self.predict_x0:
             noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * noise
@@ -769,7 +877,11 @@ def stochastic_adams_bashforth_update(
             noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
 
         if self.predict_x0:
-            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+            x_t = (
+                torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x
+                + gradient_part
+                + noise_part
+            )
         else:
             x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
 
@@ -841,7 +953,10 @@ def stochastic_adams_moulton_update(
             )
 
         model_output_list = self.model_outputs
-        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1]
+        sigma_t, sigma_s0 = (
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],
+        )
         alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
         alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
 
@@ -856,10 +971,11 @@ def stochastic_adams_moulton_update(
             lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
             lambda_list.append(lambda_si)
 
-
         model_prev_list = model_output_list + [this_model_output]
 
-        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
+        gradient_coefficients = self.get_coefficients_fn(
+            order, lambda_s0, lambda_t, lambda_list, tau
+        )
 
         x = last_sample
 
@@ -874,12 +990,20 @@ def stochastic_adams_moulton_update(
                 gradient_coefficients[0] += (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
+                    * (
+                        h / 2
+                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
+                        / ((1 + tau**2) ** 2 * h)
+                    )
                 )
                 gradient_coefficients[1] -= (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
+                    * (
+                        h / 2
+                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
+                        / ((1 + tau**2) ** 2 * h)
+                    )
                 )
 
         for i in range(order):
@@ -892,15 +1016,26 @@ def stochastic_adams_moulton_update(
                     * model_prev_list[-(i + 1)]
                 )
             else:
-                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
+                gradient_part += (
+                    -(1 + tau**2)
+                    * alpha_t
+                    * gradient_coefficients[i]
+                    * model_prev_list[-(i + 1)]
+                )
 
         if self.predict_x0:
-            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * last_noise
+            noise_part = (
+                sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * last_noise
+            )
         else:
             noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise
 
         if self.predict_x0:
-            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
+            x_t = (
+                torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x
+                + gradient_part
+                + noise_part
+            )
         else:
             x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
 
@@ -979,7 +1114,9 @@ def step(
                 tau=current_tau,
             )
 
-        for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
+        for i in range(
+            max(self.config.predictor_order, self.config.corrector_order - 1) - 1
+        ):
             self.model_outputs[i] = self.model_outputs[i + 1]
             self.timestep_list[i] = self.timestep_list[i + 1]
 
@@ -987,18 +1124,29 @@ def step(
         self.timestep_list[-1] = timestep
 
         noise = randn_tensor(
-            model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
+            model_output.shape,
+            generator=generator,
+            device=model_output.device,
+            dtype=model_output.dtype,
         )
 
         if self.config.lower_order_final:
-            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - self.step_index)
-            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - self.step_index + 1)
+            this_predictor_order = min(
+                self.config.predictor_order, len(self.timesteps) - self.step_index
+            )
+            this_corrector_order = min(
+                self.config.corrector_order, len(self.timesteps) - self.step_index + 1
+            )
         else:
             this_predictor_order = self.config.predictor_order
             this_corrector_order = self.config.corrector_order
 
-        self.this_predictor_order = min(this_predictor_order, self.lower_order_nums + 1)  # warmup for multistep
-        self.this_corrector_order = min(this_corrector_order, self.lower_order_nums + 2)  # warmup for multistep
+        self.this_predictor_order = min(
+            this_predictor_order, self.lower_order_nums + 1
+        )  # warmup for multistep
+        self.this_corrector_order = min(
+            this_corrector_order, self.lower_order_nums + 2
+        )  # warmup for multistep
         assert self.this_predictor_order > 0
         assert self.this_corrector_order > 0
 
@@ -1014,7 +1162,9 @@ def step(
             tau=current_tau,
         )
 
-        if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1):
+        if self.lower_order_nums < max(
+            self.config.predictor_order, self.config.corrector_order - 1
+        ):
             self.lower_order_nums += 1
 
         # upon completion increase step index by one
@@ -1025,7 +1175,9 @@ def step(
 
         return SchedulerOutput(prev_sample=prev_sample)
 
-    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+    def scale_model_input(
+        self, sample: torch.FloatTensor, *args, **kwargs
+    ) -> torch.FloatTensor:
         """
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
@@ -1048,7 +1200,9 @@ def add_noise(
         timesteps: torch.IntTensor,
     ) -> torch.FloatTensor:
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
-        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        alphas_cumprod = self.alphas_cumprod.to(
+            device=original_samples.device, dtype=original_samples.dtype
+        )
         timesteps = timesteps.to(original_samples.device)
 
         sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
@@ -1061,7 +1215,9 @@ def add_noise(
         while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
             sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
 
-        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        noisy_samples = (
+            sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        )
         return noisy_samples
 
     def __len__(self):
diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index 1b8a3dc69ac2..32ab889d203f 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -37,19 +37,29 @@ def test_step_shape(self):
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
-            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+            elif num_inference_steps is not None and not hasattr(
+                scheduler, "set_timesteps"
+            ):
                 kwargs["num_inference_steps"] = num_inference_steps
 
             # copy over dummy past residuals (must be done after set_timesteps)
             dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
-            scheduler.model_outputs = \
-                dummy_past_residuals[: max(scheduler.config.predictor_order, scheduler.config.corrector_order - 1)]
+            scheduler.model_outputs = dummy_past_residuals[
+                : max(
+                    scheduler.config.predictor_order,
+                    scheduler.config.corrector_order - 1,
+                )
+            ]
 
             time_step_0 = scheduler.timesteps[5]
             time_step_1 = scheduler.timesteps[6]
 
-            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
-            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+            output_0 = scheduler.step(
+                residual, time_step_0, sample, **kwargs
+            ).prev_sample
+            output_1 = scheduler.step(
+                residual, time_step_1, sample, **kwargs
+            ).prev_sample
 
             self.assertEqual(output_0.shape, sample.shape)
             self.assertEqual(output_0.shape, output_1.shape)
@@ -59,7 +69,9 @@ def test_timesteps(self):
             self.check_over_configs(num_train_timesteps=timesteps)
 
     def test_betas(self):
-        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+        for beta_start, beta_end in zip(
+            [0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]
+        ):
             self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
 
     def test_schedules(self):

From 236b75d4934efafc19826a3de8c57484edfa58e3 Mon Sep 17 00:00:00 2001
From: scxue <xueshuchen17@mails.ucas.edu.cn>
Date: Mon, 15 Jan 2024 12:56:35 +0800
Subject: [PATCH 18/20] fix copy inconsistencies

---
 .../schedulers/scheduling_sasolver.py         | 26 +++++--------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index e09db049ce99..25f559fcd299 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -328,9 +328,7 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
         batch_size, channels, *remaining_dims = sample.shape
 
         if dtype not in (torch.float32, torch.float64):
-            sample = (
-                sample.float()
-            )  # upcast for quantile calculation, and clamp not implemented for cpu half
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
 
         # Flatten sample for doing quantile calculation along each image
         sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
@@ -342,9 +340,7 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
             s, min=1, max=self.config.sample_max_value
         )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
         s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
-        sample = (
-            torch.clamp(sample, -s, s) / s
-        )  # "we threshold xt0 to the range [-s, s] and then divide by s"
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
 
         sample = sample.reshape(batch_size, channels, *remaining_dims)
         sample = sample.to(dtype)
@@ -360,11 +356,7 @@ def _sigma_to_t(self, sigma, log_sigmas):
         dists = log_sigma - log_sigmas[:, np.newaxis]
 
         # get sigmas range
-        low_idx = (
-            np.cumsum((dists >= 0), axis=0)
-            .argmax(axis=0)
-            .clip(max=log_sigmas.shape[0] - 2)
-        )
+        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
         high_idx = low_idx + 1
 
         low = log_sigmas[low_idx]
@@ -387,9 +379,7 @@ def _sigma_to_alpha_sigma_t(self, sigma):
         return alpha_t, sigma_t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
-    def _convert_to_karras(
-        self, in_sigmas: torch.FloatTensor, num_inference_steps
-    ) -> torch.FloatTensor:
+    def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
         """Constructs the noise schedule of Karras et al. (2022)."""
 
         # Hack to make sure that other schedulers which copy this function don't break
@@ -1200,9 +1190,7 @@ def add_noise(
         timesteps: torch.IntTensor,
     ) -> torch.FloatTensor:
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
-        alphas_cumprod = self.alphas_cumprod.to(
-            device=original_samples.device, dtype=original_samples.dtype
-        )
+        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
         timesteps = timesteps.to(original_samples.device)
 
         sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
@@ -1215,9 +1203,7 @@ def add_noise(
         while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
             sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
 
-        noisy_samples = (
-            sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
-        )
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
         return noisy_samples
 
     def __len__(self):

From 15a7abd5065274c91d956933a155bfc386eab584 Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail,com>
Date: Sun, 21 Jan 2024 08:38:21 +0000
Subject: [PATCH 19/20] make style

---
 .../schedulers/scheduling_sasolver.py         | 217 ++++--------------
 tests/schedulers/test_scheduler_sasolver.py   |  16 +-
 2 files changed, 52 insertions(+), 181 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 25f559fcd299..e25178fe8eb2 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -165,9 +165,7 @@ def __init__(
         if trained_betas is not None:
             self.betas = torch.tensor(trained_betas, dtype=torch.float32)
         elif beta_schedule == "linear":
-            self.betas = torch.linspace(
-                beta_start, beta_end, num_train_timesteps, dtype=torch.float32
-            )
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
         elif beta_schedule == "scaled_linear":
             # this schedule is very specific to the latent diffusion model.
             self.betas = (
@@ -183,9 +181,7 @@ def __init__(
             # Glide cosine schedule
             self.betas = betas_for_alpha_bar(num_train_timesteps)
         else:
-            raise NotImplementedError(
-                f"{beta_schedule} does is not implemented for {self.__class__}"
-            )
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
 
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
@@ -199,15 +195,11 @@ def __init__(
         self.init_noise_sigma = 1.0
 
         if algorithm_type not in ["data_prediction", "noise_prediction"]:
-            raise NotImplementedError(
-                f"{algorithm_type} does is not implemented for {self.__class__}"
-            )
+            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
 
         # setable values
         self.num_inference_steps = None
-        timesteps = np.linspace(
-            0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32
-        )[::-1].copy()
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
         self.timesteps = torch.from_numpy(timesteps)
         self.timestep_list = [None] * max(predictor_order, corrector_order - 1)
         self.model_outputs = [None] * max(predictor_order, corrector_order - 1)
@@ -229,9 +221,7 @@ def step_index(self):
         """
         return self._step_index
 
-    def set_timesteps(
-        self, num_inference_steps: int = None, device: Union[str, torch.device] = None
-    ):
+    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -243,38 +233,26 @@ def set_timesteps(
         """
         # Clipping the minimum of all lambda(t) for numerical stability.
         # This is critical for cosine (squaredcos_cap_v2) noise schedule.
-        clipped_idx = torch.searchsorted(
-            torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped
-        )
+        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
         last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()
 
         # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
         if self.config.timestep_spacing == "linspace":
             timesteps = (
-                np.linspace(0, last_timestep - 1, num_inference_steps + 1)
-                .round()[::-1][:-1]
-                .copy()
-                .astype(np.int64)
+                np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64)
             )
 
         elif self.config.timestep_spacing == "leading":
             step_ratio = last_timestep // (num_inference_steps + 1)
             # creates integer timesteps by multiplying by ratio
             # casting to int to avoid issues when num_inference_step is power of 3
-            timesteps = (
-                (np.arange(0, num_inference_steps + 1) * step_ratio)
-                .round()[::-1][:-1]
-                .copy()
-                .astype(np.int64)
-            )
+            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
             timesteps += self.config.steps_offset
         elif self.config.timestep_spacing == "trailing":
             step_ratio = self.config.num_train_timesteps / num_inference_steps
             # creates integer timesteps by multiplying by ratio
             # casting to int to avoid issues when num_inference_step is power of 3
-            timesteps = (
-                np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
-            )
+            timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
             timesteps -= 1
         else:
             raise ValueError(
@@ -285,12 +263,8 @@ def set_timesteps(
         if self.config.use_karras_sigmas:
             log_sigmas = np.log(sigmas)
             sigmas = np.flip(sigmas).copy()
-            sigmas = self._convert_to_karras(
-                in_sigmas=sigmas, num_inference_steps=num_inference_steps
-            )
-            timesteps = np.array(
-                [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]
-            ).round()
+            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
+            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
             sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32)
         else:
             sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
@@ -298,9 +272,7 @@ def set_timesteps(
             sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
 
         self.sigmas = torch.from_numpy(sigmas)
-        self.timesteps = torch.from_numpy(timesteps).to(
-            device=device, dtype=torch.int64
-        )
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
 
         self.num_inference_steps = len(timesteps)
         self.model_outputs = [
@@ -496,27 +468,21 @@ def convert_model_output(
 
             return epsilon
 
-    def get_coefficients_exponential_negative(
-        self, order, interval_start, interval_end
-    ):
+    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
         """
         Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end
         """
         assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
 
         if order == 0:
-            return torch.exp(-interval_end) * (
-                torch.exp(interval_end - interval_start) - 1
-            )
+            return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
         elif order == 1:
             return torch.exp(-interval_end) * (
-                (interval_start + 1) * torch.exp(interval_end - interval_start)
-                - (interval_end + 1)
+                (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1)
             )
         elif order == 2:
             return torch.exp(-interval_end) * (
-                (interval_start**2 + 2 * interval_start + 2)
-                * torch.exp(interval_end - interval_start)
+                (interval_start**2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start)
                 - (interval_end**2 + 2 * interval_end + 2)
             )
         elif order == 3:
@@ -526,9 +492,7 @@ def get_coefficients_exponential_negative(
                 - (interval_end**3 + 3 * interval_end**2 + 6 * interval_end + 6)
             )
 
-    def get_coefficients_exponential_positive(
-        self, order, interval_start, interval_end, tau
-    ):
+    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
         """
         Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end
         """
@@ -540,17 +504,14 @@ def get_coefficients_exponential_positive(
 
         if order == 0:
             return (
-                torch.exp(interval_end_cov)
-                * (1 - torch.exp(-(interval_end_cov - interval_start_cov)))
-                / (1 + tau**2)
+                torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (1 + tau**2)
             )
         elif order == 1:
             return (
                 torch.exp(interval_end_cov)
                 * (
                     (interval_end_cov - 1)
-                    - (interval_start_cov - 1)
-                    * torch.exp(-(interval_end_cov - interval_start_cov))
+                    - (interval_start_cov - 1) * torch.exp(-(interval_end_cov - interval_start_cov))
                 )
                 / ((1 + tau**2) ** 2)
             )
@@ -568,18 +529,8 @@ def get_coefficients_exponential_positive(
             return (
                 torch.exp(interval_end_cov)
                 * (
-                    (
-                        interval_end_cov**3
-                        - 3 * interval_end_cov**2
-                        + 6 * interval_end_cov
-                        - 6
-                    )
-                    - (
-                        interval_start_cov**3
-                        - 3 * interval_start_cov**2
-                        + 6 * interval_start_cov
-                        - 6
-                    )
+                    (interval_end_cov**3 - 3 * interval_end_cov**2 + 6 * interval_end_cov - 6)
+                    - (interval_start_cov**3 - 3 * interval_start_cov**2 + 6 * interval_start_cov - 6)
                     * torch.exp(-(interval_end_cov - interval_start_cov))
                 )
                 / ((1 + tau**2) ** 4)
@@ -606,15 +557,9 @@ def lagrange_polynomial_coefficient(self, order, lambda_list):
                 ],
             ]
         elif order == 2:
-            denominator1 = (lambda_list[0] - lambda_list[1]) * (
-                lambda_list[0] - lambda_list[2]
-            )
-            denominator2 = (lambda_list[1] - lambda_list[0]) * (
-                lambda_list[1] - lambda_list[2]
-            )
-            denominator3 = (lambda_list[2] - lambda_list[0]) * (
-                lambda_list[2] - lambda_list[1]
-            )
+            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
+            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
+            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
             return [
                 [
                     1 / denominator1,
@@ -700,36 +645,24 @@ def lagrange_polynomial_coefficient(self, order, lambda_list):
                 ],
             ]
 
-    def get_coefficients_fn(
-        self, order, interval_start, interval_end, lambda_list, tau
-    ):
+    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
         assert order in [1, 2, 3, 4]
-        assert order == len(
-            lambda_list
-        ), "the length of lambda list must be equal to the order"
+        assert order == len(lambda_list), "the length of lambda list must be equal to the order"
         coefficients = []
-        lagrange_coefficient = self.lagrange_polynomial_coefficient(
-            order - 1, lambda_list
-        )
+        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
         for i in range(order):
             coefficient = 0
             for j in range(order):
                 if self.predict_x0:
-                    coefficient += lagrange_coefficient[i][
-                        j
-                    ] * self.get_coefficients_exponential_positive(
+                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_positive(
                         order - 1 - j, interval_start, interval_end, tau
                     )
                 else:
-                    coefficient += lagrange_coefficient[i][
-                        j
-                    ] * self.get_coefficients_exponential_negative(
+                    coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_negative(
                         order - 1 - j, interval_start, interval_end
                     )
             coefficients.append(coefficient)
-        assert (
-            len(coefficients) == order
-        ), "the length of coefficients does not match the order"
+        assert len(coefficients) == order, "the length of coefficients does not match the order"
         return coefficients
 
     def stochastic_adams_bashforth_update(
@@ -806,9 +739,7 @@ def stochastic_adams_bashforth_update(
             lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
             lambda_list.append(lambda_si)
 
-        gradient_coefficients = self.get_coefficients_fn(
-            order, lambda_s0, lambda_t, lambda_list, tau
-        )
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
 
         x = sample
 
@@ -826,21 +757,13 @@ def stochastic_adams_bashforth_update(
                 gradient_coefficients[0] += (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (
-                        h**2 / 2
-                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
-                        / ((1 + tau**2) ** 2)
-                    )
+                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
                     / (lambda_s0 - temp_lambda_s)
                 )
                 gradient_coefficients[1] -= (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (
-                        h**2 / 2
-                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
-                        / ((1 + tau**2) ** 2)
-                    )
+                    * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2))
                     / (lambda_s0 - temp_lambda_s)
                 )
 
@@ -854,12 +777,7 @@ def stochastic_adams_bashforth_update(
                     * model_output_list[-(i + 1)]
                 )
             else:
-                gradient_part += (
-                    -(1 + tau**2)
-                    * alpha_t
-                    * gradient_coefficients[i]
-                    * model_output_list[-(i + 1)]
-                )
+                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]
 
         if self.predict_x0:
             noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * noise
@@ -867,11 +785,7 @@ def stochastic_adams_bashforth_update(
             noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
 
         if self.predict_x0:
-            x_t = (
-                torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x
-                + gradient_part
-                + noise_part
-            )
+            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
         else:
             x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
 
@@ -963,9 +877,7 @@ def stochastic_adams_moulton_update(
 
         model_prev_list = model_output_list + [this_model_output]
 
-        gradient_coefficients = self.get_coefficients_fn(
-            order, lambda_s0, lambda_t, lambda_list, tau
-        )
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)
 
         x = last_sample
 
@@ -980,20 +892,12 @@ def stochastic_adams_moulton_update(
                 gradient_coefficients[0] += (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (
-                        h / 2
-                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
-                        / ((1 + tau**2) ** 2 * h)
-                    )
+                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
                 )
                 gradient_coefficients[1] -= (
                     1.0
                     * torch.exp((1 + tau**2) * lambda_t)
-                    * (
-                        h / 2
-                        - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h)))
-                        / ((1 + tau**2) ** 2 * h)
-                    )
+                    * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h))
                 )
 
         for i in range(order):
@@ -1006,26 +910,15 @@ def stochastic_adams_moulton_update(
                     * model_prev_list[-(i + 1)]
                 )
             else:
-                gradient_part += (
-                    -(1 + tau**2)
-                    * alpha_t
-                    * gradient_coefficients[i]
-                    * model_prev_list[-(i + 1)]
-                )
+                gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
 
         if self.predict_x0:
-            noise_part = (
-                sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * last_noise
-            )
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * last_noise
         else:
             noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise
 
         if self.predict_x0:
-            x_t = (
-                torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x
-                + gradient_part
-                + noise_part
-            )
+            x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
         else:
             x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part
 
@@ -1104,9 +997,7 @@ def step(
                 tau=current_tau,
             )
 
-        for i in range(
-            max(self.config.predictor_order, self.config.corrector_order - 1) - 1
-        ):
+        for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
             self.model_outputs[i] = self.model_outputs[i + 1]
             self.timestep_list[i] = self.timestep_list[i + 1]
 
@@ -1121,22 +1012,14 @@ def step(
         )
 
         if self.config.lower_order_final:
-            this_predictor_order = min(
-                self.config.predictor_order, len(self.timesteps) - self.step_index
-            )
-            this_corrector_order = min(
-                self.config.corrector_order, len(self.timesteps) - self.step_index + 1
-            )
+            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - self.step_index)
+            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - self.step_index + 1)
         else:
             this_predictor_order = self.config.predictor_order
             this_corrector_order = self.config.corrector_order
 
-        self.this_predictor_order = min(
-            this_predictor_order, self.lower_order_nums + 1
-        )  # warmup for multistep
-        self.this_corrector_order = min(
-            this_corrector_order, self.lower_order_nums + 2
-        )  # warmup for multistep
+        self.this_predictor_order = min(this_predictor_order, self.lower_order_nums + 1)  # warmup for multistep
+        self.this_corrector_order = min(this_corrector_order, self.lower_order_nums + 2)  # warmup for multistep
         assert self.this_predictor_order > 0
         assert self.this_corrector_order > 0
 
@@ -1152,9 +1035,7 @@ def step(
             tau=current_tau,
         )
 
-        if self.lower_order_nums < max(
-            self.config.predictor_order, self.config.corrector_order - 1
-        ):
+        if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1):
             self.lower_order_nums += 1
 
         # upon completion increase step index by one
@@ -1165,9 +1046,7 @@ def step(
 
         return SchedulerOutput(prev_sample=prev_sample)
 
-    def scale_model_input(
-        self, sample: torch.FloatTensor, *args, **kwargs
-    ) -> torch.FloatTensor:
+    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
         """
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
diff --git a/tests/schedulers/test_scheduler_sasolver.py b/tests/schedulers/test_scheduler_sasolver.py
index ff078e787d2d..574194632df0 100644
--- a/tests/schedulers/test_scheduler_sasolver.py
+++ b/tests/schedulers/test_scheduler_sasolver.py
@@ -37,9 +37,7 @@ def test_step_shape(self):
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
-            elif num_inference_steps is not None and not hasattr(
-                scheduler, "set_timesteps"
-            ):
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
             # copy over dummy past residuals (must be done after set_timesteps)
@@ -54,12 +52,8 @@ def test_step_shape(self):
             time_step_0 = scheduler.timesteps[5]
             time_step_1 = scheduler.timesteps[6]
 
-            output_0 = scheduler.step(
-                residual, time_step_0, sample, **kwargs
-            ).prev_sample
-            output_1 = scheduler.step(
-                residual, time_step_1, sample, **kwargs
-            ).prev_sample
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
 
             self.assertEqual(output_0.shape, sample.shape)
             self.assertEqual(output_0.shape, output_1.shape)
@@ -69,9 +63,7 @@ def test_timesteps(self):
             self.check_over_configs(num_train_timesteps=timesteps)
 
     def test_betas(self):
-        for beta_start, beta_end in zip(
-            [0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]
-        ):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
             self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
 
     def test_schedules(self):

From 98600e314dafbd54c1029be30ef783309765754c Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail,com>
Date: Sun, 21 Jan 2024 08:50:23 +0000
Subject: [PATCH 20/20] fix-copies

---
 src/diffusers/utils/dummy_pt_objects.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index d306a3575b1f..8f1442b522f8 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -990,6 +990,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class SASolverScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class SchedulerMixin(metaclass=DummyObject):
     _backends = ["torch"]