From 20e5be74d827f2225259f1c0e4e785cbcc98fb7c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 11 May 2023 21:14:30 +0000 Subject: [PATCH 001/119] refactor prior_transformer adding conversion script add pipeline add step_index from pipeline, + remove permute add zero pad token remove copy from statement for betas_for_alpha_bar function --- scripts/convert_shap_e_to_diffusers.py | 324 ++++++++++++++++++ src/diffusers/__init__.py | 1 + src/diffusers/models/prior_transformer.py | 106 +++++- src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/shap_e/__init__.py | 15 + .../pipelines/shap_e/pipeline_shap_e.py | 311 +++++++++++++++++ .../schedulers/scheduling_heun_discrete.py | 61 +++- .../dummy_torch_and_transformers_objects.py | 15 + 8 files changed, 804 insertions(+), 30 deletions(-) create mode 100644 scripts/convert_shap_e_to_diffusers.py create mode 100644 src/diffusers/pipelines/shap_e/__init__.py create mode 100644 src/diffusers/pipelines/shap_e/pipeline_shap_e.py diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py new file mode 100644 index 000000000000..4a159fc61f61 --- /dev/null +++ b/scripts/convert_shap_e_to_diffusers.py @@ -0,0 +1,324 @@ +import argparse +import tempfile + +import torch +from accelerate import load_checkpoint_and_dispatch + +from diffusers.models.prior_transformer import PriorTransformer + + +""" +Example - From the diffusers root directory: + +Download weights: +```sh +$ wget "https://openaipublic.azureedge.net/main/shap-e/text_cond.pt" +``` + +Convert the model: +```sh +$ python scripts/convert_shap_e_to_diffusers.py \ + --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \ + --dump_path /home/yiyi_huggingface_co/model_repo/shape \ + --debug prior +``` +""" + + +# prior + +PRIOR_ORIGINAL_PREFIX = "wrapped" + +# Uses default arguments +PRIOR_CONFIG = { + "num_attention_heads": 16, + "attention_head_dim": 1024 // 16, + "num_layers": 24, + "embedding_dim": 1024, + "num_embeddings": 1024, + "additional_embeddings": 0, + "act_fn": "gelu", + "time_embed_dim": 1024 * 4, + "clip_embedding_dim": 768, + "out_dim": 1024 * 2, + "has_pre_norm": True, + "has_encoder_hidden_states_proj": False, + "has_prd_embedding": False, + "has_post_process": False, +} + + +def prior_model_from_original_config(): + model = PriorTransformer(**PRIOR_CONFIG) + + return model + + +def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # .time_embed.c_fc -> .time_embedding.linear_1 + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_fc.weight"], + "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_fc.bias"], + } + ) + + # .time_embed.c_proj -> .time_embedding.linear_2 + diffusers_checkpoint.update( + { + "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_proj.weight"], + "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_proj.bias"], + } + ) + + # .clip_img_proj -> .proj_in + diffusers_checkpoint.update( + { + "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.input_proj.weight"], + "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.input_proj.bias"], + } + ) + + # .text_emb_proj -> .embedding_proj + diffusers_checkpoint.update( + { + "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_embed.weight"], + "embedding_proj.bias": 
checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_embed.bias"], + } + ) + + # .positional_embedding -> .positional_embedding + diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.pos_emb"][None, :]}) + + # .ln_pre -> .norm_in + diffusers_checkpoint.update( + { + "norm_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_pre.weight"], + "norm_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_pre.bias"], + } + ) + + # .resblocks. -> .transformer_blocks. + for idx in range(len(model.transformer_blocks)): + diffusers_transformer_prefix = f"transformer_blocks.{idx}" + original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.backbone.resblocks.{idx}" + + # .attn -> .attn1 + diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1" + original_attention_prefix = f"{original_transformer_prefix}.attn" + diffusers_checkpoint.update( + prior_attention_to_diffusers( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + original_attention_prefix=original_attention_prefix, + attention_head_dim=model.attention_head_dim, + ) + ) + + # .mlp -> .ff + diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff" + original_ff_prefix = f"{original_transformer_prefix}.mlp" + diffusers_checkpoint.update( + prior_ff_to_diffusers( + checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix + ) + ) + + # .ln_1 -> .norm1 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[ + f"{original_transformer_prefix}.ln_1.weight" + ], + f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"], + } + ) + + # .ln_2 -> .norm3 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[ + f"{original_transformer_prefix}.ln_2.weight" + ], + f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"], + } + ) + + # .final_ln -> .norm_out + diffusers_checkpoint.update( + { + "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_post.weight"], + "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_post.bias"], + } + ) + + # .out_proj -> .proj_to_clip_embeddings + diffusers_checkpoint.update( + { + "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.output_proj.weight"], + "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.output_proj.bias"], + } + ) + + return diffusers_checkpoint + + +def prior_attention_to_diffusers( + checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim +): + diffusers_checkpoint = {} + + # .c_qkv -> .{to_q, to_k, to_v} + [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions( + weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"], + bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"], + split=3, + chunk_size=attention_head_dim, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_q.weight": q_weight, + f"{diffusers_attention_prefix}.to_q.bias": q_bias, + f"{diffusers_attention_prefix}.to_k.weight": k_weight, + f"{diffusers_attention_prefix}.to_k.bias": k_bias, + f"{diffusers_attention_prefix}.to_v.weight": v_weight, + f"{diffusers_attention_prefix}.to_v.bias": v_bias, + } + ) + + # .c_proj -> .to_out.0 + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"], + f"{diffusers_attention_prefix}.to_out.0.bias": 
checkpoint[f"{original_attention_prefix}.c_proj.bias"], + } + ) + + return diffusers_checkpoint + + +def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix): + diffusers_checkpoint = { + # .c_fc -> .net.0.proj + f"{diffusers_ff_prefix}.net.{0}.proj.weight": checkpoint[f"{original_ff_prefix}.c_fc.weight"], + f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"], + # .c_proj -> .net.2 + f"{diffusers_ff_prefix}.net.{2}.weight": checkpoint[f"{original_ff_prefix}.c_proj.weight"], + f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"], + } + + return diffusers_checkpoint + + +# done prior + + +# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) +def split_attentions(*, weight, bias, split, chunk_size): + weights = [None] * split + biases = [None] * split + + weights_biases_idx = 0 + + for starting_row_index in range(0, weight.shape[0], chunk_size): + row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size) + + weight_rows = weight[row_indices, :] + bias_rows = bias[row_indices] + + if weights[weights_biases_idx] is None: + assert weights[weights_biases_idx] is None + weights[weights_biases_idx] = weight_rows + biases[weights_biases_idx] = bias_rows + else: + assert weights[weights_biases_idx] is not None + weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows]) + biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows]) + + weights_biases_idx = (weights_biases_idx + 1) % split + + return weights, biases + + +# done unet utils + + +# Driver functions + + +def prior(*, args, checkpoint_map_location): + print("loading prior") + + prior_checkpoint = torch.load(args.prior_checkpoint_path, map_location=checkpoint_map_location) + + prior_model = prior_model_from_original_config() + + prior_diffusers_checkpoint = prior_original_checkpoint_to_diffusers_checkpoint(prior_model, prior_checkpoint) + + del prior_checkpoint + + load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True) + + print("done loading prior") + + return prior_model + + +def load_checkpoint_to_model(checkpoint, model, strict=False): + with tempfile.NamedTemporaryFile() as file: + torch.save(checkpoint, file.name) + del checkpoint + if strict: + model.load_state_dict(torch.load(file.name), strict=True) + else: + load_checkpoint_and_dispatch(model, file.name, device_map="auto") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") + + parser.add_argument( + "--prior_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the prior checkpoint to convert.", + ) + + parser.add_argument( + "--checkpoint_load_device", + default="cpu", + type=str, + required=False, + help="The device passed to `map_location` when loading checkpoints.", + ) + + parser.add_argument( + "--debug", + default=None, + type=str, + required=False, + help="Only run a specific stage of the convert script. 
Used for debugging", + ) + + args = parser.parse_args() + + print(f"loading checkpoints to {args.checkpoint_load_device}") + + checkpoint_map_location = torch.device(args.checkpoint_load_device) + + if args.debug is not None: + print(f"debug: only executing {args.debug}") + + if args.debug is None: + print("YiYi TO-DO") + elif args.debug == "prior": + prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) + prior_model.save_pretrained(args.dump_path) + else: + raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 402f6eaa749a..359c058780aa 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -137,6 +137,7 @@ LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, + ShapEPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionControlNetImg2ImgPipeline, StableDiffusionControlNetInpaintPipeline, diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index b245612e6fc1..1d712ee6724c 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -1,3 +1,4 @@ +import math from dataclasses import dataclass from typing import Optional, Union @@ -58,6 +59,14 @@ def __init__( num_embeddings=77, additional_embeddings=4, dropout: float = 0.0, + act_fn: str = "silu", + has_pre_norm: bool = False, + has_encoder_hidden_states_proj: bool = True, + has_prd_embedding: bool = True, + has_post_process: bool = True, + time_embed_dim: Optional[int] = None, + clip_embedding_dim: Optional[int] = None, + out_dim: Optional[int] = None, ): super().__init__() self.num_attention_heads = num_attention_heads @@ -65,17 +74,33 @@ def __init__( inner_dim = num_attention_heads * attention_head_dim self.additional_embeddings = additional_embeddings + if time_embed_dim is None: + time_embed_dim = inner_dim + + if clip_embedding_dim is None: + clip_embedding_dim = embedding_dim + + if out_dim is None: + out_dim = embedding_dim + self.time_proj = Timesteps(inner_dim, True, 0) - self.time_embedding = TimestepEmbedding(inner_dim, inner_dim) + self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=act_fn) self.proj_in = nn.Linear(embedding_dim, inner_dim) - self.embedding_proj = nn.Linear(embedding_dim, inner_dim) - self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) + self.embedding_proj = nn.Linear(clip_embedding_dim, inner_dim) + + if has_encoder_hidden_states_proj: + self.encoder_hidden_states_proj = nn.Linear(clip_embedding_dim, inner_dim) + else: + self.encoder_hidden_states_proj = None self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim)) - self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) + if has_prd_embedding: + self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) + else: + self.prd_embedding = None self.transformer_blocks = nn.ModuleList( [ @@ -91,8 +116,14 @@ def __init__( ] ) + if has_pre_norm: + self.norm_in = nn.LayerNorm(inner_dim) + else: + self.norm_in = None + self.norm_out = nn.LayerNorm(inner_dim) - self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim) + + self.proj_to_clip_embeddings = nn.Linear(inner_dim, out_dim) causal_attention_mask = torch.full( [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0 @@ -100,16 +131,19 @@ def __init__( causal_attention_mask.triu_(1) causal_attention_mask 
= causal_attention_mask[None, ...] self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False) - - self.clip_mean = nn.Parameter(torch.zeros(1, embedding_dim)) - self.clip_std = nn.Parameter(torch.zeros(1, embedding_dim)) + if has_post_process: + self.clip_mean = nn.Parameter(torch.zeros(1, clip_embedding_dim)) + self.clip_std = nn.Parameter(torch.zeros(1, clip_embedding_dim)) + else: + self.clip_mean = None + self.clip_std = None def forward( self, hidden_states, timestep: Union[torch.Tensor, float, int], proj_embedding: torch.FloatTensor, - encoder_hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.BoolTensor] = None, return_dict: bool = True, ): @@ -152,23 +186,49 @@ def forward( timesteps_projected = timesteps_projected.to(dtype=self.dtype) time_embeddings = self.time_embedding(timesteps_projected) + # Rescale the features to have unit variance + # YiYi TO-DO: It was normalized before during encode_prompt step, move this step to pipeline + if self.clip_mean is None: + proj_embedding = math.sqrt(proj_embedding.shape[1]) * proj_embedding proj_embeddings = self.embedding_proj(proj_embedding) - encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) + if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None: + encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) + elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None: + raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set") + hidden_states = self.proj_in(hidden_states) - prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1) + positional_embeddings = self.positional_embedding.to(hidden_states.dtype) + tokens = [] + + if encoder_hidden_states is not None: + tokens.append(encoder_hidden_states) + + tokens = tokens + [ + proj_embeddings[:, None, :], + time_embeddings[:, None, :], + hidden_states[:, None, :] if len(hidden_states.shape) == 2 else hidden_states, + ] + + if self.prd_embedding is not None: + prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1) + tokens.append(prd_embedding) + hidden_states = torch.cat( - [ - encoder_hidden_states, - proj_embeddings[:, None, :], - time_embeddings[:, None, :], - hidden_states[:, None, :], - prd_embedding, - ], + tokens, dim=1, ) + # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional tokens + additional_embeddings = 2 + (encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 0) + if positional_embeddings.shape[1] < hidden_states.shape[1]: + positional_embeddings = F.pad( + positional_embeddings, + (0, 0, additional_embeddings, self.prd_embedding.shape[1] if self.prd_embedding is not None else 0), + value=0.0, + ) + hidden_states = hidden_states + positional_embeddings if attention_mask is not None: @@ -177,11 +237,19 @@ def forward( attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype) attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0) + if self.norm_in is not None: + hidden_states = self.norm_in(hidden_states) + for block in self.transformer_blocks: hidden_states = block(hidden_states, attention_mask=attention_mask) hidden_states = self.norm_out(hidden_states) - hidden_states = hidden_states[:, -1] + + if self.prd_embedding is 
not None: + hidden_states = hidden_states[:, -1] + else: + hidden_states = hidden_states[:, additional_embeddings:] + predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states) if not return_dict: diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 9e68538f233c..1fd0c505eca5 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -66,6 +66,7 @@ from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline + from .shap_e import ShapEPipeline from .stable_diffusion import ( CycleDiffusionPipeline, StableDiffusionAttendAndExcitePipeline, diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py new file mode 100644 index 000000000000..bc8c04d50a03 --- /dev/null +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -0,0 +1,15 @@ +from ...utils import ( + OptionalDependencyNotAvailable, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline +else: + from .pipeline_shap_e import ShapEPipeline diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py new file mode 100644 index 000000000000..182ca699bde9 --- /dev/null +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -0,0 +1,311 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer + +from ...models import PriorTransformer +from ...pipelines import DiffusionPipeline +from ...schedulers import HeunDiscreteScheduler +from ...utils import ( + BaseOutput, + is_accelerate_available, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + + ``` +""" + + +@dataclass +class ShapEPipelineOutput(BaseOutput): + """ + Output class for ShapEPipeline. + + Args: + images (`torch.FloatTensor`) + 3D latent representation + """ + + latents: Union[torch.FloatTensor, np.ndarray] + + +class ShapEPipeline(DiffusionPipeline): + """ + Pipeline for generating latent representation of a 3D asset with Shap.E + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`HeunDiscreteScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + """ + + def __init__( + self, + prior: PriorTransformer, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: HeunDiscreteScheduler, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + ) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + latents = latents * scheduler.init_noise_sigma + return latents + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + return self.device + for module in self.text_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + ): + len(prompt) if isinstance(prompt, list) else 1 + + # YiYi Notes: set pad_token_id to be 0, not sure why I can't set in the config file + self.tokenizer.pad_token_id = 0 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + prompt_embeds = text_encoder_output.text_embeds + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + # in Shap-E it normalize the prompt_embeds and then later rescale it, not sure why + # YiYi TO-DO: move rescale out of prior_transformer and apply it here + prompt_embeds = prompt_embeds / torch.linalg.norm(prompt_embeds, dim=-1, keepdim=True) + + if do_classifier_free_guidance: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + sigma_min: float = 1e-3, + sigma_max: float = 160.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`ShapEPipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) + # prior + + self.scheduler.set_timesteps( + num_inference_steps, device=device, sigma_min=sigma_min, sigma_max=sigma_max, use_karras_sigmas=True + ) + timesteps = self.scheduler.timesteps + + num_embeddings = self.prior.config.num_embeddings + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, num_embeddings * embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + # for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim + latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + noise_pred = self.prior( + scaled_model_input, + timestep=t, + proj_embedding=prompt_embeds, + ).predicted_image_embedding + + # remove the variance + noise_pred, _ = noise_pred.split( + scaled_model_input.shape[2], dim=2 + ) # batch_size, num_embeddings, embedding_dim + + # clip between -1 and 1 + noise_pred = noise_pred.clamp(-1, 1) + + if do_classifier_free_guidance is not None: + noise_pred_uncond, noise_pred = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) + + latents = self.scheduler.step( + noise_pred, + timestep=t, + sample=latents, + step_index=i, + ).prev_sample + + if output_type not in ["pt", "np"]: + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") + + if output_type == "np": + latents = latents.cpu().numpy() + + if not return_dict: + return latents + + return ShapEPipelineOutput(latents) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 100e2012ea20..93465b2d639c 100644 --- 
a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -22,8 +22,11 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput -# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_bar_fn=None, +) -> torch.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -44,11 +47,14 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor def alpha_bar(time_step): return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + if alpha_bar_fn is None: + alpha_bar_fn = alpha_bar + betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) @@ -106,6 +112,8 @@ def __init__( elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) + elif beta_schedule == "exp": + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_bar_fn=lambda t: math.exp(t * -12.0)) else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") @@ -152,6 +160,9 @@ def set_timesteps( num_inference_steps: int, device: Union[str, torch.device] = None, num_train_timesteps: Optional[int] = None, + sigma_min: Optional[float] = None, + sigma_max: Optional[float] = None, + use_karras_sigmas: Optional[bool] = None, # overwrite the self.config.use_karras_sigma ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
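+        If both `sigma_min` and `sigma_max` are passed, the sigma schedule is built directly from that
+        range (and converted to Karras sigmas when `use_karras_sigmas` is enabled) instead of being
+        derived from the trained betas.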
@@ -166,15 +177,25 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + if sigma_min is not None and sigma_max is not None: + sigmas = torch.tensor([sigma_max, sigma_min]) + + else: + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - log_sigmas = np.log(sigmas) - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - if self.use_karras_sigmas: + if use_karras_sigmas is None: + use_karras_sigmas = self.use_karras_sigmas + + if use_karras_sigmas: sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + if self.config.beta_schedule == "exp": + timesteps = np.array([self._sigma_to_t_yiyi(sigma) for sigma in sigmas]) + else: + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = torch.from_numpy(sigmas).to(device=device) @@ -220,6 +241,22 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t + # YiYi Notes: Taking from the origional repo, will refactor and not introduce dependency on spicy + def _sigma_to_t_yiyi(self, sigma): + alpha_cumprod = 1.0 / (sigma**2 + 1) + + if alpha_cumprod > self.alphas_cumprod[0]: + return 0 + elif alpha_cumprod <= self.alphas_cumprod[-1]: + return len(self.alphas_cumprod) - 1 + else: + from scipy import interpolate + + timestep = interpolate.interp1d(self.alphas_cumprod, np.arange(0, len(self.alphas_cumprod)))( + alpha_cumprod + ) # yiyi testing, origin implementation + return int(timestep) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: """Constructs the noise schedule of Karras et al. (2022).""" @@ -244,6 +281,7 @@ def step( timestep: Union[float, torch.FloatTensor], sample: Union[torch.FloatTensor, np.ndarray], return_dict: bool = True, + step_index: Optional[int] = None, ) -> Union[SchedulerOutput, Tuple]: """ Args: @@ -258,7 +296,8 @@ def step( [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
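+            If `step_index` is passed, it is used directly instead of being looked up from `timestep`
+            via `index_for_timestep`.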
""" - step_index = self.index_for_timestep(timestep) + if step_index is None: + step_index = self.index_for_timestep(timestep) if self.state_in_first_order: sigma = self.sigmas[step_index] @@ -284,7 +323,7 @@ def step( sample / (sigma_input**2 + 1) ) elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") + pred_original_sample = model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 95d07c081ccd..cc060b5572a3 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -227,6 +227,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class ShapEPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 9324a54c67bc79ab5b572467b7e9546c84726abd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 18 Jun 2023 19:45:29 +0000 Subject: [PATCH 002/119] add --- scripts/convert_shap_e_to_diffusers.py | 62 +++++++- src/diffusers/pipelines/shap_e/__init__.py | 2 + src/diffusers/pipelines/shap_e/camera.py | 142 ++++++++++++++++++ src/diffusers/pipelines/shap_e/params_proj.py | 96 ++++++++++++ src/diffusers/pipelines/shap_e/renderer.py | 103 +++++++++++++ 5 files changed, 402 insertions(+), 3 deletions(-) create mode 100644 src/diffusers/pipelines/shap_e/camera.py create mode 100644 src/diffusers/pipelines/shap_e/params_proj.py create mode 100644 src/diffusers/pipelines/shap_e/renderer.py diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 4a159fc61f61..b6f5780d962e 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -3,8 +3,10 @@ import torch from accelerate import load_checkpoint_and_dispatch +from collections import OrderedDict from diffusers.models.prior_transformer import PriorTransformer +from diffusers.pipelines.shap_e import ShapEParamsProjModel """ @@ -19,8 +21,9 @@ ```sh $ python scripts/convert_shap_e_to_diffusers.py \ --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \ - --dump_path /home/yiyi_huggingface_co/model_repo/shape \ - --debug prior + --params_proj_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\ + --dump_path /home/yiyi_huggingface_co/model_repo/shape/params_proj \ + --debug params_proj ``` """ @@ -216,6 +219,30 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix # done prior +# params_proj + +PARAMS_PROJ_ORIGINAL_PREFIX = "encoder.params_proj" + +PARAMS_PROJ_CONFIG = {} + +def params_proj_model_from_original_config(): + model = ShapEParamsProjModel(**PARAMS_PROJ_CONFIG) + + return model + + +def params_proj_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + + diffusers_checkpoint = { + k: 
checkpoint[f"{PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys() + } + + return diffusers_checkpoint + + +# done params_proj + + # TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) def split_attentions(*, weight, bias, split, chunk_size): weights = [None] * split @@ -267,6 +294,24 @@ def prior(*, args, checkpoint_map_location): return prior_model +def params_proj(*, args, checkpoint_map_location): + print("loading params_proj") + + params_proj_checkpoint = torch.load(args.params_proj_checkpoint_path, map_location=checkpoint_map_location) + + params_proj_model = params_proj_model_from_original_config() + + params_proj_diffusers_checkpoint = params_proj_original_checkpoint_to_diffusers_checkpoint(params_proj_model, params_proj_checkpoint) + + del params_proj_checkpoint + + load_checkpoint_to_model(params_proj_diffusers_checkpoint,params_proj_model, strict=True) + + print("done loading params_proj") + + return params_proj_model + + def load_checkpoint_to_model(checkpoint, model, strict=False): with tempfile.NamedTemporaryFile() as file: torch.save(checkpoint, file.name) @@ -286,7 +331,15 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): "--prior_checkpoint_path", default=None, type=str, - required=True, + required=False, + help="Path to the prior checkpoint to convert.", + ) + + parser.add_argument( + "--params_proj_checkpoint_path", + default=None, + type=str, + required=False, help="Path to the prior checkpoint to convert.", ) @@ -320,5 +373,8 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): elif args.debug == "prior": prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) prior_model.save_pretrained(args.dump_path) + elif args.debug == "params_proj": + params_proj_model = params_proj(args=args, checkpoint_map_location=checkpoint_map_location) + params_proj_model.save_pretrained(args.dump_path) else: raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index bc8c04d50a03..a9041801dce5 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -13,3 +13,5 @@ from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline else: from .pipeline_shap_e import ShapEPipeline + from .params_proj import ShapEParamsProjModel + from .renderer import MLPNeRSTFModel diff --git a/src/diffusers/pipelines/shap_e/camera.py b/src/diffusers/pipelines/shap_e/camera.py new file mode 100644 index 000000000000..db92c41e37cb --- /dev/null +++ b/src/diffusers/pipelines/shap_e/camera.py @@ -0,0 +1,142 @@ +from dataclasses import dataclass +from typing import Tuple + +import torch +import numpy as np + +@dataclass +class DifferentiableProjectiveCamera: + """ + Implements a batch, differentiable, standard pinhole camera + """ + + origin: torch.Tensor # [batch_size x 3] + x: torch.Tensor # [batch_size x 3] + y: torch.Tensor # [batch_size x 3] + z: torch.Tensor # [batch_size x 3] + width: int + height: int + x_fov: float + y_fov: float + + def __post_init__(self): + assert self.x.shape[0] == self.y.shape[0] == self.z.shape[0] == self.origin.shape[0] + assert self.x.shape[1] == self.y.shape[1] == self.z.shape[1] == self.origin.shape[1] == 3 + assert ( + len(self.x.shape) + == len(self.y.shape) + == len(self.z.shape) + == len(self.origin.shape) + == 2 + ) + + def resolution(self): + return torch.from_numpy(np.array([self.width, 
self.height], dtype=np.float32)) + + def fov(self): + return torch.from_numpy(np.array([self.x_fov, self.y_fov], dtype=np.float32)) + + def image_coords(self) -> torch.Tensor: + """ + :return: coords of shape (width * height, 2) + """ + pixel_indices = torch.arange(self.height * self.width) + coords = torch.stack( + [ + pixel_indices % self.width, + torch.div(pixel_indices, self.width, rounding_mode="trunc"), + ], + axis=1, + ) + return coords + + def camera_rays(self, coords: torch.Tensor) -> torch.Tensor: + batch_size, *shape, n_coords = coords.shape + assert n_coords == 2 + assert batch_size == self.origin.shape[0] + flat = coords.view(batch_size, -1, 2) + + res = self.resolution().to(flat.device) + fov = self.fov().to(flat.device) + + fracs = (flat.float() / (res - 1)) * 2 - 1 + fracs = fracs * torch.tan(fov / 2) + + fracs = fracs.view(batch_size, -1, 2) + directions = ( + self.z.view(batch_size, 1, 3) + + self.x.view(batch_size, 1, 3) * fracs[:, :, :1] + + self.y.view(batch_size, 1, 3) * fracs[:, :, 1:] + ) + directions = directions / directions.norm(dim=-1, keepdim=True) + rays = torch.stack( + [ + torch.broadcast_to( + self.origin.view(batch_size, 1, 3), [batch_size, directions.shape[1], 3] + ), + directions, + ], + dim=2, + ) + return rays.view(batch_size, *shape, 2, 3) + + def resize_image(self, width: int, height: int) -> "DifferentiableProjectiveCamera": + """ + Creates a new camera for the resized view assuming the aspect ratio does not change. + """ + assert width * self.height == height * self.width, "The aspect ratio should not change." + return DifferentiableProjectiveCamera( + origin=self.origin, + x=self.x, + y=self.y, + z=self.z, + width=width, + height=height, + x_fov=self.x_fov, + y_fov=self.y_fov, + ) + +@dataclass +class DifferentiableCameraBatch: + """ + Annotate a differentiable camera with a multi-dimensional batch shape. + """ + + shape: Tuple[int] + flat_camera: DifferentiableProjectiveCamera + +def create_pan_cameras(size: int, device: torch.device) -> DifferentiableCameraBatch: + origins = [] + xs = [] + ys = [] + zs = [] + for theta in np.linspace(0, 2 * np.pi, num=20): + z = np.array([np.sin(theta), np.cos(theta), -0.5]) + z /= np.sqrt(np.sum(z**2)) + origin = -z * 4 + x = np.array([np.cos(theta), -np.sin(theta), 0.0]) + y = np.cross(z, x) + origins.append(origin) + xs.append(x) + ys.append(y) + zs.append(z) + return DifferentiableCameraBatch( + shape=(1, len(xs)), + flat_camera=DifferentiableProjectiveCamera( + origin=torch.from_numpy(np.stack(origins, axis=0)).float().to(device), + x=torch.from_numpy(np.stack(xs, axis=0)).float().to(device), + y=torch.from_numpy(np.stack(ys, axis=0)).float().to(device), + z=torch.from_numpy(np.stack(zs, axis=0)).float().to(device), + width=size, + height=size, + x_fov=0.7, + y_fov=0.7, + ), + ) + +def get_image_coords(width, height) -> torch.Tensor: + pixel_indices = torch.arange(height * width) + # torch throws warnings for pixel_indices // width + pixel_indices_div = torch.div(pixel_indices, width, rounding_mode="trunc") + coords = torch.stack([pixel_indices % width, pixel_indices_div], dim=1) + return coords \ No newline at end of file diff --git a/src/diffusers/pipelines/shap_e/params_proj.py b/src/diffusers/pipelines/shap_e/params_proj.py new file mode 100644 index 000000000000..1910889bc63d --- /dev/null +++ b/src/diffusers/pipelines/shap_e/params_proj.py @@ -0,0 +1,96 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch import nn + +from typing import Tuple, Optional +from collections import OrderedDict + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin + +class ChannelsProj(nn.Module): + def __init__( + self, + *, + vectors: int, + channels: int, + d_latent: int, + ): + super().__init__() + self.proj = nn.Linear(d_latent, vectors * channels) + self.norm = nn.LayerNorm(channels) + self.d_latent = d_latent + self.vectors = vectors + self.channels = channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_bvd = x + w_vcd = self.proj.weight.view(self.vectors, self.channels, self.d_latent) + b_vc = self.proj.bias.view(1, self.vectors, self.channels) + h = torch.einsum("bvd,vcd->bvc", x_bvd, w_vcd) + h = self.norm(h) + + h = h + b_vc + return h + + +class ShapEParamsProjModel(ModelMixin, ConfigMixin): + """ + project the latent representation of a 3D asset to obtain weights of a multi-layer perceptron (MLP). + + For more details, see the original paper: + """ + @register_to_config + def __init__( + self, + *, + param_names: Tuple[str] = ( + "nerstf.mlp.0.weight", + "nerstf.mlp.1.weight", + "nerstf.mlp.2.weight", + "nerstf.mlp.3.weight", + ), + param_shapes: Tuple[Tuple[int]] = ((256, 93),(256, 256),(256, 256),(256, 256),), + d_latent: int = 1024, + ): + super().__init__() + + # check inputs + if len(param_names) != len(param_shapes): + raise ValueError( + f"Must provide same number of `param_names` as `param_shapes`" + ) + self.projections = nn.ModuleDict({}) + for k, (vectors, channels) in zip(param_names, param_shapes): + self.projections[_sanitize_name(k)] = ChannelsProj( + vectors=vectors, + channels=channels, + d_latent=d_latent, + ) + + def forward(self, x: torch.Tensor): + out = {} + start = 0 + for k, shape in zip(self.config.param_names, self.config.param_shapes): + vectors, _ = shape + end = start + vectors + x_bvd = x[:, start:end] + out[k] = self.projections[_sanitize_name(k)](x_bvd).reshape(len(x), *shape) + start = end + return out + +def _sanitize_name(x: str) -> str: + return x.replace(".", "__") \ No newline at end of file diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py new file mode 100644 index 000000000000..45530b1df55b --- /dev/null +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -0,0 +1,103 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
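+
+# Positional-encoding helpers and the NeRF/STF MLP used by the Shap-E renderer.
+# `posenc_nerf` follows NeRF: each input coordinate is scaled by 2**k for k in
+# [min_deg, max_deg) and the resulting sin/cos features are concatenated to x.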
+ +import torch +from torch import nn +import math + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin + +def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor: + """ + Concatenate x and its positional encodings, following NeRF. + + Reference: https://arxiv.org/pdf/2210.04628.pdf + """ + if min_deg == max_deg: + return x + scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype) + *shape, dim = x.shape + xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1) + assert xb.shape[-1] == dim * (max_deg - min_deg) + emb = torch.cat([xb, xb + math.pi / 2.0], axis=-1).sin() + return torch.cat([x, emb], dim=-1) + +def encode_position(position): + + return posenc_nerf(position, min_deg=0, max_deg=15) + +def encode_direction(position, direction=None): + if direction is None: + return torch.zeros_like(posenc_nerf(position, min_deg=0, max_deg=8)) + else: + return posenc_nerf(direction, min_deg=0, max_deg=8) + +class MLPNeRSTFModel(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + d_hidden: int = 256, + n_output: int = 12, + n_hidden_layers: int = 6, + act_fn: str = "swish", + insert_direction_at: int = 4, + + ): + super().__init__() + # Instantiate the MLP + + # Find out the dimension of encoded position and direction + dummy = torch.eye(1, 3) + d_posenc_pos = encode_position(position=dummy).shape[-1] + d_posenc_dir = encode_direction(position=dummy).shape[-1] + + mlp_widths = [d_hidden] * n_hidden_layers + input_widths = [d_posenc_pos] + mlp_widths + output_widths = mlp_widths + [n_output] + + if insert_direction_at is not None: + input_widths[insert_direction_at] += d_posenc_dir + + self.mlp = nn.ModuleList( + [ + nn.Linear(d_in, d_out) for d_in, d_out in zip(input_widths, output_widths) + ] + ) + + if act_fn == "swish": + self.activation = lambda x: torch.sigmoid(x) + else: + raise ValueError(f"Unsupported activation function {act_fn}") + + + def forward(self, *, positions, directions): + + h = encode_position(position) + h_preact = h + h_directionless = None + for i, layer in enumerate(self.mlp): + if i == self.config.insert_direction_at: # 4 in the config + h_directionless = h_preact + h_direction = encode_direction(position, direction=direction) + h = torch.cat([h, h_direction], dim=-1) + + h = layer(h) + h_preact = h + if i < len(self.mlp) - 1: + h = self.activation(h) + h_final = h + if h_directionless is None: + h_directionless = h_preact + return h_final, h_directionless \ No newline at end of file From 303eeb3ac94e43b9a9a715d567f016d1a305f40d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 20 Jun 2023 09:46:47 +0000 Subject: [PATCH 003/119] add --- src/diffusers/pipelines/shap_e/__init__.py | 3 +- src/diffusers/pipelines/shap_e/camera.py | 9 +- src/diffusers/pipelines/shap_e/renderer.py | 299 ++++++++++++++++++++- 3 files changed, 296 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index a9041801dce5..1a615d351db2 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -14,4 +14,5 @@ else: from .pipeline_shap_e import ShapEPipeline from .params_proj import ShapEParamsProjModel - from .renderer import MLPNeRSTFModel + from .renderer import MLPNeRSTFModel, MLPNeRFModelOutput, VoidNeRFModel, BoundingBoxVolume, StratifiedRaySampler + from .camera import create_pan_cameras diff --git a/src/diffusers/pipelines/shap_e/camera.py 
b/src/diffusers/pipelines/shap_e/camera.py index db92c41e37cb..080cf61fde31 100644 --- a/src/diffusers/pipelines/shap_e/camera.py +++ b/src/diffusers/pipelines/shap_e/camera.py @@ -132,11 +132,4 @@ def create_pan_cameras(size: int, device: torch.device) -> DifferentiableCameraB x_fov=0.7, y_fov=0.7, ), - ) - -def get_image_coords(width, height) -> torch.Tensor: - pixel_indices = torch.arange(height * width) - # torch throws warnings for pixel_indices // width - pixel_indices_div = torch.div(pixel_indices, width, rounding_mode="trunc") - coords = torch.stack([pixel_indices % width, pixel_indices_div], dim=1) - return coords \ No newline at end of file + ) \ No newline at end of file diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 45530b1df55b..53348795f5a8 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -12,12 +12,214 @@ # See the License for the specific language governing permissions and # limitations under the License. +from dataclasses import dataclass + import torch from torch import nn import math +import torch.nn.functional as F + +import numpy as np from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin +from ...utils import BaseOutput + +from typing import Optional, Dict + + +class VoidNeRFModel(nn.Module): + """ + Implements the default empty space model where all queries are rendered as + background. + """ + + def __init__(self, background, channel_scale= 255.0): + super().__init__() + background = nn.Parameter( + torch.from_numpy(np.array(background)).to(dtype=torch.float32) + / channel_scale + ) + + self.register_buffer("background", background) + + def forward(self, position): + background = self.background[None].to(position.device) + + shape = position.shape[:-1] + ones = [1] * (len(shape) - 1) + n_channels = background.shape[-1] + background = torch.broadcast_to( + background.view(background.shape[0], *ones, n_channels), [*shape, n_channels] + ) + + return background + +@dataclass +class VolumeRange: + t0: torch.Tensor + t1: torch.Tensor + intersected: torch.Tensor + + def __post_init__(self): + assert self.t0.shape == self.t1.shape == self.intersected.shape + + def partition(self, ts): + """ + Partitions t0 and t1 into n_samples intervals. + + :param ts: [batch_size, *shape, n_samples, 1] + :return: a tuple of ( + lower: [batch_size, *shape, n_samples, 1] + upper: [batch_size, *shape, n_samples, 1] + delta: [batch_size, *shape, n_samples, 1] + ) where + + ts \\in [lower, upper] + deltas = upper - lower + """ + #print(" ") + #print(f" inside BoundingBoxVolume.partition:") + #print(f" - ts: {ts.shape}, {ts.abs().sum()}") + mids = (ts[..., 1:, :] + ts[..., :-1, :]) * 0.5 + #print(f" - mids: {mids.shape}, {mids.abs().sum()}") + lower = torch.cat([self.t0[..., None, :], mids], dim=-2) + #print(f" -t0: {self.t0.shape}, {self.t0.abs().sum()}") + upper = torch.cat([mids, self.t1[..., None, :]], dim=-2) + #print(f" -upper: {upper.shape}, {upper.abs().sum()}") + delta = upper - lower + assert lower.shape == upper.shape == delta.shape == ts.shape + return lower, upper, delta + +class BoundingBoxVolume(nn.Module): + """ + Axis-aligned bounding box defined by the two opposite corners. 
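+    Rays are tested against the box via `intersect`, which returns the per-ray `[t0, t1]` range
+    within which points are sampled.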
+ """ + + def __init__( + self, + *, + bbox_min, + bbox_max, + min_dist: float = 0.0, + min_t_range: float = 1e-3, + ): + """ + :param bbox_min: the left/bottommost corner of the bounding box + :param bbox_max: the other corner of the bounding box + :param min_dist: all rays should start at least this distance away from the origin. + """ + super().__init__() + + self.bbox_min = torch.tensor(bbox_min) + self.bbox_max = torch.tensor(bbox_max) + self.min_dist = min_dist + self.min_t_range = min_t_range + self.bbox = torch.stack([self.bbox_min, self.bbox_max]) + assert self.bbox.shape == (2, 3) + assert self.min_dist >= 0.0 + assert self.min_t_range > 0.0 + + def intersect( + self, + origin: torch.Tensor, + direction: torch.Tensor, + t0_lower: Optional[torch.Tensor] = None, + epsilon=1e-6, + ): + """ + :param origin: [batch_size, *shape, 3] + :param direction: [batch_size, *shape, 3] + :param t0_lower: Optional [batch_size, *shape, 1] lower bound of t0 when intersecting this volume. + :param params: Optional meta parameters in case Volume is parametric + :param epsilon: to stabilize calculations + + :return: A tuple of (t0, t1, intersected) where each has a shape + [batch_size, *shape, 1]. If a ray intersects with the volume, `o + td` is + in the volume for all t in [t0, t1]. If the volume is bounded, t1 is guaranteed + to be on the boundary of the volume. + """ + + batch_size, *shape, _ = origin.shape + ones = [1] * len(shape) + bbox = self.bbox.view(1, *ones, 2, 3).to(origin.device) + + def _safe_divide(a, b, epsilon=1e-6): + return a / torch.where(b < 0, b - epsilon, b + epsilon) + + ts = _safe_divide(bbox - origin[..., None, :], direction[..., None, :], epsilon=epsilon) + + # Cases to think about: + # + # 1. t1 <= t0: the ray does not pass through the AABB. + # 2. t0 < t1 <= 0: the ray intersects but the BB is behind the origin. + # 3. t0 <= 0 <= t1: the ray starts from inside the BB + # 4. 0 <= t0 < t1: the ray is not inside and intersects with the BB twice. + # + # 1 and 4 are clearly handled from t0 < t1 below. + # Making t0 at least min_dist (>= 0) takes care of 2 and 3. + t0 = ts.min(dim=-2).values.max(dim=-1, keepdim=True).values.clamp(self.min_dist) + t1 = ts.max(dim=-2).values.min(dim=-1, keepdim=True).values + assert t0.shape == t1.shape == (batch_size, *shape, 1) + if t0_lower is not None: + assert t0.shape == t0_lower.shape + t0 = torch.maximum(t0, t0_lower) + + intersected = t0 + self.min_t_range < t1 + t0 = torch.where(intersected, t0, torch.zeros_like(t0)) + t1 = torch.where(intersected, t1, torch.ones_like(t1)) + + return VolumeRange(t0=t0, t1=t1, intersected=intersected) + +class StratifiedRaySampler(nn.Module): + """ + Instead of fixed intervals, a sample is drawn uniformly at random from each + interval. + """ + + def __init__(self, depth_mode: str = "linear"): + """ + :param depth_mode: linear samples ts linearly in depth. harmonic ensures + closer points are sampled more densely. 
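+            geometric interpolates between t0 and t1 in log space.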
+ """ + self.depth_mode = depth_mode + assert self.depth_mode in ("linear", "geometric", "harmonic") + + def sample( + self, + t0: torch.Tensor, + t1: torch.Tensor, + n_samples: int, + epsilon: float = 1e-3, + ) -> torch.Tensor: + """ + :param t0: start time has shape [batch_size, *shape, 1] + :param t1: finish time has shape [batch_size, *shape, 1] + :param n_samples: number of ts to sample + :return: sampled ts of shape [batch_size, *shape, n_samples, 1] + """ + ones = [1] * (len(t0.shape) - 1) + ts = torch.linspace(0, 1, n_samples).view(*ones, n_samples).to(t0.dtype).to(t0.device) + + if self.depth_mode == "linear": + ts = t0 * (1.0 - ts) + t1 * ts + elif self.depth_mode == "geometric": + ts = (t0.clamp(epsilon).log() * (1.0 - ts) + t1.clamp(epsilon).log() * ts).exp() + elif self.depth_mode == "harmonic": + # The original NeRF recommends this interpolation scheme for + # spherical scenes, but there could be some weird edge cases when + # the observer crosses from the inner to outer volume. + ts = 1.0 / (1.0 / t0.clamp(epsilon) * (1.0 - ts) + 1.0 / t1.clamp(epsilon) * ts) + + mids = 0.5 * (ts[..., 1:] + ts[..., :-1]) + upper = torch.cat([mids, t1], dim=-1) + lower = torch.cat([t0, mids], dim=-1) + torch.manual_seed(0) # yiyi notes: add a random seed here + t_rand = torch.rand_like(ts) + + ts = lower + (upper - lower) * t_rand + return ts.unsqueeze(-1) + def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor: """ @@ -27,7 +229,10 @@ def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.T """ if min_deg == max_deg: return x - scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype) + print(" ") + print(f" inside posenc_nerf") + print(f" - x.device {x.device}, x.dtype: {x.dtype}") + scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype, device=x.device) *shape, dim = x.shape xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1) assert xb.shape[-1] == dim * (max_deg - min_deg) @@ -44,6 +249,18 @@ def encode_direction(position, direction=None): else: return posenc_nerf(direction, min_deg=0, max_deg=8) +def swish(x): + return x * torch.sigmoid(x) + +@dataclass +class MLPNeRFModelOutput(BaseOutput): + + density: torch.Tensor + signed_distance: torch.Tensor + channels: torch.Tensor + ts: torch.Tensor + + class MLPNeRSTFModel(ModelMixin, ConfigMixin): @register_to_config def __init__( @@ -54,8 +271,10 @@ def __init__( act_fn: str = "swish", insert_direction_at: int = 4, + ): super().__init__() + # Instantiate the MLP # Find out the dimension of encoded position and direction @@ -77,27 +296,95 @@ def __init__( ) if act_fn == "swish": - self.activation = lambda x: torch.sigmoid(x) + #self.activation = swish + # yiyi testing: + self.activation = lambda x: F.silu(x) else: raise ValueError(f"Unsupported activation function {act_fn}") - - def forward(self, *, positions, directions): + self.sdf_activation = torch.tanh + self.density_activation = torch.nn.functional.relu + self.channel_activation = torch.sigmoid + + def map_indices_to_keys(self, output): + + h_map = { + "sdf": (0, 1), + "density_coarse": (1, 2), + "density_fine":(2, 3), + "stf": (3, 6), + "nerf_coarse": (6, 9), + "nerf_fine" : (9, 12) } + + mapped_output = {k: output[..., start:end] for k, (start, end) in h_map.items()} + + return mapped_output + + + def forward(self, *, position, direction, ts, nerf_level = "coarse"): + print(" ") + print(f" model inputs:") + print(f" - position: {position.shape}, {position.abs().sum()}") + print(f" - direction: 
{direction}") + h = encode_position(position) + print(f" position after encode -> h: {h.shape}, {h.abs().sum()}") h_preact = h h_directionless = None for i, layer in enumerate(self.mlp): + print(f" ") + print(f" ***** layer {i}") if i == self.config.insert_direction_at: # 4 in the config + print(" insert direction") h_directionless = h_preact h_direction = encode_direction(position, direction=direction) h = torch.cat([h, h_direction], dim=-1) - + print(f" -> h with direction: {h.shape}, {h.abs().sum()}") + #batch_size, *shape, d_in = h.shape + #h = h.view(batch_size, -1, d_in) + print(f" h: {h.shape}, {h.abs().sum()}") + #print(h[0,0,:]) + print(f" weight: {layer.weight.shape}, {layer.weight.abs().sum()}") + #print(layer.weight[0,:]) + #print(f" bias: {layer.bias.shape}, {layer.bias.abs().sum()}") h = layer(h) + #print(f" -> layer -> {h.shape}, {h.abs().sum()}") + #print(h[0,0,0]) + h_preact = h if i < len(self.mlp) - 1: + print(self.activation) h = self.activation(h) + print(f" -> act -> {h.shape}, {h.abs().sum()}") h_final = h if h_directionless is None: h_directionless = h_preact - return h_final, h_directionless \ No newline at end of file + print(" ") + print(" ***************************") + print(" out:") + print(f" - h_final:{h_final.shape},{h_final.abs().sum()}") + print(f" - h_directionless: {h_directionless.shape}, {h_directionless.abs().sum()}") + print(" ***************************") + print(" ") + + activation = self.map_indices_to_keys(h_final) + + if nerf_level == "coarse": + h_density = activation['density_coarse'] + h_channels = activation['nerf_coarse'] + else: + h_density = activation['density_fine'] + h_channels = activation['nerf_fine'] + + density=self.density_activation(h_density) + signed_distance=self.sdf_activation(activation['sdf']) + channels=self.channel_activation(h_channels) + print(" model out /raw !!" 
) + print(f" density: {density.shape}, {density.abs().sum()}") + print(f" signed_distance: {signed_distance.shape}, {signed_distance.abs().sum()}") + print(f" channels: {channels.shape}, {channels.abs().sum()}") + print(f" ts: {ts.shape}, {ts.abs().sum()}") + + # yiyi notes: I think signed_distance is not used + return MLPNeRFModelOutput(density = density, signed_distance= signed_distance, channels=channels, ts=ts) \ No newline at end of file From a101a17c56f66a5cbc9c258c4c073d48bd1d8932 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 20 Jun 2023 20:38:24 +0000 Subject: [PATCH 004/119] update conversion script for renderer model --- scripts/convert_shap_e_to_diffusers.py | 56 ++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index b6f5780d962e..79a58cc03f45 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -6,7 +6,7 @@ from collections import OrderedDict from diffusers.models.prior_transformer import PriorTransformer -from diffusers.pipelines.shap_e import ShapEParamsProjModel +from diffusers.pipelines.shap_e import ShapEParamsProjModel, MLPNeRSTFModel """ @@ -21,9 +21,9 @@ ```sh $ python scripts/convert_shap_e_to_diffusers.py \ --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \ - --params_proj_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\ - --dump_path /home/yiyi_huggingface_co/model_repo/shape/params_proj \ - --debug params_proj + --transmitter_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\ + --dump_path /home/yiyi_huggingface_co/model_repo/shape/renderer \ + --debug renderer ``` """ @@ -243,6 +243,28 @@ def params_proj_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): # done params_proj +# renderer + +RENDERER_ORIGINAL_PREFIX = "renderer.nerstf" + +RENDERER_CONFIG = {} + +def renderer_model_from_original_config(): + model = MLPNeRSTFModel(**RENDERER_CONFIG) + + return model + +def renderer_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = { + k: checkpoint[f"{RENDERER_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys() + } + + return diffusers_checkpoint + +# done renderer + + + # TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) 
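The TODO above refers to `split_attentions`, defined next, which breaks the original fused `c_qkv` projection into separate query/key/value weights and biases. As a rough, hedged illustration of the idea only (it assumes the simplest contiguous `[Q; K; V]` layout, while the real helper is parameterized by `split` and `chunk_size`):

```python
import torch

def split_fused_qkv(weight: torch.Tensor, bias: torch.Tensor):
    """Split a fused attention projection into Q/K/V, assuming a plain [Q; K; V] layout."""
    q_w, k_w, v_w = weight.chunk(3, dim=0)
    q_b, k_b, v_b = bias.chunk(3, dim=0)
    return (q_w, k_w, v_w), (q_b, k_b, v_b)

d_model = 1024
fused_weight = torch.randn(3 * d_model, d_model)  # stand-in for a fused c_qkv weight
fused_bias = torch.randn(3 * d_model)
(q_w, k_w, v_w), (q_b, k_b, v_b) = split_fused_qkv(fused_weight, fused_bias)
assert q_w.shape == (d_model, d_model) and v_b.shape == (d_model,)
```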
def split_attentions(*, weight, bias, split, chunk_size): weights = [None] * split @@ -297,7 +319,7 @@ def prior(*, args, checkpoint_map_location): def params_proj(*, args, checkpoint_map_location): print("loading params_proj") - params_proj_checkpoint = torch.load(args.params_proj_checkpoint_path, map_location=checkpoint_map_location) + params_proj_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location) params_proj_model = params_proj_model_from_original_config() @@ -311,6 +333,23 @@ def params_proj(*, args, checkpoint_map_location): return params_proj_model +def renderer(*, args, checkpoint_map_location): + print(" loading renderer") + + renderer_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location) + + renderer_model = renderer_model_from_original_config() + + renderer_diffusers_checkpoint = renderer_original_checkpoint_to_diffusers_checkpoint(renderer_model, renderer_checkpoint) + + del renderer_checkpoint + + load_checkpoint_to_model(renderer_diffusers_checkpoint, renderer_model, strict=True) + + print("done loading renderer") + + return renderer_model + def load_checkpoint_to_model(checkpoint, model, strict=False): with tempfile.NamedTemporaryFile() as file: @@ -336,11 +375,11 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): ) parser.add_argument( - "--params_proj_checkpoint_path", + "--transmitter_checkpoint_path", default=None, type=str, required=False, - help="Path to the prior checkpoint to convert.", + help="Path to the transmitter checkpoint to convert.", ) parser.add_argument( @@ -376,5 +415,8 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): elif args.debug == "params_proj": params_proj_model = params_proj(args=args, checkpoint_map_location=checkpoint_map_location) params_proj_model.save_pretrained(args.dump_path) + elif args.debug == "renderer": + renderer_model = renderer(args=args, checkpoint_map_location=checkpoint_map_location) + renderer_model.save_pretrained(args.dump_path) else: raise ValueError(f"unknown debug value : {args.debug}") From affe807418a3346a60368c1dcf280cea400c7582 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 20 Jun 2023 20:38:45 +0000 Subject: [PATCH 005/119] refactor camera a little bit --- src/diffusers/pipelines/shap_e/camera.py | 52 +++++++++++++----------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/camera.py b/src/diffusers/pipelines/shap_e/camera.py index 080cf61fde31..cbc58ac73989 100644 --- a/src/diffusers/pipelines/shap_e/camera.py +++ b/src/diffusers/pipelines/shap_e/camera.py @@ -18,6 +18,7 @@ class DifferentiableProjectiveCamera: height: int x_fov: float y_fov: float + shape: Tuple[int] def __post_init__(self): assert self.x.shape[0] == self.y.shape[0] == self.z.shape[0] == self.origin.shape[0] @@ -35,8 +36,8 @@ def resolution(self): def fov(self): return torch.from_numpy(np.array([self.x_fov, self.y_fov], dtype=np.float32)) - - def image_coords(self) -> torch.Tensor: + + def get_image_coords(self) -> torch.Tensor: """ :return: coords of shape (width * height, 2) """ @@ -49,15 +50,30 @@ def image_coords(self) -> torch.Tensor: axis=1, ) return coords + + @property + def camera_rays(self): + + batch_size, *inner_shape = self.shape + inner_batch_size = int(np.prod(inner_shape)) + + coords = self.get_image_coords() + coords = torch.broadcast_to(coords.unsqueeze(0), [batch_size * inner_batch_size, *coords.shape]) + rays = self.get_camera_rays(coords) - def camera_rays(self, 
coords: torch.Tensor) -> torch.Tensor: + rays = rays.view(batch_size, inner_batch_size * self.height * self.width, 2, 3) + + return rays + + def get_camera_rays(self, coords: torch.Tensor) -> torch.Tensor: batch_size, *shape, n_coords = coords.shape assert n_coords == 2 assert batch_size == self.origin.shape[0] + flat = coords.view(batch_size, -1, 2) - res = self.resolution().to(flat.device) - fov = self.fov().to(flat.device) + res = self.resolution() + fov = self.fov() fracs = (flat.float() / (res - 1)) * 2 - 1 fracs = fracs * torch.tan(fov / 2) @@ -96,16 +112,8 @@ def resize_image(self, width: int, height: int) -> "DifferentiableProjectiveCame y_fov=self.y_fov, ) -@dataclass -class DifferentiableCameraBatch: - """ - Annotate a differentiable camera with a multi-dimensional batch shape. - """ - shape: Tuple[int] - flat_camera: DifferentiableProjectiveCamera - -def create_pan_cameras(size: int, device: torch.device) -> DifferentiableCameraBatch: +def create_pan_cameras(size: int) -> DifferentiableProjectiveCamera: origins = [] xs = [] ys = [] @@ -120,16 +128,14 @@ def create_pan_cameras(size: int, device: torch.device) -> DifferentiableCameraB xs.append(x) ys.append(y) zs.append(z) - return DifferentiableCameraBatch( - shape=(1, len(xs)), - flat_camera=DifferentiableProjectiveCamera( - origin=torch.from_numpy(np.stack(origins, axis=0)).float().to(device), - x=torch.from_numpy(np.stack(xs, axis=0)).float().to(device), - y=torch.from_numpy(np.stack(ys, axis=0)).float().to(device), - z=torch.from_numpy(np.stack(zs, axis=0)).float().to(device), + return DifferentiableProjectiveCamera( + origin=torch.from_numpy(np.stack(origins, axis=0)).float(), + x=torch.from_numpy(np.stack(xs, axis=0)).float(), + y=torch.from_numpy(np.stack(ys, axis=0)).float(), + z=torch.from_numpy(np.stack(zs, axis=0)).float(), width=size, height=size, x_fov=0.7, y_fov=0.7, - ), - ) \ No newline at end of file + shape=(1, len(xs)) + ) From fca2532f24e52011c66e2ccd1babbe1f40addd9c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 20 Jun 2023 22:53:56 +0000 Subject: [PATCH 006/119] clean up --- src/diffusers/pipelines/shap_e/__init__.py | 2 +- .../pipelines/shap_e/pipeline_shap_e.py | 214 ++++++++++++++++- src/diffusers/pipelines/shap_e/renderer.py | 227 +++++++++++------- 3 files changed, 345 insertions(+), 98 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index 1a615d351db2..c01d8240976d 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -14,5 +14,5 @@ else: from .pipeline_shap_e import ShapEPipeline from .params_proj import ShapEParamsProjModel - from .renderer import MLPNeRSTFModel, MLPNeRFModelOutput, VoidNeRFModel, BoundingBoxVolume, StratifiedRaySampler + from .renderer import MLPNeRSTFModel, MLPNeRFModelOutput, VoidNeRFModel, BoundingBoxVolume, StratifiedRaySampler, ImportanceRaySampler from .camera import create_pan_cameras diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 182ca699bde9..310b3f75d307 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -15,6 +15,8 @@ from dataclasses import dataclass from typing import List, Optional, Union +import PIL + import numpy as np import torch from transformers import CLIPTextModelWithProjection, CLIPTokenizer @@ -22,6 +24,11 @@ from ...models import PriorTransformer from ...pipelines import DiffusionPipeline from 
...schedulers import HeunDiscreteScheduler + +from .params_proj import ShapEParamsProjModel +from .renderer import MLPNeRSTFModel, MLPNeRFModelOutput, VoidNeRFModel, BoundingBoxVolume, StratifiedRaySampler, ImportanceRaySampler +from .camera import create_pan_cameras + from ...utils import ( BaseOutput, is_accelerate_available, @@ -40,6 +47,54 @@ ``` """ +def merge_results( + self, a: [torch.Tensor], b: torch.Tensor, dim: int, indices: torch.Tensor +): + """ + :param a: [..., n_a, ...]. The other dictionary containing the b's may + contain extra tensors from earlier calculations, so a can be None. + :param b: [..., n_b, ...] + :param dim: dimension to merge + :param indices: how the merged results should be sorted at the end + :return: a concatted and sorted tensor of size [..., n_a + n_b, ...] + """ + merged = torch.cat([a, b], dim=dim) + return torch.gather(merged, dim=dim, index=torch.broadcast_to(indices, merged.shape)) + +def integrate_samples(volume_range, ts, density, channels): + r""" + Function integrating the model output. + + Args: + volume_range: Specifies the integral range [t0, t1] + ts: timesteps + density: torch.Tensor [batch_size, *shape, n_samples, 1] + channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] + returns: + channels: integrated rgb output + weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density *transmittance)[i] weight for each rgb output at [..., i, :]. + transmittance: transmittance of this volume + ) + """ + + # 1. Calculate the weights + _, _, dt = volume_range.partition(ts) + ddensity = density * dt + + mass = torch.cumsum(ddensity, dim=-2) + transmittance = torch.exp(-mass[..., -1, :]) + + alphas = 1.0 - torch.exp(-ddensity) + Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2)) + # This is the probability of light hitting and reflecting off of + # something at depth [..., i, :]. + weights = alphas * Ts + + # 2. 
Integrate channels + channels = torch.sum(channels * weights, dim=-2) + + return channels, weights, transmittance + @dataclass class ShapEPipelineOutput(BaseOutput): @@ -48,10 +103,10 @@ class ShapEPipelineOutput(BaseOutput): Args: images (`torch.FloatTensor`) - 3D latent representation + a list of images for 3D rendering """ - latents: Union[torch.FloatTensor, np.ndarray] + images: Union[PIL.Image.Image, np.ndarray] class ShapEPipeline(DiffusionPipeline): @@ -79,6 +134,8 @@ def __init__( text_encoder: CLIPTextModelWithProjection, tokenizer: CLIPTokenizer, scheduler: HeunDiscreteScheduler, + params_proj: ShapEParamsProjModel, + renderer: MLPNeRSTFModel, ): super().__init__() @@ -87,7 +144,11 @@ def __init__( text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler, + params_proj=params_proj, + renderer=renderer, ) + self.void = VoidNeRFModel(background=[0., 0., 0.], channel_scale = 255.0) + self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -184,6 +245,90 @@ def _encode_prompt( return prompt_embeds + @torch.no_grad() + def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): + + """ + Perform volumetric rendering over a partition of possible t's in the union + of rendering volumes (written below with some abuse of notations) + + C(r) := sum( + transmittance(t[i]) * + integrate( + lambda t: density(t) * channels(t) * transmittance(t), + [t[i], t[i + 1]], + ) + for i in range(len(parts)) + ) + transmittance(t[-1]) * void_model(t[-1]).channels + + where + + 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the + probability of light passing through the volume specified by [t[0], s]. + (transmittance of 1 means light can pass freely) + 2) density and channels are obtained by evaluating the appropriate + part.model at time t. + 3) [t[i], t[i + 1]] is defined as the range of t where the ray intersects + (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface + of the shell (if bounded). If the ray does not intersect, the integral over + this segment is evaluated as 0 and transmittance(t[i + 1]) := + transmittance(t[i]). + 4) The last term is integration to infinity (e.g. [t[-1], math.inf]) that + is evaluated by the void_model (i.e. we consider this space to be empty). + + args: + rays: [batch_size x ... x 2 x 3] origin and direction. + sampler: disjoint volume integrals. + n_samples: number of ts to sample. + prev_model_outputs: model outputs from the previous rendering step, including + + :return: A tuple of + - `channels` + - A importance samplers for additional fine-grained rendering + - raw model output + """ + origin, direction = rays[..., 0, :], rays[..., 1, :] + + # Integrate over [t[i], t[i + 1]] + + # 1 Intersect the rays with the current volume and sample ts to integrate along. + vrange = self.volume.intersect(origin, direction, t0_lower=None) + ts = sampler.sample(vrange.t0, vrange.t1, n_samples) + ts = ts.to(rays.dtype) + + if prev_model_out is not None: + # Append the previous ts now before fprop because previous + # rendering used a different model and we can't reuse the output. + ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values + + batch_size, *_shape, _t0_dim = vrange.t0.shape + _, *ts_shape, _ts_dim = ts.shape + + # 2. 
Get the points along the ray and query the model + directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) + positions = origin.unsqueeze(-2) + ts * directions + + optional_directions = directions if render_with_direction else None + + model_out = self.renderer(position=positions, direction=optional_directions, ts=ts, nerf_level = "coarse" if prev_model_out is None else "fine") + + # 3. Integrate the model results + channels, weights, transmittance = integrate_samples(vrange, model_out.ts, model_out.density, model_out.channels) + + # 4. Clean up results that do not intersect with the volume. + transmittance = torch.where( + vrange.intersected, transmittance, torch.ones_like(transmittance) + ) + channels = torch.where( + vrange.intersected, channels, torch.zeros_like(channels) + ) + # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + channels = channels + transmittance * self.void(origin) + + weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) + + return channels, weighted_sampler, model_out + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -196,6 +341,10 @@ def __call__( guidance_scale: float = 4.0, sigma_min: float = 1e-3, sigma_max: float = 160.0, + size: int = 64, + ray_batch_size: int = 4096, + n_coarse_samples= 64, + n_fine_samples= 128, output_type: Optional[str] = "pt", # pt only return_dict: bool = True, ): @@ -248,6 +397,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) + # prior self.scheduler.set_timesteps( @@ -266,7 +416,8 @@ def __call__( latents, self.scheduler, ) - # for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim + + # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) for i, t in enumerate(self.progress_bar(timesteps)): @@ -298,14 +449,53 @@ def __call__( sample=latents, step_index=i, ).prev_sample - - if output_type not in ["pt", "np"]: - raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") - - if output_type == "np": - latents = latents.cpu().numpy() - + + # project the the paramters from the generated latents + projected_params = self.params_proj(latents) + + # update the mlp layers of the renderer + for name, param in self.renderer.state_dict().items(): + if f"nerstf.{name}" in projected_params.keys(): + param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) + + # create cameras object + camera = create_pan_cameras(size) + rays = camera.camera_rays + rays = rays.to(device) + n_batches = rays.shape[1] // ray_batch_size + + coarse_sampler = StratifiedRaySampler() + + images = [] + with self.progress_bar(total=n_batches) as progress_bar: + for idx in range(n_batches): + rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] + + # render rays with coarse, stratified samples. + _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) + # Then, render with additional importance-weighted ray samples. 
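+                # (the coarse pass's density * transmittance weights define a pmf over depth
+                #  bins, so the importance sampler concentrates the fine samples there)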
+ channels, _ , _ = self.render_rays(rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out) + + images.append(channels) + progress_bar.update() + + images = torch.cat(images, dim=1) + images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) + + if output_type not in ["np", "pil"]: + raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") + + images = images.cpu().numpy() + + + if output_type == "pil": + images = self.numpy_to_pil(images) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + if not return_dict: - return latents + return(images,) - return ShapEPipelineOutput(latents) + return ShapEPipelineOutput(images=images) diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 53348795f5a8..9c58ad25f438 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -28,6 +28,58 @@ from typing import Optional, Dict +def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: + r""" + Sample from the given discrete probability distribution with replacement. + + The i-th bin is assumed to have mass pmf[i]. + + Args: + pmf: [batch_size, *shape, n_samples, 1] where (pmf.sum(dim=-2) == 1).all() + n_samples: number of samples + + Return: + indices sampled with replacement + """ + + *shape, support_size, last_dim = pmf.shape + assert last_dim == 1 + + cdf = torch.cumsum(pmf.view(-1, support_size), dim=1) + inds = torch.searchsorted(cdf, torch.rand(cdf.shape[0], n_samples, device=cdf.device)) + + return inds.view(*shape, n_samples, 1).clamp(0, support_size - 1) + + +def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor: + """ + Concatenate x and its positional encodings, following NeRF. + + Reference: https://arxiv.org/pdf/2210.04628.pdf + """ + if min_deg == max_deg: + return x + + scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype, device=x.device) + *shape, dim = x.shape + xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1) + assert xb.shape[-1] == dim * (max_deg - min_deg) + emb = torch.cat([xb, xb + math.pi / 2.0], axis=-1).sin() + return torch.cat([x, emb], dim=-1) + + +def encode_position(position): + + return posenc_nerf(position, min_deg=0, max_deg=15) + + +def encode_direction(position, direction=None): + if direction is None: + return torch.zeros_like(posenc_nerf(position, min_deg=0, max_deg=8)) + else: + return posenc_nerf(direction, min_deg=0, max_deg=8) + + class VoidNeRFModel(nn.Module): """ Implements the default empty space model where all queries are rendered as @@ -55,6 +107,7 @@ def forward(self, position): return background + @dataclass class VolumeRange: t0: torch.Tensor @@ -68,29 +121,28 @@ def partition(self, ts): """ Partitions t0 and t1 into n_samples intervals. 
- :param ts: [batch_size, *shape, n_samples, 1] - :return: a tuple of ( + Args: + ts: [batch_size, *shape, n_samples, 1] + + Return: + lower: [batch_size, *shape, n_samples, 1] upper: [batch_size, *shape, n_samples, 1] delta: [batch_size, *shape, n_samples, 1] - ) where - + + where ts \\in [lower, upper] deltas = upper - lower """ - #print(" ") - #print(f" inside BoundingBoxVolume.partition:") - #print(f" - ts: {ts.shape}, {ts.abs().sum()}") + mids = (ts[..., 1:, :] + ts[..., :-1, :]) * 0.5 - #print(f" - mids: {mids.shape}, {mids.abs().sum()}") lower = torch.cat([self.t0[..., None, :], mids], dim=-2) - #print(f" -t0: {self.t0.shape}, {self.t0.abs().sum()}") upper = torch.cat([mids, self.t1[..., None, :]], dim=-2) - #print(f" -upper: {upper.shape}, {upper.abs().sum()}") delta = upper - lower assert lower.shape == upper.shape == delta.shape == ts.shape return lower, upper, delta + class BoundingBoxVolume(nn.Module): """ Axis-aligned bounding box defined by the two opposite corners. @@ -105,20 +157,22 @@ def __init__( min_t_range: float = 1e-3, ): """ - :param bbox_min: the left/bottommost corner of the bounding box - :param bbox_max: the other corner of the bounding box - :param min_dist: all rays should start at least this distance away from the origin. + Args: + bbox_min: the left/bottommost corner of the bounding box + bbox_max: the other corner of the bounding box + min_dist: all rays should start at least this distance away from the origin. """ super().__init__() - self.bbox_min = torch.tensor(bbox_min) - self.bbox_max = torch.tensor(bbox_max) self.min_dist = min_dist self.min_t_range = min_t_range + + self.bbox_min = torch.tensor(bbox_min) + self.bbox_max = torch.tensor(bbox_max) self.bbox = torch.stack([self.bbox_min, self.bbox_max]) assert self.bbox.shape == (2, 3) - assert self.min_dist >= 0.0 - assert self.min_t_range > 0.0 + assert min_dist >= 0.0 + assert min_t_range > 0.0 def intersect( self, @@ -128,13 +182,15 @@ def intersect( epsilon=1e-6, ): """ - :param origin: [batch_size, *shape, 3] - :param direction: [batch_size, *shape, 3] - :param t0_lower: Optional [batch_size, *shape, 1] lower bound of t0 when intersecting this volume. - :param params: Optional meta parameters in case Volume is parametric - :param epsilon: to stabilize calculations - - :return: A tuple of (t0, t1, intersected) where each has a shape + Args: + origin: [batch_size, *shape, 3] + direction: [batch_size, *shape, 3] + t0_lower: Optional [batch_size, *shape, 1] lower bound of t0 when intersecting this volume. + params: Optional meta parameters in case Volume is parametric + epsilon: to stabilize calculations + + Return: + A tuple of (t0, t1, intersected) where each has a shape [batch_size, *shape, 1]. If a ray intersects with the volume, `o + td` is in the volume for all t in [t0, t1]. If the volume is bounded, t1 is guaranteed to be on the boundary of the volume. 
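The `intersect` docstring above describes what is essentially the classic slab test for ray/axis-aligned-box intersection. A minimal stand-alone sketch of the same idea, assuming a flat `[batch, 3]` ray layout and reproducing the sign-aware epsilon of `_safe_divide` inline:

```python
import torch

def ray_aabb_intersect(origin, direction, bbox_min, bbox_max,
                       min_dist=0.0, min_t_range=1e-3, epsilon=1e-6):
    # Per-axis hit times against both corner planes; the sign-aware epsilon keeps the
    # division finite when a ray runs parallel to an axis.
    bbox = torch.stack([bbox_min, bbox_max])                     # [2, 3]
    safe_dir = torch.where(direction < 0, direction - epsilon, direction + epsilon)
    ts = (bbox - origin[:, None, :]) / safe_dir[:, None, :]      # [batch, 2, 3]

    # Entry time is the latest of the per-axis near hits, exit time the earliest far hit.
    t0 = ts.min(dim=1).values.max(dim=-1, keepdim=True).values.clamp(min_dist)
    t1 = ts.max(dim=1).values.min(dim=-1, keepdim=True).values
    intersected = t0 + min_t_range < t1
    return t0, t1, intersected

origin = torch.tensor([[0.0, 0.0, -3.0]])
direction = torch.tensor([[0.0, 0.0, 1.0]])
t0, t1, hit = ray_aabb_intersect(origin, direction,
                                 torch.tensor([-1.0, -1.0, -1.0]), torch.tensor([1.0, 1.0, 1.0]))
print(t0.item(), t1.item(), hit.item())  # ~2.0, ~4.0, True
```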
@@ -171,6 +227,7 @@ def _safe_divide(a, b, epsilon=1e-6): return VolumeRange(t0=t0, t1=t1, intersected=intersected) + class StratifiedRaySampler(nn.Module): """ Instead of fixed intervals, a sample is drawn uniformly at random from each @@ -193,10 +250,12 @@ def sample( epsilon: float = 1e-3, ) -> torch.Tensor: """ - :param t0: start time has shape [batch_size, *shape, 1] - :param t1: finish time has shape [batch_size, *shape, 1] - :param n_samples: number of ts to sample - :return: sampled ts of shape [batch_size, *shape, n_samples, 1] + Args: + t0: start time has shape [batch_size, *shape, 1] + t1: finish time has shape [batch_size, *shape, 1] + n_samples: number of ts to sample + Return: + sampled ts of shape [batch_size, *shape, n_samples, 1] """ ones = [1] * (len(t0.shape) - 1) ts = torch.linspace(0, 1, n_samples).view(*ones, n_samples).to(t0.dtype).to(t0.device) @@ -221,36 +280,62 @@ def sample( return ts.unsqueeze(-1) -def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor: +class ImportanceRaySampler(nn.Module): """ - Concatenate x and its positional encodings, following NeRF. - - Reference: https://arxiv.org/pdf/2210.04628.pdf + Given the initial estimate of densities, this samples more from + regions/bins expected to have objects. """ - if min_deg == max_deg: - return x - print(" ") - print(f" inside posenc_nerf") - print(f" - x.device {x.device}, x.dtype: {x.dtype}") - scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype, device=x.device) - *shape, dim = x.shape - xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1) - assert xb.shape[-1] == dim * (max_deg - min_deg) - emb = torch.cat([xb, xb + math.pi / 2.0], axis=-1).sin() - return torch.cat([x, emb], dim=-1) -def encode_position(position): + def __init__( + self, volume_range: VolumeRange, ts: torch.Tensor, weights: torch.Tensor, blur_pool: bool = False, alpha: float = 1e-5 + ): + """ + Args: + volume_range: the range in which a ray intersects the given volume. + ts: earlier samples from the coarse rendering step + weights: discretized version of density * transmittance + blur_pool: if true, use 2-tap max + 2-tap blur filter from mip-NeRF. + alpha: small value to add to weights. 
+ """ + self.volume_range = volume_range + self.ts = ts.clone().detach() + self.weights = weights.clone().detach() + self.blur_pool = blur_pool + self.alpha = alpha + + @torch.no_grad() + def sample(self, t0: torch.Tensor, t1: torch.Tensor, n_samples: int) -> torch.Tensor: + """ + Args: + t0: start time has shape [batch_size, *shape, 1] + t1: finish time has shape [batch_size, *shape, 1] + n_samples: number of ts to sample + Return: + sampled ts of shape [batch_size, *shape, n_samples, 1] + """ + lower, upper, _ = self.volume_range.partition(self.ts) - return posenc_nerf(position, min_deg=0, max_deg=15) + batch_size, *shape, n_coarse_samples, _ = self.ts.shape -def encode_direction(position, direction=None): - if direction is None: - return torch.zeros_like(posenc_nerf(position, min_deg=0, max_deg=8)) - else: - return posenc_nerf(direction, min_deg=0, max_deg=8) + weights = self.weights + if self.blur_pool: + padded = torch.cat([weights[..., :1, :], weights, weights[..., -1:, :]], dim=-2) + maxes = torch.maximum(padded[..., :-1, :], padded[..., 1:, :]) + weights = 0.5 * (maxes[..., :-1, :] + maxes[..., 1:, :]) + weights = weights + self.alpha + pmf = weights / weights.sum(dim=-2, keepdim=True) + inds = sample_pmf(pmf, n_samples) + assert inds.shape == (batch_size, *shape, n_samples, 1) + assert (inds >= 0).all() and (inds < n_coarse_samples).all() + + t_rand = torch.rand(inds.shape, device=inds.device) + lower_ = torch.gather(lower, -2, inds) + upper_ = torch.gather(upper, -2, inds) + + ts = lower_ + (upper_ - lower_) * t_rand + ts = torch.sort(ts, dim=-2).values + return ts -def swish(x): - return x * torch.sigmoid(x) @dataclass class MLPNeRFModelOutput(BaseOutput): @@ -322,51 +407,28 @@ def map_indices_to_keys(self, output): def forward(self, *, position, direction, ts, nerf_level = "coarse"): - print(" ") - print(f" model inputs:") - print(f" - position: {position.shape}, {position.abs().sum()}") - print(f" - direction: {direction}") - h = encode_position(position) - print(f" position after encode -> h: {h.shape}, {h.abs().sum()}") + h_preact = h h_directionless = None for i, layer in enumerate(self.mlp): - print(f" ") - print(f" ***** layer {i}") if i == self.config.insert_direction_at: # 4 in the config - print(" insert direction") + h_directionless = h_preact h_direction = encode_direction(position, direction=direction) h = torch.cat([h, h_direction], dim=-1) - print(f" -> h with direction: {h.shape}, {h.abs().sum()}") - #batch_size, *shape, d_in = h.shape - #h = h.view(batch_size, -1, d_in) - print(f" h: {h.shape}, {h.abs().sum()}") - #print(h[0,0,:]) - print(f" weight: {layer.weight.shape}, {layer.weight.abs().sum()}") - #print(layer.weight[0,:]) - #print(f" bias: {layer.bias.shape}, {layer.bias.abs().sum()}") + h = layer(h) - #print(f" -> layer -> {h.shape}, {h.abs().sum()}") - #print(h[0,0,0]) h_preact = h + if i < len(self.mlp) - 1: - print(self.activation) h = self.activation(h) - print(f" -> act -> {h.shape}, {h.abs().sum()}") + h_final = h if h_directionless is None: h_directionless = h_preact - print(" ") - print(" ***************************") - print(" out:") - print(f" - h_final:{h_final.shape},{h_final.abs().sum()}") - print(f" - h_directionless: {h_directionless.shape}, {h_directionless.abs().sum()}") - print(" ***************************") - print(" ") activation = self.map_indices_to_keys(h_final) @@ -380,11 +442,6 @@ def forward(self, *, position, direction, ts, nerf_level = "coarse"): density=self.density_activation(h_density) 
signed_distance=self.sdf_activation(activation['sdf']) channels=self.channel_activation(h_channels) - print(" model out /raw !!" ) - print(f" density: {density.shape}, {density.abs().sum()}") - print(f" signed_distance: {signed_distance.shape}, {signed_distance.abs().sum()}") - print(f" channels: {channels.shape}, {channels.abs().sum()}") - print(f" ts: {ts.shape}, {ts.abs().sum()}") # yiyi notes: I think signed_distance is not used return MLPNeRFModelOutput(density = density, signed_distance= signed_distance, channels=channels, ts=ts) \ No newline at end of file From f23425345933c4585ec0f1dee781b7cf49a739ce Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 20 Jun 2023 22:57:10 +0000 Subject: [PATCH 007/119] style --- scripts/convert_shap_e_to_diffusers.py | 32 ++-- src/diffusers/pipelines/shap_e/__init__.py | 13 +- src/diffusers/pipelines/shap_e/camera.py | 40 ++--- src/diffusers/pipelines/shap_e/params_proj.py | 25 +-- .../pipelines/shap_e/pipeline_shap_e.py | 143 +++++++++--------- src/diffusers/pipelines/shap_e/renderer.py | 140 ++++++++--------- 6 files changed, 187 insertions(+), 206 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 79a58cc03f45..217acce43dcd 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -3,10 +3,9 @@ import torch from accelerate import load_checkpoint_and_dispatch -from collections import OrderedDict from diffusers.models.prior_transformer import PriorTransformer -from diffusers.pipelines.shap_e import ShapEParamsProjModel, MLPNeRSTFModel +from diffusers.pipelines.shap_e import MLPNeRSTFModel, ShapEParamsProjModel """ @@ -225,6 +224,7 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix PARAMS_PROJ_CONFIG = {} + def params_proj_model_from_original_config(): model = ShapEParamsProjModel(**PARAMS_PROJ_CONFIG) @@ -232,10 +232,7 @@ def params_proj_model_from_original_config(): def params_proj_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): - - diffusers_checkpoint = { - k: checkpoint[f"{PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys() - } + diffusers_checkpoint = {k: checkpoint[f"{PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys()} return diffusers_checkpoint @@ -249,20 +246,20 @@ def params_proj_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): RENDERER_CONFIG = {} + def renderer_model_from_original_config(): model = MLPNeRSTFModel(**RENDERER_CONFIG) return model + def renderer_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): - diffusers_checkpoint = { - k: checkpoint[f"{RENDERER_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys() - } + diffusers_checkpoint = {k: checkpoint[f"{RENDERER_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys()} return diffusers_checkpoint -# done renderer +# done renderer # TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) 
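Both `params_proj_original_checkpoint_to_diffusers_checkpoint` and `renderer_original_checkpoint_to_diffusers_checkpoint` follow the same pattern: iterate over the diffusers model's `state_dict()` keys and look each one up in the original `transmitter.pt` state dict under a fixed prefix. A minimal sketch of that re-keying, using a made-up prefix and a toy module rather than the real Shap-E names:

```python
import torch
from torch import nn

def rekey_by_prefix(model: nn.Module, original_checkpoint: dict, prefix: str) -> dict:
    # Build a diffusers-style checkpoint by looking up every key of the target model
    # under `prefix` in the original state dict; a missing key raises KeyError.
    return {k: original_checkpoint[f"{prefix}.{k}"] for k in model.state_dict().keys()}

# toy example with a placeholder prefix
model = nn.Linear(3, 2)
original = {f"some.prefix.{k}": v.clone() for k, v in model.state_dict().items()}
converted = rekey_by_prefix(model, original, "some.prefix")
model.load_state_dict(converted)
```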
@@ -320,27 +317,32 @@ def params_proj(*, args, checkpoint_map_location): print("loading params_proj") params_proj_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location) - + params_proj_model = params_proj_model_from_original_config() - params_proj_diffusers_checkpoint = params_proj_original_checkpoint_to_diffusers_checkpoint(params_proj_model, params_proj_checkpoint) + params_proj_diffusers_checkpoint = params_proj_original_checkpoint_to_diffusers_checkpoint( + params_proj_model, params_proj_checkpoint + ) del params_proj_checkpoint - load_checkpoint_to_model(params_proj_diffusers_checkpoint,params_proj_model, strict=True) + load_checkpoint_to_model(params_proj_diffusers_checkpoint, params_proj_model, strict=True) print("done loading params_proj") return params_proj_model + def renderer(*, args, checkpoint_map_location): print(" loading renderer") renderer_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location) - + renderer_model = renderer_model_from_original_config() - renderer_diffusers_checkpoint = renderer_original_checkpoint_to_diffusers_checkpoint(renderer_model, renderer_checkpoint) + renderer_diffusers_checkpoint = renderer_original_checkpoint_to_diffusers_checkpoint( + renderer_model, renderer_checkpoint + ) del renderer_checkpoint diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index c01d8240976d..76ca0ea814f1 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -12,7 +12,14 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline else: - from .pipeline_shap_e import ShapEPipeline - from .params_proj import ShapEParamsProjModel - from .renderer import MLPNeRSTFModel, MLPNeRFModelOutput, VoidNeRFModel, BoundingBoxVolume, StratifiedRaySampler, ImportanceRaySampler from .camera import create_pan_cameras + from .params_proj import ShapEParamsProjModel + from .pipeline_shap_e import ShapEPipeline + from .renderer import ( + BoundingBoxVolume, + ImportanceRaySampler, + MLPNeRFModelOutput, + MLPNeRSTFModel, + StratifiedRaySampler, + VoidNeRFModel, + ) diff --git a/src/diffusers/pipelines/shap_e/camera.py b/src/diffusers/pipelines/shap_e/camera.py index cbc58ac73989..cfe86fb7c2cf 100644 --- a/src/diffusers/pipelines/shap_e/camera.py +++ b/src/diffusers/pipelines/shap_e/camera.py @@ -1,8 +1,9 @@ from dataclasses import dataclass from typing import Tuple -import torch import numpy as np +import torch + @dataclass class DifferentiableProjectiveCamera: @@ -23,20 +24,14 @@ class DifferentiableProjectiveCamera: def __post_init__(self): assert self.x.shape[0] == self.y.shape[0] == self.z.shape[0] == self.origin.shape[0] assert self.x.shape[1] == self.y.shape[1] == self.z.shape[1] == self.origin.shape[1] == 3 - assert ( - len(self.x.shape) - == len(self.y.shape) - == len(self.z.shape) - == len(self.origin.shape) - == 2 - ) + assert len(self.x.shape) == len(self.y.shape) == len(self.z.shape) == len(self.origin.shape) == 2 def resolution(self): return torch.from_numpy(np.array([self.width, self.height], dtype=np.float32)) def fov(self): return torch.from_numpy(np.array([self.x_fov, self.y_fov], dtype=np.float32)) - + def get_image_coords(self) -> torch.Tensor: """ :return: coords of shape (width * height, 2) @@ -50,10 +45,9 @@ def get_image_coords(self) -> torch.Tensor: axis=1, ) return coords - + @property def camera_rays(self): - batch_size, 
*inner_shape = self.shape inner_batch_size = int(np.prod(inner_shape)) @@ -87,9 +81,7 @@ def get_camera_rays(self, coords: torch.Tensor) -> torch.Tensor: directions = directions / directions.norm(dim=-1, keepdim=True) rays = torch.stack( [ - torch.broadcast_to( - self.origin.view(batch_size, 1, 3), [batch_size, directions.shape[1], 3] - ), + torch.broadcast_to(self.origin.view(batch_size, 1, 3), [batch_size, directions.shape[1], 3]), directions, ], dim=2, @@ -129,13 +121,13 @@ def create_pan_cameras(size: int) -> DifferentiableProjectiveCamera: ys.append(y) zs.append(z) return DifferentiableProjectiveCamera( - origin=torch.from_numpy(np.stack(origins, axis=0)).float(), - x=torch.from_numpy(np.stack(xs, axis=0)).float(), - y=torch.from_numpy(np.stack(ys, axis=0)).float(), - z=torch.from_numpy(np.stack(zs, axis=0)).float(), - width=size, - height=size, - x_fov=0.7, - y_fov=0.7, - shape=(1, len(xs)) - ) + origin=torch.from_numpy(np.stack(origins, axis=0)).float(), + x=torch.from_numpy(np.stack(xs, axis=0)).float(), + y=torch.from_numpy(np.stack(ys, axis=0)).float(), + z=torch.from_numpy(np.stack(zs, axis=0)).float(), + width=size, + height=size, + x_fov=0.7, + y_fov=0.7, + shape=(1, len(xs)), + ) diff --git a/src/diffusers/pipelines/shap_e/params_proj.py b/src/diffusers/pipelines/shap_e/params_proj.py index 1910889bc63d..47098e92d20e 100644 --- a/src/diffusers/pipelines/shap_e/params_proj.py +++ b/src/diffusers/pipelines/shap_e/params_proj.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Tuple + import torch from torch import nn -from typing import Tuple, Optional -from collections import OrderedDict - from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin + class ChannelsProj(nn.Module): def __init__( self, @@ -51,8 +51,9 @@ class ShapEParamsProjModel(ModelMixin, ConfigMixin): """ project the latent representation of a 3D asset to obtain weights of a multi-layer perceptron (MLP). 
- For more details, see the original paper: + For more details, see the original paper: """ + @register_to_config def __init__( self, @@ -63,16 +64,19 @@ def __init__( "nerstf.mlp.2.weight", "nerstf.mlp.3.weight", ), - param_shapes: Tuple[Tuple[int]] = ((256, 93),(256, 256),(256, 256),(256, 256),), + param_shapes: Tuple[Tuple[int]] = ( + (256, 93), + (256, 256), + (256, 256), + (256, 256), + ), d_latent: int = 1024, - ): + ): super().__init__() # check inputs if len(param_names) != len(param_shapes): - raise ValueError( - f"Must provide same number of `param_names` as `param_shapes`" - ) + raise ValueError("Must provide same number of `param_names` as `param_shapes`") self.projections = nn.ModuleDict({}) for k, (vectors, channels) in zip(param_names, param_shapes): self.projections[_sanitize_name(k)] = ChannelsProj( @@ -92,5 +96,6 @@ def forward(self, x: torch.Tensor): start = end return out + def _sanitize_name(x: str) -> str: - return x.replace(".", "__") \ No newline at end of file + return x.replace(".", "__") diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 310b3f75d307..5a7bacac5ebf 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -15,20 +15,14 @@ from dataclasses import dataclass from typing import List, Optional, Union -import PIL - import numpy as np +import PIL import torch from transformers import CLIPTextModelWithProjection, CLIPTokenizer from ...models import PriorTransformer from ...pipelines import DiffusionPipeline from ...schedulers import HeunDiscreteScheduler - -from .params_proj import ShapEParamsProjModel -from .renderer import MLPNeRSTFModel, MLPNeRFModelOutput, VoidNeRFModel, BoundingBoxVolume, StratifiedRaySampler, ImportanceRaySampler -from .camera import create_pan_cameras - from ...utils import ( BaseOutput, is_accelerate_available, @@ -36,6 +30,15 @@ randn_tensor, replace_example_docstring, ) +from .camera import create_pan_cameras +from .params_proj import ShapEParamsProjModel +from .renderer import ( + BoundingBoxVolume, + ImportanceRaySampler, + MLPNeRSTFModel, + StratifiedRaySampler, + VoidNeRFModel, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -47,33 +50,30 @@ ``` """ -def merge_results( - self, a: [torch.Tensor], b: torch.Tensor, dim: int, indices: torch.Tensor -): + +def merge_results(self, a: [torch.Tensor], b: torch.Tensor, dim: int, indices: torch.Tensor): """ :param a: [..., n_a, ...]. The other dictionary containing the b's may contain extra tensors from earlier calculations, so a can be None. - :param b: [..., n_b, ...] - :param dim: dimension to merge - :param indices: how the merged results should be sorted at the end - :return: a concatted and sorted tensor of size [..., n_a + n_b, ...] + :param b: [..., n_b, ...] :param dim: dimension to merge :param indices: how the merged results should be sorted at + the end :return: a concatted and sorted tensor of size [..., n_a + n_b, ...] """ merged = torch.cat([a, b], dim=dim) return torch.gather(merged, dim=dim, index=torch.broadcast_to(indices, merged.shape)) + def integrate_samples(volume_range, ts, density, channels): r""" Function integrating the model output. 
- + Args: volume_range: Specifies the integral range [t0, t1] ts: timesteps density: torch.Tensor [batch_size, *shape, n_samples, 1] channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] - returns: - channels: integrated rgb output - weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density *transmittance)[i] weight for each rgb output at [..., i, :]. - transmittance: transmittance of this volume + returns: + channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density + *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume ) """ @@ -147,7 +147,7 @@ def __init__( params_proj=params_proj, renderer=renderer, ) - self.void = VoidNeRFModel(background=[0., 0., 0.], channel_scale = 255.0) + self.void = VoidNeRFModel(background=[0.0, 0.0, 0.0], channel_scale=255.0) self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @@ -247,40 +247,29 @@ def _encode_prompt( @torch.no_grad() def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): - """ - Perform volumetric rendering over a partition of possible t's in the union - of rendering volumes (written below with some abuse of notations) + Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written below + with some abuse of notations) C(r) := sum( - transmittance(t[i]) * - integrate( - lambda t: density(t) * channels(t) * transmittance(t), - [t[i], t[i + 1]], - ) - for i in range(len(parts)) + transmittance(t[i]) * integrate( + lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]], + ) for i in range(len(parts)) ) + transmittance(t[-1]) * void_model(t[-1]).channels where - 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the - probability of light passing through the volume specified by [t[0], s]. - (transmittance of 1 means light can pass freely) - 2) density and channels are obtained by evaluating the appropriate - part.model at time t. - 3) [t[i], t[i + 1]] is defined as the range of t where the ray intersects - (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface - of the shell (if bounded). If the ray does not intersect, the integral over - this segment is evaluated as 0 and transmittance(t[i + 1]) := - transmittance(t[i]). - 4) The last term is integration to infinity (e.g. [t[-1], math.inf]) that - is evaluated by the void_model (i.e. we consider this space to be empty). - + 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through + the volume specified by [t[0], s]. (transmittance of 1 means light can pass freely) 2) density and channels are + obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t + where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the + shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and + transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], + math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + args: - rays: [batch_size x ... x 2 x 3] origin and direction. - sampler: disjoint volume integrals. - n_samples: number of ts to sample. 
- prev_model_outputs: model outputs from the previous rendering step, including + rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: + number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including :return: A tuple of - `channels` @@ -288,19 +277,19 @@ def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with - raw model output """ origin, direction = rays[..., 0, :], rays[..., 1, :] - + # Integrate over [t[i], t[i + 1]] # 1 Intersect the rays with the current volume and sample ts to integrate along. vrange = self.volume.intersect(origin, direction, t0_lower=None) ts = sampler.sample(vrange.t0, vrange.t1, n_samples) ts = ts.to(rays.dtype) - + if prev_model_out is not None: # Append the previous ts now before fprop because previous # rendering used a different model and we can't reuse the output. ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values - + batch_size, *_shape, _t0_dim = vrange.t0.shape _, *ts_shape, _ts_dim = ts.shape @@ -309,22 +298,25 @@ def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with positions = origin.unsqueeze(-2) + ts * directions optional_directions = directions if render_with_direction else None - - model_out = self.renderer(position=positions, direction=optional_directions, ts=ts, nerf_level = "coarse" if prev_model_out is None else "fine") - - # 3. Integrate the model results - channels, weights, transmittance = integrate_samples(vrange, model_out.ts, model_out.density, model_out.channels) - # 4. Clean up results that do not intersect with the volume. - transmittance = torch.where( - vrange.intersected, transmittance, torch.ones_like(transmittance) + model_out = self.renderer( + position=positions, + direction=optional_directions, + ts=ts, + nerf_level="coarse" if prev_model_out is None else "fine", ) - channels = torch.where( - vrange.intersected, channels, torch.zeros_like(channels) + + # 3. Integrate the model results + channels, weights, transmittance = integrate_samples( + vrange, model_out.ts, model_out.density, model_out.channels ) + + # 4. Clean up results that do not intersect with the volume. + transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance)) + channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels)) # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). 
channels = channels + transmittance * self.void(origin) - + weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) return channels, weighted_sampler, model_out @@ -343,8 +335,8 @@ def __call__( sigma_max: float = 160.0, size: int = 64, ray_batch_size: int = 4096, - n_coarse_samples= 64, - n_fine_samples= 128, + n_coarse_samples=64, + n_fine_samples=128, output_type: Optional[str] = "pt", # pt only return_dict: bool = True, ): @@ -397,7 +389,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) - + # prior self.scheduler.set_timesteps( @@ -449,53 +441,54 @@ def __call__( sample=latents, step_index=i, ).prev_sample - + # project the the paramters from the generated latents projected_params = self.params_proj(latents) - # update the mlp layers of the renderer + # update the mlp layers of the renderer for name, param in self.renderer.state_dict().items(): if f"nerstf.{name}" in projected_params.keys(): param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) - + # create cameras object camera = create_pan_cameras(size) rays = camera.camera_rays rays = rays.to(device) n_batches = rays.shape[1] // ray_batch_size - + coarse_sampler = StratifiedRaySampler() images = [] with self.progress_bar(total=n_batches) as progress_bar: for idx in range(n_batches): rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] - + # render rays with coarse, stratified samples. _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) # Then, render with additional importance-weighted ray samples. - channels, _ , _ = self.render_rays(rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out) + channels, _, _ = self.render_rays( + rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out + ) images.append(channels) progress_bar.update() - + images = torch.cat(images, dim=1) images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) - + if output_type not in ["np", "pil"]: raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") images = images.cpu().numpy() - if output_type == "pil": images = self.numpy_to_pil(images) - + # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() - + if not return_dict: - return(images,) + return (images,) return ShapEPipelineOutput(images=images) diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 9c58ad25f438..5ada113224f6 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -12,21 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
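A detail of `__call__` above that is easy to miss: the `params_proj` output is written into the renderer in place through `state_dict()` and `copy_`, instead of rebuilding the module. A minimal sketch of that weight injection; the `nerstf.` prefix matches the pipeline code, while the `nn.Linear` and the fake projected dict are stand-ins:

```python
import torch
from torch import nn

@torch.no_grad()
def inject_projected_params(module: nn.Module, projected: dict, prefix: str = "nerstf."):
    # state_dict() tensors share storage with the module's parameters, so copy_
    # overwrites the weights in place; projected values carry a leading batch dim of 1.
    for name, param in module.state_dict().items():
        key = prefix + name
        if key in projected:
            param.copy_(projected[key].squeeze(0))

renderer = nn.Linear(4, 4)
projected = {"nerstf.weight": torch.zeros(1, 4, 4), "nerstf.bias": torch.ones(1, 4)}
inject_projected_params(renderer, projected)
assert torch.equal(renderer.bias.data, torch.ones(4))
```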
+import math from dataclasses import dataclass +from typing import Optional +import numpy as np import torch -from torch import nn -import math import torch.nn.functional as F - -import numpy as np +from torch import nn from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin from ...utils import BaseOutput -from typing import Optional, Dict - def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: r""" @@ -38,7 +36,7 @@ def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: pmf: [batch_size, *shape, n_samples, 1] where (pmf.sum(dim=-2) == 1).all() n_samples: number of samples - Return: + Return: indices sampled with replacement """ @@ -69,7 +67,6 @@ def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.T def encode_position(position): - return posenc_nerf(position, min_deg=0, max_deg=15) @@ -82,16 +79,12 @@ def encode_direction(position, direction=None): class VoidNeRFModel(nn.Module): """ - Implements the default empty space model where all queries are rendered as - background. + Implements the default empty space model where all queries are rendered as background. """ - def __init__(self, background, channel_scale= 255.0): + def __init__(self, background, channel_scale=255.0): super().__init__() - background = nn.Parameter( - torch.from_numpy(np.array(background)).to(dtype=torch.float32) - / channel_scale - ) + background = nn.Parameter(torch.from_numpy(np.array(background)).to(dtype=torch.float32) / channel_scale) self.register_buffer("background", background) @@ -101,9 +94,7 @@ def forward(self, position): shape = position.shape[:-1] ones = [1] * (len(shape) - 1) n_channels = background.shape[-1] - background = torch.broadcast_to( - background.view(background.shape[0], *ones, n_channels), [*shape, n_channels] - ) + background = torch.broadcast_to(background.view(background.shape[0], *ones, n_channels), [*shape, n_channels]) return background @@ -123,16 +114,14 @@ def partition(self, ts): Args: ts: [batch_size, *shape, n_samples, 1] - - Return: - - lower: [batch_size, *shape, n_samples, 1] - upper: [batch_size, *shape, n_samples, 1] - delta: [batch_size, *shape, n_samples, 1] - + + Return: + + lower: [batch_size, *shape, n_samples, 1] upper: [batch_size, *shape, n_samples, 1] delta: [batch_size, + *shape, n_samples, 1] + where - ts \\in [lower, upper] - deltas = upper - lower + ts \\in [lower, upper] deltas = upper - lower """ mids = (ts[..., 1:, :] + ts[..., :-1, :]) * 0.5 @@ -149,10 +138,10 @@ class BoundingBoxVolume(nn.Module): """ def __init__( - self, - *, - bbox_min, - bbox_max, + self, + *, + bbox_min, + bbox_max, min_dist: float = 0.0, min_t_range: float = 1e-3, ): @@ -189,11 +178,10 @@ def intersect( params: Optional meta parameters in case Volume is parametric epsilon: to stabilize calculations - Return: - A tuple of (t0, t1, intersected) where each has a shape - [batch_size, *shape, 1]. If a ray intersects with the volume, `o + td` is - in the volume for all t in [t0, t1]. If the volume is bounded, t1 is guaranteed - to be on the boundary of the volume. + Return: + A tuple of (t0, t1, intersected) where each has a shape [batch_size, *shape, 1]. If a ray intersects with + the volume, `o + td` is in the volume for all t in [t0, t1]. If the volume is bounded, t1 is guaranteed to + be on the boundary of the volume. 
""" batch_size, *shape, _ = origin.shape @@ -230,8 +218,7 @@ def _safe_divide(a, b, epsilon=1e-6): class StratifiedRaySampler(nn.Module): """ - Instead of fixed intervals, a sample is drawn uniformly at random from each - interval. + Instead of fixed intervals, a sample is drawn uniformly at random from each interval. """ def __init__(self, depth_mode: str = "linear"): @@ -254,7 +241,7 @@ def sample( t0: start time has shape [batch_size, *shape, 1] t1: finish time has shape [batch_size, *shape, 1] n_samples: number of ts to sample - Return: + Return: sampled ts of shape [batch_size, *shape, n_samples, 1] """ ones = [1] * (len(t0.shape) - 1) @@ -273,7 +260,8 @@ def sample( mids = 0.5 * (ts[..., 1:] + ts[..., :-1]) upper = torch.cat([mids, t1], dim=-1) lower = torch.cat([t0, mids], dim=-1) - torch.manual_seed(0) # yiyi notes: add a random seed here + # yiyi notes: add a random seed here for testing, don't forget to remove + torch.manual_seed(0) t_rand = torch.rand_like(ts) ts = lower + (upper - lower) * t_rand @@ -282,12 +270,16 @@ def sample( class ImportanceRaySampler(nn.Module): """ - Given the initial estimate of densities, this samples more from - regions/bins expected to have objects. + Given the initial estimate of densities, this samples more from regions/bins expected to have objects. """ def __init__( - self, volume_range: VolumeRange, ts: torch.Tensor, weights: torch.Tensor, blur_pool: bool = False, alpha: float = 1e-5 + self, + volume_range: VolumeRange, + ts: torch.Tensor, + weights: torch.Tensor, + blur_pool: bool = False, + alpha: float = 1e-5, ): """ Args: @@ -310,7 +302,7 @@ def sample(self, t0: torch.Tensor, t1: torch.Tensor, n_samples: int) -> torch.Te t0: start time has shape [batch_size, *shape, 1] t1: finish time has shape [batch_size, *shape, 1] n_samples: number of ts to sample - Return: + Return: sampled ts of shape [batch_size, *shape, n_samples, 1] """ lower, upper, _ = self.volume_range.partition(self.ts) @@ -339,7 +331,6 @@ def sample(self, t0: torch.Tensor, t1: torch.Tensor, n_samples: int) -> torch.Te @dataclass class MLPNeRFModelOutput(BaseOutput): - density: torch.Tensor signed_distance: torch.Tensor channels: torch.Tensor @@ -355,14 +346,12 @@ def __init__( n_hidden_layers: int = 6, act_fn: str = "swish", insert_direction_at: int = 4, - - - ): + ): super().__init__() # Instantiate the MLP - - # Find out the dimension of encoded position and direction + + # Find out the dimension of encoded position and direction dummy = torch.eye(1, 3) d_posenc_pos = encode_position(position=dummy).shape[-1] d_posenc_dir = encode_direction(position=dummy).shape[-1] @@ -373,48 +362,41 @@ def __init__( if insert_direction_at is not None: input_widths[insert_direction_at] += d_posenc_dir - - self.mlp = nn.ModuleList( - [ - nn.Linear(d_in, d_out) for d_in, d_out in zip(input_widths, output_widths) - ] - ) + + self.mlp = nn.ModuleList([nn.Linear(d_in, d_out) for d_in, d_out in zip(input_widths, output_widths)]) if act_fn == "swish": - #self.activation = swish - # yiyi testing: + # self.activation = swish + # yiyi testing: self.activation = lambda x: F.silu(x) else: raise ValueError(f"Unsupported activation function {act_fn}") - + self.sdf_activation = torch.tanh self.density_activation = torch.nn.functional.relu self.channel_activation = torch.sigmoid - - def map_indices_to_keys(self, output): + def map_indices_to_keys(self, output): h_map = { "sdf": (0, 1), "density_coarse": (1, 2), - "density_fine":(2, 3), + "density_fine": (2, 3), "stf": (3, 6), "nerf_coarse": (6, 9), - 
"nerf_fine" : (9, 12) } + "nerf_fine": (9, 12), + } mapped_output = {k: output[..., start:end] for k, (start, end) in h_map.items()} return mapped_output - - - def forward(self, *, position, direction, ts, nerf_level = "coarse"): + def forward(self, *, position, direction, ts, nerf_level="coarse"): h = encode_position(position) h_preact = h h_directionless = None for i, layer in enumerate(self.mlp): - if i == self.config.insert_direction_at: # 4 in the config - + if i == self.config.insert_direction_at: # 4 in the config h_directionless = h_preact h_direction = encode_direction(position, direction=direction) h = torch.cat([h, h_direction], dim=-1) @@ -433,15 +415,15 @@ def forward(self, *, position, direction, ts, nerf_level = "coarse"): activation = self.map_indices_to_keys(h_final) if nerf_level == "coarse": - h_density = activation['density_coarse'] - h_channels = activation['nerf_coarse'] + h_density = activation["density_coarse"] + h_channels = activation["nerf_coarse"] else: - h_density = activation['density_fine'] - h_channels = activation['nerf_fine'] - - density=self.density_activation(h_density) - signed_distance=self.sdf_activation(activation['sdf']) - channels=self.channel_activation(h_channels) - - # yiyi notes: I think signed_distance is not used - return MLPNeRFModelOutput(density = density, signed_distance= signed_distance, channels=channels, ts=ts) \ No newline at end of file + h_density = activation["density_fine"] + h_channels = activation["nerf_fine"] + + density = self.density_activation(h_density) + signed_distance = self.sdf_activation(activation["sdf"]) + channels = self.channel_activation(h_channels) + + # yiyi notes: I think signed_distance is not used + return MLPNeRFModelOutput(density=density, signed_distance=signed_distance, channels=channels, ts=ts) From 73ce7f77909f2e6a94222271f7f64da310b3fa39 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 20 Jun 2023 22:58:00 +0000 Subject: [PATCH 008/119] fix copies --- .../utils/dummy_torch_and_transformers_objects.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index cc060b5572a3..f09692d5d94d 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -227,7 +227,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class ShapEPipeline(metaclass=DummyObject): +class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] def __init__(self, *args, **kwargs): @@ -242,7 +242,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class LDMTextToImagePipeline(metaclass=DummyObject): +class PaintByExamplePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] def __init__(self, *args, **kwargs): @@ -257,7 +257,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class PaintByExamplePipeline(metaclass=DummyObject): +class SemanticStableDiffusionPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] def __init__(self, *args, **kwargs): @@ -272,7 +272,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class SemanticStableDiffusionPipeline(metaclass=DummyObject): +class ShapEPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] def 
__init__(self, *args, **kwargs): From 252e7a830d8f44384ee61d45df602294fdd8c6d3 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 22 Jun 2023 10:15:37 -1000 Subject: [PATCH 009/119] Update src/diffusers/schedulers/scheduling_heun_discrete.py Co-authored-by: Patrick von Platen --- src/diffusers/schedulers/scheduling_heun_discrete.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 93465b2d639c..f1c5af7e08d2 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -281,7 +281,6 @@ def step( timestep: Union[float, torch.FloatTensor], sample: Union[torch.FloatTensor, np.ndarray], return_dict: bool = True, - step_index: Optional[int] = None, ) -> Union[SchedulerOutput, Tuple]: """ Args: From 48adaa6b579c8307825e324fa05beca2a41d3177 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 22 Jun 2023 10:15:46 -1000 Subject: [PATCH 010/119] Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 5a7bacac5ebf..6344ca985cc1 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -439,7 +439,6 @@ def __call__( noise_pred, timestep=t, sample=latents, - step_index=i, ).prev_sample # project the the paramters from the generated latents From ba0db21a32c945ba8c4c30d8eb00f2928112d8ee Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 22 Jun 2023 10:16:02 -1000 Subject: [PATCH 011/119] Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 6344ca985cc1..1908b7f05b68 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -412,7 +412,7 @@ def __call__( # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) - for i, t in enumerate(self.progress_bar(timesteps)): + for t in self.progress_bar(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) From dfb6a2ec6fcda0f01ced2c616e1d7ef57129956b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 22 Jun 2023 20:18:04 +0000 Subject: [PATCH 012/119] alpha_transform_type --- .../pipelines/shap_e/pipeline_shap_e.py | 6 +++++- .../schedulers/scheduling_heun_discrete.py | 18 +++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 5a7bacac5ebf..40c2e31ee635 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -337,7 +337,7 @@ def __call__( ray_batch_size: int = 4096, n_coarse_samples=64, n_fine_samples=128, - output_type: 
Optional[str] = "pt", # pt only + output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool = True, ): """ @@ -441,6 +441,10 @@ def __call__( sample=latents, step_index=i, ).prev_sample + + # YiYi testing only: I don't think we need to return latent for this pipeline + if output_type == 'latent': + return ShapEPipelineOutput(images=latents) # project the the paramters from the generated latents projected_params = self.params_proj(latents) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 93465b2d639c..4b9fd44446a3 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -25,7 +25,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_bar_fn=None, + alpha_transform_type="cosine", # cosine, exp ) -> torch.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -43,12 +43,12 @@ def betas_for_alpha_bar( Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - if alpha_bar_fn is None: - alpha_bar_fn = alpha_bar + if alpha_transform_type == "cosine": + alpha_bar_fn = lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + elif alpha_transform_type == 'exp': + alpha_bar_fn = lambda t: math.exp(t * -12.0) + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_tranform_type}") betas = [] for i in range(num_diffusion_timesteps): @@ -111,9 +111,9 @@ def __init__( ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type='cosine') elif beta_schedule == "exp": - self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_bar_fn=lambda t: math.exp(t * -12.0)) + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type='exp') else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") From 55e59d9527111590d3e7d7ad0b09b45f0cdac3a8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 22 Jun 2023 20:21:42 +0000 Subject: [PATCH 013/119] remove step_index argument --- src/diffusers/schedulers/scheduling_heun_discrete.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 8f04d186b9a5..f069c488f783 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -295,8 +295,7 @@ def step( [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
""" - if step_index is None: - step_index = self.index_for_timestep(timestep) + step_index = self.index_for_timestep(timestep) if self.state_in_first_order: sigma = self.sigmas[step_index] From 68ef317a015eca962650b8deef6a5318b5313917 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 23 Jun 2023 02:48:00 +0000 Subject: [PATCH 014/119] remove get_sigmas_karras --- .../pipelines/shap_e/pipeline_shap_e.py | 3 +- .../schedulers/scheduling_heun_discrete.py | 84 ++++++++++--------- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index c952b539d2d3..40c2e31ee635 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -412,7 +412,7 @@ def __call__( # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) - for t in self.progress_bar(timesteps): + for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) @@ -439,6 +439,7 @@ def __call__( noise_pred, timestep=t, sample=latents, + step_index=i, ).prev_sample # YiYi testing only: I don't think we need to return latent for this pipeline diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index f069c488f783..e6d0c68f3081 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -177,8 +177,16 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps + if use_karras_sigmas is None: + use_karras_sigmas = self.use_karras_sigmas + if sigma_min is not None and sigma_max is not None: - sigmas = torch.tensor([sigma_max, sigma_min]) + + if use_karras_sigmas is not None: + sigmas = torch.tensor([sigma_max, sigma_min]) + log_sigmas = None + else: + raise ValueError(f"`sigma_min` and `sigma_max` arguments are only supported when `use_karras_sigma` is not None") else: timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() @@ -187,15 +195,9 @@ def set_timesteps( log_sigmas = np.log(sigmas) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - if use_karras_sigmas is None: - use_karras_sigmas = self.use_karras_sigmas - if use_karras_sigmas: sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - if self.config.beta_schedule == "exp": - timesteps = np.array([self._sigma_to_t_yiyi(sigma) for sigma in sigmas]) - else: - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = torch.from_numpy(sigmas).to(device=device) @@ -217,45 +219,47 @@ def set_timesteps( self.prev_derivative = None self.dt = None - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): - # get log sigma - log_sigma = np.log(sigma) + + # perform interpolation on sigmas if log_sigmas is not None + if log_sigmas is not 
None: + # get log sigma + log_sigma = np.log(sigma) - # get distribution - dists = log_sigma - log_sigmas[:, np.newaxis] + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] - # get sigmas range - low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) - high_idx = low_idx + 1 + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 - low = log_sigmas[low_idx] - high = log_sigmas[high_idx] + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] - # interpolate sigmas - w = (low - log_sigma) / (low - high) - w = np.clip(w, 0, 1) + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) - # transform interpolation to time range - t = (1 - w) * low_idx + w * high_idx - t = t.reshape(sigma.shape) - return t + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) - # YiYi Notes: Taking from the origional repo, will refactor and not introduce dependency on spicy - def _sigma_to_t_yiyi(self, sigma): - alpha_cumprod = 1.0 / (sigma**2 + 1) - - if alpha_cumprod > self.alphas_cumprod[0]: - return 0 - elif alpha_cumprod <= self.alphas_cumprod[-1]: - return len(self.alphas_cumprod) - 1 else: - from scipy import interpolate + # perform interpolation on alphas_cumprod + + alpha_cumprod = 1.0 / (sigma**2 + 1) + if alpha_cumprod > self.alphas_cumprod[0]: + t = 0 + + elif alpha_cumprod <= self.alphas_cumprod[-1]: + t = len(self.alphas_cumprod) - 1 + + else: + t = np.interp(alpha_cumprod, self.alphas_cumprod.numpy()[::-1].copy(), np.arange(0, len(self.alphas_cumprod))[::-1]) + t = int(t) + + return t - timestep = interpolate.interp1d(self.alphas_cumprod, np.arange(0, len(self.alphas_cumprod)))( - alpha_cumprod - ) # yiyi testing, origin implementation - return int(timestep) # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: @@ -280,6 +284,7 @@ def step( model_output: Union[torch.FloatTensor, np.ndarray], timestep: Union[float, torch.FloatTensor], sample: Union[torch.FloatTensor, np.ndarray], + step_index: Optional[int] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -295,7 +300,8 @@ def step( [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
""" - step_index = self.index_for_timestep(timestep) + if step_index is None: + step_index = self.index_for_timestep(timestep) if self.state_in_first_order: sigma = self.sigmas[step_index] From 6ec68eec402586e55b956d18491dbde32c0bbcba Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 23 Jun 2023 02:54:41 +0000 Subject: [PATCH 015/119] remove _yiyi_sigma_to_t --- .../pipelines/shap_e/pipeline_shap_e.py | 4 +-- .../schedulers/scheduling_heun_discrete.py | 33 ++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 40c2e31ee635..23afa0b3b4dc 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -441,9 +441,9 @@ def __call__( sample=latents, step_index=i, ).prev_sample - + # YiYi testing only: I don't think we need to return latent for this pipeline - if output_type == 'latent': + if output_type == "latent": return ShapEPipelineOutput(images=latents) # project the the paramters from the generated latents diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index e6d0c68f3081..d7cbc5d2d382 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -25,7 +25,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", # cosine, exp ) -> torch.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -44,11 +44,17 @@ def betas_for_alpha_bar( betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ if alpha_transform_type == "cosine": - alpha_bar_fn = lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 - elif alpha_transform_type == 'exp': - alpha_bar_fn = lambda t: math.exp(t * -12.0) + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + else: - raise ValueError(f"Unsupported alpha_tranform_type: {alpha_tranform_type}") + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): @@ -111,9 +117,9 @@ def __init__( ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type='cosine') + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cosine") elif beta_schedule == "exp": - self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type='exp') + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="exp") else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") @@ -181,12 +187,13 @@ def set_timesteps( use_karras_sigmas = self.use_karras_sigmas if sigma_min is not None and sigma_max is not None: - if use_karras_sigmas is not None: sigmas = torch.tensor([sigma_max, sigma_min]) log_sigmas = None else: - raise ValueError(f"`sigma_min` and `sigma_max` arguments are only supported when `use_karras_sigma` is not None") + raise ValueError( + "`sigma_min` and `sigma_max` arguments are only supported when `use_karras_sigma` is not None" + ) else: timesteps = np.linspace(0, num_train_timesteps - 1, 
num_inference_steps, dtype=float)[::-1].copy() @@ -220,7 +227,6 @@ def set_timesteps( self.dt = None def _sigma_to_t(self, sigma, log_sigmas): - # perform interpolation on sigmas if log_sigmas is not None if log_sigmas is not None: # get log sigma @@ -255,12 +261,15 @@ def _sigma_to_t(self, sigma, log_sigmas): t = len(self.alphas_cumprod) - 1 else: - t = np.interp(alpha_cumprod, self.alphas_cumprod.numpy()[::-1].copy(), np.arange(0, len(self.alphas_cumprod))[::-1]) + t = np.interp( + alpha_cumprod, + self.alphas_cumprod.numpy()[::-1].copy(), + np.arange(0, len(self.alphas_cumprod))[::-1], + ) t = int(t) return t - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: """Constructs the noise schedule of Karras et al. (2022).""" From 5b5a8e6be918fefd114a2945ed89d8e8fa8be21b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 23 Jun 2023 03:11:55 +0000 Subject: [PATCH 016/119] move the rescale prompt_embeds from prior_transformer to pipeline --- src/diffusers/models/prior_transformer.py | 7 +------ src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 4 ++++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 6c535a6cfc6c..4eda1721adbc 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -1,4 +1,3 @@ -import math from dataclasses import dataclass from typing import Dict, Optional, Union @@ -249,11 +248,7 @@ def forward( # but time_embedding might be fp16, so we need to cast here. timesteps_projected = timesteps_projected.to(dtype=self.dtype) time_embeddings = self.time_embedding(timesteps_projected) - - # Rescale the features to have unit variance - # YiYi TO-DO: It was normalized before during encode_prompt step, move this step to pipeline - if self.clip_mean is None: - proj_embedding = math.sqrt(proj_embedding.shape[1]) * proj_embedding + proj_embeddings = self.embedding_proj(proj_embedding) if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None: encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 23afa0b3b4dc..ddbf1a71a93d 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
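Note on the `_sigma_to_t` branch added above: when the schedule is built directly from `sigma_min`/`sigma_max` there is no log-sigma table to interpolate, so the timestep is recovered from `alphas_cumprod` instead, using sigma² = (1 − ᾱ)/ᾱ, i.e. ᾱ = 1/(σ² + 1). A standalone sketch of that lookup (the function name and the toy schedule below are illustrative, not the scheduler's API):

```python
import numpy as np

def sigma_to_t(sigma: float, alphas_cumprod: np.ndarray) -> int:
    """Map a sigma back to a discrete training timestep by interpolating alphas_cumprod."""
    alpha_bar = 1.0 / (sigma**2 + 1)
    if alpha_bar > alphas_cumprod[0]:       # sigma below the smallest trained sigma
        return 0
    if alpha_bar <= alphas_cumprod[-1]:     # sigma above the largest trained sigma
        return len(alphas_cumprod) - 1
    # np.interp expects increasing x, so reverse the (decreasing) alphas_cumprod
    t = np.interp(alpha_bar, alphas_cumprod[::-1], np.arange(len(alphas_cumprod))[::-1])
    return int(t)

# toy cosine-like schedule just to exercise the function
alphas_cumprod = np.cos(np.linspace(0, 1, 1000) * np.pi / 2) ** 2
print(sigma_to_t(sigma=1.0, alphas_cumprod=alphas_cumprod))  # roughly the midpoint of the schedule
```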
+import math from dataclasses import dataclass from typing import List, Optional, Union @@ -242,6 +243,9 @@ def _encode_prompt( # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # Rescale the features to have unit variance (this step is taken from the original repo) + prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds return prompt_embeds From 3f6b435d65dad3e5514cad2f5dd9e4419ca78e0b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 03:09:06 +0000 Subject: [PATCH 017/119] replace baddbmm with einsum to match origial repo --- src/diffusers/models/attention.py | 12 +++++ src/diffusers/models/attention_processor.py | 57 ++++++++++++++++----- 2 files changed, 56 insertions(+), 13 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 8805257ebe9a..1648a8205ee2 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -139,6 +139,12 @@ def forward( ) else: norm_hidden_states = self.norm1(hidden_states) + #print(" ") + #print(" -----") + #print(" inside transformer block") + #print(f" input h: {hidden_states.shape}, {hidden_states.abs().sum()}") + #print(f" -> ln_1 :{norm_hidden_states.shape}, {norm_hidden_states.abs().sum()}") + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} attn_output = self.attn1( @@ -147,9 +153,12 @@ def forward( attention_mask=attention_mask, **cross_attention_kwargs, ) + #print(f" -> attn: {attn_output.shape}, {attn_output.abs().sum()}") + if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output hidden_states = attn_output + hidden_states + #print(f" -> attn+ x: {hidden_states.shape}, {hidden_states.abs().sum()}") # 2. Cross-Attention if self.attn2 is not None: @@ -167,16 +176,19 @@ def forward( # 3. Feed-forward norm_hidden_states = self.norm3(hidden_states) + #print(f" -> ln_2: {norm_hidden_states.shape}, {norm_hidden_states.abs().sum()}") if self.use_ada_layer_norm_zero: norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] ff_output = self.ff(norm_hidden_states) + #print(f" -> mlp: {ff_output.shape}, {ff_output.abs().sum()}") if self.use_ada_layer_norm_zero: ff_output = gate_mlp.unsqueeze(1) * ff_output hidden_states = ff_output + hidden_states + #print(f" -> x + mlp: {hidden_states.shape},{hidden_states.abs().sum()}") return hidden_states diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 0bc7886c2653..65723ebe0a69 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
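Note on the rescale moved into the pipeline in patch 016 above: assuming the prompt embeddings were L2-normalized earlier in `_encode_prompt` (as the removed TO-DO comment in the prior suggests), multiplying by √d with d the embedding width gives features with roughly unit variance per component before they are fed to the prior. A small self-contained check — the tensor below is a placeholder, not real CLIP output:

```python
import math
import torch

# stand-in for an L2-normalized batch of CLIP text embeddings, shape (batch, dim)
prompt_embeds = torch.nn.functional.normalize(torch.randn(2, 768), dim=-1)

# rescale so each feature has roughly unit variance
prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds

print(prompt_embeds.norm(dim=-1))  # ~sqrt(768) ≈ 27.7 per row
print(prompt_embeds.var(dim=-1))   # ~1.0 per row
```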
from typing import Callable, Optional, Union - +import math import torch import torch.nn.functional as F from torch import nn @@ -345,6 +345,7 @@ def head_to_batch_dim(self, tensor, out_dim=3): return tensor def get_attention_scores(self, query, key, attention_mask=None): + #print(f" inside get_attention_scores") dtype = query.dtype if self.upcast_attention: query = query.float() @@ -358,17 +359,32 @@ def get_attention_scores(self, query, key, attention_mask=None): else: baddbmm_input = attention_mask beta = 1 - - attention_scores = torch.baddbmm( - baddbmm_input, - query, - key.transpose(-1, -2), - beta=beta, - alpha=self.scale, - ) + #print(f" self.scale: {self.scale}") + scale_sqrt = math.sqrt(self.scale) + #print(f" scale_sqrt: {scale_sqrt}") + #print(f" k input: {key.transpose(-1, -2).shape}") +# attention_scores = torch.baddbmm( +# baddbmm_input, +# #query * scale_sqrt, # yiyi testing +# query, # [32, 1026, 64] +# #key.transpose(-1, -2) * scale_sqrt, # yiyi testing +# key.transpose(-1, -2), # [32, 64, 1026] +# beta=beta, +# # alpha=self.scale, # yiyi testing: scale q and k before the matrix multiplication (vs afterword with alpha) +# # alpha=self.scale, # yiyi testing: comment back +# ) + # yiyi testing + attention_scores = torch.einsum( + "btc,bsc->bts", query, key + ) # More stable with f16 than dividing afterwards + #print(f" q@v: {attention_scores.shape},{attention_scores.abs().sum()}") + attention_scores = attention_scores * self.scale # yiyi testing: comment out + #print(f" scaled q@v: {attention_scores.shape},{attention_scores.abs().sum()}") del baddbmm_input - + # yiyi testing: + #attention_scores = attention_scores.float() # yiyi added for testing if self.upcast_softmax: + # print("upscale_softmax") attention_scores = attention_scores.float() attention_probs = attention_scores.softmax(dim=-1) @@ -451,14 +467,17 @@ def __call__( encoder_hidden_states=None, attention_mask=None, temb=None, - ): + ): + print(" ") + print(" ********") + print(f" inside AttnProcessor") residual = hidden_states if attn.spatial_norm is not None: hidden_states = attn.spatial_norm(hidden_states, temb) input_ndim = hidden_states.ndim - + print(f" input_ndim: {input_ndim}") if input_ndim == 4: batch_size, channel, height, width = hidden_states.shape hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) @@ -467,25 +486,37 @@ def __call__( hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - + if attn.group_norm is not None: + print(f" group_norm") hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) query = attn.to_q(hidden_states) if encoder_hidden_states is None: + print(f" encoder_hidden_states is None") encoder_hidden_states = hidden_states elif attn.norm_cross: + print(" norm_cross") encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) + print(f" -q: {query.shape},{query.abs().sum()}") + print(f" -k: {key.shape},{key.abs().sum()}") + print(f" -v: {value.shape},{value.abs().sum()}") + query = attn.head_to_batch_dim(query) key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) + print(f" -> head_to_batch_dim:") + print(f" -q: {query.shape},{query.abs().sum()}") + print(f" -k: {key.shape},{key.abs().sum()}") + print(f" -v: {value.shape},{value.abs().sum()}") attention_probs = 
attn.get_attention_scores(query, key, attention_mask) + print(f" attention_probs: {attention_probs.shape}, {attention_probs.abs().sum()}") hidden_states = torch.bmm(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) From dd1991b3f67dda4b734fed337b4b26ca6d87e46f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 03:09:42 +0000 Subject: [PATCH 018/119] Revert "replace baddbmm with einsum to match origial repo" This reverts commit 3f6b435d65dad3e5514cad2f5dd9e4419ca78e0b. --- src/diffusers/models/attention.py | 12 ----- src/diffusers/models/attention_processor.py | 57 +++++---------------- 2 files changed, 13 insertions(+), 56 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 1648a8205ee2..8805257ebe9a 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -139,12 +139,6 @@ def forward( ) else: norm_hidden_states = self.norm1(hidden_states) - #print(" ") - #print(" -----") - #print(" inside transformer block") - #print(f" input h: {hidden_states.shape}, {hidden_states.abs().sum()}") - #print(f" -> ln_1 :{norm_hidden_states.shape}, {norm_hidden_states.abs().sum()}") - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} attn_output = self.attn1( @@ -153,12 +147,9 @@ def forward( attention_mask=attention_mask, **cross_attention_kwargs, ) - #print(f" -> attn: {attn_output.shape}, {attn_output.abs().sum()}") - if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output hidden_states = attn_output + hidden_states - #print(f" -> attn+ x: {hidden_states.shape}, {hidden_states.abs().sum()}") # 2. Cross-Attention if self.attn2 is not None: @@ -176,19 +167,16 @@ def forward( # 3. Feed-forward norm_hidden_states = self.norm3(hidden_states) - #print(f" -> ln_2: {norm_hidden_states.shape}, {norm_hidden_states.abs().sum()}") if self.use_ada_layer_norm_zero: norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] ff_output = self.ff(norm_hidden_states) - #print(f" -> mlp: {ff_output.shape}, {ff_output.abs().sum()}") if self.use_ada_layer_norm_zero: ff_output = gate_mlp.unsqueeze(1) * ff_output hidden_states = ff_output + hidden_states - #print(f" -> x + mlp: {hidden_states.shape},{hidden_states.abs().sum()}") return hidden_states diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 65723ebe0a69..0bc7886c2653 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
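Note on the `baddbmm` → `einsum` experiment in patch 017 and its revert in patch 018 above: with `beta=0` the two formulations compute the same scaled attention scores, which is presumably why the change could be reverted without affecting outputs. A quick, self-contained equivalence check (shapes and scale are arbitrary):

```python
import torch

b, t, s, c = 2, 5, 7, 8
scale = c ** -0.5
q, k = torch.randn(b, t, c), torch.randn(b, s, c)

scores_baddbmm = torch.baddbmm(
    torch.empty(b, t, s), q, k.transpose(-1, -2), beta=0, alpha=scale  # beta=0 ignores the empty input
)
scores_einsum = torch.einsum("btc,bsc->bts", q, k) * scale

print(torch.allclose(scores_baddbmm, scores_einsum, atol=1e-6))  # True
```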
from typing import Callable, Optional, Union -import math + import torch import torch.nn.functional as F from torch import nn @@ -345,7 +345,6 @@ def head_to_batch_dim(self, tensor, out_dim=3): return tensor def get_attention_scores(self, query, key, attention_mask=None): - #print(f" inside get_attention_scores") dtype = query.dtype if self.upcast_attention: query = query.float() @@ -359,32 +358,17 @@ def get_attention_scores(self, query, key, attention_mask=None): else: baddbmm_input = attention_mask beta = 1 - #print(f" self.scale: {self.scale}") - scale_sqrt = math.sqrt(self.scale) - #print(f" scale_sqrt: {scale_sqrt}") - #print(f" k input: {key.transpose(-1, -2).shape}") -# attention_scores = torch.baddbmm( -# baddbmm_input, -# #query * scale_sqrt, # yiyi testing -# query, # [32, 1026, 64] -# #key.transpose(-1, -2) * scale_sqrt, # yiyi testing -# key.transpose(-1, -2), # [32, 64, 1026] -# beta=beta, -# # alpha=self.scale, # yiyi testing: scale q and k before the matrix multiplication (vs afterword with alpha) -# # alpha=self.scale, # yiyi testing: comment back -# ) - # yiyi testing - attention_scores = torch.einsum( - "btc,bsc->bts", query, key - ) # More stable with f16 than dividing afterwards - #print(f" q@v: {attention_scores.shape},{attention_scores.abs().sum()}") - attention_scores = attention_scores * self.scale # yiyi testing: comment out - #print(f" scaled q@v: {attention_scores.shape},{attention_scores.abs().sum()}") + + attention_scores = torch.baddbmm( + baddbmm_input, + query, + key.transpose(-1, -2), + beta=beta, + alpha=self.scale, + ) del baddbmm_input - # yiyi testing: - #attention_scores = attention_scores.float() # yiyi added for testing + if self.upcast_softmax: - # print("upscale_softmax") attention_scores = attention_scores.float() attention_probs = attention_scores.softmax(dim=-1) @@ -467,17 +451,14 @@ def __call__( encoder_hidden_states=None, attention_mask=None, temb=None, - ): - print(" ") - print(" ********") - print(f" inside AttnProcessor") + ): residual = hidden_states if attn.spatial_norm is not None: hidden_states = attn.spatial_norm(hidden_states, temb) input_ndim = hidden_states.ndim - print(f" input_ndim: {input_ndim}") + if input_ndim == 4: batch_size, channel, height, width = hidden_states.shape hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) @@ -486,37 +467,25 @@ def __call__( hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - + if attn.group_norm is not None: - print(f" group_norm") hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) query = attn.to_q(hidden_states) if encoder_hidden_states is None: - print(f" encoder_hidden_states is None") encoder_hidden_states = hidden_states elif attn.norm_cross: - print(" norm_cross") encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) - print(f" -q: {query.shape},{query.abs().sum()}") - print(f" -k: {key.shape},{key.abs().sum()}") - print(f" -v: {value.shape},{value.abs().sum()}") - query = attn.head_to_batch_dim(query) key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) - print(f" -> head_to_batch_dim:") - print(f" -q: {query.shape},{query.abs().sum()}") - print(f" -k: {key.shape},{key.abs().sum()}") - print(f" -v: {value.shape},{value.abs().sum()}") attention_probs = 
attn.get_attention_scores(query, key, attention_mask) - print(f" attention_probs: {attention_probs.shape}, {attention_probs.abs().sum()}") hidden_states = torch.bmm(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) From 3069e8e2d016a60e2030d79106f033a18cdffbd5 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 03:37:09 +0000 Subject: [PATCH 019/119] add step_index to scale_model_input --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 2 +- src/diffusers/schedulers/scheduling_heun_discrete.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index ddbf1a71a93d..8f9902cbf748 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -419,7 +419,7 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) noise_pred = self.prior( scaled_model_input, diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index d7cbc5d2d382..fee6e3a36417 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -146,6 +146,7 @@ def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor], + step_index: Optional[int]=None, ) -> torch.FloatTensor: """ Args: @@ -155,7 +156,8 @@ def scale_model_input( Returns: `torch.FloatTensor`: scaled input sample """ - step_index = self.index_for_timestep(timestep) + if step_index is not None: + step_index = self.index_for_timestep(timestep) sigma = self.sigmas[step_index] sample = sample / ((sigma**2 + 1) ** 0.5) From 6bd46da964def53cdea4595fec5fe1a4a529d2db Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 03:37:39 +0000 Subject: [PATCH 020/119] Revert "move the rescale prompt_embeds from prior_transformer to pipeline" This reverts commit 5b5a8e6be918fefd114a2945ed89d8e8fa8be21b. --- src/diffusers/models/prior_transformer.py | 7 ++++++- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 4 ---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 4eda1721adbc..6c535a6cfc6c 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -1,3 +1,4 @@ +import math from dataclasses import dataclass from typing import Dict, Optional, Union @@ -248,7 +249,11 @@ def forward( # but time_embedding might be fp16, so we need to cast here. 
timesteps_projected = timesteps_projected.to(dtype=self.dtype) time_embeddings = self.time_embedding(timesteps_projected) - + + # Rescale the features to have unit variance + # YiYi TO-DO: It was normalized before during encode_prompt step, move this step to pipeline + if self.clip_mean is None: + proj_embedding = math.sqrt(proj_embedding.shape[1]) * proj_embedding proj_embeddings = self.embedding_proj(proj_embedding) if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None: encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 8f9902cbf748..10bf02939da4 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math from dataclasses import dataclass from typing import List, Optional, Union @@ -243,9 +242,6 @@ def _encode_prompt( # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - # Rescale the features to have unit variance (this step is taken from the original repo) - prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds return prompt_embeds From 8f9b2320c5cb6513b401f8cf1f3fe1850d7a7295 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 04:21:08 +0000 Subject: [PATCH 021/119] move rescale from prior_transformer to pipeline --- src/diffusers/models/prior_transformer.py | 4 ---- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 6c535a6cfc6c..0eafa2ba9503 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -250,10 +250,6 @@ def forward( timesteps_projected = timesteps_projected.to(dtype=self.dtype) time_embeddings = self.time_embedding(timesteps_projected) - # Rescale the features to have unit variance - # YiYi TO-DO: It was normalized before during encode_prompt step, move this step to pipeline - if self.clip_mean is None: - proj_embedding = math.sqrt(proj_embedding.shape[1]) * proj_embedding proj_embeddings = self.embedding_proj(proj_embedding) if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None: encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 10bf02939da4..ef6d3193c63b 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
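Note on the `step_index` argument threaded through `scale_model_input` and `step` in the patches above: a second-order scheduler such as Heun duplicates its interior timesteps for the corrector pass, so looking a step up purely by timestep value can be ambiguous, which is presumably why the pipeline passes the loop index explicitly. A toy illustration — the schedule below is made up, not the scheduler's real timesteps:

```python
import torch

# toy Heun-style schedule where interior timesteps are duplicated for the second-order pass
timesteps = torch.tensor([999.0, 500.0, 500.0, 250.0, 250.0, 0.0])

t = 500.0
matches = (timesteps == t).nonzero().flatten()
print(matches.tolist())  # [1, 2] -> value lookup alone cannot tell the two passes apart

# enumerating the schedule gives an unambiguous index alongside each timestep
for step_index, t in enumerate(timesteps):
    pass  # e.g. scheduler.scale_model_input(sample, t, step_index=step_index)
```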
+import math from dataclasses import dataclass from typing import List, Optional, Union @@ -242,6 +243,8 @@ def _encode_prompt( # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + print(f" inside pipeline._encode_prompt") + print(f" prompt_embeds: {prompt_embeds.shape}, {prompt_embeds.abs().sum()}, {prompt_embeds.abs().sum(-1)}") return prompt_embeds @@ -389,6 +392,8 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) + # Rescale the features to have unit variance + prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds # prior From 7eae66b9b9fb74ec6a6630f2469ccd2319a5affd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 06:33:17 +0000 Subject: [PATCH 022/119] correct step_index in scale_model_input --- src/diffusers/schedulers/scheduling_heun_discrete.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index fee6e3a36417..8ae732a653cc 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -156,7 +156,7 @@ def scale_model_input( Returns: `torch.FloatTensor`: scaled input sample """ - if step_index is not None: + if step_index is None: step_index = self.index_for_timestep(timestep) sigma = self.sigmas[step_index] From ea99154f4b4441451e984918a3e5d1ac84d61dce Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 09:11:55 +0000 Subject: [PATCH 023/119] remove print lines --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index ef6d3193c63b..5b05ac82080f 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -243,8 +243,6 @@ def _encode_prompt( # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - print(f" inside pipeline._encode_prompt") - print(f" prompt_embeds: {prompt_embeds.shape}, {prompt_embeds.abs().sum()}, {prompt_embeds.abs().sum(-1)}") return prompt_embeds From 49eb879e3833042bd828aacaf60675f70680c711 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 09:12:46 +0000 Subject: [PATCH 024/119] refactor prior - reduce arguments --- src/diffusers/models/prior_transformer.py | 26 ++++++++++------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 0eafa2ba9503..42e74fe95256 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -60,15 +60,11 @@ def __init__( num_embeddings=77, additional_embeddings=4, dropout: float = 0.0, - act_fn: str = "silu", - has_pre_norm: bool = False, - has_encoder_hidden_states_proj: bool = True, - has_prd_embedding: bool = True, - has_post_process: bool = True, + time_embed_act_fn: str = "silu", time_embed_dim: Optional[int] = None, clip_embedding_dim: Optional[int] = None, out_dim: Optional[int] = None, - ): + ): super().__init__() self.num_attention_heads = 
num_attention_heads self.attention_head_dim = attention_head_dim @@ -85,20 +81,20 @@ def __init__( out_dim = embedding_dim self.time_proj = Timesteps(inner_dim, True, 0) - self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=act_fn) + self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn) self.proj_in = nn.Linear(embedding_dim, inner_dim) self.embedding_proj = nn.Linear(clip_embedding_dim, inner_dim) - if has_encoder_hidden_states_proj: - self.encoder_hidden_states_proj = nn.Linear(clip_embedding_dim, inner_dim) + if self.config.clip_embedding_dim is None: + self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) else: self.encoder_hidden_states_proj = None self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim)) - if has_prd_embedding: + if self.config.out_dim is None: self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) else: self.prd_embedding = None @@ -117,13 +113,13 @@ def __init__( ] ) - if has_pre_norm: + if self.config.out_dim is not None: self.norm_in = nn.LayerNorm(inner_dim) else: self.norm_in = None self.norm_out = nn.LayerNorm(inner_dim) - + self.proj_to_clip_embeddings = nn.Linear(inner_dim, out_dim) causal_attention_mask = torch.full( @@ -132,9 +128,9 @@ def __init__( causal_attention_mask.triu_(1) causal_attention_mask = causal_attention_mask[None, ...] self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False) - if has_post_process: - self.clip_mean = nn.Parameter(torch.zeros(1, clip_embedding_dim)) - self.clip_std = nn.Parameter(torch.zeros(1, clip_embedding_dim)) + if self.config.out_dim is None: + self.clip_mean = nn.Parameter(torch.zeros(1, out_dim)) + self.clip_std = nn.Parameter(torch.zeros(1, out_dim)) else: self.clip_mean = None self.clip_std = None From 90cc68cefa7d5053202c9b9d82f25140ade6cc0d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 26 Jun 2023 09:13:42 +0000 Subject: [PATCH 025/119] make style --- src/diffusers/models/prior_transformer.py | 5 ++--- src/diffusers/schedulers/scheduling_heun_discrete.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 42e74fe95256..ee116465cfe5 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -1,4 +1,3 @@ -import math from dataclasses import dataclass from typing import Dict, Optional, Union @@ -64,7 +63,7 @@ def __init__( time_embed_dim: Optional[int] = None, clip_embedding_dim: Optional[int] = None, out_dim: Optional[int] = None, - ): + ): super().__init__() self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim @@ -119,7 +118,7 @@ def __init__( self.norm_in = None self.norm_out = nn.LayerNorm(inner_dim) - + self.proj_to_clip_embeddings = nn.Linear(inner_dim, out_dim) causal_attention_mask = torch.full( diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 8ae732a653cc..3f37766ed2ee 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -146,7 +146,7 @@ def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor], - step_index: Optional[int]=None, + step_index: Optional[int] = None, ) -> torch.FloatTensor: """ Args: From 
0aac1185606ed21fa558797a9ba7fd1c4b930ab9 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 27 Jun 2023 00:58:57 +0000 Subject: [PATCH 026/119] add prior_image --- scripts/convert_shap_e_to_diffusers.py | 200 ++++++++++++++++++++-- src/diffusers/models/prior_transformer.py | 6 + 2 files changed, 193 insertions(+), 13 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 217acce43dcd..b98bbdc03596 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -20,9 +20,10 @@ ```sh $ python scripts/convert_shap_e_to_diffusers.py \ --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \ + --prior_image_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/image_cond.pt \ --transmitter_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\ - --dump_path /home/yiyi_huggingface_co/model_repo/shape/renderer \ - --debug renderer + --dump_path /home/yiyi_huggingface_co/model_repo/shap-e/prior_image \ + --debug prior_image ``` """ @@ -39,14 +40,10 @@ "embedding_dim": 1024, "num_embeddings": 1024, "additional_embeddings": 0, - "act_fn": "gelu", + "time_embed_act_fn": "gelu", "time_embed_dim": 1024 * 4, "clip_embedding_dim": 768, "out_dim": 1024 * 2, - "has_pre_norm": True, - "has_encoder_hidden_states_proj": False, - "has_prd_embedding": False, - "has_post_process": False, } @@ -75,7 +72,7 @@ def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): } ) - # .clip_img_proj -> .proj_in + # .input_proj -> .proj_in diffusers_checkpoint.update( { "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.input_proj.weight"], @@ -83,7 +80,7 @@ def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): } ) - # .text_emb_proj -> .embedding_proj + # .clip_emb -> .embedding_proj diffusers_checkpoint.update( { "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_embed.weight"], @@ -91,7 +88,7 @@ def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): } ) - # .positional_embedding -> .positional_embedding + # .pos_emb -> .positional_embedding diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.pos_emb"][None, :]}) # .ln_pre -> .norm_in @@ -102,7 +99,7 @@ def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): } ) - # .resblocks. -> .transformer_blocks. + # .backbone.resblocks. -> .transformer_blocks. 
for idx in range(len(model.transformer_blocks)): diffusers_transformer_prefix = f"transformer_blocks.{idx}" original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.backbone.resblocks.{idx}" @@ -148,7 +145,7 @@ def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): } ) - # .final_ln -> .norm_out + # .ln_post -> .norm_out diffusers_checkpoint.update( { "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_post.weight"], @@ -156,7 +153,7 @@ def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): } ) - # .out_proj -> .proj_to_clip_embeddings + # .output_proj -> .proj_to_clip_embeddings diffusers_checkpoint.update( { "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.output_proj.weight"], @@ -218,6 +215,154 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix # done prior +# prior_image (only slightly different from prior) + + +PRIOR_IMAGE_ORIGINAL_PREFIX = "wrapped" + +# Uses default arguments +PRIOR_IMAGE_CONFIG = { + "num_attention_heads": 16, + "attention_head_dim": 1024 // 16, + "num_layers": 24, + "embedding_dim": 1024, + "num_embeddings": 1024, + "additional_embeddings": 0, + "time_embed_act_fn": "gelu", + "embedding_proj_norm": True, + "time_embed_dim": 1024 * 4, + "clip_embedding_dim": 1024, + "out_dim": 1024 * 2, +} + +def prior_image_model_from_original_config(): + model = PriorTransformer(**PRIOR_IMAGE_CONFIG) + + return model + + +def prior_image_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # .time_embed.c_fc -> .time_embedding.linear_1 + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_fc.weight"], + "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_fc.bias"], + } + ) + + # .time_embed.c_proj -> .time_embedding.linear_2 + diffusers_checkpoint.update( + { + "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_proj.weight"], + "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_proj.bias"], + } + ) + + # .input_proj -> .proj_in + diffusers_checkpoint.update( + { + "proj_in.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.input_proj.weight"], + "proj_in.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.input_proj.bias"], + } + ) + + # .clip_embed.0 -> .embedding_proj_norm + diffusers_checkpoint.update( + { + "embedding_proj_norm.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.0.weight"], + "embedding_proj_norm.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.0.bias"], + } + ) + + # ..clip_embed.1 -> .embedding_proj + diffusers_checkpoint.update( + { + "embedding_proj.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.1.weight"], + "embedding_proj.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.1.bias"], + } + ) + + # .pos_emb -> .positional_embedding + diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.pos_emb"][None, :]}) + + # .ln_pre -> .norm_in + diffusers_checkpoint.update( + { + "norm_in.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_pre.weight"], + "norm_in.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_pre.bias"], + } + ) + + # .backbone.resblocks. -> .transformer_blocks. 
+ for idx in range(len(model.transformer_blocks)): + diffusers_transformer_prefix = f"transformer_blocks.{idx}" + original_transformer_prefix = f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.backbone.resblocks.{idx}" + + # .attn -> .attn1 + diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1" + original_attention_prefix = f"{original_transformer_prefix}.attn" + diffusers_checkpoint.update( + prior_attention_to_diffusers( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + original_attention_prefix=original_attention_prefix, + attention_head_dim=model.attention_head_dim, + ) + ) + + # .mlp -> .ff + diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff" + original_ff_prefix = f"{original_transformer_prefix}.mlp" + diffusers_checkpoint.update( + prior_ff_to_diffusers( + checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix + ) + ) + + # .ln_1 -> .norm1 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[ + f"{original_transformer_prefix}.ln_1.weight" + ], + f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"], + } + ) + + # .ln_2 -> .norm3 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[ + f"{original_transformer_prefix}.ln_2.weight" + ], + f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"], + } + ) + + # .ln_post -> .norm_out + diffusers_checkpoint.update( + { + "norm_out.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_post.weight"], + "norm_out.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_post.bias"], + } + ) + + # .output_proj -> .proj_to_clip_embeddings + diffusers_checkpoint.update( + { + "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.output_proj.weight"], + "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.output_proj.bias"], + } + ) + + return diffusers_checkpoint + + +# done prior_image + + # params_proj PARAMS_PROJ_ORIGINAL_PREFIX = "encoder.params_proj" @@ -313,6 +458,24 @@ def prior(*, args, checkpoint_map_location): return prior_model +def prior_image(*, args, checkpoint_map_location): + print("loading prior_image") + + prior_checkpoint = torch.load(args.prior_image_checkpoint_path, map_location=checkpoint_map_location) + + prior_model = prior_image_model_from_original_config() + + prior_diffusers_checkpoint = prior_image_original_checkpoint_to_diffusers_checkpoint(prior_model, prior_checkpoint) + + del prior_checkpoint + + load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True) + + print("done loading prior_image") + + return prior_model + + def params_proj(*, args, checkpoint_map_location): print("loading params_proj") @@ -376,6 +539,14 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): help="Path to the prior checkpoint to convert.", ) + parser.add_argument( + "--prior_image_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to the prior_image checkpoint to convert.", + ) + parser.add_argument( "--transmitter_checkpoint_path", default=None, @@ -414,6 +585,9 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): elif args.debug == "prior": prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) prior_model.save_pretrained(args.dump_path) + elif args.debug == "prior_image": + prior_model = prior_image(args=args, checkpoint_map_location=checkpoint_map_location) + 
prior_model.save_pretrained(args.dump_path) elif args.debug == "params_proj": params_proj_model = params_proj(args=args, checkpoint_map_location=checkpoint_map_location) params_proj_model.save_pretrained(args.dump_path) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index ee116465cfe5..8b750311cb1d 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -60,6 +60,7 @@ def __init__( additional_embeddings=4, dropout: float = 0.0, time_embed_act_fn: str = "silu", + embedding_proj_norm: bool = False, time_embed_dim: Optional[int] = None, clip_embedding_dim: Optional[int] = None, out_dim: Optional[int] = None, @@ -84,6 +85,11 @@ def __init__( self.proj_in = nn.Linear(embedding_dim, inner_dim) + if embedding_proj_norm: + self.embedding_proj_norm = nn.LayerNorm(clip_embedding_dim) + else: + self.embedding_proj_norm = None + self.embedding_proj = nn.Linear(clip_embedding_dim, inner_dim) if self.config.clip_embedding_dim is None: From d9f9101917e79baabd61ed8524fedf470f14ef16 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 27 Jun 2023 14:27:03 +0000 Subject: [PATCH 027/119] arg embedding_proj_norm -> norm_embedding_proj --- scripts/convert_shap_e_to_diffusers.py | 2 +- src/diffusers/models/prior_transformer.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index b98bbdc03596..6494c1883973 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -229,7 +229,7 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix "num_embeddings": 1024, "additional_embeddings": 0, "time_embed_act_fn": "gelu", - "embedding_proj_norm": True, + "norm_embedding_proj": True, "time_embed_dim": 1024 * 4, "clip_embedding_dim": 1024, "out_dim": 1024 * 2, diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 8b750311cb1d..4691a622647f 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -60,7 +60,7 @@ def __init__( additional_embeddings=4, dropout: float = 0.0, time_embed_act_fn: str = "silu", - embedding_proj_norm: bool = False, + norm_embedding_proj: bool = False, time_embed_dim: Optional[int] = None, clip_embedding_dim: Optional[int] = None, out_dim: Optional[int] = None, @@ -85,7 +85,7 @@ def __init__( self.proj_in = nn.Linear(embedding_dim, inner_dim) - if embedding_proj_norm: + if norm_embedding_proj: self.embedding_proj_norm = nn.LayerNorm(clip_embedding_dim) else: self.embedding_proj_norm = None @@ -262,14 +262,22 @@ def forward( positional_embeddings = self.positional_embedding.to(hidden_states.dtype) tokens = [] + additional_embeddings =0 if encoder_hidden_states is not None: tokens.append(encoder_hidden_states) + additional_embeddings += encoder_hidden_states.shape[1] + + if len(proj_embeddings.shape) == 2: + proj_embeddings = proj_embeddings[:, None, :] + + if len(hidden_states.shape) == 2: + hidden_states = hidden_states[:, None, :] tokens = tokens + [ - proj_embeddings[:, None, :], + proj_embeddings, time_embeddings[:, None, :], - hidden_states[:, None, :] if len(hidden_states.shape) == 2 else hidden_states, + hidden_states, ] if self.prd_embedding is not None: @@ -282,7 +290,7 @@ def forward( ) # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional 
tokens - additional_embeddings = 2 + (encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 0) + additional_embeddings = additional_embeddings + proj_embeddings.shape[1] + 1 if positional_embeddings.shape[1] < hidden_states.shape[1]: positional_embeddings = F.pad( positional_embeddings, From 2c8439121e20925900a7a66e7865b9913349db0d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 27 Jun 2023 23:56:46 +0000 Subject: [PATCH 028/119] add pre-norm for proj_embedding --- scripts/convert_shap_e_to_diffusers.py | 5 +++-- src/diffusers/models/prior_transformer.py | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 6494c1883973..cf6c11555728 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -222,8 +222,8 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix # Uses default arguments PRIOR_IMAGE_CONFIG = { - "num_attention_heads": 16, - "attention_head_dim": 1024 // 16, + "num_attention_heads": 8, + "attention_head_dim": 1024 // 8, "num_layers": 24, "embedding_dim": 1024, "num_embeddings": 1024, @@ -461,6 +461,7 @@ def prior(*, args, checkpoint_map_location): def prior_image(*, args, checkpoint_map_location): print("loading prior_image") + print(f"load checkpoint from {args.prior_image_checkpoint_path}") prior_checkpoint = torch.load(args.prior_image_checkpoint_path, map_location=checkpoint_map_location) prior_model = prior_image_model_from_original_config() diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 4691a622647f..789a61ec5309 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -250,6 +250,9 @@ def forward( # but time_embedding might be fp16, so we need to cast here. 
timesteps_projected = timesteps_projected.to(dtype=self.dtype) time_embeddings = self.time_embedding(timesteps_projected) + + if self.embedding_proj_norm is not None: + proj_embedding = self.embedding_proj_norm(proj_embedding) proj_embeddings = self.embedding_proj(proj_embedding) if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None: From 501fed70053f1fedd753150f0c4725005877083c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 28 Jun 2023 00:38:24 +0000 Subject: [PATCH 029/119] move rescale prompt from pipeline to _encode_prompt --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 5b05ac82080f..0083968f8590 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -243,6 +243,10 @@ def _encode_prompt( # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # Rescale the features to have unit variance + prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds + return prompt_embeds @@ -390,8 +394,6 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) - # Rescale the features to have unit variance - prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds # prior From f970afd228cf13bbc457684f340bcbbd9d1c9b08 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 28 Jun 2023 02:20:29 +0000 Subject: [PATCH 030/119] add img2img pipeline --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/shap_e/__init__.py | 1 + .../shap_e/pipeline_shap_e_img2img.py | 486 ++++++++++++++++++ 4 files changed, 489 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 501e2c54ca92..1fe675ad3bd7 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -140,6 +140,7 @@ PaintByExamplePipeline, SemanticStableDiffusionPipeline, ShapEPipeline, + ShapEImg2ImgPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionControlNetImg2ImgPipeline, StableDiffusionControlNetInpaintPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index ee0fc76bdf53..4fee68f6ceb2 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -66,7 +66,7 @@ from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline - from .shap_e import ShapEPipeline + from .shap_e import ShapEPipeline, ShapEImg2ImgPipeline from .stable_diffusion import ( CycleDiffusionPipeline, StableDiffusionAttendAndExcitePipeline, diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index 76ca0ea814f1..1f44166acd11 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -15,6 +15,7 @@ from .camera import create_pan_cameras from .params_proj import ShapEParamsProjModel from .pipeline_shap_e import ShapEPipeline + from .pipeline_shap_e_img2img import 
ShapEImg2ImgPipeline from .renderer import ( BoundingBoxVolume, ImportanceRaySampler, diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py new file mode 100644 index 000000000000..1dd0136a4cfd --- /dev/null +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -0,0 +1,486 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPVisionModel + +from ...models import PriorTransformer +from ...pipelines import DiffusionPipeline +from ...schedulers import HeunDiscreteScheduler +from ...utils import ( + BaseOutput, + is_accelerate_available, + logging, + randn_tensor, + replace_example_docstring, +) +from .camera import create_pan_cameras +from .params_proj import ShapEParamsProjModel +from .renderer import ( + BoundingBoxVolume, + ImportanceRaySampler, + MLPNeRSTFModel, + StratifiedRaySampler, + VoidNeRFModel, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + + ``` +""" + + +def merge_results(self, a: [torch.Tensor], b: torch.Tensor, dim: int, indices: torch.Tensor): + """ + :param a: [..., n_a, ...]. The other dictionary containing the b's may + contain extra tensors from earlier calculations, so a can be None. + :param b: [..., n_b, ...] :param dim: dimension to merge :param indices: how the merged results should be sorted at + the end :return: a concatted and sorted tensor of size [..., n_a + n_b, ...] + """ + merged = torch.cat([a, b], dim=dim) + return torch.gather(merged, dim=dim, index=torch.broadcast_to(indices, merged.shape)) + + +def integrate_samples(volume_range, ts, density, channels): + r""" + Function integrating the model output. + + Args: + volume_range: Specifies the integral range [t0, t1] + ts: timesteps + density: torch.Tensor [batch_size, *shape, n_samples, 1] + channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] + returns: + channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density + *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume + ) + """ + + # 1. Calculate the weights + _, _, dt = volume_range.partition(ts) + ddensity = density * dt + + mass = torch.cumsum(ddensity, dim=-2) + transmittance = torch.exp(-mass[..., -1, :]) + + alphas = 1.0 - torch.exp(-ddensity) + Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2)) + # This is the probability of light hitting and reflecting off of + # something at depth [..., i, :]. + weights = alphas * Ts + + # 2. 
Integrate channels + channels = torch.sum(channels * weights, dim=-2) + + return channels, weights, transmittance + + +@dataclass +class ShapEPipelineOutput(BaseOutput): + """ + Output class for ShapEPipeline. + + Args: + images (`torch.FloatTensor`) + a list of images for 3D rendering + """ + + images: Union[PIL.Image.Image, np.ndarray] + + +class ShapEImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for generating latent representation of a 3D asset with Shap.E + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`HeunDiscreteScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + """ + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModel, + image_processor: CLIPImageProcessor, + scheduler: HeunDiscreteScheduler, + params_proj: ShapEParamsProjModel, + renderer: MLPNeRSTFModel, + ): + super().__init__() + + self.register_modules( + prior=prior, + image_encoder=image_encoder, + image_processor=image_processor, + scheduler=scheduler, + params_proj=params_proj, + renderer=renderer, + ) + self.void = VoidNeRFModel(background=[0.0, 0.0, 0.0], channel_scale=255.0) + self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + latents = latents * scheduler.init_noise_sigma + return latents + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + return self.device + for module in self.text_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_image( + self, + images, + device, + num_images_per_image, + do_classifier_free_guidance, + ): + + if not isinstance(images, PIL.Image.Image): + images = [images] + + images = ( + self.image_processor(images, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_embeds = self.image_encoder(images)["last_hidden_state"] + image_embeds = image_embeds[:, 1:, :].contiguous().float() # batch_size, dim, 256 + + image_embeds = image_embeds.repeat_interleave(num_images_per_image, dim=0) + + if do_classifier_free_guidance: + negative_image_embeds = torch.zeros_like(image_embeds) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + + return image_embeds + + @torch.no_grad() + def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): + """ + Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written below + with some abuse of notations) + + C(r) := sum( + transmittance(t[i]) * integrate( + lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]], + ) for i in range(len(parts)) + ) + transmittance(t[-1]) * void_model(t[-1]).channels + + where + + 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through + the volume specified by [t[0], s]. (transmittance of 1 means light can pass freely) 2) density and channels are + obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t + where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the + shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and + transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], + math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + + args: + rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: + number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including + + :return: A tuple of + - `channels` + - A importance samplers for additional fine-grained rendering + - raw model output + """ + origin, direction = rays[..., 0, :], rays[..., 1, :] + + # Integrate over [t[i], t[i + 1]] + + # 1 Intersect the rays with the current volume and sample ts to integrate along. + vrange = self.volume.intersect(origin, direction, t0_lower=None) + ts = sampler.sample(vrange.t0, vrange.t1, n_samples) + ts = ts.to(rays.dtype) + + if prev_model_out is not None: + # Append the previous ts now before fprop because previous + # rendering used a different model and we can't reuse the output. 
+ ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values + + batch_size, *_shape, _t0_dim = vrange.t0.shape + _, *ts_shape, _ts_dim = ts.shape + + # 2. Get the points along the ray and query the model + directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) + positions = origin.unsqueeze(-2) + ts * directions + + optional_directions = directions if render_with_direction else None + + model_out = self.renderer( + position=positions, + direction=optional_directions, + ts=ts, + nerf_level="coarse" if prev_model_out is None else "fine", + ) + + # 3. Integrate the model results + channels, weights, transmittance = integrate_samples( + vrange, model_out.ts, model_out.density, model_out.channels + ) + + # 4. Clean up results that do not intersect with the volume. + transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance)) + channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels)) + # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + channels = channels + transmittance * self.void(origin) + + weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) + + return channels, weighted_sampler, model_out + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image]], + num_images_per_image: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + sigma_min: float = 1e-3, + sigma_max: float = 160.0, + size: int = 64, + ray_batch_size: int = 4096, + n_coarse_samples=64, + n_fine_samples=128, + output_type: Optional[str] = "pil", # pil, np, latent + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. 
Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`ShapEPipelineOutput`] or `tuple` + """ + + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, list): + batch_size = len(image) + else: + raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `list` but is {type(image)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_image + + do_classifier_free_guidance = guidance_scale > 1.0 + image_embeds = self._encode_image(image, device, num_images_per_image, do_classifier_free_guidance) + + # prior + + self.scheduler.set_timesteps( + num_inference_steps, device=device, sigma_min=sigma_min, sigma_max=sigma_max, use_karras_sigmas=True + ) + timesteps = self.scheduler.timesteps + + num_embeddings = self.prior.config.num_embeddings + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, num_embeddings * embedding_dim), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim + latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) + + noise_pred = self.prior( + scaled_model_input, + timestep=t, + proj_embedding=image_embeds, + ).predicted_image_embedding + + # remove the variance + noise_pred, _ = noise_pred.split( + scaled_model_input.shape[2], dim=2 + ) # batch_size, num_embeddings, embedding_dim + + # clip between -1 and 1 + noise_pred = noise_pred.clamp(-1, 1) + + if do_classifier_free_guidance is not None: + noise_pred_uncond, noise_pred = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) + + latents = self.scheduler.step( + noise_pred, + timestep=t, + sample=latents, + step_index=i, + ).prev_sample + + # YiYi testing only: I don't think we need to return latent for this pipeline + if output_type == "latent": + return ShapEPipelineOutput(images=latents) + + # project the the paramters from the generated latents + projected_params = self.params_proj(latents) + + # update the mlp layers of the renderer + for name, param in self.renderer.state_dict().items(): + if f"nerstf.{name}" in projected_params.keys(): + param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) + + # create cameras object + camera = create_pan_cameras(size) + rays = camera.camera_rays + rays = rays.to(device) + n_batches = rays.shape[1] // ray_batch_size + + coarse_sampler = StratifiedRaySampler() + + images = [] + with self.progress_bar(total=n_batches) as progress_bar: + for idx in range(n_batches): + rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] + + # render rays with coarse, stratified samples. + _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) + # Then, render with additional importance-weighted ray samples. 
+ channels, _, _ = self.render_rays( + rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out + ) + + images.append(channels) + progress_bar.update() + + images = torch.cat(images, dim=1) + images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) + + if output_type not in ["np", "pil"]: + raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") + + images = images.cpu().numpy() + + if output_type == "pil": + images = self.numpy_to_pil(images) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (images,) + + return ShapEPipelineOutput(images=images) From 443450cf2dd4f67e9c2ccff026bee7b9f77a384f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 28 Jun 2023 02:21:51 +0000 Subject: [PATCH 031/119] style --- scripts/convert_shap_e_to_diffusers.py | 5 ++++- src/diffusers/__init__.py | 2 +- src/diffusers/models/prior_transformer.py | 10 +++++----- src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 3 +-- .../pipelines/shap_e/pipeline_shap_e_img2img.py | 9 +++------ 6 files changed, 15 insertions(+), 16 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index cf6c11555728..7c5ee8f482b3 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -235,6 +235,7 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix "out_dim": 1024 * 2, } + def prior_image_model_from_original_config(): model = PriorTransformer(**PRIOR_IMAGE_CONFIG) @@ -285,7 +286,9 @@ def prior_image_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): ) # .pos_emb -> .positional_embedding - diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.pos_emb"][None, :]}) + diffusers_checkpoint.update( + {"positional_embedding": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.pos_emb"][None, :]} + ) # .ln_pre -> .norm_in diffusers_checkpoint.update( diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 1fe675ad3bd7..dc22f5ed11f2 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -139,8 +139,8 @@ LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, - ShapEPipeline, ShapEImg2ImgPipeline, + ShapEPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionControlNetImg2ImgPipeline, StableDiffusionControlNetInpaintPipeline, diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 789a61ec5309..158acce20771 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -89,7 +89,7 @@ def __init__( self.embedding_proj_norm = nn.LayerNorm(clip_embedding_dim) else: self.embedding_proj_norm = None - + self.embedding_proj = nn.Linear(clip_embedding_dim, inner_dim) if self.config.clip_embedding_dim is None: @@ -250,7 +250,7 @@ def forward( # but time_embedding might be fp16, so we need to cast here. 
timesteps_projected = timesteps_projected.to(dtype=self.dtype) time_embeddings = self.time_embedding(timesteps_projected) - + if self.embedding_proj_norm is not None: proj_embedding = self.embedding_proj_norm(proj_embedding) @@ -265,15 +265,15 @@ def forward( positional_embeddings = self.positional_embedding.to(hidden_states.dtype) tokens = [] - additional_embeddings =0 + additional_embeddings = 0 if encoder_hidden_states is not None: tokens.append(encoder_hidden_states) additional_embeddings += encoder_hidden_states.shape[1] - + if len(proj_embeddings.shape) == 2: proj_embeddings = proj_embeddings[:, None, :] - + if len(hidden_states.shape) == 2: hidden_states = hidden_states[:, None, :] diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 4fee68f6ceb2..bd4dcada0277 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -66,7 +66,7 @@ from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline - from .shap_e import ShapEPipeline, ShapEImg2ImgPipeline + from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_diffusion import ( CycleDiffusionPipeline, StableDiffusionAttendAndExcitePipeline, diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 0083968f8590..b7e9d1aa4961 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -243,11 +243,10 @@ def _encode_prompt( # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - + # Rescale the features to have unit variance prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds - return prompt_embeds @torch.no_grad() diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 1dd0136a4cfd..1cc0c4f32080 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import math from dataclasses import dataclass from typing import List, Optional, Union @@ -205,11 +204,10 @@ def _encode_image( device, num_images_per_image, do_classifier_free_guidance, - ): - + ): if not isinstance(images, PIL.Image.Image): images = [images] - + images = ( self.image_processor(images, return_tensors="pt") .pixel_values[0] @@ -218,7 +216,7 @@ def _encode_image( ) image_embeds = self.image_encoder(images)["last_hidden_state"] - image_embeds = image_embeds[:, 1:, :].contiguous().float() # batch_size, dim, 256 + image_embeds = image_embeds[:, 1:, :].contiguous().float() # batch_size, dim, 256 image_embeds = image_embeds.repeat_interleave(num_images_per_image, dim=0) @@ -230,7 +228,6 @@ def _encode_image( # to avoid doing two forward passes image_embeds = torch.cat([negative_image_embeds, image_embeds]) - return image_embeds @torch.no_grad() From 8e27fbc8278fd865bc43cc77132a1596b6075bb0 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 28 Jun 2023 02:22:47 +0000 Subject: [PATCH 032/119] copies --- .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index a85f222730e0..e0dce2cd4ecf 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -272,6 +272,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class ShapEImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class ShapEPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 2902f100dcc1e7ce71163183b33bc7f5e23d3e25 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:12:24 -1000 Subject: [PATCH 033/119] Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 158acce20771..05e97dcb7e61 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -60,7 +60,7 @@ def __init__( additional_embeddings=4, dropout: float = 0.0, time_embed_act_fn: str = "silu", - norm_embedding_proj: bool = False, + embedding_proj_norm: Optional[str] = None, time_embed_dim: Optional[int] = None, clip_embedding_dim: Optional[int] = None, out_dim: Optional[int] = None, From 7c5aa91fc5a58032a187c1f7a954882677c42106 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:13:46 -1000 Subject: [PATCH 034/119] Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 05e97dcb7e61..8cf3bc7747b0 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -62,7 +62,7 @@ def __init__( time_embed_act_fn: str = "silu", 
embedding_proj_norm: Optional[str] = None, time_embed_dim: Optional[int] = None, - clip_embedding_dim: Optional[int] = None, + embedding_proj_dim: Optional[int] = None, out_dim: Optional[int] = None, ): super().__init__() From 7194b869883183bd605b87660ed0fd5624d0d166 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:14:55 -1000 Subject: [PATCH 035/119] Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 8cf3bc7747b0..3c354e83aa1b 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -71,14 +71,9 @@ def __init__( inner_dim = num_attention_heads * attention_head_dim self.additional_embeddings = additional_embeddings - if time_embed_dim is None: - time_embed_dim = inner_dim - - if clip_embedding_dim is None: - clip_embedding_dim = embedding_dim - - if out_dim is None: - out_dim = embedding_dim + time_embed_dim = time_embed_dim or inner_dim + embedding_proj_dim = embedding_proj_dim or embedding_dim + out_dim = out_dim or embedding_dim self.time_proj = Timesteps(inner_dim, True, 0) self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn) From 6a460a574f0bc3cf526c721255ca533714423b7f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:18:29 -1000 Subject: [PATCH 036/119] Update src/diffusers/models/prior_transformer.py add arg: encoder_hid_proj Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 3c354e83aa1b..466bcdf37c47 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -87,7 +87,7 @@ def __init__( self.embedding_proj = nn.Linear(clip_embedding_dim, inner_dim) - if self.config.clip_embedding_dim is None: + if encoder_hid_proj is not None: self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) else: self.encoder_hidden_states_proj = None From fc9218409f5bc8f964c1e7b44e84bdd29e2ace60 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:19:24 -1000 Subject: [PATCH 037/119] Update src/diffusers/models/prior_transformer.py add new config: norm_in_type Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 466bcdf37c47..ffcc73292a29 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -113,10 +113,12 @@ def __init__( ] ) - if self.config.out_dim is not None: + if norm_in_type == "layer": self.norm_in = nn.LayerNorm(inner_dim) - else: + elif norm_in_type is None: self.norm_in = None + else: + raise ValueError(f"{norm_in_type} does not exist.") self.norm_out = nn.LayerNorm(inner_dim) From 6ca8e552d464992eb1d4b46e797e8639e55f78eb Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:20:32 -1000 Subject: [PATCH 038/119] Update src/diffusers/models/prior_transformer.py add new config: added_emb_type Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index ffcc73292a29..3e4ef3aaca68 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -94,7 +94,7 @@ def __init__( self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim)) - if self.config.out_dim is None: + if added_emb_type is "prd": self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) else: self.prd_embedding = None From 92d84f9cccfc4ad306e57b348d1bfbde39c62f0b Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:23:15 -1000 Subject: [PATCH 039/119] Update src/diffusers/models/prior_transformer.py rename out_dim -> clip_embed_dim Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 3e4ef3aaca68..390f0d2675de 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -122,7 +122,7 @@ def __init__( self.norm_out = nn.LayerNorm(inner_dim) - self.proj_to_clip_embeddings = nn.Linear(inner_dim, out_dim) + self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim) causal_attention_mask = torch.full( [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0 From b2c31f270b32d402f101a8e2f368afaa1fb6e51d Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:43:27 -1000 Subject: [PATCH 040/119] Update src/diffusers/models/prior_transformer.py rename config: out_dim -> clip_embed_dim Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 390f0d2675de..4704581a9e7b 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -63,7 +63,7 @@ def __init__( embedding_proj_norm: Optional[str] = None, time_embed_dim: Optional[int] = None, embedding_proj_dim: Optional[int] = None, - out_dim: Optional[int] = None, + clip_embed_dim: Optional[int] = None, ): super().__init__() self.num_attention_heads = num_attention_heads From 04160fa6576883b8ec757055777a6342f7737bca Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:45:02 -1000 Subject: [PATCH 041/119] Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 4704581a9e7b..c11dfe77abd5 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -261,7 +261,7 @@ def forward( positional_embeddings = self.positional_embedding.to(hidden_states.dtype) - tokens = [] + additional_embeds = [] additional_embeddings = 0 if encoder_hidden_states is not None: From ece3babc725aee7e6a37e59f3038ff51ee24b996 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 28 Jun 2023 08:45:14 -1000 Subject: [PATCH 042/119] Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index c11dfe77abd5..08a563c14518 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -262,7 +262,7 @@ def forward( positional_embeddings = self.positional_embedding.to(hidden_states.dtype) additional_embeds = [] - additional_embeddings = 0 + additional_embeddings_len = 0 if encoder_hidden_states is not None: tokens.append(encoder_hidden_states) From 1ac3c42c8eb52fbd1bb04871b03a33575e47cdb7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 01:21:28 +0000 Subject: [PATCH 043/119] finish refactor prior_tranformer --- scripts/convert_shap_e_to_diffusers.py | 39 +++++++++--- src/diffusers/models/prior_transformer.py | 72 ++++++++++++++--------- 2 files changed, 73 insertions(+), 38 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 7c5ee8f482b3..43d0bae11caf 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -22,7 +22,7 @@ --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \ --prior_image_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/image_cond.pt \ --transmitter_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\ - --dump_path /home/yiyi_huggingface_co/model_repo/shap-e/prior_image \ + --dump_path /home/yiyi_huggingface_co/model_repo/shap-e-img2img/prior\ --debug prior_image ``` """ @@ -32,7 +32,6 @@ PRIOR_ORIGINAL_PREFIX = "wrapped" -# Uses default arguments PRIOR_CONFIG = { "num_attention_heads": 16, "attention_head_dim": 1024 // 16, @@ -41,9 +40,12 @@ "num_embeddings": 1024, "additional_embeddings": 0, "time_embed_act_fn": "gelu", + "norm_in_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, "time_embed_dim": 1024 * 4, - "clip_embedding_dim": 768, - "out_dim": 1024 * 2, + "embedding_proj_dim": 768, + "clip_embed_dim": 1024 * 2, } @@ -229,10 +231,13 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix "num_embeddings": 1024, "additional_embeddings": 0, "time_embed_act_fn": "gelu", - "norm_embedding_proj": True, + "norm_in_type": "layer", + "embedding_proj_norm_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, "time_embed_dim": 1024 * 4, - "clip_embedding_dim": 1024, - "out_dim": 1024 * 2, + "embedding_proj_dim": 1024, + "clip_embed_dim": 1024 * 2, } @@ -454,7 +459,7 @@ def prior(*, args, checkpoint_map_location): del prior_checkpoint - load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True) + load_prior_checkpoint_to_model(prior_diffusers_checkpoint, prior_model) print("done loading prior") @@ -473,7 +478,7 @@ def prior_image(*, args, checkpoint_map_location): del prior_checkpoint - load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True) + load_prior_checkpoint_to_model(prior_diffusers_checkpoint, prior_model) print("done loading prior_image") @@ -520,6 +525,22 @@ def renderer(*, args, checkpoint_map_location): return renderer_model +# prior model will expect clip_mean and clip_std, whic are missing from the state_dict +PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"] + +def load_prior_checkpoint_to_model(checkpoint, model): + with tempfile.NamedTemporaryFile() as file: + torch.save(checkpoint, file.name) + del checkpoint + missing_keys, unexpected_keys = model.load_state_dict(torch.load(file.name), 
strict=False) + missing_keys = list(set(missing_keys) - set(PRIOR_EXPECTED_MISSING_KEYS)) + + if len(unexpected_keys) > 0: + raise ValueError(f"Unexpected keys when loading prior model: {unexpected_keys}") + if len(missing_keys) > 0: + raise ValueError(f"Missing keys when loading prior model: {missing_keys}") + + def load_checkpoint_to_model(checkpoint, model, strict=False): with tempfile.NamedTemporaryFile() as file: torch.save(checkpoint, file.name) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 08a563c14518..e6519f13e52f 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -38,15 +38,25 @@ class PriorTransformer(ModelMixin, ConfigMixin): num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention. attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head. num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use. - embedding_dim (`int`, *optional*, defaults to 768): The dimension of the CLIP embeddings. Note that CLIP - image embeddings and text embeddings are both the same dimension. - num_embeddings (`int`, *optional*, defaults to 77): The max number of clip embeddings allowed. I.e. the - length of the prompt after it has been tokenized. + embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states` + num_embeddings (`int`, *optional*, defaults to 77): the number of embeddings of the model input `hidden_states` additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the projected hidden_states. The actual length of the used hidden_states is `num_embeddings + additional_embeddings`. dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - + time_embed_act_fn (`str`, *optional*, defaults to 'silu'): the activation function to use to create timestep embedding + norm_in_type (`str`, *optional*, defaults to None): the normalization layer to apply on hidden states before + passing to Transformer blocks. Set it to `None` if normalization is not needed. + embedding_proj_norm_type (`str`, *optional*, defaults to None): the normalization layer to apply on the input `proj_embedding`. + Set it to `None` if normalization is not needed. + encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): the projection layer to apply on the input `encoder_hidden_states`. + Set it to `None` if `encoder_hidden_states` is `None`. + added_emb_type (`str`, *optional*, defaults to `prd`): the additional embedding to condition model. + `prd` indicating higher text-image dot products. if it is `None`, will not prepend additional embedding. + time_embed_dim (`int, *optional*, defaults to None): the dimension of timestep embedding. + If None, will set to `num_attention_heads * attention_head_dim` + embedding_proj_dim (`int`, *optional*, default to None): the dimension of `proj_embedding`. If None, will set to `embedding_dim` + clip_embed_dim (`int`, *optional*, default to None): the dimension of output. 
If None, will set to `embedding_dim` """ @register_to_config @@ -60,7 +70,10 @@ def __init__( additional_embeddings=4, dropout: float = 0.0, time_embed_act_fn: str = "silu", - embedding_proj_norm: Optional[str] = None, + norm_in_type: Optional[str] = None, # layer + embedding_proj_norm_type: Optional[str] = None, # layer + encoder_hid_proj_type: Optional[str] = "linear", # linear + added_emb_type: Optional[str] = "prd", # prd time_embed_dim: Optional[int] = None, embedding_proj_dim: Optional[int] = None, clip_embed_dim: Optional[int] = None, @@ -73,28 +86,32 @@ def __init__( time_embed_dim = time_embed_dim or inner_dim embedding_proj_dim = embedding_proj_dim or embedding_dim - out_dim = out_dim or embedding_dim + clip_embed_dim = clip_embed_dim or embedding_dim self.time_proj = Timesteps(inner_dim, True, 0) self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn) self.proj_in = nn.Linear(embedding_dim, inner_dim) - if norm_embedding_proj: - self.embedding_proj_norm = nn.LayerNorm(clip_embedding_dim) - else: + if embedding_proj_norm_type is None: self.embedding_proj_norm = None + elif embedding_proj_norm_type == "layer": + self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim) + else: + raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}") - self.embedding_proj = nn.Linear(clip_embedding_dim, inner_dim) + self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim) - if encoder_hid_proj is not None: + if encoder_hid_proj_type is None: + self.encoder_hidden_states_proj = None + elif encoder_hid_proj_type == "linear": self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) else: - self.encoder_hidden_states_proj = None + raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}") self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim)) - if added_emb_type is "prd": + if added_emb_type == "prd": self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) else: self.prd_embedding = None @@ -117,7 +134,7 @@ def __init__( self.norm_in = nn.LayerNorm(inner_dim) elif norm_in_type is None: self.norm_in = None - else: + else: raise ValueError(f"{norm_in_type} does not exist.") self.norm_out = nn.LayerNorm(inner_dim) @@ -130,12 +147,9 @@ def __init__( causal_attention_mask.triu_(1) causal_attention_mask = causal_attention_mask[None, ...] 
self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False) - if self.config.out_dim is None: - self.clip_mean = nn.Parameter(torch.zeros(1, out_dim)) - self.clip_std = nn.Parameter(torch.zeros(1, out_dim)) - else: - self.clip_mean = None - self.clip_std = None + + self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim)) + self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim)) @property # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors @@ -265,8 +279,8 @@ def forward( additional_embeddings_len = 0 if encoder_hidden_states is not None: - tokens.append(encoder_hidden_states) - additional_embeddings += encoder_hidden_states.shape[1] + additional_embeds.append(encoder_hidden_states) + additional_embeddings_len += encoder_hidden_states.shape[1] if len(proj_embeddings.shape) == 2: proj_embeddings = proj_embeddings[:, None, :] @@ -274,7 +288,7 @@ def forward( if len(hidden_states.shape) == 2: hidden_states = hidden_states[:, None, :] - tokens = tokens + [ + additional_embeds = additional_embeds + [ proj_embeddings, time_embeddings[:, None, :], hidden_states, @@ -282,19 +296,19 @@ def forward( if self.prd_embedding is not None: prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1) - tokens.append(prd_embedding) + additional_embeds.append(prd_embedding) hidden_states = torch.cat( - tokens, + additional_embeds, dim=1, ) # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional tokens - additional_embeddings = additional_embeddings + proj_embeddings.shape[1] + 1 + additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1 if positional_embeddings.shape[1] < hidden_states.shape[1]: positional_embeddings = F.pad( positional_embeddings, - (0, 0, additional_embeddings, self.prd_embedding.shape[1] if self.prd_embedding is not None else 0), + (0, 0, additional_embeddings_len, self.prd_embedding.shape[1] if self.prd_embedding is not None else 0), value=0.0, ) @@ -317,7 +331,7 @@ def forward( if self.prd_embedding is not None: hidden_states = hidden_states[:, -1] else: - hidden_states = hidden_states[:, additional_embeddings:] + hidden_states = hidden_states[:, additional_embeddings_len:] predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states) From 1c6d30b2a529e73b4ec6e77dbbf5a20279ba7a0c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 01:23:33 +0000 Subject: [PATCH 044/119] make style --- scripts/convert_shap_e_to_diffusers.py | 5 ++- src/diffusers/models/prior_transformer.py | 45 ++++++++++++++--------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 43d0bae11caf..8574100046f8 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -528,17 +528,18 @@ def renderer(*, args, checkpoint_map_location): # prior model will expect clip_mean and clip_std, whic are missing from the state_dict PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"] + def load_prior_checkpoint_to_model(checkpoint, model): with tempfile.NamedTemporaryFile() as file: torch.save(checkpoint, file.name) del checkpoint missing_keys, unexpected_keys = model.load_state_dict(torch.load(file.name), strict=False) missing_keys = list(set(missing_keys) - set(PRIOR_EXPECTED_MISSING_KEYS)) - + if len(unexpected_keys) > 0: raise ValueError(f"Unexpected keys when loading prior model: 
{unexpected_keys}") if len(missing_keys) > 0: - raise ValueError(f"Missing keys when loading prior model: {missing_keys}") + raise ValueError(f"Missing keys when loading prior model: {missing_keys}") def load_checkpoint_to_model(checkpoint, model, strict=False): diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index e6519f13e52f..7ffc9ccff5ea 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -39,24 +39,30 @@ class PriorTransformer(ModelMixin, ConfigMixin): attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head. num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use. embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states` - num_embeddings (`int`, *optional*, defaults to 77): the number of embeddings of the model input `hidden_states` + num_embeddings (`int`, *optional*, defaults to 77): + the number of embeddings of the model input `hidden_states` additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the projected hidden_states. The actual length of the used hidden_states is `num_embeddings + additional_embeddings`. dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - time_embed_act_fn (`str`, *optional*, defaults to 'silu'): the activation function to use to create timestep embedding - norm_in_type (`str`, *optional*, defaults to None): the normalization layer to apply on hidden states before - passing to Transformer blocks. Set it to `None` if normalization is not needed. - embedding_proj_norm_type (`str`, *optional*, defaults to None): the normalization layer to apply on the input `proj_embedding`. - Set it to `None` if normalization is not needed. - encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): the projection layer to apply on the input `encoder_hidden_states`. - Set it to `None` if `encoder_hidden_states` is `None`. - added_emb_type (`str`, *optional*, defaults to `prd`): the additional embedding to condition model. + time_embed_act_fn (`str`, *optional*, defaults to 'silu'): + the activation function to use to create timestep embedding + norm_in_type (`str`, *optional*, defaults to None): the normalization layer to apply on hidden states before + passing to Transformer blocks. Set it to `None` if normalization is not needed. + embedding_proj_norm_type (`str`, *optional*, defaults to None): + the normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not + needed. + encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): + the projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if + `encoder_hidden_states` is `None`. + added_emb_type (`str`, *optional*, defaults to `prd`): the additional embedding to condition model. `prd` indicating higher text-image dot products. if it is `None`, will not prepend additional embedding. time_embed_dim (`int, *optional*, defaults to None): the dimension of timestep embedding. If None, will set to `num_attention_heads * attention_head_dim` - embedding_proj_dim (`int`, *optional*, default to None): the dimension of `proj_embedding`. If None, will set to `embedding_dim` - clip_embed_dim (`int`, *optional*, default to None): the dimension of output. 
If None, will set to `embedding_dim` + embedding_proj_dim (`int`, *optional*, default to None): + the dimension of `proj_embedding`. If None, will set to `embedding_dim` + clip_embed_dim (`int`, *optional*, default to None): + the dimension of output. If None, will set to `embedding_dim` """ @register_to_config @@ -70,10 +76,10 @@ def __init__( additional_embeddings=4, dropout: float = 0.0, time_embed_act_fn: str = "silu", - norm_in_type: Optional[str] = None, # layer - embedding_proj_norm_type: Optional[str] = None, # layer - encoder_hid_proj_type: Optional[str] = "linear", # linear - added_emb_type: Optional[str] = "prd", # prd + norm_in_type: Optional[str] = None, # layer + embedding_proj_norm_type: Optional[str] = None, # layer + encoder_hid_proj_type: Optional[str] = "linear", # linear + added_emb_type: Optional[str] = "prd", # prd time_embed_dim: Optional[int] = None, embedding_proj_dim: Optional[int] = None, clip_embed_dim: Optional[int] = None, @@ -135,7 +141,7 @@ def __init__( elif norm_in_type is None: self.norm_in = None else: - raise ValueError(f"{norm_in_type} does not exist.") + raise ValueError(f"{norm_in_type} does not exist.") self.norm_out = nn.LayerNorm(inner_dim) @@ -308,7 +314,12 @@ def forward( if positional_embeddings.shape[1] < hidden_states.shape[1]: positional_embeddings = F.pad( positional_embeddings, - (0, 0, additional_embeddings_len, self.prd_embedding.shape[1] if self.prd_embedding is not None else 0), + ( + 0, + 0, + additional_embeddings_len, + self.prd_embedding.shape[1] if self.prd_embedding is not None else 0, + ), value=0.0, ) From 734882056a5845957693c08eaa7189b9f294571d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 05:24:19 +0000 Subject: [PATCH 045/119] refactor renderer --- scripts/convert_shap_e_to_diffusers.py | 75 ++--- src/diffusers/pipelines/shap_e/__init__.py | 3 +- .../pipelines/shap_e/pipeline_shap_e.py | 173 +---------- src/diffusers/pipelines/shap_e/renderer.py | 274 +++++++++++++++++- 4 files changed, 299 insertions(+), 226 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 8574100046f8..85ade1b1d8a6 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -5,7 +5,7 @@ from accelerate import load_checkpoint_and_dispatch from diffusers.models.prior_transformer import PriorTransformer -from diffusers.pipelines.shap_e import MLPNeRSTFModel, ShapEParamsProjModel +from diffusers.pipelines.shap_e import MLPNeRSTFModel, ShapEParamsProjModel, ShapERenderer """ @@ -22,8 +22,8 @@ --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \ --prior_image_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/image_cond.pt \ --transmitter_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\ - --dump_path /home/yiyi_huggingface_co/model_repo/shap-e-img2img/prior\ - --debug prior_image + --dump_path /home/yiyi_huggingface_co/model_repo/test-shap-e-renderer\ + --debug renderer ``` """ @@ -371,47 +371,35 @@ def prior_image_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): # done prior_image -# params_proj +# renderer -PARAMS_PROJ_ORIGINAL_PREFIX = "encoder.params_proj" - -PARAMS_PROJ_CONFIG = {} +RENDERER_CONFIG = {} -def params_proj_model_from_original_config(): - model = ShapEParamsProjModel(**PARAMS_PROJ_CONFIG) +def renderer_model_from_original_config(): + model = ShapERenderer(**RENDERER_CONFIG) return model -def 
params_proj_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): - diffusers_checkpoint = {k: checkpoint[f"{PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys()} - - return diffusers_checkpoint - - -# done params_proj - - -# renderer +RENDERER_MLP_ORIGINAL_PREFIX = "renderer.nerstf" -RENDERER_ORIGINAL_PREFIX = "renderer.nerstf" +RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX = "encoder.params_proj" -RENDERER_CONFIG = {} - - -def renderer_model_from_original_config(): - model = MLPNeRSTFModel(**RENDERER_CONFIG) - - return model +def renderer_model_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + + diffusers_checkpoint = {} + diffusers_checkpoint.update( + { + "mlp": {k: checkpoint[f"{RENDERER_MLP_ORIGINAL_PREFIX}.{k}"] for k in model.mlp.state_dict().keys()}, + "params_proj": {k: checkpoint[f"{RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.params_proj.state_dict().keys()}, -def renderer_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): - diffusers_checkpoint = {k: checkpoint[f"{RENDERER_ORIGINAL_PREFIX}.{k}"] for k in model.state_dict().keys()} + } + ) return diffusers_checkpoint - # done renderer @@ -485,39 +473,18 @@ def prior_image(*, args, checkpoint_map_location): return prior_model -def params_proj(*, args, checkpoint_map_location): - print("loading params_proj") - - params_proj_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location) - - params_proj_model = params_proj_model_from_original_config() - - params_proj_diffusers_checkpoint = params_proj_original_checkpoint_to_diffusers_checkpoint( - params_proj_model, params_proj_checkpoint - ) - - del params_proj_checkpoint - - load_checkpoint_to_model(params_proj_diffusers_checkpoint, params_proj_model, strict=True) - - print("done loading params_proj") - - return params_proj_model - - def renderer(*, args, checkpoint_map_location): + print(" loading renderer") renderer_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location) renderer_model = renderer_model_from_original_config() - renderer_diffusers_checkpoint = renderer_original_checkpoint_to_diffusers_checkpoint( - renderer_model, renderer_checkpoint - ) + renderer_diffusers_checkpoint = renderer_model_original_checkpoint_to_diffusers_checkpoint(renderer_model, renderer_checkpoint) del renderer_checkpoint - + load_checkpoint_to_model(renderer_diffusers_checkpoint, renderer_model, strict=True) print("done loading renderer") diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index 1f44166acd11..7f4333ff26a4 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -13,7 +13,6 @@ from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline else: from .camera import create_pan_cameras - from .params_proj import ShapEParamsProjModel from .pipeline_shap_e import ShapEPipeline from .pipeline_shap_e_img2img import ShapEImg2ImgPipeline from .renderer import ( @@ -22,5 +21,7 @@ MLPNeRFModelOutput, MLPNeRSTFModel, StratifiedRaySampler, + ShapEParamsProjModel, VoidNeRFModel, + ShapERenderer, ) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index b7e9d1aa4961..a58fa092038c 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -31,15 +31,8 @@ randn_tensor, replace_example_docstring, ) -from .camera import 
create_pan_cameras -from .params_proj import ShapEParamsProjModel -from .renderer import ( - BoundingBoxVolume, - ImportanceRaySampler, - MLPNeRSTFModel, - StratifiedRaySampler, - VoidNeRFModel, -) + +from .renderer import ShapERenderer logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -51,52 +44,6 @@ ``` """ - -def merge_results(self, a: [torch.Tensor], b: torch.Tensor, dim: int, indices: torch.Tensor): - """ - :param a: [..., n_a, ...]. The other dictionary containing the b's may - contain extra tensors from earlier calculations, so a can be None. - :param b: [..., n_b, ...] :param dim: dimension to merge :param indices: how the merged results should be sorted at - the end :return: a concatted and sorted tensor of size [..., n_a + n_b, ...] - """ - merged = torch.cat([a, b], dim=dim) - return torch.gather(merged, dim=dim, index=torch.broadcast_to(indices, merged.shape)) - - -def integrate_samples(volume_range, ts, density, channels): - r""" - Function integrating the model output. - - Args: - volume_range: Specifies the integral range [t0, t1] - ts: timesteps - density: torch.Tensor [batch_size, *shape, n_samples, 1] - channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] - returns: - channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density - *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume - ) - """ - - # 1. Calculate the weights - _, _, dt = volume_range.partition(ts) - ddensity = density * dt - - mass = torch.cumsum(ddensity, dim=-2) - transmittance = torch.exp(-mass[..., -1, :]) - - alphas = 1.0 - torch.exp(-ddensity) - Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2)) - # This is the probability of light hitting and reflecting off of - # something at depth [..., i, :]. - weights = alphas * Ts - - # 2. Integrate channels - channels = torch.sum(channels * weights, dim=-2) - - return channels, weights, transmittance - - @dataclass class ShapEPipelineOutput(BaseOutput): """ @@ -135,8 +82,7 @@ def __init__( text_encoder: CLIPTextModelWithProjection, tokenizer: CLIPTokenizer, scheduler: HeunDiscreteScheduler, - params_proj: ShapEParamsProjModel, - renderer: MLPNeRSTFModel, + renderer: ShapERenderer, ): super().__init__() @@ -145,11 +91,8 @@ def __init__( text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler, - params_proj=params_proj, renderer=renderer, ) - self.void = VoidNeRFModel(background=[0.0, 0.0, 0.0], channel_scale=255.0) - self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -249,82 +192,6 @@ def _encode_prompt( return prompt_embeds - @torch.no_grad() - def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): - """ - Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written below - with some abuse of notations) - - C(r) := sum( - transmittance(t[i]) * integrate( - lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]], - ) for i in range(len(parts)) - ) + transmittance(t[-1]) * void_model(t[-1]).channels - - where - - 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through - the volume specified by [t[0], s]. 
(transmittance of 1 means light can pass freely) 2) density and channels are - obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t - where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the - shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and - transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], - math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). - - args: - rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: - number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including - - :return: A tuple of - - `channels` - - A importance samplers for additional fine-grained rendering - - raw model output - """ - origin, direction = rays[..., 0, :], rays[..., 1, :] - - # Integrate over [t[i], t[i + 1]] - - # 1 Intersect the rays with the current volume and sample ts to integrate along. - vrange = self.volume.intersect(origin, direction, t0_lower=None) - ts = sampler.sample(vrange.t0, vrange.t1, n_samples) - ts = ts.to(rays.dtype) - - if prev_model_out is not None: - # Append the previous ts now before fprop because previous - # rendering used a different model and we can't reuse the output. - ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values - - batch_size, *_shape, _t0_dim = vrange.t0.shape - _, *ts_shape, _ts_dim = ts.shape - - # 2. Get the points along the ray and query the model - directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) - positions = origin.unsqueeze(-2) + ts * directions - - optional_directions = directions if render_with_direction else None - - model_out = self.renderer( - position=positions, - direction=optional_directions, - ts=ts, - nerf_level="coarse" if prev_model_out is None else "fine", - ) - - # 3. Integrate the model results - channels, weights, transmittance = integrate_samples( - vrange, model_out.ts, model_out.density, model_out.channels - ) - - # 4. Clean up results that do not intersect with the volume. - transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance)) - channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels)) - # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). 
- channels = channels + transmittance * self.void(origin) - - weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) - - return channels, weighted_sampler, model_out - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -450,39 +317,7 @@ def __call__( if output_type == "latent": return ShapEPipelineOutput(images=latents) - # project the the paramters from the generated latents - projected_params = self.params_proj(latents) - - # update the mlp layers of the renderer - for name, param in self.renderer.state_dict().items(): - if f"nerstf.{name}" in projected_params.keys(): - param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) - - # create cameras object - camera = create_pan_cameras(size) - rays = camera.camera_rays - rays = rays.to(device) - n_batches = rays.shape[1] // ray_batch_size - - coarse_sampler = StratifiedRaySampler() - - images = [] - with self.progress_bar(total=n_batches) as progress_bar: - for idx in range(n_batches): - rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] - - # render rays with coarse, stratified samples. - _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) - # Then, render with additional importance-weighted ray samples. - channels, _, _ = self.render_rays( - rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out - ) - - images.append(channels) - progress_bar.update() - - images = torch.cat(images, dim=1) - images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) + images = self.renderer.decode(latents) if output_type not in ["np", "pil"]: raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 5ada113224f6..042ee00ea5a2 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -14,7 +14,7 @@ import math from dataclasses import dataclass -from typing import Optional +from typing import Optional, Tuple import numpy as np import torch @@ -24,6 +24,9 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin from ...utils import BaseOutput +from .camera import create_pan_cameras + +from transformers import PreTrainedModel def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: @@ -77,6 +80,44 @@ def encode_direction(position, direction=None): return posenc_nerf(direction, min_deg=0, max_deg=8) +def _sanitize_name(x: str) -> str: + return x.replace(".", "__") + + +def integrate_samples(volume_range, ts, density, channels): + r""" + Function integrating the model output. + + Args: + volume_range: Specifies the integral range [t0, t1] + ts: timesteps + density: torch.Tensor [batch_size, *shape, n_samples, 1] + channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] + returns: + channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density + *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume + ) + """ + + # 1. 
Calculate the weights + _, _, dt = volume_range.partition(ts) + ddensity = density * dt + + mass = torch.cumsum(ddensity, dim=-2) + transmittance = torch.exp(-mass[..., -1, :]) + + alphas = 1.0 - torch.exp(-ddensity) + Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2)) + # This is the probability of light hitting and reflecting off of + # something at depth [..., i, :]. + weights = alphas * Ts + + # 2. Integrate channels + channels = torch.sum(channels * weights, dim=-2) + + return channels, weights, transmittance + + class VoidNeRFModel(nn.Module): """ Implements the default empty space model where all queries are rendered as background. @@ -336,7 +377,6 @@ class MLPNeRFModelOutput(BaseOutput): channels: torch.Tensor ts: torch.Tensor - class MLPNeRSTFModel(ModelMixin, ConfigMixin): @register_to_config def __init__( @@ -427,3 +467,233 @@ def forward(self, *, position, direction, ts, nerf_level="coarse"): # yiyi notes: I think signed_distance is not used return MLPNeRFModelOutput(density=density, signed_distance=signed_distance, channels=channels, ts=ts) + + +class ChannelsProj(nn.Module): + def __init__( + self, + *, + vectors: int, + channels: int, + d_latent: int, + ): + super().__init__() + self.proj = nn.Linear(d_latent, vectors * channels) + self.norm = nn.LayerNorm(channels) + self.d_latent = d_latent + self.vectors = vectors + self.channels = channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_bvd = x + w_vcd = self.proj.weight.view(self.vectors, self.channels, self.d_latent) + b_vc = self.proj.bias.view(1, self.vectors, self.channels) + h = torch.einsum("bvd,vcd->bvc", x_bvd, w_vcd) + h = self.norm(h) + + h = h + b_vc + return h + + +class ShapEParamsProjModel(ModelMixin, ConfigMixin): + """ + project the latent representation of a 3D asset to obtain weights of a multi-layer perceptron (MLP). 
+ + For more details, see the original paper: + """ + + @register_to_config + def __init__( + self, + *, + param_names: Tuple[str] = ( + "nerstf.mlp.0.weight", + "nerstf.mlp.1.weight", + "nerstf.mlp.2.weight", + "nerstf.mlp.3.weight", + ), + param_shapes: Tuple[Tuple[int]] = ( + (256, 93), + (256, 256), + (256, 256), + (256, 256), + ), + d_latent: int = 1024, + ): + super().__init__() + + # check inputs + if len(param_names) != len(param_shapes): + raise ValueError("Must provide same number of `param_names` as `param_shapes`") + self.projections = nn.ModuleDict({}) + for k, (vectors, channels) in zip(param_names, param_shapes): + self.projections[_sanitize_name(k)] = ChannelsProj( + vectors=vectors, + channels=channels, + d_latent=d_latent, + ) + + def forward(self, x: torch.Tensor): + out = {} + start = 0 + for k, shape in zip(self.config.param_names, self.config.param_shapes): + vectors, _ = shape + end = start + vectors + x_bvd = x[:, start:end] + out[k] = self.projections[_sanitize_name(k)](x_bvd).reshape(len(x), *shape) + start = end + return out + + +class ShapERenderer(ModelMixin, ConfigMixin): + + @register_to_config + def __init__( + self, + *, + param_names: Tuple[str] = ( + "nerstf.mlp.0.weight", + "nerstf.mlp.1.weight", + "nerstf.mlp.2.weight", + "nerstf.mlp.3.weight", + ), + param_shapes: Tuple[Tuple[int]] = ( + (256, 93), + (256, 256), + (256, 256), + (256, 256), + ), + d_latent: int = 1024, + d_hidden: int = 256, + n_output: int = 12, + n_hidden_layers: int = 6, + act_fn: str = "swish", + insert_direction_at: int = 4,): + + super().__init__() + + self.params_proj = ShapEParamsProjModel( + param_names = param_names, + param_shapes = param_shapes, + d_latent = d_latent, + ) + self.mlp = MLPNeRSTFModel(d_hidden, n_output, n_hidden_layers, act_fn, insert_direction_at) + self.void = VoidNeRFModel(background=[0.0, 0.0, 0.0], channel_scale=255.0) + self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) + + @torch.no_grad() + def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): + """ + Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written below + with some abuse of notations) + + C(r) := sum( + transmittance(t[i]) * integrate( + lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]], + ) for i in range(len(parts)) + ) + transmittance(t[-1]) * void_model(t[-1]).channels + + where + + 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through + the volume specified by [t[0], s]. (transmittance of 1 means light can pass freely) 2) density and channels are + obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t + where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the + shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and + transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], + math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + + args: + rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: + number of ts to sample. 
prev_model_outputs: model outputs from the previous rendering step, including + + :return: A tuple of + - `channels` + - A importance samplers for additional fine-grained rendering + - raw model output + """ + origin, direction = rays[..., 0, :], rays[..., 1, :] + + # Integrate over [t[i], t[i + 1]] + + # 1 Intersect the rays with the current volume and sample ts to integrate along. + vrange = self.volume.intersect(origin, direction, t0_lower=None) + ts = sampler.sample(vrange.t0, vrange.t1, n_samples) + ts = ts.to(rays.dtype) + + if prev_model_out is not None: + # Append the previous ts now before fprop because previous + # rendering used a different model and we can't reuse the output. + ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values + + batch_size, *_shape, _t0_dim = vrange.t0.shape + _, *ts_shape, _ts_dim = ts.shape + + # 2. Get the points along the ray and query the model + directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) + positions = origin.unsqueeze(-2) + ts * directions + + optional_directions = directions if render_with_direction else None + + model_out = self.mlp( + position=positions, + direction=optional_directions, + ts=ts, + nerf_level="coarse" if prev_model_out is None else "fine", + ) + + # 3. Integrate the model results + channels, weights, transmittance = integrate_samples( + vrange, model_out.ts, model_out.density, model_out.channels + ) + + # 4. Clean up results that do not intersect with the volume. + transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance)) + channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels)) + # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + channels = channels + transmittance * self.void(origin) + + weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) + + return channels, weighted_sampler, model_out + + @torch.no_grad() + def decode(self, latents: torch.FloatTensor): + + # project the the paramters from the generated latents + projected_params = self.params_proj(latents) + + # update the mlp layers of the renderer + for name, param in self.mlp.state_dict().items(): + if f"nerstf.{name}" in projected_params.keys(): + param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) + + # create cameras object + camera = create_pan_cameras(size) + rays = camera.camera_rays + rays = rays.to(device) + n_batches = rays.shape[1] // ray_batch_size + + coarse_sampler = StratifiedRaySampler() + + images = [] + + for idx in range(n_batches): + rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] + + # render rays with coarse, stratified samples. + _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) + # Then, render with additional importance-weighted ray samples. 
+ channels, _, _ = self.render_rays( + rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out + ) + + images.append(channels) + + images = torch.cat(images, dim=1) + images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) + + return images + + + From 172c2db398ea0b19143da7f27a1e6daa882cb6aa Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 07:21:40 +0000 Subject: [PATCH 046/119] fix --- scripts/convert_shap_e_to_diffusers.py | 16 +++++++++------- .../pipelines/shap_e/pipeline_shap_e.py | 9 ++++++++- src/diffusers/pipelines/shap_e/renderer.py | 10 +++++++++- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 85ade1b1d8a6..7f1ac7c84627 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -22,7 +22,7 @@ --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \ --prior_image_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/image_cond.pt \ --transmitter_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\ - --dump_path /home/yiyi_huggingface_co/model_repo/test-shap-e-renderer\ + --dump_path /home/yiyi_huggingface_co/model_repo/shap-e/renderer\ --debug renderer ``` """ @@ -389,13 +389,16 @@ def renderer_model_from_original_config(): def renderer_model_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): diffusers_checkpoint = {} - diffusers_checkpoint.update( - { - "mlp": {k: checkpoint[f"{RENDERER_MLP_ORIGINAL_PREFIX}.{k}"] for k in model.mlp.state_dict().keys()}, - "params_proj": {k: checkpoint[f"{RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.params_proj.state_dict().keys()}, + {f"mlp.{k}": checkpoint[f"{RENDERER_MLP_ORIGINAL_PREFIX}.{k}"] for k in model.mlp.state_dict().keys()} + ) + + diffusers_checkpoint.update( + {f"params_proj.{k}": checkpoint[f"{RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.params_proj.state_dict().keys()} + ) - } + diffusers_checkpoint.update( + {"void.background": torch.tensor([0.0, 0.0, 0.0], dtype=torch.float32)} ) return diffusers_checkpoint @@ -495,7 +498,6 @@ def renderer(*, args, checkpoint_map_location): # prior model will expect clip_mean and clip_std, whic are missing from the state_dict PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"] - def load_prior_checkpoint_to_model(checkpoint, model): with tempfile.NamedTemporaryFile() as file: torch.save(checkpoint, file.name) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index a58fa092038c..4e5275272e63 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -317,7 +317,14 @@ def __call__( if output_type == "latent": return ShapEPipelineOutput(images=latents) - images = self.renderer.decode(latents) + images = self.renderer.decode( + latents, + device, + size=size, + ray_batch_size=ray_batch_size, + n_coarse_samples=n_coarse_samples, + n_fine_samples=n_fine_samples, + ) if output_type not in ["np", "pil"]: raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 042ee00ea5a2..1d3d66796e80 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -658,7 +658,15 @@ 
def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with return channels, weighted_sampler, model_out @torch.no_grad() - def decode(self, latents: torch.FloatTensor): + def decode( + self, + latents, + device, + size: int = 64, + ray_batch_size: int = 4096, + n_coarse_samples=64, + n_fine_samples=128, + ): # project the the paramters from the generated latents projected_params = self.params_proj(latents) From d8bb607c0de1c78dd5e7bdcc3bfe1e7c15e9dfc9 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 07:24:19 +0000 Subject: [PATCH 047/119] make style --- scripts/convert_shap_e_to_diffusers.py | 29 +++++++++--------- src/diffusers/pipelines/shap_e/__init__.py | 4 +-- .../pipelines/shap_e/pipeline_shap_e.py | 10 +++---- src/diffusers/pipelines/shap_e/renderer.py | 30 ++++++++----------- 4 files changed, 34 insertions(+), 39 deletions(-) diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py index 7f1ac7c84627..d92db176f422 100644 --- a/scripts/convert_shap_e_to_diffusers.py +++ b/scripts/convert_shap_e_to_diffusers.py @@ -5,7 +5,7 @@ from accelerate import load_checkpoint_and_dispatch from diffusers.models.prior_transformer import PriorTransformer -from diffusers.pipelines.shap_e import MLPNeRSTFModel, ShapEParamsProjModel, ShapERenderer +from diffusers.pipelines.shap_e import ShapERenderer """ @@ -371,7 +371,7 @@ def prior_image_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): # done prior_image -# renderer +# renderer RENDERER_CONFIG = {} @@ -386,23 +386,25 @@ def renderer_model_from_original_config(): RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX = "encoder.params_proj" + def renderer_model_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): - diffusers_checkpoint = {} diffusers_checkpoint.update( {f"mlp.{k}": checkpoint[f"{RENDERER_MLP_ORIGINAL_PREFIX}.{k}"] for k in model.mlp.state_dict().keys()} ) - - diffusers_checkpoint.update( - {f"params_proj.{k}": checkpoint[f"{RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] for k in model.params_proj.state_dict().keys()} - ) diffusers_checkpoint.update( - {"void.background": torch.tensor([0.0, 0.0, 0.0], dtype=torch.float32)} + { + f"params_proj.{k}": checkpoint[f"{RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] + for k in model.params_proj.state_dict().keys() + } ) + diffusers_checkpoint.update({"void.background": torch.tensor([0.0, 0.0, 0.0], dtype=torch.float32)}) + return diffusers_checkpoint + # done renderer @@ -477,17 +479,18 @@ def prior_image(*, args, checkpoint_map_location): def renderer(*, args, checkpoint_map_location): - print(" loading renderer") renderer_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location) renderer_model = renderer_model_from_original_config() - renderer_diffusers_checkpoint = renderer_model_original_checkpoint_to_diffusers_checkpoint(renderer_model, renderer_checkpoint) + renderer_diffusers_checkpoint = renderer_model_original_checkpoint_to_diffusers_checkpoint( + renderer_model, renderer_checkpoint + ) del renderer_checkpoint - + load_checkpoint_to_model(renderer_diffusers_checkpoint, renderer_model, strict=True) print("done loading renderer") @@ -498,6 +501,7 @@ def renderer(*, args, checkpoint_map_location): # prior model will expect clip_mean and clip_std, whic are missing from the state_dict PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"] + def load_prior_checkpoint_to_model(checkpoint, model): with tempfile.NamedTemporaryFile() as file: torch.save(checkpoint, file.name) @@ 
-583,9 +587,6 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): elif args.debug == "prior_image": prior_model = prior_image(args=args, checkpoint_map_location=checkpoint_map_location) prior_model.save_pretrained(args.dump_path) - elif args.debug == "params_proj": - params_proj_model = params_proj(args=args, checkpoint_map_location=checkpoint_map_location) - params_proj_model.save_pretrained(args.dump_path) elif args.debug == "renderer": renderer_model = renderer(args=args, checkpoint_map_location=checkpoint_map_location) renderer_model.save_pretrained(args.dump_path) diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py index 7f4333ff26a4..04aa1f2f6d78 100644 --- a/src/diffusers/pipelines/shap_e/__init__.py +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -20,8 +20,8 @@ ImportanceRaySampler, MLPNeRFModelOutput, MLPNeRSTFModel, - StratifiedRaySampler, ShapEParamsProjModel, - VoidNeRFModel, ShapERenderer, + StratifiedRaySampler, + VoidNeRFModel, ) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 4e5275272e63..c5f9d57aeef5 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -31,7 +31,6 @@ randn_tensor, replace_example_docstring, ) - from .renderer import ShapERenderer @@ -44,6 +43,7 @@ ``` """ + @dataclass class ShapEPipelineOutput(BaseOutput): """ @@ -318,13 +318,13 @@ def __call__( return ShapEPipelineOutput(images=latents) images = self.renderer.decode( - latents, + latents, device, - size=size, - ray_batch_size=ray_batch_size, + size=size, + ray_batch_size=ray_batch_size, n_coarse_samples=n_coarse_samples, n_fine_samples=n_fine_samples, - ) + ) if output_type not in ["np", "pil"]: raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 1d3d66796e80..3a1045e17aba 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -26,8 +26,6 @@ from ...utils import BaseOutput from .camera import create_pan_cameras -from transformers import PreTrainedModel - def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: r""" @@ -377,6 +375,7 @@ class MLPNeRFModelOutput(BaseOutput): channels: torch.Tensor ts: torch.Tensor + class MLPNeRSTFModel(ModelMixin, ConfigMixin): @register_to_config def __init__( @@ -546,7 +545,6 @@ def forward(self, x: torch.Tensor): class ShapERenderer(ModelMixin, ConfigMixin): - @register_to_config def __init__( self, @@ -568,14 +566,14 @@ def __init__( n_output: int = 12, n_hidden_layers: int = 6, act_fn: str = "swish", - insert_direction_at: int = 4,): - + insert_direction_at: int = 4, + ): super().__init__() self.params_proj = ShapEParamsProjModel( - param_names = param_names, - param_shapes = param_shapes, - d_latent = d_latent, + param_names=param_names, + param_shapes=param_shapes, + d_latent=d_latent, ) self.mlp = MLPNeRSTFModel(d_hidden, n_output, n_hidden_layers, act_fn, insert_direction_at) self.void = VoidNeRFModel(background=[0.0, 0.0, 0.0], channel_scale=255.0) @@ -656,18 +654,17 @@ def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) return channels, weighted_sampler, model_out - + @torch.no_grad() def decode( - self, - latents, - device, + self, 
+ latents, + device, size: int = 64, ray_batch_size: int = 4096, n_coarse_samples=64, n_fine_samples=128, - ): - + ): # project the the paramters from the generated latents projected_params = self.params_proj(latents) @@ -675,7 +672,7 @@ def decode( for name, param in self.mlp.state_dict().items(): if f"nerstf.{name}" in projected_params.keys(): param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) - + # create cameras object camera = create_pan_cameras(size) rays = camera.camera_rays @@ -702,6 +699,3 @@ def decode( images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) return images - - - From 51d84b21fcc89528b6403c1cc9f5c5c208890e7a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 07:41:48 +0000 Subject: [PATCH 048/119] refactor img2img --- .../shap_e/pipeline_shap_e_img2img.py | 178 ++---------------- 1 file changed, 11 insertions(+), 167 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 1cc0c4f32080..74cc5dfe326c 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -32,13 +32,7 @@ ) from .camera import create_pan_cameras from .params_proj import ShapEParamsProjModel -from .renderer import ( - BoundingBoxVolume, - ImportanceRaySampler, - MLPNeRSTFModel, - StratifiedRaySampler, - VoidNeRFModel, -) +from .renderer import ShapERenderer logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -51,51 +45,6 @@ """ -def merge_results(self, a: [torch.Tensor], b: torch.Tensor, dim: int, indices: torch.Tensor): - """ - :param a: [..., n_a, ...]. The other dictionary containing the b's may - contain extra tensors from earlier calculations, so a can be None. - :param b: [..., n_b, ...] :param dim: dimension to merge :param indices: how the merged results should be sorted at - the end :return: a concatted and sorted tensor of size [..., n_a + n_b, ...] - """ - merged = torch.cat([a, b], dim=dim) - return torch.gather(merged, dim=dim, index=torch.broadcast_to(indices, merged.shape)) - - -def integrate_samples(volume_range, ts, density, channels): - r""" - Function integrating the model output. - - Args: - volume_range: Specifies the integral range [t0, t1] - ts: timesteps - density: torch.Tensor [batch_size, *shape, n_samples, 1] - channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] - returns: - channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density - *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume - ) - """ - - # 1. Calculate the weights - _, _, dt = volume_range.partition(ts) - ddensity = density * dt - - mass = torch.cumsum(ddensity, dim=-2) - transmittance = torch.exp(-mass[..., -1, :]) - - alphas = 1.0 - torch.exp(-ddensity) - Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2)) - # This is the probability of light hitting and reflecting off of - # something at depth [..., i, :]. - weights = alphas * Ts - - # 2. 
Integrate channels - channels = torch.sum(channels * weights, dim=-2) - - return channels, weights, transmittance - - @dataclass class ShapEPipelineOutput(BaseOutput): """ @@ -134,8 +83,7 @@ def __init__( image_encoder: CLIPVisionModel, image_processor: CLIPImageProcessor, scheduler: HeunDiscreteScheduler, - params_proj: ShapEParamsProjModel, - renderer: MLPNeRSTFModel, + renderer: ShapERenderer, ): super().__init__() @@ -144,11 +92,8 @@ def __init__( image_encoder=image_encoder, image_processor=image_processor, scheduler=scheduler, - params_proj=params_proj, renderer=renderer, ) - self.void = VoidNeRFModel(background=[0.0, 0.0, 0.0], channel_scale=255.0) - self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -230,82 +175,6 @@ def _encode_image( return image_embeds - @torch.no_grad() - def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): - """ - Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written below - with some abuse of notations) - - C(r) := sum( - transmittance(t[i]) * integrate( - lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]], - ) for i in range(len(parts)) - ) + transmittance(t[-1]) * void_model(t[-1]).channels - - where - - 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through - the volume specified by [t[0], s]. (transmittance of 1 means light can pass freely) 2) density and channels are - obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t - where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the - shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and - transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], - math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). - - args: - rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: - number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including - - :return: A tuple of - - `channels` - - A importance samplers for additional fine-grained rendering - - raw model output - """ - origin, direction = rays[..., 0, :], rays[..., 1, :] - - # Integrate over [t[i], t[i + 1]] - - # 1 Intersect the rays with the current volume and sample ts to integrate along. - vrange = self.volume.intersect(origin, direction, t0_lower=None) - ts = sampler.sample(vrange.t0, vrange.t1, n_samples) - ts = ts.to(rays.dtype) - - if prev_model_out is not None: - # Append the previous ts now before fprop because previous - # rendering used a different model and we can't reuse the output. - ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values - - batch_size, *_shape, _t0_dim = vrange.t0.shape - _, *ts_shape, _ts_dim = ts.shape - - # 2. 
Get the points along the ray and query the model - directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) - positions = origin.unsqueeze(-2) + ts * directions - - optional_directions = directions if render_with_direction else None - - model_out = self.renderer( - position=positions, - direction=optional_directions, - ts=ts, - nerf_level="coarse" if prev_model_out is None else "fine", - ) - - # 3. Integrate the model results - channels, weights, transmittance = integrate_samples( - vrange, model_out.ts, model_out.density, model_out.channels - ) - - # 4. Clean up results that do not intersect with the volume. - transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance)) - channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels)) - # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). - channels = channels + transmittance * self.void(origin) - - weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) - - return channels, weighted_sampler, model_out - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -430,40 +299,15 @@ def __call__( # YiYi testing only: I don't think we need to return latent for this pipeline if output_type == "latent": return ShapEPipelineOutput(images=latents) - - # project the the paramters from the generated latents - projected_params = self.params_proj(latents) - - # update the mlp layers of the renderer - for name, param in self.renderer.state_dict().items(): - if f"nerstf.{name}" in projected_params.keys(): - param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) - - # create cameras object - camera = create_pan_cameras(size) - rays = camera.camera_rays - rays = rays.to(device) - n_batches = rays.shape[1] // ray_batch_size - - coarse_sampler = StratifiedRaySampler() - - images = [] - with self.progress_bar(total=n_batches) as progress_bar: - for idx in range(n_batches): - rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] - - # render rays with coarse, stratified samples. - _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) - # Then, render with additional importance-weighted ray samples. 
- channels, _, _ = self.render_rays( - rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out - ) - - images.append(channels) - progress_bar.update() - - images = torch.cat(images, dim=1) - images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) + + images = self.renderer.decode( + latents, + device, + size=size, + ray_batch_size=ray_batch_size, + n_coarse_samples=n_coarse_samples, + n_fine_samples=n_fine_samples, + ) if output_type not in ["np", "pil"]: raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") From 30b5391c2da152fddc8697a3d6d6362387fe2f26 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 07:46:45 +0000 Subject: [PATCH 049/119] remove params_proj --- src/diffusers/pipelines/shap_e/params_proj.py | 101 ------------------ .../shap_e/pipeline_shap_e_img2img.py | 4 +- 2 files changed, 1 insertion(+), 104 deletions(-) delete mode 100644 src/diffusers/pipelines/shap_e/params_proj.py diff --git a/src/diffusers/pipelines/shap_e/params_proj.py b/src/diffusers/pipelines/shap_e/params_proj.py deleted file mode 100644 index 47098e92d20e..000000000000 --- a/src/diffusers/pipelines/shap_e/params_proj.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Tuple - -import torch -from torch import nn - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin - - -class ChannelsProj(nn.Module): - def __init__( - self, - *, - vectors: int, - channels: int, - d_latent: int, - ): - super().__init__() - self.proj = nn.Linear(d_latent, vectors * channels) - self.norm = nn.LayerNorm(channels) - self.d_latent = d_latent - self.vectors = vectors - self.channels = channels - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x_bvd = x - w_vcd = self.proj.weight.view(self.vectors, self.channels, self.d_latent) - b_vc = self.proj.bias.view(1, self.vectors, self.channels) - h = torch.einsum("bvd,vcd->bvc", x_bvd, w_vcd) - h = self.norm(h) - - h = h + b_vc - return h - - -class ShapEParamsProjModel(ModelMixin, ConfigMixin): - """ - project the latent representation of a 3D asset to obtain weights of a multi-layer perceptron (MLP). 
- - For more details, see the original paper: - """ - - @register_to_config - def __init__( - self, - *, - param_names: Tuple[str] = ( - "nerstf.mlp.0.weight", - "nerstf.mlp.1.weight", - "nerstf.mlp.2.weight", - "nerstf.mlp.3.weight", - ), - param_shapes: Tuple[Tuple[int]] = ( - (256, 93), - (256, 256), - (256, 256), - (256, 256), - ), - d_latent: int = 1024, - ): - super().__init__() - - # check inputs - if len(param_names) != len(param_shapes): - raise ValueError("Must provide same number of `param_names` as `param_shapes`") - self.projections = nn.ModuleDict({}) - for k, (vectors, channels) in zip(param_names, param_shapes): - self.projections[_sanitize_name(k)] = ChannelsProj( - vectors=vectors, - channels=channels, - d_latent=d_latent, - ) - - def forward(self, x: torch.Tensor): - out = {} - start = 0 - for k, shape in zip(self.config.param_names, self.config.param_shapes): - vectors, _ = shape - end = start + vectors - x_bvd = x[:, start:end] - out[k] = self.projections[_sanitize_name(k)](x_bvd).reshape(len(x), *shape) - start = end - return out - - -def _sanitize_name(x: str) -> str: - return x.replace(".", "__") diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 74cc5dfe326c..52f82ea7b1bd 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -30,8 +30,6 @@ randn_tensor, replace_example_docstring, ) -from .camera import create_pan_cameras -from .params_proj import ShapEParamsProjModel from .renderer import ShapERenderer @@ -299,7 +297,7 @@ def __call__( # YiYi testing only: I don't think we need to return latent for this pipeline if output_type == "latent": return ShapEPipelineOutput(images=latents) - + images = self.renderer.decode( latents, device, From 06306265482ff3e73d6c32be39b2fb843715146e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 29 Jun 2023 23:36:37 +0000 Subject: [PATCH 050/119] add test --- src/diffusers/pipelines/shap_e/renderer.py | 3 +- tests/pipelines/shap_e/__init__.py | 0 tests/pipelines/shap_e/test_shap_e.py | 197 +++++++++++++++++++++ 3 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 tests/pipelines/shap_e/__init__.py create mode 100644 tests/pipelines/shap_e/test_shap_e.py diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 3a1045e17aba..9b81220c43b4 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -567,6 +567,7 @@ def __init__( n_hidden_layers: int = 6, act_fn: str = "swish", insert_direction_at: int = 4, + background: Tuple[float] = (0.0, 0.0, 0.0,), ): super().__init__() @@ -576,7 +577,7 @@ def __init__( d_latent=d_latent, ) self.mlp = MLPNeRSTFModel(d_hidden, n_output, n_hidden_layers, act_fn, insert_direction_at) - self.void = VoidNeRFModel(background=[0.0, 0.0, 0.0], channel_scale=255.0) + self.void = VoidNeRFModel(background=background, channel_scale=255.0) self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) @torch.no_grad() diff --git a/tests/pipelines/shap_e/__init__.py b/tests/pipelines/shap_e/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py new file mode 100644 index 000000000000..7e01479e1c89 --- /dev/null +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -0,0 +1,197 @@ +# Copyright 2023 HuggingFace Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch + +from diffusers import ShapEPipeline, HeunDiscreteScheduler, PriorTransformer +from diffusers.pipelines.shap_e import ShapERenderer +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + +enable_full_determinism() + +class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = ShapEPipeline + params = ["prompt"] + batch_params = ["prompt"] + required_optional_params = [ + "num_images_per_prompt", + "num_inference_steps", + "generator", + "latents", + "guidance_scale" + "size", + "ray_batch_size", + "n_coarse_samples", + "n_fine_samples", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def renderer_dim(self): + return 8 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 16, + "embedding_dim": self.time_input_dim, + "num_embeddings": 32, + "embedding_proj_dim": self.text_embedder_hidden_size, + "time_embed_dim": self.time_embed_dim, + "num_layers": 1, + "clip_embed_dim": self.time_input_dim * 2, + "additional_embeddings": 0, + "time_embed_act_fn": "gelu", + "norm_in_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, + } + + model = PriorTransformer(**model_kwargs) + return model + + @property + def dummy_renderer(self): + torch.manual_seed(0) + + model_kwargs = { + "param_shapes": ( + (self.renderer_dim, 93), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + ), + "d_latent": self.time_input_dim, + "d_hidden": self.renderer_dim, + "n_output": 12, + "background": (1.0, 1.0, 1.0,), + } + model = ShapERenderer(**model_kwargs) + return model + + def get_dummy_components(self): + prior = self.dummy_prior + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + renderer = self.dummy_renderer + + scheduler = HeunDiscreteScheduler( + beta_schedule="exp", + num_train_timesteps=1024, + prediction_type="sample", + 
use_karras_sigmas=False, + ) + components = { + "prior": prior, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "renderer": renderer, + "scheduler": scheduler, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "generator": generator, + "num_inference_steps": 2, + "size":64, + "output_type": "np", + } + return inputs + + def test_shap_e(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (20, 64, 64, 3) + + expected_slice = np.array( + [ + 0.00392157, + 0.00392157, + 0.00392157, + 0.00392157, + 0.00392157, + 0.00392157, + 0.00392157, + 0.00392157, + 0.00392157 + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file From d92350ef913f0bcac465482e3b91ae5c97eb9a5b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 01:21:56 +0000 Subject: [PATCH 051/119] add upcast_softmax to prior_transformer --- src/diffusers/models/attention.py | 3 +++ src/diffusers/models/prior_transformer.py | 2 ++ src/diffusers/pipelines/shap_e/renderer.py | 5 ++++- tests/pipelines/shap_e/test_shap_e.py | 5 +++-- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 8805257ebe9a..0cbed5457d4e 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -58,6 +58,7 @@ def __init__( only_cross_attention: bool = False, double_self_attention: bool = False, upcast_attention: bool = False, + upcast_softmax: bool = False, norm_elementwise_affine: bool = True, norm_type: str = "layer_norm", final_dropout: bool = False, @@ -90,6 +91,7 @@ def __init__( bias=attention_bias, cross_attention_dim=cross_attention_dim if only_cross_attention else None, upcast_attention=upcast_attention, + upcast_softmax=upcast_softmax, ) # 2. Cross-Attn @@ -110,6 +112,7 @@ def __init__( dropout=dropout, bias=attention_bias, upcast_attention=upcast_attention, + upcast_softmax=upcast_softmax, ) # is self-attn if encoder_hidden_states is none else: self.norm2 = None diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 7ffc9ccff5ea..094b3d3b8fd4 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -83,6 +83,7 @@ def __init__( time_embed_dim: Optional[int] = None, embedding_proj_dim: Optional[int] = None, clip_embed_dim: Optional[int] = None, + upcast_softmax: bool = False, ): super().__init__() self.num_attention_heads = num_attention_heads @@ -131,6 +132,7 @@ def __init__( dropout=dropout, activation_fn="gelu", attention_bias=True, + upcast_softmax=upcast_softmax, ) for d in range(num_layers) ] diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 9b81220c43b4..3eb8225d395f 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -631,7 +631,10 @@ def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with # 2. 
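The `upcast_softmax` flag threaded through the attention blocks above boils down to running the softmax in float32 even when the rest of the layer works in half precision, then casting the probabilities back. A rough sketch of the idea, not the actual `Attention` implementation:

```py
import torch

def attention_with_upcast_softmax(q, k, v, upcast_softmax=True):
    # score computation stays in the working dtype ...
    scale = q.shape[-1] ** -0.5
    scores = (q @ k.transpose(-1, -2)) * scale
    # ... but the softmax gets float32 headroom, then casts back to the value dtype
    if upcast_softmax:
        scores = scores.float()
    probs = scores.softmax(dim=-1).to(v.dtype)
    return probs @ v

q = k = v = torch.randn(1, 4, 8)
out = attention_with_upcast_softmax(q, k, v)
print(out.shape)  # torch.Size([1, 4, 8])
```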
Get the points along the ray and query the model directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) positions = origin.unsqueeze(-2) + ts * directions - + + directions = directions.to(self.mlp.dtype) + positions = positions.to(self.mlp.dtype) + optional_directions = directions if render_with_direction else None model_out = self.mlp( diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 7e01479e1c89..e134c59f9ded 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -38,7 +38,7 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "num_inference_steps", "generator", "latents", - "guidance_scale" + "guidance_scale", "size", "ray_batch_size", "n_coarse_samples", @@ -104,6 +104,7 @@ def dummy_prior(self): "norm_in_type": "layer", "encoder_hid_proj_type": None, "added_emb_type": None, + "upcast_softmax": True, } model = PriorTransformer(**model_kwargs) @@ -123,7 +124,7 @@ def dummy_renderer(self): "d_latent": self.time_input_dim, "d_hidden": self.renderer_dim, "n_output": 12, - "background": (1.0, 1.0, 1.0,), + "background": (1e-8, 1e-8, 1e-8,), } model = ShapERenderer(**model_kwargs) return model From 2dfc35131c4570a341497e6a96371afdfca4f63a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 03:04:36 +0000 Subject: [PATCH 052/119] enable num_images_per_prompt, add save_gif utility --- .../pipelines/shap_e/pipeline_shap_e.py | 46 +++++++++++++------ tests/pipelines/shap_e/test_shap_e.py | 37 +++++++++++++-- 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index c5f9d57aeef5..a38cb5cb42a3 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -54,7 +54,7 @@ class ShapEPipelineOutput(BaseOutput): a list of images for 3D rendering """ - images: Union[PIL.Image.Image, np.ndarray] + images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]] class ShapEPipeline(DiffusionPipeline): @@ -191,12 +191,24 @@ def _encode_prompt( prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds return prompt_embeds + + @staticmethod + def save_gif(images:List[PIL.Image.Image], image_name: int, save_all=True, optimize=False, duration=100, loop=0): + images[0].save( + f"{image_name}.gif", + save_all=save_all, + append_images=images[1:], + optimize=optimize, + duration=duration, + loop=loop + ) + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - prompt: Union[str, List[str]], + prompt: str, num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -247,10 +259,10 @@ def __call__( [`ShapEPipelineOutput`] or `tuple` """ - if isinstance(prompt, str): + if isinstance(prompt, str) or isinstance(prompt, list) and len(prompt) ==1: batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) + elif isinstance(prompt, list) and len(prompt) > 1: + raise ValueError(f"this pipeline does not support more than one prompt") else: raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") @@ -316,15 +328,21 @@ def __call__( # YiYi testing only: I don't think we need to return latent for this pipeline if output_type == "latent": return ShapEPipelineOutput(images=latents) + + images = [] + for i, latent in enumerate(latents): + + 
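With the new `save_gif` helper, a list of rendered views of one object can be written straight to an animated GIF. A small usage sketch; the frames below are synthetic placeholders, and only the helper's signature is taken from the change above:

```py
from PIL import Image

from diffusers import ShapEPipeline

# twenty dummy frames standing in for the rendered turntable views of one object
frames = [Image.new("RGB", (64, 64), color=(12 * i, 0, 0)) for i in range(20)]

# save_gif is a @staticmethod, so it can be called without instantiating the pipeline
ShapEPipeline.save_gif(frames, "turntable", duration=100, loop=0)  # writes turntable.gif
```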
image = self.renderer.decode( + latent[None,:], + device, + size=size, + ray_batch_size=ray_batch_size, + n_coarse_samples=n_coarse_samples, + n_fine_samples=n_fine_samples, + ) + images.append(image) - images = self.renderer.decode( - latents, - device, - size=size, - ray_batch_size=ray_batch_size, - n_coarse_samples=n_coarse_samples, - n_fine_samples=n_fine_samples, - ) + images = torch.stack(images) if output_type not in ["np", "pil"]: raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") @@ -332,7 +350,7 @@ def __call__( images = images.cpu().numpy() if output_type == "pil": - images = self.numpy_to_pil(images) + images = [self.numpy_to_pil(image) for image in images] # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index e134c59f9ded..05cb1bbd9fc2 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -21,7 +21,7 @@ from diffusers import ShapEPipeline, HeunDiscreteScheduler, PriorTransformer from diffusers.pipelines.shap_e import ShapERenderer -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer @@ -32,7 +32,7 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = ShapEPipeline params = ["prompt"] - batch_params = ["prompt"] + batch_params = [] required_optional_params = [ "num_images_per_prompt", "num_inference_steps", @@ -176,7 +176,7 @@ def test_shap_e(self): pipe.set_progress_bar_config(disable=None) output = pipe(**self.get_dummy_inputs(device)) - image = output.images + image = output.images[0] image_slice = image[0, -3:, -3:, -1] assert image.shape == (20, 64, 64, 3) @@ -195,4 +195,33 @@ def test_shap_e(self): ] ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @unittest.skip(reason="Batching is not supported for this pipeline.") + def test_inference_batch_consistent(self): + pass + + @unittest.skip(reason="Batching is not supported for this pipeline.") + def test_inference_batch_single_identical(self): + pass + + # overwrite because: + # 1. this pipeline support num_images_per_prompt but does not support batching + # 2. 
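Since the renderer is now invoked once per latent, the output gains an extra nesting level: with `output_type="np"` the stacked result has shape `(batch * num_images_per_prompt, frames, H, W, 3)`, and with `"pil"` it is a list containing one list of frames per generated object. A shape-only illustration; the numbers mirror the tests, which render 20 views at 64x64:

```py
import numpy as np

num_objects = 2   # batch_size * num_images_per_prompt
num_views = 20    # frames rendered per object in the tests
images = np.zeros((num_objects, num_views, 64, 64, 3))  # stand-in for output.images

first_object = images[0]      # all views of the first object -> (20, 64, 64, 3)
first_view = first_object[0]  # a single rendered frame      -> (64, 64, 3)
print(first_object.shape, first_view.shape)
```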
this pipeline outputs 3d images, i.e a list of N lists of images, where N is our num_image_per_prompts + def test_num_images_per_prompt(self): + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + batch_size = 1 + num_images_per_prompts = [1, 2] + + + for num_images_per_prompt in num_images_per_prompts: + print(f"num: {num_images_per_prompt}") + inputs = self.get_dummy_inputs(torch_device) + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images + + assert len(images) == batch_size * num_images_per_prompt From 295077a098e966b532ecf94c1c43eef747fa3eee Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 03:41:10 +0000 Subject: [PATCH 053/119] add --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 1 - tests/pipelines/shap_e/test_shap_e.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index a38cb5cb42a3..13c4a122d9e1 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -203,7 +203,6 @@ def save_gif(images:List[PIL.Image.Image], image_name: int, save_all=True, optim loop=loop ) - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 05cb1bbd9fc2..5b7fef5204b9 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -220,8 +220,7 @@ def test_num_images_per_prompt(self): for num_images_per_prompt in num_images_per_prompts: - print(f"num: {num_images_per_prompt}") inputs = self.get_dummy_inputs(torch_device) images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - assert len(images) == batch_size * num_images_per_prompt + assert len(images) == batch_size * num_images_per_prompt \ No newline at end of file From 2027e9bee4d5187c8bc39fe246a1b2edbfab3dc1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 05:27:38 +0000 Subject: [PATCH 054/119] add fast test --- tests/pipelines/shap_e/test_shap_e.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 5b7fef5204b9..6e36b612e148 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -124,7 +124,7 @@ def dummy_renderer(self): "d_latent": self.time_input_dim, "d_hidden": self.renderer_dim, "n_output": 12, - "background": (1e-8, 1e-8, 1e-8,), + "background": (0.1, 0.1, 0.1,), } model = ShapERenderer(**model_kwargs) return model @@ -159,9 +159,11 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "prompt": "horse", "generator": generator, - "num_inference_steps": 2, + "num_inference_steps": 4, "size":64, "output_type": "np", + "sigma_max": 16., + "sigma_min": 15., } return inputs @@ -183,16 +185,15 @@ def test_shap_e(self): expected_slice = np.array( [ - 0.00392157, - 0.00392157, - 0.00392157, - 0.00392157, - 0.00392157, - 0.00392157, - 0.00392157, - 0.00392157, - 0.00392157 - ] + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From 7ff4fcc9de31df2f06b14a5cc39f7e38eb1397cc Mon Sep 17 00:00:00 2001 From: yiyixuxu 
Date: Fri, 30 Jun 2023 05:28:55 +0000 Subject: [PATCH 055/119] make style --- .../pipelines/shap_e/pipeline_shap_e.py | 23 ++++---- src/diffusers/pipelines/shap_e/renderer.py | 10 +++- tests/pipelines/shap_e/test_shap_e.py | 58 ++++++++++--------- 3 files changed, 48 insertions(+), 43 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 13c4a122d9e1..29373f9ef4ea 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -191,16 +191,16 @@ def _encode_prompt( prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds return prompt_embeds - + @staticmethod - def save_gif(images:List[PIL.Image.Image], image_name: int, save_all=True, optimize=False, duration=100, loop=0): + def save_gif(images: List[PIL.Image.Image], image_name: int, save_all=True, optimize=False, duration=100, loop=0): images[0].save( f"{image_name}.gif", - save_all=save_all, - append_images=images[1:], - optimize=optimize, - duration=duration, - loop=loop + save_all=save_all, + append_images=images[1:], + optimize=optimize, + duration=duration, + loop=loop, ) @torch.no_grad() @@ -258,10 +258,10 @@ def __call__( [`ShapEPipelineOutput`] or `tuple` """ - if isinstance(prompt, str) or isinstance(prompt, list) and len(prompt) ==1: + if isinstance(prompt, str) or isinstance(prompt, list) and len(prompt) == 1: batch_size = 1 elif isinstance(prompt, list) and len(prompt) > 1: - raise ValueError(f"this pipeline does not support more than one prompt") + raise ValueError("this pipeline does not support more than one prompt") else: raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") @@ -327,12 +327,11 @@ def __call__( # YiYi testing only: I don't think we need to return latent for this pipeline if output_type == "latent": return ShapEPipelineOutput(images=latents) - + images = [] for i, latent in enumerate(latents): - image = self.renderer.decode( - latent[None,:], + latent[None, :], device, size=size, ray_batch_size=ray_batch_size, diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 3eb8225d395f..4547cb778837 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -567,7 +567,11 @@ def __init__( n_hidden_layers: int = 6, act_fn: str = "swish", insert_direction_at: int = 4, - background: Tuple[float] = (0.0, 0.0, 0.0,), + background: Tuple[float] = ( + 0.0, + 0.0, + 0.0, + ), ): super().__init__() @@ -631,10 +635,10 @@ def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with # 2. Get the points along the ray and query the model directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) positions = origin.unsqueeze(-2) + ts * directions - + directions = directions.to(self.mlp.dtype) positions = positions.to(self.mlp.dtype) - + optional_directions = directions if render_with_direction else None model_out = self.mlp( diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 6e36b612e148..9f868c05eeac 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -12,23 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc -import random import unittest import numpy as np import torch +from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer -from diffusers import ShapEPipeline, HeunDiscreteScheduler, PriorTransformer +from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline from diffusers.pipelines.shap_e import ShapERenderer -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, torch_device -from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer +from ..test_pipelines_common import PipelineTesterMixin -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference enable_full_determinism() + class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = ShapEPipeline params = ["prompt"] @@ -47,11 +46,11 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "return_dict", ] test_xformers_attention = False - + @property def text_embedder_hidden_size(self): return 32 - + @property def time_input_dim(self): return 32 @@ -63,7 +62,7 @@ def time_embed_dim(self): @property def renderer_dim(self): return 8 - + @property def dummy_tokenizer(self): tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") @@ -124,7 +123,11 @@ def dummy_renderer(self): "d_latent": self.time_input_dim, "d_hidden": self.renderer_dim, "n_output": 12, - "background": (0.1, 0.1, 0.1,), + "background": ( + 0.1, + 0.1, + 0.1, + ), } model = ShapERenderer(**model_kwargs) return model @@ -136,10 +139,10 @@ def get_dummy_components(self): renderer = self.dummy_renderer scheduler = HeunDiscreteScheduler( - beta_schedule="exp", - num_train_timesteps=1024, - prediction_type="sample", - use_karras_sigmas=False, + beta_schedule="exp", + num_train_timesteps=1024, + prediction_type="sample", + use_karras_sigmas=False, ) components = { "prior": prior, @@ -160,10 +163,10 @@ def get_dummy_inputs(self, device, seed=0): "prompt": "horse", "generator": generator, "num_inference_steps": 4, - "size":64, + "size": 64, "output_type": "np", - "sigma_max": 16., - "sigma_min": 15., + "sigma_max": 16.0, + "sigma_min": 15.0, } return inputs @@ -185,15 +188,16 @@ def test_shap_e(self): expected_slice = np.array( [ - 0.00039216, - 0.00039216, - 0.00039216, - 0.00039216, - 0.00039216, 0.00039216, - 0.00039216, - 0.00039216, - 0.00039216] + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + ] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -210,7 +214,6 @@ def test_inference_batch_single_identical(self): # 1. this pipeline support num_images_per_prompt but does not support batching # 2. 
this pipeline outputs 3d images, i.e a list of N lists of images, where N is our num_image_per_prompts def test_num_images_per_prompt(self): - components = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -219,9 +222,8 @@ def test_num_images_per_prompt(self): batch_size = 1 num_images_per_prompts = [1, 2] - for num_images_per_prompt in num_images_per_prompts: inputs = self.get_dummy_inputs(torch_device) images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - assert len(images) == batch_size * num_images_per_prompt \ No newline at end of file + assert len(images) == batch_size * num_images_per_prompt From 32145b826a8d099f771ffb0712cc784bd899b13f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 06:13:39 +0000 Subject: [PATCH 056/119] add slow test --- tests/pipelines/shap_e/test_shap_e.py | 39 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 9f868c05eeac..4854e790166a 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -13,6 +13,7 @@ # limitations under the License. import unittest +import gc import numpy as np import torch @@ -20,9 +21,10 @@ from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline from diffusers.pipelines.shap_e import ShapERenderer -from diffusers.utils.testing_utils import enable_full_determinism, torch_device +from diffusers.utils import load_numpy, nightly, slow +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device -from ..test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference enable_full_determinism() @@ -227,3 +229,36 @@ def test_num_images_per_prompt(self): images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert len(images) == batch_size * num_images_per_prompt + + +@slow +@require_torch_gpu +class ShapEPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_shap_e(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/shap_e/test_shap_e_np_out.npy" + ) + pipe = ShapEPipeline.from_pretrained("YiYiXu/shap-e") + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=torch_device).manual_seed(0) + + images = pipe( + "a shark", + generator=generator, + guidance_scale=15.0, + num_inference_steps= 64, + size = 64, + output_type='np').images[0] + + assert images.shape == (20, 64, 64, 3) + + assert_mean_pixel_difference(images, expected_image) \ No newline at end of file From c393463be34f14ced7826e336f004f6220227c9d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 06:14:14 +0000 Subject: [PATCH 057/119] style --- tests/pipelines/shap_e/test_shap_e.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 4854e790166a..f6cd0787cf2b 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
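The slow test below compares a fresh render against a stored reference with `assert_mean_pixel_difference`, i.e. a tolerance on the average absolute pixel error rather than an exact match, which keeps the check robust to small numerical drift across hardware. A simplified stand-in for that helper; the threshold here is an assumption, not necessarily the value used by the test utility:

```py
import numpy as np

def mean_pixel_difference_ok(image: np.ndarray, expected: np.ndarray, max_diff: float = 10.0) -> bool:
    # average absolute error over uint8-scale pixels; max_diff is illustrative
    diff = np.abs(image.astype(np.float32) - expected.astype(np.float32)).mean()
    return diff < max_diff

rendered = np.random.randint(0, 256, (20, 64, 64, 3), dtype=np.uint8)
assert mean_pixel_difference_ok(rendered, rendered)  # identical arrays trivially pass
```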
-import unittest import gc +import unittest import numpy as np import torch @@ -21,7 +21,7 @@ from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline from diffusers.pipelines.shap_e import ShapERenderer -from diffusers.utils import load_numpy, nightly, slow +from diffusers.utils import load_numpy, slow from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference @@ -252,13 +252,9 @@ def test_shap_e(self): generator = torch.Generator(device=torch_device).manual_seed(0) images = pipe( - "a shark", - generator=generator, - guidance_scale=15.0, - num_inference_steps= 64, - size = 64, - output_type='np').images[0] + "a shark", generator=generator, guidance_scale=15.0, num_inference_steps=64, size=64, output_type="np" + ).images[0] assert images.shape == (20, 64, 64, 3) - assert_mean_pixel_difference(images, expected_image) \ No newline at end of file + assert_mean_pixel_difference(images, expected_image) From a98094d7ac0ee41ea6c2cacce4217436fa0081af Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 09:27:19 +0000 Subject: [PATCH 058/119] add test for img2img --- .../shap_e/pipeline_shap_e_img2img.py | 84 ++++-- tests/pipelines/shap_e/test_shap_e_img2img.py | 278 ++++++++++++++++++ 2 files changed, 331 insertions(+), 31 deletions(-) create mode 100644 tests/pipelines/shap_e/test_shap_e_img2img.py diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 52f82ea7b1bd..587bc50812cd 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -117,7 +117,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") models = [ - self.text_encoder, + self.image_encoder, ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: @@ -130,9 +130,9 @@ def _execution_device(self): `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module hooks. 
""" - if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + if self.device != torch.device("meta") or not hasattr(self.image_encoder, "_hf_hook"): return self.device - for module in self.text_encoder.modules(): + for module in self.image_encoder.modules(): if ( hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device") @@ -143,25 +143,26 @@ def _execution_device(self): def _encode_image( self, - images, + image, device, - num_images_per_image, + num_images_per_prompt, do_classifier_free_guidance, ): - if not isinstance(images, PIL.Image.Image): - images = [images] - - images = ( - self.image_processor(images, return_tensors="pt") - .pixel_values[0] - .unsqueeze(0) - .to(dtype=self.image_encoder.dtype, device=device) - ) - image_embeds = self.image_encoder(images)["last_hidden_state"] - image_embeds = image_embeds[:, 1:, :].contiguous().float() # batch_size, dim, 256 + if isinstance(image, PIL.Image.Image): + + image = ( + self.image_processor(image, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + ) + + image = image.to(dtype=self.image_encoder.dtype, device=device) - image_embeds = image_embeds.repeat_interleave(num_images_per_image, dim=0) + image_embeds = self.image_encoder(image)["last_hidden_state"] + image_embeds = image_embeds[:, 1:, :].contiguous() # batch_size, dim, 256 + + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) if do_classifier_free_guidance: negative_image_embeds = torch.zeros_like(image_embeds) @@ -173,12 +174,23 @@ def _encode_image( return image_embeds + @staticmethod + def save_gif(images: List[PIL.Image.Image], image_name: int, save_all=True, optimize=False, duration=100, loop=0): + images[0].save( + f"{image_name}.gif", + save_all=save_all, + append_images=images[1:], + optimize=optimize, + duration=duration, + loop=loop, + ) + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image: Union[PIL.Image.Image, List[PIL.Image.Image]], - num_images_per_image: int = 1, + num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, @@ -230,17 +242,20 @@ def __call__( if isinstance(image, PIL.Image.Image): batch_size = 1 - elif isinstance(image, list): + elif isinstance(image, list) and isinstance(image[0], PIL.Image.Image): batch_size = len(image) + elif isinstance(image, torch.Tensor): + batch_size = image.shape[0] else: - raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `list` but is {type(image)}") + raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `list` of `PIL.Image.Image` or `torch.Tensor` but is {type(image)}") + device = self._execution_device - batch_size = batch_size * num_images_per_image + batch_size = batch_size * num_images_per_prompt do_classifier_free_guidance = guidance_scale > 1.0 - image_embeds = self._encode_image(image, device, num_images_per_image, do_classifier_free_guidance) + image_embeds = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance) # prior @@ -298,14 +313,21 @@ def __call__( if output_type == "latent": return ShapEPipelineOutput(images=latents) - images = self.renderer.decode( - latents, - device, - size=size, - ray_batch_size=ray_batch_size, - n_coarse_samples=n_coarse_samples, - n_fine_samples=n_fine_samples, - ) + images = [] + for i, latent in enumerate(latents): + print() + image = self.renderer.decode( + latent[None,:], + 
device, + size=size, + ray_batch_size=ray_batch_size, + n_coarse_samples=n_coarse_samples, + n_fine_samples=n_fine_samples, + ) + + images.append(image) + + images = torch.stack(images) if output_type not in ["np", "pil"]: raise ValueError(f"Only the output types `pil` and `np` are supported not output_type={output_type}") @@ -313,7 +335,7 @@ def __call__( images = images.cpu().numpy() if output_type == "pil": - images = self.numpy_to_pil(images) + images = [self.numpy_to_pil(image) for image in images] # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py new file mode 100644 index 000000000000..13602c87e5b0 --- /dev/null +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -0,0 +1,278 @@ +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from transformers import CLIPVisionConfig, CLIPVisionModel, CLIPImageProcessor + +from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline +from diffusers.pipelines.shap_e import ShapERenderer +from diffusers.utils import floats_tensor, load_image, load_numpy, slow +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = ShapEImg2ImgPipeline + params = ["image"] + batch_params = [] + required_optional_params = [ + "num_images_per_prompt", + "num_inference_steps", + "generator", + "latents", + "guidance_scale", + "size", + "ray_batch_size", + "n_coarse_samples", + "n_fine_samples", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def renderer_dim(self): + return 8 + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=64, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=1, + ) + + model = CLIPVisionModel(config) + return model + + @property + def dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + return image_processor + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + 
"attention_head_dim": 16, + "embedding_dim": self.time_input_dim, + "num_embeddings": 32, + "embedding_proj_dim": self.text_embedder_hidden_size, + "time_embed_dim": self.time_embed_dim, + "num_layers": 1, + "clip_embed_dim": self.time_input_dim * 2, + "additional_embeddings": 0, + "time_embed_act_fn": "gelu", + "norm_in_type": "layer", + "embedding_proj_norm_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, + "upcast_softmax": True, + } + + model = PriorTransformer(**model_kwargs) + return model + + @property + def dummy_renderer(self): + torch.manual_seed(0) + + model_kwargs = { + "param_shapes": ( + (self.renderer_dim, 93), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + ), + "d_latent": self.time_input_dim, + "d_hidden": self.renderer_dim, + "n_output": 12, + "background": ( + 0.1, + 0.1, + 0.1, + ), + } + model = ShapERenderer(**model_kwargs) + return model + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + image_processor = self.dummy_image_processor + renderer = self.dummy_renderer + + scheduler = HeunDiscreteScheduler( + beta_schedule="exp", + num_train_timesteps=1024, + prediction_type="sample", + use_karras_sigmas=False, + ) + components = { + "prior": prior, + "image_encoder": image_encoder, + "image_processor": image_processor, + "renderer": renderer, + "scheduler": scheduler, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + + input_image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": input_image, + "generator": generator, + "num_inference_steps": 4, + "size": 64, + "output_type": "np", + "sigma_max": 16.0, + "sigma_min": 15.0, + } + return inputs + + def test_shap_e(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images[0] + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (20, 64, 64, 3) + + expected_slice = np.array( + [ + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @unittest.skip(reason="Batching is not supported for this pipeline.") + def test_inference_batch_consistent(self): + pass + + @unittest.skip(reason="Batching is not supported for this pipeline.") + def test_inference_batch_single_identical(self): + pass + + # overwrite because: + # this pipeline outputs 3d images, i.e a list of N lists of images, where N is our num_image_per_prompts + def test_num_images_per_prompt(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + batch_size = 1 + num_images_per_prompts = [1, 2] + + for num_images_per_prompt in num_images_per_prompts: + inputs = self.get_dummy_inputs(torch_device) + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images + + assert len(images) == batch_size * num_images_per_prompt + + +@slow +@require_torch_gpu +class ShapEImg2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM 
after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_shap_e_img2img(self): + input_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/shap_e/corgi.png" + ) + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/shap_e/test_shap_e_img2img_out.npy" + ) + pipe = ShapEImg2ImgPipeline.from_pretrained("YiYiXu/shap-e-img2img") + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=torch_device).manual_seed(0) + + images = pipe( + input_image, generator=generator, guidance_scale=3.0, num_inference_steps=64, size=64, output_type="np" + ).images[0] + + assert images.shape == (20, 64, 64, 3) + + assert_mean_pixel_difference(images, expected_image) From 944f8a64dff725d4b577b2e2a6b30be905110b61 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 09:28:32 +0000 Subject: [PATCH 059/119] refactor --- .../shap_e/pipeline_shap_e_img2img.py | 19 +++++++------------ tests/pipelines/shap_e/test_shap_e_img2img.py | 6 ++---- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 587bc50812cd..a33f39e09729 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -148,15 +148,9 @@ def _encode_image( num_images_per_prompt, do_classifier_free_guidance, ): - if isinstance(image, PIL.Image.Image): + image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0) - image = ( - self.image_processor(image, return_tensors="pt") - .pixel_values[0] - .unsqueeze(0) - ) - image = image.to(dtype=self.image_encoder.dtype, device=device) image_embeds = self.image_encoder(image)["last_hidden_state"] @@ -245,10 +239,11 @@ def __call__( elif isinstance(image, list) and isinstance(image[0], PIL.Image.Image): batch_size = len(image) elif isinstance(image, torch.Tensor): - batch_size = image.shape[0] + batch_size = image.shape[0] else: - raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `list` of `PIL.Image.Image` or `torch.Tensor` but is {type(image)}") - + raise ValueError( + f"`image` has to be of type `PIL.Image.Image` or `list` of `PIL.Image.Image` or `torch.Tensor` but is {type(image)}" + ) device = self._execution_device @@ -317,7 +312,7 @@ def __call__( for i, latent in enumerate(latents): print() image = self.renderer.decode( - latent[None,:], + latent[None, :], device, size=size, ray_batch_size=ray_batch_size, @@ -326,7 +321,7 @@ def __call__( ) images.append(image) - + images = torch.stack(images) if output_type not in ["np", "pil"]: diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 13602c87e5b0..74f99fab9c26 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -18,7 +18,7 @@ import numpy as np import torch -from transformers import CLIPVisionConfig, CLIPVisionModel, CLIPImageProcessor +from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline from diffusers.pipelines.shap_e import ShapERenderer @@ -169,7 +169,6 @@ def get_dummy_components(self): return components def get_dummy_inputs(self, device, seed=0): - input_image = 
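End-to-end, the image-to-image variant is exercised the same way the integration test above does it. A usage sketch built from that test's checkpoint name, input image, and settings; treat it as an illustration rather than a reference invocation:

```py
from diffusers import ShapEImg2ImgPipeline
from diffusers.utils import load_image

pipe = ShapEImg2ImgPipeline.from_pretrained("YiYiXu/shap-e-img2img")
pipe = pipe.to("cuda")  # or pipe.enable_sequential_cpu_offload() on low-VRAM machines

image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/corgi.png"
)
frames = pipe(image, guidance_scale=3.0, num_inference_steps=64, size=64, output_type="pil").images[0]
ShapEImg2ImgPipeline.save_gif(frames, "corgi_3d")  # one GIF per generated object
```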
floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) if str(device).startswith("mps"): @@ -256,8 +255,7 @@ def tearDown(self): def test_shap_e_img2img(self): input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/shap_e/corgi.png" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/shap_e/corgi.png" ) expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" From 3cac8d273189518cad855f54ee4f656b5b27d60d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 15:41:25 +0000 Subject: [PATCH 060/119] enable batching --- .../pipelines/shap_e/pipeline_shap_e.py | 6 ++-- .../shap_e/pipeline_shap_e_img2img.py | 17 ++++++----- tests/pipelines/shap_e/test_shap_e.py | 28 +------------------ tests/pipelines/shap_e/test_shap_e_img2img.py | 27 +----------------- 4 files changed, 15 insertions(+), 63 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 29373f9ef4ea..1e4792852154 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -258,10 +258,10 @@ def __call__( [`ShapEPipelineOutput`] or `tuple` """ - if isinstance(prompt, str) or isinstance(prompt, list) and len(prompt) == 1: + if isinstance(prompt, str): batch_size = 1 - elif isinstance(prompt, list) and len(prompt) > 1: - raise ValueError("this pipeline does not support more than one prompt") + elif isinstance(prompt, list): + batch_size = len(prompt) else: raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index a33f39e09729..105292da0759 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -147,9 +147,12 @@ def _encode_image( device, num_images_per_prompt, do_classifier_free_guidance, - ): - if isinstance(image, PIL.Image.Image): - image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0) + ): + if isinstance(image, List) and isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + if not isinstance(image, torch.Tensor): + image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0) image = image.to(dtype=self.image_encoder.dtype, device=device) @@ -233,16 +236,16 @@ def __call__( Returns: [`ShapEPipelineOutput`] or `tuple` """ - + if isinstance(image, PIL.Image.Image): batch_size = 1 - elif isinstance(image, list) and isinstance(image[0], PIL.Image.Image): - batch_size = len(image) elif isinstance(image, torch.Tensor): batch_size = image.shape[0] + elif isinstance(image, list) and isinstance(image[0], (torch.Tensor, PIL.Image.Image)): + batch_size = len(image) else: raise ValueError( - f"`image` has to be of type `PIL.Image.Image` or `list` of `PIL.Image.Image` or `torch.Tensor` but is {type(image)}" + f"`image` has to be of type `PIL.Image.Image`, `torch.Tensor`, `List[PIL.Image.Image]` or `List[torch.Tensor]` but is {type(image)}" ) device = self._execution_device diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index f6cd0787cf2b..bc6e0b42565f 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ 
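The new tensor-list branch in `_encode_image` distinguishes `cat` from `stack` because a list of per-image tensors may or may not already carry a batch axis. A two-path illustration of why both routes end at the same shape:

```py
import torch

frames_chw = [torch.randn(3, 64, 64) for _ in range(2)]      # no batch axis -> stack adds one
frames_nchw = [torch.randn(1, 3, 64, 64) for _ in range(2)]  # batch axis present -> cat reuses it

print(torch.stack(frames_chw, dim=0).shape)  # torch.Size([2, 3, 64, 64])
print(torch.cat(frames_nchw, dim=0).shape)   # torch.Size([2, 3, 64, 64])
```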
b/tests/pipelines/shap_e/test_shap_e.py @@ -33,7 +33,7 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = ShapEPipeline params = ["prompt"] - batch_params = [] + batch_params = ["prompt"] required_optional_params = [ "num_images_per_prompt", "num_inference_steps", @@ -204,32 +204,6 @@ def test_shap_e(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - @unittest.skip(reason="Batching is not supported for this pipeline.") - def test_inference_batch_consistent(self): - pass - - @unittest.skip(reason="Batching is not supported for this pipeline.") - def test_inference_batch_single_identical(self): - pass - - # overwrite because: - # 1. this pipeline support num_images_per_prompt but does not support batching - # 2. this pipeline outputs 3d images, i.e a list of N lists of images, where N is our num_image_per_prompts - def test_num_images_per_prompt(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - batch_size = 1 - num_images_per_prompts = [1, 2] - - for num_images_per_prompt in num_images_per_prompts: - inputs = self.get_dummy_inputs(torch_device) - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - - assert len(images) == batch_size * num_images_per_prompt - @slow @require_torch_gpu diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 74f99fab9c26..db4c376c241c 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -34,7 +34,7 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = ShapEImg2ImgPipeline params = ["image"] - batch_params = [] + batch_params = ["image"] required_optional_params = [ "num_images_per_prompt", "num_inference_steps", @@ -218,31 +218,6 @@ def test_shap_e(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - @unittest.skip(reason="Batching is not supported for this pipeline.") - def test_inference_batch_consistent(self): - pass - - @unittest.skip(reason="Batching is not supported for this pipeline.") - def test_inference_batch_single_identical(self): - pass - - # overwrite because: - # this pipeline outputs 3d images, i.e a list of N lists of images, where N is our num_image_per_prompts - def test_num_images_per_prompt(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - batch_size = 1 - num_images_per_prompts = [1, 2] - - for num_images_per_prompt in num_images_per_prompts: - inputs = self.get_dummy_inputs(torch_device) - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - - assert len(images) == batch_size * num_images_per_prompt - @slow @require_torch_gpu From de15046b24beb92972151fc6fc3fefc971749b26 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 30 Jun 2023 15:42:07 +0000 Subject: [PATCH 061/119] style --- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 105292da0759..b80a7752f20d 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -147,12 
+147,12 @@ def _encode_image( device, num_images_per_prompt, do_classifier_free_guidance, - ): + ): if isinstance(image, List) and isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) if not isinstance(image, torch.Tensor): - image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0) + image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0) image = image.to(dtype=self.image_encoder.dtype, device=device) @@ -236,7 +236,7 @@ def __call__( Returns: [`ShapEPipelineOutput`] or `tuple` """ - + if isinstance(image, PIL.Image.Image): batch_size = 1 elif isinstance(image, torch.Tensor): From e44fcb50516bbe1e671ca7c5bd62e90575487759 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 2 Jul 2023 21:12:07 +0000 Subject: [PATCH 062/119] refactor scheduler --- .../pipelines/shap_e/pipeline_shap_e.py | 6 +-- .../shap_e/pipeline_shap_e_img2img.py | 6 +-- .../schedulers/scheduling_heun_discrete.py | 52 +++++++++---------- 3 files changed, 28 insertions(+), 36 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 1e4792852154..fd200b508c5a 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -213,8 +213,6 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, - sigma_min: float = 1e-3, - sigma_max: float = 160.0, size: int = 64, ray_batch_size: int = 4096, n_coarse_samples=64, @@ -274,9 +272,7 @@ def __call__( # prior - self.scheduler.set_timesteps( - num_inference_steps, device=device, sigma_min=sigma_min, sigma_max=sigma_max, use_karras_sigmas=True - ) + self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps num_embeddings = self.prior.config.num_embeddings diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index b80a7752f20d..0e47fb882fff 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -192,8 +192,6 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, - sigma_min: float = 1e-3, - sigma_max: float = 160.0, size: int = 64, ray_batch_size: int = 4096, n_coarse_samples=64, @@ -257,9 +255,7 @@ def __call__( # prior - self.scheduler.set_timesteps( - num_inference_steps, device=device, sigma_min=sigma_min, sigma_max=sigma_max, use_karras_sigmas=True - ) + self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps num_embeddings = self.prior.config.num_embeddings diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 3f37766ed2ee..cb30059fcae3 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings import math from typing import List, Optional, Tuple, Union @@ -105,6 +106,8 @@ def __init__( trained_betas: Optional[Union[np.ndarray, List[float]]] = None, prediction_type: str = "epsilon", use_karras_sigmas: Optional[bool] = False, + sigma_min: Optional[float] = None, + sigma_max: Optional[float] = None, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -168,9 +171,6 @@ def set_timesteps( num_inference_steps: int, device: Union[str, torch.device] = None, num_train_timesteps: Optional[int] = None, - sigma_min: Optional[float] = None, - sigma_max: Optional[float] = None, - use_karras_sigmas: Optional[bool] = None, # overwrite the self.config.use_karras_sigma ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. @@ -185,28 +185,32 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - if use_karras_sigmas is None: - use_karras_sigmas = self.use_karras_sigmas - if sigma_min is not None and sigma_max is not None: - if use_karras_sigmas is not None: - sigmas = torch.tensor([sigma_max, sigma_min]) - log_sigmas = None - else: - raise ValueError( - "`sigma_min` and `sigma_max` arguments are only supported when `use_karras_sigma` is not None" - ) + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - else: - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - log_sigmas = np.log(sigmas) - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + if self.config.sigma_min is not None or self.config.sigma_max is not None: + if not self.config.use_karras_sigmas: + warnings.warn( + "`sigma_min` and `sigma_max` will be ignored when `use_karras_sigmas` is set to `False` " + ) + use_log_sigmas = True + else: + use_log_sigmas = False + else: + use_log_sigmas = True + + if self.config.use_karras_sigmas: - if use_karras_sigmas: - sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + sigmas = self._convert_to_karras( + sigma_min= sigmas[-1].item() if self.config.sigma_min is None else self.config.sigma_min, + sigma_max= sigmas[0].item() if self.config.sigma_max is None else self.config.sigma_max, + num_inference_steps=self.num_inference_steps) + + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas = log_sigmas if use_log_sigmas else None) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = torch.from_numpy(sigmas).to(device=device) @@ -272,13 +276,9 @@ def _sigma_to_t(self, sigma, log_sigmas): return t - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, sigma_min, sigma_max, num_inference_steps) -> torch.FloatTensor: """Constructs the noise schedule of Karras et al. 
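With this refactor the sigma range becomes part of the scheduler configuration instead of an argument to `set_timesteps`, so the pipelines no longer pass `sigma_min`/`sigma_max` at call time. Roughly how the fast tests construct the scheduler after the change; the argument set is specific to this point in the series, not necessarily the released scheduler API:

```py
from diffusers import HeunDiscreteScheduler

scheduler = HeunDiscreteScheduler(
    beta_schedule="exp",          # as configured by the Shap-E fast tests
    num_train_timesteps=1024,
    prediction_type="sample",
    use_karras_sigmas=True,
    sigma_max=16.0,               # now lives in the scheduler config ...
    sigma_min=15.0,               # ... rather than being passed to set_timesteps()
)
scheduler.set_timesteps(4, device="cpu")  # sigmas and timesteps derived from the config above
```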
(2022).""" - sigma_min: float = in_sigmas[-1].item() - sigma_max: float = in_sigmas[0].item() - rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) min_inv_rho = sigma_min ** (1 / rho) From fb2319e23b955e9027163f821ad53c96fa4486a4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 2 Jul 2023 21:54:54 +0000 Subject: [PATCH 063/119] update test --- tests/pipelines/shap_e/test_shap_e.py | 6 +++--- tests/pipelines/shap_e/test_shap_e_img2img.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index bc6e0b42565f..4c7490cf1710 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -144,7 +144,9 @@ def get_dummy_components(self): beta_schedule="exp", num_train_timesteps=1024, prediction_type="sample", - use_karras_sigmas=False, + use_karras_sigmas=True, + sigma_max= 16.0, + sigma_min= 15.0, ) components = { "prior": prior, @@ -167,8 +169,6 @@ def get_dummy_inputs(self, device, seed=0): "num_inference_steps": 4, "size": 64, "output_type": "np", - "sigma_max": 16.0, - "sigma_min": 15.0, } return inputs diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index db4c376c241c..6dc39758c5e8 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -156,7 +156,9 @@ def get_dummy_components(self): beta_schedule="exp", num_train_timesteps=1024, prediction_type="sample", - use_karras_sigmas=False, + use_karras_sigmas=True, + sigma_max=16.0, + sigma_min=15.0, ) components = { "prior": prior, @@ -181,8 +183,6 @@ def get_dummy_inputs(self, device, seed=0): "num_inference_steps": 4, "size": 64, "output_type": "np", - "sigma_max": 16.0, - "sigma_min": 15.0, } return inputs From 9c78816558ce893c49a7a8971aca04298742f862 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 2 Jul 2023 21:55:40 +0000 Subject: [PATCH 064/119] style --- .../schedulers/scheduling_heun_discrete.py | 19 ++++++++++--------- tests/pipelines/shap_e/test_shap_e.py | 4 ++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index cb30059fcae3..9193f2c05cc2 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
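`_convert_to_karras` now takes its endpoints explicitly; the schedule itself is the rho-space interpolation from Karras et al. (2022) with rho = 7, as in the code above. A standalone sketch of that formula:

```py
import numpy as np

def karras_sigmas(sigma_min: float, sigma_max: float, num_inference_steps: int, rho: float = 7.0):
    # interpolate between the endpoints in sigma**(1/rho) space, then map back
    ramp = np.linspace(0, 1, num_inference_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

print(karras_sigmas(0.1, 160.0, 5))  # descending noise levels from sigma_max down to sigma_min
```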
-import warnings import math +import warnings from typing import List, Optional, Tuple, Union import numpy as np @@ -185,7 +185,6 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) @@ -202,15 +201,17 @@ def set_timesteps( use_log_sigmas = False else: use_log_sigmas = True - - if self.config.use_karras_sigmas: + if self.config.use_karras_sigmas: sigmas = self._convert_to_karras( - sigma_min= sigmas[-1].item() if self.config.sigma_min is None else self.config.sigma_min, - sigma_max= sigmas[0].item() if self.config.sigma_max is None else self.config.sigma_max, - num_inference_steps=self.num_inference_steps) - - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas = log_sigmas if use_log_sigmas else None) for sigma in sigmas]) + sigma_min=sigmas[-1].item() if self.config.sigma_min is None else self.config.sigma_min, + sigma_max=sigmas[0].item() if self.config.sigma_max is None else self.config.sigma_max, + num_inference_steps=self.num_inference_steps, + ) + + timesteps = np.array( + [self._sigma_to_t(sigma, log_sigmas=log_sigmas if use_log_sigmas else None) for sigma in sigmas] + ) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = torch.from_numpy(sigmas).to(device=device) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 4c7490cf1710..9519d397092c 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -145,8 +145,8 @@ def get_dummy_components(self): num_train_timesteps=1024, prediction_type="sample", use_karras_sigmas=True, - sigma_max= 16.0, - sigma_min= 15.0, + sigma_max=16.0, + sigma_min=15.0, ) components = { "prior": prior, From 76c19d643d51e98364cd06f05ad90992d0c4fdef Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 2 Jul 2023 22:15:57 +0000 Subject: [PATCH 065/119] attempt to solve batch related tests timeout --- tests/pipelines/shap_e/test_shap_e.py | 33 +++++++++++++++++++ tests/pipelines/shap_e/test_shap_e_img2img.py | 32 ++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 9519d397092c..4e804511dc2a 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -204,6 +204,39 @@ def test_shap_e(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_inference_batch_consistent(self): + # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches + self._test_inference_batch_consistent(batch_sizes=[1, 2]) + + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + + self._test_inference_batch_single_identical( + batch_size=2, + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + ) + + def test_num_images_per_prompt(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + batch_size = 1 + num_images_per_prompt = 2 + + inputs = self.get_dummy_inputs(torch_device) + + for key in inputs.keys(): + if key in self.batch_params: + inputs[key] = batch_size * [inputs[key]] + + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0] + + assert 
images.shape[0] == batch_size * num_images_per_prompt + @slow @require_torch_gpu diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 6dc39758c5e8..6deef515b376 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -218,6 +218,38 @@ def test_shap_e(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_inference_batch_consistent(self): + # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches + self._test_inference_batch_consistent(batch_sizes=[1, 2]) + + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + self._test_inference_batch_single_identical( + batch_size=2, + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + ) + + def test_num_images_per_prompt(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + batch_size = 1 + num_images_per_prompt = 2 + + inputs = self.get_dummy_inputs(torch_device) + + for key in inputs.keys(): + if key in self.batch_params: + inputs[key] = batch_size * [inputs[key]] + + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0] + + assert images.shape[0] == batch_size * num_images_per_prompt + @slow @require_torch_gpu From f6d184b777b8d3e41ed17942a0d315bce959cf08 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 07:52:16 +0000 Subject: [PATCH 066/119] add doc --- docs/source/en/api/pipelines/shap_e.mdx | 38 +++++++++++++++++++ .../pipelines/shap_e/pipeline_shap_e.py | 19 ++++++++++ .../shap_e/pipeline_shap_e_img2img.py | 21 ++++++++++ 3 files changed, 78 insertions(+) create mode 100644 docs/source/en/api/pipelines/shap_e.mdx diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx new file mode 100644 index 000000000000..0278cbf86f2a --- /dev/null +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -0,0 +1,38 @@ + + +# Shap-E + +## Overview + +The Shap-E model was proposed in [ Shap-E: Generating Conditional 3D Implicit Functions](https://arxiv.org/abs/2305.02463) by Alex Nichol and Heewon Jun from [OpenAI](https://github.com/openai). + +The abstract of paper is the following: + +*We present Shap-E, a conditional generative model for 3D assets. Unlike recent work on 3D generative models which produce a single output representation, Shap-E directly generates the parameters of implicit functions that can be rendered as both textured meshes and neural radiance fields. We train Shap-E in two stages: first, we train an encoder that deterministically maps 3D assets into the parameters of an implicit function; second, we train a conditional diffusion model on outputs of the encoder. When trained on a large dataset of paired 3D and text data, our resulting models are capable of generating complex and diverse 3D assets in a matter of seconds. 
When compared to Point-E, an explicit generative model over point clouds, Shap-E converges faster and reaches comparable or better sample quality despite modeling a higher-dimensional, multi-representation output space.* + +The original codebase can be found [here](https://github.com/openai/shap-e) + +## Available Pipelines: + +| Pipeline | Tasks | +|---|---| +| [pipeline_shap_e.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e.py) | *Text-to-Image Generation* | +| [pipeline_shap_e_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py) | *Image-to-Image Generation* | + +## ShapEPipeline +[[autodoc]] ShapEPipeline + - all + - __call__ + +[[autodoc]] ShapEImg2ImgPipeline + - all + - __call__ + + diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index fd200b508c5a..81cffe06f415 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -39,7 +39,26 @@ EXAMPLE_DOC_STRING = """ Examples: ```py + >>> import torch + >>> from diffusers import ShapEPipeline + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + >>> repo = "YiYiXu/shap-e" + >>> pipe = ShapEPipeline.from_pretrained(repo) + >>> pipe = pipe.to(device) + + >>> guidance_scale = 15.0 + >>> prompt = "a shark" + + >>> images = pipe( + ... prompt, + ... guidance_scale=guidance_scale, + ... num_inference_steps=64, + ... size=256, + ... ).images + + >>> pipe.save_gif(images[0], "shark.png") ``` """ diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 0e47fb882fff..3461053d24fc 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -38,7 +38,28 @@ EXAMPLE_DOC_STRING = """ Examples: ```py + >>> from PIL import Image + >>> import torch + >>> from diffusers import ShapEImg2ImgPipeline + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + >>> repo = "YiYiXu/shap-e-img2img" + >>> pipe = ShapEImg2ImgPipeline.from_pretrained(repo) + >>> pipe = pipe.to(device) + + >>> guidance_scale = 3.0 + >>> image = Image.open("corgi.png") + + + >>> images = pipe( + ... image, + ... guidance_scale=guidance_scale, + ... num_inference_steps=64, + ... size=256, + ... 
).images + + >>> pipe.save_gif(images[0], "corgi_out") ``` """ From a4e12f4c82d07c7a09bcc9f16ae464650e3dc74a Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 3 Jul 2023 07:12:25 -1000 Subject: [PATCH 067/119] Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 81cffe06f415..af0cc066333e 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -232,10 +232,6 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, - size: int = 64, - ray_batch_size: int = 4096, - n_coarse_samples=64, - n_fine_samples=128, output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool = True, ): From f4849f54cd75ea64cc637706593a263ac10212ba Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 3 Jul 2023 07:12:37 -1000 Subject: [PATCH 068/119] Update src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py Co-authored-by: Patrick von Platen --- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 3461053d24fc..03f3894c6475 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -213,10 +213,6 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, - size: int = 64, - ray_batch_size: int = 4096, - n_coarse_samples=64, - n_fine_samples=128, output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool = True, ): From 19aa590c6a4aed311ac101ead58e39938986b95b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 17:36:09 +0000 Subject: [PATCH 069/119] hardcode rendering related config --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 7 ++++--- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 7 ++++--- tests/pipelines/shap_e/test_shap_e.py | 3 --- tests/pipelines/shap_e/test_shap_e_img2img.py | 3 --- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index af0cc066333e..bc8358705053 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -232,6 +232,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, + size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool = True, ): @@ -345,9 +346,9 @@ def __call__( latent[None, :], device, size=size, - ray_batch_size=ray_batch_size, - n_coarse_samples=n_coarse_samples, - n_fine_samples=n_fine_samples, + ray_batch_size=4096, + n_coarse_samples=64, + n_fine_samples=128, ) images.append(image) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 03f3894c6475..ec62fc93a60b 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ 
b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -213,6 +213,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, + size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool = True, ): @@ -331,9 +332,9 @@ def __call__( latent[None, :], device, size=size, - ray_batch_size=ray_batch_size, - n_coarse_samples=n_coarse_samples, - n_fine_samples=n_fine_samples, + ray_batch_size=4096, + n_coarse_samples=64, + n_fine_samples=128, ) images.append(image) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 4e804511dc2a..94a678e3929e 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -41,9 +41,6 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "latents", "guidance_scale", "size", - "ray_batch_size", - "n_coarse_samples", - "n_fine_samples", "output_type", "return_dict", ] diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 6deef515b376..c4087dfb023b 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -42,9 +42,6 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "latents", "guidance_scale", "size", - "ray_batch_size", - "n_coarse_samples", - "n_fine_samples", "output_type", "return_dict", ] From 73438f65c4803df0c658c36dc9e7bfc907413a98 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 18:28:29 +0000 Subject: [PATCH 070/119] update betas_for_alpha_bar on ddpm_scheduler --- src/diffusers/schedulers/scheduling_ddpm.py | 21 +++++++++++++++---- .../schedulers/scheduling_heun_discrete.py | 2 +- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 5d24766d68c7..b9f47ce6c246 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -44,7 +44,11 @@ class DDPMSchedulerOutput(BaseOutput): pred_original_sample: Optional[torch.FloatTensor] = None -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp + ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
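The hunks that follow replace the hard-coded cosine `alpha_bar` with a selectable transform. Assembled from those hunks, the resulting helper looks roughly like the sketch below; this is a consolidation for readability, not part of the patch, and the `"exp"` branch is presumably what the `beta_schedule="exp"` option used in the Shap-E tests maps to:

```py
import math

import torch


def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine"):
    # alpha_bar(t) approximates the cumulative product of (1 - beta) at continuous time t in [0, 1].
    if alpha_transform_type == "cosine":

        def alpha_bar_fn(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    elif alpha_transform_type == "exp":

        def alpha_bar_fn(t):
            return math.exp(t * -12.0)

    else:
        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # Discretize: beta_i = 1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), capped at max_beta.
        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
    return torch.tensor(betas, dtype=torch.float32)
```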
@@ -61,15 +65,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 9193f2c05cc2..d0de14c22eb5 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -22,7 +22,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, From 66dadc1d9063a39224a939deb682fdca4642c64c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 18:30:09 +0000 Subject: [PATCH 071/119] fix copies --- .../pipelines/shap_e/pipeline_shap_e.py | 2 +- src/diffusers/schedulers/scheduling_ddim.py | 19 ++++++++++++++++--- .../schedulers/scheduling_ddim_inverse.py | 19 ++++++++++++++++--- .../schedulers/scheduling_ddim_parallel.py | 19 ++++++++++++++++--- src/diffusers/schedulers/scheduling_ddpm.py | 4 ++-- .../schedulers/scheduling_ddpm_parallel.py | 19 ++++++++++++++++--- .../schedulers/scheduling_deis_multistep.py | 19 ++++++++++++++++--- .../scheduling_dpmsolver_multistep.py | 19 ++++++++++++++++--- .../scheduling_dpmsolver_multistep_inverse.py | 19 ++++++++++++++++--- .../schedulers/scheduling_dpmsolver_sde.py | 19 ++++++++++++++++--- .../scheduling_dpmsolver_singlestep.py | 19 ++++++++++++++++--- .../scheduling_euler_ancestral_discrete.py | 19 ++++++++++++++++--- .../schedulers/scheduling_euler_discrete.py | 19 ++++++++++++++++--- .../schedulers/scheduling_heun_discrete.py | 3 ++- .../scheduling_k_dpm_2_ancestral_discrete.py | 19 ++++++++++++++++--- .../schedulers/scheduling_k_dpm_2_discrete.py | 19 ++++++++++++++++--- .../schedulers/scheduling_lms_discrete.py | 19 ++++++++++++++++--- src/diffusers/schedulers/scheduling_pndm.py | 19 ++++++++++++++++--- .../schedulers/scheduling_repaint.py | 19 ++++++++++++++++--- src/diffusers/schedulers/scheduling_unclip.py | 19 ++++++++++++++++--- 20 files changed, 277 insertions(+), 55 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index bc8358705053..023b1403d04c 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -232,7 +232,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, - size: int = 64, + size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool 
= True, ): diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index bab6f8acea03..de56116b6c58 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -48,6 +48,10 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -64,15 +68,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 2c9fc036a027..71fa5a48c92b 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -47,6 +47,10 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -63,15 +67,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 22b7d8ec97dc..5a8b3b75a4fc 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -48,6 +48,10 @@ class DDIMParallelSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -64,15 +68,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index b9f47ce6c246..544322a7bba0 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -45,10 +45,10 @@ class DDPMSchedulerOutput(BaseOutput): def betas_for_alpha_bar( - num_diffusion_timesteps, + num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp - ): +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 2719d90b9314..8c8c1615784c 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -47,6 +47,10 @@ class DDPMParallelSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -63,15 +67,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 56c362018c18..bedb87f1d0eb 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -27,6 +27,10 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -43,15 +47,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index d7c29d5488a5..e940681beef2 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -27,6 +27,10 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -43,15 +47,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index b424ebbff262..60b5f97b7077 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -27,6 +27,10 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -43,15 +47,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index ae9229981152..70da63f0c4c3 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -77,6 +77,10 @@ def __call__(self, sigma, sigma_next): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -93,15 +97,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 721dd5e5bb85..82a880bef78a 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -30,6 +30,10 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -46,15 +50,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 6b08e9bfc207..9270fcac0fcb 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -48,6 +48,10 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -64,15 +68,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 7237128cbf07..b1266bbad53d 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -48,6 +48,10 @@ class EulerDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -64,15 +68,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index d0de14c22eb5..2b4687505bfb 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -22,12 +22,13 @@ from ..configuration_utils import ConfigMixin, register_to_config from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp -) -> torch.Tensor: +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 2fa0431e1292..d822f1837506 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -25,6 +25,10 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -41,15 +45,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index bb80c4a54bfe..7c9fb4a46c7d 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -24,6 +24,10 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -40,15 +44,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 0656475c3093..29f2753c5a4d 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -46,6 +46,10 @@ class LMSDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -62,15 +66,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index 01c02a21bbfc..9b1839d2d03b 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -26,6 +26,10 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -42,15 +46,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index f2f97b38f3d3..2a7401e58361 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -44,6 +44,10 @@ class RePaintSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -60,15 +64,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index d44edcb1812a..88476820094a 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -45,6 +45,10 @@ class UnCLIPSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", # cosine, exp +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -61,15 +65,24 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) From a06ae4a24565520c764681499df489277d976b11 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 18:35:45 +0000 Subject: [PATCH 072/119] fix --- src/diffusers/schedulers/scheduling_ddim.py | 2 +- src/diffusers/schedulers/scheduling_ddim_inverse.py | 2 +- src/diffusers/schedulers/scheduling_ddim_parallel.py | 2 +- src/diffusers/schedulers/scheduling_ddpm_parallel.py | 2 +- src/diffusers/schedulers/scheduling_deis_multistep.py | 2 +- src/diffusers/schedulers/scheduling_dpmsolver_multistep.py | 2 +- .../schedulers/scheduling_dpmsolver_multistep_inverse.py | 2 +- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 2 +- src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py | 2 +- src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py | 2 +- src/diffusers/schedulers/scheduling_euler_discrete.py | 2 +- .../schedulers/scheduling_k_dpm_2_ancestral_discrete.py | 2 +- src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py | 2 +- src/diffusers/schedulers/scheduling_lms_discrete.py | 2 +- src/diffusers/schedulers/scheduling_pndm.py | 2 +- 
src/diffusers/schedulers/scheduling_repaint.py | 2 +- src/diffusers/schedulers/scheduling_unclip.py | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index de56116b6c58..fe2f1e106006 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -47,7 +47,7 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 71fa5a48c92b..70cc56272a21 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -46,7 +46,7 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 5a8b3b75a4fc..cafc24862c05 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -47,7 +47,7 @@ class DDIMParallelSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 8c8c1615784c..8126a0ebe5c8 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -46,7 +46,7 @@ class DDPMParallelSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index bedb87f1d0eb..6586cb8ebff6 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -26,7 +26,7 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index e940681beef2..aacd41c98738 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -26,7 +26,7 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def 
betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 60b5f97b7077..e34fd056c795 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -26,7 +26,7 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 70da63f0c4c3..f313746bef55 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -76,7 +76,7 @@ def __call__(self, sigma, sigma_next): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 82a880bef78a..3db7973d7fee 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -29,7 +29,7 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 9270fcac0fcb..7d7852bab576 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -47,7 +47,7 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index b1266bbad53d..2029fd6b4ca7 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -47,7 +47,7 @@ class EulerDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index d822f1837506..e7b7d2fa1792 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -24,7 +24,7 @@ # Copied from 
diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 7c9fb4a46c7d..1666d74bd331 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -23,7 +23,7 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 29f2753c5a4d..8220f43f1467 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -45,7 +45,7 @@ class LMSDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index 9b1839d2d03b..d528067aca4e 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -25,7 +25,7 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index 2a7401e58361..e1c31a60c942 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -43,7 +43,7 @@ class RePaintSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index 88476820094a..ac090360bae4 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -44,7 +44,7 @@ class UnCLIPSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", # cosine, exp From b46b3388798aa49b97321b8009d8c8ddf32abddb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 19:21:31 +0000 Subject: [PATCH 073/119] export_to_gif --- .../pipelines/shap_e/pipeline_shap_e.py | 15 +++----------- .../shap_e/pipeline_shap_e_img2img.py | 14 +++---------- src/diffusers/utils/__init__.py | 2 +- src/diffusers/utils/testing_utils.py | 20 +++++++++++++++++++ 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py 
b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 023b1403d04c..5a7cd43881bb 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -41,6 +41,7 @@ ```py >>> import torch >>> from diffusers import ShapEPipeline + >>> from diffusers.utils import export_to_gif >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -58,7 +59,8 @@ ... size=256, ... ).images - >>> pipe.save_gif(images[0], "shark.png") + >>> gif_path = export_to_gif(images, "shark_3d") + >>> print(f"output saved to folder: {gif_path}") ``` """ @@ -211,17 +213,6 @@ def _encode_prompt( return prompt_embeds - @staticmethod - def save_gif(images: List[PIL.Image.Image], image_name: int, save_all=True, optimize=False, duration=100, loop=0): - images[0].save( - f"{image_name}.gif", - save_all=save_all, - append_images=images[1:], - optimize=optimize, - duration=duration, - loop=loop, - ) - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index ec62fc93a60b..664ba879186f 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -41,6 +41,7 @@ >>> from PIL import Image >>> import torch >>> from diffusers import ShapEImg2ImgPipeline + >>> diffusers.utils import export_to_gif >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -59,7 +60,8 @@ ... size=256, ... ).images - >>> pipe.save_gif(images[0], "corgi_out") + >>> gif_path = export_to_gif(images, "corgi_3d") + >>> print(f" output saved to folder: {gif_path}") ``` """ @@ -192,16 +194,6 @@ def _encode_image( return image_embeds - @staticmethod - def save_gif(images: List[PIL.Image.Image], image_name: int, save_all=True, optimize=False, duration=100, loop=0): - images[0].save( - f"{image_name}.gif", - save_all=save_all, - append_images=images[1:], - optimize=optimize, - duration=duration, - loop=loop, - ) @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 36cbe82f79e7..3ba9b94195e1 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -103,7 +103,7 @@ ) from .torch_utils import maybe_allow_in_graph -from .testing_utils import export_to_video +from .testing_utils import export_to_video, export_to_gif logger = get_logger(__name__) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index dcb80169de74..202198abaae4 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -300,6 +300,26 @@ def preprocess_image(image: PIL.Image, batch_size: int): return 2.0 * image - 1.0 +def export_to_gif(images: List[List[PIL.Image.Image]], output_gif_path: str=None) -> str: + if output_gif_path is None: + output_gif_path = tempfile.TemporaryDirectory().name + + output_gif_path = Path(output_gif_path) + + output_gif_path.mkdir(exist_ok=True, parents=True) + + for i, image in enumerate(images): + image[0].save( + output_gif_path / f"out_{i}.gif", + save_all=True, + append_images=image[1:], + optimize=False, + duration=100, + loop=0, + ) + return output_gif_path + + def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str: if is_opencv_available(): import cv2 From 29c0bba16e633ba121f97c6864b1556b98bd02d7 Mon Sep 17 
00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 19:24:22 +0000 Subject: [PATCH 074/119] style --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 1 - src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 4 +--- src/diffusers/utils/__init__.py | 2 +- src/diffusers/utils/testing_utils.py | 8 ++++---- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 5a7cd43881bb..5ca71fda7995 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -60,7 +60,6 @@ ... ).images >>> gif_path = export_to_gif(images, "shark_3d") - >>> print(f"output saved to folder: {gif_path}") ``` """ diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 664ba879186f..dd7241003631 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -41,7 +41,7 @@ >>> from PIL import Image >>> import torch >>> from diffusers import ShapEImg2ImgPipeline - >>> diffusers.utils import export_to_gif + >>> from diffusers.utils import export_to_gif >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -61,7 +61,6 @@ ... ).images >>> gif_path = export_to_gif(images, "corgi_3d") - >>> print(f" output saved to folder: {gif_path}") ``` """ @@ -194,7 +193,6 @@ def _encode_image( return image_embeds - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 3ba9b94195e1..93497e637ebb 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -103,7 +103,7 @@ ) from .torch_utils import maybe_allow_in_graph -from .testing_utils import export_to_video, export_to_gif +from .testing_utils import export_to_gif, export_to_video logger = get_logger(__name__) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 202198abaae4..9ab4984bbd39 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -300,14 +300,14 @@ def preprocess_image(image: PIL.Image, batch_size: int): return 2.0 * image - 1.0 -def export_to_gif(images: List[List[PIL.Image.Image]], output_gif_path: str=None) -> str: +def export_to_gif(images: List[List[PIL.Image.Image]], output_gif_path: str = None) -> str: if output_gif_path is None: output_gif_path = tempfile.TemporaryDirectory().name - + output_gif_path = Path(output_gif_path) output_gif_path.mkdir(exist_ok=True, parents=True) - + for i, image in enumerate(images): image[0].save( output_gif_path / f"out_{i}.gif", @@ -318,7 +318,7 @@ def export_to_gif(images: List[List[PIL.Image.Image]], output_gif_path: str=None loop=0, ) return output_gif_path - + def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str: if is_opencv_available(): From 6d2c2047be0cb1dc97921b30ac15dcbb42675d9c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 20:22:09 +0000 Subject: [PATCH 075/119] second attempt to speed up batching tests --- tests/pipelines/shap_e/test_shap_e.py | 2 +- tests/pipelines/shap_e/test_shap_e_img2img.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 94a678e3929e..5a31f3b00554 100644 --- 
a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -163,7 +163,7 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "prompt": "horse", "generator": generator, - "num_inference_steps": 4, + "num_inference_steps": 1, "size": 64, "output_type": "np", } diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index c4087dfb023b..fc9ca9ae58db 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -177,7 +177,7 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "image": input_image, "generator": generator, - "num_inference_steps": 4, + "num_inference_steps": 1, "size": 64, "output_type": "np", } From 88db77048b7e785746c899d7b1899139f610e66d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 20:31:52 +0000 Subject: [PATCH 076/119] add doc page to index --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 72808df049c9..15d94616e509 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -224,6 +224,8 @@ title: Self-Attention Guidance - local: api/pipelines/semantic_stable_diffusion title: Semantic Guidance + - local: api/pipelines/shap_e + title: Shap-E - local: api/pipelines/spectrogram_diffusion title: Spectrogram Diffusion - sections: From 4978d3c3ef42ce2b82e49e17b10436591d062326 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jul 2023 21:31:20 +0000 Subject: [PATCH 077/119] Remove intermediate clipping --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 5 +---- .../pipelines/shap_e/pipeline_shap_e_img2img.py | 3 --- src/diffusers/schedulers/scheduling_heun_discrete.py | 11 +++++++++++ src/diffusers/utils/testing_utils.py | 6 ++---- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 5ca71fda7995..2cb665ec4c3e 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -59,7 +59,7 @@ ... size=256, ... 
).images - >>> gif_path = export_to_gif(images, "shark_3d") + >>> gif_path = export_to_gif(images, "shark_3d.gif") ``` """ @@ -312,9 +312,6 @@ def __call__( scaled_model_input.shape[2], dim=2 ) # batch_size, num_embeddings, embedding_dim - # clip between -1 and 1 - noise_pred = noise_pred.clamp(-1, 1) - if do_classifier_free_guidance is not None: noise_pred_uncond, noise_pred = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index dd7241003631..33c394f6abe9 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -297,9 +297,6 @@ def __call__( scaled_model_input.shape[2], dim=2 ) # batch_size, num_embeddings, embedding_dim - # clip between -1 and 1 - noise_pred = noise_pred.clamp(-1, 1) - if do_classifier_free_guidance is not None: noise_pred_uncond, noise_pred = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 2b4687505bfb..e7087f6ec007 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -88,6 +88,10 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf). + clip_sample (`bool`, default `True`): + option to clip predicted sample for numerical stability. + clip_sample_range (`float`, default `1.0`): + the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. use_karras_sigmas (`bool`, *optional*, defaults to `False`): This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence @@ -107,6 +111,8 @@ def __init__( trained_betas: Optional[Union[np.ndarray, List[float]]] = None, prediction_type: str = "epsilon", use_karras_sigmas: Optional[bool] = False, + clip_sample: Optional[bool] = False, + clip_sample_range: float = 1.0, sigma_min: Optional[float] = None, sigma_max: Optional[float] = None, ): @@ -346,6 +352,11 @@ def step( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" ) + if self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + if self.state_in_first_order: # 2. 
Convert to an ODE derivative for 1st order derivative = (sample - pred_original_sample) / sigma_hat diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 9ab4984bbd39..8dd3434432db 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -300,17 +300,15 @@ def preprocess_image(image: PIL.Image, batch_size: int): return 2.0 * image - 1.0 -def export_to_gif(images: List[List[PIL.Image.Image]], output_gif_path: str = None) -> str: +def export_to_gif(images: List[PIL.Image.Image], output_gif_path: str = None) -> str: if output_gif_path is None: output_gif_path = tempfile.TemporaryDirectory().name output_gif_path = Path(output_gif_path) - output_gif_path.mkdir(exist_ok=True, parents=True) - for i, image in enumerate(images): image[0].save( - output_gif_path / f"out_{i}.gif", + output_gif_path, save_all=True, append_images=image[1:], optimize=False, From 3330cad32c4aed906fc5b2413387115e58bec0f3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 3 Jul 2023 21:48:07 +0000 Subject: [PATCH 078/119] 3rd attempt to speed up batching tests --- tests/pipelines/shap_e/test_shap_e.py | 4 ++-- tests/pipelines/shap_e/test_shap_e_img2img.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 5a31f3b00554..fb9ca3d4efa3 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -164,7 +164,7 @@ def get_dummy_inputs(self, device, seed=0): "prompt": "horse", "generator": generator, "num_inference_steps": 1, - "size": 64, + "size": 32, "output_type": "np", } return inputs @@ -183,7 +183,7 @@ def test_shap_e(self): image = output.images[0] image_slice = image[0, -3:, -3:, -1] - assert image.shape == (20, 64, 64, 3) + assert image.shape == (20, 32, 32, 3) expected_slice = np.array( [ diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index fc9ca9ae58db..8410fe3be34e 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -178,7 +178,7 @@ def get_dummy_inputs(self, device, seed=0): "image": input_image, "generator": generator, "num_inference_steps": 1, - "size": 64, + "size": 32, "output_type": "np", } return inputs @@ -197,7 +197,7 @@ def test_shap_e(self): image = output.images[0] image_slice = image[0, -3:, -3:, -1] - assert image.shape == (20, 64, 64, 3) + assert image.shape == (20, 32, 32, 3) expected_slice = np.array( [ From bce48afc1db6aec55312d8b7eb68ec3328e92a8e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 10:53:14 +0000 Subject: [PATCH 079/119] Remvoe time index --- .../pipelines/shap_e/pipeline_shap_e.py | 3 +-- .../shap_e/pipeline_shap_e_img2img.py | 3 +-- .../schedulers/scheduling_heun_discrete.py | 20 +++++++++++++------ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 2cb665ec4c3e..b69fd8b91658 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -299,7 +299,7 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t, 
step_index=i) + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred = self.prior( scaled_model_input, @@ -320,7 +320,6 @@ def __call__( noise_pred, timestep=t, sample=latents, - step_index=i, ).prev_sample # YiYi testing only: I don't think we need to return latent for this pipeline diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 33c394f6abe9..2f1222ba210a 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -284,7 +284,7 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred = self.prior( scaled_model_input, @@ -305,7 +305,6 @@ def __call__( noise_pred, timestep=t, sample=latents, - step_index=i, ).prev_sample # YiYi testing only: I don't think we need to return latent for this pipeline diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index e7087f6ec007..01e6a0afad04 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -15,6 +15,7 @@ import math import warnings from typing import List, Optional, Tuple, Union +from collections import defaultdict import numpy as np import torch @@ -140,23 +141,33 @@ def __init__( self.set_timesteps(num_train_timesteps, None, num_train_timesteps) self.use_karras_sigmas = use_karras_sigmas + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps indices = (schedule_timesteps == timestep).nonzero() + # exp beta schedules might have more than twice the same consecutive timestep + # to make sure we select the correct index, let's keep track of counts + self._index_counter[int(timestep)] += 1 + + if self._index_counter[int(timestep)] > 2: + pos = self._index_counter[int(timestep)] - 1 if self.state_in_first_order: pos = -1 else: pos = 0 + return indices[pos].item() def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor], - step_index: Optional[int] = None, ) -> torch.FloatTensor: """ Args: @@ -166,8 +177,7 @@ def scale_model_input( Returns: `torch.FloatTensor`: scaled input sample """ - if step_index is None: - step_index = self.index_for_timestep(timestep) + step_index = self.index_for_timestep(timestep) sigma = self.sigmas[step_index] sample = sample / ((sigma**2 + 1) ** 0.5) @@ -303,7 +313,6 @@ def step( model_output: Union[torch.FloatTensor, np.ndarray], timestep: Union[float, torch.FloatTensor], sample: Union[torch.FloatTensor, np.ndarray], - step_index: Optional[int] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -319,8 +328,7 @@ def step( [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
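The lookup above depends on a timestep value occurring more than once in the schedule (Heun's two-stage update visits each sigma twice, and exp beta schedules can repeat values as well). A minimal, self-contained sketch of the counter idea follows; it is simplified, not the scheduler code itself — the real implementation advances the counter inside `step` and special-cases the very first call.

```py
from collections import defaultdict

import torch

# hypothetical schedule in which each timestep value appears twice
timesteps = torch.tensor([999, 801, 801, 512, 512, 256])
index_counter = defaultdict(int)


def index_for_timestep(timestep):
    indices = (timesteps == timestep).nonzero()
    # repeated calls for the same timestep walk through the matching indices in order
    pos = index_counter[int(timestep)]
    index_counter[int(timestep)] += 1
    return indices[pos].item()


print(index_for_timestep(801))  # 1 (first-order stage)
print(index_for_timestep(801))  # 2 (second-order stage)
```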
""" - if step_index is None: - step_index = self.index_for_timestep(timestep) + step_index = self.index_for_timestep(timestep) if self.state_in_first_order: sigma = self.sigmas[step_index] From 3acaa3dcb8f04c8183dd2d79ebce17a3bf9e0b6b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 11:28:03 +0000 Subject: [PATCH 080/119] simplify scheduler --- .../schedulers/scheduling_dpmsolver_sde.py | 8 +- .../schedulers/scheduling_heun_discrete.py | 86 ++++++------------- .../scheduling_k_dpm_2_ancestral_discrete.py | 8 +- .../schedulers/scheduling_k_dpm_2_discrete.py | 8 +- 4 files changed, 36 insertions(+), 74 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index f313746bef55..4cdf2da5a9a4 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -194,10 +194,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - if self.state_in_first_order: - pos = -1 - else: - pos = 0 + # exp beta schedules might have more than twice the same consecutive timestep + # to make sure we select the correct index, let's keep track of counts + pos = self._index_counter[timestep.cpu().item()] + return indices[pos].item() def scale_model_input( diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 01e6a0afad04..53f9df5291e6 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -13,9 +13,8 @@ # limitations under the License. import math -import warnings -from typing import List, Optional, Tuple, Union from collections import defaultdict +from typing import List, Optional, Tuple, Union import numpy as np import torch @@ -153,14 +152,7 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): # exp beta schedules might have more than twice the same consecutive timestep # to make sure we select the correct index, let's keep track of counts - self._index_counter[int(timestep)] += 1 - - if self._index_counter[int(timestep)] > 2: - pos = self._index_counter[int(timestep)] - 1 - if self.state_in_first_order: - pos = -1 - else: - pos = 0 + pos = self._index_counter[timestep.cpu().item()] return indices[pos].item() @@ -208,27 +200,14 @@ def set_timesteps( log_sigmas = np.log(sigmas) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - if self.config.sigma_min is not None or self.config.sigma_max is not None: - if not self.config.use_karras_sigmas: - warnings.warn( - "`sigma_min` and `sigma_max` will be ignored when `use_karras_sigmas` is set to `False` " - ) - use_log_sigmas = True - else: - use_log_sigmas = False - else: - use_log_sigmas = True - if self.config.use_karras_sigmas: sigmas = self._convert_to_karras( - sigma_min=sigmas[-1].item() if self.config.sigma_min is None else self.config.sigma_min, - sigma_max=sigmas[0].item() if self.config.sigma_max is None else self.config.sigma_max, + sigma_min=sigmas[-1].item(), + sigma_max=sigmas[0].item(), num_inference_steps=self.num_inference_steps, ) - timesteps = np.array( - [self._sigma_to_t(sigma, log_sigmas=log_sigmas if use_log_sigmas else None) for sigma in sigmas] - ) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas=log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = 
torch.from_numpy(sigmas).to(device=device) @@ -250,48 +229,28 @@ def set_timesteps( self.prev_derivative = None self.dt = None + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): - # perform interpolation on sigmas if log_sigmas is not None - if log_sigmas is not None: - # get log sigma - log_sigma = np.log(sigma) + # get log sigma + log_sigma = np.log(sigma) - # get distribution - dists = log_sigma - log_sigmas[:, np.newaxis] + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] - # get sigmas range - low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) - high_idx = low_idx + 1 + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 - low = log_sigmas[low_idx] - high = log_sigmas[high_idx] + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] - # interpolate sigmas - w = (low - log_sigma) / (low - high) - w = np.clip(w, 0, 1) - - # transform interpolation to time range - t = (1 - w) * low_idx + w * high_idx - t = t.reshape(sigma.shape) - - else: - # perform interpolation on alphas_cumprod - - alpha_cumprod = 1.0 / (sigma**2 + 1) - if alpha_cumprod > self.alphas_cumprod[0]: - t = 0 - - elif alpha_cumprod <= self.alphas_cumprod[-1]: - t = len(self.alphas_cumprod) - 1 - - else: - t = np.interp( - alpha_cumprod, - self.alphas_cumprod.numpy()[::-1].copy(), - np.arange(0, len(self.alphas_cumprod))[::-1], - ) - t = int(t) + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) return t def _convert_to_karras(self, sigma_min, sigma_max, num_inference_steps) -> torch.FloatTensor: @@ -330,6 +289,9 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + self._index_counter[timestep.cpu().item()] += 1 + if self.state_in_first_order: sigma = self.sigmas[step_index] sigma_next = self.sigmas[step_index + 1] diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index e7b7d2fa1792..2f6030260c2e 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -134,10 +134,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - if self.state_in_first_order: - pos = -1 - else: - pos = 0 + # exp beta schedules might have more than twice the same consecutive timestep + # to make sure we select the correct index, let's keep track of counts + pos = self._index_counter[timestep.cpu().item()] + return indices[pos].item() def scale_model_input( diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 1666d74bd331..44ad3226489a 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -133,10 +133,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - if self.state_in_first_order: - pos = -1 - else: - pos = 0 + # exp beta schedules might have more than twice the same consecutive timestep + # to make sure we select the correct index, let's keep track 
of counts + pos = self._index_counter[timestep.cpu().item()] + return indices[pos].item() def scale_model_input( From d5c1effe5e69ba2a98c04c20f32fe56aabbba1b4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 11:39:06 +0000 Subject: [PATCH 081/119] Fix more --- src/diffusers/schedulers/scheduling_heun_discrete.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 53f9df5291e6..d987b13ad904 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -140,10 +140,6 @@ def __init__( self.set_timesteps(num_train_timesteps, None, num_train_timesteps) self.use_karras_sigmas = use_karras_sigmas - # for exp beta schedules, such as the one for `pipeline_shap_e.py` - # we need an index counter - self._index_counter = defaultdict(int) - def index_for_timestep(self, timestep, schedule_timesteps=None): if schedule_timesteps is None: schedule_timesteps = self.timesteps @@ -229,6 +225,10 @@ def set_timesteps( self.prev_derivative = None self.dt = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): # get log sigma From 51e04ad2b3b8769b129776b164b3b010797766ef Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 11:41:25 +0000 Subject: [PATCH 082/119] Fix more --- .../schedulers/scheduling_heun_discrete.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index d987b13ad904..181a2637d865 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -113,8 +113,6 @@ def __init__( use_karras_sigmas: Optional[bool] = False, clip_sample: Optional[bool] = False, clip_sample_range: float = 1.0, - sigma_min: Optional[float] = None, - sigma_max: Optional[float] = None, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -197,13 +195,8 @@ def set_timesteps( sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) if self.config.use_karras_sigmas: - sigmas = self._convert_to_karras( - sigma_min=sigmas[-1].item(), - sigma_max=sigmas[0].item(), - num_inference_steps=self.num_inference_steps, - ) - - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas=log_sigmas) for sigma in sigmas]) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = torch.from_numpy(sigmas).to(device=device) @@ -253,9 +246,13 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t - def _convert_to_karras(self, sigma_min, sigma_max, num_inference_steps) -> torch.FloatTensor: + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: """Constructs the noise schedule of Karras et al. 
(2022).""" + sigma_min: float = in_sigmas[-1].item() + sigma_max: float = in_sigmas[0].item() + rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) min_inv_rho = sigma_min ** (1 / rho) From 67ca56c6b0104b5e843e30b5db01a2dac351f1ff Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 12:19:40 +0000 Subject: [PATCH 083/119] fix more --- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 11 ++++++++--- src/diffusers/schedulers/scheduling_heun_discrete.py | 11 ++++++++--- .../scheduling_k_dpm_2_ancestral_discrete.py | 11 ++++++++--- .../schedulers/scheduling_k_dpm_2_discrete.py | 11 ++++++++--- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 4cdf2da5a9a4..ca19b7a50cb7 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -194,9 +194,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # exp beta schedules might have more than twice the same consecutive timestep - # to make sure we select the correct index, let's keep track of counts - pos = self._index_counter[timestep.cpu().item()] + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 + else: + pos = self._index_counter[timestep.cpu().item()] return indices[pos].item() diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 181a2637d865..5b93dfe5e8b5 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -144,9 +144,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # exp beta schedules might have more than twice the same consecutive timestep - # to make sure we select the correct index, let's keep track of counts - pos = self._index_counter[timestep.cpu().item()] + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 + else: + pos = self._index_counter[timestep.cpu().item()] return indices[pos].item() diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 2f6030260c2e..8cd718d6d81b 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -134,9 +134,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # exp beta schedules might have more than twice the same consecutive timestep - # to make sure we select the correct index, let's keep track of counts - pos = self._index_counter[timestep.cpu().item()] + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 + else: + pos = self._index_counter[timestep.cpu().item()] return indices[pos].item() diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 44ad3226489a..45ccdc7e18c2 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -133,9 +133,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # exp beta schedules might have more than twice the same consecutive timestep - # to make sure we select the correct index, let's keep track of counts - pos = self._index_counter[timestep.cpu().item()] + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 + else: + pos = self._index_counter[timestep.cpu().item()] return indices[pos].item() From 7c4c75bfaea102d0d7404078253304f1853410cb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 12:30:57 +0000 Subject: [PATCH 084/119] make style --- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 4 ++-- src/diffusers/schedulers/scheduling_heun_discrete.py | 4 ++-- .../schedulers/scheduling_k_dpm_2_ancestral_discrete.py | 4 ++-- src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py | 4 ++-- tests/pipelines/shap_e/test_shap_e.py | 9 +++------ tests/pipelines/shap_e/test_shap_e_img2img.py | 8 +++----- 6 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index ca19b7a50cb7..92a56bc047fa 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -194,9 +194,9 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # The sigma index that is taken for the **very** first `step` + # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in + # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 5b93dfe5e8b5..27490ca01187 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -144,9 +144,9 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # The sigma index that is taken for the **very** first `step` + # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in + # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 8cd718d6d81b..bd1b8ae859ad 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -134,9 +134,9 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # The sigma index that is taken for the **very** first `step` + # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in + # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 45ccdc7e18c2..909d3a8c6d88 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -133,9 +133,9 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - # The sigma index that is taken for the **very** first `step` + # The sigma index that is taken for the **very** first `step` # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in + # This way we can ensure we don't accidentally skip a sigma in # case we start in the middle of the denoising schedule (e.g. for image-to-image) if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 5a31f3b00554..b78e4fee5588 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -22,14 +22,11 @@ from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline from diffusers.pipelines.shap_e import ShapERenderer from diffusers.utils import load_numpy, slow -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, torch_device from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -enable_full_determinism() - - class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = ShapEPipeline params = ["prompt"] @@ -142,8 +139,8 @@ def get_dummy_components(self): num_train_timesteps=1024, prediction_type="sample", use_karras_sigmas=True, - sigma_max=16.0, - sigma_min=15.0, + clip_sample=True, + clip_sample_range=1.0, ) components = { "prior": prior, diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index fc9ca9ae58db..3cc13a782abe 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -23,14 +23,11 @@ from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline from diffusers.pipelines.shap_e import ShapERenderer from diffusers.utils import floats_tensor, load_image, load_numpy, slow -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, torch_device from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -enable_full_determinism() - - class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = ShapEImg2ImgPipeline params = ["image"] @@ -154,8 +151,9 @@ def get_dummy_components(self): num_train_timesteps=1024, prediction_type="sample", use_karras_sigmas=True, - sigma_max=16.0, sigma_min=15.0, + clip_sample=True, + clip_sample_range=1.0, ) components = { "prior": prior, From a88ea4c7950ecbabff8d9d75fcc1610ab369addd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 12:38:50 +0000 Subject: [PATCH 085/119] fix schedulers --- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 12 +++++++++++- src/diffusers/schedulers/scheduling_heun_discrete.py | 6 ++++-- 
.../scheduling_k_dpm_2_ancestral_discrete.py | 12 +++++++++++- .../schedulers/scheduling_k_dpm_2_discrete.py | 12 +++++++++++- 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 92a56bc047fa..c4fdff578ba6 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -13,6 +13,7 @@ # limitations under the License. import math +from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -201,7 +202,8 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 else: - pos = self._index_counter[timestep.cpu().item()] + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] return indices[pos].item() @@ -278,6 +280,10 @@ def set_timesteps( self.sample = None self.mid_point_sigma = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): return np.exp(-_t) @@ -359,6 +365,10 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 + # Create a noise sampler if it hasn't been created yet if self.noise_sampler is None: min_sigma, max_sigma = self.sigmas[self.sigmas > 0].min(), self.sigmas.max() diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 27490ca01187..d05b602b3f9a 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -151,7 +151,8 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 else: - pos = self._index_counter[timestep.cpu().item()] + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] return indices[pos].item() @@ -292,7 +293,8 @@ def step( step_index = self.index_for_timestep(timestep) # advance index counter by 1 - self._index_counter[timestep.cpu().item()] += 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 if self.state_in_first_order: sigma = self.sigmas[step_index] diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index bd1b8ae859ad..86dad80cb9f7 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -13,6 +13,7 @@ # limitations under the License. 
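The `timestep_int` lines added above exist because the index counter is keyed by plain Python numbers, while `timestep` may arrive either as a 0-dim tensor taken from the schedule or as an int/float passed directly by user code. A small sketch of that normalization, with hypothetical values:

```py
import torch


def as_counter_key(timestep):
    # tensors are converted to plain numbers; ints and floats pass through unchanged
    return timestep.cpu().item() if torch.is_tensor(timestep) else timestep


print(as_counter_key(torch.tensor(999)))  # 999
print(as_counter_key(999))                # 999
print(as_counter_key(999.0))              # 999.0
```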
import math +from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -141,7 +142,8 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 else: - pos = self._index_counter[timestep.cpu().item()] + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] return indices[pos].item() @@ -231,6 +233,10 @@ def set_timesteps( self.sample = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + def sigma_to_t(self, sigma): # get log sigma log_sigma = sigma.log() @@ -281,6 +287,10 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 + if self.state_in_first_order: sigma = self.sigmas[step_index] sigma_interpol = self.sigmas_interpol[step_index] diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 909d3a8c6d88..a6b7474fbe8b 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -13,6 +13,7 @@ # limitations under the License. import math +from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -140,7 +141,8 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): if len(self._index_counter) == 0: pos = 1 if len(indices) > 1 else 0 else: - pos = self._index_counter[timestep.cpu().item()] + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] return indices[pos].item() @@ -220,6 +222,10 @@ def set_timesteps( self.sample = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + def sigma_to_t(self, sigma): # get log sigma log_sigma = sigma.log() @@ -269,6 +275,10 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 + if self.state_in_first_order: sigma = self.sigmas[step_index] sigma_interpol = self.sigmas_interpol[step_index + 1] From 68f70c36b25ae4968c079cc2353686459cb03cfc Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 12:39:43 +0000 Subject: [PATCH 086/119] fix some more tests --- tests/pipelines/shap_e/test_shap_e_img2img.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 3cc13a782abe..b1ed6279dad1 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -151,7 +151,6 @@ def get_dummy_components(self): num_train_timesteps=1024, prediction_type="sample", use_karras_sigmas=True, - sigma_min=15.0, clip_sample=True, clip_sample_range=1.0, ) From ec392477a37b8d56d564bbaf5928911fc788d15e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 13:24:36 +0000 Subject: [PATCH 087/119] finish --- tests/schedulers/test_scheduler_heun.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/tests/schedulers/test_scheduler_heun.py b/tests/schedulers/test_scheduler_heun.py index 2fd50425938f..365a7f677ee6 100644 --- a/tests/schedulers/test_scheduler_heun.py +++ b/tests/schedulers/test_scheduler_heun.py @@ -30,9 +30,13 @@ def test_betas(self): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: + for schedule in ["linear", "scaled_linear", "exp"]: self.check_over_configs(beta_schedule=schedule) + def test_clip_sample(self): + for clip_sample_range in [1.0, 2.0, 3.0]: + self.check_over_configs(clip_sample_range=clip_sample_range, clip_sample=True) + def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: self.check_over_configs(prediction_type=prediction_type) From 9ebf8165cdd2fc3b2857cfe312d23fb4e2879d3f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 16:36:08 +0000 Subject: [PATCH 088/119] add one more test --- tests/schedulers/test_scheduler_heun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/schedulers/test_scheduler_heun.py b/tests/schedulers/test_scheduler_heun.py index 365a7f677ee6..ae0fe26b11ba 100644 --- a/tests/schedulers/test_scheduler_heun.py +++ b/tests/schedulers/test_scheduler_heun.py @@ -38,7 +38,7 @@ def test_clip_sample(self): self.check_over_configs(clip_sample_range=clip_sample_range, clip_sample=True) def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: + for prediction_type in ["epsilon", "v_prediction", "sample"]: self.check_over_configs(prediction_type=prediction_type) def test_full_loop_no_noise(self): From 438946a673fd634e9d1d78d1d20dd911f48f1435 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 5 Jul 2023 07:15:30 -1000 Subject: [PATCH 089/119] Apply suggestions from code review Co-authored-by: Sayak Paul Co-authored-by: Pedro Cuenca Co-authored-by: Patrick von Platen --- docs/source/en/api/pipelines/shap_e.mdx | 5 ++-- src/diffusers/models/prior_transformer.py | 27 ++++++++----------- .../pipelines/shap_e/pipeline_shap_e.py | 6 ++--- .../shap_e/pipeline_shap_e_img2img.py | 2 +- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 0278cbf86f2a..e31d8b9a8cc8 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -11,9 +11,10 @@ specific language governing permissions and limitations under the License. ## Overview -The Shap-E model was proposed in [ Shap-E: Generating Conditional 3D Implicit Functions](https://arxiv.org/abs/2305.02463) by Alex Nichol and Heewon Jun from [OpenAI](https://github.com/openai). -The abstract of paper is the following: +The Shap-E model was proposed in [Shap-E: Generating Conditional 3D Implicit Functions](https://arxiv.org/abs/2305.02463) by Alex Nichol and Heewon Jun from [OpenAI](https://github.com/openai). + +The abstract of the paper is the following: *We present Shap-E, a conditional generative model for 3D assets. Unlike recent work on 3D generative models which produce a single output representation, Shap-E directly generates the parameters of implicit functions that can be rendered as both textured meshes and neural radiance fields. We train Shap-E in two stages: first, we train an encoder that deterministically maps 3D assets into the parameters of an implicit function; second, we train a conditional diffusion model on outputs of the encoder. 
When trained on a large dataset of paired 3D and text data, our resulting models are capable of generating complex and diverse 3D assets in a matter of seconds. When compared to Point-E, an explicit generative model over point clouds, Shap-E converges faster and reaches comparable or better sample quality despite modeling a higher-dimensional, multi-representation output space.* diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index ca587b408a61..3930cf71c83b 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -41,24 +41,19 @@ class PriorTransformer(ModelMixin, ConfigMixin): projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings + additional_embeddings`. dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - time_embed_act_fn (`str`, *optional*, defaults to 'silu'): - the activation function to use to create timestep embedding - norm_in_type (`str`, *optional*, defaults to None): the normalization layer to apply on hidden states before + time_embed_act_fn (`str`, *optional*, defaults to 'silu'): The activation function to use to create timestep embeddings. + norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before passing to Transformer blocks. Set it to `None` if normalization is not needed. - embedding_proj_norm_type (`str`, *optional*, defaults to None): - the normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not + embedding_proj_norm_type (`str`, *optional*, defaults to None): The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not needed. - encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): - the projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if + encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if `encoder_hidden_states` is `None`. - added_emb_type (`str`, *optional*, defaults to `prd`): the additional embedding to condition model. - `prd` indicating higher text-image dot products. if it is `None`, will not prepend additional embedding. - time_embed_dim (`int, *optional*, defaults to None): the dimension of timestep embedding. - If None, will set to `num_attention_heads * attention_head_dim` - embedding_proj_dim (`int`, *optional*, default to None): - the dimension of `proj_embedding`. If None, will set to `embedding_dim` - clip_embed_dim (`int`, *optional*, default to None): - the dimension of output. If None, will set to `embedding_dim` + added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model. + Default is `prd`, indicating higher text-image dot products. If it is `None`, no additional embeddings will be prepended. + time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings. + If None, will be set to `num_attention_heads * attention_head_dim` + embedding_proj_dim (`int`, *optional*, default to None): The dimension of `proj_embedding`. If None, will be set to `embedding_dim`. + clip_embed_dim (`int`, *optional*, default to None): The dimension of the output. If None, will be set to `embedding_dim`. 
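Taken together, the arguments documented above let the prior be configured for Shap-E-style conditioning without encoder hidden states or the extra `prd` token. A hedged construction sketch — the dimensions below are illustrative only, and the `"layer"` value for `norm_in_type` is an assumed supported option rather than something stated in the docstring:

```py
from diffusers import PriorTransformer

# illustrative toy dimensions, not the shipped Shap-E configuration
prior = PriorTransformer(
    num_attention_heads=2,
    attention_head_dim=16,       # inner dim = 2 * 16 = 32
    num_layers=2,
    embedding_dim=32,
    num_embeddings=8,
    additional_embeddings=0,
    norm_in_type="layer",        # assumed value; applies a LayerNorm before the blocks
    encoder_hid_proj_type=None,  # no encoder_hidden_states conditioning
    added_emb_type=None,         # no extra prd embedding prepended
    time_embed_dim=128,
    embedding_proj_dim=16,
    clip_embed_dim=64,
)
print(sum(p.numel() for p in prior.parameters()))
```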
""" @register_to_config @@ -139,7 +134,7 @@ def __init__( elif norm_in_type is None: self.norm_in = None else: - raise ValueError(f"{norm_in_type} does not exist.") + raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.") self.norm_out = nn.LayerNorm(inner_dim) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index b69fd8b91658..37a1be1c3464 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -45,7 +45,7 @@ >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - >>> repo = "YiYiXu/shap-e" + >>> repo = "openai/shap-e" >>> pipe = ShapEPipeline.from_pretrained(repo) >>> pipe = pipe.to(device) @@ -79,7 +79,7 @@ class ShapEPipelineOutput(BaseOutput): class ShapEPipeline(DiffusionPipeline): """ - Pipeline for generating latent representation of a 3D asset with Shap.E + Pipeline for generating latent representation of a 3D asset with Shap-E This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) @@ -234,7 +234,7 @@ def __call__( The prompt or prompts to guide the image generation. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - num_inference_steps (`int`, *optional*, defaults to 100): + num_inference_steps (`int`, *optional*, defaults to 25): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 2f1222ba210a..947777f6dd14 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -45,7 +45,7 @@ >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - >>> repo = "YiYiXu/shap-e-img2img" + >>> repo = "openai/shap-e-img2img" >>> pipe = ShapEImg2ImgPipeline.from_pretrained(repo) >>> pipe = pipe.to(device) From 1dab07537a88ae3d1e1fdb596d9c620517e47a1c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 18:13:03 +0000 Subject: [PATCH 090/119] style --- src/diffusers/models/prior_transformer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 3930cf71c83b..7994eeb1b033 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -41,19 +41,25 @@ class PriorTransformer(ModelMixin, ConfigMixin): projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings + additional_embeddings`. dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - time_embed_act_fn (`str`, *optional*, defaults to 'silu'): The activation function to use to create timestep embeddings. + time_embed_act_fn (`str`, *optional*, defaults to 'silu'): + The activation function to use to create timestep embeddings. norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before passing to Transformer blocks. Set it to `None` if normalization is not needed. 
- embedding_proj_norm_type (`str`, *optional*, defaults to None): The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not + embedding_proj_norm_type (`str`, *optional*, defaults to None): + The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not needed. - encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if + encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): + The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if `encoder_hidden_states` is `None`. added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model. - Default is `prd`, indicating higher text-image dot products. If it is `None`, no additional embeddings will be prepended. + Default is `prd`, indicating higher text-image dot products. If it is `None`, no additional embeddings will + be prepended. time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings. If None, will be set to `num_attention_heads * attention_head_dim` - embedding_proj_dim (`int`, *optional*, default to None): The dimension of `proj_embedding`. If None, will be set to `embedding_dim`. - clip_embed_dim (`int`, *optional*, default to None): The dimension of the output. If None, will be set to `embedding_dim`. + embedding_proj_dim (`int`, *optional*, default to None): + The dimension of `proj_embedding`. If None, will be set to `embedding_dim`. + clip_embed_dim (`int`, *optional*, default to None): + The dimension of the output. If None, will be set to `embedding_dim`. """ @register_to_config From ca2b5409c5d878095ffcea57a4b20adaceb470b9 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 19:42:48 +0000 Subject: [PATCH 091/119] apply feedbacks --- src/diffusers/pipelines/shap_e/camera.py | 14 ++++++ .../pipelines/shap_e/pipeline_shap_e.py | 49 ++++++++++++++++--- .../shap_e/pipeline_shap_e_img2img.py | 15 +++--- src/diffusers/pipelines/shap_e/renderer.py | 2 +- src/diffusers/schedulers/scheduling_ddpm.py | 4 +- src/diffusers/utils/testing_utils.py | 25 +++++----- 6 files changed, 79 insertions(+), 30 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/camera.py b/src/diffusers/pipelines/shap_e/camera.py index cfe86fb7c2cf..7ef0d6607022 100644 --- a/src/diffusers/pipelines/shap_e/camera.py +++ b/src/diffusers/pipelines/shap_e/camera.py @@ -1,3 +1,17 @@ +# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from dataclasses import dataclass from typing import Tuple diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 37a1be1c3464..5c2210da419e 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,13 +40,13 @@ Examples: ```py >>> import torch - >>> from diffusers import ShapEPipeline + >>> from diffusers import DiffusionPipeline >>> from diffusers.utils import export_to_gif >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") >>> repo = "openai/shap-e" - >>> pipe = ShapEPipeline.from_pretrained(repo) + >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) >>> pipe = pipe.to(device) >>> guidance_scale = 15.0 @@ -59,7 +59,7 @@ ... size=256, ... ).images - >>> gif_path = export_to_gif(images, "shark_3d.gif") + >>> gif_path = export_to_gif(images[0], "shark_3d.gif") ``` """ @@ -79,7 +79,7 @@ class ShapEPipelineOutput(BaseOutput): class ShapEPipeline(DiffusionPipeline): """ - Pipeline for generating latent representation of a 3D asset with Shap-E + Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) @@ -94,6 +94,8 @@ class ShapEPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). scheduler ([`HeunDiscreteScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. + renderer ([`ShapERenderer`]): + Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object with NeRF rendering method """ def __init__( @@ -113,7 +115,8 @@ def __init__( scheduler=scheduler, renderer=renderer, ) - + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) @@ -139,11 +142,40 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.text_encoder, + self.renderer ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
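A minimal usage sketch of the offloading helper added here, assuming a CUDA device, `accelerate>=0.17.0`, and the `openai/shap-e` checkpoint from the example docstring above (`size` is the frame resolution argument as named at this point in the series):

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)

# Moves one whole sub-model (text encoder, prior, renderer) to the GPU only while it runs,
# instead of keeping the full pipeline resident in GPU memory.
pipe.enable_model_cpu_offload()

images = pipe(
    "a shark",
    guidance_scale=15.0,
    num_inference_steps=64,
    size=256,
).images
```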
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.prior, self.renderer]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + @property def _execution_device(self): r""" @@ -195,8 +227,7 @@ def _encode_prompt( prompt_embeds = text_encoder_output.text_embeds prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) - # in Shap-E it normalize the prompt_embeds and then later rescale it, not sure why - # YiYi TO-DO: move rescale out of prior_transformer and apply it here + # in Shap-E it normalize the prompt_embeds and then later rescale it prompt_embeds = prompt_embeds / torch.linalg.norm(prompt_embeds, dim=-1, keepdim=True) if do_classifier_free_guidance: @@ -250,6 +281,8 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + size (`int`, *optional*, default to 64): + the width and height of each image frame of the generated 3d output output_type (`str`, *optional*, defaults to `"pt"`): The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 947777f6dd14..c0ce01c794a9 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,13 +40,13 @@ ```py >>> from PIL import Image >>> import torch - >>> from diffusers import ShapEImg2ImgPipeline + >>> from diffusers import DiffusionPipeline >>> from diffusers.utils import export_to_gif >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") >>> repo = "openai/shap-e-img2img" - >>> pipe = ShapEImg2ImgPipeline.from_pretrained(repo) + >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) >>> pipe = pipe.to(device) >>> guidance_scale = 3.0 @@ -60,7 +60,7 @@ ... size=256, ... ).images - >>> gif_path = export_to_gif(images, "corgi_3d") + >>> gif_path = export_to_gif(images[0], "corgi_3d.gif") ``` """ @@ -80,7 +80,7 @@ class ShapEPipelineOutput(BaseOutput): class ShapEImg2ImgPipeline(DiffusionPipeline): """ - Pipeline for generating latent representation of a 3D asset with Shap.E + Pipeline for generating latent representation of a 3D asset and rendering with NeRF method with Shap-E This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) @@ -95,6 +95,8 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). scheduler ([`HeunDiscreteScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. + renderer ([`ShapERenderer`]): + Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object with NeRF rendering method """ def __init__( @@ -114,7 +116,8 @@ def __init__( scheduler=scheduler, renderer=renderer, ) - + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 4547cb778837..ecc9f26769c7 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 75be169cb54f..a1b7d7aaa9c2 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -47,7 +47,7 @@ class DDPMSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -61,6 +61,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 8dd3434432db..64eb3ac925e9 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -300,21 +300,18 @@ def preprocess_image(image: PIL.Image, batch_size: int): return 2.0 * image - 1.0 -def export_to_gif(images: List[PIL.Image.Image], output_gif_path: str = None) -> str: +def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None) -> str: if output_gif_path is None: - output_gif_path = tempfile.TemporaryDirectory().name - - output_gif_path = Path(output_gif_path) - - for i, image in enumerate(images): - image[0].save( - output_gif_path, - save_all=True, - append_images=image[1:], - optimize=False, - duration=100, - loop=0, - ) + output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name + + image[0].save( + output_gif_path, + save_all=True, + append_images=image[1:], + optimize=False, + duration=100, + loop=0, + ) return output_gif_path From 1dfb3f814fed5a673a34e9bd07d4c3e5cc32908c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 19:44:32 +0000 Subject: [PATCH 092/119] style --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 13 ++++++------- .../pipelines/shap_e/pipeline_shap_e_img2img.py | 5 +++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 5c2210da419e..ea3b59ffd68e 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -27,6 +27,7 @@ from ...utils import ( BaseOutput, is_accelerate_available, + is_accelerate_version, logging, randn_tensor, replace_example_docstring, @@ -95,7 +96,8 @@ class ShapEPipeline(DiffusionPipeline): scheduler ([`HeunDiscreteScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. renderer ([`ShapERenderer`]): - Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object with NeRF rendering method + Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object + with NeRF rendering method """ def __init__( @@ -115,7 +117,7 @@ def __init__( scheduler=scheduler, renderer=renderer, ) - + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -140,10 +142,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") - models = [ - self.text_encoder, - self.renderer - ] + models = [self.text_encoder, self.renderer] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) @@ -282,7 +281,7 @@ def __call__( 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. size (`int`, *optional*, default to 64): - the width and height of each image frame of the generated 3d output + the width and height of each image frame of the generated 3d output output_type (`str`, *optional*, defaults to `"pt"`): The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). 
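For reference, the reworked `export_to_gif` helper in `testing_utils.py` above now takes a single list of PIL frames and writes one gif, falling back to a temporary `.gif` file when no path is given; a minimal sketch with synthetic frames (the solid-color images are purely illustrative):

```py
from PIL import Image
from diffusers.utils import export_to_gif

# Stand-in for one entry of `pipe(...).images`, i.e. the list of PIL frames for a single prompt.
frames = [Image.new("RGB", (64, 64), color) for color in ("red", "green", "blue")]

gif_path = export_to_gif(frames, "demo.gif")  # explicit output path
tmp_path = export_to_gif(frames)              # a temporary .gif file name is generated
```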
diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index c0ce01c794a9..956008d79a71 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -96,7 +96,8 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): scheduler ([`HeunDiscreteScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. renderer ([`ShapERenderer`]): - Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object with NeRF rendering method + Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object + with NeRF rendering method """ def __init__( @@ -116,7 +117,7 @@ def __init__( scheduler=scheduler, renderer=renderer, ) - + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: From 1a82d1ea084bcaee848ea397b6482d0bb8dd653a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 19:45:48 +0000 Subject: [PATCH 093/119] fix copies --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 1 + src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 1 + src/diffusers/schedulers/scheduling_ddim.py | 4 +++- src/diffusers/schedulers/scheduling_ddim_inverse.py | 4 +++- src/diffusers/schedulers/scheduling_ddim_parallel.py | 4 +++- src/diffusers/schedulers/scheduling_ddpm_parallel.py | 4 +++- src/diffusers/schedulers/scheduling_deis_multistep.py | 4 +++- src/diffusers/schedulers/scheduling_dpmsolver_multistep.py | 4 +++- .../schedulers/scheduling_dpmsolver_multistep_inverse.py | 4 +++- src/diffusers/schedulers/scheduling_dpmsolver_sde.py | 4 +++- src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py | 4 +++- .../schedulers/scheduling_euler_ancestral_discrete.py | 4 +++- src/diffusers/schedulers/scheduling_euler_discrete.py | 4 +++- src/diffusers/schedulers/scheduling_heun_discrete.py | 4 +++- .../schedulers/scheduling_k_dpm_2_ancestral_discrete.py | 4 +++- src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py | 4 +++- src/diffusers/schedulers/scheduling_lms_discrete.py | 4 +++- src/diffusers/schedulers/scheduling_pndm.py | 4 +++- src/diffusers/schedulers/scheduling_repaint.py | 4 +++- src/diffusers/schedulers/scheduling_unclip.py | 4 +++- 20 files changed, 56 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index ea3b59ffd68e..77ab1924d557 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -126,6 +126,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents.to(device) + latents = latents * scheduler.init_noise_sigma return latents diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 956008d79a71..ec1b55c3a098 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -126,6 +126,7 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents.shape != shape: raise ValueError(f"Unexpected latents shape, 
got {latents.shape}, expected {shape}") latents = latents.to(device) + latents = latents * scheduler.init_noise_sigma return latents diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 6e75f2425e3c..a93255ca600e 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -50,7 +50,7 @@ class DDIMSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -64,6 +64,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 70cc56272a21..c04aabe035b5 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -49,7 +49,7 @@ class DDIMSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -63,6 +63,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index dd65aa34e890..db3ea0e1cca5 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -50,7 +50,7 @@ class DDIMParallelSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -64,6 +64,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 41c783cf0079..a92e175877d2 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -49,7 +49,7 @@ class DDPMParallelSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -63,6 +63,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 9d1930088e62..36947294922b 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -29,7 +29,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -43,6 +43,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 85659ae25a1b..d7516fa601e1 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -29,7 +29,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -43,6 +43,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index e34fd056c795..a6736b354419 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -29,7 +29,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -43,6 +43,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 20a65714fa8b..a31e97b69651 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -80,7 +80,7 @@ def __call__(self, sigma, sigma_next): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -94,6 +94,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 3db7973d7fee..93975a27fc6e 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -32,7 +32,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -46,6 +46,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 742b086d7fe0..065f657032e6 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -50,7 +50,7 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -64,6 +64,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 514e925617f1..cb126d4b953c 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -50,7 +50,7 @@ class EulerDiscreteSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -64,6 +64,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 86869a3cedd6..5f694fd60fc9 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -27,7 +27,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -41,6 +41,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 3569d80f84cb..bdf9379b9b90 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -28,7 +28,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -42,6 +42,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index e3e26cf93d20..a6a1b4e6640d 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -27,7 +27,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -41,6 +41,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 8fa4ab35561a..d58d4ce45bd1 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -48,7 +48,7 @@ class LMSDiscreteSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -62,6 +62,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index b1b59589e0d2..794eb3674c1b 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -28,7 +28,7 @@ def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -42,6 +42,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index e1c31a60c942..41e7450d2df6 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -46,7 +46,7 @@ class RePaintSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -60,6 +60,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index ac090360bae4..b2dbc7a26f33 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -47,7 +47,7 @@ class UnCLIPSchedulerOutput(BaseOutput): def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, - alpha_transform_type="cosine", # cosine, exp + alpha_transform_type="cosine", ): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of @@ -61,6 +61,8 @@ def betas_for_alpha_bar( num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs From 172eeebe2f0a2ad6159c89d3c7cb708320fd13dd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 20:56:09 +0000 Subject: [PATCH 094/119] add one example --- docs/source/en/api/pipelines/shap_e.mdx | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index e31d8b9a8cc8..d4ea973edf1f 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -37,3 +37,42 @@ The original codebase can be found [here](https://github.com/openai/shap-e) - __call__ +## Use Examples: + +In the following, we will walk you through some examples of how to use Shap-E pipelines to create 3D objects in gif format. + +#### Text-to-Image generation + +We can use [`ShapEPipeline`] to create 3D object based on a text prompt. In this example, we will make a birthday cupcake. The workflow to use the Shap-E text-to-image pipeline is same as how you would use other text-to-image pipelines in diffusers. + +```python +import torch + +from diffusers import DiffusionPipeline + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +repo = "openai/shap-e" +pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) +pipe = pipe.to(device) + +guidance_scale = 15.0 +prompt = "A birthday cupcake" + +images = pipe( + prompt, + guidance_scale=guidance_scale, + num_inference_steps= 64, + size = 256, + ).images +``` + +The output of [`ShapEPipeline`] is a list of lists of images frames. Each list of frames can be used to create a 3D object. Let's use the `export_to_gif` utility function in diffusers to make a 3D cupcake! + +```python +from diffusers.utils import export_to_gif + +gif_path=export_to_gif(images[0], "cupcake_3d.gif") +``` + +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/cake_out.gif) From 4171e6c4a1396c9d007d3b5733b33638f0fbec7d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 20:56:50 +0000 Subject: [PATCH 095/119] style --- docs/source/en/api/pipelines/shap_e.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index d4ea973edf1f..09d9b677ab85 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -50,7 +50,7 @@ import torch from diffusers import DiffusionPipeline -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") repo = "openai/shap-e" pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) @@ -60,11 +60,11 @@ guidance_scale = 15.0 prompt = "A birthday cupcake" images = pipe( - prompt, + prompt, guidance_scale=guidance_scale, - num_inference_steps= 64, - size = 256, - ).images + num_inference_steps=64, + size=256, +).images ``` The output of [`ShapEPipeline`] is a list of lists of images frames. Each list of frames can be used to create a 3D object. Let's use the `export_to_gif` utility function in diffusers to make a 3D cupcake! @@ -72,7 +72,7 @@ The output of [`ShapEPipeline`] is a list of lists of images frames. 
Each list o ```python from diffusers.utils import export_to_gif -gif_path=export_to_gif(images[0], "cupcake_3d.gif") +gif_path = export_to_gif(images[0], "cupcake_3d.gif") ``` ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/cake_out.gif) From 39e8aa84e5c7d5d388312506906a22105fd887bf Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 21:14:36 +0000 Subject: [PATCH 096/119] add example for img2img --- docs/source/en/api/pipelines/shap_e.mdx | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 09d9b677ab85..d6cffdc76445 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -76,3 +76,37 @@ gif_path = export_to_gif(images[0], "cupcake_3d.gif") ``` ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/cake_out.gif) + + +### Image-to-Image generation + +You can use [`ShapEImg2ImgPipeline`] along with other text-to-image pipelines in diffusers and turn your 2D generation into 3D. + +In this example, We will first genrate a cheese burger with a simple prompt "A cheeseburger, white background" and then use the shap-e image-to-image pipeline to make a 3D cheeseburger :) + +```python +from PIL import Image +import torch +from diffusers import DiffusionPipeline +from diffusers.utils import export_to_gif + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +repo = "openai/shap-e-img2img" +pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) +pipe = pipe.to(device) + +guidance_scale = 3.0 +image = Image.open("burger.png").resize((256, 256)) + +images = pipe( + image, + guidance_scale=guidance_scale, + num_inference_steps=64, + size=256, +).images + +gif_path = export_to_gif(images[0], "burger_3d.gif") +``` +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_in.png) +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_out.gif) \ No newline at end of file From c604ab60cd762bb0bfb313c327ddc973658badc1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 21:31:26 +0000 Subject: [PATCH 097/119] fix doc --- docs/source/en/api/pipelines/shap_e.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index d6cffdc76445..9457f546d6c1 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -41,7 +41,7 @@ The original codebase can be found [here](https://github.com/openai/shap-e) In the following, we will walk you through some examples of how to use Shap-E pipelines to create 3D objects in gif format. -#### Text-to-Image generation +### Text-to-Image generation We can use [`ShapEPipeline`] to create 3D object based on a text prompt. In this example, we will make a birthday cupcake. The workflow to use the Shap-E text-to-image pipeline is same as how you would use other text-to-image pipelines in diffusers. 
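For reference, the `alpha_transform_type` argument documented across the scheduler files in the `fix copies` patch above can be exercised directly through the module-level helper; a minimal sketch (the timestep count is illustrative):

```python
from diffusers.schedulers.scheduling_heun_discrete import betas_for_alpha_bar

# Both calls produce a beta schedule derived from a cumulative alpha_bar function;
# only the transform used to build alpha_bar differs.
cosine_betas = betas_for_alpha_bar(1024, alpha_transform_type="cosine")  # default
exp_betas = betas_for_alpha_bar(1024, alpha_transform_type="exp")

assert len(cosine_betas) == len(exp_betas) == 1024
```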
From 2016fe0ad979229d3866a9c2f1a1995caf0a5700 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 21:53:50 +0000 Subject: [PATCH 098/119] fix more doc strings --- src/diffusers/models/prior_transformer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 7994eeb1b033..bc6f631bd4c6 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -36,7 +36,7 @@ class PriorTransformer(ModelMixin, ConfigMixin): num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use. embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states` num_embeddings (`int`, *optional*, defaults to 77): - the number of embeddings of the model input `hidden_states` + The number of embeddings of the model input `hidden_states` additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings + additional_embeddings`. @@ -52,8 +52,9 @@ class PriorTransformer(ModelMixin, ConfigMixin): The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if `encoder_hidden_states` is `None`. added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model. - Default is `prd`, indicating higher text-image dot products. If it is `None`, no additional embeddings will - be prepended. + Choose from `prd` or `None`. if choose `prd`, it will prepend a token indicating the (quantized) dot + product between the text embedding and image embedding as proposed in the unclip paper + https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended. time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings. If None, will be set to `num_attention_heads * attention_head_dim` embedding_proj_dim (`int`, *optional*, default to None): From 00119be84acbb225d41131d38c441543b34a4b3d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 23:24:25 +0000 Subject: [PATCH 099/119] size -> frame_size --- docs/source/en/api/pipelines/shap_e.mdx | 4 ++-- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 10 +++++----- .../pipelines/shap_e/pipeline_shap_e_img2img.py | 9 ++++++--- tests/pipelines/shap_e/test_shap_e.py | 6 +++--- tests/pipelines/shap_e/test_shap_e_img2img.py | 6 +++--- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 9457f546d6c1..ecd45ba80c25 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -63,7 +63,7 @@ images = pipe( prompt, guidance_scale=guidance_scale, num_inference_steps=64, - size=256, + frame_size=256, ).images ``` @@ -103,7 +103,7 @@ images = pipe( image, guidance_scale=guidance_scale, num_inference_steps=64, - size=256, + frame_size=256, ).images gif_path = export_to_gif(images[0], "burger_3d.gif") diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 77ab1924d557..dd7f228643ff 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -57,7 +57,7 @@ ... prompt, ... guidance_scale=guidance_scale, ... num_inference_steps=64, - ... size=256, + ... 
frame_size=256, ... ).images >>> gif_path = export_to_gif(images[0], "shark_3d.gif") @@ -143,7 +143,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") - models = [self.text_encoder, self.renderer] + models = [self.text_encoder, self.prior] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) @@ -253,7 +253,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, - size: int = 64, + frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool = True, ): @@ -281,7 +281,7 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - size (`int`, *optional*, default to 64): + frame_size (`int`, *optional*, default to 64): the width and height of each image frame of the generated 3d output output_type (`str`, *optional*, defaults to `"pt"`): The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` @@ -364,7 +364,7 @@ def __call__( image = self.renderer.decode( latent[None, :], device, - size=size, + size=frame_size, ray_batch_size=4096, n_coarse_samples=64, n_fine_samples=128, diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index ec1b55c3a098..7a7b06cf1704 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -57,7 +57,7 @@ ... image, ... guidance_scale=guidance_scale, ... num_inference_steps=64, - ... size=256, + ... frame_size=256, ... ).images >>> gif_path = export_to_gif(images[0], "corgi_3d.gif") @@ -145,6 +145,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): models = [ self.image_encoder, + self.prior ] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: @@ -208,7 +209,7 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, guidance_scale: float = 4.0, - size: int = 64, + frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent return_dict: bool = True, ): @@ -236,6 +237,8 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + frame_size (`int`, *optional*, default to 64): + the width and height of each image frame of the generated 3d output output_type (`str`, *optional*, defaults to `"pt"`): The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` (`torch.Tensor`). 
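A minimal end-to-end sketch of the renamed argument, mirroring the updated docs above (checkpoint, prompt, and values are taken from those examples); `frame_size` controls the resolution of every rendered frame in both pipelines:

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_gif

pipe = DiffusionPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16).to("cuda")

images = pipe(
    "A birthday cupcake",
    guidance_scale=15.0,
    num_inference_steps=64,
    frame_size=256,  # previously `size`: width and height of each rendered frame
).images

gif_path = export_to_gif(images[0], "cupcake_3d.gif")
```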
@@ -322,7 +325,7 @@ def __call__( image = self.renderer.decode( latent[None, :], device, - size=size, + size=frame_size, ray_batch_size=4096, n_coarse_samples=64, n_fine_samples=128, diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 11fe2afbbc37..319eb5ad032d 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -37,7 +37,7 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "generator", "latents", "guidance_scale", - "size", + "frame_size", "output_type", "return_dict", ] @@ -161,7 +161,7 @@ def get_dummy_inputs(self, device, seed=0): "prompt": "horse", "generator": generator, "num_inference_steps": 1, - "size": 32, + "frame_size": 32, "output_type": "np", } return inputs @@ -253,7 +253,7 @@ def test_shap_e(self): generator = torch.Generator(device=torch_device).manual_seed(0) images = pipe( - "a shark", generator=generator, guidance_scale=15.0, num_inference_steps=64, size=64, output_type="np" + "a shark", generator=generator, guidance_scale=15.0, num_inference_steps=64, frame_size=64, output_type="np" ).images[0] assert images.shape == (20, 64, 64, 3) diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 0e7d5cb5b218..0c990f4b1324 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -38,7 +38,7 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "generator", "latents", "guidance_scale", - "size", + "frame_size", "output_type", "return_dict", ] @@ -175,7 +175,7 @@ def get_dummy_inputs(self, device, seed=0): "image": input_image, "generator": generator, "num_inference_steps": 1, - "size": 32, + "frame_size": 32, "output_type": "np", } return inputs @@ -269,7 +269,7 @@ def test_shap_e_img2img(self): generator = torch.Generator(device=torch_device).manual_seed(0) images = pipe( - input_image, generator=generator, guidance_scale=3.0, num_inference_steps=64, size=64, output_type="np" + input_image, generator=generator, guidance_scale=3.0, num_inference_steps=64, frame_size=64, output_type="np" ).images[0] assert images.shape == (20, 64, 64, 3) From 37416733935a5c2a03c019997af569be12e5a6fb Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 23:25:06 +0000 Subject: [PATCH 100/119] style --- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 5 +---- tests/pipelines/shap_e/test_shap_e.py | 7 ++++++- tests/pipelines/shap_e/test_shap_e_img2img.py | 7 ++++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 7a7b06cf1704..7376e2eb77ac 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -143,10 +143,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") - models = [ - self.image_encoder, - self.prior - ] + models = [self.image_encoder, self.prior] for cpu_offloaded_model in models: if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 319eb5ad032d..a09a598544ce 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -253,7 +253,12 @@ def test_shap_e(self): generator = 
torch.Generator(device=torch_device).manual_seed(0) images = pipe( - "a shark", generator=generator, guidance_scale=15.0, num_inference_steps=64, frame_size=64, output_type="np" + "a shark", + generator=generator, + guidance_scale=15.0, + num_inference_steps=64, + frame_size=64, + output_type="np", ).images[0] assert images.shape == (20, 64, 64, 3) diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 0c990f4b1324..3fb1c2456ef8 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -269,7 +269,12 @@ def test_shap_e_img2img(self): generator = torch.Generator(device=torch_device).manual_seed(0) images = pipe( - input_image, generator=generator, guidance_scale=3.0, num_inference_steps=64, frame_size=64, output_type="np" + input_image, + generator=generator, + guidance_scale=3.0, + num_inference_steps=64, + frame_size=64, + output_type="np", ).images[0] assert images.shape == (20, 64, 64, 3) From fd2aa2b5d960b55890c18d416b33156675b8ae89 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 23:36:21 +0000 Subject: [PATCH 101/119] update doc --- docs/source/en/api/pipelines/shap_e.mdx | 34 ++++++++++++++++++++----- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index ecd45ba80c25..28864caaa46f 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -82,19 +82,40 @@ gif_path = export_to_gif(images[0], "cupcake_3d.gif") You can use [`ShapEImg2ImgPipeline`] along with other text-to-image pipelines in diffusers and turn your 2D generation into 3D. -In this example, We will first genrate a cheese burger with a simple prompt "A cheeseburger, white background" and then use the shap-e image-to-image pipeline to make a 3D cheeseburger :) +In this example, We will first genrate a cheeseburger with a simple prompt "A cheeseburger, white background" ```python -from PIL import Image -import torch from diffusers import DiffusionPipeline -from diffusers.utils import export_to_gif +import torch -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) +pipe_prior.to("cuda") + +t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) +t2i_pipe.to("cuda") + +prompt = "A cheeseburger, white background" + +image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple() +image = t2i_pipe( + prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, +).images[0] + +image.save("burger.png") + +``` + +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_in.png) + +we will then use the Shap-E image-to-image pipeline to turn it into a 3D cheeseburger :) + +```python +from PIL import Image +from diffusers.utils import export_to_gif repo = "openai/shap-e-img2img" pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) -pipe = pipe.to(device) +pipe = pipe.to("cuda") guidance_scale = 3.0 image = Image.open("burger.png").resize((256, 256)) @@ -108,5 +129,4 @@ images = pipe( gif_path = export_to_gif(images[0], "burger_3d.gif") ``` -![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_in.png) 
![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_out.gif) \ No newline at end of file From 4d4ad1fda4a3c29c438ecffdfa27653ef1e00267 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 23:36:53 +0000 Subject: [PATCH 102/119] style --- docs/source/en/api/pipelines/shap_e.mdx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 28864caaa46f..c634daeb48b3 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -98,11 +98,12 @@ prompt = "A cheeseburger, white background" image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple() image = t2i_pipe( - prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, + prompt, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, ).images[0] image.save("burger.png") - ``` ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_in.png) From 51506bd31454089ead28543fd30de3a18c07a86c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 5 Jul 2023 23:45:56 +0000 Subject: [PATCH 103/119] fix on doc --- docs/source/en/api/pipelines/shap_e.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index c634daeb48b3..6da9487221fc 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -27,11 +27,12 @@ The original codebase can be found [here](https://github.com/openai/shap-e) | [pipeline_shap_e.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e.py) | *Text-to-Image Generation* | | [pipeline_shap_e_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py) | *Image-to-Image Generation* | -## ShapEPipeline +### ShapEPipeline [[autodoc]] ShapEPipeline - all - __call__ +### ShapEImg2ImgPipeline [[autodoc]] ShapEImg2ImgPipeline - all - __call__ From b4a92f4b244934f5728839000ce085c4cd5da977 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 01:17:47 +0000 Subject: [PATCH 104/119] update repo name --- tests/pipelines/shap_e/test_shap_e.py | 2 +- tests/pipelines/shap_e/test_shap_e_img2img.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index a09a598544ce..198a35915002 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -246,7 +246,7 @@ def test_shap_e(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/shap_e/test_shap_e_np_out.npy" ) - pipe = ShapEPipeline.from_pretrained("YiYiXu/shap-e") + pipe = ShapEPipeline.from_pretrained("openai/shap-e") pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 3fb1c2456ef8..63284f14a863 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -262,7 +262,7 @@ def test_shap_e_img2img(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/shap_e/test_shap_e_img2img_out.npy" ) - pipe = 
ShapEImg2ImgPipeline.from_pretrained("YiYiXu/shap-e-img2img") + pipe = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img") pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) From f36c668257f9537773c08bac7bc771f5dba1a20d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 6 Jul 2023 10:59:25 +0530 Subject: [PATCH 105/119] improve the usage example in shap-e img2img --- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 7376e2eb77ac..c418b3bb8640 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -41,7 +41,7 @@ >>> from PIL import Image >>> import torch >>> from diffusers import DiffusionPipeline - >>> from diffusers.utils import export_to_gif + >>> from diffusers.utils import export_to_gif, load_image >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -50,8 +50,8 @@ >>> pipe = pipe.to(device) >>> guidance_scale = 3.0 - >>> image = Image.open("corgi.png") - + >>> image_url = "https://hf.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png" + >>> image = load_image(image_url).convert("RGB") >>> images = pipe( ... image, From 82cd50e0f707af4cff25eedd0b8e0f0020a00672 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 6 Jul 2023 11:09:07 +0530 Subject: [PATCH 106/119] add usage examples in the shap-e docs. --- docs/source/en/api/pipelines/shap_e.mdx | 72 ++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 6da9487221fc..15a5cd138f57 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -18,7 +18,7 @@ The abstract of the paper is the following: *We present Shap-E, a conditional generative model for 3D assets. Unlike recent work on 3D generative models which produce a single output representation, Shap-E directly generates the parameters of implicit functions that can be rendered as both textured meshes and neural radiance fields. We train Shap-E in two stages: first, we train an encoder that deterministically maps 3D assets into the parameters of an implicit function; second, we train a conditional diffusion model on outputs of the encoder. When trained on a large dataset of paired 3D and text data, our resulting models are capable of generating complex and diverse 3D assets in a matter of seconds. When compared to Point-E, an explicit generative model over point clouds, Shap-E converges faster and reaches comparable or better sample quality despite modeling a higher-dimensional, multi-representation output space.* -The original codebase can be found [here](https://github.com/openai/shap-e) +The original codebase can be found [here](https://github.com/openai/shap-e). 
## Available Pipelines: @@ -27,6 +27,76 @@ The original codebase can be found [here](https://github.com/openai/shap-e) | [pipeline_shap_e.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e.py) | *Text-to-Image Generation* | | [pipeline_shap_e_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py) | *Image-to-Image Generation* | +## Available checkpoints + +* [`openai/shap-e`](https://huggingface.co/openai/shap-e) +* [`openai/shap-e-img2img`](https://huggingface.co/openai/shap-e-img2img) + +## Usage examples + +### Text-to-3D image generation + +```python +import torch +from diffusers import ShapEPipeline +from diffusers.utils import export_to_gif + + +ckpt_id = "openai/shap-e" +pipe = ShapEPipeline.from_pretrained(repo).to("cuda") + + +guidance_scale = 15.0 +prompt = "a shark" +images = pipe( + prompt, + guidance_scale=guidance_scale, + num_inference_steps=64, + size=256, +).images + +gif_path = export_to_gif(images, "shark_3d.gif") +``` + +| **Prompt** | **Image** | +|---|---| +| A shark | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/shark_3d.gif) | +| A bird | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/bird_3d.gif) | +| A bowl of vegetables | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/veg_3d.gif) | + +### 3D image generation from a 2D image + +```pythonimport torch +from diffusers import ShapEImg2ImgPipeline +from diffusers.utils import export_to_gif, load_image + +ckpt_id = "openai/shap-e-img2img" +pipe = ShapEImg2ImgPipeline.from_pretrained(repo).to("cuda") + +img_url = "https://hf.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png" +image = load_image(img_url) + +generator = torch.Generator(device="cuda").manual_seed(0) +batch_size = 4 +guidance_scale = 3.0 + +images = pipe( + image, + num_images_per_prompt=batch_size, + generator=generator, + guidance_scale=guidance_scale, + num_inference_steps=64, + size=256, + output_type="pil" +).images + +gif_path = export_to_gif(images, "corgi_sampled_3d.gif") +``` + +| **Reference corgi image
in 2D** | **Sampled image in 3D (one)** | **Sampled image
in 3D (two)** | +|---|---|---| +| ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi_sampled_3d.gif) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi_sampled_3d_two.gif) | + ### ShapEPipeline [[autodoc]] ShapEPipeline - all From c271f62e4775368a24174836a6d80bb83707a56f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 6 Jul 2023 12:39:05 +0530 Subject: [PATCH 107/119] consolidate examples. --- docs/source/en/api/pipelines/shap_e.mdx | 92 ++++--------------------- 1 file changed, 13 insertions(+), 79 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 15a5cd138f57..56188abaa701 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -32,87 +32,11 @@ The original codebase can be found [here](https://github.com/openai/shap-e). * [`openai/shap-e`](https://huggingface.co/openai/shap-e) * [`openai/shap-e-img2img`](https://huggingface.co/openai/shap-e-img2img) -## Usage examples - -### Text-to-3D image generation - -```python -import torch -from diffusers import ShapEPipeline -from diffusers.utils import export_to_gif - - -ckpt_id = "openai/shap-e" -pipe = ShapEPipeline.from_pretrained(repo).to("cuda") - - -guidance_scale = 15.0 -prompt = "a shark" -images = pipe( - prompt, - guidance_scale=guidance_scale, - num_inference_steps=64, - size=256, -).images - -gif_path = export_to_gif(images, "shark_3d.gif") -``` - -| **Prompt** | **Image** | -|---|---| -| A shark | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/shark_3d.gif) | -| A bird | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/bird_3d.gif) | -| A bowl of vegetables | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/veg_3d.gif) | - -### 3D image generation from a 2D image - -```pythonimport torch -from diffusers import ShapEImg2ImgPipeline -from diffusers.utils import export_to_gif, load_image - -ckpt_id = "openai/shap-e-img2img" -pipe = ShapEImg2ImgPipeline.from_pretrained(repo).to("cuda") - -img_url = "https://hf.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png" -image = load_image(img_url) - -generator = torch.Generator(device="cuda").manual_seed(0) -batch_size = 4 -guidance_scale = 3.0 - -images = pipe( - image, - num_images_per_prompt=batch_size, - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=64, - size=256, - output_type="pil" -).images - -gif_path = export_to_gif(images, "corgi_sampled_3d.gif") -``` - -| **Reference corgi image
in 2D** | **Sampled image in 3D (one)** | **Sampled image
in 3D (two)** | -|---|---|---| -| ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi_sampled_3d.gif) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi_sampled_3d_two.gif) | - -### ShapEPipeline -[[autodoc]] ShapEPipeline - - all - - __call__ - -### ShapEImg2ImgPipeline -[[autodoc]] ShapEImg2ImgPipeline - - all - - __call__ - - -## Use Examples: +## Use Examples In the following, we will walk you through some examples of how to use Shap-E pipelines to create 3D objects in gif format. -### Text-to-Image generation +### Text-to-3D image generation We can use [`ShapEPipeline`] to create 3D object based on a text prompt. In this example, we will make a birthday cupcake. The workflow to use the Shap-E text-to-image pipeline is same as how you would use other text-to-image pipelines in diffusers. @@ -201,4 +125,14 @@ images = pipe( gif_path = export_to_gif(images[0], "burger_3d.gif") ``` -![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_out.gif) \ No newline at end of file +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_out.gif) + +### ShapEPipeline +[[autodoc]] ShapEPipeline + - all + - __call__ + +### ShapEImg2ImgPipeline +[[autodoc]] ShapEImg2ImgPipeline + - all + - __call__ \ No newline at end of file From 8c7a2760eb2863913ea1c04b7bcce5ff7023e140 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 6 Jul 2023 12:54:42 +0530 Subject: [PATCH 108/119] minor fix. --- docs/source/en/api/pipelines/shap_e.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 56188abaa701..60a3606e3609 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -32,7 +32,7 @@ The original codebase can be found [here](https://github.com/openai/shap-e). * [`openai/shap-e`](https://huggingface.co/openai/shap-e) * [`openai/shap-e-img2img`](https://huggingface.co/openai/shap-e-img2img) -## Use Examples +## Usage Examples In the following, we will walk you through some examples of how to use Shap-E pipelines to create 3D objects in gif format. @@ -127,12 +127,12 @@ gif_path = export_to_gif(images[0], "burger_3d.gif") ``` ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_out.gif) -### ShapEPipeline +## ShapEPipeline [[autodoc]] ShapEPipeline - all - __call__ -### ShapEImg2ImgPipeline +## ShapEImg2ImgPipeline [[autodoc]] ShapEImg2ImgPipeline - all - __call__ \ No newline at end of file From c8b08acb77ea6a91c450f2c5f61bdd92d71f14f2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Jul 2023 08:54:02 +0000 Subject: [PATCH 109/119] update doc --- docs/source/en/api/pipelines/shap_e.mdx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx index 60a3606e3609..fcb32da31bca 100644 --- a/docs/source/en/api/pipelines/shap_e.mdx +++ b/docs/source/en/api/pipelines/shap_e.mdx @@ -38,7 +38,7 @@ In the following, we will walk you through some examples of how to use Shap-E pi ### Text-to-3D image generation -We can use [`ShapEPipeline`] to create 3D object based on a text prompt. In this example, we will make a birthday cupcake. 
The workflow to use the Shap-E text-to-image pipeline is same as how you would use other text-to-image pipelines in diffusers. +We can use [`ShapEPipeline`] to create 3D object based on a text prompt. In this example, we will make a birthday cupcake for :firecracker: diffusers library's 1 year birthday. The workflow to use the Shap-E text-to-image pipeline is same as how you would use other text-to-image pipelines in diffusers. ```python import torch @@ -52,7 +52,7 @@ pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) pipe = pipe.to(device) guidance_scale = 15.0 -prompt = "A birthday cupcake" +prompt = ["A firecracker", "A birthday cupcake"] images = pipe( prompt, @@ -67,9 +67,10 @@ The output of [`ShapEPipeline`] is a list of lists of images frames. Each list o ```python from diffusers.utils import export_to_gif -gif_path = export_to_gif(images[0], "cupcake_3d.gif") +export_to_gif(images[0], "firecracker_3d.gif") +export_to_gif(images[1], "cake_3d.gif") ``` - +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/firecracker_out.gif) ![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/cake_out.gif) From ce4dbd28a099d34a0c3687ece124f0b5258b34ea Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 13:32:23 +0200 Subject: [PATCH 110/119] Apply suggestions from code review --- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index c418b3bb8640..7a0827ab0f7f 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -312,7 +312,6 @@ def __call__( sample=latents, ).prev_sample - # YiYi testing only: I don't think we need to return latent for this pipeline if output_type == "latent": return ShapEPipelineOutput(images=latents) From addff79311a06c2fb1474e8270fe9ce56ac011b6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 13:35:42 +0200 Subject: [PATCH 111/119] Apply suggestions from code review --- src/diffusers/models/attention.py | 3 --- src/diffusers/models/prior_transformer.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 4f2500894cd3..6b05bf35e87f 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -58,7 +58,6 @@ def __init__( only_cross_attention: bool = False, double_self_attention: bool = False, upcast_attention: bool = False, - upcast_softmax: bool = False, norm_elementwise_affine: bool = True, norm_type: str = "layer_norm", final_dropout: bool = False, @@ -91,7 +90,6 @@ def __init__( bias=attention_bias, cross_attention_dim=cross_attention_dim if only_cross_attention else None, upcast_attention=upcast_attention, - upcast_softmax=upcast_softmax, ) # 2. 
Cross-Attn @@ -112,7 +110,6 @@ def __init__( dropout=dropout, bias=attention_bias, upcast_attention=upcast_attention, - upcast_softmax=upcast_softmax, ) # is self-attn if encoder_hidden_states is none else: self.norm2 = None diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index bc6f631bd4c6..39edd1852f71 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -81,7 +81,6 @@ def __init__( time_embed_dim: Optional[int] = None, embedding_proj_dim: Optional[int] = None, clip_embed_dim: Optional[int] = None, - upcast_softmax: bool = False, ): super().__init__() self.num_attention_heads = num_attention_heads @@ -130,7 +129,6 @@ def __init__( dropout=dropout, activation_fn="gelu", attention_bias=True, - upcast_softmax=upcast_softmax, ) for d in range(num_layers) ] From 8b97857d7d37a884df41cb5884a415d15ab2fd0c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 11:36:07 +0000 Subject: [PATCH 112/119] remove upcast --- tests/pipelines/shap_e/test_shap_e.py | 1 - tests/pipelines/shap_e/test_shap_e_img2img.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/pipelines/shap_e/test_shap_e.py b/tests/pipelines/shap_e/test_shap_e.py index 198a35915002..d095dd9d49b9 100644 --- a/tests/pipelines/shap_e/test_shap_e.py +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -99,7 +99,6 @@ def dummy_prior(self): "norm_in_type": "layer", "encoder_hid_proj_type": None, "added_emb_type": None, - "upcast_softmax": True, } model = PriorTransformer(**model_kwargs) diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index 63284f14a863..f6638a994fdd 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -111,7 +111,6 @@ def dummy_prior(self): "embedding_proj_norm_type": "layer", "encoder_hid_proj_type": None, "added_emb_type": None, - "upcast_softmax": True, } model = PriorTransformer(**model_kwargs) From c0912dc2de23cc9206213ba2a7b307f81e123e2d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 12:00:18 +0000 Subject: [PATCH 113/119] Make sure background is white --- src/diffusers/pipelines/shap_e/renderer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index ecc9f26769c7..8b075e671f63 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -568,9 +568,9 @@ def __init__( act_fn: str = "swish", insert_direction_at: int = 4, background: Tuple[float] = ( - 0.0, - 0.0, - 0.0, + 255.0, + 255.0, + 255.0, ), ): super().__init__() From 5c609e61c0a59d2d72af04d4acef4873ddae0257 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 14:05:07 +0200 Subject: [PATCH 114/119] Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index dd7f228643ff..3959053c2bac 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -355,7 +355,6 @@ def __call__( sample=latents, ).prev_sample - # YiYi testing only: I don't think we need to return latent for this pipeline if output_type == "latent": return ShapEPipelineOutput(images=latents) From 
eb7225c3c7864d8602345ccee2956094f242375a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 14:10:03 +0200 Subject: [PATCH 115/119] Apply suggestions from code review --- src/diffusers/models/prior_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 39edd1852f71..43dd3538b003 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -117,8 +117,10 @@ def __init__( if added_emb_type == "prd": self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) - else: + elif added_emb_type is None: self.prd_embedding = None + else: + raise ValueError(f"`added_embed_type`: {added_embed_type} is not supported. Make sure to choose one of `"prd"` or `None`.) self.transformer_blocks = nn.ModuleList( [ From 49e1123c1bc1bfc93be3f201ad3bcaf6dc104ccc Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 12:11:35 +0000 Subject: [PATCH 116/119] Finish --- src/diffusers/models/prior_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 43dd3538b003..72adf13b79c1 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -120,7 +120,7 @@ def __init__( elif added_emb_type is None: self.prd_embedding = None else: - raise ValueError(f"`added_embed_type`: {added_embed_type} is not supported. Make sure to choose one of `"prd"` or `None`.) + raise ValueError(f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`.") self.transformer_blocks = nn.ModuleList( [ From ad5d7b1a53ded6427d1323b0373e88e6edfcdd80 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 14:37:18 +0200 Subject: [PATCH 117/119] Apply suggestions from code review --- src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 7a0827ab0f7f..b99b808e5953 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -96,8 +96,8 @@ class ShapEImg2ImgPipeline(DiffusionPipeline): scheduler ([`HeunDiscreteScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. 
renderer ([`ShapERenderer`]): - Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object - with NeRF rendering method + Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects + with the NeRF rendering method """ def __init__( From 2bd69b62a98d57e502b7994be479d2982e0af717 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 14:37:39 +0200 Subject: [PATCH 118/119] Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py --- src/diffusers/pipelines/shap_e/pipeline_shap_e.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 3959053c2bac..5d96fc7bb9f4 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -96,8 +96,8 @@ class ShapEPipeline(DiffusionPipeline): scheduler ([`HeunDiscreteScheduler`]): A scheduler to be used in combination with `prior` to generate image embedding. renderer ([`ShapERenderer`]): - Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D object - with NeRF rendering method + Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects + with the NeRF rendering method """ def __init__( From f338eda9ae2d9fb83081180a77bfec0ca32e5f32 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 6 Jul 2023 14:38:15 +0200 Subject: [PATCH 119/119] Make style --- src/diffusers/models/prior_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 72adf13b79c1..9f3c61dd7561 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -120,7 +120,9 @@ def __init__( elif added_emb_type is None: self.prd_embedding = None else: - raise ValueError(f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`.") + raise ValueError( + f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`." + ) self.transformer_blocks = nn.ModuleList( [
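For reference, a minimal sketch of how the `added_emb_type` guard touched up in the last patches behaves. The tiny config values and the `"learned"` string below are illustrative placeholders only; they are not values used anywhere in this series.

```python
from diffusers.models.prior_transformer import PriorTransformer

# Deliberately small, placeholder config so the model is cheap to construct.
common = {
    "num_attention_heads": 2,
    "attention_head_dim": 8,
    "num_layers": 2,
    "embedding_dim": 16,
    "num_embeddings": 2,
    "additional_embeddings": 0,
}

# `added_emb_type="prd"` (the default) appends a learned PRD token; `None` skips it.
prior = PriorTransformer(**common, added_emb_type=None)

# Any other value now raises at construction time instead of silently
# falling back to a model without the PRD token.
try:
    PriorTransformer(**common, added_emb_type="learned")
except ValueError as err:
    print(err)
```

Raising in the constructor surfaces a configuration mistake immediately, rather than letting it show up later as a shape mismatch inside the denoising loop.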