Merged
Changes from all commits
61 commits
42d49af
integrated sdxl for the text2video-zero pipeline
Aug 21, 2023
7221015
Merge branch 'main' of https://github.com/huggingface/diffusers into …
Aug 21, 2023
4469455
make fix-copies
Aug 23, 2023
e00f93e
Merge branch 'main' of https://github.com/huggingface/diffusers into …
Aug 25, 2023
0591367
fixed CI issues
Aug 25, 2023
14479ad
make fix-copies
Aug 25, 2023
fe50922
Merge branch 'main' of https://github.com/huggingface/diffusers into …
Aug 25, 2023
b19d32d
added docs and `copied from` statements
vahramtadevosyan Aug 28, 2023
cf7aa3b
added fast tests
vahramtadevosyan Aug 29, 2023
97aa3d2
synced with original code
vahramtadevosyan Aug 29, 2023
c1e86e8
Merge branch 'main' into text2video-zero-sdxl
vahramtadevosyan Aug 29, 2023
c8e2838
Merge branch 'text2video-zero-sdxl' of github.com:vahramtadevosyan/di…
vahramtadevosyan Aug 29, 2023
71fe6cc
made a small change in docs
vahramtadevosyan Aug 29, 2023
898f42e
quality+style check fix
vahramtadevosyan Aug 29, 2023
20bb7aa
updated docs. added controlnet inference with sdxl
vahramtadevosyan Aug 30, 2023
40d9ad3
Merge branch 'main' of https://github.com/huggingface/diffusers into …
vahramtadevosyan Aug 30, 2023
1794494
added device compatibility for fast tests
vahramtadevosyan Aug 30, 2023
dc08a70
Merge branch 'main' of https://github.com/huggingface/diffusers into …
vahramtadevosyan Aug 30, 2023
6aa65f1
fixed docstrings
vahramtadevosyan Aug 31, 2023
94ffc44
Merge branch 'main' of https://github.com/huggingface/diffusers into …
vahramtadevosyan Aug 31, 2023
d3288b0
changing vae upcasting
vahramtadevosyan Aug 31, 2023
804d9f0
remove torch.empty_cache to speed up inference
vahramtadevosyan Aug 31, 2023
f8ae9aa
Merge branch 'main' into text2video-zero-sdxl
vahramtadevosyan Aug 31, 2023
8350c85
Merge branch 'main' into text2video-zero-sdxl
vahramtadevosyan Sep 11, 2023
9698d16
made fast tests to run on dummy models only, fixed copied from statem…
vahramtadevosyan Sep 11, 2023
357e87f
fixed testing utils imports
vahramtadevosyan Sep 11, 2023
1de7d55
Added bullet points for SDXL support
vahramtadevosyan Sep 12, 2023
03c9178
Merge branch 'main' of https://github.com/huggingface/diffusers into …
vahramtadevosyan Sep 12, 2023
2159e73
fixed formatting & quality
vahramtadevosyan Sep 12, 2023
2e73398
Update tests/pipelines/text_to_video/test_text_to_video_zero_sdxl.py
vahramtadevosyan Sep 18, 2023
ec99303
Update tests/pipelines/text_to_video/test_text_to_video_zero_sdxl.py
vahramtadevosyan Sep 18, 2023
972b172
Merge branch 'main' into text2video-zero-sdxl
vahramtadevosyan Sep 18, 2023
0c59ff9
Merge branch 'main' into text2video-zero-sdxl
patrickvonplaten Sep 19, 2023
3804a64
fixed minor error for merging
vahramtadevosyan Oct 17, 2023
d9737aa
Merge branch 'main' of https://github.com/huggingface/diffusers into …
vahramtadevosyan Oct 17, 2023
b552865
fixed updates of sdxl
vahramtadevosyan Oct 17, 2023
eddf8a8
made fast tests inherit from `PipelineTesterMixin` and run in 3-4secs…
vahramtadevosyan Oct 18, 2023
0304d75
Merge branch 'main' of https://github.com/huggingface/diffusers into …
vahramtadevosyan Oct 18, 2023
2635868
make style && make quality
vahramtadevosyan Oct 18, 2023
9f4c8cd
reimplemented fast tests w/o default attn processor
vahramtadevosyan Oct 27, 2023
ba65002
make style & make quality
vahramtadevosyan Oct 27, 2023
a32641d
Merge branch 'main' of github.com:huggingface/diffusers into text2vid…
vahramtadevosyan Oct 27, 2023
0a919f9
make fix-copies
vahramtadevosyan Oct 27, 2023
deae67c
make fix-copies
vahramtadevosyan Oct 27, 2023
3167053
fixed docs
vahramtadevosyan Oct 27, 2023
fdc3ee8
make style & make quality & make fix-copies
vahramtadevosyan Oct 27, 2023
371093d
Merge branch 'main' into text2video-zero-sdxl
DN6 Oct 30, 2023
fa14652
bug fix in cross attention
vahramtadevosyan Oct 30, 2023
028d1f5
Merge branch 'main' into text2video-zero-sdxl
DN6 Nov 10, 2023
de7c207
make style && make quality
vahramtadevosyan Nov 10, 2023
78e8a44
make fix-copies
vahramtadevosyan Nov 10, 2023
b22091e
Merge branch 'main' into text2video-zero-sdxl
patrickvonplaten Nov 20, 2023
9ee0304
Merge branch 'main' of github.com:huggingface/diffusers into text2vid…
vahramtadevosyan Nov 21, 2023
d395fa0
Merge branch 'text2video-zero-sdxl' of github.com:vahramtadevosyan/di…
vahramtadevosyan Nov 21, 2023
52461ae
Merge branch 'main' of github.com:huggingface/diffusers into text2vid…
vahramtadevosyan Nov 21, 2023
2ae9893
fix gpu issues
vahramtadevosyan Nov 22, 2023
ed78680
make fix-copies
vahramtadevosyan Nov 22, 2023
3890c34
Merge branch 'main' into text2video-zero-sdxl
patrickvonplaten Nov 27, 2023
46ac35e
Merge branch 'main' of github.com:huggingface/diffusers into text2vid…
vahramtadevosyan Nov 27, 2023
0f2c69e
Merge branch 'main' of github.com:huggingface/diffusers into text2vid…
vahramtadevosyan Nov 28, 2023
f60409c
updated pipeline signature
vahramtadevosyan Nov 28, 2023
46 changes: 45 additions & 1 deletion docs/source/en/api/pipelines/text_to_video_zero.md
@@ -92,6 +92,19 @@ imageio.mimsave("video.mp4", result, fps=4)
```


- #### SDXL Support
To use the SDXL model when generating a video from a prompt, use the `TextToVideoZeroSDXLPipeline`:

```python
import torch
from diffusers import TextToVideoZeroSDXLPipeline

model_id = "stabilityai/stable-diffusion-xl-base-1.0"
pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
).to("cuda")
```

Contributor: Does this work out of the box?

Collaborator: Hmm, good catch. @vahramtadevosyan Can you confirm this works out of the box with SDXL base?

Contributor Author: @DN6 @patrickvonplaten, can you explain what you mean by working out of the box?
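
Generation with the SDXL pipeline then follows the same pattern as the `TextToVideoZeroPipeline` example earlier on this page; a minimal sketch, assuming the SDXL variant returns frames the same way (a list of float arrays in [0, 1]):

```python
import imageio

prompt = "A panda is playing guitar on times square"
result = pipe(prompt=prompt).images
# scale frames from [0, 1] floats to uint8 before writing the video
result = [(r * 255).astype("uint8") for r in result]
imageio.mimsave("video.mp4", result, fps=4)
```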

### Text-To-Video with Pose Control
To generate a video from prompt with additional pose control

@@ -141,7 +154,33 @@ To generate a video from prompt with additional pose control
result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
imageio.mimsave("video.mp4", result, fps=4)
```

- #### SDXL Support

Since our attention processor also works with SDXL, it can be used to generate a video from a prompt using ControlNet models powered by SDXL, reusing the `pose_images` extracted in the pose-control example above:
```python
import torch
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor

controlnet_model_id = 'thibaud/controlnet-openpose-sdxl-1.0'
model_id = 'stabilityai/stable-diffusion-xl-base-1.0'

controlnet = ControlNetModel.from_pretrained(controlnet_model_id, torch_dtype=torch.float16)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
model_id, controlnet=controlnet, torch_dtype=torch.float16
).to('cuda')

# Set the attention processor
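# batch_size=2 accounts for the unconditional/conditional pair used by classifier-free guidance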
pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))

# fix latents for all frames
latents = torch.randn((1, 4, 128, 128), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)

prompt = "Darth Vader dancing in a desert"
result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
imageio.mimsave("video.mp4", result, fps=4)
```

### Text-To-Video with Edge Control

@@ -253,5 +292,10 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
- all
- __call__

## TextToVideoZeroSDXLPipeline
[[autodoc]] TextToVideoZeroSDXLPipeline
- all
- __call__

## TextToVideoPipelineOutput
[[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput
Member: We need to add TextToVideoZeroSDXLPipeline in the docs too, otherwise it won't be reflected :-).

2 changes: 2 additions & 0 deletions src/diffusers/__init__.py
@@ -279,6 +279,7 @@
"StableUnCLIPPipeline",
"TextToVideoSDPipeline",
"TextToVideoZeroPipeline",
"TextToVideoZeroSDXLPipeline",
"UnCLIPImageVariationPipeline",
"UnCLIPPipeline",
"UniDiffuserModel",
@@ -628,6 +629,7 @@
StableUnCLIPPipeline,
TextToVideoSDPipeline,
TextToVideoZeroPipeline,
TextToVideoZeroSDXLPipeline,
UnCLIPImageVariationPipeline,
UnCLIPPipeline,
UniDiffuserModel,
2 changes: 2 additions & 0 deletions src/diffusers/pipelines/__init__.py
@@ -162,6 +162,7 @@
_import_structure["text_to_video_synthesis"] = [
"TextToVideoSDPipeline",
"TextToVideoZeroPipeline",
"TextToVideoZeroSDXLPipeline",
"VideoToVideoSDPipeline",
]
_import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"]
@@ -386,6 +387,7 @@
from .text_to_video_synthesis import (
TextToVideoSDPipeline,
TextToVideoZeroPipeline,
TextToVideoZeroSDXLPipeline,
VideoToVideoSDPipeline,
)
from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
2 changes: 2 additions & 0 deletions src/diffusers/pipelines/text_to_video_synthesis/__init__.py
@@ -25,6 +25,7 @@
_import_structure["pipeline_text_to_video_synth"] = ["TextToVideoSDPipeline"]
_import_structure["pipeline_text_to_video_synth_img2img"] = ["VideoToVideoSDPipeline"]
_import_structure["pipeline_text_to_video_zero"] = ["TextToVideoZeroPipeline"]
_import_structure["pipeline_text_to_video_zero_sdxl"] = ["TextToVideoZeroSDXLPipeline"]


if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -38,6 +39,7 @@
from .pipeline_text_to_video_synth import TextToVideoSDPipeline
from .pipeline_text_to_video_synth_img2img import VideoToVideoSDPipeline
from .pipeline_text_to_video_zero import TextToVideoZeroPipeline
from .pipeline_text_to_video_zero_sdxl import TextToVideoZeroSDXLPipeline

else:
import sys
src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
@@ -13,6 +13,7 @@
from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import BaseOutput
from diffusers.utils.torch_utils import randn_tensor


def rearrange_0(tensor, f):
@@ -135,7 +136,7 @@ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_ma

# Cross Frame Attention
if not is_cross_attention:
video_length = key.size()[0] // self.batch_size
video_length = max(1, key.size()[0] // self.batch_size)
first_frame_index = [0] * video_length

# rearrange keys to have batch and frames in the 1st and 2nd dims respectively
@@ -339,7 +340,7 @@ def forward_loop(self, x_t0, t0, t1, generator):
x_t1:
Forward process applied to x_t0 from time t0 to t1.
"""
eps = torch.randn(x_t0.size(), generator=generator, dtype=x_t0.dtype, device=x_t0.device)
eps = randn_tensor(x_t0.size(), generator=generator, dtype=x_t0.dtype, device=x_t0.device)
alpha_vec = torch.prod(self.scheduler.alphas[t0:t1])
x_t1 = torch.sqrt(alpha_vec) * x_t0 + torch.sqrt(1 - alpha_vec) * eps
return x_t1