docs/source/en/optimization/fp16.mdx (12 changes: 6 additions & 6 deletions)

@@ -149,7 +149,7 @@ You may see a small performance boost in VAE decode on multi-image batches. Ther

## Offloading to CPU with accelerate for memory savings

-For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.
+For additional memory savings, you can offload the weights to CPU and only load them to GPU when performing the forward pass.

To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:

@@ -162,16 +162,15 @@ pipe = StableDiffusionPipeline.from_pretrained(

    torch_dtype=torch.float16,
)
-pipe = pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
pipe.enable_sequential_cpu_offload()
image = pipe(prompt).images[0]
```

-And you can get the memory consumption to < 2GB.
+This brings memory consumption down to under 3GB.

-If is also possible to chain it with attention slicing for minimal memory consumption, running it in as little as < 800mb of GPU vRAM:
+It is also possible to chain it with attention slicing for minimal memory consumption (< 2GB):

```Python
import torch
@@ -182,7 +181,6 @@ pipe = StableDiffusionPipeline.from_pretrained(

    torch_dtype=torch.float16,
)
-pipe = pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
pipe.enable_sequential_cpu_offload()
@@ -191,6 +189,8 @@
pipe.enable_attention_slicing(1)
image = pipe(prompt).images[0]
```

+**Note**: When using `enable_sequential_cpu_offload()`, it is important to **not** move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal. See [this issue](https://github.com/huggingface/diffusers/issues/1934) for more information.
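To make the pitfall concrete, here is a minimal sketch of the two call orders (the checkpoint name is just an example; nothing else here goes beyond the APIs used in the snippets above):

```Python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example checkpoint
    torch_dtype=torch.float16,
)

# Don't do this: moving the pipeline to CUDA first materializes every weight
# on the GPU, so the subsequent offload saves almost no memory.
# pipe = pipe.to("cuda")

# Do this instead: leave the pipeline on CPU and let sequential offloading
# move each submodule to the GPU only for its forward pass.
pipe.enable_sequential_cpu_offload()
image = pipe("a photo of an astronaut riding a horse on mars").images[0]
```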

## Using Channels Last memory format

Channels last memory format is an alternative way of ordering NCHW tensors in memory that preserves the dimension ordering. Channels last tensors are ordered so that the channels become the densest dimension (i.e. storing images pixel-per-pixel). Since not all operators currently support the channels last format, using it may result in worse performance, so it's best to try it and see if it works for your model.
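For example, a minimal sketch of switching a pipeline's UNet to channels last (the checkpoint name is an example; the stride check just verifies the conversion):

```Python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example checkpoint
    torch_dtype=torch.float16,
)

# Convert the UNet's parameters to channels last in place.
pipe.unet.to(memory_format=torch.channels_last)

# A channels-last 4D tensor has stride 1 along its channel dimension.
print(pipe.unet.conv_out.weight.stride())
```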
@@ -357,4 +357,4 @@ with torch.inference_mode():

# optional: You can disable it via
# pipe.disable_xformers_memory_efficient_attention()
-```
+```
@@ -211,13 +211,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, device, offload_buffers=True)
 
     @property
     def _execution_device(self):
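For context on these hunks: `cpu_offload` comes from accelerate and wraps a module with hooks that keep its weights on CPU, streaming them to the execution device only for the forward pass. A rough sketch of the pattern the pipelines now follow, assuming `pipe` is an already-loaded `StableDiffusionPipeline` that has been left on CPU:

```Python
import torch
from accelerate import cpu_offload

device = torch.device("cuda:0")

# Hooks keep each submodule's weights on CPU and move them to `device`
# only while that submodule runs its forward pass.
for model in [pipe.unet, pipe.text_encoder, pipe.vae]:
    cpu_offload(model, execution_device=device)

# offload_buffers=True also offloads registered buffers, which is what lets
# the whole safety checker be offloaded instead of just its vision_model.
if pipe.safety_checker is not None:
    cpu_offload(pipe.safety_checker, execution_device=device, offload_buffers=True)
```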
@@ -233,13 +233,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
 
     @property
     def _execution_device(self):
@@ -236,13 +236,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -208,13 +208,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
 
     @property
     def _execution_device(self):
@@ -238,13 +238,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -272,13 +272,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -205,13 +205,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
@@ -137,13 +137,10 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
-            if cpu_offloaded_model is not None:
-                cpu_offload(cpu_offloaded_model, device)
+            cpu_offload(cpu_offloaded_model, device)
 
         if self.safety_checker is not None:
-            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
-            # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model, device)
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device