From 03565688b4b7eae9a26ac5a24009fd8bc82f37d1 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Mon, 13 Nov 2023 17:41:08 +0100 Subject: [PATCH 01/14] begin doc --- docs/source/en/using-diffusers/lcm_lora.md | 326 +++++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 docs/source/en/using-diffusers/lcm_lora.md diff --git a/docs/source/en/using-diffusers/lcm_lora.md b/docs/source/en/using-diffusers/lcm_lora.md new file mode 100644 index 000000000000..00b117be3f9e --- /dev/null +++ b/docs/source/en/using-diffusers/lcm_lora.md @@ -0,0 +1,326 @@ + + +# Performing inference with LCM-LoRA + +Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. + +From the [official website](https://latent-consistency-models.github.io/): + +> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations. + +For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378). + +However, for latent consistency distillation, each model needs to be distilled separately. The core idea with LCM-LoRA is to train just a small number of adapters, known as LoRA layers, instead of the full model. The resulting LoRAs can then be applied to any fine-tuned version of the model without having to distil them separately. Additionally, the LoRAs can be applied to other tasks, such as image-to-image generation, controlnet/t2iadapter, inpainting, animatediff. The LCM-LoRA can also be combined with other style LoRAs, generating styled-images in very few steps. (4-8) + +This guide shows how to perform inference with LCM-LoRAs for +- text-to-image +- image-to-image +- combined with style LoRAs +- controlent/t2iadapter +- inpainting +- animatediff + +## Text-to-image + +You'll use the [`StableDiffusionXLPipeline`] with the scheduler: [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow overcoming the slow iterative nature of diffusion models. + +```python +import torch +from diffusers import DiffusionPipeline, LCMScheduler + +pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + scheduler=LCMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"), + variant="fp16", + torch_dtype=torch.float16 +).to("cuda") + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") + +prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" + +generator = torch.manual_seed(0) +image = pipe( + prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0 +).images[0] +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i.png) + +Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL. + + + +You may have noticed that we set `guidance_scale=1.0`, which disables classifer-free-guidance. This is because the LCM-LoRA is trained with guidance, so the batch size does not have to be doubled in this case. 
This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process. + +You can also use guidance with LCM-LoRA, but due to the nature of training the model is very sensitve to the `guidance_scale` values, high values can lead to artifacts in the generated images. In our experiments, we found that the best values are in the range of [1.0, 2.0]. + + + +### Inference with a fine-tuned model + +As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distil them separately. Let's look at how we can perform inference with a fine-tuned model: + +```python +from diffusers import DiffusionPipeline, LCMScheduler + +pipe = DiffusionPipeline.from_pretrained( + "Linaqruf/animagine-xl", + scheduler=LCMScheduler.from_pretrained("Linaqruf/animagine-xl", subfolder="scheduler"), + variant="fp16", + torch_dtype=torch.float16 +).to("cuda") + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") + +prompt = "face focus, cute, masterpiece, best quality, 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck" + +generator = torch.manual_seed(0) +image = pipe( + prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0 +).images[0] +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_finetuned.png) + + +## Image-to-image + +LCM-LoRA can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use SD-v1-5 model and the LCM-LoRA for SD-v1-5. + +```python +import torch +from diffusers import AutoPipelineForImage2Image, LCMScheduler +from diffusers.utils import make_image_grid, load_image + +pipe = AutoPipelineForImage2Image.from_pretrained( + "Lykon/dreamshaper-7", + scheduler=LCMScheduler.from_pretrained(model_id, subfolder="scheduler"), + torch_dtype=torch.float16, + variant="fp16", +).to(device) + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") + +# prepare image +url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png" +init_image = load_image(url) +prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" + +# pass prompt and image to pipeline +image = pipe(prompt, image=init_image, num_inference_steps=4, guidance_scale=1, strength=0.6).images[0] +make_image_grid([init_image, image], rows=1, cols=2) +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i.png) + + + + +Based on your prompt and the image you provide, you can get different results. To get the best results, we recommend you to try different values for `num_inference_steps`, `strength` and `guidance_scale` parameters and choose the best one. + + + + +## Combined with style LoRAs + +LCM-LoRA can be combined with other style LoRAs, generating styled-images in very few steps. (4-8). In the following example we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL). 
+ +```python +import torch +from diffusers import DiffusionPipeline, LCMScheduler + +pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + scheduler=LCMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"), + variant="fp16", + torch_dtype=torch.float16 +).to("cuda") + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm") +pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut") + +pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8]) + +prompt = "papercut, a cute fox" +image = pipe(prompt, num_inference_steps=4, guidance_scale=1).images[0] +image +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_papercut.png) + + +## Controlnet/t2iadapter + +LCM-LoRA can be used with controlnet/t2iadapter. Let's look at how we can perform inference with controlnet/t2iadapter and LCM-LoRA. + +### Controlnet with SD-v1-5 and LCM-LoRA +For this example we'll use SD-v1-5 model and the LCM-LoRA for SD-v1-5 with canny controlnet. + +```python +import torch +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler +from diffusers.utils import load_image +from PIL import Image +import cv2 +import numpy as np + +image = load_image( + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" +).resize((512, 512)) + +image = np.array(image) + +low_threshold = 100 +high_threshold = 200 + +image = cv2.Canny(image, low_threshold, high_threshold) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image) +canny_image + +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) +pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + scheduler=LCMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler"), + controlnet=controlnet, + torch_dtype=torch.float16, + safety_checker=None, + variant="fp16" +).to("cuda") + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") + +image = pipe( + "the mona lisa", image=canny_image, num_inference_steps=4, guidance_scale=1.5, controlnet_conditioning_scale=0.8, cross_attention_kwargs={"scale": 1}, +).images[0] +make_image_grid([canny_image, image], rows=1, cols=2) +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_controlnet.png) + + + +The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. 
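For example, a quick sweep makes it easy to compare settings side by side. The snippet below is only a sketch that reuses the `pipe` and `canny_image` defined in the example above; the candidate values are illustrative, not recommendations.

```python
# try a few step counts and conditioning scales with a fixed seed
# so the runs stay comparable, then pick the best-looking result
results = []
for steps in (4, 6, 8):
    for cond_scale in (0.6, 0.8, 1.0):
        generator = torch.manual_seed(0)
        image = pipe(
            "the mona lisa",
            image=canny_image,
            num_inference_steps=steps,
            guidance_scale=1.5,
            controlnet_conditioning_scale=cond_scale,
            generator=generator,
        ).images[0]
        results.append(((steps, cond_scale), image))
```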
+ + +### T2iadapter with SDXL and LCM-LoRA + +```python +from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, LCMScheduler +from diffusers.utils import load_image, make_image_grid +from controlnet_aux.canny import CannyDetector +import torch + +# load adapter +adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") + +pipe = StableDiffusionXLAdapterPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + adapter=adapter, + scheduler=LCMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"), + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") +canny_detector = CannyDetector() + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") + +url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" +image = load_image(url) + +# Detect the canny map in low resolution to avoid high-frequency details +canny_image = canny_detector(image, detect_resolution=384, image_resolution=1024) + +prompt = "Mystical fairy in real, magic, 4k picture, high quality" +negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" + +image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + image=canny_image, + num_inference_steps=4, + guidance_scale=1.5, + adapter_conditioning_scale=0.8, + adapter_conditioning_factor=1 +).images[0] +make_image_grid([canny_image, image], rows=1, cols=2) +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_t2iadapter.png) + + +## Inpainting + +LCM-LoRA can be used for inpainting as well. Let's look at how we can perform inpainting with LCM-LoRA. 
+ +```python +import torch +from diffusers import AutoPipelineForInpainting, LCMScheduler +from diffusers.utils import load_image, make_image_grid + +pipe = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", + torch_dtype=torch.float16, + scheduler=LCMScheduler.from_pretrained( "runwayml/stable-diffusion-inpainting", subfolder="scheduler"), + variant="fp16", +).to("cuda") + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") + +# load base and mask image +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png") + +# generator = torch.Generator("cuda").manual_seed(92) +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +image = pipe( + prompt=prompt, + image=init_image, + mask_image=mask_image, + generator=generator, + num_inference_steps=4, + guidance_scale=4, +).images[0] +make_image_grid([init_image, mask_image, image], rows=1, cols=3) +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_inpainting.png) + + +## Animatediff + + +```python +import torch +from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler +from diffusers.utils import export_to_gif + +adapter = MotionAdapter.from_pretrained("diffusers/animatediff-motion-adapter-v1-5") +pipe = AnimateDiffPipeline.from_pretrained( + "frankjoshua/toonyou_beta6", + scheduler=LCMScheduler.from_pretrained("Lykon/dreamshaper-7", subfolder="scheduler"), + motion_adapter=adapter, +).to("cuda") + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm") +pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora") + +pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2]) + +prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" +output = pipe(prompt=prompt, num_inference_steps=5, guidance_scale=1.25, cross_attention_kwargs={"scale": 1}, num_frames=24) +frames = output.frames[0] +export_to_gif(frames, "animation.gif") +``` \ No newline at end of file From f5e851bdb3973472573bb21533f8b109a3478133 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Tue, 14 Nov 2023 12:59:20 +0100 Subject: [PATCH 02/14] fix examples --- docs/source/en/using-diffusers/lcm_lora.md | 61 +++++++++++++++------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/docs/source/en/using-diffusers/lcm_lora.md b/docs/source/en/using-diffusers/lcm_lora.md index 00b117be3f9e..a40c46f65d61 100644 --- a/docs/source/en/using-diffusers/lcm_lora.md +++ b/docs/source/en/using-diffusers/lcm_lora.md @@ -49,13 +49,13 @@ pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" -generator = torch.manual_seed(0) +generator = torch.manual_seed(42) image = pipe( prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0 ).images[0] ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i.png) +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i.png) Notice that we use only 4 steps for generation which is way less 
than what's typically used for standard SDXL. @@ -91,7 +91,7 @@ image = pipe( ).images[0] ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_finetuned.png) +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i_finetuned.png) ## Image-to-image @@ -105,10 +105,10 @@ from diffusers.utils import make_image_grid, load_image pipe = AutoPipelineForImage2Image.from_pretrained( "Lykon/dreamshaper-7", - scheduler=LCMScheduler.from_pretrained(model_id, subfolder="scheduler"), + scheduler=LCMScheduler.from_pretrained("Lykon/dreamshaper-7", subfolder="scheduler"), torch_dtype=torch.float16, variant="fp16", -).to(device) +).to("cuda") pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") @@ -118,11 +118,12 @@ init_image = load_image(url) prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" # pass prompt and image to pipeline -image = pipe(prompt, image=init_image, num_inference_steps=4, guidance_scale=1, strength=0.6).images[0] +generator = torch.manual_seed(0) +image = pipe(prompt, image=init_image, num_inference_steps=4, guidance_scale=1, strength=0.6, generator=generator).images[0] make_image_grid([init_image, image], rows=1, cols=2) ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i.png) +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_i2i.png) @@ -153,11 +154,12 @@ pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safeten pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8]) prompt = "papercut, a cute fox" -image = pipe(prompt, num_inference_steps=4, guidance_scale=1).images[0] +generator = torch.manual_seed(0) +image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0] image ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_papercut.png) +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdx_lora_mix.png) ## Controlnet/t2iadapter @@ -201,14 +203,20 @@ pipe = StableDiffusionControlNetPipeline.from_pretrained( ).to("cuda") pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") - +generator = torch.manual_seed(0) image = pipe( - "the mona lisa", image=canny_image, num_inference_steps=4, guidance_scale=1.5, controlnet_conditioning_scale=0.8, cross_attention_kwargs={"scale": 1}, + "the mona lisa", + image=canny_image, + num_inference_steps=4, + guidance_scale=1.5, + controlnet_conditioning_scale=0.8, + cross_attention_kwargs={"scale": 1}, + generator=generator, ).images[0] make_image_grid([canny_image, image], rows=1, cols=2) ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_controlnet.png) +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_controlnet.png) @@ -246,6 +254,7 @@ canny_image = canny_detector(image, detect_resolution=384, image_resolution=1024 prompt = "Mystical fairy in real, magic, 4k picture, high quality" negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" +generator = torch.manual_seed(0) image = pipe( prompt=prompt, negative_prompt=negative_prompt, @@ -253,12 +262,13 @@ image = pipe( num_inference_steps=4, guidance_scale=1.5, 
adapter_conditioning_scale=0.8, - adapter_conditioning_factor=1 + adapter_conditioning_factor=1, + generator=generator, ).images[0] make_image_grid([canny_image, image], rows=1, cols=2) ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i_t2iadapter.png) +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2iadapter.png) ## Inpainting @@ -285,6 +295,7 @@ mask_image = load_image("https://huggingface.co/datasets/huggingface/documentati # generator = torch.Generator("cuda").manual_seed(92) prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +generator = torch.manual_seed(0) image = pipe( prompt=prompt, image=init_image, @@ -296,11 +307,12 @@ image = pipe( make_image_grid([init_image, mask_image, image], rows=1, cols=3) ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_inpainting.png) +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_inpainting.png) -## Animatediff +## AnimateDiff +[AnimateDiff](https://arxiv.org/abs/2307.04725) allows you to animate images using Stable Diffusion models. To get good results we need to generate multiple frame (16-24) and doing this with standard SD models can be very slow. LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff. ```python import torch @@ -310,7 +322,7 @@ from diffusers.utils import export_to_gif adapter = MotionAdapter.from_pretrained("diffusers/animatediff-motion-adapter-v1-5") pipe = AnimateDiffPipeline.from_pretrained( "frankjoshua/toonyou_beta6", - scheduler=LCMScheduler.from_pretrained("Lykon/dreamshaper-7", subfolder="scheduler"), + scheduler=LCMScheduler.from_pretrained("frankjoshua/toonyou_beta6", subfolder="scheduler"), motion_adapter=adapter, ).to("cuda") @@ -320,7 +332,16 @@ pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="di pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2]) prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" -output = pipe(prompt=prompt, num_inference_steps=5, guidance_scale=1.25, cross_attention_kwargs={"scale": 1}, num_frames=24) -frames = output.frames[0] +generator = torch.manual_seed(0) +frames = pipe( + prompt=prompt, + num_inference_steps=5, + guidance_scale=1.25, + cross_attention_kwargs={"scale": 1}, + num_frames=24, + generator=generator +).frames[0] export_to_gif(frames, "animation.gif") -``` \ No newline at end of file +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_animatediff.gif) \ No newline at end of file From 0cc9a5b16c0ebf2ba4a2bd32cccd0fb903bd7158 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Tue, 14 Nov 2023 13:32:19 +0100 Subject: [PATCH 03/14] add in toctree --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/{using-diffusers => tutorials}/lcm_lora.md | 0 2 files changed, 2 insertions(+) rename docs/source/en/{using-diffusers => tutorials}/lcm_lora.md (100%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a0c6159991b5..44fe3297bea0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -19,6 +19,8 @@ title: Train a diffusion model - local: 
tutorials/using_peft_for_inference title: Inference with PEFT + -local: tutorials/inference_with_lcm_lora + title: Inference with LCM-LoRA title: Tutorials - sections: - sections: diff --git a/docs/source/en/using-diffusers/lcm_lora.md b/docs/source/en/tutorials/lcm_lora.md similarity index 100% rename from docs/source/en/using-diffusers/lcm_lora.md rename to docs/source/en/tutorials/lcm_lora.md From 0fb995b6d52d500e9c92b9e8896446cae5a43205 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Tue, 14 Nov 2023 13:48:32 +0100 Subject: [PATCH 04/14] fix toctree --- docs/source/en/_toctree.yml | 2 +- .../en/tutorials/{lcm_lora.md => inference_with_lcm_lora.md} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/source/en/tutorials/{lcm_lora.md => inference_with_lcm_lora.md} (100%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 44fe3297bea0..8193f1f9b690 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -19,7 +19,7 @@ title: Train a diffusion model - local: tutorials/using_peft_for_inference title: Inference with PEFT - -local: tutorials/inference_with_lcm_lora + - local: tutorials/inference_with_lcm_lora title: Inference with LCM-LoRA title: Tutorials - sections: diff --git a/docs/source/en/tutorials/lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md similarity index 100% rename from docs/source/en/tutorials/lcm_lora.md rename to docs/source/en/tutorials/inference_with_lcm_lora.md From 1e1d71d5894a08185af34b6800f08f2f0210753b Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Wed, 15 Nov 2023 11:43:55 +0100 Subject: [PATCH 05/14] improve copy --- .../en/tutorials/inference_with_lcm_lora.md | 67 +++++++++++++++---- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md index a40c46f65d61..3ec85879d05c 100644 --- a/docs/source/en/tutorials/inference_with_lcm_lora.md +++ b/docs/source/en/tutorials/inference_with_lcm_lora.md @@ -22,6 +22,10 @@ For a more technical overview of LCMs, refer to [the paper](https://huggingface. However, for latent consistency distillation, each model needs to be distilled separately. The core idea with LCM-LoRA is to train just a small number of adapters, known as LoRA layers, instead of the full model. The resulting LoRAs can then be applied to any fine-tuned version of the model without having to distil them separately. Additionally, the LoRAs can be applied to other tasks, such as image-to-image generation, controlnet/t2iadapter, inpainting, animatediff. The LCM-LoRA can also be combined with other style LoRAs, generating styled-images in very few steps. (4-8) +LCM-LoRAs are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-loras-654cdd24e111e16f0865fba6). + +For more details about LCM-LoRA, refer to [the technical report](https://huggingface.co/papers/2311.05556). 
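Note that an LCM-LoRA is tied to its base model family, so load the adapter that matches the checkpoint you are using. As a sketch, pairing SSD-1B (an SDXL-family model) with its adapter would look like this, assuming the SSD-1B entry in the collection follows the same naming pattern as the other adapters (`latent-consistency/lcm-lora-ssd-1b`):

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

# SSD-1B is a distilled SDXL model, so it is loaded the same way as the SDXL examples
pipe = DiffusionPipeline.from_pretrained("segmind/SSD-1B", torch_dtype=torch.float16).to("cuda")
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# adapter repository name assumed from the collection's naming pattern
pipe.load_lora_weights("latent-consistency/lcm-lora-ssd-1b")
```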
+ This guide shows how to perform inference with LCM-LoRAs for - text-to-image - image-to-image @@ -30,6 +34,7 @@ This guide shows how to perform inference with LCM-LoRAs for - inpainting - animatediff + ## Text-to-image You'll use the [`StableDiffusionXLPipeline`] with the scheduler: [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow overcoming the slow iterative nature of diffusion models. @@ -40,11 +45,14 @@ from diffusers import DiffusionPipeline, LCMScheduler pipe = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", - scheduler=LCMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"), variant="fp16", torch_dtype=torch.float16 ).to("cuda") +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" @@ -69,18 +77,21 @@ You can also use guidance with LCM-LoRA, but due to the nature of training the m ### Inference with a fine-tuned model -As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distil them separately. Let's look at how we can perform inference with a fine-tuned model: +As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distil them separately. Let's look at how we can perform inference with a fine-tuned model. In this example we'll use the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) model, which is a fine-tuned version of the SDXL model for generating anime. ```python from diffusers import DiffusionPipeline, LCMScheduler pipe = DiffusionPipeline.from_pretrained( "Linaqruf/animagine-xl", - scheduler=LCMScheduler.from_pretrained("Linaqruf/animagine-xl", subfolder="scheduler"), variant="fp16", torch_dtype=torch.float16 ).to("cuda") +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") prompt = "face focus, cute, masterpiece, best quality, 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck" @@ -96,7 +107,7 @@ image = pipe( ## Image-to-image -LCM-LoRA can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use SD-v1-5 model and the LCM-LoRA for SD-v1-5. +LCM-LoRA can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use the [dreamshaper-7](https://huggingface.co/Lykon/dreamshaper-7) model and the LCM-LoRA for `stable-diffusion-v1-5 `. ```python import torch @@ -105,11 +116,14 @@ from diffusers.utils import make_image_grid, load_image pipe = AutoPipelineForImage2Image.from_pretrained( "Lykon/dreamshaper-7", - scheduler=LCMScheduler.from_pretrained("Lykon/dreamshaper-7", subfolder="scheduler"), torch_dtype=torch.float16, variant="fp16", ).to("cuda") +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") # prepare image @@ -135,7 +149,8 @@ Based on your prompt and the image you provide, you can get different results. 
T ## Combined with style LoRAs -LCM-LoRA can be combined with other style LoRAs, generating styled-images in very few steps. (4-8). In the following example we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL). +LCM-LoRA can be combined with other style LoRAs, generating styled-images in very few steps. (4-8). In the following example we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL). +To learn more about how to combine LoRAs, refer to [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#combine-multiple-adapters). ```python import torch @@ -143,14 +158,18 @@ from diffusers import DiffusionPipeline, LCMScheduler pipe = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", - scheduler=LCMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"), variant="fp16", torch_dtype=torch.float16 ).to("cuda") +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LoRAs pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm") pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut") +# Combine LoRAs pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8]) prompt = "papercut, a cute fox" @@ -195,14 +214,18 @@ canny_image controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", - scheduler=LCMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler"), controlnet=controlnet, torch_dtype=torch.float16, safety_checker=None, variant="fp16" ).to("cuda") +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") + generator = torch.manual_seed(0) image = pipe( "the mona lisa", @@ -223,7 +246,15 @@ make_image_grid([canny_image, image], rows=1, cols=2) The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. -### T2iadapter with SDXL and LCM-LoRA +### T2IAdapter with SDXL and LCM-LoRA + +This example shows how to use the LCM-LoRA with the [Canny T2IAdapter](TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL. + +Before running this example, you need to install the `controlnet_aux` package. 
+ +```bash +pip install controlnet_aux +``` ```python from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, LCMScheduler @@ -237,12 +268,15 @@ adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", to pipe = StableDiffusionXLAdapterPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", adapter=adapter, - scheduler=LCMScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"), torch_dtype=torch.float16, variant="fp16", ).to("cuda") canny_detector = CannyDetector() +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" @@ -283,10 +317,13 @@ from diffusers.utils import load_image, make_image_grid pipe = AutoPipelineForInpainting.from_pretrained( "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, - scheduler=LCMScheduler.from_pretrained( "runwayml/stable-diffusion-inpainting", subfolder="scheduler"), variant="fp16", ).to("cuda") +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") # load base and mask image @@ -312,7 +349,8 @@ make_image_grid([init_image, mask_image, image], rows=1, cols=3) ## AnimateDiff -[AnimateDiff](https://arxiv.org/abs/2307.04725) allows you to animate images using Stable Diffusion models. To get good results we need to generate multiple frame (16-24) and doing this with standard SD models can be very slow. LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff. +[AnimateDiff](https://arxiv.org/abs/2307.04725) allows you to animate images using Stable Diffusion models. To get good results we need to generate multiple frame (16-24) and doing this with standard SD models can be very slow. +LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff. 
```python import torch @@ -322,10 +360,13 @@ from diffusers.utils import export_to_gif adapter = MotionAdapter.from_pretrained("diffusers/animatediff-motion-adapter-v1-5") pipe = AnimateDiffPipeline.from_pretrained( "frankjoshua/toonyou_beta6", - scheduler=LCMScheduler.from_pretrained("frankjoshua/toonyou_beta6", subfolder="scheduler"), motion_adapter=adapter, ).to("cuda") +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm") pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora") From d2c0b2f2c1a39a29b3b9c3a6969d3ff34914a94a Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Wed, 15 Nov 2023 11:52:54 +0100 Subject: [PATCH 06/14] improve introductions --- docs/source/en/tutorials/inference_with_lcm_lora.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md index 3ec85879d05c..be69c1765fe7 100644 --- a/docs/source/en/tutorials/inference_with_lcm_lora.md +++ b/docs/source/en/tutorials/inference_with_lcm_lora.md @@ -34,6 +34,17 @@ This guide shows how to perform inference with LCM-LoRAs for - inpainting - animatediff +Before going through this guide, we'll take a look at the general workflow. LCM-LoRAs are similar to other stable diffusion LoRAs so they can be used with any `pipeline` that supports LoRAs. +To do inference with LCM-LoRAs, you need to follow these steps: + +- Load the task specific pipeline and model. +- Set the scheduler to [`LCMScheduler`]. +- Load the LCM-LoRA weights for the model. +- Reduce the `guidance_scale` between `[1.0, 2.0]` and set the `num_inference_steps` between [4, 8]. +- Perform inference with the pipeline with the usual parameters. + +Let's look at how we can perform inference with LCM-LoRAs for different tasks. 
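Put together, the whole recipe fits in a few lines. The sketch below applies it to SDXL text-to-image purely as a reference; every model ID and call is taken from the examples in this guide, and the task-specific sections that follow only swap the pipeline class and the conditioning inputs.

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

# 1. load the task-specific pipeline and model
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    variant="fp16",
    torch_dtype=torch.float16,
).to("cuda")

# 2. set the scheduler to LCMScheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# 3. load the LCM-LoRA weights that match the base model
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

# 4. run inference with a low guidance_scale and only a few steps
image = pipe(
    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
    num_inference_steps=4,
    guidance_scale=1.0,
).images[0]
```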
+ ## Text-to-image From f9f09c7692ccf52704d56e109b56b89079c543e4 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Wed, 15 Nov 2023 13:04:24 +0100 Subject: [PATCH 07/14] add lcm doc --- docs/source/en/_toctree.yml | 5 +- docs/source/en/tutorials/inferenc_wth_lcm.md | 270 ++++++++++++++++++ .../en/tutorials/inference_with_lcm_lora.md | 9 +- docs/source/en/using-diffusers/lcm.md | 154 ---------- 4 files changed, 280 insertions(+), 158 deletions(-) create mode 100644 docs/source/en/tutorials/inferenc_wth_lcm.md delete mode 100644 docs/source/en/using-diffusers/lcm.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 8193f1f9b690..06bb3af1594f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -21,7 +21,8 @@ title: Inference with PEFT - local: tutorials/inference_with_lcm_lora title: Inference with LCM-LoRA - title: Tutorials + - local: tutorials/inference_with_lcm + title: Inference with LCMs - sections: - sections: - local: using-diffusers/loading_overview @@ -74,8 +75,6 @@ title: Overview - local: using-diffusers/sdxl title: Stable Diffusion XL - - local: using-diffusers/lcm - title: Latent Consistency Models - local: using-diffusers/kandinsky title: Kandinsky - local: using-diffusers/controlnet diff --git a/docs/source/en/tutorials/inferenc_wth_lcm.md b/docs/source/en/tutorials/inferenc_wth_lcm.md new file mode 100644 index 000000000000..3ad25cd11558 --- /dev/null +++ b/docs/source/en/tutorials/inferenc_wth_lcm.md @@ -0,0 +1,270 @@ + + +# Performing inference with LCM + +Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. + +From the [official website](https://latent-consistency-models.github.io/): + +> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations. + +For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378). + +LCM distilled models are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8). + +This guide shows how to perform inference with LCMs for +- text-to-image +- image-to-image +- combined with style LoRAs +- controlent/t2iadapter + +## Text-to-image + +You'll use the [`StableDiffusionXLPipeline`] pipeline with the scheduler: [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow overcoming the slow iterative nature of diffusion models. 
+ +```python +from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler +import torch + +unet = UNet2DConditionModel.from_pretrained( + "latent-consistency/lcm-sdxl", + torch_dtype=torch.float16, + variant="fp16", +) +pipe = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16", +).to("cuda") +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" + +generator = torch.manual_seed(0) +image = pipe( + prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0 +).images[0] +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2i.png) + +Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL. + +Some details to keep in mind: + +* To perform classifier-free guidance, batch size is usually doubled inside the pipeline. LCM, however, applies guidance using guidance embeddings, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process. +* The UNet was trained using the [3., 13.] guidance scale range. So, that is the ideal range for `guidance_scale`. However, disabling `guidance_scale` using a value of 1.0 is also effective in most cases. + + +## Image-to-image + +LCMs can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model, but the same steps can be applied to other LCM models as well. + +```python +import torch +from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler +from diffusers.utils import make_image_grid, load_image + +unet = UNet2DConditionModel.from_pretrained( + "SimianLuo/LCM_Dreamshaper_v7", + subfolder="unet", + torch_dtype=torch.float16, +) + +pipe = AutoPipelineForImage2Image.from_pretrained( + "Lykon/dreamshaper-7", + unet=unet, + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# prepare image +url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png" +init_image = load_image(url) +prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" + +# pass prompt and image to pipeline +generator = torch.manual_seed(0) +image = pipe( + prompt, + image=init_image, + num_inference_steps=4, + guidance_scale=7.5, + strength=0.5, + generator=generator +).images[0] +make_image_grid([init_image, image], rows=1, cols=2) +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_i2i.png) + + + + +Based on your prompt and the image you provide, you can get different results. To get the best results, we recommend you to try different values for `num_inference_steps`, `strength` and `guidance_scale` parameters and choose the best one. + + + + +## Combined with style LoRAs + +LCMs can be used with other style LoRAs, generating styled-images in very few steps. (4-8). In the following example we'll use the [papercut LoRA](TheLastBen/Papercut_SDXL). 
+ +```python +from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler +import torch + +unet = UNet2DConditionModel.from_pretrained( + "latent-consistency/lcm-sdxl", + torch_dtype=torch.float16, + variant="fp16", +) +pipe = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16", +).to("cuda") +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut") + +prompt = "papercut, a cute fox" + +generator = torch.manual_seed(0) +image = pipe( + prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0 +).images[0] +image +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdx_lora_mix.png) + + +## Controlnet/t2iadapter + +LCM can be used with controlnet/t2iadapter. Let's look at how we can perform inference with controlnet/t2iadapter. + +### Controlnet with SD-v1-5 and LCM-LoRA +For this example we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model with canny controlnet, but the same steps can be applied to other LCM models as well. + +```python +import torch +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler +from diffusers.utils import load_image +from PIL import Image +import cv2 +import numpy as np + +image = load_image( + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" +).resize((512, 512)) + +image = np.array(image) + +low_threshold = 100 +high_threshold = 200 + +image = cv2.Canny(image, low_threshold, high_threshold) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image) +canny_image + +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) +pipe = StableDiffusionControlNetPipeline.from_pretrained( + "SimianLuo/LCM_Dreamshaper_v7", + controlnet=controlnet, + torch_dtype=torch.float16, + safety_checker=None, +).to("cuda") + +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +generator = torch.manual_seed(0) +image = pipe( + "the mona lisa", + image=canny_image, + num_inference_steps=4, + guidance_scale=1, + controlnet_conditioning_scale=0.75, + generator=generator, +).images[0] +make_image_grid([canny_image, image], rows=1, cols=2) +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_controlnet.png) + + + +The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. + + +### T2IAdapter with SDXL and LCM-LoRA + +This example shows how to use the `lcm-sdxl` with the [Canny T2IAdapter](TencentARC/t2i-adapter-canny-sdxl-1.0). + +Before running this example, you need to install the `controlnet_aux` package. 
+ +```bash +pip install controlnet_aux +``` + +```python +from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler +from diffusers.utils import load_image, make_image_grid +from controlnet_aux.canny import CannyDetector +import torch + +# load adapter +adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") + +unet = UNet2DConditionModel.from_pretrained( + "latent-consistency/lcm-sdxl", + torch_dtype=torch.float16, + variant="fp16", +) +pipe = StableDiffusionXLAdapterPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + unet=unet, + adapter=adapter, + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") + +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +canny_detector = CannyDetector() + + +url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" +image = load_image(url) + +# Detect the canny map in low resolution to avoid high-frequency details +canny_image = canny_detector(image, detect_resolution=384, image_resolution=1024) + +prompt = "Mystical fairy in real, magic, 4k picture, high quality" +negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" + +generator = torch.manual_seed(0) +image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + image=canny_image, + num_inference_steps=4, + guidance_scale=1.5, + adapter_conditioning_scale=0.8, + adapter_conditioning_factor=1, + generator=generator, +).images[0] +make_image_grid([canny_image, image], rows=1, cols=2) +``` + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2iadapter.png) diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md index be69c1765fe7..4d4143742e73 100644 --- a/docs/source/en/tutorials/inference_with_lcm_lora.md +++ b/docs/source/en/tutorials/inference_with_lcm_lora.md @@ -144,7 +144,14 @@ prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k # pass prompt and image to pipeline generator = torch.manual_seed(0) -image = pipe(prompt, image=init_image, num_inference_steps=4, guidance_scale=1, strength=0.6, generator=generator).images[0] +image = pipe( + prompt, + image=init_image, + num_inference_steps=4, + guidance_scale=1, + strength=0.6, + generator=generator +).images[0] make_image_grid([init_image, image], rows=1, cols=2) ``` diff --git a/docs/source/en/using-diffusers/lcm.md b/docs/source/en/using-diffusers/lcm.md deleted file mode 100644 index 39bc2426a92b..000000000000 --- a/docs/source/en/using-diffusers/lcm.md +++ /dev/null @@ -1,154 +0,0 @@ - - -# Performing inference with LCM - -Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. - -From the [official website](https://latent-consistency-models.github.io/): - -> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations. - -For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378). 
- -This guide shows how to perform inference with LCMs for text-to-image and image-to-image generation tasks. It will also cover performing inference with LoRA checkpoints. - -## Text-to-image - -You'll use the [`StableDiffusionXLPipeline`] here changing the `unet`. The UNet was distilled from the SDXL UNet using the framework introduced in LCM. Another important component is the scheduler: [`LCMScheduler`]. Together with the distilled UNet and the scheduler, LCM enables a fast inference workflow overcoming the slow iterative nature of diffusion models. - -```python -from diffusers import DiffusionPipeline, UNet2DConditionModel, LCMScheduler -import torch - -unet = UNet2DConditionModel.from_pretrained( - "latent-consistency/lcm-sdxl", - torch_dtype=torch.float16, - variant="fp16", -) -pipe = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16 -).to("cuda") -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" - -generator = torch.manual_seed(0) -image = pipe( - prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0 -).images[0] -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_intro.png) - -Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL. - -Some details to keep in mind: - -* To perform classifier-free guidance, batch size is usually doubled inside the pipeline. LCM, however, applies guidance using guidance embeddings, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process. -* The UNet was trained using the [3., 13.] guidance scale range. So, that is the ideal range for `guidance_scale`. However, disabling `guidance_scale` using a value of 1.0 is also effective in most cases. - -## Image-to-image - -The findings above apply to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs: - -```python -from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler -from diffusers.utils import load_image -import torch - -unet = UNet2DConditionModel.from_pretrained( - "latent-consistency/lcm-sdxl", - torch_dtype=torch.float16, - variant="fp16", -) -pipe = AutoPipelineForImage2Image.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16 -).to("cuda") -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -prompt = "High altitude snowy mountains" -image = load_image( - "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/snowy_mountains.jpeg" -) - -generator = torch.manual_seed(0) -image = pipe( - prompt=prompt, - image=image, - num_inference_steps=4, - generator=generator, - guidance_scale=8.0, -).images[0] -``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_i2i.png) - -## LoRA - -It is possible to generalize the LCM framework to use with [LoRA](../training/lora.md). It effectively eliminates the need to conduct expensive fine-tuning runs as LoRA training concerns just a few number of parameters compared to full fine-tuning. During inference, the [`LCMScheduler`] comes to the advantage as it enables very few-steps inference without compromising the quality. 
- -We recommend to disable `guidance_scale` by setting it 0. The model is trained to follow prompts accurately -even without using guidance scale. You can however, still use guidance scale in which case we recommend -using values between 1.0 and 2.0. - -### Text-to-image - -```python -from diffusers import DiffusionPipeline, LCMScheduler -import torch - -model_id = "stabilityai/stable-diffusion-xl-base-1.0" -lcm_lora_id = "latent-consistency/lcm-lora-sdxl" - -pipe = DiffusionPipeline.from_pretrained(model_id, variant="fp16", torch_dtype=torch.float16).to("cuda") - -pipe.load_lora_weights(lcm_lora_id) -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -prompt = "close-up photography of old man standing in the rain at night, in a street lit by lamps, leica 35mm summilux" -image = pipe( - prompt=prompt, - num_inference_steps=4, - guidance_scale=0, # set guidance scale to 0 to disable it -).images[0] -``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lora_lcm.png) - -### Image-to-image - -Extending LCM LoRA to image-to-image is possible: - -```python -from diffusers import StableDiffusionXLImg2ImgPipeline, LCMScheduler -from diffusers.utils import load_image -import torch - -model_id = "stabilityai/stable-diffusion-xl-base-1.0" -lcm_lora_id = "latent-consistency/lcm-lora-sdxl" - -pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, variant="fp16", torch_dtype=torch.float16).to("cuda") - -pipe.load_lora_weights(lcm_lora_id) -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -prompt = "close-up photography of old man standing in the rain at night, in a street lit by lamps, leica 35mm summilux" - -image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lora_lcm.png") - -image = pipe( - prompt=prompt, - image=image, - num_inference_steps=4, - guidance_scale=0, # set guidance scale to 0 to disable it -).images[0] -``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_lora_i2i.png) From 1adb1fcb120b9784f5b558859826f65224e09184 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Wed, 15 Nov 2023 13:14:21 +0100 Subject: [PATCH 08/14] fix filename --- .../en/tutorials/{inferenc_wth_lcm.md => inference_with_lcm.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/source/en/tutorials/{inferenc_wth_lcm.md => inference_with_lcm.md} (100%) diff --git a/docs/source/en/tutorials/inferenc_wth_lcm.md b/docs/source/en/tutorials/inference_with_lcm.md similarity index 100% rename from docs/source/en/tutorials/inferenc_wth_lcm.md rename to docs/source/en/tutorials/inference_with_lcm.md From 9b84385ce23a43214f98b1c8111ecefe4c1b035b Mon Sep 17 00:00:00 2001 From: Suraj Patil Date: Thu, 16 Nov 2023 11:54:25 +0100 Subject: [PATCH 09/14] Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/_toctree.yml | 4 +-- .../source/en/tutorials/inference_with_lcm.md | 28 ++++++++-------- .../en/tutorials/inference_with_lcm_lora.md | 32 +++++++++---------- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7612ae1001c2..16dd53331b7f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -20,9 +20,9 @@ - local: tutorials/using_peft_for_inference title: Inference with PEFT - local: tutorials/inference_with_lcm_lora - title: Inference with LCM-LoRA + 
title: Latent Consistency Model-LoRA - local: tutorials/inference_with_lcm - title: Inference with LCMs + title: Latent Consistency Model - sections: - sections: - local: using-diffusers/loading_overview diff --git a/docs/source/en/tutorials/inference_with_lcm.md b/docs/source/en/tutorials/inference_with_lcm.md index 3ad25cd11558..0ded9f38f7a6 100644 --- a/docs/source/en/tutorials/inference_with_lcm.md +++ b/docs/source/en/tutorials/inference_with_lcm.md @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Performing inference with LCM +# Latent Consistency Model Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. @@ -26,11 +26,11 @@ This guide shows how to perform inference with LCMs for - text-to-image - image-to-image - combined with style LoRAs -- controlent/t2iadapter +- ControlNet/T2I-Adapter ## Text-to-image -You'll use the [`StableDiffusionXLPipeline`] pipeline with the scheduler: [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow overcoming the slow iterative nature of diffusion models. +You'll use the [`StableDiffusionXLPipeline`] pipeline with the [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow, overcoming the slow iterative nature of diffusion models. ```python from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler @@ -66,7 +66,7 @@ Some details to keep in mind: ## Image-to-image -LCMs can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model, but the same steps can be applied to other LCM models as well. +LCMs can be applied to image-to-image tasks too. For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model, but the same steps can be applied to other LCM models as well. ```python import torch @@ -110,14 +110,14 @@ make_image_grid([init_image, image], rows=1, cols=2) -Based on your prompt and the image you provide, you can get different results. To get the best results, we recommend you to try different values for `num_inference_steps`, `strength` and `guidance_scale` parameters and choose the best one. +You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for `num_inference_steps`, `strength`, and `guidance_scale` parameters and choose the best one. -## Combined with style LoRAs +## Combine with style LoRAs -LCMs can be used with other style LoRAs, generating styled-images in very few steps. (4-8). In the following example we'll use the [papercut LoRA](TheLastBen/Papercut_SDXL). +LCMs can be used with other styled LoRAs to generate styled-images in very few steps (4-8). In the following example, we'll use the [papercut LoRA](TheLastBen/Papercut_SDXL). 
```python from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler @@ -147,12 +147,12 @@ image ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdx_lora_mix.png) -## Controlnet/t2iadapter +## ControlNet/T2I-Adapter -LCM can be used with controlnet/t2iadapter. Let's look at how we can perform inference with controlnet/t2iadapter. +Let's look at how we can perform inference with ControlNet/T2I-Adapter and a LCM. -### Controlnet with SD-v1-5 and LCM-LoRA -For this example we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model with canny controlnet, but the same steps can be applied to other LCM models as well. +### ControlNet +For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model with canny ControlNet, but the same steps can be applied to other LCM models as well. ```python import torch @@ -204,12 +204,12 @@ make_image_grid([canny_image, image], rows=1, cols=2) -The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. +The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the best one. -### T2IAdapter with SDXL and LCM-LoRA +### T2I-Adapter -This example shows how to use the `lcm-sdxl` with the [Canny T2IAdapter](TencentARC/t2i-adapter-canny-sdxl-1.0). +This example shows how to use the `lcm-sdxl` with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0). Before running this example, you need to install the `controlnet_aux` package. diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md index 4d4143742e73..739a90d3c52c 100644 --- a/docs/source/en/tutorials/inference_with_lcm_lora.md +++ b/docs/source/en/tutorials/inference_with_lcm_lora.md @@ -20,7 +20,7 @@ From the [official website](https://latent-consistency-models.github.io/): For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378). -However, for latent consistency distillation, each model needs to be distilled separately. The core idea with LCM-LoRA is to train just a small number of adapters, known as LoRA layers, instead of the full model. The resulting LoRAs can then be applied to any fine-tuned version of the model without having to distil them separately. Additionally, the LoRAs can be applied to other tasks, such as image-to-image generation, controlnet/t2iadapter, inpainting, animatediff. The LCM-LoRA can also be combined with other style LoRAs, generating styled-images in very few steps. (4-8) +However, each model needs to be distilled separately for latent consistency distillation. The core idea with LCM-LoRA is to train just a small number of adapters, known as LoRA layers, instead of the full model. The resulting LoRAs can then be applied to any fine-tuned version of the model without distilling them separately. Additionally, the LoRAs can be applied to image-to-image, ControlNet/T2I-Adapter, inpainting, AnimateDiff. The LCM-LoRA can also be combined with other LoRAs to generate styled images in very few steps (4-8). 
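
In code, the recipe implied by this idea is the same for every task: keep the pipeline you would normally use, swap its scheduler for [`LCMScheduler`], load the matching LCM-LoRA, and drop the step count. A rough sketch of that workflow (SDXL base and its LCM-LoRA are used here only as an example; the task-specific sections below give the complete walkthroughs):

```python
import torch
from diffusers import DiffusionPipeline, LCMScheduler

# 1. load any LoRA-compatible pipeline as usual
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16
).to("cuda")

# 2. swap the scheduler for LCMScheduler
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# 3. load the LCM-LoRA that matches the base model
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

# 4. sample with very few steps and a low guidance scale
image = pipe(
    "a photo of an astronaut riding a horse",  # example prompt
    num_inference_steps=4,
    guidance_scale=1.0,
).images[0]
```
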
LCM-LoRAs are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-loras-654cdd24e111e16f0865fba6). @@ -29,10 +29,10 @@ For more details about LCM-LoRA, refer to [the technical report](https://hugging This guide shows how to perform inference with LCM-LoRAs for - text-to-image - image-to-image -- combined with style LoRAs -- controlent/t2iadapter +- combined with styled LoRAs +- ControlNet/T2I-Adapter - inpainting -- animatediff +- AnimateDiff Before going through this guide, we'll take a look at the general workflow. LCM-LoRAs are similar to other stable diffusion LoRAs so they can be used with any `pipeline` that supports LoRAs. To do inference with LCM-LoRAs, you need to follow these steps: @@ -88,7 +88,7 @@ You can also use guidance with LCM-LoRA, but due to the nature of training the m ### Inference with a fine-tuned model -As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distil them separately. Let's look at how we can perform inference with a fine-tuned model. In this example we'll use the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) model, which is a fine-tuned version of the SDXL model for generating anime. +As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distill them separately. Let's look at how we can perform inference with a fine-tuned model. In this example, we'll use the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) model, which is a fine-tuned version of the SDXL model for generating anime. ```python from diffusers import DiffusionPipeline, LCMScheduler @@ -160,14 +160,14 @@ make_image_grid([init_image, image], rows=1, cols=2) -Based on your prompt and the image you provide, you can get different results. To get the best results, we recommend you to try different values for `num_inference_steps`, `strength` and `guidance_scale` parameters and choose the best one. +You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for `num_inference_steps`, `strength`, and `guidance_scale` parameters and choose the best one. -## Combined with style LoRAs +## Combine with styled LoRAs -LCM-LoRA can be combined with other style LoRAs, generating styled-images in very few steps. (4-8). In the following example we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL). +LCM-LoRA can be combined with other LoRAs to generate styled-images in very few steps (4-8). In the following example, we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL). To learn more about how to combine LoRAs, refer to [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#combine-multiple-adapters). ```python @@ -199,12 +199,12 @@ image ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdx_lora_mix.png) -## Controlnet/t2iadapter +## ControlNet/T2I-Adapter -LCM-LoRA can be used with controlnet/t2iadapter. Let's look at how we can perform inference with controlnet/t2iadapter and LCM-LoRA. 
+Let's look at how we can perform inference with ControlNet/T2I-Adapter and LCM-LoRA. -### Controlnet with SD-v1-5 and LCM-LoRA -For this example we'll use SD-v1-5 model and the LCM-LoRA for SD-v1-5 with canny controlnet. +### ControlNet +For this example, we'll use the SD-v1-5 model and the LCM-LoRA for SD-v1-5 with canny ControlNet. ```python import torch @@ -264,9 +264,9 @@ make_image_grid([canny_image, image], rows=1, cols=2) The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. -### T2IAdapter with SDXL and LCM-LoRA +### T2I-Adapter -This example shows how to use the LCM-LoRA with the [Canny T2IAdapter](TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL. +This example shows how to use the LCM-LoRA with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL. Before running this example, you need to install the `controlnet_aux` package. @@ -325,7 +325,7 @@ make_image_grid([canny_image, image], rows=1, cols=2) ## Inpainting -LCM-LoRA can be used for inpainting as well. Let's look at how we can perform inpainting with LCM-LoRA. +LCM-LoRA can be used for inpainting as well. ```python import torch @@ -367,7 +367,7 @@ make_image_grid([init_image, mask_image, image], rows=1, cols=3) ## AnimateDiff -[AnimateDiff](https://arxiv.org/abs/2307.04725) allows you to animate images using Stable Diffusion models. To get good results we need to generate multiple frame (16-24) and doing this with standard SD models can be very slow. +[AnimateDiff](https://arxiv.org/abs/2307.04725) allows you to animate images using Stable Diffusion models. To get good results, we need to generate multiple frames (16-24), and doing this with standard SD models can be very slow. LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff. ```python From f383066e25036cdea8d29761ffca8abe75970301 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Thu, 16 Nov 2023 13:37:21 +0100 Subject: [PATCH 10/14] address Sayak's comments --- docs/source/en/tutorials/inference_with_lcm.md | 1 - .../en/tutorials/inference_with_lcm_lora.md | 17 ++++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/source/en/tutorials/inference_with_lcm.md b/docs/source/en/tutorials/inference_with_lcm.md index 0ded9f38f7a6..fad7cf274d3a 100644 --- a/docs/source/en/tutorials/inference_with_lcm.md +++ b/docs/source/en/tutorials/inference_with_lcm.md @@ -175,7 +175,6 @@ image = cv2.Canny(image, low_threshold, high_threshold) image = image[:, :, None] image = np.concatenate([image, image, image], axis=2) canny_image = Image.fromarray(image) -canny_image controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md index 739a90d3c52c..bcbd13461a18 100644 --- a/docs/source/en/tutorials/inference_with_lcm_lora.md +++ b/docs/source/en/tutorials/inference_with_lcm_lora.md @@ -20,7 +20,10 @@ From the [official website](https://latent-consistency-models.github.io/): For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378). 
-However, each model needs to be distilled separately for latent consistency distillation. The core idea with LCM-LoRA is to train just a small number of adapters, known as LoRA layers, instead of the full model. The resulting LoRAs can then be applied to any fine-tuned version of the model without distilling them separately. Additionally, the LoRAs can be applied to image-to-image, ControlNet/T2I-Adapter, inpainting, AnimateDiff. The LCM-LoRA can also be combined with other LoRAs to generate styled images in very few steps (4-8). +However, each model needs to be distilled separately for latent consistency distillation. The core idea with LCM-LoRA is to train just a few adapter layers, the adapter being LoRA in this case. +This way, we don't have to train the full model and keep the number of trainable parameters manageable. The resulting LoRAs can then be applied to any fine-tuned version of the model without distilling them separately. +Additionally, the LoRAs can be applied to image-to-image, ControlNet/T2I-Adapter, inpainting, AnimateDiff etc. +The LCM-LoRA can also be combined with other LoRAs to generate styled images in very few steps (4-8). LCM-LoRAs are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-loras-654cdd24e111e16f0865fba6). @@ -34,8 +37,8 @@ This guide shows how to perform inference with LCM-LoRAs for - inpainting - AnimateDiff -Before going through this guide, we'll take a look at the general workflow. LCM-LoRAs are similar to other stable diffusion LoRAs so they can be used with any `pipeline` that supports LoRAs. -To do inference with LCM-LoRAs, you need to follow these steps: +Before going through this guide, we'll take a look at the general workflow for performing inference with LCM-LoRAs. +LCM-LoRAs are similar to other Stable Diffusion LoRAs so they can be used with any [`DiffusionPipeline`] that supports LoRAs. - Load the task specific pipeline and model. - Set the scheduler to [`LCMScheduler`]. @@ -45,6 +48,11 @@ To do inference with LCM-LoRAs, you need to follow these steps: Let's look at how we can perform inference with LCM-LoRAs for different tasks. +First, make sure you have [peft](https://github.com/huggingface/peft) installed, for better LoRA support. + +```bash +pip install -U peft +``` ## Text-to-image @@ -227,7 +235,6 @@ image = cv2.Canny(image, low_threshold, high_threshold) image = image[:, :, None] image = np.concatenate([image, image, image], axis=2) canny_image = Image.fromarray(image) -canny_image controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( @@ -367,7 +374,7 @@ make_image_grid([init_image, mask_image, image], rows=1, cols=3) ## AnimateDiff -[AnimateDiff](https://arxiv.org/abs/2307.04725) allows you to animate images using Stable Diffusion models. To get good results, we need to generate multiple frames (16-24), and doing this with standard SD models can be very slow. +[`AnimateDiff`] allows you to animate images using Stable Diffusion models. To get good results, we need to generate multiple frames (16-24), and doing this with standard SD models can be very slow. 
LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff. ```python From 6a85b322ebb982c3c2f077338c1eff68eef48a0c Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Thu, 16 Nov 2023 13:43:16 +0100 Subject: [PATCH 11/14] remove controlnet aux --- .../source/en/tutorials/inference_with_lcm.md | 38 ++++++++-------- .../en/tutorials/inference_with_lcm_lora.md | 43 +++++++++++-------- 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/docs/source/en/tutorials/inference_with_lcm.md b/docs/source/en/tutorials/inference_with_lcm.md index fad7cf274d3a..4ab588960b4e 100644 --- a/docs/source/en/tutorials/inference_with_lcm.md +++ b/docs/source/en/tutorials/inference_with_lcm.md @@ -210,17 +210,30 @@ The inference parameters in this example might not work for all examples, so we This example shows how to use the `lcm-sdxl` with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0). -Before running this example, you need to install the `controlnet_aux` package. - -```bash -pip install controlnet_aux -``` - ```python +import torch +import cv2 +import numpy as np +from PIL import Image + from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler from diffusers.utils import load_image, make_image_grid -from controlnet_aux.canny import CannyDetector -import torch + +# Prepare image +# Detect the canny map in low resolution to avoid high-frequency details +image = load_image( + "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" +).resize((384, 384)) + +image = np.array(image) + +low_threshold = 100 +high_threshold = 200 + +image = cv2.Canny(image, low_threshold, high_threshold) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image).resize((1024, 1024)) # load adapter adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") @@ -240,15 +253,6 @@ pipe = StableDiffusionXLAdapterPipeline.from_pretrained( pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) -canny_detector = CannyDetector() - - -url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" -image = load_image(url) - -# Detect the canny map in low resolution to avoid high-frequency details -canny_image = canny_detector(image, detect_resolution=384, image_resolution=1024) - prompt = "Mystical fairy in real, magic, 4k picture, high quality" negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md index bcbd13461a18..28f27a9fc6af 100644 --- a/docs/source/en/tutorials/inference_with_lcm_lora.md +++ b/docs/source/en/tutorials/inference_with_lcm_lora.md @@ -216,11 +216,12 @@ For this example, we'll use the SD-v1-5 model and the LCM-LoRA for SD-v1-5 with ```python import torch -from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler -from diffusers.utils import load_image -from PIL import Image import cv2 import numpy as np +from PIL import Image + +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler +from diffusers.utils import load_image image = load_image( 
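    # photo that will be converted into a canny edge map for conditioning below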
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" @@ -275,17 +276,30 @@ The inference parameters in this example might not work for all examples, so we This example shows how to use the LCM-LoRA with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL. -Before running this example, you need to install the `controlnet_aux` package. - -```bash -pip install controlnet_aux -``` - ```python +import torch +import cv2 +import numpy as np +from PIL import Image + from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, LCMScheduler from diffusers.utils import load_image, make_image_grid -from controlnet_aux.canny import CannyDetector -import torch + +# Prepare image +# Detect the canny map in low resolution to avoid high-frequency details +image = load_image( + "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" +).resize((384, 384)) + +image = np.array(image) + +low_threshold = 100 +high_threshold = 200 + +image = cv2.Canny(image, low_threshold, high_threshold) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image).resize((1024, 1024)) # load adapter adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") @@ -296,7 +310,6 @@ pipe = StableDiffusionXLAdapterPipeline.from_pretrained( torch_dtype=torch.float16, variant="fp16", ).to("cuda") -canny_detector = CannyDetector() # set scheduler pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) @@ -304,12 +317,6 @@ pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) # load LCM-LoRA pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") -url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" -image = load_image(url) - -# Detect the canny map in low resolution to avoid high-frequency details -canny_image = canny_detector(image, detect_resolution=384, image_resolution=1024) - prompt = "Mystical fairy in real, magic, 4k picture, high quality" negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" From 5a3ae20782a8af53ed5c65a22ea187c7e637d69e Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Thu, 16 Nov 2023 14:00:10 +0100 Subject: [PATCH 12/14] open in colab --- docs/source/en/tutorials/inference_with_lcm.md | 2 ++ docs/source/en/tutorials/inference_with_lcm_lora.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/source/en/tutorials/inference_with_lcm.md b/docs/source/en/tutorials/inference_with_lcm.md index 4ab588960b4e..55a5abddccb0 100644 --- a/docs/source/en/tutorials/inference_with_lcm.md +++ b/docs/source/en/tutorials/inference_with_lcm.md @@ -10,6 +10,8 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> +[[open-in-colab]] + # Latent Consistency Model Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. 
diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/tutorials/inference_with_lcm_lora.md index 28f27a9fc6af..554e5fda2c2a 100644 --- a/docs/source/en/tutorials/inference_with_lcm_lora.md +++ b/docs/source/en/tutorials/inference_with_lcm_lora.md @@ -10,6 +10,8 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> +[[open-in-colab]] + # Performing inference with LCM-LoRA Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. From 67a4c17fc595e946bb4f4b4f071b0cc54d3a765f Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Thu, 16 Nov 2023 14:07:59 +0100 Subject: [PATCH 13/14] move to Specific pipeline examples --- docs/source/en/_toctree.yml | 8 ++++---- .../{tutorials => using-diffusers}/inference_with_lcm.md | 0 .../inference_with_lcm_lora.md | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename docs/source/en/{tutorials => using-diffusers}/inference_with_lcm.md (100%) rename docs/source/en/{tutorials => using-diffusers}/inference_with_lcm_lora.md (100%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 16dd53331b7f..9d3c0b462d88 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -19,10 +19,6 @@ title: Train a diffusion model - local: tutorials/using_peft_for_inference title: Inference with PEFT - - local: tutorials/inference_with_lcm_lora - title: Latent Consistency Model-LoRA - - local: tutorials/inference_with_lcm - title: Latent Consistency Model - sections: - sections: - local: using-diffusers/loading_overview @@ -93,6 +89,10 @@ title: Community pipelines - local: using-diffusers/contribute_pipeline title: Contribute a community pipeline + - local: using-diffusers/inference_with_lcm_lora + title: Latent Consistency Model-LoRA + - local: using-diffusers/inference_with_lcm + title: Latent Consistency Model title: Specific pipeline examples - sections: - local: training/overview diff --git a/docs/source/en/tutorials/inference_with_lcm.md b/docs/source/en/using-diffusers/inference_with_lcm.md similarity index 100% rename from docs/source/en/tutorials/inference_with_lcm.md rename to docs/source/en/using-diffusers/inference_with_lcm.md diff --git a/docs/source/en/tutorials/inference_with_lcm_lora.md b/docs/source/en/using-diffusers/inference_with_lcm_lora.md similarity index 100% rename from docs/source/en/tutorials/inference_with_lcm_lora.md rename to docs/source/en/using-diffusers/inference_with_lcm_lora.md From 072e8ff9dd0b9fbe133efd87692bac7a4e1fe2a2 Mon Sep 17 00:00:00 2001 From: patil-suraj Date: Thu, 16 Nov 2023 15:16:00 +0100 Subject: [PATCH 14/14] update controlent and adapter examples --- .../en/using-diffusers/inference_with_lcm.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/source/en/using-diffusers/inference_with_lcm.md b/docs/source/en/using-diffusers/inference_with_lcm.md index 55a5abddccb0..36b3c6c810fc 100644 --- a/docs/source/en/using-diffusers/inference_with_lcm.md +++ b/docs/source/en/using-diffusers/inference_with_lcm.md @@ -158,11 +158,12 @@ For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/Simi ```python import torch -from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler -from diffusers.utils import load_image -from PIL import Image import cv2 import numpy as np +from PIL 
import Image + +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler +from diffusers.utils import load_image, make_image_grid image = load_image( "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" @@ -194,8 +195,6 @@ image = pipe( "the mona lisa", image=canny_image, num_inference_steps=4, - guidance_scale=1, - controlnet_conditioning_scale=0.75, generator=generator, ).images[0] make_image_grid([canny_image, image], rows=1, cols=2) @@ -235,7 +234,7 @@ high_threshold = 200 image = cv2.Canny(image, low_threshold, high_threshold) image = image[:, :, None] image = np.concatenate([image, image, image], axis=2) -canny_image = Image.fromarray(image).resize((1024, 1024)) +canny_image = Image.fromarray(image).resize((1024, 1216)) # load adapter adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") @@ -264,12 +263,12 @@ image = pipe( negative_prompt=negative_prompt, image=canny_image, num_inference_steps=4, - guidance_scale=1.5, + guidance_scale=5, adapter_conditioning_scale=0.8, adapter_conditioning_factor=1, generator=generator, ).images[0] -make_image_grid([canny_image, image], rows=1, cols=2) +grid = make_image_grid([canny_image, image], rows=1, cols=2) ``` ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2iadapter.png)
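
As with the ControlNet example earlier, the best conditioning strength depends on the input image, so it can be worth sweeping a few `adapter_conditioning_scale` values and keeping the result you like best. A small follow-up sketch that reuses the variables defined in the example above (the value list and file names are arbitrary choices):

```python
# try a few conditioning strengths with a fixed seed and save each result for comparison
for scale in [0.6, 0.8, 1.0]:
    candidate = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=canny_image,
        num_inference_steps=4,
        guidance_scale=5,
        adapter_conditioning_scale=scale,
        generator=torch.manual_seed(0),
    ).images[0]
    candidate.save(f"lcm_t2iadapter_scale_{scale}.png")
```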