|
16 | 16 | import gc |
17 | 17 | import unittest |
18 | 18 |
|
| 19 | +import numpy as np |
19 | 20 | import torch |
20 | 21 | from parameterized import parameterized |
21 | 22 |
|
22 | | -from diffusers import AsymmetricAutoencoderKL, AutoencoderKL, AutoencoderTiny, ConsistencyDecoderVae |
| 23 | +from diffusers import ( |
| 24 | + AsymmetricAutoencoderKL, |
| 25 | + AutoencoderKL, |
| 26 | + AutoencoderTiny, |
| 27 | + ConsistencyDecoderVae, |
| 28 | + StableDiffusionPipeline, |
| 29 | +) |
23 | 30 | from diffusers.utils.import_utils import is_xformers_available |
| 31 | +from diffusers.utils.loading_utils import load_image |
24 | 32 | from diffusers.utils.testing_utils import ( |
25 | 33 | enable_full_determinism, |
26 | 34 | floats_tensor, |
@@ -795,3 +803,94 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice): |
795 | 803 |
|
796 | 804 | tolerance = 3e-3 if torch_device != "mps" else 1e-2 |
797 | 805 | assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) |
| 806 | + |
| 807 | + |
@slow
class ConsistencyDecoderVaeIntegrationTests(unittest.TestCase):
    """Slow integration tests for ``ConsistencyDecoderVae`` with pretrained weights.

    Each test loads the ``williamberman/consistency-decoder`` checkpoint, runs it
    either directly (encode -> decode) or as the VAE of a ``StableDiffusionPipeline``,
    and compares the first eight output values against recorded reference numbers.
    """

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def _load_input_image(self, dtype=torch.float32):
        """Download the reference image and return it as a model-ready batch.

        Args:
            dtype: Floating point dtype the returned tensor is cast to.

        Returns:
            A tensor with a leading batch axis of size 1, placed on ``torch_device``.
        """
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        ).resize((256, 256))
        # Move the last array axis to the front and rescale 0..255 values to [-1, 1].
        pixels = np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1
        # Cast first, then move: this keeps the original cast-then-transfer order while
        # targeting `torch_device` (the device the model is on) instead of a
        # hard-coded CUDA device.
        return torch.from_numpy(pixels)[None, :, :, :].to(dtype).to(torch_device)

    # Forward-only check: no gradients are needed, so skip autograd bookkeeping.
    @torch.no_grad()
    def test_encode_decode(self):
        """Round-trip an image through encode/decode in float32 and check the output slice."""
        vae = ConsistencyDecoderVae.from_pretrained("williamberman/consistency-decoder")  # TODO - update
        vae.to(torch_device)

        image = self._load_input_image()

        # Use the mean of the latent distribution so the encode step is deterministic.
        latent = vae.encode(image).latent_dist.mean

        sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample

        # First batch item, first two indices along each remaining axis -> 8 values.
        actual_output = sample[0, :2, :2, :2].flatten().cpu()
        expected_output = torch.tensor([-0.0141, -0.0014, 0.0115, 0.0086, 0.1051, 0.1053, 0.1031, 0.1024])

        assert torch_all_close(actual_output, expected_output, atol=5e-3)

    def test_sd(self):
        """Run a 2-step Stable Diffusion generation with this VAE in float32."""
        vae = ConsistencyDecoderVae.from_pretrained("williamberman/consistency-decoder")  # TODO - update
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", vae=vae, safety_checker=None)
        pipe.to(torch_device)

        out = pipe(
            "horse", num_inference_steps=2, output_type="pt", generator=torch.Generator("cpu").manual_seed(0)
        ).images[0]

        actual_output = out[:2, :2, :2].flatten().cpu()
        expected_output = torch.tensor([0.7686, 0.8228, 0.6489, 0.7455, 0.8661, 0.8797, 0.8241, 0.8759])

        assert torch_all_close(actual_output, expected_output, atol=5e-3)

    # Forward-only check: no gradients are needed, so skip autograd bookkeeping.
    @torch.no_grad()
    def test_encode_decode_f16(self):
        """Round-trip an image through encode/decode in float16 and check the output slice."""
        vae = ConsistencyDecoderVae.from_pretrained(
            "williamberman/consistency-decoder", torch_dtype=torch.float16
        )  # TODO - update
        vae.to(torch_device)

        image = self._load_input_image(dtype=torch.float16)

        # Use the mean of the latent distribution so the encode step is deterministic.
        latent = vae.encode(image).latent_dist.mean

        sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample

        # First batch item, first two indices along each remaining axis -> 8 values.
        actual_output = sample[0, :2, :2, :2].flatten().cpu()
        expected_output = torch.tensor(
            [-0.0111, -0.0125, -0.0017, -0.0007, 0.1257, 0.1465, 0.1450, 0.1471], dtype=torch.float16
        )

        assert torch_all_close(actual_output, expected_output, atol=5e-3)

    def test_sd_f16(self):
        """Run a 2-step Stable Diffusion generation with this VAE in float16."""
        vae = ConsistencyDecoderVae.from_pretrained(
            "williamberman/consistency-decoder", torch_dtype=torch.float16
        )  # TODO - update
        pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, vae=vae, safety_checker=None
        )
        pipe.to(torch_device)

        out = pipe(
            "horse", num_inference_steps=2, output_type="pt", generator=torch.Generator("cpu").manual_seed(0)
        ).images[0]

        actual_output = out[:2, :2, :2].flatten().cpu()
        expected_output = torch.tensor(
            [0.2510, 0.3776, 0.0000, 0.0285, 0.1519, 0.1814, 0.0000, 0.0000], dtype=torch.float16
        )

        assert torch_all_close(actual_output, expected_output, atol=5e-3)
0 commit comments