diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py
index 63b6767a6f8f..8dee7c33eac6 100644
--- a/examples/controlnet/train_controlnet.py
+++ b/examples/controlnet/train_controlnet.py
@@ -86,6 +86,7 @@ def log_validation(vae, text_encoder, tokenizer, unet, controlnet, args, acceler
         controlnet=controlnet,
         safety_checker=None,
         revision=args.revision,
+        variant=args.variant,
         torch_dtype=weight_dtype,
     )
     pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
@@ -249,10 +250,13 @@ def parse_args(input_args=None):
         type=str,
         default=None,
         required=False,
-        help=(
-            "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
-            " float32 precision."
-        ),
+        help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
     )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -767,11 +771,13 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
     )
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     if args.controlnet_model_name_or_path:
diff --git a/examples/controlnet/train_controlnet_sdxl.py b/examples/controlnet/train_controlnet_sdxl.py
index b4fa96dae8ff..41a29c3945ab 100644
--- a/examples/controlnet/train_controlnet_sdxl.py
+++ b/examples/controlnet/train_controlnet_sdxl.py
@@ -74,6 +74,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step)
         unet=unet,
         controlnet=controlnet,
         revision=args.revision,
+        variant=args.variant,
         torch_dtype=weight_dtype,
     )
     pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
@@ -243,15 +244,18 @@ def parse_args(input_args=None):
         help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
         " If not specified controlnet weights are initialized from unet.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--revision",
         type=str,
         default=None,
         required=False,
-        help=(
-            "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
-            " float32 precision."
-        ),
+        help="Revision of pretrained model identifier from huggingface.co/models.",
     )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -793,10 +797,16 @@ def main(args):
     # Load the tokenizers
     tokenizer_one = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
     )
     tokenizer_two = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
     )
     # import correct text encoder classes
@@ -810,10 +820,10 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder_one = text_encoder_cls_one.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
     )
     text_encoder_two = text_encoder_cls_two.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
     )
     vae_path = (
         args.pretrained_model_name_or_path
@@ -824,9 +834,10 @@ def main(args):
     vae = AutoencoderKL.from_pretrained(
         vae_path,
         subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
         revision=args.revision,
+        variant=args.variant,
     )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     if args.controlnet_model_name_or_path:
diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index d7f78841a81a..c619a46dd99d 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -332,6 +332,12 @@ def parse_args(input_args=None):
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -740,6 +746,7 @@ def main(args):
             torch_dtype=torch_dtype,
             safety_checker=None,
             revision=args.revision,
+            variant=args.variant,
         )
         pipeline.set_progress_bar_config(disable=True)
@@ -801,11 +808,13 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
     )
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # Adding a modifier token which is optimized ####
@@ -1229,6 +1238,7 @@ def main(args):
                 text_encoder=accelerator.unwrap_model(text_encoder),
                 tokenizer=tokenizer,
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
             pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
@@ -1278,7 +1288,7 @@ def main(args):
         # Final inference
         # Load previous pipeline
         pipeline = DiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
+            args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype
         )
         pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
         pipeline = pipeline.to(accelerator.device)
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 92b57b728673..41854501144b 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -139,6 +139,7 @@ def log_validation(
         text_encoder=text_encoder,
         unet=accelerator.unwrap_model(unet),
         revision=args.revision,
+        variant=args.variant,
         torch_dtype=weight_dtype,
         **pipeline_args,
     )
@@ -239,10 +240,13 @@ def parse_args(input_args=None):
         type=str,
         default=None,
         required=False,
-        help=(
-            "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
-            " float32 precision."
-        ),
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
     )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -859,6 +863,7 @@ def main(args):
             torch_dtype=torch_dtype,
             safety_checker=None,
             revision=args.revision,
+            variant=args.variant,
         )
         pipeline.set_progress_bar_config(disable=True)
@@ -912,18 +917,18 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
     )
     if model_has_vae(args):
         vae = AutoencoderKL.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
         )
     else:
         vae = None
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
@@ -1379,6 +1384,7 @@ def compute_text_embeddings(prompt):
             args.pretrained_model_name_or_path,
             unet=accelerator.unwrap_model(unet),
             revision=args.revision,
+            variant=args.variant,
             **pipeline_args,
         )
diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py
index 5e8c385133e2..680c9dffdfcb 100644
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -460,7 +460,10 @@ def collate_fn(examples):
     # Load models and create wrapper for stable diffusion
     text_encoder = FlaxCLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", dtype=weight_dtype, revision=args.revision
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        dtype=weight_dtype,
+        revision=args.revision,
     )
     vae, vae_params = FlaxAutoencoderKL.from_pretrained(
         vae_arg,
@@ -468,7 +471,10 @@ def collate_fn(examples):
         **vae_kwargs,
     )
     unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", dtype=weight_dtype, revision=args.revision
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        dtype=weight_dtype,
+        revision=args.revision,
     )
     # Optimization
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index b82dfa38c172..3ba775b543d8 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -183,6 +183,12 @@ def parse_args(input_args=None):
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -750,6 +756,7 @@ def main(args):
             torch_dtype=torch_dtype,
             safety_checker=None,
             revision=args.revision,
+            variant=args.variant,
         )
         pipeline.set_progress_bar_config(disable=True)
@@ -803,11 +810,11 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
     )
     try:
         vae = AutoencoderKL.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
         )
     except OSError:
         # IF does not have a VAE so let's just set it to None
         vae = None
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # We only train the additional adapter LoRA layers
@@ -1310,6 +1317,7 @@ def compute_text_embeddings(prompt):
                 unet=accelerator.unwrap_model(unet),
                 text_encoder=None if args.pre_compute_text_embeddings else accelerator.unwrap_model(text_encoder),
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
@@ -1395,7 +1403,7 @@ def compute_text_embeddings(prompt):
         # Final inference
         # Load previous pipeline
         pipeline = DiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
+            args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype
         )
         # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py
index f4e7887c1c13..bbe8dab731e9 100644
--- a/examples/dreambooth/train_dreambooth_lora_sdxl.py
+++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py
@@ -204,6 +204,12 @@ def parse_args(input_args=None):
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -877,6 +883,7 @@ def main(args):
             args.pretrained_model_name_or_path,
             torch_dtype=torch_dtype,
             revision=args.revision,
+            variant=args.variant,
         )
         pipeline.set_progress_bar_config(disable=True)
@@ -915,10 +922,16 @@ def main(args):
     # Load the tokenizers
     tokenizer_one = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
     )
     tokenizer_two = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
     )
     # import correct text encoder classes
@@ -932,10 +945,10 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder_one = text_encoder_cls_one.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
     )
     text_encoder_two = text_encoder_cls_two.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
     )
     vae_path = (
         args.pretrained_model_name_or_path
@@ -943,10 +956,13 @@ def main(args):
         else args.pretrained_vae_model_name_or_path
     )
     vae = AutoencoderKL.from_pretrained(
-        vae_path, subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, revision=args.revision
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
     )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # We only train the additional adapter LoRA layers
@@ -1571,10 +1587,16 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
             # create pipeline
             if not args.train_text_encoder:
                 text_encoder_one = text_encoder_cls_one.from_pretrained(
-                    args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+                    args.pretrained_model_name_or_path,
+                    subfolder="text_encoder",
+                    revision=args.revision,
+                    variant=args.variant,
                 )
                 text_encoder_two = text_encoder_cls_two.from_pretrained(
-                    args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+                    args.pretrained_model_name_or_path,
+                    subfolder="text_encoder_2",
+                    revision=args.revision,
+                    variant=args.variant,
                 )
             pipeline = StableDiffusionXLPipeline.from_pretrained(
                 args.pretrained_model_name_or_path,
@@ -1583,6 +1605,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
                 text_encoder_2=accelerator.unwrap_model(text_encoder_two),
                 unet=accelerator.unwrap_model(unet),
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
@@ -1660,10 +1683,15 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
             vae_path,
             subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
             revision=args.revision,
+            variant=args.variant,
             torch_dtype=weight_dtype,
         )
         pipeline = StableDiffusionXLPipeline.from_pretrained(
-            args.pretrained_model_name_or_path, vae=vae, revision=args.revision, torch_dtype=weight_dtype
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
         )
         # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py
index b9b1c9cc5b3b..2766e4c99086 100644
--- a/examples/instruct_pix2pix/train_instruct_pix2pix.py
+++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py
@@ -78,6 +78,12 @@ def parse_args():
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -435,9 +441,11 @@ def main():
         args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
     )
     text_encoder = CLIPTextModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
     )
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
     unet = UNet2DConditionModel.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision
     )
@@ -915,6 +923,7 @@ def collate_fn(examples):
                 text_encoder=accelerator.unwrap_model(text_encoder),
                 vae=accelerator.unwrap_model(vae),
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
             pipeline = pipeline.to(accelerator.device)
@@ -966,6 +975,7 @@ def collate_fn(examples):
             vae=accelerator.unwrap_model(vae),
             unet=unet,
             revision=args.revision,
+            variant=args.variant,
         )
         pipeline.save_pretrained(args.output_dir)
diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
index 6b503cb29275..9b57b5eb08f9 100644
--- a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
+++ b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
@@ -118,6 +118,12 @@ def parse_args():
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -484,9 +490,10 @@ def main():
         vae_path,
         subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
         revision=args.revision,
+        variant=args.variant,
     )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # InstructPix2Pix uses an additional image for conditioning. To accommodate that,
@@ -695,10 +702,16 @@ def preprocess_images(examples):
     # Load scheduler, tokenizer and models.
     tokenizer_1 = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
     )
     tokenizer_2 = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
     )
     text_encoder_cls_1 = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
     text_encoder_cls_2 = import_model_class_from_model_name_or_path(
@@ -708,10 +721,10 @@ def preprocess_images(examples):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder_1 = text_encoder_cls_1.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
     )
     text_encoder_2 = text_encoder_cls_2.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
     )
     # We ALWAYS pre-compute the additional condition embeddings needed for SDXL
@@ -1109,6 +1122,7 @@ def collate_fn(examples):
                 tokenizer_2=tokenizer_2,
                 vae=vae,
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
             pipeline = pipeline.to(accelerator.device)
@@ -1176,6 +1190,7 @@ def collate_fn(examples):
             vae=vae,
             unet=unet,
             revision=args.revision,
+            variant=args.variant,
         )
         pipeline.save_pretrained(args.output_dir)
diff --git a/examples/t2i_adapter/train_t2i_adapter_sdxl.py b/examples/t2i_adapter/train_t2i_adapter_sdxl.py
index d1c9113bbd9d..f8e58bdb80fa 100644
--- a/examples/t2i_adapter/train_t2i_adapter_sdxl.py
+++ b/examples/t2i_adapter/train_t2i_adapter_sdxl.py
@@ -85,6 +85,7 @@ def log_validation(vae, unet, adapter, args, accelerator, weight_dtype, step):
         unet=unet,
         adapter=adapter,
         revision=args.revision,
+        variant=args.variant,
         torch_dtype=weight_dtype,
     )
     pipeline = pipeline.to(accelerator.device)
@@ -262,6 +263,12 @@ def parse_args(input_args=None):
             " float32 precision."
         ),
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -812,10 +819,16 @@ def main(args):
     # Load the tokenizers
     tokenizer_one = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
     )
     tokenizer_two = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
     )
     # import correct text encoder classes
@@ -829,10 +842,10 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder_one = text_encoder_cls_one.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
     )
     text_encoder_two = text_encoder_cls_two.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
     )
     vae_path = (
         args.pretrained_model_name_or_path
@@ -843,9 +856,10 @@ def main(args):
         vae_path,
         subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
         revision=args.revision,
+        variant=args.variant,
     )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     if args.adapter_model_name_or_path:
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 628a0c9d7d96..9a5482054939 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -148,6 +148,7 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight
         unet=accelerator.unwrap_model(unet),
         safety_checker=None,
         revision=args.revision,
+        variant=args.variant,
         torch_dtype=weight_dtype,
     )
     pipeline = pipeline.to(accelerator.device)
@@ -209,6 +210,12 @@ def parse_args():
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -567,10 +574,10 @@ def deepspeed_zero_init_disabled_context_manager():
     # across multiple gpus and only UNet2DConditionModel will get ZeRO sharded.
     with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
         text_encoder = CLIPTextModel.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
         )
         vae = AutoencoderKL.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
+            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
         )
     unet = UNet2DConditionModel.from_pretrained(
@@ -585,7 +592,7 @@ def deepspeed_zero_init_disabled_context_manager():
     # Create EMA for the unet.
     if args.use_ema:
         ema_unet = UNet2DConditionModel.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
         )
         ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
@@ -1026,6 +1033,7 @@ def collate_fn(examples):
             vae=vae,
             unet=unet,
             revision=args.revision,
+            variant=args.variant,
         )
         pipeline.save_pretrained(args.output_dir)
diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
index e62d03c730b1..aad29d1f565c 100644
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -54,6 +54,12 @@ def parse_args():
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index b7309196dec8..7d731c994bdd 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -130,6 +130,12 @@ def parse_args():
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -454,9 +460,11 @@ def main():
     text_encoder = CLIPTextModel.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
     )
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # freeze parameters of models to save more memory
     unet.requires_grad_(False)
@@ -881,6 +889,7 @@ def collate_fn(examples):
                 args.pretrained_model_name_or_path,
                 unet=accelerator.unwrap_model(unet),
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
             pipeline = pipeline.to(accelerator.device)
@@ -937,7 +946,7 @@ def collate_fn(examples):
     # Final inference
     # Load previous pipeline
     pipeline = DiffusionPipeline.from_pretrained(
-        args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
+        args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype
     )
     pipeline = pipeline.to(accelerator.device)
diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py
index 96bfe9e16783..b69a85e4f463 100644
--- a/examples/text_to_image/train_text_to_image_lora_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py
@@ -180,6 +180,12 @@ def parse_args(input_args=None):
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -570,10 +576,16 @@ def main(args):
     # Load the tokenizers
     tokenizer_one = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
     )
     tokenizer_two = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
     )
     # import correct text encoder classes
@@ -587,10 +599,10 @@ def main(args):
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     text_encoder_one = text_encoder_cls_one.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
    )
     text_encoder_two = text_encoder_cls_two.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
     )
     vae_path = (
         args.pretrained_model_name_or_path
@@ -598,10 +610,13 @@ def main(args):
         else args.pretrained_vae_model_name_or_path
     )
     vae = AutoencoderKL.from_pretrained(
-        vae_path, subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, revision=args.revision
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
     )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # We only train the additional adapter LoRA layers
@@ -1176,6 +1191,7 @@ def compute_time_ids(original_size, crops_coords_top_left):
                 text_encoder_2=accelerator.unwrap_model(text_encoder_two),
                 unet=accelerator.unwrap_model(unet),
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
@@ -1241,7 +1257,11 @@ def compute_time_ids(original_size, crops_coords_top_left):
         # Final inference
         # Load previous pipeline
         pipeline = StableDiffusionXLPipeline.from_pretrained(
-            args.pretrained_model_name_or_path, vae=vae, revision=args.revision, torch_dtype=weight_dtype
+            args.pretrained_model_name_or_path,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
         )
         pipeline = pipeline.to(accelerator.device)
diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py
index 041464e701cc..ee15e6f7def6 100644
--- a/examples/text_to_image/train_text_to_image_sdxl.py
+++ b/examples/text_to_image/train_text_to_image_sdxl.py
@@ -148,6 +148,12 @@ def parse_args(input_args=None):
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -618,10 +624,16 @@ def main(args):
     # Load the tokenizers
     tokenizer_one = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+        use_fast=False,
     )
     tokenizer_two = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer_2",
+        revision=args.revision,
+        use_fast=False,
     )
     # import correct text encoder classes
@@ -636,10 +648,10 @@ def main(args):
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
     # Check for terminal SNR in combination with SNR Gamma
     text_encoder_one = text_encoder_cls_one.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
     )
     text_encoder_two = text_encoder_cls_two.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
     )
     vae_path = (
         args.pretrained_model_name_or_path
@@ -647,10 +659,13 @@ def main(args):
         else args.pretrained_vae_model_name_or_path
     )
     vae = AutoencoderKL.from_pretrained(
-        vae_path, subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, revision=args.revision
+        vae_path,
+        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
+        revision=args.revision,
+        variant=args.variant,
     )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
     # Freeze vae and text encoders.
@@ -677,7 +692,7 @@ def main(args):
     # Create EMA for the unet.
     if args.use_ema:
         ema_unet = UNet2DConditionModel.from_pretrained(
-            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
         )
         ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
@@ -1145,12 +1160,14 @@ def compute_time_ids(original_size, crops_coords_top_left):
                 vae_path,
                 subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
                 revision=args.revision,
+                variant=args.variant,
             )
             pipeline = StableDiffusionXLPipeline.from_pretrained(
                 args.pretrained_model_name_or_path,
                 vae=vae,
                 unet=accelerator.unwrap_model(unet),
                 revision=args.revision,
+                variant=args.variant,
                 torch_dtype=weight_dtype,
             )
             if args.prediction_type is not None:
@@ -1198,10 +1215,16 @@ def compute_time_ids(original_size, crops_coords_top_left):
             vae_path,
            subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
            revision=args.revision,
+            variant=args.variant,
            torch_dtype=weight_dtype,
         )
         pipeline = StableDiffusionXLPipeline.from_pretrained(
-            args.pretrained_model_name_or_path, unet=unet, vae=vae, revision=args.revision, torch_dtype=weight_dtype
+            args.pretrained_model_name_or_path,
+            unet=unet,
+            vae=vae,
+            revision=args.revision,
+            variant=args.variant,
+            torch_dtype=weight_dtype,
         )
         if args.prediction_type is not None:
             scheduler_args = {"prediction_type": args.prediction_type}
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index 8ce998aab1fb..8e932add92af 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -126,6 +126,7 @@ def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight
         vae=vae,
         safety_checker=None,
         revision=args.revision,
+        variant=args.variant,
         torch_dtype=weight_dtype,
     )
     pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
@@ -206,6 +207,12 @@ def parse_args():
         required=False,
         help="Revision of pretrained model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -624,9 +631,11 @@ def main():
     text_encoder = CLIPTextModel.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
     )
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant
+    )
     unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
    )
     # Add the placeholder token in tokenizer
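Illustrative usage sketch (not part of the patch): the new `--variant` flag is simply forwarded as the `variant` keyword to each `from_pretrained` call, so diffusers resolves variant-suffixed weight files (for example `diffusion_pytorch_model.fp16.safetensors`) instead of the default checkpoint files, while `--revision` continues to select the repo branch, tag, or commit. The model id below is only an example of a repo that publishes fp16 variant weights.

# Hypothetical, minimal example of what the training scripts now do internally.
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example model id, assumed for illustration
    subfolder="unet",
    revision=None,      # maps to --revision: repo revision (branch/tag/commit)
    variant="fp16",     # maps to --variant: pick the fp16 variant of the weight files
)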