Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,16 +683,19 @@ def check_inputs(
self,
prompt,
image,
mask_image,
height,
width,
callback_steps,
output_type,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
controlnet_conditioning_scale=1.0,
control_guidance_start=0.0,
control_guidance_end=1.0,
callback_on_step_end_tensor_inputs=None,
padding_mask_crop=None,
):
if height is not None and height % 8 != 0 or width is not None and width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
Expand Down Expand Up @@ -736,6 +739,24 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)

if padding_mask_crop is not None:
if self.unet.config.in_channels != 4 and self.unet.config.in_channels != 9:
raise ValueError(
f"The UNet should have 4 or 9 input channels for inpainting mask crop, but has"
f" {self.unet.config.in_channels} input channels."
)
if not isinstance(image, PIL.Image.Image):
raise ValueError(
f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
)
if not isinstance(mask_image, PIL.Image.Image):
raise ValueError(
f"The mask image should be a PIL image when inpainting mask crop, but is of type"
f" {type(mask_image)}."
)
if output_type != "pil":
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")

# `prompt` needs more sophisticated handling when there are multiple
# conditionings.
if isinstance(self.controlnet, MultiControlNetModel):
Expand Down Expand Up @@ -862,7 +883,6 @@ def check_image(self, image, prompt, prompt_embeds):
f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
)

# Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
def prepare_control_image(
self,
image,
Expand All @@ -872,10 +892,14 @@ def prepare_control_image(
num_images_per_prompt,
device,
dtype,
crops_coords,
resize_mode,
do_classifier_free_guidance=False,
guess_mode=False,
):
image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
image = self.control_image_processor.preprocess(
image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
).to(dtype=torch.float32)
image_batch_size = image.shape[0]

if image_batch_size == 1:
Expand Down Expand Up @@ -1074,6 +1098,7 @@ def __call__(
control_image: PipelineImageInput = None,
height: Optional[int] = None,
width: Optional[int] = None,
padding_mask_crop: Optional[int] = None,
strength: float = 1.0,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
Expand Down Expand Up @@ -1130,6 +1155,12 @@ def __call__(
The height in pixels of the generated image.
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
padding_mask_crop (`int`, *optional*, defaults to `None`):
The size of the margin in the crop to be applied to the image and mask. If `None`, no crop is applied to image and mask_image. If
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that
contains all masked areas, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
and contains information irrelevant for inpainting, such as background.
strength (`float`, *optional*, defaults to 1.0):
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
starting point and more noise is added the higher the `strength`. The number of denoising steps depends
Expand Down Expand Up @@ -1240,16 +1271,19 @@ def __call__(
self.check_inputs(
prompt,
control_image,
mask_image,
height,
width,
callback_steps,
output_type,
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
controlnet_conditioning_scale,
control_guidance_start,
control_guidance_end,
callback_on_step_end_tensor_inputs,
padding_mask_crop,
)

self._guidance_scale = guidance_scale
Expand All @@ -1264,6 +1298,17 @@ def __call__(
else:
batch_size = prompt_embeds.shape[0]

if padding_mask_crop is not None:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

um just saw your issue #6435
maybe we need to move this code into prepare_control_image()?

see my comment here #6435 (comment)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it would work. width and height are still None in there. Do you think we should handle None in get_crop_region?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok!
you can use self.image_processor.get_default_height_width(image) to get it

if width is None or height is None:
default_height, default_width = self.image_processor.get_default_height_width(image)
width = width or default_width
height = height or default_height
crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
resize_mode = "fill"
else:
crops_coords = None
resize_mode = "default"

device = self._execution_device

if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
Expand Down Expand Up @@ -1315,6 +1360,8 @@ def __call__(
num_images_per_prompt=num_images_per_prompt,
device=device,
dtype=controlnet.dtype,
crops_coords=crops_coords,
resize_mode=resize_mode,
do_classifier_free_guidance=self.do_classifier_free_guidance,
guess_mode=guess_mode,
)
Expand All @@ -1330,6 +1377,8 @@ def __call__(
num_images_per_prompt=num_images_per_prompt,
device=device,
dtype=controlnet.dtype,
crops_coords=crops_coords,
resize_mode=resize_mode,
do_classifier_free_guidance=self.do_classifier_free_guidance,
guess_mode=guess_mode,
)
Expand All @@ -1341,10 +1390,15 @@ def __call__(
assert False

# 4.1 Preprocess mask and image - resizes image and mask w.r.t height and width
init_image = self.image_processor.preprocess(image, height=height, width=width)
original_image = image
init_image = self.image_processor.preprocess(
image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
)
init_image = init_image.to(dtype=torch.float32)

mask = self.mask_processor.preprocess(mask_image, height=height, width=width)
mask = self.mask_processor.preprocess(
mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
)

masked_image = init_image * (mask < 0.5)
_, _, height, width = init_image.shape
Expand Down Expand Up @@ -1534,6 +1588,9 @@ def __call__(

image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

if padding_mask_crop is not None:
image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]

# Offload all models
self.maybe_free_model_hooks()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -557,9 +557,11 @@ def check_inputs(
prompt,
prompt_2,
image,
mask_image,
strength,
num_inference_steps,
callback_steps,
output_type,
negative_prompt=None,
negative_prompt_2=None,
prompt_embeds=None,
Expand All @@ -570,6 +572,7 @@ def check_inputs(
control_guidance_start=0.0,
control_guidance_end=1.0,
callback_on_step_end_tensor_inputs=None,
padding_mask_crop=None,
):
if strength < 0 or strength > 1:
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
Expand Down Expand Up @@ -632,6 +635,24 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)

if padding_mask_crop is not None:
if self.unet.config.in_channels != 4 and self.unet.config.in_channels != 9:
raise ValueError(
f"The UNet should have 4 or 9 input channels for inpainting mask crop, but has"
f" {self.unet.config.in_channels} input channels."
)
if not isinstance(image, PIL.Image.Image):
raise ValueError(
f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}."
)
if not isinstance(mask_image, PIL.Image.Image):
raise ValueError(
f"The mask image should be a PIL image when inpainting mask crop, but is of type"
f" {type(mask_image)}."
)
if output_type != "pil":
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")

if prompt_embeds is not None and pooled_prompt_embeds is None:
raise ValueError(
"If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
Expand Down Expand Up @@ -745,10 +766,14 @@ def prepare_control_image(
num_images_per_prompt,
device,
dtype,
crops_coords,
resize_mode,
do_classifier_free_guidance=False,
guess_mode=False,
):
image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
image = self.control_image_processor.preprocess(
image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
).to(dtype=torch.float32)
image_batch_size = image.shape[0]

if image_batch_size == 1:
Expand Down Expand Up @@ -1066,6 +1091,7 @@ def __call__(
] = None,
height: Optional[int] = None,
width: Optional[int] = None,
padding_mask_crop: Optional[int] = None,
strength: float = 0.9999,
num_inference_steps: int = 50,
denoising_start: Optional[float] = None,
Expand Down Expand Up @@ -1121,6 +1147,12 @@ def __call__(
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The width in pixels of the generated image.
padding_mask_crop (`int`, *optional*, defaults to `None`):
The size of the margin in the crop to be applied to the image and mask. If `None`, no crop is applied to image and mask_image. If
`padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that
contains all masked areas, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
and contains information irrelevant for inpainting, such as background.
strength (`float`, *optional*, defaults to 0.9999):
Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
Expand Down Expand Up @@ -1290,9 +1322,11 @@ def __call__(
prompt,
prompt_2,
control_image,
mask_image,
strength,
num_inference_steps,
callback_steps,
output_type,
negative_prompt,
negative_prompt_2,
prompt_embeds,
Expand All @@ -1303,6 +1337,7 @@ def __call__(
control_guidance_start,
control_guidance_end,
callback_on_step_end_tensor_inputs,
padding_mask_crop,
)

self._guidance_scale = guidance_scale
Expand Down Expand Up @@ -1370,7 +1405,21 @@ def denoising_value_valid(dnv):

# 5. Preprocess mask and image - resizes image and mask w.r.t height and width
# 5.1 Prepare init image
init_image = self.image_processor.preprocess(image, height=height, width=width)
if padding_mask_crop is not None:
if width is None or height is None:
default_height, default_width = self.image_processor.get_default_height_width(image)
width = width or default_width
height = height or default_height
crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
resize_mode = "fill"
else:
crops_coords = None
resize_mode = "default"

original_image = image
init_image = self.image_processor.preprocess(
image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
)
init_image = init_image.to(dtype=torch.float32)

# 5.2 Prepare control images
Expand All @@ -1383,6 +1432,8 @@ def denoising_value_valid(dnv):
num_images_per_prompt=num_images_per_prompt,
device=device,
dtype=controlnet.dtype,
crops_coords=crops_coords,
resize_mode=resize_mode,
do_classifier_free_guidance=self.do_classifier_free_guidance,
guess_mode=guess_mode,
)
Expand All @@ -1398,6 +1449,8 @@ def denoising_value_valid(dnv):
num_images_per_prompt=num_images_per_prompt,
device=device,
dtype=controlnet.dtype,
crops_coords=crops_coords,
resize_mode=resize_mode,
do_classifier_free_guidance=self.do_classifier_free_guidance,
guess_mode=guess_mode,
)
Expand All @@ -1409,7 +1462,9 @@ def denoising_value_valid(dnv):
raise ValueError(f"{controlnet.__class__} is not supported.")

# 5.3 Prepare mask
mask = self.mask_processor.preprocess(mask_image, height=height, width=width)
mask = self.mask_processor.preprocess(
mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
)

masked_image = init_image * (mask < 0.5)
_, _, height, width = init_image.shape
Expand Down Expand Up @@ -1684,6 +1739,9 @@ def denoising_value_valid(dnv):

image = self.image_processor.postprocess(image, output_type=output_type)

if padding_mask_crop is not None:
image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]

# Offload all models
self.maybe_free_model_hooks()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,7 @@ def check_inputs(
width,
strength,
callback_steps,
output_type,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
Expand Down Expand Up @@ -693,9 +694,9 @@ def check_inputs(
f" {negative_prompt_embeds.shape}."
)
if padding_mask_crop is not None:
if self.unet.config.in_channels != 4:
if self.unet.config.in_channels != 4 and self.unet.config.in_channels != 9:
raise ValueError(
f"The UNet should have 4 input channels for inpainting mask crop, but has"
f"The UNet should have 4 or 9 input channels for inpainting mask crop, but has"
f" {self.unet.config.in_channels} input channels."
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can remove this warning

if not isinstance(image, PIL.Image.Image):
Expand All @@ -707,6 +708,8 @@ def check_inputs(
f"The mask image should be a PIL image when inpainting mask crop, but is of type"
f" {type(mask_image)}."
)
if output_type != "pil":
raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.")

def prepare_latents(
self,
Expand Down Expand Up @@ -1166,6 +1169,7 @@ def __call__(
width,
strength,
callback_steps,
output_type,
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
Expand Down
Loading