Debug sd conv gn geglu (#7)

jackalcooper · liujuncheng · web-flow · commit f42995458095 · 2022-11-07T15:25:26.000+08:00
Co-authored-by: liujuncheng <liujuncheng1022@gmail.com>
diff --git a/src/diffusers/models/attention_oneflow.py b/src/diffusers/models/attention_oneflow.py
@@ -349,5 +349,13 @@ def __init__(self, dim_in: int, dim_out: int):
         self.proj = nn.Linear(dim_in, dim_out * 2)
 
     def forward(self, hidden_states):
+        x_shape = hidden_states.shape
+        if len(x_shape) != 2:
+            hidden_states = hidden_states.reshape(-1, x_shape[-1])
+        out = torch._C.fused_geglu(hidden_states, self.proj.weight, self.proj.bias)
+        if len(x_shape) != 2:
+            out_shape = x_shape[0:len(x_shape) -1 ] + (-1, )
+            out = out.reshape(out_shape)
+        return out
         hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
         return hidden_states * F.gelu(gate)
diff --git a/src/diffusers/pipeline_oneflow_utils.py b/src/diffusers/pipeline_oneflow_utils.py
@@ -353,7 +353,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 class_name = "OneFlow" + class_name
                 print(f"[oneflow]", f"[{name}]", f"{library_name}.{class_name}")
             else:
-                print(f"[python]", f"[{name}]", f"{library_name}.{class_name}")
+                print(f"[diffusers]", f"[{name}]", f"{library_name}.{class_name}")
             # 3.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names
             if class_name.startswith("Flax"):
                 class_name = class_name[4:]
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_oneflow.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_oneflow.py
@@ -32,6 +32,7 @@ def __init__(self, unet):
         self.config.enable_cudnn_conv_heuristic_search_algo(False)
 
     def build(self, latent_model_input, t, text_embeddings):
+        text_embeddings = torch._C.amp_white_identity(text_embeddings)
         return self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
 
 class OneFlowStableDiffusionPipeline(DiffusionPipeline):
@@ -98,6 +99,8 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.unet_graph = UNetGraph(self.unet)
+        self.unet_compiled = False
 
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
@@ -185,6 +188,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
 
+        from timeit import default_timer as timer
+        start = timer()
         if "torch_device" in kwargs:
             device = kwargs.pop("torch_device")
             warnings.warn(
@@ -271,12 +276,17 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
-        unet_graph = UNetGraph(self.unet)
-
-        print("[oneflow]", "compiling unet beforehand to make sure the progress bar is more accurate")
-        i, t = list(enumerate(self.scheduler.timesteps))[0]
-        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-        unet_graph._compile(latent_model_input, t, text_embeddings)
+        compilation_start = timer()
+        compilation_time = 0
+        if self.unet_compiled == False:
+            print("[oneflow]", "compiling unet beforehand to make sure the progress bar is more accurate")
+            i, t = list(enumerate(self.scheduler.timesteps))[0]
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            self.unet_graph._compile(latent_model_input, t, text_embeddings)
+            self.unet_compiled = True
+            self.unet_graph(latent_model_input, t, text_embeddings) # warmup
+            compilation_time = timer() - compilation_start
+            print("[oneflow]", "[elapsed(s)]", "[unet compilation]", compilation_time)
 
         for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
             torch._oneflow_internal.profiler.RangePush(f"denoise-{i}")
@@ -289,7 +299,7 @@ def __call__(
 
             # predict the noise residual
             torch._oneflow_internal.profiler.RangePush(f"denoise-{i}-unet-graph")
-            noise_pred = unet_graph(latent_model_input, t, text_embeddings)
+            noise_pred = self.unet_graph(latent_model_input, t, text_embeddings)
             torch._oneflow_internal.profiler.RangePop()
 
             # perform guidance
@@ -310,6 +320,8 @@ def __call__(
         if isinstance(latents, np.ndarray):
             latents = torch.from_numpy(latents)
         image = self.vae.decode(latents).sample
+        print("[oneflow]", "[elapsed(s)]", "[image]", timer() - start - compilation_time)
+        post_process_start = timer()
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
@@ -328,4 +340,6 @@ def __call__(
             return (image, has_nsfw_concept)
         import torch as og_torch
         assert og_torch.cuda.is_initialized() is False
+
+        print("[oneflow]", "[elapsed(s)]", "[post-process]", timer() - post_process_start)
         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)