add meta backend for ROIAlign (#1585)

jiayisunx · web-flow · commit f5ce6193496a · 2023-04-26T16:52:46.000+08:00
diff --git a/csrc/cpu/aten/ROIAlign.cpp b/csrc/cpu/aten/ROIAlign.cpp
@@ -13,15 +13,19 @@ namespace cpu {
 DEFINE_DISPATCH(roi_align_forward_kernel_stub);
 DEFINE_DISPATCH(roi_align_backward_kernel_stub);
 
-at::Tensor IPEXROIAlignOp::_forward(
+at::Tensor ROIAlign_forward_impl(
     const at::Tensor& input,
     const at::Tensor& rois,
     double spatial_scale,
     int64_t pooled_height,
     int64_t pooled_width,
     int64_t sampling_ratio,
     bool aligned) {
-  RECORD_FUNCTION("IPEXROIAlignOp::_forward", c10::ArrayRef<c10::IValue>({}));
+#if defined(IPEX_DISP_OP)
+  printf("torch_ipex::ROIAlign_forward\n");
+#endif
+  RECORD_FUNCTION(
+      "torch_ipex::ROIAlign_forward", c10::ArrayRef<c10::IValue>({}));
 
   return roi_align_forward_kernel_stub(
       kCPU,
@@ -34,6 +38,66 @@ at::Tensor IPEXROIAlignOp::_forward(
       aligned);
 }
 
+at::Tensor ROIAlign_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t batch_size,
+    int64_t channels,
+    int64_t height,
+    int64_t width,
+    int64_t sampling_ratio,
+    bool aligned,
+    bool is_channels_last) {
+#if defined(IPEX_DISP_OP)
+  printf("torch_ipex::ROIAlign_backward\n");
+#endif // IPEX_DISP_OP
+  RECORD_FUNCTION(
+      "torch_ipex::ROIAlign_backward", c10::ArrayRef<c10::IValue>({}));
+  // Thin wrapper: every argument goes unchanged to the kCPU backward stub.
+  return roi_align_backward_kernel_stub(
+      kCPU,
+      grad,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      batch_size,
+      channels,
+      height,
+      width,
+      sampling_ratio,
+      aligned,
+      is_channels_last);
+}
+
+at::Tensor IPEXROIAlignOp::_forward(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned) {
+  at::AutoDispatchBelowADInplaceOrView g; // stays below autograd keys (confirm)
+  RECORD_FUNCTION("IPEXROIAlignOp::_forward", c10::ArrayRef<c10::IValue>({}));
+  // Look the op schema up once (static) and invoke it through the dispatcher.
+  static auto op = torch::Dispatcher::singleton()
+                       .findSchemaOrThrow("torch_ipex::ROIAlign_forward", "")
+                       .typed<decltype(ROIAlign_forward)>();
+
+  return op.call(
+      input,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio,
+      aligned);
+}
+
 at::Tensor IPEXROIAlignOp::forward(
     torch::autograd::AutogradContext* ctx,
     const at::Tensor& input,
@@ -45,7 +109,7 @@ at::Tensor IPEXROIAlignOp::forward(
     bool aligned) {
   RECORD_FUNCTION("IPEXROIAlignOp::forward", c10::ArrayRef<c10::IValue>({}));
 
-  ctx->saved_data["input_shape"] = input.sizes();
+  ctx->saved_data["input_shape"] = input.sym_sizes();
   ctx->saved_data["spatial_scale"] = spatial_scale;
   ctx->saved_data["pooled_height"] = pooled_height;
   ctx->saved_data["pooled_width"] = pooled_width;
@@ -55,8 +119,7 @@ at::Tensor IPEXROIAlignOp::forward(
       input.is_contiguous(at::MemoryFormat::ChannelsLast);
   ctx->save_for_backward({rois});
 
-  return roi_align_forward_kernel_stub(
-      kCPU,
+  return _forward(
       input,
       rois,
       spatial_scale,
@@ -81,8 +144,11 @@ torch::autograd::variable_list IPEXROIAlignOp::backward(
   auto saved = ctx->get_saved_variables();
   at::Tensor rois = saved[0];
 
-  at::Tensor grad_input = roi_align_backward_kernel_stub(
-      kCPU,
+  static auto op = torch::Dispatcher::singleton()
+                       .findSchemaOrThrow("torch_ipex::ROIAlign_backward", "")
+                       .typed<decltype(ROIAlign_backward)>();
+
+  auto grad_input = op.call(
       grad_outputs[0],
       rois,
       spatial_scale,
@@ -134,45 +200,26 @@ at::Tensor ROIAlign_forward(
       aligned);
 }
 
-} // namespace cpu
-} // namespace torch_ipex
-
-namespace torch_ipex {
-namespace autocast {
-
-at::Tensor roi_align_autocast(
+at::Tensor ROIAlign_forward_meta(
     const at::Tensor& input,
     const at::Tensor& rois,
     double spatial_scale,
     int64_t pooled_height,
     int64_t pooled_width,
     int64_t sampling_ratio,
     bool aligned) {
-  c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU);
-  static auto op = torch::Dispatcher::singleton()
-                       .findSchemaOrThrow("torchvision::roi_align", "")
-                       .typed<decltype(torch_ipex::cpu::ROIAlign_forward)>();
-  if (input.scalar_type() == at::ScalarType::BFloat16) {
-    return op.call(
-        input,
-        cpu_cached_cast(at::kFloat, rois),
-        spatial_scale,
-        pooled_height,
-        pooled_width,
-        sampling_ratio,
-        aligned);
-  } else {
-    return op.call(
-        input,
-        cpu_cached_cast(input.scalar_type(), rois),
-        spatial_scale,
-        pooled_height,
-        pooled_width,
-        sampling_ratio,
-        aligned);
-  }
+  auto num_rois = rois.sym_size(0); // symbolic sizes: no tensor data is read
+  auto channels = input.sym_size(1);
+  return at::empty_symint(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
 }
 
+} // namespace cpu
+} // namespace torch_ipex
+
+namespace torch_ipex {
+namespace autocast {
+
 at::Tensor ROIAlign_forward(
     const at::Tensor& input,
     const at::Tensor& rois,
@@ -222,6 +269,21 @@ IPEX_TORCH_LIBRARY_FRAGMENT(torch_ipex, m) {
       "ROIAlign_forward",
       c10::DispatchKey::AutocastCPU,
       torch_ipex::autocast::ROIAlign_forward);
+  m.impl(
+      "ROIAlign_forward",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::ROIAlign_forward_impl);
+  m.impl(
+      "ROIAlign_forward",
+      c10::DispatchKey::Meta,
+      torch_ipex::cpu::ROIAlign_forward_meta);
+  // backward op: schema definition plus its CPU implementation
+  m.def(
+      "ROIAlign_backward(Tensor grad, Tensor rois, float spatial_scale, int pooled_height, int pooled_width, int batch_size, int channels, int height, int width, int sampling_ratio, bool aligned, bool is_channels_last) -> Tensor");
+  m.impl(
+      "ROIAlign_backward",
+      c10::DispatchKey::CPU,
+      torch_ipex::cpu::ROIAlign_backward);
 }
 
 IPEX_TORCH_LIBRARY_FRAGMENT(torchvision, m) {
@@ -232,7 +294,7 @@ IPEX_TORCH_LIBRARY_FRAGMENT(torchvision, m) {
   m.impl(
       "roi_align",
       c10::DispatchKey::AutocastCPU,
-      torch_ipex::autocast::roi_align_autocast);
+      torch_ipex::autocast::ROIAlign_forward);
 }
 
 } // namespace
diff --git a/csrc/cpu/aten/ROIAlign.h b/csrc/cpu/aten/ROIAlign.h
@@ -7,6 +7,29 @@
 namespace torch_ipex {
 namespace cpu {
 
+at::Tensor ROIAlign_forward_impl(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned);
+
+at::Tensor ROIAlign_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t batch_size,
+    int64_t channels,
+    int64_t height,
+    int64_t width,
+    int64_t sampling_ratio,
+    bool aligned,
+    bool is_channels_last);
+
 class IPEXROIAlignOp : public torch::autograd::Function<IPEXROIAlignOp> {
  public:
   // forward function without autograd overhead, will go this way when only do
@@ -44,6 +67,15 @@ at::Tensor ROIAlign_forward(
     int64_t sampling_ratio,
     bool aligned);
 
+at::Tensor ROIAlign_forward_meta(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned);
+
 namespace {
 
 template <typename T>
diff --git a/csrc/cpu/aten/kernels/ROIAlignKrnl.cpp b/csrc/cpu/aten/kernels/ROIAlignKrnl.cpp
@@ -673,12 +673,6 @@ at::Tensor roi_align_forward_kernel_impl(
     int64_t pooled_width,
     int64_t sampling_ratio,
     bool aligned) {
-#if defined(IPEX_DISP_OP)
-  printf("torch_ipex::ROIAlign_forward\n");
-#endif
-  RECORD_FUNCTION(
-      "torch_ipex::ROIAlign_forward", c10::ArrayRef<c10::IValue>({}));
-
   TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor");
   TORCH_CHECK(rois.device().is_cpu(), "rois must be a CPU tensor");
   TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]");
@@ -741,12 +735,6 @@ at::Tensor roi_align_backward_kernel_impl(
     int64_t sampling_ratio,
     bool aligned,
     bool is_channels_last) {
-#if defined(IPEX_DISP_OP)
-  printf("torch_ipex::ROIAlign_backward\n");
-#endif
-  RECORD_FUNCTION(
-      "torch_ipex::ROIAlign_backward", c10::ArrayRef<c10::IValue>({}));
-
   TORCH_CHECK(grad.device().is_cpu(), "grad must be a CPU tensor");
   TORCH_CHECK(rois.device().is_cpu(), "rois must be a CPU tensor");
 
diff --git a/tests/cpu/test_roialign.py b/tests/cpu/test_roialign.py
@@ -1,4 +1,5 @@
 import unittest, copy
+import itertools
 import torch
 import intel_extension_for_pytorch as ipex
 from common_utils import TestCase
@@ -219,6 +220,54 @@ def test_torchvision_roialign(self):
             self.assertTrue(x4.grad.dtype == torch.bfloat16)
             self.assertTrue(torch.allclose(gt_x.grad.to(x4.dtype), x4.grad, rtol=1e-5, atol=1e-5))
 
+    @skipIfNoTorchVision
+    def test_torchvision_roialign_torchcompile(self):
+        pool_size = 5
+        n_channels = 2 * (pool_size ** 2)
+        x = torch.rand(2, n_channels, 10, 10)
+        rois = torch.tensor([[0, 0, 0, 9, 9],  # row = (batch_idx, x1, y1, x2, y2)
+                             [0, 0, 5, 4, 9],
+                             [0, 5, 5, 9, 9],
+                             [1, 0, 0, 9, 9]])
+        pool_h, pool_w = pool_size, pool_size
+        # Eager and compiled results must agree for every dtype/backend pair.
+        # TODO: add dynamic tests when 'ipex' backend supports it.
+        for dtype, backend, dynamic in itertools.product([torch.float32, torch.bfloat16], ['ipex', 'inductor'], [False]):
+            torch._dynamo.reset()
+            torchcompile_torchvision_fn = torch.compile(torchvision_fn, backend=backend, dynamic=dynamic)
+            x = x.to(dtype=dtype)
+            rois = rois.to(dtype=dtype)
+            # forward
+            with torch.cpu.amp.autocast(enabled=(dtype==torch.bfloat16)), torch.no_grad():
+                y0 = torchvision_fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1)
+                y1 = torchcompile_torchvision_fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1)
+                self.assertEqual(y0, y1)
+                self.assertTrue(y1.dtype == dtype)
+
+    @skipIfNoTorchVision
+    def test_roialign_torchcompile(self):
+        pool_size = 5
+        n_channels = 2 * (pool_size ** 2)
+        x = torch.rand(2, n_channels, 10, 10)
+        rois = torch.tensor([[0, 0, 0, 9, 9],  # row = (batch_idx, x1, y1, x2, y2)
+                             [0, 0, 5, 4, 9],
+                             [0, 5, 5, 9, 9],
+                             [1, 0, 0, 9, 9]])
+        pool_h, pool_w = pool_size, pool_size
+        torch._dynamo.allow_in_graph(ipex.nn.modules._roi_align.RoIAlign)
+        # Eager and compiled results must agree for every dtype/backend pair.
+        # TODO: add dynamic tests when 'ipex' backend supports it.
+        for dtype, backend, dynamic in itertools.product([torch.float32, torch.bfloat16], ['ipex', 'inductor'], [False]):
+            torch._dynamo.reset()
+            torchcompile_fn = torch.compile(fn, backend=backend, dynamic=dynamic)
+            x = x.to(dtype=dtype)
+            rois = rois.to(dtype=dtype)
+            # forward
+            with torch.cpu.amp.autocast(enabled=(dtype==torch.bfloat16)), torch.no_grad():
+                y0 = fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1)
+                y1 = torchcompile_fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1)
+                self.assertEqual(y0, y1)
+                self.assertTrue(y1.dtype == dtype)
 
 if __name__ == '__main__':
     test = unittest.main()