Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions kernels/portable/cpu/op__clone_dim_order.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {
namespace native {

using Tensor = executorch::aten::Tensor;

template <typename T>
using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;

/**
 * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
 * dim_order=None, Tensor(a!) out) -> Tensor(a!)
 *
 * Clones via element-wise copy while preserving dim_order.
 *
 * @param ctx Kernel runtime context, used for failure reporting by the
 *     ET_KERNEL_CHECK macros below.
 * @param self Input tensor to clone.
 * @param non_blocking Passed through to check__to_dim_order_copy_args,
 *     which presumably rejects unsupported values — see that helper.
 * @param dim_order Optional target dim order for `out`; validated against
 *     `out`'s layout by check__to_dim_order_copy_args.
 * @param out Output tensor; resized to `self`'s shape if necessary.
 * @return `out`, with `self`'s data copied into it.
 */
Tensor& _clone_dim_order_out(
    KernelRuntimeContext& ctx,
    const Tensor& self,
    bool non_blocking,
    OptionalArrayRef<int64_t> dim_order,
    Tensor& out) {
  // NOTE: the previous `(void)ctx;` suppression was removed — `ctx` is
  // genuinely used by every ET_KERNEL_CHECK below.

  // Ensure input and output dtype match.
  ET_KERNEL_CHECK(
      ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out);

  // Ensure output has the same layout as input or matches dim_order.
  ET_KERNEL_CHECK(
      ctx,
      check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
      InvalidArgument,
      out);

  // Ensure input and output shapes match, resizing `out` if necessary.
  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, self.sizes()) == torch::executor::Error::Ok,
      InvalidArgument,
      out);

  // Nothing to copy for an empty tensor.
  if (self.numel() == 0) {
    return out;
  }

  // Select the correct input dtype and copy the tensors.
  ET_SWITCH_REALHBBF16_TYPES(
      self.scalar_type(),
      ctx,
      "dim_order_ops::_clone_dim_order.out",
      CTYPE,
      [&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });

  return out;
}

/// Context-free convenience overload: constructs a default runtime context
/// and forwards to the main _clone_dim_order_out kernel.
Tensor& _clone_dim_order_out(
    const Tensor& self,
    bool non_blocking,
    OptionalArrayRef<int64_t> dim_order,
    Tensor& out) {
  executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext ctx{};
  return _clone_dim_order_out(ctx, self, non_blocking, dim_order, out);
}

} // namespace native
} // namespace executor
} // namespace torch
23 changes: 0 additions & 23 deletions kernels/portable/cpu/op__to_dim_order_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
template <typename T>
using Optional = std::optional<T>;

namespace {

template <typename SELF_CTYPE, typename OUT_CTYPE>
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();

// Here we make a slightly off-label use of
// BroadcastIndexesRange. It always assumes it doesn't have to care
// about different dim_order between input and output, but we can
// just force it to respect strides (and thus dim_order) for its
// inputs using support_noncontiguous_input_tensors=true, and then pretend
// the output is just another input.
for (const auto [unused_index, self_data_index, out_data_index] :
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
/*dummy output*/ self, self, out)) {
(void)unused_index;
out_data[out_data_index] =
static_cast<OUT_CTYPE>(self_data[self_data_index]);
}
}
} // namespace

// _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]?
// dim_order=None, Tensor(a!) out) -> Tensor(a!)
Tensor& _to_dim_order_copy_out(
Expand Down
24 changes: 24 additions & 0 deletions kernels/portable/cpu/util/copy_ops_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once
#include <c10/util/irange.h>

#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand Down Expand Up @@ -77,6 +78,29 @@ void as_strided_copy(
}
}

/**
* Copies and casts a tensor while preserving input dim_order.
*/
template <typename SELF_CTYPE, typename OUT_CTYPE>
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();

// Here we make a slightly off-label use of
// BroadcastIndexesRange. It always assumes it doesn't have to care
// about different dim_order between input and output, but we can
// just force it to respect strides (and thus dim_order) for its
// inputs using support_noncontiguous_input_tensors=true, and then pretend
// the output is just another input.
for (const auto [unused_index, self_data_index, out_data_index] :
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
/*dummy output*/ self, self, out)) {
(void)unused_index;
out_data[out_data_index] =
static_cast<OUT_CTYPE>(self_data[self_data_index]);
}
}

bool check_cat_args(
executorch::aten::ArrayRef<Tensor> tensors,
int64_t dim,
Expand Down
4 changes: 3 additions & 1 deletion kernels/portable/cpu/util/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ def define_common_targets():
"copy_ops_util.h",
],
compiler_flags = ["-Wno-missing-prototypes"],
exported_deps = [
":broadcast_util",
],
deps = [
"//executorch/runtime/kernel:kernel_includes",
],
Expand Down Expand Up @@ -348,7 +351,6 @@ def define_common_targets():
],
)


runtime.cxx_library(
name = "arange_util{}".format(suffix),
srcs = ["arange_util.cpp"],
Expand Down
5 changes: 5 additions & 0 deletions kernels/portable/functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1009,3 +1009,8 @@
kernels:
- arg_meta: null
kernel_name: torch::executor::_to_dim_order_copy_out

- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: torch::executor::_clone_dim_order_out
1 change: 1 addition & 0 deletions kernels/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ add_custom_target(
set(all_test_sources
"BinaryLogicalOpTest.cpp"
"op__to_dim_order_copy_test.cpp"
"op__clone_dim_order_test.cpp"
"op_abs_test.cpp"
"op_acos_test.cpp"
"op_acosh_test.cpp"
Expand Down
Loading
Loading