Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions kernels/portable/cpu/op__clone_dim_order.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {
namespace native {

using Tensor = executorch::aten::Tensor;

template <typename T>
using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;

/**
 * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
 * dim_order=None, Tensor(a!) out) -> Tensor(a!)
 *
 * Clones via element-wise copy while preserving dim_order.
 *
 * @param ctx Kernel runtime context, used for failure reporting by the
 *     ET_KERNEL_CHECK macros below.
 * @param self Input tensor to clone.
 * @param non_blocking Passed through to check__to_dim_order_copy_args,
 *     which presumably rejects unsupported values — see that helper.
 * @param dim_order Optional target dim order for `out`; validated against
 *     `out`'s layout by check__to_dim_order_copy_args.
 * @param out Output tensor; resized to `self`'s shape if necessary.
 * @return `out`, with `self`'s data copied into it.
 */
Tensor& _clone_dim_order_out(
    KernelRuntimeContext& ctx,
    const Tensor& self,
    bool non_blocking,
    OptionalArrayRef<int64_t> dim_order,
    Tensor& out) {
  // NOTE: the previous `(void)ctx;` suppression was removed — `ctx` is
  // genuinely used by every ET_KERNEL_CHECK below.

  // Ensure input and output dtype match.
  ET_KERNEL_CHECK(
      ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out);

  // Ensure output has the same layout as input or matches dim_order.
  ET_KERNEL_CHECK(
      ctx,
      check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
      InvalidArgument,
      out);

  // Ensure input and output shapes match, resizing `out` if necessary.
  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, self.sizes()) == torch::executor::Error::Ok,
      InvalidArgument,
      out);

  // Nothing to copy for an empty tensor.
  if (self.numel() == 0) {
    return out;
  }

  // Select the correct input dtype and copy the tensors.
  ET_SWITCH_REALHBBF16_TYPES(
      self.scalar_type(),
      ctx,
      "dim_order_ops::_clone_dim_order.out",
      CTYPE,
      [&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });

  return out;
}

/// Context-free convenience overload: constructs a default runtime context
/// and forwards to the main _clone_dim_order_out kernel.
Tensor& _clone_dim_order_out(
    const Tensor& self,
    bool non_blocking,
    OptionalArrayRef<int64_t> dim_order,
    Tensor& out) {
  executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext ctx{};
  return _clone_dim_order_out(ctx, self, non_blocking, dim_order, out);
}

} // namespace native
} // namespace executor
} // namespace torch
23 changes: 0 additions & 23 deletions kernels/portable/cpu/op__to_dim_order_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
template <typename T>
using Optional = std::optional<T>;

namespace {

template <typename SELF_CTYPE, typename OUT_CTYPE>
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();

// Here we make a slightly off-label use of
// BroadcastIndexesRange. It always assumes it doesn't have to care
// about different dim_order between input and output, but we can
// just force it to respect strides (and thus dim_order) for its
// inputs using support_noncontiguous_input_tensors=true, and then pretend
// the output is just another input.
for (const auto [unused_index, self_data_index, out_data_index] :
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
/*dummy output*/ self, self, out)) {
(void)unused_index;
out_data[out_data_index] =
static_cast<OUT_CTYPE>(self_data[self_data_index]);
}
}
} // namespace

// _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]?
// dim_order=None, Tensor(a!) out) -> Tensor(a!)
Tensor& _to_dim_order_copy_out(
Expand Down
24 changes: 24 additions & 0 deletions kernels/portable/cpu/util/copy_ops_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once
#include <c10/util/irange.h>

#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand Down Expand Up @@ -77,6 +78,29 @@ void as_strided_copy(
}
}

/**
* Copies and casts a tensor while preserving input dim_order.
*/
template <typename SELF_CTYPE, typename OUT_CTYPE>
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();

// Here we make a slightly off-label use of
// BroadcastIndexesRange. It always assumes it doesn't have to care
// about different dim_order between input and output, but we can
// just force it to respect strides (and thus dim_order) for its
// inputs using support_noncontiguous_input_tensors=true, and then pretend
// the output is just another input.
for (const auto [unused_index, self_data_index, out_data_index] :
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
/*dummy output*/ self, self, out)) {
(void)unused_index;
out_data[out_data_index] =
static_cast<OUT_CTYPE>(self_data[self_data_index]);
}
}

bool check_cat_args(
executorch::aten::ArrayRef<Tensor> tensors,
int64_t dim,
Expand Down
4 changes: 3 additions & 1 deletion kernels/portable/cpu/util/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ def define_common_targets():
"copy_ops_util.h",
],
compiler_flags = ["-Wno-missing-prototypes"],
exported_deps = [
":broadcast_util",
],
deps = [
"//executorch/runtime/kernel:kernel_includes",
],
Expand Down Expand Up @@ -348,7 +351,6 @@ def define_common_targets():
],
)


runtime.cxx_library(
name = "arange_util{}".format(suffix),
srcs = ["arange_util.cpp"],
Expand Down
5 changes: 5 additions & 0 deletions kernels/portable/functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1009,3 +1009,8 @@
kernels:
- arg_meta: null
kernel_name: torch::executor::_to_dim_order_copy_out

- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: torch::executor::_clone_dim_order_out
1 change: 1 addition & 0 deletions kernels/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ add_custom_target(
set(all_test_sources
"BinaryLogicalOpTest.cpp"
"op__to_dim_order_copy_test.cpp"
"op__clone_dim_order_test.cpp"
"op_abs_test.cpp"
"op_acos_test.cpp"
"op_acosh_test.cpp"
Expand Down
Loading
Loading