/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cinttypes>
#include <cstring>

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/platform/assert.h>

namespace torch {
namespace executor {

using Tensor = exec_aten::Tensor;

namespace {

bool check_repeat_args(
    Tensor self,
    exec_aten::ArrayRef<int64_t> repeats,
    Tensor& out) {
  // The repeats list must have at least as many entries as self has
  // dimensions.
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      repeats.size() >= self.dim(),
      "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor");

  // The repeats ArrayRef shall not contain any negative element.
  bool all_non_negative = true;
  for (auto repeat : repeats) {
    all_non_negative = all_non_negative && (repeat >= 0);
  }
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      all_non_negative, "Trying to create tensor with negative dimension");

  // Check that out.size() is legal.
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      out.dim() == repeats.size(),
      "The dimension of out shall equal size of repeats, but now is %zd and %zd",
      out.dim(),
      repeats.size());
  // Right now we only support tensors whose dimension is no greater than
  // kTensorDimensionLimit. Only check the out tensor, because the number of
  // dimensions of the out tensor is greater than or equal to that of self.
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      out.dim() <= kTensorDimensionLimit,
      "The dimension of input and output should not be larger than %zd",
      kTensorDimensionLimit);

  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(out, self));

  // We pad ones to the beginning of self.size() to make its length equal to
  // repeats.size(), and call the result reformat_self_size. We then take the
  // pointwise product of reformat_self_size and repeats. The result should
  // equal out.size().
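  // Illustrative example (not part of the check itself): for self.sizes() ==
  // {2, 3} and repeats == {4, 2, 3}, reformat_self_size becomes {1, 2, 3}, so
  // the only valid out.sizes() is {1 * 4, 2 * 2, 3 * 3} == {4, 4, 9}.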
  size_t reformat_self_size[kTensorDimensionLimit];
  for (size_t i = 0; i < out.dim() - self.dim(); i++) {
    reformat_self_size[i] = 1;
  }

  for (int64_t i = 0; i < self.dim(); i++) {
    reformat_self_size[out.dim() - 1 - i] = self.size(self.dim() - 1 - i);
  }
  for (size_t i = 0; i < repeats.size(); i++) {
    ET_LOG_MSG_AND_RETURN_IF_FALSE(
        reformat_self_size[i] * repeats[i] == out.size(i),
        "Expect out size at dimension %zu is %" PRId64 ", but now is %zd",
        i,
        reformat_self_size[i] * repeats[i],
        out.size(i));
  }

  return true;
}

// Given the indices to a point in an n-D tensor, and the stride (in bytes)
// along each dimension, return the offset from origin to that point.
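// For example (illustrative), indices == {1, 2} with byte strides == {12, 4}
// give an offset of 1 * 12 + 2 * 4 == 20 bytes.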
size_t compute_access_offset(
    const size_t* indices,
    const size_t* strides,
    size_t num_entries) {
  size_t offset = 0;
  for (int i = num_entries - 1; i >= 0; --i) {
    // @lint-ignore CLANGTIDY indices and strides share same length.
    offset += indices[i] * strides[i];
  }
  return offset;
}

// Copy a self array to multiple coordinates of the out tensor.
// 'in_offset' identifies the offset (in bytes) to the source data in the self
// tensor.
// 'out_offset' identifies the offset (in bytes) in the out tensor where the
// source data should be copied.
// 'strides' indicates the stride (in bytes) along each dimension of the out
// tensor.
void repeat_internal(
    const Tensor& self,
    Tensor& out,
    size_t in_offset,
    size_t out_offset,
    const size_t* strides) {
  const char* src = self.const_data_ptr<char>() + in_offset;
  char* dest = out.mutable_data_ptr<char>() + out_offset;

  // Treat a zero-dim self as a one-dim tensor with size {1}.
  ssize_t self_dim = self.dim() ? self.dim() : 1;
  int32_t one = 1;
  exec_aten::ArrayRef<int32_t> self_size =
      self.dim() ? self.sizes() : exec_aten::ArrayRef<int32_t>(&one, 1);

  // Size of one innermost-dimension array of self, in bytes.
  size_t num_bytes = self_size[self_dim - 1] * out.element_size();
  if (num_bytes == 0) {
    return;
  }

  // Visualize the out tensor as a set of 1D arrays. Given an n-dimensional
  // out X[d0, d1, ..., d{N-2}, d{N-1}], we can view it as a tensor
  // X'[d0, d1, ..., d{N-2}, 1], where each point is an array of length
  // size(d{N-1}). Below is the strategy to iterate over the relevant points
  // in X'. We create an n-D slot array, where each index corresponds to a
  // dimension of X'. A valid value of the slot array is one which corresponds
  // to a data point in X'.
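  // Illustrative example: for self.sizes() == {2} and out.sizes() == {3, 4}
  // (i.e. repeats == {3, 2}), self_dim == 1 and incr[0] == 2, so the loop
  // below visits slots[0] == 0 and 2, i.e. the two repeated column blocks
  // within one row of out.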
  size_t slots[kTensorDimensionLimit];
  memset(slots, 0, self_dim * sizeof(slots[0]));

  // The increment along each index of the slot array to reach the next
  // possible valid value.
  int64_t incr[kTensorDimensionLimit];
  for (size_t i = 0; i < self_dim; i++) {
    incr[i] = self_size[i];
  }

  // And now copy the self data to possibly multiple points in the out
  // tensor. Note that if self is an n-dimensional tensor, we limit copying
  // to only n dimensions in the out tensor (out can be higher-dimensional
  // than self).
  size_t index = self_dim - 1;
  size_t start = out.dim() - self_dim;
  while (slots[0] != out.size(start)) {
    // Compute the offset (from origin) in the out tensor where this self
    // data will be copied to.
    size_t offset = compute_access_offset(slots, strides, self_dim);
    memcpy(dest + offset, src, num_bytes);

    // Find the next valid value of the slot array.
    slots[index] += incr[index];
    // If we have reached the limit in the innermost dimension, successively
    // increment the slot indices of the outer dimensions.
    while (slots[index] == out.size(start + index)) {
      if (index == 0) {
        break;
      }
      slots[index--] = 0;
      slots[index] += incr[index];
    }
    index = self_dim - 1;
  }
}

} // namespace

// TODO(gasoonjia): dynamically allocate the arrays to support tensor
// dimensions larger than kTensorDimensionLimit.
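// Usage sketch (illustrative only; assumes the caller provides the tensors
// and has already resized out to the repeated shape):
//
//   // self: sizes {2}, repeats: {3, 2}  ->  out: sizes {3, 4}
//   int64_t repeats_data[2] = {3, 2};
//   Error err = repeat_tensor(
//       self, exec_aten::ArrayRef<int64_t>(repeats_data, 2), out);
//
// This mirrors aten::repeat semantics: self is tiled twice along its last
// dimension and three times along a new leading dimension.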
Error repeat_tensor(
    const Tensor& self,
    exec_aten::ArrayRef<int64_t> repeats,
    Tensor& out) {
  // Verify that the args are valid.
  ET_CHECK_OR_RETURN_ERROR(
      check_repeat_args(self, repeats, out),
      InvalidArgument,
      "Repeat arguments are invalid.");

  // Return early if out.numel() == 0; nothing needs to be repeated.
  if (out.numel() == 0) {
    return Error::Ok;
  }

  ssize_t element_size = out.element_size();

  // If out has a single element, its underlying data shall equal that of
  // self. Treat this case specially to circumvent the zero-dim tensor issue.
  if (out.numel() == 1) {
    const char* src = self.const_data_ptr<char>();
    char* dest = out.mutable_data_ptr<char>();
    memcpy(dest, src, element_size);
    return Error::Ok;
  }

  // Treat a zero-dim self as a one-dim tensor with size {1}.
  ssize_t self_dim = self.dim() ? self.dim() : 1;
  int32_t one = 1;
  exec_aten::ArrayRef<int32_t> self_size = self.sizes().empty()
      ? exec_aten::ArrayRef<int32_t>(&one, 1)
      : self.sizes();

  // Compute the stride (in bytes) along each out tensor dimension.
  size_t strides[kTensorDimensionLimit];
  memset(strides, 0, sizeof(strides[0]) * self_dim);
  size_t start = out.dim() - self_dim;
  size_t accum_offset = element_size;

  for (ssize_t i = self_dim - 1; i >= 0; i--) {
    strides[i] = accum_offset;
    accum_offset *= out.size(start + i);
  }
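  // Illustrative example: for out.sizes() == {3, 4}, element_size == 4 and
  // self_dim == 1, this yields strides == {4} and accum_offset == 16, the
  // byte size of the out block spanned by self's (trailing) dimensions.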

  // Given an n-dimensional self X[d0, d1, ..., d{N-2}, d{N-1}], we can view
  // self as a tensor X'[d0, d1, ..., d{N-2}, 1], where each point is a
  // 1D array of length size(d{N-1}). Now we need a strategy to iterate over
  // all the points in X'. We do not want to use getLeadingDims(), as we want
  // to know the indices explicitly so that we can compute the appropriate
  // offsets for both the self and out tensors at that index. To achieve this,
  // we create an n-D slot array, where each index corresponds to a dimension
  // of X'. A valid value of the slot array is one that corresponds to a
  // valid index in X'.
  size_t slots[kTensorDimensionLimit];
  memset(slots, 0, self_dim * sizeof(slots[0]));

  // 'limits' indicates the upper bound on each index in slots. Note that we
  // copy the entire array along the innermost dimension as a direct memcpy,
  // so we reset the upper bound of the innermost dim to 1. 'in_incr' indicates
  // the size (in bytes) of one innermost-dimension row of self.
  int64_t limits[kTensorDimensionLimit];
  for (size_t i = 0; i < self_dim; i++) {
    limits[i] = self_size[i];
  }

  // @lint-ignore CLANGTIDY Here limits is guaranteed to be a non-empty array.
  size_t in_incr = limits[self_dim - 1] * element_size;
  limits[self_dim - 1] = 1;
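  // Illustrative example: for self.sizes() == {2, 3}, limits starts as {2, 3},
  // in_incr == 3 * element_size, and limits becomes {2, 1}, so the loop below
  // visits one slot per row of self: (0, 0) and (1, 0).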
  // 'in_offset' indicates the offset (in bytes) from the origin to a row of
  // self data.
  size_t in_offset = 0;
  size_t index = self_dim - 1;

  // Below, we copy the entire self tensor into the out tensor (at the origin),
  // one array at a time. To do so, we iterate over all the valid values of the
  // slots array. repeat_internal() takes care of replicating each array along
  // the coordinates specified by the repeats array.
  while (slots[0] != limits[0]) {
    // Compute the offset (from origin) in the out tensor where the self
    // array (with indices in the self tensor indicated by slots) will be
    // copied.
    size_t out_offset = compute_access_offset(slots, strides, self_dim);
    // Now repeatedly copy the array to multiple coordinates in the out
    // tensor. Curtail the copy to as many dimensions as the self tensor has.
    // The copy along the remaining higher dimensions can be done via a
    // trivial memcpy.
    repeat_internal(self, out, in_offset, out_offset, strides);

    // Find the next valid value of the slot array.
    slots[index]++;
    // If we have reached the limit in the innermost dimension, successively
    // increment the slot indices of the outer dimensions.
    while (slots[index] == limits[index]) {
      if (index == 0) {
        break;
      }
      slots[index--] = 0;
      slots[index]++;
    }
    index = self_dim - 1;
    in_offset += in_incr;
  }

  // And now, if an n-D self was meant to be replicated to m dimensions where
  // m > n, we can finish with simple memcpys over the remaining (m - n)
  // dimensions.
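  // Illustrative example: with repeats == {3, 2} and self_dim == 1, start == 1
  // and one block of accum_offset bytes has been filled above; the loop below
  // appends repeats[0] - 1 == 2 more copies of that block along dimension 0.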
  const char* src = out.const_data_ptr<char>();
  char* dest = out.mutable_data_ptr<char>() + accum_offset;
  for (int i = start - 1; i >= 0; --i) {
    for (int j = 0; j < repeats[i] - 1; ++j) {
      memcpy(dest, src, accum_offset);
      dest += accum_offset;
    }
    accum_offset *= out.size(i);
  }

  return Error::Ok;
}

} // namespace executor
} // namespace torch