/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cinttypes>
#include <cstring>

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/platform/assert.h>

namespace torch {
namespace executor {

using Tensor = exec_aten::Tensor;

namespace {

bool check_repeat_args(
    Tensor self,
    exec_aten::ArrayRef<int64_t> repeats,
    Tensor& out) {
  // The length of repeats must be at least the dimensionality of self.
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      repeats.size() >= self.dim(),
      "The number of repeat dims cannot be smaller than the number of dimensions of the tensor");

  // The repeats array must not contain any negative elements.
  bool all_non_negative = true;
  for (auto repeat : repeats) {
    all_non_negative = all_non_negative && (repeat >= 0);
  }
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      all_non_negative, "Trying to create tensor with negative dimension");

  // Check that out.dim() equals the length of repeats.
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      out.dim() == repeats.size(),
      "The dimension of out shall equal the size of repeats, but now is %zd and %zu",
      out.dim(),
      repeats.size());

  // Right now we only support tensors whose dimension is no greater than
  // kTensorDimensionLimit. It suffices to check the out tensor, since its
  // dimensionality is guaranteed above to be >= that of self.
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      out.dim() <= kTensorDimensionLimit,
      "The dimension of input and output should not be larger than %zu",
      kTensorDimensionLimit);

  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(out, self));

  // We left-pad self.sizes() with ones until its length equals repeats.size(),
  // and call the result reformat_self_size. The element-wise product of
  // reformat_self_size and repeats must then equal out.sizes().
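  // For example (sizes chosen for illustration): self.sizes() = [2, 3] and
  // repeats = [4, 1, 2] give reformat_self_size = [1, 2, 3], so out.sizes()
  // must be [4 * 1, 1 * 2, 2 * 3] = [4, 2, 6].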
  size_t reformat_self_size[kTensorDimensionLimit];
  for (size_t i = 0; i < out.dim() - self.dim(); i++) {
    reformat_self_size[i] = 1;
  }

  for (int64_t i = 0; i < self.dim(); i++) {
    reformat_self_size[out.dim() - 1 - i] = self.size(self.dim() - 1 - i);
  }
  for (size_t i = 0; i < repeats.size(); i++) {
    ET_LOG_MSG_AND_RETURN_IF_FALSE(
        reformat_self_size[i] * repeats[i] == out.size(i),
        "Expected out size at dimension %zu to be %" PRId64 ", but now is %zd",
        i,
        reformat_self_size[i] * repeats[i],
        out.size(i));
  }

  return true;
}

// Given the indices of a point in an n-D tensor, and the stride (in bytes)
// along each dimension, return the offset from the origin to that point.
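// For example (values chosen for illustration): indices = {1, 2} with
// strides = {24, 8} gives offset = 1 * 24 + 2 * 8 = 40 bytes.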
size_t compute_access_offset(
    const size_t* indices,
    const size_t* strides,
    size_t num_entries) {
  size_t offset = 0;
  for (int i = num_entries - 1; i >= 0; --i) {
    // @lint-ignore CLANGTIDY indices and strides share same length.
    offset += indices[i] * strides[i];
  }
  return offset;
}

// Copy one contiguous row of self to multiple coordinates of the out tensor.
// 'in_offset' identifies the byte offset to the source data in the self
// tensor.
// 'out_offset' identifies the byte offset in the out tensor where the source
// data should be copied.
// 'strides' indicates the stride (in bytes) along each dimension of the out
// tensor.
void repeat_internal(
    const Tensor& self,
    Tensor& out,
    size_t in_offset,
    size_t out_offset,
    const size_t* strides) {
  const char* src = self.const_data_ptr<char>() + in_offset;
  char* dest = out.mutable_data_ptr<char>() + out_offset;

  // Treat a zero-dim self as a one-dim tensor with size {1}.
  ssize_t self_dim = self.dim() ? self.dim() : 1;
  int32_t one = 1;
  exec_aten::ArrayRef<int32_t> self_size =
      self.dim() ? self.sizes() : exec_aten::ArrayRef<int32_t>(&one, 1);

  // Get the size of one row of self (in bytes).
  size_t num_bytes = self_size[self_dim - 1] * out.element_size();
  if (num_bytes == 0) {
    return;
  }

  // Visualize the out tensor as a set of 1D arrays. Given an n-dimensional
  // out X[d0, d1, ..., d{N-2}, d{N-1}], we can view it as a tensor
  // X'[d0, d1, ..., d{N-2}, 1], where each point is an array of length
  // size(d{N-1}). Below is the strategy to iterate over the relevant points in
  // X'. We create an n-D slot array, where each index corresponds to a
  // dimension of X'. A valid value of the slot array is one that corresponds
  // to a data point in X'.
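  // For example (sizes chosen for illustration): with out.sizes() = [4, 6]
  // and self.sizes() = [2, 3], the loop below visits slots = {0, 0}, {0, 3},
  // {2, 0}, {2, 3}, copying one 3-element row of self to each offset.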
  size_t slots[kTensorDimensionLimit];
  memset(slots, 0, self_dim * sizeof(slots[0]));

  // The increment along each index of the slot array to reach the next
  // possible valid value.
  int64_t incr[kTensorDimensionLimit];
  for (size_t i = 0; i < self_dim; i++) {
    incr[i] = self_size[i];
  }

  // And now copy the self data to possibly multiple points in the out
  // tensor. Note that if self is an n-dimensional tensor, we limit copying
  // to only n dimensions in the out tensor (out can be higher-dimensional
  // than self).
  size_t index = self_dim - 1;
  size_t start = out.dim() - self_dim;
  while (slots[0] != out.size(start)) {
    // Compute the offset (from origin) in the out tensor where this self
    // data will be copied to.
    size_t offset = compute_access_offset(slots, strides, self_dim);
    memcpy(dest + offset, src, num_bytes);

    // Find the next valid value of the slot array.
    slots[index] += incr[index];
    // If we have reached the limit in the innermost dimension, successively
    // increment the slot index of outer dimensions.
    while (slots[index] == out.size(start + index)) {
      if (index == 0) {
        break;
      }
      slots[index--] = 0;
      slots[index] += incr[index];
    }
    index = self_dim - 1;
  }
}

} // namespace

// TODO(gasoonjia): dynamically allocate the arrays to support tensor
// dimensions larger than kTensorDimensionLimit.
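// Example of the overall semantics (values chosen for illustration):
// repeating self = [a, b] (sizes [2]) with repeats = {2, 3} yields out with
// sizes [2, 6] containing [[a, b, a, b, a, b], [a, b, a, b, a, b]].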
Error repeat_tensor(
    const Tensor& self,
    exec_aten::ArrayRef<int64_t> repeats,
    Tensor& out) {
  // Verify that the args are valid.
  ET_CHECK_OR_RETURN_ERROR(
      check_repeat_args(self, repeats, out),
      InvalidArgument,
      "Repeat arguments are invalid.");

  // Return early if out.numel() == 0; nothing needs to be repeated.
  if (out.numel() == 0) {
    return Error::Ok;
  }

  ssize_t element_size = out.element_size();

  // If out holds a single element, its underlying data must equal that of
  // self. Treat this specially to circumvent the zero-dim tensor issue.
  if (out.numel() == 1) {
    const char* src = self.const_data_ptr<char>();
    char* dest = out.mutable_data_ptr<char>();
    memcpy(dest, src, element_size);
    return Error::Ok;
  }

  // Treat a zero-dim self as a one-dim tensor with size {1}.
  ssize_t self_dim = self.dim() ? self.dim() : 1;
  int32_t one = 1;
  exec_aten::ArrayRef<int32_t> self_size = self.sizes().empty()
      ? exec_aten::ArrayRef<int32_t>(&one, 1)
      : self.sizes();

  // Compute the stride (in bytes) along each of the innermost self_dim
  // dimensions of the out tensor.
  size_t strides[kTensorDimensionLimit];
  memset(strides, 0, sizeof(strides[0]) * self_dim);
  size_t start = out.dim() - self_dim;
  size_t accum_offset = element_size;

  for (ssize_t i = self_dim - 1; i >= 0; i--) {
    strides[i] = accum_offset;
    accum_offset *= out.size(start + i);
  }
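
  // For example (sizes chosen for illustration): out.sizes() = [2, 4, 6] with
  // self_dim = 2 and 4-byte elements gives start = 1, strides = {24, 4}, and
  // accum_offset = 4 * 6 * 4 = 96 bytes, the size of one [4, 6] block of out.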

  // Given an n-dimensional self X[d0, d1, ..., d{N-2}, d{N-1}], we can view
  // self as a tensor X'[d0, d1, ..., d{N-2}, 1], where each point is a
  // 1D array of length size(d{N-1}). Now we need a strategy to iterate over
  // all the points in X'. We do not want to use getLeadingDims(), as we want
  // to know the indices explicitly so that we can compute the appropriate
  // offset for both the self and out tensor at that index. To achieve this,
  // we create an n-D slot array, where each index corresponds to a dimension
  // of X'. A valid value of the slot array is one that corresponds to a
  // valid index in X'.
  size_t slots[kTensorDimensionLimit];
  memset(slots, 0, self_dim * sizeof(slots[0]));

  // 'limits' indicates the upper bound on each index in slots. Note that we
  // copy the entire array along the innermost dimension as a direct memcpy,
  // so we reset the upper bound of the innermost dim to 1. 'in_incr' indicates
  // the size (in bytes) of one row of the self data.
  int64_t limits[kTensorDimensionLimit];
  for (size_t i = 0; i < self_dim; i++) {
    limits[i] = self_size[i];
  }

  // @lint-ignore CLANGTIDY Here limits is guaranteed a non-empty array.
  size_t in_incr = limits[self_dim - 1] * element_size;
  limits[self_dim - 1] = 1;
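  // For example (sizes chosen for illustration): self.sizes() = [2, 3] with
  // 4-byte elements gives in_incr = 3 * 4 = 12 bytes per row and
  // limits = {2, 1}, so the loop below iterates once per row of self.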
  // 'in_offset' indicates the offset (in bytes) from the origin to a row of
  // the self data.
  size_t in_offset = 0;
  size_t index = self_dim - 1;

  // Below, we copy the entire self tensor into the out tensor (at the
  // origin), one array at a time. To do so, we iterate over all the valid
  // values of the slots array. repeat_internal() takes care of replicating
  // each array along the coordinates specified by the repeats array.
  while (slots[0] != limits[0]) {
    // Compute the offset (from origin) in the out tensor where the self
    // array (with indices in self tensor indicated by slots) will be copied.
    size_t out_offset = compute_access_offset(slots, strides, self_dim);
    // Now repeatedly copy the array to multiple coordinates in the out
    // tensor. Curtail the copy along as many dimensions as the self tensor.
    // The copy along remaining higher dimensions can be done via trivial
    // memcpy.
    repeat_internal(self, out, in_offset, out_offset, strides);

    // Find the next valid value of the slot array.
    slots[index]++;
    // If we have reached the limit in the innermost dimension, successively
    // increment the slot index of outer dimensions.
    while (slots[index] == limits[index]) {
      if (index == 0) {
        break;
      }
      slots[index--] = 0;
      slots[index]++;
    }
    index = self_dim - 1;
    in_offset += in_incr;
  }

  // And now, if an n-D self is meant to be replicated across m dimensions
  // where m > n, we can replicate the (m - n) leading dimensions with simple
  // memcpys of ever-larger blocks.
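  // For example (sizes chosen for illustration): self.sizes() = [3] with
  // repeats = {2, 2} gives out.sizes() = [2, 6]. After the loop above, row 0
  // of out holds self repeated twice; the loop below copies that 6-element
  // block once more to fill row 1.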
  const char* src = out.const_data_ptr<char>();
  char* dest = out.mutable_data_ptr<char>() + accum_offset;
  for (int i = start - 1; i >= 0; --i) {
    for (int j = 0; j < repeats[i] - 1; ++j) {
      memcpy(dest, src, accum_offset);
      dest += accum_offset;
    }
    accum_offset *= out.size(i);
  }

  return Error::Ok;
}

} // namespace executor
} // namespace torch