// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

using Eigen::Tensor;
using Eigen::RowMajor;

24 // Context for evaluation on cpu
25 struct CPUContext {
CPUContextCPUContext26 CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
27 kernel_1d_(0) = 3.14f;
28 kernel_1d_(1) = 2.7f;
29
30 kernel_2d_(0,0) = 3.14f;
31 kernel_2d_(1,0) = 2.7f;
32 kernel_2d_(0,1) = 0.2f;
33 kernel_2d_(1,1) = 7.0f;
34
35 kernel_3d_(0,0,0) = 3.14f;
36 kernel_3d_(0,1,0) = 2.7f;
37 kernel_3d_(0,0,1) = 0.2f;
38 kernel_3d_(0,1,1) = 7.0f;
39 kernel_3d_(1,0,0) = -1.0f;
40 kernel_3d_(1,1,0) = -0.3f;
41 kernel_3d_(1,0,1) = -0.7f;
42 kernel_3d_(1,1,1) = -0.5f;
43 }
44
deviceCPUContext45 const Eigen::DefaultDevice& device() const { return cpu_device_; }
46
in1CPUContext47 const Eigen::Tensor<float, 3>& in1() const { return in1_; }
in2CPUContext48 const Eigen::Tensor<float, 3>& in2() const { return in2_; }
outCPUContext49 Eigen::Tensor<float, 3>& out() { return out_; }
kernel1dCPUContext50 const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
kernel2dCPUContext51 const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
kernel3dCPUContext52 const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
53
54 private:
55 const Eigen::Tensor<float, 3>& in1_;
56 const Eigen::Tensor<float, 3>& in2_;
57 Eigen::Tensor<float, 3>& out_;
58
59 Eigen::Tensor<float, 1> kernel_1d_;
60 Eigen::Tensor<float, 2> kernel_2d_;
61 Eigen::Tensor<float, 3> kernel_3d_;
62
63 Eigen::DefaultDevice cpu_device_;
64 };
65
66
67 // Context for evaluation on GPU
68 struct GPUContext {
GPUContextGPUContext69 GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
70 assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
71 float kernel_1d_val[] = {3.14f, 2.7f};
72 assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
73
74 assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
75 float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
76 assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
77
78 assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
79 float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
80 assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
81 }
~GPUContextGPUContext82 ~GPUContext() {
83 assert(gpuFree(kernel_1d_) == gpuSuccess);
84 assert(gpuFree(kernel_2d_) == gpuSuccess);
85 assert(gpuFree(kernel_3d_) == gpuSuccess);
86 }
87
deviceGPUContext88 const Eigen::GpuDevice& device() const { return gpu_device_; }
89
in1GPUContext90 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
in2GPUContext91 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
outGPUContext92 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
kernel1dGPUContext93 Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
kernel2dGPUContext94 Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
kernel3dGPUContext95 Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
96
97 private:
98 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
99 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
100 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
101
102 float* kernel_1d_;
103 float* kernel_2d_;
104 float* kernel_3d_;
105
106 Eigen::GpuStreamDevice stream_;
107 Eigen::GpuDevice gpu_device_;
108 };
109
110
// The actual expression to evaluate
template <typename Context>
void test_contextual_eval(Context* ctx)
{
  // out = in1 + in2 * 3.14 + 2.718, evaluated in a single pass on the
  // context's device.
  ctx->out().device(ctx->device()) =
      ctx->in1() + ctx->in2() * 3.14f + ctx->in1().constant(2.718f);
}
117
template <typename Context>
void test_forced_contextual_eval(Context* ctx)
{
  // Same expression as test_contextual_eval, but (in1 + in2) is forced
  // into a temporary with eval() before the rest of the expression runs.
  ctx->out().device(ctx->device()) =
      (ctx->in1() + ctx->in2()).eval() * 3.14f + ctx->in1().constant(2.718f);
}
123
template <typename Context>
void test_compound_assignment(Context* ctx)
{
  // Seed the output with the constant term, then accumulate the remaining
  // terms through operator+= to exercise the compound-assignment path.
  ctx->out().device(ctx->device()) = ctx->in1().constant(2.718f);
  ctx->out().device(ctx->device()) += ctx->in1() + ctx->in2() * 3.14f;
}
130
131
132 template <typename Context>
test_contraction(Context * context)133 void test_contraction(Context* context)
134 {
135 Eigen::array<std::pair<int, int>, 2> dims;
136 dims[0] = std::make_pair(1, 1);
137 dims[1] = std::make_pair(2, 2);
138
139 Eigen::array<int, 2> shape(40, 50*70);
140
141 Eigen::DSizes<int, 2> indices(0,0);
142 Eigen::DSizes<int, 2> sizes(40,40);
143
144 context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
145 }
146
147
148 template <typename Context>
test_1d_convolution(Context * context)149 void test_1d_convolution(Context* context)
150 {
151 Eigen::DSizes<int, 3> indices(0,0,0);
152 Eigen::DSizes<int, 3> sizes(40,49,70);
153
154 Eigen::array<int, 1> dims(1);
155 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
156 }
157
158 template <typename Context>
test_2d_convolution(Context * context)159 void test_2d_convolution(Context* context)
160 {
161 Eigen::DSizes<int, 3> indices(0,0,0);
162 Eigen::DSizes<int, 3> sizes(40,49,69);
163
164 Eigen::array<int, 2> dims(1,2);
165 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
166 }
167
168 template <typename Context>
test_3d_convolution(Context * context)169 void test_3d_convolution(Context* context)
170 {
171 Eigen::DSizes<int, 3> indices(0,0,0);
172 Eigen::DSizes<int, 3> sizes(39,49,69);
173
174 Eigen::array<int, 3> dims(0,1,2);
175 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
176 }
177
178
// Runs every expression through a CPUContext and verifies each result
// element-by-element on the host.
void test_cpu() {
  Eigen::Tensor<float, 3> in1(40,50,70);
  Eigen::Tensor<float, 3> in2(40,50,70);
  Eigen::Tensor<float, 3> out(40,50,70);

  // Shift the random inputs by +10 so entries are well away from zero,
  // keeping the approximate comparisons below stable.
  in1 = in1.random() + in1.constant(10.0f);
  in2 = in2.random() + in2.constant(10.0f);

  CPUContext context(in1, in2, out);
  // out = in1 + in2 * 3.14 + 2.718.
  test_contextual_eval(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  // Same expression with (in1 + in2) forced into a temporary via eval().
  test_forced_contextual_eval(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
      }
    }
  }

  // Assignment followed by operator+= must produce the same total.
  test_compound_assignment(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 50; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
      }
    }
  }

  // Contraction over dims 1 and 2: out(i,j) = sum_{k,l} in1(i,k,l)*in2(j,k,l),
  // stored in the first 40x40 entries of the reshaped output.
  test_contraction(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 40; ++j) {
      const float result = out(i,j,0);
      float expected = 0;
      for (int k = 0; k < 50; ++k) {
        for (int l = 0; l < 70; ++l) {
          expected += in1(i, k, l) * in2(j, k, l);
        }
      }
      VERIFY_IS_APPROX(expected, result);
    }
  }

  // 1D convolution along dim 1 with kernel {3.14, 2.7}.
  test_1d_convolution(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
      }
    }
  }

  // 2D convolution along dims 1 and 2 with the 2x2 kernel.
  test_2d_convolution(&context);
  for (int i = 0; i < 40; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
        // VERIFY_IS_APPROX uses a relative error; skip values so close to
        // zero that the relative comparison would be meaningless.
        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
          continue;
        }
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }

  // 3D convolution along all dims with the 2x2x2 kernel.
  test_3d_convolution(&context);
  for (int i = 0; i < 39; ++i) {
    for (int j = 0; j < 49; ++j) {
      for (int k = 0; k < 69; ++k) {
        const float result = out(i,j,k);
        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
        // Same near-zero guard as the 2D case.
        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
          continue;
        }
        VERIFY_IS_APPROX(expected, result);
      }
    }
  }
}
270
test_gpu()271 void test_gpu() {
272 Eigen::Tensor<float, 3> in1(40,50,70);
273 Eigen::Tensor<float, 3> in2(40,50,70);
274 Eigen::Tensor<float, 3> out(40,50,70);
275 in1 = in1.random() + in1.constant(10.0f);
276 in2 = in2.random() + in2.constant(10.0f);
277
278 std::size_t in1_bytes = in1.size() * sizeof(float);
279 std::size_t in2_bytes = in2.size() * sizeof(float);
280 std::size_t out_bytes = out.size() * sizeof(float);
281
282 float* d_in1;
283 float* d_in2;
284 float* d_out;
285 gpuMalloc((void**)(&d_in1), in1_bytes);
286 gpuMalloc((void**)(&d_in2), in2_bytes);
287 gpuMalloc((void**)(&d_out), out_bytes);
288
289 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
290 gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
291
292 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
293 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
294 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
295
296 GPUContext context(gpu_in1, gpu_in2, gpu_out);
297 test_contextual_eval(&context);
298 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
299 for (int i = 0; i < 40; ++i) {
300 for (int j = 0; j < 50; ++j) {
301 for (int k = 0; k < 70; ++k) {
302 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
303 }
304 }
305 }
306
307 test_forced_contextual_eval(&context);
308 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
309 for (int i = 0; i < 40; ++i) {
310 for (int j = 0; j < 50; ++j) {
311 for (int k = 0; k < 70; ++k) {
312 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
313 }
314 }
315 }
316
317 test_compound_assignment(&context);
318 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
319 for (int i = 0; i < 40; ++i) {
320 for (int j = 0; j < 50; ++j) {
321 for (int k = 0; k < 70; ++k) {
322 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
323 }
324 }
325 }
326
327 test_contraction(&context);
328 assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
329 for (int i = 0; i < 40; ++i) {
330 for (int j = 0; j < 40; ++j) {
331 const float result = out(i,j,0);
332 float expected = 0;
333 for (int k = 0; k < 50; ++k) {
334 for (int l = 0; l < 70; ++l) {
335 expected += in1(i, k, l) * in2(j, k, l);
336 }
337 }
338 VERIFY_IS_APPROX(expected, result);
339 }
340 }
341
342 test_1d_convolution(&context);
343 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
344 assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
345 for (int i = 0; i < 40; ++i) {
346 for (int j = 0; j < 49; ++j) {
347 for (int k = 0; k < 70; ++k) {
348 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
349 }
350 }
351 }
352
353 test_2d_convolution(&context);
354 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
355 assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
356 for (int i = 0; i < 40; ++i) {
357 for (int j = 0; j < 49; ++j) {
358 for (int k = 0; k < 69; ++k) {
359 const float result = out(i,j,k);
360 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
361 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
362 VERIFY_IS_APPROX(expected, result);
363 }
364 }
365 }
366
367 #if !defined(EIGEN_USE_HIP)
368 // disable this test on the HIP platform
369 // 3D tensor convolutions seem to hang on the HIP platform
370
371 test_3d_convolution(&context);
372 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
373 assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
374 for (int i = 0; i < 39; ++i) {
375 for (int j = 0; j < 49; ++j) {
376 for (int k = 0; k < 69; ++k) {
377 const float result = out(i,j,k);
378 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
379 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
380 in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
381 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
382 VERIFY_IS_APPROX(expected, result);
383 }
384 }
385 }
386
387 #endif
388
389 }
390
391
// Test entry point: subtest 1 evaluates everything on the default (CPU)
// device, subtest 2 repeats the same expressions on the GPU device.
EIGEN_DECLARE_TEST(cxx11_tensor_device)
{
  CALL_SUBTEST_1(test_cpu());
  CALL_SUBTEST_2(test_gpu());
}
397