1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <[email protected]>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10 #define EIGEN_TEST_NO_LONGDOUBLE
11 #define EIGEN_TEST_NO_COMPLEX
12
13 #define EIGEN_USE_GPU
14
15 #include "main.h"
16 #include <unsupported/Eigen/CXX11/Tensor>
17
18 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
19
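// The special-function tests below compare against std::lgamma, std::erf,
// std::erfc and friends, which are only guaranteed to be available from C++11
// on, so they are guarded by this macro.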
20 #define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11
21
22 using Eigen::Tensor;
23
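// Nullary ops: fill one device tensor with a constant and another with random
// values, then copy both back over the device stream and check them on the host.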
void test_gpu_nullary() {
25 Tensor<float, 1, 0, int> in1(2);
26 Tensor<float, 1, 0, int> in2(2);
27 in1.setRandom();
28 in2.setRandom();
29
30 std::size_t tensor_bytes = in1.size() * sizeof(float);
31
32 float* d_in1;
33 float* d_in2;
34 gpuMalloc((void**)(&d_in1), tensor_bytes);
35 gpuMalloc((void**)(&d_in2), tensor_bytes);
36 gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
37 gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);
38
39 Eigen::GpuStreamDevice stream;
40 Eigen::GpuDevice gpu_device(&stream);
41
42 Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
43 d_in1, 2);
44 Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
45 d_in2, 2);
46
47 gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
48 gpu_in2.device(gpu_device) = gpu_in2.random();
49
50 Tensor<float, 1, 0, int> new1(2);
51 Tensor<float, 1, 0, int> new2(2);
52
53 assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
54 gpu_device.stream()) == gpuSuccess);
55 assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
56 gpu_device.stream()) == gpuSuccess);
57
58 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
59
60 for (int i = 0; i < 2; ++i) {
61 VERIFY_IS_APPROX(new1(i), 3.14f);
62 VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
63 }
64
65 gpuFree(d_in1);
66 gpuFree(d_in2);
67 }
68
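// Element-wise sum of two 2-element tensors evaluated on the device.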
void test_gpu_elementwise_small() {
70 Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
71 Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
72 Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
73 in1.setRandom();
74 in2.setRandom();
75
76 std::size_t in1_bytes = in1.size() * sizeof(float);
77 std::size_t in2_bytes = in2.size() * sizeof(float);
78 std::size_t out_bytes = out.size() * sizeof(float);
79
80 float* d_in1;
81 float* d_in2;
82 float* d_out;
83 gpuMalloc((void**)(&d_in1), in1_bytes);
84 gpuMalloc((void**)(&d_in2), in2_bytes);
85 gpuMalloc((void**)(&d_out), out_bytes);
86
87 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
88 gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
89
90 Eigen::GpuStreamDevice stream;
91 Eigen::GpuDevice gpu_device(&stream);
92
93 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
94 d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
95 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
96 d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
97 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
98 d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
99
100 gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
101
102 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
103 gpu_device.stream()) == gpuSuccess);
104 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
105
106 for (int i = 0; i < 2; ++i) {
107 VERIFY_IS_APPROX(
108 out(Eigen::array<Eigen::DenseIndex, 1>(i)),
109 in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
110 }
111
112 gpuFree(d_in1);
113 gpuFree(d_in2);
114 gpuFree(d_out);
115 }
116
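// Element-wise expression in1 + in2 * in3 on 72x53x97 tensors, verified
// coefficient by coefficient on the host.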
void test_gpu_elementwise()
118 {
119 Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
120 Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
121 Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
122 Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
123 in1.setRandom();
124 in2.setRandom();
125 in3.setRandom();
126
127 std::size_t in1_bytes = in1.size() * sizeof(float);
128 std::size_t in2_bytes = in2.size() * sizeof(float);
129 std::size_t in3_bytes = in3.size() * sizeof(float);
130 std::size_t out_bytes = out.size() * sizeof(float);
131
132 float* d_in1;
133 float* d_in2;
134 float* d_in3;
135 float* d_out;
136 gpuMalloc((void**)(&d_in1), in1_bytes);
137 gpuMalloc((void**)(&d_in2), in2_bytes);
138 gpuMalloc((void**)(&d_in3), in3_bytes);
139 gpuMalloc((void**)(&d_out), out_bytes);
140
141 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
142 gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
143 gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);
144
145 Eigen::GpuStreamDevice stream;
146 Eigen::GpuDevice gpu_device(&stream);
147
148 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
149 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
150 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
151 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
152
153 gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
154
155 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
156 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
157
158 for (int i = 0; i < 72; ++i) {
159 for (int j = 0; j < 53; ++j) {
160 for (int k = 0; k < 97; ++k) {
161 VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
162 }
163 }
164 }
165
166 gpuFree(d_in1);
167 gpuFree(d_in2);
168 gpuFree(d_in3);
169 gpuFree(d_out);
170 }
171
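// Evaluate the isnan predicate on the device and compare against std::isnan
// on the host.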
void test_gpu_props() {
173 Tensor<float, 1> in1(200);
174 Tensor<bool, 1> out(200);
175 in1.setRandom();
176
177 std::size_t in1_bytes = in1.size() * sizeof(float);
178 std::size_t out_bytes = out.size() * sizeof(bool);
179
180 float* d_in1;
181 bool* d_out;
182 gpuMalloc((void**)(&d_in1), in1_bytes);
183 gpuMalloc((void**)(&d_out), out_bytes);
184
185 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
186
187 Eigen::GpuStreamDevice stream;
188 Eigen::GpuDevice gpu_device(&stream);
189
190 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
191 d_in1, 200);
192 Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
193 d_out, 200);
194
195 gpu_out.device(gpu_device) = (gpu_in1.isnan)();
196
197 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
198 gpu_device.stream()) == gpuSuccess);
199 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
200
201 for (int i = 0; i < 200; ++i) {
202 VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
203 }
204
205 gpuFree(d_in1);
206 gpuFree(d_out);
207 }
208
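// Maximum reduction over dimensions 1 and 3 of a rank-4 tensor, checked
// against a nested-loop reduction on the host.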
void test_gpu_reduction()
210 {
211 Tensor<float, 4> in1(72,53,97,113);
212 Tensor<float, 2> out(72,97);
213 in1.setRandom();
214
215 std::size_t in1_bytes = in1.size() * sizeof(float);
216 std::size_t out_bytes = out.size() * sizeof(float);
217
218 float* d_in1;
219 float* d_out;
220 gpuMalloc((void**)(&d_in1), in1_bytes);
221 gpuMalloc((void**)(&d_out), out_bytes);
222
223 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
224
225 Eigen::GpuStreamDevice stream;
226 Eigen::GpuDevice gpu_device(&stream);
227
228 Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
229 Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
230
231 array<Eigen::DenseIndex, 2> reduction_axis;
232 reduction_axis[0] = 1;
233 reduction_axis[1] = 3;
234
235 gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
236
237 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
238 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
239
240 for (int i = 0; i < 72; ++i) {
241 for (int j = 0; j < 97; ++j) {
242 float expected = 0;
243 for (int k = 0; k < 53; ++k) {
244 for (int l = 0; l < 113; ++l) {
245 expected =
246 std::max<float>(expected, in1(i, k, j, l));
247 }
248 }
249 VERIFY_IS_APPROX(out(i,j), expected);
250 }
251 }
252
253 gpuFree(d_in1);
254 gpuFree(d_out);
255 }
256
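// Tensor contraction over two index pairs, checked against the equivalent
// 300x93 * 93x140 matrix product computed on the host.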
257 template<int DataLayout>
void test_gpu_contraction()
259 {
  // With these dimensions the output has 300 * 140 elements, which is more
  // than 30 * 1024, the total number of threads in the blocks launched on a
  // 15-SM GK110 GPU.
263 Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
264 Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
265 Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
266
267 t_left.setRandom();
268 t_right.setRandom();
269
270 std::size_t t_left_bytes = t_left.size() * sizeof(float);
271 std::size_t t_right_bytes = t_right.size() * sizeof(float);
272 std::size_t t_result_bytes = t_result.size() * sizeof(float);
273
274 float* d_t_left;
275 float* d_t_right;
276 float* d_t_result;
277
278 gpuMalloc((void**)(&d_t_left), t_left_bytes);
279 gpuMalloc((void**)(&d_t_right), t_right_bytes);
280 gpuMalloc((void**)(&d_t_result), t_result_bytes);
281
282 gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
283 gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
284
285 Eigen::GpuStreamDevice stream;
286 Eigen::GpuDevice gpu_device(&stream);
287
288 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
289 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
290 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
291
292 typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
293 MapXf m_left(t_left.data(), 300, 93);
294 MapXf m_right(t_right.data(), 93, 140);
295 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
296
297 typedef Tensor<float, 1>::DimensionPair DimPair;
298 Eigen::array<DimPair, 2> dims;
299 dims[0] = DimPair(2, 0);
300 dims[1] = DimPair(3, 1);
301
302 m_result = m_left * m_right;
303 gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
304
305 gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
306
307 for (DenseIndex i = 0; i < t_result.size(); i++) {
308 if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
309 std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
310 assert(false);
311 }
312 }
313
314 gpuFree(d_t_left);
315 gpuFree(d_t_right);
316 gpuFree(d_t_result);
317 }
318
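// 1D valid convolution along dimension 1 with a kernel of size 4; the output
// is therefore 3 elements shorter along that dimension (37 -> 34).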
319 template<int DataLayout>
void test_gpu_convolution_1d()
321 {
322 Tensor<float, 4, DataLayout> input(74,37,11,137);
323 Tensor<float, 1, DataLayout> kernel(4);
324 Tensor<float, 4, DataLayout> out(74,34,11,137);
325 input = input.constant(10.0f) + input.random();
326 kernel = kernel.constant(7.0f) + kernel.random();
327
328 std::size_t input_bytes = input.size() * sizeof(float);
329 std::size_t kernel_bytes = kernel.size() * sizeof(float);
330 std::size_t out_bytes = out.size() * sizeof(float);
331
332 float* d_input;
333 float* d_kernel;
334 float* d_out;
335 gpuMalloc((void**)(&d_input), input_bytes);
336 gpuMalloc((void**)(&d_kernel), kernel_bytes);
337 gpuMalloc((void**)(&d_out), out_bytes);
338
339 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
340 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
341
342 Eigen::GpuStreamDevice stream;
343 Eigen::GpuDevice gpu_device(&stream);
344
345 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
346 Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
347 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
348
349 Eigen::array<Eigen::DenseIndex, 1> dims(1);
350 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
351
352 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
353 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
354
355 for (int i = 0; i < 74; ++i) {
356 for (int j = 0; j < 34; ++j) {
357 for (int k = 0; k < 11; ++k) {
358 for (int l = 0; l < 137; ++l) {
359 const float result = out(i,j,k,l);
360 const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
361 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
362 VERIFY_IS_APPROX(result, expected);
363 }
364 }
365 }
366 }
367
368 gpuFree(d_input);
369 gpuFree(d_kernel);
370 gpuFree(d_out);
371 }
372
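// Same 1D convolution, but along the innermost (contiguous) dimension of a
// column-major tensor.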
void test_gpu_convolution_inner_dim_col_major_1d()
374 {
375 Tensor<float, 4, ColMajor> input(74,9,11,7);
376 Tensor<float, 1, ColMajor> kernel(4);
377 Tensor<float, 4, ColMajor> out(71,9,11,7);
378 input = input.constant(10.0f) + input.random();
379 kernel = kernel.constant(7.0f) + kernel.random();
380
381 std::size_t input_bytes = input.size() * sizeof(float);
382 std::size_t kernel_bytes = kernel.size() * sizeof(float);
383 std::size_t out_bytes = out.size() * sizeof(float);
384
385 float* d_input;
386 float* d_kernel;
387 float* d_out;
388 gpuMalloc((void**)(&d_input), input_bytes);
389 gpuMalloc((void**)(&d_kernel), kernel_bytes);
390 gpuMalloc((void**)(&d_out), out_bytes);
391
392 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
393 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
394
395 Eigen::GpuStreamDevice stream;
396 Eigen::GpuDevice gpu_device(&stream);
397
398 Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
399 Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
400 Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
401
402 Eigen::array<Eigen::DenseIndex, 1> dims(0);
403 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
404
405 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
406 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
407
408 for (int i = 0; i < 71; ++i) {
409 for (int j = 0; j < 9; ++j) {
410 for (int k = 0; k < 11; ++k) {
411 for (int l = 0; l < 7; ++l) {
412 const float result = out(i,j,k,l);
413 const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
414 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
415 VERIFY_IS_APPROX(result, expected);
416 }
417 }
418 }
419 }
420
421 gpuFree(d_input);
422 gpuFree(d_kernel);
423 gpuFree(d_out);
424 }
425
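// 1D convolution along the last dimension, which is the contiguous one for a
// row-major tensor.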
void test_gpu_convolution_inner_dim_row_major_1d()
427 {
428 Tensor<float, 4, RowMajor> input(7,9,11,74);
429 Tensor<float, 1, RowMajor> kernel(4);
430 Tensor<float, 4, RowMajor> out(7,9,11,71);
431 input = input.constant(10.0f) + input.random();
432 kernel = kernel.constant(7.0f) + kernel.random();
433
434 std::size_t input_bytes = input.size() * sizeof(float);
435 std::size_t kernel_bytes = kernel.size() * sizeof(float);
436 std::size_t out_bytes = out.size() * sizeof(float);
437
438 float* d_input;
439 float* d_kernel;
440 float* d_out;
441 gpuMalloc((void**)(&d_input), input_bytes);
442 gpuMalloc((void**)(&d_kernel), kernel_bytes);
443 gpuMalloc((void**)(&d_out), out_bytes);
444
445 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
446 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
447
448 Eigen::GpuStreamDevice stream;
449 Eigen::GpuDevice gpu_device(&stream);
450
451 Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
452 Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
453 Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
454
455 Eigen::array<Eigen::DenseIndex, 1> dims(3);
456 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
457
458 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
459 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
460
461 for (int i = 0; i < 7; ++i) {
462 for (int j = 0; j < 9; ++j) {
463 for (int k = 0; k < 11; ++k) {
464 for (int l = 0; l < 71; ++l) {
465 const float result = out(i,j,k,l);
466 const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
467 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
468 VERIFY_IS_APPROX(result, expected);
469 }
470 }
471 }
472 }
473
474 gpuFree(d_input);
475 gpuFree(d_kernel);
476 gpuFree(d_out);
477 }
478
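// 2D valid convolution over dimensions 1 and 2 with a 3x4 kernel.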
479 template<int DataLayout>
void test_gpu_convolution_2d()
481 {
482 Tensor<float, 4, DataLayout> input(74,37,11,137);
483 Tensor<float, 2, DataLayout> kernel(3,4);
484 Tensor<float, 4, DataLayout> out(74,35,8,137);
485 input = input.constant(10.0f) + input.random();
486 kernel = kernel.constant(7.0f) + kernel.random();
487
488 std::size_t input_bytes = input.size() * sizeof(float);
489 std::size_t kernel_bytes = kernel.size() * sizeof(float);
490 std::size_t out_bytes = out.size() * sizeof(float);
491
492 float* d_input;
493 float* d_kernel;
494 float* d_out;
495 gpuMalloc((void**)(&d_input), input_bytes);
496 gpuMalloc((void**)(&d_kernel), kernel_bytes);
497 gpuMalloc((void**)(&d_out), out_bytes);
498
499 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
500 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
501
502 Eigen::GpuStreamDevice stream;
503 Eigen::GpuDevice gpu_device(&stream);
504
505 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
506 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
507 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
508
509 Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
510 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
511
512 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
513 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
514
515 for (int i = 0; i < 74; ++i) {
516 for (int j = 0; j < 35; ++j) {
517 for (int k = 0; k < 8; ++k) {
518 for (int l = 0; l < 137; ++l) {
519 const float result = out(i,j,k,l);
520 const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
521 input(i,j+1,k+0,l) * kernel(1,0) +
522 input(i,j+2,k+0,l) * kernel(2,0) +
523 input(i,j+0,k+1,l) * kernel(0,1) +
524 input(i,j+1,k+1,l) * kernel(1,1) +
525 input(i,j+2,k+1,l) * kernel(2,1) +
526 input(i,j+0,k+2,l) * kernel(0,2) +
527 input(i,j+1,k+2,l) * kernel(1,2) +
528 input(i,j+2,k+2,l) * kernel(2,2) +
529 input(i,j+0,k+3,l) * kernel(0,3) +
530 input(i,j+1,k+3,l) * kernel(1,3) +
531 input(i,j+2,k+3,l) * kernel(2,3);
532 VERIFY_IS_APPROX(result, expected);
533 }
534 }
535 }
536 }
537
538 gpuFree(d_input);
539 gpuFree(d_kernel);
540 gpuFree(d_out);
541 }
542
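// 3D valid convolution over dimensions 1, 2 and 3 with a 3x4x2 kernel.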
543 template<int DataLayout>
void test_gpu_convolution_3d()
545 {
546 Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
547 Tensor<float, 3, DataLayout> kernel(3,4,2);
548 Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
549 input = input.constant(10.0f) + input.random();
550 kernel = kernel.constant(7.0f) + kernel.random();
551
552 std::size_t input_bytes = input.size() * sizeof(float);
553 std::size_t kernel_bytes = kernel.size() * sizeof(float);
554 std::size_t out_bytes = out.size() * sizeof(float);
555
556 float* d_input;
557 float* d_kernel;
558 float* d_out;
559 gpuMalloc((void**)(&d_input), input_bytes);
560 gpuMalloc((void**)(&d_kernel), kernel_bytes);
561 gpuMalloc((void**)(&d_out), out_bytes);
562
563 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
564 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
565
566 Eigen::GpuStreamDevice stream;
567 Eigen::GpuDevice gpu_device(&stream);
568
569 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
570 Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
571 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
572
573 Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
574 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
575
576 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
577 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
578
579 for (int i = 0; i < 74; ++i) {
580 for (int j = 0; j < 35; ++j) {
581 for (int k = 0; k < 8; ++k) {
582 for (int l = 0; l < 136; ++l) {
583 for (int m = 0; m < 17; ++m) {
584 const float result = out(i,j,k,l,m);
585 const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
586 input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
587 input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
588 input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
589 input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
590 input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
591 input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
592 input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
593 input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
594 input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
595 input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
596 input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
597 input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
598 input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
599 input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
600 input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
601 input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
602 input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
603 input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
604 input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
605 input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
606 input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
607 input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
608 input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
609 VERIFY_IS_APPROX(result, expected);
610 }
611 }
612 }
613 }
614 }
615
616 gpuFree(d_input);
617 gpuFree(d_kernel);
618 gpuFree(d_out);
619 }
620
621
622 #if EIGEN_GPU_TEST_C99_MATH
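// lgamma evaluated on the device and compared against std::lgamma on the
// host; stddev scales the random inputs.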
623 template <typename Scalar>
void test_gpu_lgamma(const Scalar stddev)
625 {
626 Tensor<Scalar, 2> in(72,97);
627 in.setRandom();
628 in *= in.constant(stddev);
629 Tensor<Scalar, 2> out(72,97);
630 out.setZero();
631
632 std::size_t bytes = in.size() * sizeof(Scalar);
633
634 Scalar* d_in;
635 Scalar* d_out;
636 gpuMalloc((void**)(&d_in), bytes);
637 gpuMalloc((void**)(&d_out), bytes);
638
639 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
640
641 Eigen::GpuStreamDevice stream;
642 Eigen::GpuDevice gpu_device(&stream);
643
644 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
645 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
646
647 gpu_out.device(gpu_device) = gpu_in.lgamma();
648
649 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
650 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
651
652 for (int i = 0; i < 72; ++i) {
653 for (int j = 0; j < 97; ++j) {
654 VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
655 }
656 }
657
658 gpuFree(d_in);
659 gpuFree(d_out);
660 }
661 #endif
662
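// digamma at a handful of fixed points; the last two inputs (0 and -1) sit on
// poles and are compared exactly against the expected infinities.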
663 template <typename Scalar>
void test_gpu_digamma()
665 {
666 Tensor<Scalar, 1> in(7);
667 Tensor<Scalar, 1> out(7);
668 Tensor<Scalar, 1> expected_out(7);
669 out.setZero();
670
671 in(0) = Scalar(1);
672 in(1) = Scalar(1.5);
673 in(2) = Scalar(4);
674 in(3) = Scalar(-10.5);
675 in(4) = Scalar(10000.5);
676 in(5) = Scalar(0);
677 in(6) = Scalar(-1);
678
679 expected_out(0) = Scalar(-0.5772156649015329);
680 expected_out(1) = Scalar(0.03648997397857645);
681 expected_out(2) = Scalar(1.2561176684318);
682 expected_out(3) = Scalar(2.398239129535781);
683 expected_out(4) = Scalar(9.210340372392849);
684 expected_out(5) = std::numeric_limits<Scalar>::infinity();
685 expected_out(6) = std::numeric_limits<Scalar>::infinity();
686
687 std::size_t bytes = in.size() * sizeof(Scalar);
688
689 Scalar* d_in;
690 Scalar* d_out;
691 gpuMalloc((void**)(&d_in), bytes);
692 gpuMalloc((void**)(&d_out), bytes);
693
694 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
695
696 Eigen::GpuStreamDevice stream;
697 Eigen::GpuDevice gpu_device(&stream);
698
699 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
700 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
701
702 gpu_out.device(gpu_device) = gpu_in.digamma();
703
704 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
705 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
706
707 for (int i = 0; i < 5; ++i) {
708 VERIFY_IS_APPROX(out(i), expected_out(i));
709 }
710 for (int i = 5; i < 7; ++i) {
711 VERIFY_IS_EQUAL(out(i), expected_out(i));
712 }
713
714 gpuFree(d_in);
715 gpuFree(d_out);
716 }
717
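// Hurwitz zeta(x, q) at fixed points: x = 1 must give +infinity, and the
// negative non-integer x is expected to come back from the device as NaN.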
718 template <typename Scalar>
void test_gpu_zeta()
720 {
721 Tensor<Scalar, 1> in_x(6);
722 Tensor<Scalar, 1> in_q(6);
723 Tensor<Scalar, 1> out(6);
724 Tensor<Scalar, 1> expected_out(6);
725 out.setZero();
726
727 in_x(0) = Scalar(1);
728 in_x(1) = Scalar(1.5);
729 in_x(2) = Scalar(4);
730 in_x(3) = Scalar(-10.5);
731 in_x(4) = Scalar(10000.5);
732 in_x(5) = Scalar(3);
733
734 in_q(0) = Scalar(1.2345);
735 in_q(1) = Scalar(2);
736 in_q(2) = Scalar(1.5);
737 in_q(3) = Scalar(3);
738 in_q(4) = Scalar(1.0001);
739 in_q(5) = Scalar(-2.5);
740
741 expected_out(0) = std::numeric_limits<Scalar>::infinity();
742 expected_out(1) = Scalar(1.61237534869);
743 expected_out(2) = Scalar(0.234848505667);
744 expected_out(3) = Scalar(1.03086757337e-5);
745 expected_out(4) = Scalar(0.367879440865);
746 expected_out(5) = Scalar(0.054102025820864097);
747
748 std::size_t bytes = in_x.size() * sizeof(Scalar);
749
750 Scalar* d_in_x;
751 Scalar* d_in_q;
752 Scalar* d_out;
753 gpuMalloc((void**)(&d_in_x), bytes);
754 gpuMalloc((void**)(&d_in_q), bytes);
755 gpuMalloc((void**)(&d_out), bytes);
756
757 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
758 gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);
759
760 Eigen::GpuStreamDevice stream;
761 Eigen::GpuDevice gpu_device(&stream);
762
763 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
764 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
765 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
766
767 gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
768
769 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
770 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
771
772 VERIFY_IS_EQUAL(out(0), expected_out(0));
773 VERIFY((std::isnan)(out(3)));
774
775 for (int i = 1; i < 6; ++i) {
776 if (i != 3) {
777 VERIFY_IS_APPROX(out(i), expected_out(i));
778 }
779 }
780
781 gpuFree(d_in_x);
782 gpuFree(d_in_q);
783 gpuFree(d_out);
784 }
785
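// polygamma(n, x) at fixed points, compared against precomputed reference
// values.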
786 template <typename Scalar>
void test_gpu_polygamma()
788 {
789 Tensor<Scalar, 1> in_x(7);
790 Tensor<Scalar, 1> in_n(7);
791 Tensor<Scalar, 1> out(7);
792 Tensor<Scalar, 1> expected_out(7);
793 out.setZero();
794
795 in_n(0) = Scalar(1);
796 in_n(1) = Scalar(1);
797 in_n(2) = Scalar(1);
798 in_n(3) = Scalar(17);
799 in_n(4) = Scalar(31);
800 in_n(5) = Scalar(28);
801 in_n(6) = Scalar(8);
802
803 in_x(0) = Scalar(2);
804 in_x(1) = Scalar(3);
805 in_x(2) = Scalar(25.5);
806 in_x(3) = Scalar(4.7);
807 in_x(4) = Scalar(11.8);
808 in_x(5) = Scalar(17.7);
809 in_x(6) = Scalar(30.2);
810
811 expected_out(0) = Scalar(0.644934066848);
812 expected_out(1) = Scalar(0.394934066848);
813 expected_out(2) = Scalar(0.0399946696496);
814 expected_out(3) = Scalar(293.334565435);
815 expected_out(4) = Scalar(0.445487887616);
816 expected_out(5) = Scalar(-2.47810300902e-07);
817 expected_out(6) = Scalar(-8.29668781082e-09);
818
819 std::size_t bytes = in_x.size() * sizeof(Scalar);
820
821 Scalar* d_in_x;
822 Scalar* d_in_n;
823 Scalar* d_out;
824 gpuMalloc((void**)(&d_in_x), bytes);
825 gpuMalloc((void**)(&d_in_n), bytes);
826 gpuMalloc((void**)(&d_out), bytes);
827
828 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
829 gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);
830
831 Eigen::GpuStreamDevice stream;
832 Eigen::GpuDevice gpu_device(&stream);
833
834 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
835 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
836 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
837
838 gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
839
840 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
841 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
842
843 for (int i = 0; i < 7; ++i) {
844 VERIFY_IS_APPROX(out(i), expected_out(i));
845 }
846
847 gpuFree(d_in_x);
848 gpuFree(d_in_n);
849 gpuFree(d_out);
850 }
851
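// Regularized lower incomplete gamma function igamma(a, x) on a 6x6 grid of
// (a, x) pairs; entries expected to be NaN are checked with std::isnan.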
852 template <typename Scalar>
void test_gpu_igamma()
854 {
855 Tensor<Scalar, 2> a(6, 6);
856 Tensor<Scalar, 2> x(6, 6);
857 Tensor<Scalar, 2> out(6, 6);
858 out.setZero();
859
860 Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
861 Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
862
863 for (int i = 0; i < 6; ++i) {
864 for (int j = 0; j < 6; ++j) {
865 a(i, j) = a_s[i];
866 x(i, j) = x_s[j];
867 }
868 }
869
870 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
871 Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
872 {0.0, 0.6321205588285578, 0.7768698398515702,
873 0.9816843611112658, 9.999500016666262e-05, 1.0},
874 {0.0, 0.4275932955291202, 0.608374823728911,
875 0.9539882943107686, 7.522076445089201e-07, 1.0},
876 {0.0, 0.01898815687615381, 0.06564245437845008,
877 0.5665298796332909, 4.166333347221828e-18, 1.0},
878 {0.0, 0.9999780593618628, 0.9999899967080838,
879 0.9999996219837988, 0.9991370418689945, 1.0},
880 {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
881
882
883
884 std::size_t bytes = a.size() * sizeof(Scalar);
885
886 Scalar* d_a;
887 Scalar* d_x;
888 Scalar* d_out;
889 assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
890 assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
891 assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
892
893 gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
894 gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
895
896 Eigen::GpuStreamDevice stream;
897 Eigen::GpuDevice gpu_device(&stream);
898
899 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
900 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
901 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
902
903 gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
904
905 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
906 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
907
908 for (int i = 0; i < 6; ++i) {
909 for (int j = 0; j < 6; ++j) {
910 if ((std::isnan)(igamma_s[i][j])) {
911 VERIFY((std::isnan)(out(i, j)));
912 } else {
913 VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
914 }
915 }
916 }
917
918 gpuFree(d_a);
919 gpuFree(d_x);
920 gpuFree(d_out);
921 }
922
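// Complementary (upper) regularized incomplete gamma function igammac(a, x)
// on the same 6x6 grid of (a, x) pairs.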
923 template <typename Scalar>
void test_gpu_igammac()
925 {
926 Tensor<Scalar, 2> a(6, 6);
927 Tensor<Scalar, 2> x(6, 6);
928 Tensor<Scalar, 2> out(6, 6);
929 out.setZero();
930
931 Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
932 Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
933
934 for (int i = 0; i < 6; ++i) {
935 for (int j = 0; j < 6; ++j) {
936 a(i, j) = a_s[i];
937 x(i, j) = x_s[j];
938 }
939 }
940
941 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
942 Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
943 {1.0, 0.36787944117144233, 0.22313016014842982,
944 0.018315638888734182, 0.9999000049998333, 0.0},
945 {1.0, 0.5724067044708798, 0.3916251762710878,
946 0.04601170568923136, 0.9999992477923555, 0.0},
947 {1.0, 0.9810118431238462, 0.9343575456215499,
948 0.4334701203667089, 1.0, 0.0},
949 {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
950 3.7801620118431334e-07, 0.0008629581310054535,
951 0.0},
952 {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
953
954 std::size_t bytes = a.size() * sizeof(Scalar);
955
956 Scalar* d_a;
957 Scalar* d_x;
958 Scalar* d_out;
959 gpuMalloc((void**)(&d_a), bytes);
960 gpuMalloc((void**)(&d_x), bytes);
961 gpuMalloc((void**)(&d_out), bytes);
962
963 gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
964 gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
965
966 Eigen::GpuStreamDevice stream;
967 Eigen::GpuDevice gpu_device(&stream);
968
969 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
970 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
971 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
972
973 gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
974
975 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
976 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
977
978 for (int i = 0; i < 6; ++i) {
979 for (int j = 0; j < 6; ++j) {
980 if ((std::isnan)(igammac_s[i][j])) {
981 VERIFY((std::isnan)(out(i, j)));
982 } else {
983 VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
984 }
985 }
986 }
987
988 gpuFree(d_a);
989 gpuFree(d_x);
990 gpuFree(d_out);
991 }
992
993 #if EIGEN_GPU_TEST_C99_MATH
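// erf evaluated on the device and compared against std::erf on the host;
// stddev scales the random inputs.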
994 template <typename Scalar>
void test_gpu_erf(const Scalar stddev)
996 {
997 Tensor<Scalar, 2> in(72,97);
998 in.setRandom();
999 in *= in.constant(stddev);
1000 Tensor<Scalar, 2> out(72,97);
1001 out.setZero();
1002
1003 std::size_t bytes = in.size() * sizeof(Scalar);
1004
1005 Scalar* d_in;
1006 Scalar* d_out;
1007 assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
1008 assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
1009
1010 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
1011
1012 Eigen::GpuStreamDevice stream;
1013 Eigen::GpuDevice gpu_device(&stream);
1014
1015 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
1016 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
1017
1018 gpu_out.device(gpu_device) = gpu_in.erf();
1019
1020 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
1021 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1022
1023 for (int i = 0; i < 72; ++i) {
1024 for (int j = 0; j < 97; ++j) {
1025 VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
1026 }
1027 }
1028
1029 gpuFree(d_in);
1030 gpuFree(d_out);
1031 }
1032
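// erfc evaluated on the device and compared against std::erfc on the host.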
1033 template <typename Scalar>
void test_gpu_erfc(const Scalar stddev)
1035 {
1036 Tensor<Scalar, 2> in(72,97);
1037 in.setRandom();
1038 in *= in.constant(stddev);
1039 Tensor<Scalar, 2> out(72,97);
1040 out.setZero();
1041
1042 std::size_t bytes = in.size() * sizeof(Scalar);
1043
1044 Scalar* d_in;
1045 Scalar* d_out;
1046 gpuMalloc((void**)(&d_in), bytes);
1047 gpuMalloc((void**)(&d_out), bytes);
1048
1049 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
1050
1051 Eigen::GpuStreamDevice stream;
1052 Eigen::GpuDevice gpu_device(&stream);
1053
1054 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
1055 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
1056
1057 gpu_out.device(gpu_device) = gpu_in.erfc();
1058
1059 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
1060 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1061
1062 for (int i = 0; i < 72; ++i) {
1063 for (int j = 0; j < 97; ++j) {
1064 VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
1065 }
1066 }
1067
1068 gpuFree(d_in);
1069 gpuFree(d_out);
1070 }
1071 #endif
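
// Inverse of the standard normal CDF (ndtri) at fixed probabilities; the
// endpoints 1 and 0 map to +infinity and -infinity respectively.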
1072 template <typename Scalar>
void test_gpu_ndtri()
1074 {
  Tensor<Scalar, 1> in_x(9);
  Tensor<Scalar, 1> out(9);
  Tensor<Scalar, 1> expected_out(9);
1078 out.setZero();
1079
1080 in_x(0) = Scalar(1);
1081 in_x(1) = Scalar(0.);
1082 in_x(2) = Scalar(0.5);
1083 in_x(3) = Scalar(0.2);
1084 in_x(4) = Scalar(0.8);
1085 in_x(5) = Scalar(0.9);
1086 in_x(6) = Scalar(0.1);
1087 in_x(7) = Scalar(0.99);
1088 in_x(8) = Scalar(0.01);
1089
1090 expected_out(0) = std::numeric_limits<Scalar>::infinity();
1091 expected_out(1) = -std::numeric_limits<Scalar>::infinity();
1092 expected_out(2) = Scalar(0.0);
1093 expected_out(3) = Scalar(-0.8416212335729142);
1094 expected_out(4) = Scalar(0.8416212335729142);
1095 expected_out(5) = Scalar(1.2815515655446004);
1096 expected_out(6) = Scalar(-1.2815515655446004);
1097 expected_out(7) = Scalar(2.3263478740408408);
1098 expected_out(8) = Scalar(-2.3263478740408408);
1099
1100 std::size_t bytes = in_x.size() * sizeof(Scalar);
1101
1102 Scalar* d_in_x;
1103 Scalar* d_out;
1104 gpuMalloc((void**)(&d_in_x), bytes);
1105 gpuMalloc((void**)(&d_out), bytes);
1106
1107 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
1108
1109 Eigen::GpuStreamDevice stream;
1110 Eigen::GpuDevice gpu_device(&stream);
1111
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
1114
1115 gpu_out.device(gpu_device) = gpu_in_x.ndtri();
1116
1117 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
1118 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1119
  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY_IS_EQUAL(out(1), expected_out(1));

  for (int i = 2; i < 9; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }
1128
1129 gpuFree(d_in_x);
1130 gpuFree(d_out);
1131 }
1132
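// Regularized incomplete beta function betainc(a, b, x) over a 5x5x5 grid of
// parameter combinations; out-of-domain entries are expected to be NaN.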
1133 template <typename Scalar>
void test_gpu_betainc()
1135 {
1136 Tensor<Scalar, 1> in_x(125);
1137 Tensor<Scalar, 1> in_a(125);
1138 Tensor<Scalar, 1> in_b(125);
1139 Tensor<Scalar, 1> out(125);
1140 Tensor<Scalar, 1> expected_out(125);
1141 out.setZero();
1142
1143 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
1144
1145 Array<Scalar, 1, Dynamic> x(125);
1146 Array<Scalar, 1, Dynamic> a(125);
1147 Array<Scalar, 1, Dynamic> b(125);
1148 Array<Scalar, 1, Dynamic> v(125);
1149
1150 a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1151 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1152 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1153 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1154 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1155 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1156 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1157 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1158 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1159 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1160 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
1161 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
1162 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1163 31.62177660168379, 31.62177660168379, 31.62177660168379,
1164 31.62177660168379, 31.62177660168379, 31.62177660168379,
1165 31.62177660168379, 31.62177660168379, 31.62177660168379,
1166 31.62177660168379, 31.62177660168379, 31.62177660168379,
1167 31.62177660168379, 31.62177660168379, 31.62177660168379,
1168 31.62177660168379, 31.62177660168379, 31.62177660168379,
1169 31.62177660168379, 31.62177660168379, 31.62177660168379,
1170 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1171 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
1172 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
1173 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
1174
1175 b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1176 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1177 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1178 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1179 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
1180 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1181 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1182 31.62177660168379, 31.62177660168379, 31.62177660168379,
1183 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
1184 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1185 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1186 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1187 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1188 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
1189 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1190 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1191 31.62177660168379, 31.62177660168379, 31.62177660168379,
1192 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
1193 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1194 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1195 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1196 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1197 999.999, 999.999, 999.999;
1198
1199 x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1200 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
1201 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
1202 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
1203 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
1204 -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1205 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
1206 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
1207 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
1208
1209 v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
1210 nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
1211 nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
1212 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
1213 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
1214 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
1215 nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
1216 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
1217 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
1218 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
1219 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
1220 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
1221 nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
1222 nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
1223 nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
1224 nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
1225 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
1226 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
1227 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
1228 0.5000000000004947, 0.9999999999999999, nan;
1229
1230 for (int i = 0; i < 125; ++i) {
1231 in_x(i) = x(i);
1232 in_a(i) = a(i);
1233 in_b(i) = b(i);
1234 expected_out(i) = v(i);
1235 }
1236
1237 std::size_t bytes = in_x.size() * sizeof(Scalar);
1238
1239 Scalar* d_in_x;
1240 Scalar* d_in_a;
1241 Scalar* d_in_b;
1242 Scalar* d_out;
1243 gpuMalloc((void**)(&d_in_x), bytes);
1244 gpuMalloc((void**)(&d_in_a), bytes);
1245 gpuMalloc((void**)(&d_in_b), bytes);
1246 gpuMalloc((void**)(&d_out), bytes);
1247
1248 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
1249 gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
1250 gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);
1251
1252 Eigen::GpuStreamDevice stream;
1253 Eigen::GpuDevice gpu_device(&stream);
1254
1255 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
1256 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
1257 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
1258 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
1259
1260 gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
1261
1262 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
1263 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1264
  for (int i = 0; i < 125; ++i) {
1266 if ((std::isnan)(expected_out(i))) {
1267 VERIFY((std::isnan)(out(i)));
1268 } else {
1269 VERIFY_IS_APPROX(out(i), expected_out(i));
1270 }
1271 }
1272
1273 gpuFree(d_in_x);
1274 gpuFree(d_in_a);
1275 gpuFree(d_in_b);
1276 gpuFree(d_out);
1277 }
1278
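// Exponentially scaled modified Bessel function of the first kind, order 0
// (i0e), at evenly spaced points in [-20, 20].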
1279 template <typename Scalar>
void test_gpu_i0e()
1281 {
1282 Tensor<Scalar, 1> in_x(21);
1283 Tensor<Scalar, 1> out(21);
1284 Tensor<Scalar, 1> expected_out(21);
1285 out.setZero();
1286
1287 Array<Scalar, 1, Dynamic> in_x_array(21);
1288 Array<Scalar, 1, Dynamic> expected_out_array(21);
1289
1290 in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
1291 -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
1292
1293 expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
1294 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
1295 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
1296 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
1297 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
1298 0.0897803118848;
1299
1300 for (int i = 0; i < 21; ++i) {
1301 in_x(i) = in_x_array(i);
1302 expected_out(i) = expected_out_array(i);
1303 }
1304
1305 std::size_t bytes = in_x.size() * sizeof(Scalar);
1306
1307 Scalar* d_in;
1308 Scalar* d_out;
1309 gpuMalloc((void**)(&d_in), bytes);
1310 gpuMalloc((void**)(&d_out), bytes);
1311
1312 gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
1313
1314 Eigen::GpuStreamDevice stream;
1315 Eigen::GpuDevice gpu_device(&stream);
1316
1317 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
1318 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
1319
1320 gpu_out.device(gpu_device) = gpu_in.bessel_i0e();
1321
1322 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
1323 gpu_device.stream()) == gpuSuccess);
1324 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1325
1326 for (int i = 0; i < 21; ++i) {
1327 VERIFY_IS_APPROX(out(i), expected_out(i));
1328 }
1329
1330 gpuFree(d_in);
1331 gpuFree(d_out);
1332 }
1333
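// Exponentially scaled modified Bessel function of the first kind, order 1
// (i1e), at the same points.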
1334 template <typename Scalar>
void test_gpu_i1e()
1336 {
1337 Tensor<Scalar, 1> in_x(21);
1338 Tensor<Scalar, 1> out(21);
1339 Tensor<Scalar, 1> expected_out(21);
1340 out.setZero();
1341
1342 Array<Scalar, 1, Dynamic> in_x_array(21);
1343 Array<Scalar, 1, Dynamic> expected_out_array(21);
1344
1345 in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
1346 -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
1347
1348 expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
1349 -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
1350 -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
1351 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
1352 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
1353 0.0875062221833;
1354
1355 for (int i = 0; i < 21; ++i) {
1356 in_x(i) = in_x_array(i);
1357 expected_out(i) = expected_out_array(i);
1358 }
1359
1360 std::size_t bytes = in_x.size() * sizeof(Scalar);
1361
1362 Scalar* d_in;
1363 Scalar* d_out;
1364 gpuMalloc((void**)(&d_in), bytes);
1365 gpuMalloc((void**)(&d_out), bytes);
1366
1367 gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
1368
1369 Eigen::GpuStreamDevice stream;
1370 Eigen::GpuDevice gpu_device(&stream);
1371
1372 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
1373 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
1374
1375 gpu_out.device(gpu_device) = gpu_in.bessel_i1e();
1376
1377 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
1378 gpu_device.stream()) == gpuSuccess);
1379 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1380
1381 for (int i = 0; i < 21; ++i) {
1382 VERIFY_IS_APPROX(out(i), expected_out(i));
1383 }
1384
1385 gpuFree(d_in);
1386 gpuFree(d_out);
1387 }
1388
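// Derivative of igamma(a, x) with respect to the parameter a, compared
// against precomputed reference values.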
1389 template <typename Scalar>
void test_gpu_igamma_der_a()
1391 {
1392 Tensor<Scalar, 1> in_x(30);
1393 Tensor<Scalar, 1> in_a(30);
1394 Tensor<Scalar, 1> out(30);
1395 Tensor<Scalar, 1> expected_out(30);
1396 out.setZero();
1397
1398 Array<Scalar, 1, Dynamic> in_a_array(30);
1399 Array<Scalar, 1, Dynamic> in_x_array(30);
1400 Array<Scalar, 1, Dynamic> expected_out_array(30);
1401
1402 // See special_functions.cpp for the Python code that generates the test data.
1403
1404 in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
1405 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
1406 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
1407
1408 in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
1409 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
1410 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
1411 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
1412 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
1413 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
1414 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
1415
1416 expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
1417 -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
1418 -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
1419 -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
1420 -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
1421 -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
1422 -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
1423 -0.0125850719397, -0.00455500206958, -0.00476436993148;
1424
1425 for (int i = 0; i < 30; ++i) {
1426 in_x(i) = in_x_array(i);
1427 in_a(i) = in_a_array(i);
1428 expected_out(i) = expected_out_array(i);
1429 }
1430
1431 std::size_t bytes = in_x.size() * sizeof(Scalar);
1432
1433 Scalar* d_a;
1434 Scalar* d_x;
1435 Scalar* d_out;
1436 gpuMalloc((void**)(&d_a), bytes);
1437 gpuMalloc((void**)(&d_x), bytes);
1438 gpuMalloc((void**)(&d_out), bytes);
1439
1440 gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
1441 gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
1442
1443 Eigen::GpuStreamDevice stream;
1444 Eigen::GpuDevice gpu_device(&stream);
1445
1446 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
1447 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
1448 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
1449
1450 gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
1451
1452 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
1453 gpu_device.stream()) == gpuSuccess);
1454 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1455
1456 for (int i = 0; i < 30; ++i) {
1457 VERIFY_IS_APPROX(out(i), expected_out(i));
1458 }
1459
1460 gpuFree(d_a);
1461 gpuFree(d_x);
1462 gpuFree(d_out);
1463 }
1464
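// Derivative of a Gamma(alpha, 1) sample with respect to alpha, compared
// against precomputed reference values.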
1465 template <typename Scalar>
void test_gpu_gamma_sample_der_alpha()
1467 {
1468 Tensor<Scalar, 1> in_alpha(30);
1469 Tensor<Scalar, 1> in_sample(30);
1470 Tensor<Scalar, 1> out(30);
1471 Tensor<Scalar, 1> expected_out(30);
1472 out.setZero();
1473
1474 Array<Scalar, 1, Dynamic> in_alpha_array(30);
1475 Array<Scalar, 1, Dynamic> in_sample_array(30);
1476 Array<Scalar, 1, Dynamic> expected_out_array(30);
1477
1478 // See special_functions.cpp for the Python code that generates the test data.
1479
1480 in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
1481 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
1482 100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
1483
1484 in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
1485 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
1486 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
1487 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
1488 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
1489 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
1490 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
1491
1492 expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
1493 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
1494 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
1495 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
1496 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
1497 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
1498 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
1499 1.00106492525, 0.97734200649, 1.02198794179;
1500
1501 for (int i = 0; i < 30; ++i) {
1502 in_alpha(i) = in_alpha_array(i);
1503 in_sample(i) = in_sample_array(i);
1504 expected_out(i) = expected_out_array(i);
1505 }
1506
1507 std::size_t bytes = in_alpha.size() * sizeof(Scalar);
1508
1509 Scalar* d_alpha;
1510 Scalar* d_sample;
1511 Scalar* d_out;
1512 gpuMalloc((void**)(&d_alpha), bytes);
1513 gpuMalloc((void**)(&d_sample), bytes);
1514 gpuMalloc((void**)(&d_out), bytes);
1515
1516 gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
1517 gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);
1518
1519 Eigen::GpuStreamDevice stream;
1520 Eigen::GpuDevice gpu_device(&stream);
1521
1522 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
1523 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
1524 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
1525
1526 gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);
1527
1528 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
1529 gpu_device.stream()) == gpuSuccess);
1530 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1531
1532 for (int i = 0; i < 30; ++i) {
1533 VERIFY_IS_APPROX(out(i), expected_out(i));
1534 }
1535
1536 gpuFree(d_alpha);
1537 gpuFree(d_sample);
1538 gpuFree(d_out);
1539 }
1540
EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
1542 {
1543 CALL_SUBTEST_1(test_gpu_nullary());
1544 CALL_SUBTEST_1(test_gpu_elementwise_small());
1545 CALL_SUBTEST_1(test_gpu_elementwise());
1546 CALL_SUBTEST_1(test_gpu_props());
1547 CALL_SUBTEST_1(test_gpu_reduction());
1548 CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
1549 CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
1550 CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
1551 CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
1552 CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
1553 CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
1554 CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
1555 CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
1556 #if !defined(EIGEN_USE_HIP)
  // These tests are disabled on HIP for now: they hang and still need to be
  // investigated and fixed.
1559 CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
1560 CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
1561 #endif
1562
1563 #if EIGEN_GPU_TEST_C99_MATH
  // std::erf, std::erfc, and so on were only added in C++11. We use them
  // as a golden reference to validate the results produced by Eigen. Therefore
  // we can only run these tests if we use a C++11 compiler.
1567 CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
1568 CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
1569 CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
1570 CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));
1571
1572 CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
1573 CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
1574 CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
1575 CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));
1576
1577 CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
1578 CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
1579 CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
1580 CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));
1581
1582 CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
1583 // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
1584 CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs
1585 CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
1586 CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));
1587
1588 CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
1589 CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
1590 CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
1591 CALL_SUBTEST_4(test_gpu_erf<double>(0.001));
1592
1593 CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
1594 // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
1595 CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs
1596 CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
1597 CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
1598
1599 #if !defined(EIGEN_USE_HIP)
1600 // disable these tests on HIP for now.
1601
1602 CALL_SUBTEST_5(test_gpu_ndtri<float>());
1603 CALL_SUBTEST_5(test_gpu_ndtri<double>());
1604
1605 CALL_SUBTEST_5(test_gpu_digamma<float>());
1606 CALL_SUBTEST_5(test_gpu_digamma<double>());
1607
1608 CALL_SUBTEST_5(test_gpu_polygamma<float>());
1609 CALL_SUBTEST_5(test_gpu_polygamma<double>());
1610
1611 CALL_SUBTEST_5(test_gpu_zeta<float>());
1612 CALL_SUBTEST_5(test_gpu_zeta<double>());
1613 #endif
1614
1615 CALL_SUBTEST_5(test_gpu_igamma<float>());
1616 CALL_SUBTEST_5(test_gpu_igammac<float>());
1617
1618 CALL_SUBTEST_5(test_gpu_igamma<double>());
1619 CALL_SUBTEST_5(test_gpu_igammac<double>());
1620
1621 #if !defined(EIGEN_USE_HIP)
1622 // disable these tests on HIP for now.
1623 CALL_SUBTEST_6(test_gpu_betainc<float>());
1624 CALL_SUBTEST_6(test_gpu_betainc<double>());
1625
1626 CALL_SUBTEST_6(test_gpu_i0e<float>());
1627 CALL_SUBTEST_6(test_gpu_i0e<double>());
1628
1629 CALL_SUBTEST_6(test_gpu_i1e<float>());
1630 CALL_SUBTEST_6(test_gpu_i1e<double>());
1631
1635 CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
1636 CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());
1637
1638 CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
1639 CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
1640 #endif
1641
1642 #endif
1643 }
1644