// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
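
// TensorGpuHipCudaDefines.h maps the gpu* names used throughout this file
// (gpuMalloc, gpuMemcpy, gpuMemcpyAsync, gpuStreamSynchronize, gpuFree,
// gpuSuccess, ...) onto their cuda* or hip* counterparts, so the same test
// source builds against both CUDA and HIP.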

#define EIGEN_GPU_TEST_C99_MATH  EIGEN_HAS_CXX11

using Eigen::Tensor;
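
// Every test below follows the same round-trip pattern: initialize tensors
// on the host, copy their contents into raw device buffers, wrap the device
// pointers in TensorMap, evaluate a tensor expression on a GpuDevice, copy
// the result back asynchronously on the device's stream, synchronize, and
// compare against a reference computed on the host.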

void test_gpu_nullary() {
  Tensor<float, 1, 0, int> in1(2);
  Tensor<float, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t tensor_bytes = in1.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  gpuMalloc((void**)(&d_in1), tensor_bytes);
  gpuMalloc((void**)(&d_in2), tensor_bytes);
  gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);

  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
  gpu_in2.device(gpu_device) = gpu_in2.random();

  Tensor<float, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
                         gpu_device.stream()) == gpuSuccess);
  assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
                         gpu_device.stream()) == gpuSuccess);

  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), 3.14f);
    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
  }

  gpuFree(d_in1);
  gpuFree(d_in2);
}

void test_gpu_elementwise_small() {
  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
  in1.setRandom();
  in2.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_in2), in2_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
                         gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(
        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
  }

  gpuFree(d_in1);
  gpuFree(d_in2);
  gpuFree(d_out);
}

void test_gpu_elementwise()
{
  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  in1.setRandom();
  in2.setRandom();
  in3.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t in3_bytes = in3.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_in3;
  float* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_in2), in2_bytes);
  gpuMalloc((void**)(&d_in3), in3_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 53; ++j) {
      for (int k = 0; k < 97; ++k) {
        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)),
                         in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) +
                         in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) *
                         in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
      }
    }
  }

  gpuFree(d_in1);
  gpuFree(d_in2);
  gpuFree(d_in3);
  gpuFree(d_out);
}

void test_gpu_props() {
  Tensor<float, 1> in1(200);
  Tensor<bool, 1> out(200);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(bool);

  float* d_in1;
  bool* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, 200);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
      d_out, 200);
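
  // isnan is parenthesized so that a C library macro version of isnan, if
  // one is in scope, cannot expand here.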
  gpu_out.device(gpu_device) = (gpu_in1.isnan)();

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
                         gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 200; ++i) {
    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
  }

  gpuFree(d_in1);
  gpuFree(d_out);
}

void test_gpu_reduction()
{
  Tensor<float, 4> in1(72,53,97,113);
  Tensor<float, 2> out(72,97);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  array<Eigen::DenseIndex, 2> reduction_axis;
  reduction_axis[0] = 1;
  reduction_axis[1] = 3;

  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
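
  // Recompute the maxima on the host: reducing over axes 1 and 3 of the
  // 72x53x97x113 input leaves a 72x97 result indexed by axes 0 and 2.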
  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      float expected = 0;
      for (int k = 0; k < 53; ++k) {
        for (int l = 0; l < 113; ++l) {
          expected =
              std::max<float>(expected, in1(i, k, j, l));
        }
      }
      VERIFY_IS_APPROX(out(i,j), expected);
    }
  }

  gpuFree(d_in1);
  gpuFree(d_out);
}

template<int DataLayout>
void test_gpu_contraction()
{
  // With these dimensions the output has 300 * 140 = 42,000 elements, more
  // than the 30 * 1024 = 30,720 threads that can be resident at once on a
  // 15-SM GK110 GPU.
  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;

  gpuMalloc((void**)(&d_t_left), t_left_bytes);
  gpuMalloc((void**)(&d_t_right), t_right_bytes);
  gpuMalloc((void**)(&d_t_result), t_result_bytes);

  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
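
  // Contracting dims (2,0) and (3,1) collapses t_left to a 300x93 matrix
  // (6*50 by 3*31) and t_right to a 93x140 matrix (3*31 by 7*20*1), so the
  // tensor contraction must agree with the plain matrix product computed
  // through Eigen::Map below.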
  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
  MapXf m_left(t_left.data(), 300, 93);
  MapXf m_right(t_right.data(), 93, 140);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims;
  dims[0] = DimPair(2, 0);
  dims[1] = DimPair(3, 1);

  m_result = m_left * m_right;
  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);

  gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  gpuFree(d_t_left);
  gpuFree(d_t_right);
  gpuFree(d_t_result);
}
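
// The convolution tests below perform "valid" convolutions: convolving a
// dimension of size N with a kernel of size K yields an output dimension of
// size N - K + 1 (e.g. 37 -> 34 for the size-4 kernel in the 1D test).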
template<int DataLayout>
void test_gpu_convolution_1d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 1, DataLayout> kernel(4);
  Tensor<float, 4, DataLayout> out(74,34,11,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);

  Eigen::array<Eigen::DenseIndex, 1> dims(1);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 34; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

void test_gpu_convolution_inner_dim_col_major_1d()
{
  Tensor<float, 4, ColMajor> input(74,9,11,7);
  Tensor<float, 1, ColMajor> kernel(4);
  Tensor<float, 4, ColMajor> out(71,9,11,7);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);

  Eigen::array<Eigen::DenseIndex, 1> dims(0);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 71; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 7; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

void test_gpu_convolution_inner_dim_row_major_1d()
{
  Tensor<float, 4, RowMajor> input(7,9,11,74);
  Tensor<float, 1, RowMajor> kernel(4);
  Tensor<float, 4, RowMajor> out(7,9,11,71);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);

  Eigen::array<Eigen::DenseIndex, 1> dims(3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 7; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 71; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

template<int DataLayout>
void test_gpu_convolution_2d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 2, DataLayout> kernel(3,4);
  Tensor<float, 4, DataLayout> out(74,35,8,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);

  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
                                 input(i,j+1,k+0,l) * kernel(1,0) +
                                 input(i,j+2,k+0,l) * kernel(2,0) +
                                 input(i,j+0,k+1,l) * kernel(0,1) +
                                 input(i,j+1,k+1,l) * kernel(1,1) +
                                 input(i,j+2,k+1,l) * kernel(2,1) +
                                 input(i,j+0,k+2,l) * kernel(0,2) +
                                 input(i,j+1,k+2,l) * kernel(1,2) +
                                 input(i,j+2,k+2,l) * kernel(2,2) +
                                 input(i,j+0,k+3,l) * kernel(0,3) +
                                 input(i,j+1,k+3,l) * kernel(1,3) +
                                 input(i,j+2,k+3,l) * kernel(2,3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

template<int DataLayout>
void test_gpu_convolution_3d()
{
  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
  Tensor<float, 3, DataLayout> kernel(3,4,2);
  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);

  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 136; ++l) {
          for (int m = 0; m < 17; ++m) {
            const float result = out(i,j,k,l,m);
            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
            VERIFY_IS_APPROX(result, expected);
          }
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

#if EIGEN_GPU_TEST_C99_MATH
template <typename Scalar>
void test_gpu_lgamma(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.lgamma();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
    }
  }

  gpuFree(d_in);
  gpuFree(d_out);
}
#endif

template <typename Scalar>
void test_gpu_digamma()
{
  Tensor<Scalar, 1> in(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in(0) = Scalar(1);
  in(1) = Scalar(1.5);
  in(2) = Scalar(4);
  in(3) = Scalar(-10.5);
  in(4) = Scalar(10000.5);
  in(5) = Scalar(0);
  in(6) = Scalar(-1);

  expected_out(0) = Scalar(-0.5772156649015329);
  expected_out(1) = Scalar(0.03648997397857645);
  expected_out(2) = Scalar(1.2561176684318);
  expected_out(3) = Scalar(2.398239129535781);
  expected_out(4) = Scalar(9.210340372392849);
  expected_out(5) = std::numeric_limits<Scalar>::infinity();
  expected_out(6) = std::numeric_limits<Scalar>::infinity();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in.digamma();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
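
  // The last two inputs (0 and -1) lie on poles of digamma, so they are
  // compared exactly against infinity rather than approximately.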
  for (int i = 0; i < 5; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }
  for (int i = 5; i < 7; ++i) {
    VERIFY_IS_EQUAL(out(i), expected_out(i));
  }

  gpuFree(d_in);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_zeta()
{
  Tensor<Scalar, 1> in_x(6);
  Tensor<Scalar, 1> in_q(6);
  Tensor<Scalar, 1> out(6);
  Tensor<Scalar, 1> expected_out(6);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(1.5);
  in_x(2) = Scalar(4);
  in_x(3) = Scalar(-10.5);
  in_x(4) = Scalar(10000.5);
  in_x(5) = Scalar(3);

  in_q(0) = Scalar(1.2345);
  in_q(1) = Scalar(2);
  in_q(2) = Scalar(1.5);
  in_q(3) = Scalar(3);
  in_q(4) = Scalar(1.0001);
  in_q(5) = Scalar(-2.5);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = Scalar(1.61237534869);
  expected_out(2) = Scalar(0.234848505667);
  expected_out(3) = Scalar(1.03086757337e-5);
  expected_out(4) = Scalar(0.367879440865);
  expected_out(5) = Scalar(0.054102025820864097);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_q;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_in_q), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);

  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
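
  // zeta(x, q) diverges at x = 1 (entry 0) and comes back as NaN for
  // x < 1 (entry 3, x = -10.5), so those two entries are special-cased.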
  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY((std::isnan)(out(3)));

  for (int i = 1; i < 6; ++i) {
    if (i != 3) {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  gpuFree(d_in_x);
  gpuFree(d_in_q);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_polygamma()
{
  Tensor<Scalar, 1> in_x(7);
  Tensor<Scalar, 1> in_n(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in_n(0) = Scalar(1);
  in_n(1) = Scalar(1);
  in_n(2) = Scalar(1);
  in_n(3) = Scalar(17);
  in_n(4) = Scalar(31);
  in_n(5) = Scalar(28);
  in_n(6) = Scalar(8);

  in_x(0) = Scalar(2);
  in_x(1) = Scalar(3);
  in_x(2) = Scalar(25.5);
  in_x(3) = Scalar(4.7);
  in_x(4) = Scalar(11.8);
  in_x(5) = Scalar(17.7);
  in_x(6) = Scalar(30.2);

  expected_out(0) = Scalar(0.644934066848);
  expected_out(1) = Scalar(0.394934066848);
  expected_out(2) = Scalar(0.0399946696496);
  expected_out(3) = Scalar(293.334565435);
  expected_out(4) = Scalar(0.445487887616);
  expected_out(5) = Scalar(-2.47810300902e-07);
  expected_out(6) = Scalar(-8.29668781082e-09);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_n;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_in_n), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
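
  // polygamma(n, x) is the n-th derivative of digamma; for reference,
  // psi^(n)(x) = (-1)^(n+1) * n! * zeta(n+1, x).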
  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 7; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in_x);
  gpuFree(d_in_n);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_igamma()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
                          {0.0, 0.6321205588285578, 0.7768698398515702,
                           0.9816843611112658, 9.999500016666262e-05, 1.0},
                          {0.0, 0.4275932955291202, 0.608374823728911,
                           0.9539882943107686, 7.522076445089201e-07, 1.0},
                          {0.0, 0.01898815687615381, 0.06564245437845008,
                           0.5665298796332909, 4.166333347221828e-18, 1.0},
                          {0.0, 0.9999780593618628, 0.9999899967080838,
                           0.9999996219837988, 0.9991370418689945, 1.0},
                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
  assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);

  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
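
  // igamma(a, x) is the lower regularized incomplete gamma function P(a, x);
  // igammac (tested below) is its complement Q(a, x) = 1 - P(a, x).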
  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igamma_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
      }
    }
  }

  gpuFree(d_a);
  gpuFree(d_x);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_igammac()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
                           {1.0, 0.36787944117144233, 0.22313016014842982,
                            0.018315638888734182, 0.9999000049998333, 0.0},
                           {1.0, 0.5724067044708798, 0.3916251762710878,
                            0.04601170568923136, 0.9999992477923555, 0.0},
                           {1.0, 0.9810118431238462, 0.9343575456215499,
                            0.4334701203667089, 1.0, 0.0},
                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
                            3.7801620118431334e-07, 0.0008629581310054535,
                            0.0},
                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  gpuMalloc((void**)(&d_a), bytes);
  gpuMalloc((void**)(&d_x), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igammac_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
      }
    }
  }

  gpuFree(d_a);
  gpuFree(d_x);
  gpuFree(d_out);
}

#if EIGEN_GPU_TEST_C99_MATH
template <typename Scalar>
void test_gpu_erf(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erf();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
    }
  }

  gpuFree(d_in);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_erfc(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erfc();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
    }
  }

  gpuFree(d_in);
  gpuFree(d_out);
}
#endif

// ndtri is the inverse of the standard normal cumulative distribution
// function.
template <typename Scalar>
void test_gpu_ndtri()
{
  Tensor<Scalar, 1> in_x(9);
  Tensor<Scalar, 1> out(9);
  Tensor<Scalar, 1> expected_out(9);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(0.);
  in_x(2) = Scalar(0.5);
  in_x(3) = Scalar(0.2);
  in_x(4) = Scalar(0.8);
  in_x(5) = Scalar(0.9);
  in_x(6) = Scalar(0.1);
  in_x(7) = Scalar(0.99);
  in_x(8) = Scalar(0.01);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
  expected_out(2) = Scalar(0.0);
  expected_out(3) = Scalar(-0.8416212335729142);
  expected_out(4) = Scalar(0.8416212335729142);
  expected_out(5) = Scalar(1.2815515655446004);
  expected_out(6) = Scalar(-1.2815515655446004);
  expected_out(7) = Scalar(2.3263478740408408);
  expected_out(8) = Scalar(-2.3263478740408408);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);

  gpu_out.device(gpu_device) = gpu_in_x.ndtri();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  // The infinities at p = 1 and p = 0 must match exactly; the interior
  // values are compared approximately.
  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY_IS_EQUAL(out(1), expected_out(1));

  for (int i = 2; i < 9; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in_x);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_betainc()
{
  Tensor<Scalar, 1> in_x(125);
  Tensor<Scalar, 1> in_a(125);
  Tensor<Scalar, 1> in_b(125);
  Tensor<Scalar, 1> out(125);
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();

  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
  Array<Scalar, 1, Dynamic> v(125);

  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;

  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999;

  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
      0.5000000000004947, 0.9999999999999999, nan;

  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
    expected_out(i) = v(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_a;
  Scalar* d_in_b;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_in_a), bytes);
  gpuMalloc((void**)(&d_in_b), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
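
  // betainc(a, b, x) is the regularized incomplete beta function I_x(a, b);
  // entries with x outside [0, 1] or with a or b equal to 0 are expected to
  // come back as NaN.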
  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 1; i < 125; ++i) {
    if ((std::isnan)(expected_out(i))) {
      VERIFY((std::isnan)(out(i)));
    } else {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  gpuFree(d_in_x);
  gpuFree(d_in_a);
  gpuFree(d_in_b);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_i0e()
{
  Tensor<Scalar, 1> in_x(21);
  Tensor<Scalar, 1> out(21);
  Tensor<Scalar, 1> expected_out(21);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_x_array(21);
  Array<Scalar, 1, Dynamic> expected_out_array(21);

  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;

  expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
      0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
      0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
      0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
      0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
      0.0897803118848;

  for (int i = 0; i < 21; ++i) {
    in_x(i) = in_x_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
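
  // bessel_i0e(x) is the exponentially scaled modified Bessel function of
  // order zero: i0e(x) = exp(-|x|) * i0(x).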
  gpu_out.device(gpu_device) = gpu_in.bessel_i0e();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
                         gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 21; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_i1e()
{
  Tensor<Scalar, 1> in_x(21);
  Tensor<Scalar, 1> out(21);
  Tensor<Scalar, 1> expected_out(21);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_x_array(21);
  Array<Scalar, 1, Dynamic> expected_out_array(21);

  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;

  expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
      -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
      -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
      0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
      0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
      0.0875062221833;

  for (int i = 0; i < 21; ++i) {
    in_x(i) = in_x_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);

  gpu_out.device(gpu_device) = gpu_in.bessel_i1e();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
                         gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 21; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_igamma_der_a()
{
  Tensor<Scalar, 1> in_x(30);
  Tensor<Scalar, 1> in_a(30);
  Tensor<Scalar, 1> out(30);
  Tensor<Scalar, 1> expected_out(30);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_a_array(30);
  Array<Scalar, 1, Dynamic> in_x_array(30);
  Array<Scalar, 1, Dynamic> expected_out_array(30);

  // See special_functions.cpp for the Python code that generates the test data.

  in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
      1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
      100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;

  in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;

  expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
      -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
      -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
      -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
      -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
      -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
      -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
      -0.0125850719397, -0.00455500206958, -0.00476436993148;

  for (int i = 0; i < 30; ++i) {
    in_x(i) = in_x_array(i);
    in_a(i) = in_a_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  gpuMalloc((void**)(&d_a), bytes);
  gpuMalloc((void**)(&d_x), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
1450   gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
1451 
1452   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
1453                          gpu_device.stream()) == gpuSuccess);
1454   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
1455 
1456   for (int i = 0; i < 30; ++i) {
1457     VERIFY_IS_APPROX(out(i), expected_out(i));
1458   }
1459 
1460   gpuFree(d_a);
1461   gpuFree(d_x);
1462   gpuFree(d_out);
1463 }
1464 
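// gamma_sample_der_alpha(alpha, sample) is the derivative of a Gamma(alpha, 1)
// sample with respect to alpha, obtained by implicitly differentiating the CDF
// identity P(alpha, z) = u at a fixed uniform u (implicit reparameterization
// gradients):
//   dz/dalpha = -(d/dalpha P(alpha, z)) / p(z; alpha),
// where p(z; alpha) = exp((alpha - 1) * log(z) - z - lgamma(alpha)) is the
// Gamma(alpha, 1) density. The host-side sketch below illustrates that
// identity using the finite-difference helper above; it is an assumption for
// illustration, not part of the test harness.
template <typename Scalar>
Scalar gamma_sample_der_alpha_reference(Scalar alpha, Scalar sample) {
  // Evaluate the Gamma(alpha, 1) density in log space for stability.
  const Scalar pdf = Eigen::numext::exp(
      (alpha - Scalar(1)) * Eigen::numext::log(sample) - sample -
      Eigen::numext::lgamma(alpha));
  return -igamma_der_a_reference(alpha, sample) / pdf;
}
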
template <typename Scalar>
void test_gpu_gamma_sample_der_alpha()
{
  Tensor<Scalar, 1> in_alpha(30);
  Tensor<Scalar, 1> in_sample(30);
  Tensor<Scalar, 1> out(30);
  Tensor<Scalar, 1> expected_out(30);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_alpha_array(30);
  Array<Scalar, 1, Dynamic> in_sample_array(30);
  Array<Scalar, 1, Dynamic> expected_out_array(30);

  // See special_functions.cpp for the Python code that generates the test data.

  in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
      1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
      100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;

  in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;

  expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
      1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
      0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
      1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
      0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
      1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
      0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
      1.00106492525, 0.97734200649, 1.02198794179;

  for (int i = 0; i < 30; ++i) {
    in_alpha(i) = in_alpha_array(i);
    in_sample(i) = in_sample_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_alpha.size() * sizeof(Scalar);

  Scalar* d_alpha;
  Scalar* d_sample;
  Scalar* d_out;
  gpuMalloc((void**)(&d_alpha), bytes);
  gpuMalloc((void**)(&d_sample), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);

  gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 30; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_alpha);
  gpuFree(d_sample);
  gpuFree(d_out);
}

EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
{
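  // Each CALL_SUBTEST_N group is compiled only when the corresponding
  // EIGEN_TEST_PART_N macro is defined, which lets the build split this
  // suite into several smaller binaries.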
  CALL_SUBTEST_1(test_gpu_nullary());
  CALL_SUBTEST_1(test_gpu_elementwise_small());
  CALL_SUBTEST_1(test_gpu_elementwise());
  CALL_SUBTEST_1(test_gpu_props());
  CALL_SUBTEST_1(test_gpu_reduction());
  CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
  CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
  CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
#if !defined(EIGEN_USE_HIP)
  // These tests are disabled on HIP for now: they hang and need to be
  // investigated and fixed.
  CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
#endif

#if EIGEN_GPU_TEST_C99_MATH
  // std::erf, std::erfc, and so on were only added in C++11. We use them
  // as a golden reference to validate the results produced by Eigen. Therefore
  // we can only run these tests if we use a C++11 compiler.
  CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
  CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));

  CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
  CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));

  CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
  CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
  CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
  CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));

  CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
  // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
  CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f));  // GPU erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
  CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));

  CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
  CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
  CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
  CALL_SUBTEST_4(test_gpu_erf<double>(0.001));

  CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
  // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
  CALL_SUBTEST_4(test_gpu_erfc<double>(5.0));  // GPU erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
  CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));

#if !defined(EIGEN_USE_HIP)
  // These tests are disabled on HIP for now.
  CALL_SUBTEST_5(test_gpu_ndtri<float>());
  CALL_SUBTEST_5(test_gpu_ndtri<double>());

  CALL_SUBTEST_5(test_gpu_digamma<float>());
  CALL_SUBTEST_5(test_gpu_digamma<double>());

  CALL_SUBTEST_5(test_gpu_polygamma<float>());
  CALL_SUBTEST_5(test_gpu_polygamma<double>());

  CALL_SUBTEST_5(test_gpu_zeta<float>());
  CALL_SUBTEST_5(test_gpu_zeta<double>());
#endif

  CALL_SUBTEST_5(test_gpu_igamma<float>());
  CALL_SUBTEST_5(test_gpu_igammac<float>());

  CALL_SUBTEST_5(test_gpu_igamma<double>());
  CALL_SUBTEST_5(test_gpu_igammac<double>());

#if !defined(EIGEN_USE_HIP)
  // These tests are disabled on HIP for now.
  CALL_SUBTEST_6(test_gpu_betainc<float>());
  CALL_SUBTEST_6(test_gpu_betainc<double>());

  CALL_SUBTEST_6(test_gpu_i0e<float>());
  CALL_SUBTEST_6(test_gpu_i0e<double>());

  CALL_SUBTEST_6(test_gpu_i1e<float>());
  CALL_SUBTEST_6(test_gpu_i1e<double>());

  CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
  CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());

  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
#endif

#endif
}