// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015-2016 Gael Guennebaud <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
  #undef _GLIBCXX_ATOMIC_BUILTINS
  #undef _GLIBCXX_USE_INT128
#endif

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

#include "main.h"
#include "gpu_common.h"

// Check that dense modules can be properly parsed by nvcc
#include <Eigen/Dense>

// struct Foo{
//   EIGEN_DEVICE_FUNC
//   void operator()(int i, const float* mats, float* vecs) const {
//     using namespace Eigen;
//   //   Matrix3f M(data);
//   //   Vector3f x(data+9);
//   //   Map<Vector3f>(data+9) = M.inverse() * x;
//     Matrix3f M(mats+i/16);
//     Vector3f x(vecs+i*3);
//   //   using std::min;
//   //   using std::sqrt;
//     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() *  x) / x.x();
//     //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
//   }
// };

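// Coefficient-wise expressions: each thread reads three overlapping blocks from `in`
// and accumulates (in[0]*x1 + x2) .* x3 into its slice of `out`.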
template<typename T>
struct coeff_wise {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T x1(in+i);
    T x2(in+i+1);
    T x3(in+i+2);
    Map<T> res(out+i*T::MaxSizeAtCompileTime);

    res.array() += (in[0] * x1 + x2).array() * x3.array();
  }
};

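// Complex square root: thread 0 first evaluates cwiseSqrt() on a fixed vector of
// special inputs (signed zeros, infinities, NaNs); every thread then writes the
// cwiseSqrt() of its own input block after that special-value region.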
template<typename T>
struct complex_sqrt {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef typename T::Scalar ComplexType;
    typedef typename T::Scalar::value_type ValueType;
    const int num_special_inputs = 18;

    if (i == 0) {
      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
      SpecialInputs special_in;
      special_in.setZero();
      int idx = 0;
      special_in[idx++] = ComplexType(0, 0);
      special_in[idx++] = ComplexType(-0, 0);
      special_in[idx++] = ComplexType(0, -0);
      special_in[idx++] = ComplexType(-0, -0);
      // GCC's fallback sqrt implementation fails for inf inputs.
      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
      // clang includes the GCC header (which temporarily disables
      // _GLIBCXX_USE_C99_COMPLEX)
      #if !defined(_GLIBCXX_COMPLEX) || \
        (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
      const ValueType inf = std::numeric_limits<ValueType>::infinity();
      special_in[idx++] = ComplexType(1.0, inf);
      special_in[idx++] = ComplexType(nan, inf);
      special_in[idx++] = ComplexType(1.0, -inf);
      special_in[idx++] = ComplexType(nan, -inf);
      special_in[idx++] = ComplexType(-inf, 1.0);
      special_in[idx++] = ComplexType(inf, 1.0);
      special_in[idx++] = ComplexType(-inf, -1.0);
      special_in[idx++] = ComplexType(inf, -1.0);
      special_in[idx++] = ComplexType(-inf, nan);
      special_in[idx++] = ComplexType(inf, nan);
      #endif
      special_in[idx++] = ComplexType(1.0, nan);
      special_in[idx++] = ComplexType(nan, 1.0);
      special_in[idx++] = ComplexType(nan, -1.0);
      special_in[idx++] = ComplexType(nan, nan);

      Map<SpecialInputs> special_out(out);
      special_out = special_in.cwiseSqrt();
    }

    T x1(in + i);
    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
    res = x1.cwiseSqrt();
  }
};

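// Complex scalar and coefficient-wise operators: unary +/-, the four arithmetic
// operators (complex/complex and complex/real), compound assignments, and ==/!=.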
template<typename T>
struct complex_operators {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef typename T::Scalar ComplexType;
    typedef typename T::Scalar::value_type ValueType;
    const int num_scalar_operators = 24;
    const int num_vector_operators = 23;  // no unary + operator.
    int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);

    // Scalar operators.
    const ComplexType a = in[i];
    const ComplexType b = in[i + 1];

    out[out_idx++] = +a;
    out[out_idx++] = -a;

    out[out_idx++] = a + b;
    out[out_idx++] = a + numext::real(b);
    out[out_idx++] = numext::real(a) + b;
    out[out_idx++] = a - b;
    out[out_idx++] = a - numext::real(b);
    out[out_idx++] = numext::real(a) - b;
    out[out_idx++] = a * b;
    out[out_idx++] = a * numext::real(b);
    out[out_idx++] = numext::real(a) * b;
    out[out_idx++] = a / b;
    out[out_idx++] = a / numext::real(b);
    out[out_idx++] = numext::real(a) / b;

    out[out_idx] = a; out[out_idx++] += b;
    out[out_idx] = a; out[out_idx++] -= b;
    out[out_idx] = a; out[out_idx++] *= b;
    out[out_idx] = a; out[out_idx++] /= b;

    const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
    const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
    out[out_idx++] = (a == b ? true_value : false_value);
    out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
    out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
    out[out_idx++] = (a != b ? true_value : false_value);
    out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
    out[out_idx++] = (numext::real(a) != b ? true_value : false_value);

    // Vector versions.
    T x1(in + i);
    T x2(in + i + 1);
    const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
    const int size = T::MaxSizeAtCompileTime;
    int block_idx = 0;

    Map<VectorX<ComplexType>> res(out + out_idx, res_size);
    res.segment(block_idx, size) = -x1;
    block_idx += size;

    res.segment(block_idx, size) = x1 + x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 + x2.real();
    block_idx += size;
    res.segment(block_idx, size) = x1.real() + x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 - x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 - x2.real();
    block_idx += size;
    res.segment(block_idx, size) = x1.real() - x2;
    block_idx += size;
    res.segment(block_idx, size) = x1.array() * x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() * x2.real().array();
    block_idx += size;
    res.segment(block_idx, size) = x1.real().array() * x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() / x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() / x2.real().array();
    block_idx += size;
    res.segment(block_idx, size) = x1.real().array() / x2.array();
    block_idx += size;

    res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
    block_idx += size;

    const T true_vector = T::Constant(true_value);
    const T false_vector = T::Constant(false_value);
    res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
    block_idx += size;
    // Mixing types in equality comparison does not work.
    // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
    // block_idx += size;
    res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
    block_idx += size;
    // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
    // block_idx += size;
  }
};

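// Replication: x1.replicate(2,2) plus scaled colwise()/rowwise() replicates,
// written through dynamically-sized Maps over the output buffer.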
template<typename T>
struct replicate {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T x1(in+i);
    int step   = x1.size() * 4;
    int stride = 3 * step;

    typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
    MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
    MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
    MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
  }
};

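// Device-side dynamic allocation: copies the input through objects created with
// new/delete and new[]/delete[].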
template<typename T>
struct alloc_new_delete {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    int offset = 2*i*T::MaxSizeAtCompileTime;
    T* x = new T(in + offset);
    Eigen::Map<T> u(out + offset);
    u = *x;
    delete x;

    offset += T::MaxSizeAtCompileTime;
    T* y = new T[1];
    y[0] = T(in + offset);
    Eigen::Map<T> v(out + offset);
    v = y[0];
    delete[] y;
  }
};

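// Reductions: min/max coefficients, sum, product, norms, and colwise/rowwise
// partial reductions of a single input block.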
template<typename T>
struct redux {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    int N = 10;
    T x1(in+i);
    out[i*N+0] = x1.minCoeff();
    out[i*N+1] = x1.maxCoeff();
    out[i*N+2] = x1.sum();
    out[i*N+3] = x1.prod();
    out[i*N+4] = x1.matrix().squaredNorm();
    out[i*N+5] = x1.matrix().norm();
    out[i*N+6] = x1.colwise().sum().maxCoeff();
    out[i*N+7] = x1.rowwise().maxCoeff().sum();
    out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
  }
};

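// Matrix product: accumulates in[i] * x1 * x2 into a Map over the output.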
template<typename T1, typename T2>
struct prod_test {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
    T1 x1(in+i);
    T2 x2(in+i+1);
    Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
    res += in[i] * x1 * x2;
  }
};

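// Diagonal extraction: accumulates x1.diagonal() into a Map of type T2.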
template<typename T1, typename T2>
struct diagonal {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
  {
    using namespace Eigen;
    T1 x1(in+i);
    Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
    res += x1.diagonal();
  }
};

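// Self-adjoint eigenvalues via the closed-form computeDirect() path, on A = M*M.adjoint().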
template<typename T>
struct eigenvalues_direct {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    T M(in+i);
    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    T A = M*M.adjoint();
    SelfAdjointEigenSolver<T> eig;
    eig.computeDirect(A);
    res = eig.eigenvalues();
  }
};

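// Self-adjoint eigenvalues via the iterative compute() path, on A = M*M.adjoint().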
template<typename T>
struct eigenvalues {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    T M(in+i);
    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    T A = M*M.adjoint();
    SelfAdjointEigenSolver<T> eig;
    eig.compute(A);
    res = eig.eigenvalues();
  }
};

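// Fixed-size matrix inverse on device.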
template<typename T>
struct matrix_inverse {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T M(in+i);
    Map<T> res(out+i*T::MaxSizeAtCompileTime);
    res = M.inverse();
  }
};

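// Checks that numext::numeric_limits<float> (epsilon, max, min, infinity, quiet_NaN)
// is usable in device code.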
template<typename T>
struct numeric_limits_test {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    EIGEN_UNUSED_VARIABLE(in)
    int out_idx = i * 5;
    out[out_idx++] = numext::numeric_limits<float>::epsilon();
    out[out_idx++] = (numext::numeric_limits<float>::max)();
    out[out_idx++] = (numext::numeric_limits<float>::min)();
    out[out_idx++] = numext::numeric_limits<float>::infinity();
    out[out_idx++] = numext::numeric_limits<float>::quiet_NaN();
  }
};

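// Element-wise comparison that accepts exact equality, matching NaNs at the same
// position, or test_isApprox; used for outputs containing infs/NaNs.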
template<typename Type1, typename Type2>
bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
{
  if (a.rows() != b.rows()) {
    return false;
  }
  if (a.cols() != b.cols()) {
    return false;
  }
  for (Index r = 0; r < a.rows(); ++r) {
    for (Index c = 0; c < a.cols(); ++c) {
      if (a(r, c) != b(r, c)
          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
          && !test_isApprox(a(r, c), b(r, c))) {
        return false;
      }
    }
  }
  return true;
}

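// Runs the kernel on both CPU and GPU and compares the results with
// verifyIsApproxWithInfsNans (the comparison is only compiled on the host side).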
template<typename Kernel, typename Input, typename Output>
void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
{
  Output out_ref, out_gpu;
  #if !defined(EIGEN_GPU_COMPILE_PHASE)
  out_ref = out_gpu = out;
  #else
  EIGEN_UNUSED_VARIABLE(in);
  EIGEN_UNUSED_VARIABLE(out);
  #endif
  run_on_cpu (ker, n, in,  out_ref);
  run_on_gpu(ker, n, in, out_gpu);
  #if !defined(EIGEN_GPU_COMPILE_PHASE)
  verifyIsApproxWithInfsNans(out_ref, out_gpu);
  #endif
}

EIGEN_DECLARE_TEST(gpu_basic)
{
  ei_test_init_gpu();

  int nthreads = 100;
  Eigen::VectorXf in, out;
  Eigen::VectorXcf cfin, cfout;

  #if !defined(EIGEN_GPU_COMPILE_PHASE)
  int data_size = nthreads * 512;
  in.setRandom(data_size);
  out.setConstant(data_size, -1);
  cfin.setRandom(data_size);
  cfout.setConstant(data_size, -1);
  #endif

  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );

#if !defined(EIGEN_USE_HIP)
  // FIXME
  // These subtests result in a compile failure on the HIP platform
  //
  //  eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
  //           base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
  //           (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );

  // HIP does not support new/delete on device.
  CALL_SUBTEST( run_and_compare_to_gpu(alloc_new_delete<Vector3f>(), nthreads, in, out) );
#endif

  CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );

  // Test std::complex.
  CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );

  // numeric_limits
  CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );

#if defined(__NVCC__)
  // FIXME
  // These subtests compile only with nvcc and fail with HIPCC and clang-cuda
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
  typedef Matrix<float,6,6> Matrix6f;
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
#endif
}