xref: /aosp_15_r20/external/eigen/bench/tensors/tensor_contract_sycl_bench.cc (revision bf2c37156dfe67e5dfebd6d394bad8b2ab5804d4)
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016
5 // Mehdi Goli    Codeplay Software Ltd.
6 // Ralph Potter  Codeplay Software Ltd.
7 // Luke Iwanski  Codeplay Software Ltd.
8 // Contact: <[email protected]>
9 //
10 // This Source Code Form is subject to the terms of the Mozilla
11 // Public License v. 2.0. If a copy of the MPL was not distributed
12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
13 #ifndef EIGEN_BENCH_CONTRACT_SYCL
14 #define EIGEN_BENCH_CONTRACT_SYCL
15 #define EIGEN_TEST_NO_LONGDOUBLE
16 #define EIGEN_TEST_NO_COMPLEX
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #include <SYCL/sycl.hpp>
19 #include <fstream>
20 #include <iostream>
21 #include <chrono>
22 #include <ctime>
23 
24 #include <unsupported/Eigen/CXX11/Tensor>
25 
26 using Eigen::array;
27 using Eigen::SyclDevice;
28 using Eigen::Tensor;
29 using Eigen::TensorMap;
// File sink that receives a copy of every benchmark result line; the file
// "Result.txt" is opened when this global is constructed.
std::ofstream out{"Result.txt"};
31 
// Returns the current reading of std::chrono::system_clock. The benchmarks
// call this immediately before and after the timed region and hand both
// values to finalizeBenchmark().
std::chrono::time_point<std::chrono::system_clock> get_time(){
  return std::chrono::system_clock::now();
}
36 
37 template<typename Start, typename End, typename TensorIndex>
finalizeBenchmark(Start start,End end,TensorIndex m_,TensorIndex k_,TensorIndex n_,TensorIndex num_iters,std::string name)38 void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
39 
40   std::chrono::duration<double> elapsed_seconds = end-start;
41   std::cout <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
42   static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
43     out <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
44     static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
45 }
46 
47 // do a contraction which is equivalent to a matrix multiplication
48 template<typename T, typename Device, typename TensorIndex>
contraction(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)49 void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
50   T* a_;
51   T* b_;
52   T* c_;
53   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
54   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
55   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
56 
57   // Initialize the content of the memory pools to prevent asan from
58   // complaining.
59   device_.memset(a_, 12, m_ * k_ * sizeof(T));
60   device_.memset(b_, 23, k_ * n_ * sizeof(T));
61   device_.memset(c_, 31, m_ * n_ * sizeof(T));
62 
63   Eigen::array<TensorIndex, 2> sizeA;
64   sizeA[0] = m_;
65   sizeA[1] = k_;
66   Eigen::array<TensorIndex, 2> sizeB;
67   sizeB[0] = k_;
68   sizeB[1] = n_;
69   Eigen::array<TensorIndex, 2> sizeC;
70   sizeC[0] = m_;
71   sizeC[1] = n_;
72 
73   const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
74   const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
75   TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
76 
77   typedef typename Tensor<T, 2>::DimensionPair DimPair;
78   Eigen::array<DimPair, 1> dims;
79   dims[0] = DimPair(1, 0);
80 #ifdef EIGEN_USE_SYCL // warmup for sycl
81   for (int iter = 0; iter < 10; ++iter) {
82     C.device(device_) = A.contract(B, dims);
83    }
84 #endif
85   auto start = get_time();
86   for (int iter = 0; iter < num_iters; ++iter) {
87     C.device(device_) = A.contract(B, dims);
88   }
89  auto end = get_time();
90   // Record the number of FLOPs executed per second (size_ multiplications and
91   // additions for each value in the resulting tensor)
92   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
93   device_.deallocate(a_);
94   device_.deallocate(b_);
95   device_.deallocate(c_);
96   device_.synchronize();
97 }
98 
99 
100 
101 // do a contraction which is equivalent to a matrix multiplication
102 template<typename T, typename Device, typename TensorIndex>
contractionRowMajor(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)103 void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
104   T* a_;
105   T* b_;
106   T* c_;
107   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
108   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
109   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
110 
111   // Initialize the content of the memory pools to prevent asan from
112   // complaining.
113   device_.memset(a_, 12, m_ * k_ * sizeof(T));
114   device_.memset(b_, 23, k_ * n_ * sizeof(T));
115   device_.memset(c_, 31, m_ * n_ * sizeof(T));
116 
117   Eigen::array<TensorIndex, 2> sizeA;
118   sizeA[0] = m_;
119   sizeA[1] = k_;
120   Eigen::array<TensorIndex, 2> sizeB;
121   sizeB[0] = k_;
122   sizeB[1] = n_;
123   Eigen::array<TensorIndex, 2> sizeC;
124   sizeC[0] = m_;
125   sizeC[1] = n_;
126 
127   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
128   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
129   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
130 
131   typedef typename Tensor<T, 2>::DimensionPair DimPair;
132   Eigen::array<DimPair, 1> dims;
133   dims[0] = DimPair(1, 0);
134 #ifdef EIGEN_USE_SYCL // warmup for sycl
135   for (int iter = 0; iter < 10; ++iter) {
136     C.device(device_) = A.contract(B, dims);
137    }
138 #endif
139   auto start = get_time();
140   for (int iter = 0; iter < num_iters; ++iter) {
141     C.device(device_) = A.contract(B, dims);
142   }
143   auto end = get_time();
144   // Record the number of FLOPs executed per second (size_ multiplications and
145   // additions for each value in the resulting tensor)
146   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
147   device_.deallocate(a_);
148   device_.deallocate(b_);
149   device_.deallocate(c_);
150   device_.synchronize();
151 }
152 
153 
154 template<typename T, typename Device, typename TensorIndex>
contractionAT(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)155 void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
156   T* a_;
157   T* b_;
158   T* c_;
159   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
160   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
161   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
162 
163   // Initialize the content of the memory pools to prevent asan from
164   // complaining.
165   device_.memset(a_, 12, m_ * k_ * sizeof(T));
166   device_.memset(b_, 23, k_ * n_ * sizeof(T));
167   device_.memset(c_, 31, m_ * n_ * sizeof(T));
168   Eigen::array<TensorIndex, 2> sizeA;
169   sizeA[0] = k_;
170   sizeA[1] = m_;
171   Eigen::array<TensorIndex, 2> sizeB;
172   sizeB[0] = k_;
173   sizeB[1] = n_;
174   Eigen::array<TensorIndex, 2> sizeC;
175   sizeC[0] = m_;
176   sizeC[1] = n_;
177 
178   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
179   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
180   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
181 
182   typedef typename Tensor<T, 2>::DimensionPair DimPair;
183   Eigen::array<DimPair, 1> dims;
184   dims[0] = DimPair(0, 0);
185 #ifdef EIGEN_USE_SYCL // warmup for sycl
186   for (int iter = 0; iter < 10; ++iter) {
187     C.device(device_) = A.contract(B, dims);
188    }
189 #endif
190   auto start = get_time();
191   for (int iter = 0; iter < num_iters; ++iter) {
192     C.device(device_) = A.contract(B, dims);
193   }
194   auto end = get_time();
195   // Record the number of FLOPs executed per second (size_ multiplications and
196   // additions for each value in the resulting tensor)
197   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
198   device_.deallocate(a_);
199   device_.deallocate(b_);
200   device_.deallocate(c_);
201   device_.synchronize();
202 
203 }
204 
205 template<typename T, typename Device, typename TensorIndex>
contractionBT(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)206 void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
207   T* a_;
208   T* b_;
209   T* c_;
210   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
211   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
212   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
213 
214   // Initialize the content of the memory pools to prevent asan from
215   // complaining.
216   device_.memset(a_, 12, m_ * k_ * sizeof(T));
217   device_.memset(b_, 23, k_ * n_ * sizeof(T));
218   device_.memset(c_, 31, m_ * n_ * sizeof(T));
219 
220   Eigen::array<TensorIndex, 2> sizeA;
221   sizeA[0] = m_;
222   sizeA[1] = k_;
223   Eigen::array<TensorIndex, 2> sizeB;
224   sizeB[0] = n_;
225   sizeB[1] = k_;
226   Eigen::array<TensorIndex, 2> sizeC;
227   sizeC[0] = m_;
228   sizeC[1] = n_;
229 
230   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
231   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
232   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
233 
234   typedef typename Tensor<T, 2>::DimensionPair DimPair;
235   Eigen::array<DimPair, 1> dims;
236   dims[0] = DimPair(1, 1);
237 #ifdef EIGEN_USE_SYCL // warmup for sycl
238   for (int iter = 0; iter < 10; ++iter) {
239     C.device(device_) = A.contract(B, dims);
240    }
241 #endif
242   auto start = get_time();
243   for (int iter = 0; iter < num_iters; ++iter) {
244     C.device(device_) = A.contract(B, dims);
245   }
246   auto end = get_time();
247   // Record the number of FLOPs executed per second (size_ multiplications and
248   // additions for each value in the resulting tensor)
249   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
250   device_.deallocate(a_);
251   device_.deallocate(b_);
252   device_.deallocate(c_);
253   device_.synchronize();
254 
255 }
256 
257 template<typename T, typename Device, typename TensorIndex>
contractionABT(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)258 void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
259   T* a_;
260   T* b_;
261   T* c_;
262   a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
263   b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
264   c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
265 
266   // Initialize the content of the memory pools to prevent asan from
267   // complaining.
268   device_.memset(a_, 12, m_ * k_ * sizeof(T));
269   device_.memset(b_, 23, k_ * n_ * sizeof(T));
270   device_.memset(c_, 31, m_ * n_ * sizeof(T));
271 
272   Eigen::array<TensorIndex, 2> sizeA;
273   sizeA[0] = k_;
274   sizeA[1] = m_;
275   Eigen::array<TensorIndex, 2> sizeB;
276   sizeB[0] = n_;
277   sizeB[1] = k_;
278   Eigen::array<TensorIndex, 2> sizeC;
279   sizeC[0] = m_;
280   sizeC[1] = n_;
281 
282   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
283   const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
284   TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
285 
286   typedef typename Tensor<T, 2>::DimensionPair DimPair;
287   Eigen::array<DimPair, 1> dims;
288   dims[0] = DimPair(0, 1);
289 #ifdef EIGEN_USE_SYCL // warmup for sycl
290   for (int iter = 0; iter < 10; ++iter) {
291     C.device(device_) = A.contract(B, dims);
292    }
293 #endif
294   auto start = get_time();
295   for (int iter = 0; iter < num_iters; ++iter) {
296     C.device(device_) = A.contract(B, dims);
297   }
298   auto end = get_time();
299   // Record the number of FLOPs executed per second (size_ multiplications and
300   // additions for each value in the resulting tensor)
301   finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
302   device_.deallocate(a_);
303   device_.deallocate(b_);
304   device_.deallocate(c_);
305   device_.synchronize();
306 }
307 
main()308 int main() {
309   cl::sycl::gpu_selector selector;
310   Eigen::QueueInterface queue(selector);
311   Eigen::SyclDevice device(&queue);
312   int64_t num_iters =20;
313   for(int64_t m = 32; m <= 4096; m *= 2)
314     for(int64_t k = 32; k <= 4096; k *= 2)
315       for(int64_t n = 32; n <= 4096; n*= 2){
316         (contraction<float>(device, num_iters, m, k, n));
317         (contractionRowMajor<float>(device, num_iters, m, k, n));
318         (contractionAT<float>(device, num_iters, m, k, n));
319         (contractionBT<float>(device, num_iters, m, k, n));
320         (contractionABT<float>(device, num_iters, m, k, n));
321       }
322   return 0;
323   }
324 
325 #endif // EIGEN_BENCH_CONTRACT_SYCL
326