1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016
5 // Mehdi Goli Codeplay Software Ltd.
6 // Ralph Potter Codeplay Software Ltd.
7 // Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
9 //
10 // This Source Code Form is subject to the terms of the Mozilla
11 // Public License v. 2.0. If a copy of the MPL was not distributed
12 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
13 #ifndef EIGEN_BENCH_CONTRACT_SYCL
14 #define EIGEN_BENCH_CONTRACT_SYCL
15 #define EIGEN_TEST_NO_LONGDOUBLE
16 #define EIGEN_TEST_NO_COMPLEX
17 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
18 #include <SYCL/sycl.hpp>
19 #include <fstream>
20 #include <iostream>
21 #include <chrono>
22 #include <ctime>
23
24 #include <unsupported/Eigen/CXX11/Tensor>
25
26 using Eigen::array;
27 using Eigen::SyclDevice;
28 using Eigen::Tensor;
29 using Eigen::TensorMap;
// File stream for "Result.txt"; finalizeBenchmark() writes each result line
// to it in addition to std::cout.
std::ofstream out{"Result.txt"};
31
// Returns the current std::chrono::system_clock time point. The benchmarks
// call it immediately before and after the timed region.
std::chrono::time_point<std::chrono::system_clock> get_time(){
  return std::chrono::system_clock::now();
}
36
37 template<typename Start, typename End, typename TensorIndex>
finalizeBenchmark(Start start,End end,TensorIndex m_,TensorIndex k_,TensorIndex n_,TensorIndex num_iters,std::string name)38 void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
39
40 std::chrono::duration<double> elapsed_seconds = end-start;
41 std::cout <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
42 static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
43 out <<"Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
44 static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
45 }
46
47 // do a contraction which is equivalent to a matrix multiplication
48 template<typename T, typename Device, typename TensorIndex>
contraction(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)49 void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
50 T* a_;
51 T* b_;
52 T* c_;
53 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
54 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
55 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
56
57 // Initialize the content of the memory pools to prevent asan from
58 // complaining.
59 device_.memset(a_, 12, m_ * k_ * sizeof(T));
60 device_.memset(b_, 23, k_ * n_ * sizeof(T));
61 device_.memset(c_, 31, m_ * n_ * sizeof(T));
62
63 Eigen::array<TensorIndex, 2> sizeA;
64 sizeA[0] = m_;
65 sizeA[1] = k_;
66 Eigen::array<TensorIndex, 2> sizeB;
67 sizeB[0] = k_;
68 sizeB[1] = n_;
69 Eigen::array<TensorIndex, 2> sizeC;
70 sizeC[0] = m_;
71 sizeC[1] = n_;
72
73 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
74 const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
75 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
76
77 typedef typename Tensor<T, 2>::DimensionPair DimPair;
78 Eigen::array<DimPair, 1> dims;
79 dims[0] = DimPair(1, 0);
80 #ifdef EIGEN_USE_SYCL // warmup for sycl
81 for (int iter = 0; iter < 10; ++iter) {
82 C.device(device_) = A.contract(B, dims);
83 }
84 #endif
85 auto start = get_time();
86 for (int iter = 0; iter < num_iters; ++iter) {
87 C.device(device_) = A.contract(B, dims);
88 }
89 auto end = get_time();
90 // Record the number of FLOPs executed per second (size_ multiplications and
91 // additions for each value in the resulting tensor)
92 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
93 device_.deallocate(a_);
94 device_.deallocate(b_);
95 device_.deallocate(c_);
96 device_.synchronize();
97 }
98
99
100
101 // do a contraction which is equivalent to a matrix multiplication
102 template<typename T, typename Device, typename TensorIndex>
contractionRowMajor(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)103 void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
104 T* a_;
105 T* b_;
106 T* c_;
107 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
108 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
109 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
110
111 // Initialize the content of the memory pools to prevent asan from
112 // complaining.
113 device_.memset(a_, 12, m_ * k_ * sizeof(T));
114 device_.memset(b_, 23, k_ * n_ * sizeof(T));
115 device_.memset(c_, 31, m_ * n_ * sizeof(T));
116
117 Eigen::array<TensorIndex, 2> sizeA;
118 sizeA[0] = m_;
119 sizeA[1] = k_;
120 Eigen::array<TensorIndex, 2> sizeB;
121 sizeB[0] = k_;
122 sizeB[1] = n_;
123 Eigen::array<TensorIndex, 2> sizeC;
124 sizeC[0] = m_;
125 sizeC[1] = n_;
126
127 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
128 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
129 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
130
131 typedef typename Tensor<T, 2>::DimensionPair DimPair;
132 Eigen::array<DimPair, 1> dims;
133 dims[0] = DimPair(1, 0);
134 #ifdef EIGEN_USE_SYCL // warmup for sycl
135 for (int iter = 0; iter < 10; ++iter) {
136 C.device(device_) = A.contract(B, dims);
137 }
138 #endif
139 auto start = get_time();
140 for (int iter = 0; iter < num_iters; ++iter) {
141 C.device(device_) = A.contract(B, dims);
142 }
143 auto end = get_time();
144 // Record the number of FLOPs executed per second (size_ multiplications and
145 // additions for each value in the resulting tensor)
146 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
147 device_.deallocate(a_);
148 device_.deallocate(b_);
149 device_.deallocate(c_);
150 device_.synchronize();
151 }
152
153
154 template<typename T, typename Device, typename TensorIndex>
contractionAT(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)155 void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
156 T* a_;
157 T* b_;
158 T* c_;
159 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
160 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
161 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
162
163 // Initialize the content of the memory pools to prevent asan from
164 // complaining.
165 device_.memset(a_, 12, m_ * k_ * sizeof(T));
166 device_.memset(b_, 23, k_ * n_ * sizeof(T));
167 device_.memset(c_, 31, m_ * n_ * sizeof(T));
168 Eigen::array<TensorIndex, 2> sizeA;
169 sizeA[0] = k_;
170 sizeA[1] = m_;
171 Eigen::array<TensorIndex, 2> sizeB;
172 sizeB[0] = k_;
173 sizeB[1] = n_;
174 Eigen::array<TensorIndex, 2> sizeC;
175 sizeC[0] = m_;
176 sizeC[1] = n_;
177
178 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
179 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
180 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
181
182 typedef typename Tensor<T, 2>::DimensionPair DimPair;
183 Eigen::array<DimPair, 1> dims;
184 dims[0] = DimPair(0, 0);
185 #ifdef EIGEN_USE_SYCL // warmup for sycl
186 for (int iter = 0; iter < 10; ++iter) {
187 C.device(device_) = A.contract(B, dims);
188 }
189 #endif
190 auto start = get_time();
191 for (int iter = 0; iter < num_iters; ++iter) {
192 C.device(device_) = A.contract(B, dims);
193 }
194 auto end = get_time();
195 // Record the number of FLOPs executed per second (size_ multiplications and
196 // additions for each value in the resulting tensor)
197 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
198 device_.deallocate(a_);
199 device_.deallocate(b_);
200 device_.deallocate(c_);
201 device_.synchronize();
202
203 }
204
205 template<typename T, typename Device, typename TensorIndex>
contractionBT(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)206 void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
207 T* a_;
208 T* b_;
209 T* c_;
210 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
211 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
212 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
213
214 // Initialize the content of the memory pools to prevent asan from
215 // complaining.
216 device_.memset(a_, 12, m_ * k_ * sizeof(T));
217 device_.memset(b_, 23, k_ * n_ * sizeof(T));
218 device_.memset(c_, 31, m_ * n_ * sizeof(T));
219
220 Eigen::array<TensorIndex, 2> sizeA;
221 sizeA[0] = m_;
222 sizeA[1] = k_;
223 Eigen::array<TensorIndex, 2> sizeB;
224 sizeB[0] = n_;
225 sizeB[1] = k_;
226 Eigen::array<TensorIndex, 2> sizeC;
227 sizeC[0] = m_;
228 sizeC[1] = n_;
229
230 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
231 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
232 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
233
234 typedef typename Tensor<T, 2>::DimensionPair DimPair;
235 Eigen::array<DimPair, 1> dims;
236 dims[0] = DimPair(1, 1);
237 #ifdef EIGEN_USE_SYCL // warmup for sycl
238 for (int iter = 0; iter < 10; ++iter) {
239 C.device(device_) = A.contract(B, dims);
240 }
241 #endif
242 auto start = get_time();
243 for (int iter = 0; iter < num_iters; ++iter) {
244 C.device(device_) = A.contract(B, dims);
245 }
246 auto end = get_time();
247 // Record the number of FLOPs executed per second (size_ multiplications and
248 // additions for each value in the resulting tensor)
249 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
250 device_.deallocate(a_);
251 device_.deallocate(b_);
252 device_.deallocate(c_);
253 device_.synchronize();
254
255 }
256
257 template<typename T, typename Device, typename TensorIndex>
contractionABT(const Device & device_,TensorIndex num_iters,TensorIndex m_,TensorIndex k_,TensorIndex n_)258 void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
259 T* a_;
260 T* b_;
261 T* c_;
262 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
263 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
264 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
265
266 // Initialize the content of the memory pools to prevent asan from
267 // complaining.
268 device_.memset(a_, 12, m_ * k_ * sizeof(T));
269 device_.memset(b_, 23, k_ * n_ * sizeof(T));
270 device_.memset(c_, 31, m_ * n_ * sizeof(T));
271
272 Eigen::array<TensorIndex, 2> sizeA;
273 sizeA[0] = k_;
274 sizeA[1] = m_;
275 Eigen::array<TensorIndex, 2> sizeB;
276 sizeB[0] = n_;
277 sizeB[1] = k_;
278 Eigen::array<TensorIndex, 2> sizeC;
279 sizeC[0] = m_;
280 sizeC[1] = n_;
281
282 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
283 const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
284 TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
285
286 typedef typename Tensor<T, 2>::DimensionPair DimPair;
287 Eigen::array<DimPair, 1> dims;
288 dims[0] = DimPair(0, 1);
289 #ifdef EIGEN_USE_SYCL // warmup for sycl
290 for (int iter = 0; iter < 10; ++iter) {
291 C.device(device_) = A.contract(B, dims);
292 }
293 #endif
294 auto start = get_time();
295 for (int iter = 0; iter < num_iters; ++iter) {
296 C.device(device_) = A.contract(B, dims);
297 }
298 auto end = get_time();
299 // Record the number of FLOPs executed per second (size_ multiplications and
300 // additions for each value in the resulting tensor)
301 finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
302 device_.deallocate(a_);
303 device_.deallocate(b_);
304 device_.deallocate(c_);
305 device_.synchronize();
306 }
307
main()308 int main() {
309 cl::sycl::gpu_selector selector;
310 Eigen::QueueInterface queue(selector);
311 Eigen::SyclDevice device(&queue);
312 int64_t num_iters =20;
313 for(int64_t m = 32; m <= 4096; m *= 2)
314 for(int64_t k = 32; k <= 4096; k *= 2)
315 for(int64_t n = 32; n <= 4096; n*= 2){
316 (contraction<float>(device, num_iters, m, k, n));
317 (contractionRowMajor<float>(device, num_iters, m, k, n));
318 (contractionAT<float>(device, num_iters, m, k, n));
319 (contractionBT<float>(device, num_iters, m, k, n));
320 (contractionABT<float>(device, num_iters, m, k, n));
321 }
322 return 0;
323 }
324
325 #endif // EIGEN_BENCH_CONTRACT_SYCL
326