xref: /aosp_15_r20/external/clpeak/results/AMD_Accelerated_Parallel_Processing/AMD_MI250X.log (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
1Platform: AMD Accelerated Parallel Processing
2  Device: gfx90a:sramecc+:xnack-
3    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
4    Compute units   : 110
5    Clock frequency : 1700 MHz
6
7    Global memory bandwidth (GBPS)
8      float   : 1179.79
9      float2  : 1227.22
10      float4  : 1208.34
11      float8  : 1213.73
12      float16 : 1300.87
13
14    Single-precision compute (GFLOPS)
15      float   : 21413.00
16      float2  : 39938.10
17      float4  : 39604.64
18      float8  : 38800.38
19      float16 : 37660.73
20
21    Half-precision compute (GFLOPS)
22      half   : 10970.78
23      half2  : 41754.73
24      half4  : 41656.55
25      half8  : 40278.59
26      half16 : 39885.05
27
28    Double-precision compute (GFLOPS)
29      double   : 19930.64
30      double2  : 19652.95
31      double4  : 19356.85
32      double8  : 19103.25
33      double16 : 18571.98
34
35    Integer compute (GIOPS)
36      int   : 10125.50
37      int2  : 10104.05
38      int4  : 10100.88
39      int8  : 10051.65
40      int16 : 9970.70
41
42    Integer compute Fast 24bit (GIOPS)
43      int   : 19718.57
44      int2  : 17932.38
45      int4  : 18125.28
46      int8  : 18024.33
47      int16 : 18176.70
48
49    Transfer bandwidth (GBPS)
50      enqueueWriteBuffer              : 21.58
51      enqueueReadBuffer               : 21.59
52      enqueueWriteBuffer non-blocking : 21.60
53      enqueueReadBuffer non-blocking  : 21.60
54      enqueueMapBuffer(for read)      : 156180.62
55        memcpy from mapped ptr        : 19.72
56      enqueueUnmap(after write)       : 277094.66
57        memcpy to mapped ptr          : 20.87
58
59    Kernel launch latency : 8.00 us
60
61  Device: gfx90a:sramecc+:xnack-
62    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
63    Compute units   : 110
64    Clock frequency : 1700 MHz
65
66    Global memory bandwidth (GBPS)
67      float   : 1185.90
68      float2  : 1238.94
69      float4  : 1209.60
70      float8  : 1224.59
71      float16 : 1302.42
72
73    Single-precision compute (GFLOPS)
74      float   : 21420.44
75      float2  : 39925.50
76      float4  : 39595.35
77      float8  : 38763.88
78      float16 : 37618.76
79
80    Half-precision compute (GFLOPS)
81      half   : 10976.96
82      half2  : 41757.19
83      half4  : 41648.72
84      half8  : 40250.00
85      half16 : 39895.61
86
87    Double-precision compute (GFLOPS)
88      double   : 19897.73
89      double2  : 19622.26
90      double4  : 19330.87
91      double8  : 19104.49
92      double16 : 18631.75
93
94    Integer compute (GIOPS)
95      int   : 10113.82
96      int2  : 10092.30
97      int4  : 10091.36
98      int8  : 10044.96
99      int16 : 9961.98
100
101    Integer compute Fast 24bit (GIOPS)
102      int   : 19702.34
103      int2  : 18088.27
104      int4  : 18312.63
105      int8  : 18194.06
106      int16 : 18352.56
107
108    Transfer bandwidth (GBPS)
109      enqueueWriteBuffer              : 21.53
110      enqueueReadBuffer               : 21.63
111      enqueueWriteBuffer non-blocking : 21.54
112      enqueueReadBuffer non-blocking  : 21.64
113      enqueueMapBuffer(for read)      : 461824.41
114        memcpy from mapped ptr        : 21.07
115      enqueueUnmap(after write)       : 913822.88
116        memcpy to mapped ptr          : 20.99
117
118    Kernel launch latency : 4.27 us
119
120  Device: gfx90a:sramecc+:xnack-
121    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
122    Compute units   : 110
123    Clock frequency : 1700 MHz
124
125    Global memory bandwidth (GBPS)
126      float   : 1174.45
127      float2  : 1221.72
128      float4  : 1201.20
129      float8  : 1211.72
130      float16 : 1293.81
131
132    Single-precision compute (GFLOPS)
133      float   : 21380.57
134      float2  : 39810.21
135      float4  : 39470.96
136      float8  : 38675.24
137      float16 : 37514.80
138
139    Half-precision compute (GFLOPS)
140      half   : 10948.25
141      half2  : 41619.61
142      half4  : 41536.67
143      half8  : 40142.61
144      half16 : 39767.54
145
146    Double-precision compute (GFLOPS)
147      double   : 19834.47
148      double2  : 19554.68
149      double4  : 19282.27
150      double8  : 19036.95
151      double16 : 18534.87
152
153    Integer compute (GIOPS)
154      int   : 10091.64
155      int2  : 10070.56
156      int4  : 10066.62
157      int8  : 10020.02
158      int16 : 9938.49
159
160    Integer compute Fast 24bit (GIOPS)
161      int   : 19645.32
162      int2  : 18606.31
163      int4  : 19032.35
164      int8  : 18958.12
165      int16 : 19097.28
166
167    Transfer bandwidth (GBPS)
168      enqueueWriteBuffer              : 21.53
169      enqueueReadBuffer               : 21.67
170      enqueueWriteBuffer non-blocking : 21.57
171      enqueueReadBuffer non-blocking  : 21.69
172      enqueueMapBuffer(for read)      : 155614.75
173        memcpy from mapped ptr        : 20.97
174      enqueueUnmap(after write)       : 227246.94
175        memcpy to mapped ptr          : 21.14
176
177    Kernel launch latency : 8.17 us
178
179  Device: gfx90a:sramecc+:xnack-
180    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
181    Compute units   : 110
182    Clock frequency : 1700 MHz
183
184    Global memory bandwidth (GBPS)
185      float   : 1177.58
186      float2  : 1223.32
187      float4  : 1204.98
188      float8  : 1226.63
189      float16 : 1300.43
190
191    Single-precision compute (GFLOPS)
192      float   : 21301.46
193      float2  : 39667.60
194      float4  : 39336.00
195      float8  : 38468.66
196      float16 : 37332.00
197
198    Half-precision compute (GFLOPS)
199      half   : 10913.78
200      half2  : 41499.94
201      half4  : 41410.21
202      half8  : 39973.47
203      half16 : 39625.46
204
205    Double-precision compute (GFLOPS)
206      double   : 19754.41
207      double2  : 19440.96
208      double4  : 19183.10
209      double8  : 18917.43
210      double16 : 18492.11
211
212    Integer compute (GIOPS)
213      int   : 10034.43
214      int2  : 10009.34
215      int4  : 10006.12
216      int8  : 9946.46
217      int16 : 9878.97
218
219    Integer compute Fast 24bit (GIOPS)
220      int   : 19528.17
221      int2  : 18160.21
222      int4  : 18367.50
223      int8  : 18256.49
224      int16 : 18416.67
225
226    Transfer bandwidth (GBPS)
227      enqueueWriteBuffer              : 21.37
228      enqueueReadBuffer               : 21.19
229      enqueueWriteBuffer non-blocking : 20.88
230      enqueueReadBuffer non-blocking  : 21.19
231      enqueueMapBuffer(for read)      : 161464.92
232        memcpy from mapped ptr        : 20.90
233      enqueueUnmap(after write)       : 315806.41
234        memcpy to mapped ptr          : 20.86
235
236    Kernel launch latency : 7.25 us
237
238  Device: gfx90a:sramecc+:xnack-
239    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
240    Compute units   : 110
241    Clock frequency : 1700 MHz
242
243    Global memory bandwidth (GBPS)
244      float   : 1170.18
245      float2  : 1214.69
246      float4  : 1191.14
247      float8  : 1208.23
248      float16 : 1296.77
249
250    Single-precision compute (GFLOPS)
251      float   : 21311.58
252      float2  : 39706.50
253      float4  : 39360.48
254      float8  : 38579.23
255      float16 : 37406.09
256
257    Half-precision compute (GFLOPS)
258      half   : 10913.10
259      half2  : 41509.91
260      half4  : 41431.52
261      half8  : 40039.86
262      half16 : 39658.05
263
264    Double-precision compute (GFLOPS)
265      double   : 19787.50
266      double2  : 19511.61
267      double4  : 19233.70
268      double8  : 18999.39
269      double16 : 18509.31
270
271    Integer compute (GIOPS)
272      int   : 10066.04
273      int2  : 10040.26
274      int4  : 10039.26
275      int8  : 9991.25
276      int16 : 9907.94
277
278    Integer compute Fast 24bit (GIOPS)
279      int   : 19589.06
280      int2  : 17882.33
281      int4  : 18117.22
282      int8  : 17996.68
283      int16 : 18151.47
284
285    Transfer bandwidth (GBPS)
286      enqueueWriteBuffer              : 21.52
287      enqueueReadBuffer               : 21.63
288      enqueueWriteBuffer non-blocking : 21.52
289      enqueueReadBuffer non-blocking  : 21.61
290      enqueueMapBuffer(for read)      : 141748.09
291        memcpy from mapped ptr        : 20.94
292      enqueueUnmap(after write)       : 308990.47
293        memcpy to mapped ptr          : 20.93
294
295    Kernel launch latency : 8.05 us
296
297  Device: gfx90a:sramecc+:xnack-
298    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
299    Compute units   : 110
300    Clock frequency : 1700 MHz
301
302    Global memory bandwidth (GBPS)
303      float   : 1169.76
304      float2  : 1213.80
305      float4  : 1187.98
306      float8  : 1217.86
307      float16 : 1294.37
308
309    Single-precision compute (GFLOPS)
310      float   : 21245.70
311      float2  : 39548.50
312      float4  : 39240.17
313      float8  : 38408.61
314      float16 : 37265.25
315
316    Half-precision compute (GFLOPS)
317      half   : 10890.11
318      half2  : 41404.17
319      half4  : 41318.71
320      half8  : 39921.00
321      half16 : 39556.89
322
323    Double-precision compute (GFLOPS)
324      double   : 19742.52
325      double2  : 19435.41
326      double4  : 19145.99
327      double8  : 18843.99
328      double16 : 18459.36
329
330    Integer compute (GIOPS)
331      int   : 10032.01
332      int2  : 10008.55
333      int4  : 10007.31
334      int8  : 9960.55
335      int16 : 9876.18
336
337    Integer compute Fast 24bit (GIOPS)
338      int   : 18719.45
339      int2  : 17390.24
340      int4  : 17510.55
341      int8  : 17401.34
342      int16 : 17552.45
343
344    Transfer bandwidth (GBPS)
345      enqueueWriteBuffer              : 21.54
346      enqueueReadBuffer               : 21.65
347      enqueueWriteBuffer non-blocking : 21.54
348      enqueueReadBuffer non-blocking  : 21.66
349      enqueueMapBuffer(for read)      : 149650.44
350        memcpy from mapped ptr        : 21.01
351      enqueueUnmap(after write)       : 325376.31
352        memcpy to mapped ptr          : 21.00
353
354    Kernel launch latency : 7.33 us
355
356  Device: gfx90a:sramecc+:xnack-
357    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
358    Compute units   : 110
359    Clock frequency : 1700 MHz
360
361    Global memory bandwidth (GBPS)
362      float   : 1170.89
363      float2  : 1215.53
364      float4  : 1188.94
365      float8  : 1204.54
366      float16 : 1298.45
367
368    Single-precision compute (GFLOPS)
369      float   : 21351.45
370      float2  : 39773.80
371      float4  : 39451.41
372      float8  : 38628.86
373      float16 : 37465.02
374
375    Half-precision compute (GFLOPS)
376      half   : 10941.41
377      half2  : 41605.68
378      half4  : 41511.61
379      half8  : 40103.77
380      half16 : 39742.34
381
382    Double-precision compute (GFLOPS)
383      double   : 19825.14
384      double2  : 19560.73
385      double4  : 19261.09
386      double8  : 19017.74
387      double16 : 18557.00
388
389    Integer compute (GIOPS)
390      int   : 10079.70
391      int2  : 10056.16
392      int4  : 10057.47
393      int8  : 10011.24
394      int16 : 9925.29
395
396    Integer compute Fast 24bit (GIOPS)
397      int   : 19617.70
398      int2  : 17845.05
399      int4  : 18055.83
400      int8  : 17939.91
401      int16 : 18095.48
402
403    Transfer bandwidth (GBPS)
404      enqueueWriteBuffer              : 21.40
405      enqueueReadBuffer               : 21.62
406      enqueueWriteBuffer non-blocking : 21.41
407      enqueueReadBuffer non-blocking  : 21.63
408      enqueueMapBuffer(for read)      : 421075.22
409        memcpy from mapped ptr        : 21.02
410      enqueueUnmap(after write)       : 976128.88
411        memcpy to mapped ptr          : 20.96
412
413    Kernel launch latency : 3.67 us
414
415  Device: gfx90a:sramecc+:xnack-
416    Driver version  : 3452.0 (HSA1.1,LC) (Linux x64)
417    Compute units   : 110
418    Clock frequency : 1700 MHz
419
420    Global memory bandwidth (GBPS)
421      float   : 1174.10
422      float2  : 1215.97
423      float4  : 1190.07
424      float8  : 1223.02
425      float16 : 1298.92
426
427    Single-precision compute (GFLOPS)
428      float   : 21388.05
429      float2  : 39880.34
430      float4  : 39555.57
431      float8  : 38736.33
432      float16 : 37604.98
433
434    Half-precision compute (GFLOPS)
435      half   : 10959.79
436      half2  : 41704.85
437      half4  : 41598.84
438      half8  : 40212.77
439      half16 : 39851.18
440
441    Double-precision compute (GFLOPS)
442      double   : 19878.08
443      double2  : 19615.96
444      double4  : 19317.80
445      double8  : 19092.55
446      double16 : 18630.57
447
448    Integer compute (GIOPS)
449      int   : 10105.66
450      int2  : 10084.98
451      int4  : 10081.22
452      int8  : 10034.06
453      int16 : 9952.11
454
455    Integer compute Fast 24bit (GIOPS)
456      int   : 18342.49
457      int2  : 17464.12
458      int4  : 17454.40
459      int8  : 17295.76
460      int16 : 17475.15
461
462    Transfer bandwidth (GBPS)
463      enqueueWriteBuffer              : 21.41
464      enqueueReadBuffer               : 21.63
465      enqueueWriteBuffer non-blocking : 21.42
466      enqueueReadBuffer non-blocking  : 21.48
467      enqueueMapBuffer(for read)      : 296204.62
468        memcpy from mapped ptr        : 20.93
469      enqueueUnmap(after write)       : 1047553.00
470        memcpy to mapped ptr          : 20.95
471
472    Kernel launch latency : 3.88 us
473
474