xref: /aosp_15_r20/external/clpeak/results/AMD_Accelerated_Parallel_Processing/AMD_MI100.log (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
1Platform: AMD Accelerated Parallel Processing
2  Device: gfx908:sramecc+:xnack-
3    Driver version  : 3406.0 (HSA1.1,LC) (Linux x64)
4    Compute units   : 120
5    Clock frequency : 1502 MHz
6
7    Global memory bandwidth (GBPS)
8      float   : 946.60
9      float2  : 942.59
10      float4  : 931.65
11      float8  : 987.27
12      float16 : 732.10
13
14    Single-precision compute (GFLOPS)
15      float   : 22284.31
16      float2  : 21579.50
17      float4  : 21489.94
18      float8  : 21348.52
19      float16 : 21032.74
20
21    Half-precision compute (GFLOPS)
22      half   : 11191.11
23      half2  : 43951.02
24      half4  : 43740.40
25      half8  : 43416.40
26      half16 : 43042.69
27
28    Double-precision compute (GFLOPS)
29      double   : 11119.62
30      double2  : 11089.45
31      double4  : 11040.92
32      double8  : 10975.65
33      double16 : 10741.24
34
35    Integer compute (GIOPS)
36      int   : 7380.05
37      int2  : 7125.83
38      int4  : 7091.25
39      int8  : 7154.73
40      int16 : 7086.31
41
42    Integer compute Fast 24bit (GIOPS)
43      int   : 20832.50
44      int2  : 19661.20
45      int4  : 18393.51
46      int8  : 18919.36
47      int16 : 18626.53
48
49    Transfer bandwidth (GBPS)
50      enqueueWriteBuffer              : 16.85
51      enqueueReadBuffer               : 16.55
52      enqueueWriteBuffer non-blocking : 16.85
53      enqueueReadBuffer non-blocking  : 16.55
54      enqueueMapBuffer(for read)      : 177477.98
55        memcpy from mapped ptr        : 17.28
56      enqueueUnmap(after write)       : 325376.31
57        memcpy to mapped ptr          : 17.37
58
59    Kernel launch latency : 11.69 us
60
61  Device: gfx908:sramecc+:xnack-
62    Driver version  : 3406.0 (HSA1.1,LC) (Linux x64)
63    Compute units   : 120
64    Clock frequency : 1502 MHz
65
66    Global memory bandwidth (GBPS)
67      float   : 945.47
68      float2  : 940.52
69      float4  : 931.31
70      float8  : 985.60
71      float16 : 731.37
72
73    Single-precision compute (GFLOPS)
74      float   : 22766.39
75      float2  : 21930.19
76      float4  : 21804.63
77      float8  : 21588.35
78      float16 : 21229.40
79
80    Half-precision compute (GFLOPS)
81      half   : 11448.85
82      half2  : 44673.06
83      half4  : 44389.37
84      half8  : 43779.79
85      half16 : 43364.53
86
87    Double-precision compute (GFLOPS)
88      double   : 11328.98
89      double2  : 11190.77
90      double4  : 11041.74
91      double8  : 11017.95
92      double16 : 10726.28
93
94    Integer compute (GIOPS)
95      int   : 7337.42
96      int2  : 7032.87
97      int4  : 6998.89
98      int8  : 7065.90
99      int16 : 7002.87
100
101    Integer compute Fast 24bit (GIOPS)
102      int   : 20604.53
103      int2  : 19520.61
104      int4  : 18383.98
105      int8  : 18883.31
106      int16 : 18693.82
107
108    Transfer bandwidth (GBPS)
109      enqueueWriteBuffer              : 16.74
110      enqueueReadBuffer               : 16.58
111      enqueueWriteBuffer non-blocking : 16.57
112      enqueueReadBuffer non-blocking  : 16.55
113      enqueueMapBuffer(for read)      : 214748.38
114        memcpy from mapped ptr        : 17.27
115      enqueueUnmap(after write)       : 343597.38
116        memcpy to mapped ptr          : 17.36
117
118    Kernel launch latency : 11.64 us
119
120  Device: gfx908:sramecc+:xnack-
121    Driver version  : 3406.0 (HSA1.1,LC) (Linux x64)
122    Compute units   : 120
123    Clock frequency : 1502 MHz
124
125    Global memory bandwidth (GBPS)
126      float   : 944.11
127      float2  : 939.39
128      float4  : 928.42
129      float8  : 984.33
130      float16 : 730.79
131
132    Single-precision compute (GFLOPS)
133      float   : 22816.58
134      float2  : 22092.63
135      float4  : 21986.57
136      float8  : 21809.49
137      float16 : 21506.20
138
139    Half-precision compute (GFLOPS)
140      half   : 11466.75
141      half2  : 44960.66
142      half4  : 44769.55
143      half8  : 44330.82
144      half16 : 43944.03
145
146    Double-precision compute (GFLOPS)
147      double   : 11384.70
148      double2  : 11342.74
149      double4  : 11281.10
150      double8  : 11213.88
151      double16 : 10924.84
152
153    Integer compute (GIOPS)
154      int   : 7584.34
155      int2  : 7256.62
156      int4  : 7209.28
157      int8  : 7247.27
158      int16 : 7167.22
159
160    Integer compute Fast 24bit (GIOPS)
161      int   : 21159.21
162      int2  : 20013.83
163      int4  : 18824.73
164      int8  : 19357.07
165      int16 : 19067.46
166
167    Transfer bandwidth (GBPS)
168      enqueueWriteBuffer              : 16.55
169      enqueueReadBuffer               : 16.55
170      enqueueWriteBuffer non-blocking : 16.58
171      enqueueReadBuffer non-blocking  : 16.60
172      enqueueMapBuffer(for read)      : 169093.20
173        memcpy from mapped ptr        : 17.24
174      enqueueUnmap(after write)       : 290200.47
175        memcpy to mapped ptr          : 17.33
176
177    Kernel launch latency : 11.75 us
178
179  Device: gfx908:sramecc+:xnack-
180    Driver version  : 3406.0 (HSA1.1,LC) (Linux x64)
181    Compute units   : 120
182    Clock frequency : 1502 MHz
183
184    Global memory bandwidth (GBPS)
185      float   : 947.40
186      float2  : 942.69
187      float4  : 930.76
188      float8  : 986.67
189      float16 : 731.70
190
191    Single-precision compute (GFLOPS)
192      float   : 22597.69
193      float2  : 21601.93
194      float4  : 21531.66
195      float8  : 21375.14
196      float16 : 21063.92
197
198    Half-precision compute (GFLOPS)
199      half   : 11287.66
200      half2  : 44227.35
201      half4  : 43823.46
202      half8  : 43463.27
203      half16 : 43087.55
204
205    Double-precision compute (GFLOPS)
206      double   : 11155.44
207      double2  : 11107.10
208      double4  : 11062.91
209      double8  : 10989.63
210      double16 : 10766.67
211
212    Integer compute (GIOPS)
213      int   : 7444.78
214      int2  : 7143.09
215      int4  : 7093.91
216      int8  : 7173.30
217      int16 : 7098.05
218
219    Integer compute Fast 24bit (GIOPS)
220      int   : 20952.08
221      int2  : 19759.49
222      int4  : 18686.32
223      int8  : 19090.16
224      int16 : 18893.46
225
226    Transfer bandwidth (GBPS)
227      enqueueWriteBuffer              : 16.78
228      enqueueReadBuffer               : 16.51
229      enqueueWriteBuffer non-blocking : 16.74
230      enqueueReadBuffer non-blocking  : 16.51
231      enqueueMapBuffer(for read)      : 189205.59
232        memcpy from mapped ptr        : 17.30
233      enqueueUnmap(after write)       : 357913.94
234        memcpy to mapped ptr          : 17.38
235
236    Kernel launch latency : 11.58 us
237