/* CpuArch.c -- CPU specific code
Igor Pavlov : Public domain */

#include "Precomp.h"

// #include <stdio.h>

#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

#undef NEED_CHECK_FOR_CPUID
#if !defined(MY_CPU_AMD64)
#define NEED_CHECK_FOR_CPUID
#endif
/*
  the cpuid instruction supports a (subFunction) parameter in ECX
  that is used only with some specific (function) parameter values.
  most functions use only (subFunction == 0).
*/
/*
  __cpuid(): MSVC and GCC/CLANG use the same function/macro name,
  but the parameters are different.
  We use the MSVC __cpuid() parameter style for our z7_x86_cpuid() function.
*/
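
/* illustrative comparison (not used by this file) of the two calling styles:
     MSVC <intrin.h>:
       int info[4];
       __cpuid(info, 1);            // info[0..3] = EAX, EBX, ECX, EDX
     GCC/CLANG <cpuid.h>:
       unsigned a, b, c, d;
       __cpuid(1, a, b, c, d);      // separate output arguments
   z7_x86_cpuid(p, func) follows the MSVC layout: p[0..3] = EAX, EBX, ECX, EDX. */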

#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \
    || defined(__clang__) /* && (__clang_major__ >= 10) */

/* there were some CLANG/GCC compilers that had issues with
   rbx (ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined).
   the compiler's <cpuid.h> contains a __cpuid() macro that is similar to our code.
   The history of __cpuid() changes in CLANG/GCC:
   GCC:
     2007: it preserved ebx for (__PIC__ && __i386__)
     2013: it preserved rbx and ebx for __PIC__
     2014: it doesn't preserve rbx and ebx anymore
     we suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem.
   CLANG:
     2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check.
   Why does CLANG care about 64-bit mode only, and not about ebx (in 32-bit)?
   Do we need a __PIC__ test for CLANG, or must we care about rbx even if
   __PIC__ is not defined?
*/

#define ASM_LN "\n"

#if defined(MY_CPU_AMD64) && defined(__PIC__) \
    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

/* "=&r" selects a free register. It can select even rbx, if that register is free.
   "=&D" for (RDI) also works, but the code can be larger with "=&D".
   "2"(subFunc) : 2 is the (zero-based) index of "=c" (ECX) in the output constraint list. */

#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%rbx, %q1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%rbx, %q1"  \
    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }

#elif defined(MY_CPU_X86) && defined(__PIC__) \
    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%ebx, %k1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%ebx, %k1"  \
    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }

#else

#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "cpuid"               \
    : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }

#endif

#define x86_cpuid_MACRO(p, func)  x86_cpuid_MACRO_2(p, func, 0)

void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  x86_cpuid_MACRO(p, func)
}

static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  x86_cpuid_MACRO_2(p, func, subFunc)
}


Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
 #if defined(NEED_CHECK_FOR_CPUID)
  #define EFLAGS_CPUID_BIT 21
  UInt32 a;
  __asm__ __volatile__ (
    ASM_LN   "pushf"
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    // ASM_LN   "movl    %0, %1"
    // ASM_LN   "xorl    $0x200000, %0"
    ASM_LN   "btc     %1, %0"
    ASM_LN   "push    %0"
    ASM_LN   "popf"
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    ASM_LN   "xorl    (%%esp), %0"

    ASM_LN   "popf"
    ASM_LN
    : "=&r" (a)  // "=a"
    : "i" (EFLAGS_CPUID_BIT)
    );
  if ((a & (1 << EFLAGS_CPUID_BIT)) == 0)
    return 0;
 #endif
  {
    UInt32 p[4];
    x86_cpuid_MACRO(p, 0)
    return p[0];
  }
}
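
/* usage sketch (illustrative only): cpuid leaves above the reported maximum must not be queried:
     UInt32 p[4];
     if (z7_x86_cpuid_GetMaxFunc() >= 7)
       z7_x86_cpuid(p, 7);   // p[1] (EBX) then holds the structured extended feature flags
*/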

#undef ASM_LN

#elif !defined(_MSC_VER)

/*
// for gcc/clang and others: we can try to use the __cpuid macro:
#include <cpuid.h>
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  __cpuid(func, p[0], p[1], p[2], p[3]);
}
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  return (UInt32)__get_cpuid_max(0, NULL);
}
*/
// for unsupported cpuid:
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  UNUSED_VAR(func)
  p[0] = p[1] = p[2] = p[3] = 0;
}
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  return 0;
}

#else // _MSC_VER

#if !defined(MY_CPU_AMD64)

UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
 #if defined(NEED_CHECK_FOR_CPUID)
  #define EFLAGS_CPUID_BIT 21
  __asm   pushfd
  __asm   pushfd
  /*
  __asm   pop     eax
  // __asm   mov     edx, eax
  __asm   btc     eax, EFLAGS_CPUID_BIT
  __asm   push    eax
  */
  __asm   btc     dword ptr [esp], EFLAGS_CPUID_BIT
  __asm   popfd
  __asm   pushfd
  __asm   pop     eax
  // __asm   xor     eax, edx
  __asm   xor     eax, [esp]
  // __asm   push    edx
  __asm   popfd
  __asm   and     eax, (1 shl EFLAGS_CPUID_BIT)
  __asm   jz      end_func
 #endif
  __asm   push    ebx
  __asm   xor     eax, eax    // func
  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
  __asm   cpuid
  __asm   pop     ebx
 #if defined(NEED_CHECK_FOR_CPUID)
  end_func:
 #endif
  __asm   ret 0
}

void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  UNUSED_VAR(p)
  UNUSED_VAR(func)
  __asm   push    ebx
  __asm   push    edi
  __asm   mov     edi, ecx    // p
  __asm   mov     eax, edx    // func
  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
  __asm   cpuid
  __asm   mov     [edi     ], eax
  __asm   mov     [edi +  4], ebx
  __asm   mov     [edi +  8], ecx
  __asm   mov     [edi + 12], edx
  __asm   pop     edi
  __asm   pop     ebx
  __asm   ret 0
}

static
void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  UNUSED_VAR(p)
  UNUSED_VAR(func)
  UNUSED_VAR(subFunc)
  __asm   push    ebx
  __asm   push    edi
  __asm   mov     edi, ecx         // p
  __asm   mov     eax, edx         // func
  __asm   mov     ecx, [esp + 12]  // subFunc
  __asm   cpuid
  __asm   mov     [edi     ], eax
  __asm   mov     [edi +  4], ebx
  __asm   mov     [edi +  8], ecx
  __asm   mov     [edi + 12], edx
  __asm   pop     edi
  __asm   pop     ebx
  __asm   ret 4
}
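
/* note (explanatory): with Z7_FASTCALL (__fastcall) on x86, the first two arguments
   arrive in ECX and EDX, which is why the naked functions above read (p) from ECX
   and (func) from EDX. The third argument (subFunc) is on the stack: after the two
   pushes and the return address it sits at [esp + 12], and "ret 4" pops it. */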

#else // MY_CPU_AMD64

#if _MSC_VER >= 1600
  #include <intrin.h>
  #define MY_cpuidex  __cpuidex

static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  __cpuidex((int *)p, func, subFunc);
}

#else
/*
 __cpuid (func == (0 or 7)) requires the subfunction number in ECX.
 MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.
 __cpuid() in new MSVC clears ECX.
 __cpuid() in old MSVC (14.00) x64 doesn't clear ECX.
 We still can use __cpuid for low (func) values that don't require ECX,
 but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
 So here we use a hack for old MSVC to pass (subFunction) to the cpuid instruction in ECX:
 the ECX value is the first parameter of a FASTCALL / NO_INLINE function.
 So the caller of MY_cpuidex_HACK() sets ECX to subFunction, and
 __cpuid() in old MSVC doesn't change ECX, so the cpuid instruction gets the (subFunction) value.

 DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!!
*/
static
Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo)
{
  UNUSED_VAR(subFunction)
  __cpuid(CPUInfo, func);
}
#define MY_cpuidex(info, func, func2)  MY_cpuidex_HACK(func2, func, info)
#pragma message("======== MY_cpuidex_HACK WAS USED ========")
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
}
#endif // _MSC_VER >= 1600

#if !defined(MY_CPU_AMD64)
/* inlining __cpuid() in MSVC x86 (32-bit) produces large, inefficient code,
   so we disable inlining here */
Z7_NO_INLINE
#endif
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  MY_cpuidex((Int32 *)p, (Int32)func, 0);
}

Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  Int32 a[4];
  MY_cpuidex(a, 0, 0);
  return a[0];
}

#endif // MY_CPU_AMD64
#endif // _MSC_VER

#if defined(NEED_CHECK_FOR_CPUID)
#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; }
#else
#define CHECK_CPUID_IS_SUPPORTED
#endif
#undef NEED_CHECK_FOR_CPUID


static
BoolInt x86cpuid_Func_1(UInt32 *p)
{
  CHECK_CPUID_IS_SUPPORTED
  z7_x86_cpuid(p, 1);
  return True;
}

/*
static const UInt32 kVendors[][1] =
{
  { 0x756E6547 }, // , 0x49656E69, 0x6C65746E },
  { 0x68747541 }, // , 0x69746E65, 0x444D4163 },
  { 0x746E6543 }  // , 0x48727561, 0x736C7561 }
};
*/

/*
typedef struct
{
  UInt32 maxFunc;
  UInt32 vendor[3];
  UInt32 ver;
  UInt32 b;
  UInt32 c;
  UInt32 d;
} Cx86cpuid;

enum
{
  CPU_FIRM_INTEL,
  CPU_FIRM_AMD,
  CPU_FIRM_VIA
};
int x86cpuid_GetFirm(const Cx86cpuid *p);
#define x86cpuid_ver_GetFamily(ver)   (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))
#define x86cpuid_ver_GetModel(ver)    (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf))
#define x86cpuid_ver_GetStepping(ver) (ver & 0xf)

int x86cpuid_GetFirm(const Cx86cpuid *p)
{
  unsigned i;
  for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++)
  {
    const UInt32 *v = kVendors[i];
    if (v[0] == p->vendor[0]
        // && v[1] == p->vendor[1]
        // && v[2] == p->vendor[2]
        )
      return (int)i;
  }
  return -1;
}

BoolInt CPU_Is_InOrder()
{
  Cx86cpuid p;
  UInt32 family, model;
  if (!x86cpuid_CheckAndRead(&p))
    return True;

  family = x86cpuid_ver_GetFamily(p.ver);
  model = x86cpuid_ver_GetModel(p.ver);

  switch (x86cpuid_GetFirm(&p))
  {
    case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && (
        // In-Order Atom CPU
           model == 0x1C  // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330
        || model == 0x26  // 45 nm, Z6xx
        || model == 0x27  // 32 nm, Z2460
        || model == 0x35  // 32 nm, Z2760
        || model == 0x36  // 32 nm, N2xxx, D2xxx
        )));
    case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA)));
    case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF));
  }
  return False; // v23 : unknown processors are not In-Order
}
*/

#ifdef _WIN32
#include "7zWindows.h"
#endif

#if !defined(MY_CPU_AMD64) && defined(_WIN32)

/* for legacy SSE ia32: there is no user-space cpu instruction to check
   that the OS supports SSE register storing/restoring on context switches.
   So we need some OS-specific function to check that it's safe to use SSE registers.
*/

Z7_FORCE_INLINE
static BoolInt CPU_Sys_Is_SSE_Supported(void)
{
#ifdef _MSC_VER
  #pragma warning(push)
  #pragma warning(disable : 4996) // `GetVersion': was declared deprecated
#endif
  /* the low byte is the major version of Windows.
     We suppose that any Windows version since
     Windows 2000 (major == 5) supports SSE registers */
  return (Byte)GetVersion() >= 5;
#if defined(_MSC_VER)
  #pragma warning(pop)
#endif
}
#define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False;
#else
#define CHECK_SYS_SSE_SUPPORT
#endif


#if !defined(MY_CPU_AMD64)

BoolInt CPU_IsSupported_CMOV(void)
{
  UInt32 a[4];
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (BoolInt)(a[3] >> 15) & 1;
}

BoolInt CPU_IsSupported_SSE(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (BoolInt)(a[3] >> 25) & 1;
}

BoolInt CPU_IsSupported_SSE2(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (BoolInt)(a[3] >> 26) & 1;
}

#endif


static UInt32 x86cpuid_Func_1_ECX(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return a[2];
}

BoolInt CPU_IsSupported_AES(void)
{
  return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1;
}

BoolInt CPU_IsSupported_SSSE3(void)
{
  return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1;
}

BoolInt CPU_IsSupported_SSE41(void)
{
  return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1;
}

BoolInt CPU_IsSupported_SHA(void)
{
  CHECK_SYS_SSE_SUPPORT

  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    return (BoolInt)(d[1] >> 29) & 1;
  }
}


BoolInt CPU_IsSupported_SHA512(void)
{
  if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here

  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid_subFunc(d, 7, 0);
    if (d[0] < 1) // d[0] is the maximum supported subleaf value
      return False;
    z7_x86_cpuid_subFunc(d, 7, 1);
    return (BoolInt)(d[0]) & 1;
  }
}

/*
MSVC: the _xgetbv() intrinsic is available since VS2010SP1.
      MSVC also defines the (_XCR_XFEATURE_ENABLED_MASK) macro in
      <immintrin.h> that we can use or check.
      For any 32-bit x86 we can use asm code in MSVC,
      but MSVC asm code is huge after compilation,
      so _xgetbv() is better.

ICC: the _xgetbv() intrinsic is available (since what version of ICC?)
     ICC defines (__GNUC___) and it supports gnu assembler;
     also ICC supports MASM-style code with the -use-msasm switch.
     but ICC doesn't support __attribute__((__target__))

GCC/CLANG 9:
  _xgetbv() is a macro that works via __builtin_ia32_xgetbv(),
  and we need __attribute__((__target__("xsave"))).
  But with __target__("xsave") the function will not be
  inlined into a function that has no __target__("xsave") attribute.
  If we want _xgetbv() call inlining, then we should use the asm version
  instead of calling _xgetbv().
  Note: the intrinsic is broken before GCC 8.2:
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684
*/
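
/* illustrative sketch (not used by this file): with a compiler that provides the
   intrinsic, reading XCR0 is one call:
     #include <immintrin.h>
     unsigned long long mask = _xgetbv(0);  // 0 == _XCR_XFEATURE_ENABLED_MASK (XCR0)
   x86_xgetbv_0() below wraps the same operation with fallbacks for older compilers. */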

#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \
    || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \
    || defined(__GNUC__) && (__GNUC__ >= 9) \
    || defined(__clang__) && (__clang_major__ >= 9)
// we define ATTRIB_XGETBV, if we want to use the predefined _xgetbv() from the compiler
#if defined(__INTEL_COMPILER)
#define ATTRIB_XGETBV
#elif defined(__GNUC__) || defined(__clang__)
// we don't define ATTRIB_XGETBV here, because the asm version is better for inlining.
// #define ATTRIB_XGETBV __attribute__((__target__("xsave")))
#else
#define ATTRIB_XGETBV
#endif
#endif

#if defined(ATTRIB_XGETBV)
#include <immintrin.h>
#endif


// XFEATURE_ENABLED_MASK/XCR0
#define MY_XCR_XFEATURE_ENABLED_MASK 0

#if defined(ATTRIB_XGETBV)
ATTRIB_XGETBV
#endif
static UInt64 x86_xgetbv_0(UInt32 num)
{
#if defined(ATTRIB_XGETBV)
  {
    return
     #if (defined(_MSC_VER))
      _xgetbv(num);
     #else
      __builtin_ia32_xgetbv(
       #if !defined(__clang__)
        (int)
       #endif
        num);
     #endif
  }

#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)

  UInt32 a, d;
 #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
  __asm__
  (
    "xgetbv"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #else // is old gcc
  __asm__
  (
    ".byte 0x0f, 0x01, 0xd0" "\n\t"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #endif
  return ((UInt64)d << 32) | a;
  // return a;

#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)

  UInt32 a, d;
  __asm {
    push eax
    push edx
    push ecx
    mov ecx, num;
    // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK
    _emit 0x0f
    _emit 0x01
    _emit 0xd0
    mov a, eax
    mov d, edx
    pop ecx
    pop edx
    pop eax
  }
  return ((UInt64)d << 32) | a;
  // return a;

#else // it's an unknown compiler
  // #error "Need xgetbv function"
  UNUSED_VAR(num)
  // for MSVC-X64 we could call an external function from an external file.
  /* Actually we have already checked OSXSAVE/AVX in cpuid before.
     So it's expected that the OS supports at least AVX and below. */
  // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0
  return
    //  (1 << 0) |  // x87
        (1 << 1)    // SSE
      | (1 << 2);   // AVX

#endif
}

#ifdef _WIN32
/*
Windows versions do not know about ISA extensions that
were introduced after that Windows version was released.
But we still can use new extensions, even if Windows doesn't report
support for them, as long as Windows knows about every ISA extension
that changes the number or size of registers: SSE, AVX/XSAVE, AVX512.
So it's enough to check
  MY_PF_AVX_INSTRUCTIONS_AVAILABLE
instead of
  MY_PF_AVX2_INSTRUCTIONS_AVAILABLE
*/
#define MY_PF_XSAVE_ENABLED 17
// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE   36
// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE  37
// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE  38
// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE     39
// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE    40
// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41
#endif

BoolInt CPU_IsSupported_AVX(void)
{
#ifdef _WIN32
  if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))
    return False;
  /* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from
     some of the latest Win10 revisions. But we need AVX in older Windows also.
     So we don't use the following check: */
  /*
  if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))
    return False;
  */
#endif

  /*
  The OS must use the new special XSAVE/XRSTOR instructions to save
  AVX registers when that is required for context switching.
  At OS startup:
    the OS sets the CR4.OSXSAVE flag to signal the processor that the OS supports the XSAVE extensions.
    Also the OS sets a bitmask in the XCR0 register that defines which
    register states will be processed by the XSAVE instruction:
      XCR0.x87[bit 0] - x87 registers and state
      XCR0.SSE[bit 1] - SSE registers and state
      XCR0.AVX[bit 2] - AVX registers and state
  CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27].
  So we can read that bit in user-space.
  XCR0 is available for reading in user-space via the new XGETBV instruction.
  */
  {
    const UInt32 c = x86cpuid_Func_1_ECX();
    if (0 == (1
        & (c >> 28)   // AVX instructions are supported by hardware
        & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS.
      return False;
  }

  /* also we can check
     CPUID.1:ECX.XSAVE [bit 26] : it shows that
     XSAVE, XRSTOR, XSETBV, XGETBV instructions are supported by hardware.
     But that check is redundant, because if the OSXSAVE bit is set, then XSAVE is also set */

  /* If the OS has enabled the XSAVE extension instructions (OSXSAVE == 1),
     in most cases we expect that the OS also will support storing/restoring
     for AVX and SSE states at least.
     But to be sure of that, we call the user-space instruction
     XGETBV(0) to get the XCR0 value that contains a bitmask defining
     exactly which states (registers) the OS has enabled for storing/restoring.
  */

  {
    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    // printf("\n=== XGetBV=0x%x\n", bm);
    return 1
        & (BoolInt)(bm >> 1)  // SSE state is supported (set by OS) for storing/restoring
        & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring
  }
  // since Win7SP1: we can use GetEnabledXStateFeatures();
}


BoolInt CPU_IsSupported_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
        & (BoolInt)(d[1] >> 5); // avx2
  }
}
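
/* usage sketch (illustrative; Crc32_Calc_Avx2 / Crc32_Calc_Generic are hypothetical
   functions, not part of this file): runtime dispatch between code paths:
     if (CPU_IsSupported_AVX2())
       crcFunc = Crc32_Calc_Avx2;
     else
       crcFunc = Crc32_Calc_Generic;
*/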

#if 0
BoolInt CPU_IsSupported_AVX512F_AVX512VL(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    BoolInt v;
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    v = 1
      & (BoolInt)(d[1] >> 16)  // avx512f
      & (BoolInt)(d[1] >> 31); // avx512vl
    if (!v)
      return False;
  }
  {
    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    // printf("\n=== XGetBV=0x%x\n", bm);
    return 1
      & (BoolInt)(bm >> 5)  // OPMASK
      & (BoolInt)(bm >> 6)  // ZMM upper 256 bits
      & (BoolInt)(bm >> 7); // ZMM16 ... ZMM31
  }
}
#endif

BoolInt CPU_IsSupported_VAES_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
      & (BoolInt)(d[1] >> 5)  // avx2
      // & (d[1] >> 31)       // avx512vl
      & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX
  }
}

BoolInt CPU_IsSupported_PageGB(void)
{
  CHECK_CPUID_IS_SUPPORTED
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 0x80000000);
    if (d[0] < 0x80000001)
      return False;
    z7_x86_cpuid(d, 0x80000001);
    return (BoolInt)(d[3] >> 26) & 1;
  }
}


#elif defined(MY_CPU_ARM_OR_ARM64)

#ifdef _WIN32

#include "7zWindows.h"

BoolInt CPU_IsSupported_CRC32(void)  { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_NEON(void)   { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }

#else

#if defined(__APPLE__)

/*
#include <stdio.h>
#include <string.h>
static void Print_sysctlbyname(const char *name)
{
  size_t bufSize = 256;
  char buf[256];
  int res = sysctlbyname(name, &buf, &bufSize, NULL, 0);
  {
    int i;
    printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize);
    for (i = 0; i < 20; i++)
      printf(" %2x", (unsigned)(Byte)buf[i]);

  }
}
*/
/*
  Print_sysctlbyname("hw.pagesize");
  Print_sysctlbyname("machdep.cpu.brand_string");
*/

static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name)
{
  UInt32 val = 0;
  if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
    return 1;
  return 0;
}

BoolInt CPU_IsSupported_CRC32(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
}

BoolInt CPU_IsSupported_NEON(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}

BoolInt CPU_IsSupported_SHA512(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
}

/*
BoolInt CPU_IsSupported_SHA3(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
}
*/

#ifdef MY_CPU_ARM64
#define APPLE_CRYPTO_SUPPORT_VAL 1
#else
#define APPLE_CRYPTO_SUPPORT_VAL 0
#endif

BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }


#else // __APPLE__

#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216)
  #define Z7_GETAUXV_AVAILABLE
#else
// #pragma message("=== is not NEW GLIBC === ")
  #if defined __has_include
  #if __has_include (<sys/auxv.h>)
// #pragma message("=== sys/auxv.h is avail=== ")
    #define Z7_GETAUXV_AVAILABLE
  #endif
  #endif
#endif

#ifdef Z7_GETAUXV_AVAILABLE
// #pragma message("=== Z7_GETAUXV_AVAILABLE === ")
#include <sys/auxv.h>
#define USE_HWCAP
#endif

#ifdef USE_HWCAP

#if defined(__FreeBSD__)
static unsigned long MY_getauxval(int aux)
{
  unsigned long val;
  if (elf_aux_info(aux, &val, sizeof(val)))
    return 0;
  return val;
}
#else
#define MY_getauxval getauxval
  #if defined __has_include
  #if __has_include (<asm/hwcap.h>)
#include <asm/hwcap.h>
  #endif
  #endif
#endif

#define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
  BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); }
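
/* e.g. MY_HWCAP_CHECK_FUNC_2(SHA2, SHA2) expands to:
     BoolInt CPU_IsSupported_SHA2(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_SHA2)); }
   note: the result is the raw masked bit, so any nonzero BoolInt means "supported". */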

#ifdef MY_CPU_ARM64
  #define MY_HWCAP_CHECK_FUNC(name) \
    MY_HWCAP_CHECK_FUNC_2(name, name)
#if 1 || defined(__ARM_NEON)
  BoolInt CPU_IsSupported_NEON(void) { return True; }
#else
  MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)
#endif
// MY_HWCAP_CHECK_FUNC (ASIMD)
#elif defined(MY_CPU_ARM)
  #define MY_HWCAP_CHECK_FUNC(name) \
    BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); }
  MY_HWCAP_CHECK_FUNC_2(NEON, NEON)
#endif

#else // USE_HWCAP

#define MY_HWCAP_CHECK_FUNC(name) \
  BoolInt CPU_IsSupported_ ## name(void) { return 0; }
#if defined(__ARM_NEON)
  BoolInt CPU_IsSupported_NEON(void) { return True; }
#else
  MY_HWCAP_CHECK_FUNC(NEON)
#endif

#endif // USE_HWCAP

MY_HWCAP_CHECK_FUNC (CRC32)
MY_HWCAP_CHECK_FUNC (SHA1)
MY_HWCAP_CHECK_FUNC (SHA2)
MY_HWCAP_CHECK_FUNC (AES)
#ifdef MY_CPU_ARM64
// <hwcap.h> has supported HWCAP_SHA512 and HWCAP_SHA3 since 2017.
// we define them here, if they are not defined:
#ifndef HWCAP_SHA3
// #define HWCAP_SHA3 (1 << 17)
#endif
#ifndef HWCAP_SHA512
// #pragma message("=== HWCAP_SHA512 define === ")
#define HWCAP_SHA512 (1 << 21)
#endif
MY_HWCAP_CHECK_FUNC (SHA512)
// MY_HWCAP_CHECK_FUNC (SHA3)
#endif

#endif // __APPLE__
#endif // _WIN32

#endif // MY_CPU_ARM_OR_ARM64



#ifdef __APPLE__

#include <sys/sysctl.h>

int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
{
  return sysctlbyname(name, buf, bufSize, NULL, 0);
}

int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
{
  size_t bufSize = sizeof(*val);
  const int res = z7_sysctlbyname_Get(name, val, &bufSize);
  if (res == 0 && bufSize != sizeof(*val))
    return EFAULT;
  return res;
}
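
/* usage sketch (illustrative only):
     UInt32 val = 0;
     if (z7_sysctlbyname_Get_UInt32("hw.optional.armv8_crc32", &val) == 0 && val == 1)
     {
       // ARMv8 CRC32 instructions are available
     }
*/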

#endif