/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
: Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
      #define USE_HW_SHA
  #elif defined(Z7_LLVM_CLANG_VERSION)  && (Z7_LLVM_CLANG_VERSION  >= 30800) \
     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
     || defined(Z7_GCC_VERSION)         && (Z7_GCC_VERSION         >= 40900)
      #define USE_HW_SHA
      #if !defined(__INTEL_COMPILER)
      // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
      #if !defined(__SHA__) || !defined(__SSSE3__)
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
      #endif
      #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER >= 1900)
      #define USE_HW_SHA
    #else
      #define Z7_USE_HW_SHA_STUB
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
  // #define Z7_USE_HW_SHA_STUB // for debug
#endif

#ifdef USE_HW_SHA

// #pragma message("Sha256 HW")

// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
  #if !defined(__SHA__)
    #include <shaintrin.h>
  #endif
#else

#endif

/*
SHA256 uses:
SSE2:
  _mm_loadu_si128
  _mm_storeu_si128
  _mm_set_epi32
  _mm_add_epi32
  _mm_shuffle_epi32 / pshufd
SSSE3:
  _mm_shuffle_epi8 / pshufb
  _mm_alignr_epi8
SHA:
  _mm_sha256*
*/

// The K array must be aligned to at least 16 bytes.
// The compiler can see the align attribute and select:
//   movdqu - for code without the align attribute
//   movdqa - for code with    the align attribute
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY
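// SHA256_K_ARRAY holds the 64 SHA-256 round constants; the array itself is
// defined in Sha256.c, and K is just a short alias for it here.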


#define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src)    dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src)    dest = _mm_sha256msg2_epu32(dest, src);

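/*
  For reference: the scalar SHA-256 message schedule that these intrinsics
  accelerate is
      W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
  where s0/s1 are the small sigma functions from the SHA-256 spec.
  _mm_sha256msg1_epu32 computes the W[t-16] + s0(W[t-15]) part for four words,
  _mm_alignr_epi8 supplies the W[t-7] term (see SM2 below), and
  _mm_sha256msg2_epu32 finishes the schedule by adding s1(W[t-2]).
*/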
#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    m = _mm_shuffle_epi8(m, mask); \

#define NNN(m0, m1, m2, m3)

#define SM1(m1, m2, m3, m0) \
    SHA256_MSG1(m0, m1); \

#define SM2(m2, m3, m0, m1) \
    ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
    SHA256_MSG2(m0, m3); \

#define RND2(t0, t1) \
    t0 = _mm_sha256rnds2_epu32(t0, t1, msg);


#define R4(k, m0, m1, m2, m3, OP0, OP1) \
    msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1); \
    msg = _mm_shuffle_epi32(msg, 0x0E); \
    OP0(m0, m1, m2, m3) \
    RND2(state1, state0); \
    OP1(m0, m1, m2, m3) \

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

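// Each RND2 (sha256rnds2) computes two SHA-256 rounds, so R4 performs four:
// the _mm_shuffle_epi32(msg, 0x0E) moves the upper two W[t]+K[t] words into
// the low lanes for the second RND2 call. R16 therefore covers 16 rounds, and
// the four R16 invocations in the loop below cover all 64 rounds while
// rotating the roles of the message registers m0..m3.
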
#define PREPARE_STATE \
    tmp    = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
    state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
    state1 = state0; \
    state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
    state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \

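// sha256rnds2 expects the hash state split across two registers as {CDGH} and
// {ABEF}, while the state[8] array stores the words A..H linearly.
// PREPARE_STATE converts between the two layouts; it is its own inverse,
// which is why the same macro is used both before and after the block loop.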

void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
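  // mask is a pshufb pattern that byte-swaps each 32-bit lane, converting the
  // big-endian message words of the SHA-256 block to the CPU's little-endian
  // order (used by LOAD_SHUFFLE above).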

  __m128i tmp, state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    ADD_EPI32(state0, state0_save)
    ADD_EPI32(state1, state1_save)

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}
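// Note: this function has the same signature as the portable
// Sha256_UpdateBlocks() in Sha256.c; the selection between the two happens at
// run time in Sha256.c, based on CPU feature detection, not here.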

#endif // USE_HW_SHA

#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

  #if   defined(__ARM_FEATURE_SHA2) \
     || defined(__ARM_FEATURE_CRYPTO)
    #define USE_HW_SHA
  #else
    #if  defined(MY_CPU_ARM64) \
      || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
      || defined(Z7_MSC_VER_ORIGINAL)
    #if  defined(__ARM_FP) && \
          (   defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
           || defined(__GNUC__) && (__GNUC__ >= 6) \
          ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
    #if  defined(MY_CPU_ARM64) \
      || !defined(Z7_CLANG_VERSION) \
      || defined(__ARM_NEON) && \
          (Z7_CLANG_VERSION < 170000 || \
           Z7_CLANG_VERSION > 170001)
      #define USE_HW_SHA
    #endif
    #endif
    #endif
  #endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha256 HW === ")


#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
  #ifdef MY_CPU_ARM64
#if defined(__clang__)
    #define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
  #else
#if defined(__clang__) && (__clang_major__ >= 1)
    #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
    #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
  #endif
#endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else

#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
//     #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
    #define __ARM_FEATURE_CRYPTO 1
// #else
    #define __ARM_FEATURE_SHA2 1
// #endif
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
//    #pragma message("#define __ARM_ARCH 8")
    #undef  __ARM_ARCH
    #define __ARM_ARCH 8
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #undef __ARM_FEATURE_CRYPTO
    #undef __ARM_FEATURE_SHA2
    #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
//    #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x) x
#else
  #define MY_rev32_for_LE(x) vrev32q_u8(x)
#endif
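// On little-endian targets, vrev32q_u8 byte-swaps each 32-bit lane so that the
// big-endian message words of the block are loaded in native order (the NEON
// counterpart of the pshufb mask used in the x86 path above).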

#if 1 // 0 for debug
// for arm32: for some reason it works slower than direct code
/*
for arm32 it generates:
MSVC-2022, GCC-9:
    vld1.32 {d18,d19}, [r10]
    vst1.32 {d4,d5}, [r3]
    vld1.8  {d20-d21}, [r4]
there is no align hint (like [r10:128]), so the instruction allows unaligned access
*/
#define LOAD_128_32(_p)       vld1q_u32(_p)
#define LOAD_128_8(_p)        vld1q_u8 (_p)
#define STORE_128_32(_p, _v)  vst1q_u32(_p, _v)
#else
/*
for arm32:
MSVC-2022:
    vldm r10,{d18,d19}
    vstm r3,{d4,d5}
    does it require strict alignment?
GCC-9:
    vld1.64 {d30-d31}, [r0:64]
    vldr  d28, [r0, #16]
    vldr  d29, [r0, #24]
    vst1.64 {d30-d31}, [r0:64]
    vstr  d28, [r0, #16]
    vstr  d29, [r0, #24]
there is a hint [r0:64], so does it require 64-bit alignment?
*/
#define LOAD_128_32(_p)       (*(const v128 *)(const void *)(_p))
#define LOAD_128_8(_p)        vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
#define STORE_128_32(_p, _v)  *(v128 *)(void *)(_p) = (_v)
#endif

#define LOAD_SHUFFLE(m, k) \
    m = vreinterpretq_u32_u8( \
        MY_rev32_for_LE( \
        LOAD_128_8(data + (k) * 16))); \

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY

#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(m0, m1, m2, m3)  SHA256_SU0(m3, m0)
#define SM2(m0, m1, m2, m3)  SHA256_SU1(m2, m0, m1)
#define NNN(m0, m1, m2, m3)

#define R4(k, m0, m1, m2, m3, OP0, OP1) \
    msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
    tmp = state0; \
    state0 = vsha256hq_u32( state0, state1, msg ); \
    state1 = vsha256h2q_u32( state1, tmp, msg ); \
    OP0(m0, m1, m2, m3); \
    OP1(m0, m1, m2, m3); \


#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \

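// vsha256hq_u32 updates the {ABCD} half of the state and vsha256h2q_u32 the
// {EFGH} half; each pair of calls advances the hash by four rounds. tmp keeps
// the old {ABCD} value because both instructions need the state as it was
// before the update. vsha256su0q_u32 / vsha256su1q_u32 compute the message
// schedule, mirroring the msg1/msg2 instructions of the x86 path.
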

void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  state0 = LOAD_128_32(&state[0]);
  state1 = LOAD_128_32(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128_32(&state[0], state0);
  STORE_128_32(&state[4], state1);
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64


#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// We can compile this file with another C compiler,
// or we can compile an asm version,
// so that real code is generated instead of this stub function.
// #include "Sha256.h"
// #if defined(_MSC_VER)
#pragma message("Sha256 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha256_UpdateBlocks   (UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif


#undef K
#undef RND2
#undef MY_rev32_for_LE

#undef NNN
#undef LOAD_128_32
#undef LOAD_128_8
#undef STORE_128_32
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2


#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB