xref: /aosp_15_r20/external/lzma/C/AesOpt.c (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1*f6dc9357SAndroid Build Coastguard Worker /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2*f6dc9357SAndroid Build Coastguard Worker Igor Pavlov : Public domain */
3*f6dc9357SAndroid Build Coastguard Worker 
4*f6dc9357SAndroid Build Coastguard Worker #include "Precomp.h"
5*f6dc9357SAndroid Build Coastguard Worker 
6*f6dc9357SAndroid Build Coastguard Worker #include "Aes.h"
7*f6dc9357SAndroid Build Coastguard Worker #include "CpuArch.h"
8*f6dc9357SAndroid Build Coastguard Worker 
9*f6dc9357SAndroid Build Coastguard Worker #ifdef MY_CPU_X86_OR_AMD64
10*f6dc9357SAndroid Build Coastguard Worker 
11*f6dc9357SAndroid Build Coastguard Worker   #if defined(__INTEL_COMPILER)
12*f6dc9357SAndroid Build Coastguard Worker     #if (__INTEL_COMPILER >= 1110)
13*f6dc9357SAndroid Build Coastguard Worker       #define USE_INTEL_AES
14*f6dc9357SAndroid Build Coastguard Worker       #if (__INTEL_COMPILER >= 1900)
15*f6dc9357SAndroid Build Coastguard Worker         #define USE_INTEL_VAES
16*f6dc9357SAndroid Build Coastguard Worker       #endif
17*f6dc9357SAndroid Build Coastguard Worker     #endif
18*f6dc9357SAndroid Build Coastguard Worker   #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
19*f6dc9357SAndroid Build Coastguard Worker      || defined(Z7_GCC_VERSION)   && (Z7_GCC_VERSION   >= 40400)
20*f6dc9357SAndroid Build Coastguard Worker         #define USE_INTEL_AES
21*f6dc9357SAndroid Build Coastguard Worker         #if !defined(__AES__)
22*f6dc9357SAndroid Build Coastguard Worker           #define ATTRIB_AES __attribute__((__target__("aes")))
23*f6dc9357SAndroid Build Coastguard Worker         #endif
24*f6dc9357SAndroid Build Coastguard Worker       #if defined(__clang__) && (__clang_major__ >= 8) \
25*f6dc9357SAndroid Build Coastguard Worker           || defined(__GNUC__) && (__GNUC__ >= 8)
26*f6dc9357SAndroid Build Coastguard Worker         #define USE_INTEL_VAES
27*f6dc9357SAndroid Build Coastguard Worker         #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
28*f6dc9357SAndroid Build Coastguard Worker           #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
29*f6dc9357SAndroid Build Coastguard Worker         #endif
30*f6dc9357SAndroid Build Coastguard Worker       #endif
31*f6dc9357SAndroid Build Coastguard Worker   #elif defined(_MSC_VER)
32*f6dc9357SAndroid Build Coastguard Worker     #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
33*f6dc9357SAndroid Build Coastguard Worker       #define USE_INTEL_AES
34*f6dc9357SAndroid Build Coastguard Worker       #if (_MSC_VER >= 1910)
35*f6dc9357SAndroid Build Coastguard Worker         #define USE_INTEL_VAES
36*f6dc9357SAndroid Build Coastguard Worker       #endif
37*f6dc9357SAndroid Build Coastguard Worker     #endif
38*f6dc9357SAndroid Build Coastguard Worker     #ifndef USE_INTEL_AES
39*f6dc9357SAndroid Build Coastguard Worker       #define Z7_USE_AES_HW_STUB
40*f6dc9357SAndroid Build Coastguard Worker     #endif
41*f6dc9357SAndroid Build Coastguard Worker     #ifndef USE_INTEL_VAES
42*f6dc9357SAndroid Build Coastguard Worker       #define Z7_USE_VAES_HW_STUB
43*f6dc9357SAndroid Build Coastguard Worker     #endif
44*f6dc9357SAndroid Build Coastguard Worker   #endif
45*f6dc9357SAndroid Build Coastguard Worker 
46*f6dc9357SAndroid Build Coastguard Worker     #ifndef USE_INTEL_AES
47*f6dc9357SAndroid Build Coastguard Worker       // #define Z7_USE_AES_HW_STUB // for debug
48*f6dc9357SAndroid Build Coastguard Worker     #endif
49*f6dc9357SAndroid Build Coastguard Worker     #ifndef USE_INTEL_VAES
50*f6dc9357SAndroid Build Coastguard Worker       // #define Z7_USE_VAES_HW_STUB // for debug
51*f6dc9357SAndroid Build Coastguard Worker     #endif
52*f6dc9357SAndroid Build Coastguard Worker 
53*f6dc9357SAndroid Build Coastguard Worker 
54*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_INTEL_AES
55*f6dc9357SAndroid Build Coastguard Worker 
56*f6dc9357SAndroid Build Coastguard Worker #include <wmmintrin.h>
57*f6dc9357SAndroid Build Coastguard Worker 
58*f6dc9357SAndroid Build Coastguard Worker #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
59*f6dc9357SAndroid Build Coastguard Worker #define AES_TYPE_keys UInt32
60*f6dc9357SAndroid Build Coastguard Worker #define AES_TYPE_data Byte
61*f6dc9357SAndroid Build Coastguard Worker // #define AES_TYPE_keys __m128i
62*f6dc9357SAndroid Build Coastguard Worker // #define AES_TYPE_data __m128i
63*f6dc9357SAndroid Build Coastguard Worker #endif
64*f6dc9357SAndroid Build Coastguard Worker 
65*f6dc9357SAndroid Build Coastguard Worker #ifndef ATTRIB_AES
66*f6dc9357SAndroid Build Coastguard Worker   #define ATTRIB_AES
67*f6dc9357SAndroid Build Coastguard Worker #endif
68*f6dc9357SAndroid Build Coastguard Worker 
69*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START(name) \
70*f6dc9357SAndroid Build Coastguard Worker     void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
71*f6dc9357SAndroid Build Coastguard Worker     // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)
72*f6dc9357SAndroid Build Coastguard Worker 
73*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START2(name) \
74*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name); \
75*f6dc9357SAndroid Build Coastguard Worker ATTRIB_AES \
76*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name)
77*f6dc9357SAndroid Build Coastguard Worker 
78*f6dc9357SAndroid Build Coastguard Worker #define MM_OP(op, dest, src)  dest = op(dest, src);
79*f6dc9357SAndroid Build Coastguard Worker #define MM_OP_m(op, src)      MM_OP(op, m, src)
80*f6dc9357SAndroid Build Coastguard Worker 
81*f6dc9357SAndroid Build Coastguard Worker #define MM_XOR( dest, src)    MM_OP(_mm_xor_si128,    dest, src)
82*f6dc9357SAndroid Build Coastguard Worker 
83*f6dc9357SAndroid Build Coastguard Worker #if 1
84*f6dc9357SAndroid Build Coastguard Worker // use aligned SSE load/store for data.
85*f6dc9357SAndroid Build Coastguard Worker // It is required for our Aes functions, that data is aligned for 16-bytes.
86*f6dc9357SAndroid Build Coastguard Worker // So we can use this branch of code.
87*f6dc9357SAndroid Build Coastguard Worker // and compiler can use fused load-op SSE instructions:
88*f6dc9357SAndroid Build Coastguard Worker //   xorps xmm0, XMMWORD PTR [rdx]
89*f6dc9357SAndroid Build Coastguard Worker #define LOAD_128(pp)        (*(__m128i *)(void *)(pp))
90*f6dc9357SAndroid Build Coastguard Worker #define STORE_128(pp, _v)    *(__m128i *)(void *)(pp) = _v
91*f6dc9357SAndroid Build Coastguard Worker // use aligned SSE load/store for data. Alternative code with direct access
92*f6dc9357SAndroid Build Coastguard Worker // #define LOAD_128(pp)        _mm_load_si128(pp)
93*f6dc9357SAndroid Build Coastguard Worker // #define STORE_128(pp, _v)   _mm_store_si128(pp, _v)
94*f6dc9357SAndroid Build Coastguard Worker #else
95*f6dc9357SAndroid Build Coastguard Worker // use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
96*f6dc9357SAndroid Build Coastguard Worker #define LOAD_128(pp)        _mm_loadu_si128(pp)
97*f6dc9357SAndroid Build Coastguard Worker #define STORE_128(pp, _v)   _mm_storeu_si128(pp, _v)
98*f6dc9357SAndroid Build Coastguard Worker #endif
99*f6dc9357SAndroid Build Coastguard Worker 
AES_FUNC_START2(AesCbc_Encode_HW)100*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Encode_HW)
101*f6dc9357SAndroid Build Coastguard Worker {
102*f6dc9357SAndroid Build Coastguard Worker   if (numBlocks == 0)
103*f6dc9357SAndroid Build Coastguard Worker     return;
104*f6dc9357SAndroid Build Coastguard Worker   {
105*f6dc9357SAndroid Build Coastguard Worker   __m128i *p = (__m128i *)(void *)ivAes;
106*f6dc9357SAndroid Build Coastguard Worker   __m128i *data = (__m128i *)(void *)data8;
107*f6dc9357SAndroid Build Coastguard Worker   __m128i m = *p;
108*f6dc9357SAndroid Build Coastguard Worker   const __m128i k0 = p[2];
109*f6dc9357SAndroid Build Coastguard Worker   const __m128i k1 = p[3];
110*f6dc9357SAndroid Build Coastguard Worker   const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
111*f6dc9357SAndroid Build Coastguard Worker   do
112*f6dc9357SAndroid Build Coastguard Worker   {
113*f6dc9357SAndroid Build Coastguard Worker     UInt32 r = numRounds2;
114*f6dc9357SAndroid Build Coastguard Worker     const __m128i *w = p + 4;
115*f6dc9357SAndroid Build Coastguard Worker     __m128i temp = LOAD_128(data);
116*f6dc9357SAndroid Build Coastguard Worker     MM_XOR (temp, k0)
117*f6dc9357SAndroid Build Coastguard Worker     MM_XOR (m, temp)
118*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesenc_si128, k1)
119*f6dc9357SAndroid Build Coastguard Worker     do
120*f6dc9357SAndroid Build Coastguard Worker     {
121*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesenc_si128, w[0])
122*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesenc_si128, w[1])
123*f6dc9357SAndroid Build Coastguard Worker       w += 2;
124*f6dc9357SAndroid Build Coastguard Worker     }
125*f6dc9357SAndroid Build Coastguard Worker     while (--r);
126*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesenclast_si128, w[0])
127*f6dc9357SAndroid Build Coastguard Worker     STORE_128(data, m);
128*f6dc9357SAndroid Build Coastguard Worker     data++;
129*f6dc9357SAndroid Build Coastguard Worker   }
130*f6dc9357SAndroid Build Coastguard Worker   while (--numBlocks);
131*f6dc9357SAndroid Build Coastguard Worker   *p = m;
132*f6dc9357SAndroid Build Coastguard Worker   }
133*f6dc9357SAndroid Build Coastguard Worker }
134*f6dc9357SAndroid Build Coastguard Worker 
135*f6dc9357SAndroid Build Coastguard Worker 
/*
WOP_k(op) expands to:  op (m1, 1)  op (m2, 2) ... op (m(k-1), k-1)
i.e. (op) applied to every lane except lane 0.
Lanes 4..7 exist only for MY_CPU_AMD64 builds.
*/
#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
  #define WOP_5(op)   WOP_4 (op)  op (m4, 4)
  #define WOP_6(op)   WOP_5 (op)  op (m5, 5)
  #define WOP_7(op)   WOP_6 (op)  op (m6, 6)
  #define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

/* NUM_WAYS: number of lanes handled by one iteration of the wide loop.
   WOP_M1  : applies (op) to lanes 1 .. NUM_WAYS - 1. */
#ifdef MY_CPU_AMD64
  #define NUM_WAYS  8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS  4
  #define WOP_M1    WOP_4
#endif

/* Applies (op) to all lanes 0 .. NUM_WAYS - 1. */
#define WOP(op)  op (m0, 0)  WOP_M1(op)

/* Per-lane operations for WOP(). They rely on a local pointer named (data). */
#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data_ii(ii)      LOAD_128(data + (ii))
#define LOAD_data(  reg, ii)  reg = LOAD_data_ii(ii);
#define STORE_data( reg, ii)  STORE_128(data + (ii), reg);
#if (NUM_WAYS > 1)
  /* reg ^= block that precedes block (ii) in the buffer */
  #define XOR_data_M1(reg, ii)  MM_XOR (reg, LOAD_128(data + (ii- 1)))
#endif

/* reg = op(reg, key);  (key) is the local declared by WOP_KEY(). */
#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)   MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)   MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)   MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)   MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)   MM_OP_key (_mm_xor_si128,        reg)

/* CTR_START: advance the local (ctr) by the local (one), then copy it to the lane.
   CTR_END  : XOR the lane into block (ii) of the buffer. */
#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  STORE_128(data + (ii), _mm_xor_si128(reg, \
                            LOAD_128 (data + (ii))));

/* Loads w[n] once into (key) and applies (op) to all lanes with that key. */
#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op) }

/*
WIDE_LOOP_START ... WIDE_LOOP_END enclose a body that consumes NUM_WAYS blocks
per iteration; the loop runs while at least NUM_WAYS blocks remain.
The macros use the locals (data), (dataEnd) and (numBlocks).
After WIDE_LOOP_END, (dataEnd) is the end of the buffer again.
*/
#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }

/* Header of the loop over the remaining blocks, one block per iteration. */
#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)
200*f6dc9357SAndroid Build Coastguard Worker 
201*f6dc9357SAndroid Build Coastguard Worker 
202*f6dc9357SAndroid Build Coastguard Worker 
#ifdef USE_INTEL_VAES

/* 256-bit counterparts of the 128-bit helper macros. */
#define AVX_XOR(dest, src)        MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;

#if 1
  /*
  Unaligned AVX load / store for data.
  The AES functions of this file require 16-byte alignment of data only,
  but here 32 bytes are accessed at once, so the unaligned intrinsics are used.
  Notes for _mm256_storeu_si256:
    msvc2022 : uses vmovdqu and keeps the order of the instruction sequence.
    gcc11    : uses vmovdqu.
    gcc9     : could use a pair of instructions:
                 vmovups        %xmm7, -224(%rax)
                 vextracti128   $0x1, %ymm7, -208(%rax)
  */
  #define AVX_LOAD(p)         _mm256_loadu_si256((const __m256i *)(const void *)(p))
  #define AVX_STORE(p, _v)    _mm256_storeu_si256((__m256i *)(void *)(p), _v);
#else
  /*
  Aligned AVX load / store for data.
  For debug: usable only if data is known to be 32-byte aligned.
    msvc2022 : still uses vmovdqu.
    gcc      : uses vmovdqa (requires 32-byte alignment).
  */
  #define AVX_LOAD(p)         (*(const __m256i *)(const void *)(p))
  #define AVX_STORE(p, _v)    (*(__m256i *)(void *)(p)) = _v;
#endif

/* Lane (ii) covers the two 16-byte blocks at 32-byte offset (ii) from (data). */
#define AVX_LOAD_data(  reg, ii)  reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
#define AVX_STORE_data( reg, ii)  AVX_STORE((__m256i *)(void *)data + (ii), reg)

/*
AVX_XOR_data_M1() always needs an unaligned load, even if (data) is aligned
for 256 bits: the 32-byte chunk that is read starts 16 bytes before the lane,
so it spans from (data - 16 bytes) to (data + 16 bytes).
*/
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))

#define AVX_AES_DEC(      reg, ii)   MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)   MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)   MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)   MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)   MM_OP_key (_mm256_xor_si256,         reg)

/* Advance the local (ctr2) by the local (two); the lane gets (ctr2 ^ key). */
#define AVX_CTR_START(reg, ii)  \
    MM_OP (_mm256_add_epi64, ctr2, two) \
    reg = _mm256_xor_si256(ctr2, key);

/* XOR the lane into the 32 bytes of lane (ii) of the buffer. */
#define AVX_CTR_END(reg, ii)  \
    AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
    AVX_LOAD ((__m256i *)(void *)data + (ii))));

/* Loads w[n] once into (key) and applies (op) to all lanes with that key. */
#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op) }

/* Capacity of the local (keys) array declared by WIDE_LOOP_START_AVX. */
#define NUM_AES_KEYS_MAX 15

/*
WIDE_LOOP_START_AVX ... WIDE_LOOP_END_AVX enclose a body that consumes
(NUM_WAYS * 2) 16-byte blocks per iteration.
Before the loop, (numRounds) 128-bit keys from (p) are broadcast into both
halves of the 256-bit entries of the local array (keys).
(OP) is inserted once before the key setup and once after the loop.
_mm256_zeroupper() is called when the wide path has been taken.
The macros use the locals (data), (dataEnd), (numBlocks), (numRounds), (p).
*/
#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX];  \
      OP  \
      { UInt32 ii; for (ii = 0; ii < numRounds; ii++)  \
        keys[ii] = _mm256_broadcastsi128_si256(p[ii]); }  \
      dataEnd -= NUM_WAYS * 2; \
      do {

#define WIDE_LOOP_END_AVX(OP)  \
        data += NUM_WAYS * 2;  \
      } while (data <= dataEnd);  \
      dataEnd += NUM_WAYS * 2;  \
      OP  \
      _mm256_zeroupper();  \
    }

/* MSVC for x86: even if _mm256_zeroupper() is not called and -arch:IA32 is not
   specified, MSVC can still insert a vzeroupper instruction. */

#endif
280*f6dc9357SAndroid Build Coastguard Worker 
281*f6dc9357SAndroid Build Coastguard Worker 
282*f6dc9357SAndroid Build Coastguard Worker 
AES_FUNC_START2(AesCbc_Decode_HW)283*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Decode_HW)
284*f6dc9357SAndroid Build Coastguard Worker {
285*f6dc9357SAndroid Build Coastguard Worker   __m128i *p = (__m128i *)(void *)ivAes;
286*f6dc9357SAndroid Build Coastguard Worker   __m128i *data = (__m128i *)(void *)data8;
287*f6dc9357SAndroid Build Coastguard Worker   __m128i iv = *p;
288*f6dc9357SAndroid Build Coastguard Worker   const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
289*f6dc9357SAndroid Build Coastguard Worker   const __m128i *dataEnd;
290*f6dc9357SAndroid Build Coastguard Worker   p += 2;
291*f6dc9357SAndroid Build Coastguard Worker 
292*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_START
293*f6dc9357SAndroid Build Coastguard Worker   {
294*f6dc9357SAndroid Build Coastguard Worker     const __m128i *w = wStart;
295*f6dc9357SAndroid Build Coastguard Worker     WOP (DECLARE_VAR)
296*f6dc9357SAndroid Build Coastguard Worker     WOP (LOAD_data)
297*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_XOR, 1)
298*f6dc9357SAndroid Build Coastguard Worker     do
299*f6dc9357SAndroid Build Coastguard Worker     {
300*f6dc9357SAndroid Build Coastguard Worker       WOP_KEY (AES_DEC, 0)
301*f6dc9357SAndroid Build Coastguard Worker 
302*f6dc9357SAndroid Build Coastguard Worker       w--;
303*f6dc9357SAndroid Build Coastguard Worker     }
304*f6dc9357SAndroid Build Coastguard Worker     while (w != p);
305*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_DEC_LAST, 0)
306*f6dc9357SAndroid Build Coastguard Worker 
307*f6dc9357SAndroid Build Coastguard Worker     MM_XOR (m0, iv)
308*f6dc9357SAndroid Build Coastguard Worker     WOP_M1 (XOR_data_M1)
309*f6dc9357SAndroid Build Coastguard Worker     LOAD_data(iv, NUM_WAYS - 1)
310*f6dc9357SAndroid Build Coastguard Worker     WOP (STORE_data)
311*f6dc9357SAndroid Build Coastguard Worker   }
312*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_END
313*f6dc9357SAndroid Build Coastguard Worker 
314*f6dc9357SAndroid Build Coastguard Worker   SINGLE_LOOP
315*f6dc9357SAndroid Build Coastguard Worker   {
316*f6dc9357SAndroid Build Coastguard Worker     const __m128i *w = wStart - 1;
317*f6dc9357SAndroid Build Coastguard Worker     __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
318*f6dc9357SAndroid Build Coastguard Worker 
319*f6dc9357SAndroid Build Coastguard Worker     do
320*f6dc9357SAndroid Build Coastguard Worker     {
321*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesdec_si128, w[1])
322*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesdec_si128, w[0])
323*f6dc9357SAndroid Build Coastguard Worker       w -= 2;
324*f6dc9357SAndroid Build Coastguard Worker     }
325*f6dc9357SAndroid Build Coastguard Worker     while (w != p);
326*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesdec_si128,     w[1])
327*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesdeclast_si128, w[0])
328*f6dc9357SAndroid Build Coastguard Worker     MM_XOR (m, iv)
329*f6dc9357SAndroid Build Coastguard Worker     LOAD_data(iv, 0)
330*f6dc9357SAndroid Build Coastguard Worker     STORE_data(m, 0)
331*f6dc9357SAndroid Build Coastguard Worker   }
332*f6dc9357SAndroid Build Coastguard Worker 
333*f6dc9357SAndroid Build Coastguard Worker   p[-2] = iv;
334*f6dc9357SAndroid Build Coastguard Worker }
335*f6dc9357SAndroid Build Coastguard Worker 
336*f6dc9357SAndroid Build Coastguard Worker 
AES_FUNC_START2(AesCtr_Code_HW)337*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCtr_Code_HW)
338*f6dc9357SAndroid Build Coastguard Worker {
339*f6dc9357SAndroid Build Coastguard Worker   __m128i *p = (__m128i *)(void *)ivAes;
340*f6dc9357SAndroid Build Coastguard Worker   __m128i *data = (__m128i *)(void *)data8;
341*f6dc9357SAndroid Build Coastguard Worker   __m128i ctr = *p;
342*f6dc9357SAndroid Build Coastguard Worker   const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
343*f6dc9357SAndroid Build Coastguard Worker   const __m128i *dataEnd;
344*f6dc9357SAndroid Build Coastguard Worker   const __m128i one = _mm_cvtsi32_si128(1);
345*f6dc9357SAndroid Build Coastguard Worker 
346*f6dc9357SAndroid Build Coastguard Worker   p += 2;
347*f6dc9357SAndroid Build Coastguard Worker 
348*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_START
349*f6dc9357SAndroid Build Coastguard Worker   {
350*f6dc9357SAndroid Build Coastguard Worker     const __m128i *w = p;
351*f6dc9357SAndroid Build Coastguard Worker     UInt32 r = numRoundsMinus2;
352*f6dc9357SAndroid Build Coastguard Worker     WOP (DECLARE_VAR)
353*f6dc9357SAndroid Build Coastguard Worker     WOP (CTR_START)
354*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_XOR, 0)
355*f6dc9357SAndroid Build Coastguard Worker     w += 1;
356*f6dc9357SAndroid Build Coastguard Worker     do
357*f6dc9357SAndroid Build Coastguard Worker     {
358*f6dc9357SAndroid Build Coastguard Worker       WOP_KEY (AES_ENC, 0)
359*f6dc9357SAndroid Build Coastguard Worker       w += 1;
360*f6dc9357SAndroid Build Coastguard Worker     }
361*f6dc9357SAndroid Build Coastguard Worker     while (--r);
362*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_ENC_LAST, 0)
363*f6dc9357SAndroid Build Coastguard Worker     WOP (CTR_END)
364*f6dc9357SAndroid Build Coastguard Worker   }
365*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_END
366*f6dc9357SAndroid Build Coastguard Worker 
367*f6dc9357SAndroid Build Coastguard Worker   SINGLE_LOOP
368*f6dc9357SAndroid Build Coastguard Worker   {
369*f6dc9357SAndroid Build Coastguard Worker     UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
370*f6dc9357SAndroid Build Coastguard Worker     const __m128i *w = p;
371*f6dc9357SAndroid Build Coastguard Worker     __m128i m;
372*f6dc9357SAndroid Build Coastguard Worker     MM_OP (_mm_add_epi64, ctr, one)
373*f6dc9357SAndroid Build Coastguard Worker     m = _mm_xor_si128 (ctr, p[0]);
374*f6dc9357SAndroid Build Coastguard Worker     w += 1;
375*f6dc9357SAndroid Build Coastguard Worker     do
376*f6dc9357SAndroid Build Coastguard Worker     {
377*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesenc_si128, w[0])
378*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesenc_si128, w[1])
379*f6dc9357SAndroid Build Coastguard Worker       w += 2;
380*f6dc9357SAndroid Build Coastguard Worker     }
381*f6dc9357SAndroid Build Coastguard Worker     while (--numRounds2);
382*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesenc_si128,     w[0])
383*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesenclast_si128, w[1])
384*f6dc9357SAndroid Build Coastguard Worker     CTR_END (m, 0)
385*f6dc9357SAndroid Build Coastguard Worker   }
386*f6dc9357SAndroid Build Coastguard Worker 
387*f6dc9357SAndroid Build Coastguard Worker   p[-2] = ctr;
388*f6dc9357SAndroid Build Coastguard Worker }
389*f6dc9357SAndroid Build Coastguard Worker 
390*f6dc9357SAndroid Build Coastguard Worker 
391*f6dc9357SAndroid Build Coastguard Worker 
392*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_INTEL_VAES
393*f6dc9357SAndroid Build Coastguard Worker 
394*f6dc9357SAndroid Build Coastguard Worker /*
395*f6dc9357SAndroid Build Coastguard Worker GCC before 2013-Jun:
396*f6dc9357SAndroid Build Coastguard Worker   <immintrin.h>:
397*f6dc9357SAndroid Build Coastguard Worker     #ifdef __AVX__
398*f6dc9357SAndroid Build Coastguard Worker      #include <avxintrin.h>
399*f6dc9357SAndroid Build Coastguard Worker     #endif
400*f6dc9357SAndroid Build Coastguard Worker GCC after 2013-Jun:
401*f6dc9357SAndroid Build Coastguard Worker   <immintrin.h>:
402*f6dc9357SAndroid Build Coastguard Worker     #include <avxintrin.h>
403*f6dc9357SAndroid Build Coastguard Worker CLANG 3.8+:
404*f6dc9357SAndroid Build Coastguard Worker {
405*f6dc9357SAndroid Build Coastguard Worker   <immintrin.h>:
406*f6dc9357SAndroid Build Coastguard Worker     #if !defined(_MSC_VER) || defined(__AVX__)
407*f6dc9357SAndroid Build Coastguard Worker       #include <avxintrin.h>
408*f6dc9357SAndroid Build Coastguard Worker     #endif
409*f6dc9357SAndroid Build Coastguard Worker 
410*f6dc9357SAndroid Build Coastguard Worker   if (the compiler is clang for Windows and if global arch is not set for __AVX__)
411*f6dc9357SAndroid Build Coastguard Worker     [ if (defined(_MSC_VER) && !defined(__AVX__)) ]
412*f6dc9357SAndroid Build Coastguard Worker   {
413*f6dc9357SAndroid Build Coastguard Worker     <immintrin.h> doesn't include <avxintrin.h>
414*f6dc9357SAndroid Build Coastguard Worker     and we have 2 ways to fix it:
415*f6dc9357SAndroid Build Coastguard Worker       1) we can define required __AVX__ before <immintrin.h>
416*f6dc9357SAndroid Build Coastguard Worker       or
417*f6dc9357SAndroid Build Coastguard Worker       2) we can include <avxintrin.h> after <immintrin.h>
418*f6dc9357SAndroid Build Coastguard Worker   }
419*f6dc9357SAndroid Build Coastguard Worker }
420*f6dc9357SAndroid Build Coastguard Worker 
421*f6dc9357SAndroid Build Coastguard Worker If we include <avxintrin.h> manually for GCC/CLANG, it's
422*f6dc9357SAndroid Build Coastguard Worker required that <immintrin.h> must be included before <avxintrin.h>.
423*f6dc9357SAndroid Build Coastguard Worker */
424*f6dc9357SAndroid Build Coastguard Worker 
425*f6dc9357SAndroid Build Coastguard Worker /*
426*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && defined(_MSC_VER)
427*f6dc9357SAndroid Build Coastguard Worker #define __AVX__
428*f6dc9357SAndroid Build Coastguard Worker #define __AVX2__
429*f6dc9357SAndroid Build Coastguard Worker #define __VAES__
430*f6dc9357SAndroid Build Coastguard Worker #endif
431*f6dc9357SAndroid Build Coastguard Worker */
432*f6dc9357SAndroid Build Coastguard Worker 
433*f6dc9357SAndroid Build Coastguard Worker #include <immintrin.h>
434*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && defined(_MSC_VER)
435*f6dc9357SAndroid Build Coastguard Worker   #if !defined(__AVX__)
436*f6dc9357SAndroid Build Coastguard Worker     #include <avxintrin.h>
437*f6dc9357SAndroid Build Coastguard Worker   #endif
438*f6dc9357SAndroid Build Coastguard Worker   #if !defined(__AVX2__)
439*f6dc9357SAndroid Build Coastguard Worker     #include <avx2intrin.h>
440*f6dc9357SAndroid Build Coastguard Worker   #endif
441*f6dc9357SAndroid Build Coastguard Worker   #if !defined(__VAES__)
442*f6dc9357SAndroid Build Coastguard Worker     #include <vaesintrin.h>
443*f6dc9357SAndroid Build Coastguard Worker   #endif
444*f6dc9357SAndroid Build Coastguard Worker #endif  // __clang__ && _MSC_VER
445*f6dc9357SAndroid Build Coastguard Worker 
// ATTRIB_VAES expands to nothing unless it was already defined earlier.
446*f6dc9357SAndroid Build Coastguard Worker #ifndef ATTRIB_VAES
447*f6dc9357SAndroid Build Coastguard Worker   #define ATTRIB_VAES
448*f6dc9357SAndroid Build Coastguard Worker #endif
449*f6dc9357SAndroid Build Coastguard Worker 
// VAES_FUNC_START2(name) expands to a prototype for `name` (via AES_FUNC_START)
// followed by ATTRIB_VAES and the header of the definition of `name`.
// The function body is written directly after the macro invocation.
450*f6dc9357SAndroid Build Coastguard Worker #define VAES_FUNC_START2(name) \
451*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name); \
452*f6dc9357SAndroid Build Coastguard Worker ATTRIB_VAES \
453*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name)
454*f6dc9357SAndroid Build Coastguard Worker 
// AesCbc_Decode_HW_256: AES-CBC decryption with a wide loop that works on
// __m256i values, followed by a loop that handles the remaining blocks one
// 16-byte block at a time.
// The names ivAes and data8 come from AES_FUNC_START, which for this branch
// is defined outside this part of the file.
// Layout read through ivAes, in 16-byte units: [0] is the chaining value (IV),
// [1] begins with a UInt32 round parameter n, and the round keys start at [2].
// The chaining value is written back to ivAes[0] before returning.
VAES_FUNC_START2(AesCbc_Decode_HW_256)455*f6dc9357SAndroid Build Coastguard Worker VAES_FUNC_START2 (AesCbc_Decode_HW_256)
456*f6dc9357SAndroid Build Coastguard Worker {
457*f6dc9357SAndroid Build Coastguard Worker   __m128i *p = (__m128i *)(void *)ivAes;
458*f6dc9357SAndroid Build Coastguard Worker   __m128i *data = (__m128i *)(void *)data8;
459*f6dc9357SAndroid Build Coastguard Worker   __m128i iv = *p;
460*f6dc9357SAndroid Build Coastguard Worker   const __m128i *dataEnd;
  // numRounds = 2 * n + 1, where n is the UInt32 stored at the start of p[1].
461*f6dc9357SAndroid Build Coastguard Worker   const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  // From here on p points at the first round key; the IV slot is p[-2].
462*f6dc9357SAndroid Build Coastguard Worker   p += 2;
463*f6dc9357SAndroid Build Coastguard Worker 
  // NOTE(review): WIDE_LOOP_START_AVX / WIDE_LOOP_END_AVX, `keys`, WOP, WOP_M1,
  // AVX_WOP_KEY and the AVX_* per-register helpers are defined outside this
  // chunk. Presumably `keys` holds the round keys widened to __m256i and
  // AVX_WOP_KEY(op, k) applies `op` with w[k] to every register - confirm.
464*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_START_AVX(;)
465*f6dc9357SAndroid Build Coastguard Worker   {
    // Start near the end of `keys` and walk back toward keys[0].
466*f6dc9357SAndroid Build Coastguard Worker     const __m256i *w = keys + numRounds - 2;
467*f6dc9357SAndroid Build Coastguard Worker 
468*f6dc9357SAndroid Build Coastguard Worker     WOP (AVX_DECLARE_VAR)
469*f6dc9357SAndroid Build Coastguard Worker     WOP (AVX_LOAD_data)
470*f6dc9357SAndroid Build Coastguard Worker     AVX_WOP_KEY (AVX_AES_XOR, 1)
471*f6dc9357SAndroid Build Coastguard Worker 
472*f6dc9357SAndroid Build Coastguard Worker     do
473*f6dc9357SAndroid Build Coastguard Worker     {
474*f6dc9357SAndroid Build Coastguard Worker       AVX_WOP_KEY (AVX_AES_DEC, 0)
475*f6dc9357SAndroid Build Coastguard Worker       w--;
476*f6dc9357SAndroid Build Coastguard Worker     }
477*f6dc9357SAndroid Build Coastguard Worker     while (w != keys);
478*f6dc9357SAndroid Build Coastguard Worker     AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
479*f6dc9357SAndroid Build Coastguard Worker 
    // CBC chaining: the first register is combined with (iv, block 0); the
    // remaining registers are handled by AVX_XOR_data_M1.
    // iv is reloaded from block index NUM_WAYS * 2 - 1 before the stores below,
    // so it is taken from the input data rather than from the decoded output.
480*f6dc9357SAndroid Build Coastguard Worker     AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
481*f6dc9357SAndroid Build Coastguard Worker     WOP_M1 (AVX_XOR_data_M1)
482*f6dc9357SAndroid Build Coastguard Worker     LOAD_data (iv, NUM_WAYS * 2 - 1)
483*f6dc9357SAndroid Build Coastguard Worker     WOP (AVX_STORE_data)
484*f6dc9357SAndroid Build Coastguard Worker   }
485*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_END_AVX(;)
486*f6dc9357SAndroid Build Coastguard Worker 
  // Remaining blocks, one at a time, using the 128-bit keys addressed from p.
487*f6dc9357SAndroid Build Coastguard Worker   SINGLE_LOOP
488*f6dc9357SAndroid Build Coastguard Worker   {
    // (p + 1 - 2) is the slot right after the IV, i.e. the same UInt32 n that
    // numRounds was derived from. w = p + 2 * n - 2, so w[2] is key p[2 * n].
489*f6dc9357SAndroid Build Coastguard Worker     const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
490*f6dc9357SAndroid Build Coastguard Worker     __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
    // Two decryption rounds per iteration, stepping w down until it reaches p.
491*f6dc9357SAndroid Build Coastguard Worker     do
492*f6dc9357SAndroid Build Coastguard Worker     {
493*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesdec_si128, w[1])
494*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesdec_si128, w[0])
495*f6dc9357SAndroid Build Coastguard Worker       w -= 2;
496*f6dc9357SAndroid Build Coastguard Worker     }
497*f6dc9357SAndroid Build Coastguard Worker     while (w != p);
    // Here w == p: one more regular round with p[1], then the last round with p[0].
498*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesdec_si128,     w[1])
499*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesdeclast_si128, w[0])
500*f6dc9357SAndroid Build Coastguard Worker 
    // CBC chaining: combine with the previous chaining value, then take the
    // current input block as the next chaining value before it is overwritten.
    // NOTE(review): MM_XOR, LOAD_data and STORE_data for this branch are
    // defined outside this chunk - confirm they operate on data[ii].
501*f6dc9357SAndroid Build Coastguard Worker     MM_XOR (m, iv)
502*f6dc9357SAndroid Build Coastguard Worker     LOAD_data(iv, 0)
503*f6dc9357SAndroid Build Coastguard Worker     STORE_data(m, 0)
504*f6dc9357SAndroid Build Coastguard Worker   }
505*f6dc9357SAndroid Build Coastguard Worker 
  // Save the chaining value back into the IV slot.
506*f6dc9357SAndroid Build Coastguard Worker   p[-2] = iv;
507*f6dc9357SAndroid Build Coastguard Worker }
508*f6dc9357SAndroid Build Coastguard Worker 
509*f6dc9357SAndroid Build Coastguard Worker 
510*f6dc9357SAndroid Build Coastguard Worker /*
511*f6dc9357SAndroid Build Coastguard Worker SSE2: _mm_cvtsi32_si128 : movd
512*f6dc9357SAndroid Build Coastguard Worker AVX:  _mm256_setr_m128i            : vinsertf128
513*f6dc9357SAndroid Build Coastguard Worker AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
514*f6dc9357SAndroid Build Coastguard Worker       _mm256_extracti128_si256     : vextracti128
515*f6dc9357SAndroid Build Coastguard Worker       _mm256_broadcastsi128_si256  : vbroadcasti128
516*f6dc9357SAndroid Build Coastguard Worker */
517*f6dc9357SAndroid Build Coastguard Worker 
// AVX_CTR_LOOP_START: prepares the 256-bit counter state from the 128-bit one.
//   ctr2 = (ctr - one) in the low 128 bits and ctr in the high 128 bits,
//   two  = (one, one) + (one, one), added per 64-bit lane.
// Expects `ctr`, `one`, `ctr2` and `two` to be declared by the caller.
518*f6dc9357SAndroid Build Coastguard Worker #define AVX_CTR_LOOP_START  \
519*f6dc9357SAndroid Build Coastguard Worker     ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
520*f6dc9357SAndroid Build Coastguard Worker     two = _mm256_setr_m128i(one, one); \
521*f6dc9357SAndroid Build Coastguard Worker     two = _mm256_add_epi64(two, two); \
522*f6dc9357SAndroid Build Coastguard Worker 
523*f6dc9357SAndroid Build Coastguard Worker // two = _mm256_setr_epi64x(2, 0, 2, 0);
524*f6dc9357SAndroid Build Coastguard Worker 
// AVX_CTR_LOOP_ENC: copies the high 128 bits of ctr2 back into `ctr`.
525*f6dc9357SAndroid Build Coastguard Worker #define AVX_CTR_LOOP_ENC  \
526*f6dc9357SAndroid Build Coastguard Worker     ctr = _mm256_extracti128_si256 (ctr2, 1); \
527*f6dc9357SAndroid Build Coastguard Worker 
// AesCtr_Code_HW_256: AES-CTR processing with a wide loop that works on
// __m256i values, followed by a loop that handles the remaining blocks one
// 16-byte block at a time.
// The names ivAes and data8 come from AES_FUNC_START, which for this branch
// is defined outside this part of the file.
// Layout read through ivAes, in 16-byte units: [0] is the counter block,
// [1] begins with a UInt32 round parameter n, and the round keys start at [2].
// The updated counter is written back to ivAes[0] before returning.
VAES_FUNC_START2(AesCtr_Code_HW_256)528*f6dc9357SAndroid Build Coastguard Worker VAES_FUNC_START2 (AesCtr_Code_HW_256)
529*f6dc9357SAndroid Build Coastguard Worker {
530*f6dc9357SAndroid Build Coastguard Worker   __m128i *p = (__m128i *)(void *)ivAes;
531*f6dc9357SAndroid Build Coastguard Worker   __m128i *data = (__m128i *)(void *)data8;
532*f6dc9357SAndroid Build Coastguard Worker   __m128i ctr = *p;
  // numRounds = 2 * n + 1, where n is the UInt32 stored at the start of p[1].
533*f6dc9357SAndroid Build Coastguard Worker   const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
534*f6dc9357SAndroid Build Coastguard Worker   const __m128i *dataEnd;
  // `one` has the value 1 in its low 32 bits and zero elsewhere, so the 64-bit
  // lane additions below only change the low 64-bit lane of the counter.
535*f6dc9357SAndroid Build Coastguard Worker   const __m128i one = _mm_cvtsi32_si128(1);
536*f6dc9357SAndroid Build Coastguard Worker   __m256i ctr2, two;
  // From here on p points at the first round key; the counter slot is p[-2].
537*f6dc9357SAndroid Build Coastguard Worker   p += 2;
538*f6dc9357SAndroid Build Coastguard Worker 
  // NOTE(review): WIDE_LOOP_START_AVX / WIDE_LOOP_END_AVX, `keys`, WOP,
  // AVX_WOP_KEY and the AVX_* per-register helpers are defined outside this
  // chunk. Presumably the loop macros run their argument (AVX_CTR_LOOP_START /
  // AVX_CTR_LOOP_ENC) before and after the wide loop - confirm.
539*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
540*f6dc9357SAndroid Build Coastguard Worker   {
541*f6dc9357SAndroid Build Coastguard Worker     const __m256i *w = keys;
    // r counts the AVX_AES_ENC steps between the start step and the last step.
542*f6dc9357SAndroid Build Coastguard Worker     UInt32 r = numRounds - 2;
543*f6dc9357SAndroid Build Coastguard Worker     WOP (AVX_DECLARE_VAR)
544*f6dc9357SAndroid Build Coastguard Worker     AVX_WOP_KEY (AVX_CTR_START, 0)
545*f6dc9357SAndroid Build Coastguard Worker 
546*f6dc9357SAndroid Build Coastguard Worker     w += 1;
547*f6dc9357SAndroid Build Coastguard Worker     do
548*f6dc9357SAndroid Build Coastguard Worker     {
549*f6dc9357SAndroid Build Coastguard Worker       AVX_WOP_KEY (AVX_AES_ENC, 0)
550*f6dc9357SAndroid Build Coastguard Worker       w += 1;
551*f6dc9357SAndroid Build Coastguard Worker     }
552*f6dc9357SAndroid Build Coastguard Worker     while (--r);
553*f6dc9357SAndroid Build Coastguard Worker     AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)
554*f6dc9357SAndroid Build Coastguard Worker 
555*f6dc9357SAndroid Build Coastguard Worker     WOP (AVX_CTR_END)
556*f6dc9357SAndroid Build Coastguard Worker   }
557*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)
558*f6dc9357SAndroid Build Coastguard Worker 
  // Remaining blocks, one at a time, using the 128-bit keys addressed from p.
559*f6dc9357SAndroid Build Coastguard Worker   SINGLE_LOOP
560*f6dc9357SAndroid Build Coastguard Worker   {
    // (p - 2 + 1) is the slot right after the counter, i.e. the same UInt32 n
    // that numRounds was derived from; the loop below runs n - 1 times.
561*f6dc9357SAndroid Build Coastguard Worker     UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
562*f6dc9357SAndroid Build Coastguard Worker     const __m128i *w = p;
563*f6dc9357SAndroid Build Coastguard Worker     __m128i m;
    // Increment the counter first, then XOR it with the first round key p[0].
564*f6dc9357SAndroid Build Coastguard Worker     MM_OP (_mm_add_epi64, ctr, one)
565*f6dc9357SAndroid Build Coastguard Worker     m = _mm_xor_si128 (ctr, p[0]);
566*f6dc9357SAndroid Build Coastguard Worker     w += 1;
    // Two encryption rounds per iteration.
567*f6dc9357SAndroid Build Coastguard Worker     do
568*f6dc9357SAndroid Build Coastguard Worker     {
569*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesenc_si128, w[0])
570*f6dc9357SAndroid Build Coastguard Worker       MM_OP_m (_mm_aesenc_si128, w[1])
571*f6dc9357SAndroid Build Coastguard Worker       w += 2;
572*f6dc9357SAndroid Build Coastguard Worker     }
573*f6dc9357SAndroid Build Coastguard Worker     while (--numRounds2);
    // One more regular round, then the last round.
574*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesenc_si128,     w[0])
575*f6dc9357SAndroid Build Coastguard Worker     MM_OP_m (_mm_aesenclast_si128, w[1])
    // NOTE(review): CTR_END for this branch is defined outside this chunk;
    // presumably it XORs the encrypted counter m into data[0] - confirm.
576*f6dc9357SAndroid Build Coastguard Worker     CTR_END (m, 0)
577*f6dc9357SAndroid Build Coastguard Worker   }
578*f6dc9357SAndroid Build Coastguard Worker 
  // Save the updated counter back into its slot.
579*f6dc9357SAndroid Build Coastguard Worker   p[-2] = ctr;
580*f6dc9357SAndroid Build Coastguard Worker }
581*f6dc9357SAndroid Build Coastguard Worker 
582*f6dc9357SAndroid Build Coastguard Worker #endif // USE_INTEL_VAES
583*f6dc9357SAndroid Build Coastguard Worker 
584*f6dc9357SAndroid Build Coastguard Worker #else // USE_INTEL_AES
585*f6dc9357SAndroid Build Coastguard Worker 
586*f6dc9357SAndroid Build Coastguard Worker /* no USE_INTEL_AES */
587*f6dc9357SAndroid Build Coastguard Worker 
// Stub section, used when USE_INTEL_AES is not defined and the build defines
// Z7_USE_AES_HW_STUB: each *_HW entry point simply forwards to the function
// of the same name without the _HW suffix.
588*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_USE_AES_HW_STUB)
589*f6dc9357SAndroid Build Coastguard Worker // We can compile this file with another C compiler,
590*f6dc9357SAndroid Build Coastguard Worker // or we can compile asm version.
591*f6dc9357SAndroid Build Coastguard Worker // So we can generate real code instead of this stub function.
592*f6dc9357SAndroid Build Coastguard Worker // #if defined(_MSC_VER)
593*f6dc9357SAndroid Build Coastguard Worker #pragma message("AES  HW_SW stub was used")
594*f6dc9357SAndroid Build Coastguard Worker // #endif
595*f6dc9357SAndroid Build Coastguard Worker 
// Parameter types used by the casts inside VAES_COMPAT_STUB further down.
596*f6dc9357SAndroid Build Coastguard Worker #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
597*f6dc9357SAndroid Build Coastguard Worker #define AES_TYPE_keys UInt32
598*f6dc9357SAndroid Build Coastguard Worker #define AES_TYPE_data Byte
599*f6dc9357SAndroid Build Coastguard Worker #endif
600*f6dc9357SAndroid Build Coastguard Worker 
// Function header shared by the prototype and the definition in the stub.
601*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START(name) \
602*f6dc9357SAndroid Build Coastguard Worker     void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \
603*f6dc9357SAndroid Build Coastguard Worker 
// AES_COMPAT_STUB(name): declares `name` and defines `name_HW`, whose body
// calls name(p, data, numBlocks) with the arguments unchanged.
604*f6dc9357SAndroid Build Coastguard Worker #define AES_COMPAT_STUB(name) \
605*f6dc9357SAndroid Build Coastguard Worker     AES_FUNC_START(name); \
606*f6dc9357SAndroid Build Coastguard Worker     AES_FUNC_START(name ## _HW) \
607*f6dc9357SAndroid Build Coastguard Worker     { name(p, data, numBlocks); }
608*f6dc9357SAndroid Build Coastguard Worker 
609*f6dc9357SAndroid Build Coastguard Worker AES_COMPAT_STUB (AesCbc_Encode)
610*f6dc9357SAndroid Build Coastguard Worker AES_COMPAT_STUB (AesCbc_Decode)
611*f6dc9357SAndroid Build Coastguard Worker AES_COMPAT_STUB (AesCtr_Code)
612*f6dc9357SAndroid Build Coastguard Worker #endif // Z7_USE_AES_HW_STUB
613*f6dc9357SAndroid Build Coastguard Worker 
614*f6dc9357SAndroid Build Coastguard Worker #endif // USE_INTEL_AES
615*f6dc9357SAndroid Build Coastguard Worker 
616*f6dc9357SAndroid Build Coastguard Worker 
// When the VAES code is not compiled (USE_INTEL_VAES undefined) and the build
// defines Z7_USE_VAES_HW_STUB, the *_256 entry points are provided as
// wrappers that forward to the corresponding non-256 *_HW functions.
617*f6dc9357SAndroid Build Coastguard Worker #ifndef USE_INTEL_VAES
618*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_USE_VAES_HW_STUB)
619*f6dc9357SAndroid Build Coastguard Worker // #if defined(_MSC_VER)
620*f6dc9357SAndroid Build Coastguard Worker #pragma message("VAES HW_SW stub was used")
621*f6dc9357SAndroid Build Coastguard Worker // #endif
622*f6dc9357SAndroid Build Coastguard Worker 
// VAES_COMPAT_STUB(name): declares and defines `name_256`, which calls `name`
// after casting p and data to AES_TYPE_keys * and AES_TYPE_data *.
623*f6dc9357SAndroid Build Coastguard Worker #define VAES_COMPAT_STUB(name) \
624*f6dc9357SAndroid Build Coastguard Worker     void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
625*f6dc9357SAndroid Build Coastguard Worker     void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
626*f6dc9357SAndroid Build Coastguard Worker     { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }
627*f6dc9357SAndroid Build Coastguard Worker 
628*f6dc9357SAndroid Build Coastguard Worker VAES_COMPAT_STUB (AesCbc_Decode_HW)
629*f6dc9357SAndroid Build Coastguard Worker VAES_COMPAT_STUB (AesCtr_Code_HW)
630*f6dc9357SAndroid Build Coastguard Worker #endif
631*f6dc9357SAndroid Build Coastguard Worker #endif // ! USE_INTEL_VAES
632*f6dc9357SAndroid Build Coastguard Worker 
633*f6dc9357SAndroid Build Coastguard Worker 
634*f6dc9357SAndroid Build Coastguard Worker 
635*f6dc9357SAndroid Build Coastguard Worker 
636*f6dc9357SAndroid Build Coastguard Worker #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
637*f6dc9357SAndroid Build Coastguard Worker 
// Decide whether to define USE_HW_AES for ARM / ARM64.
// It is defined directly when the compiler predefines __ARM_FEATURE_AES or
// __ARM_FEATURE_CRYPTO. Otherwise all three nested conditions must hold:
//   1) ARM64, or __ARM_ARCH >= 4, or Z7_MSC_VER_ORIGINAL is defined;
//   2) __ARM_FP together with clang >= 3.8 or GCC >= 6,
//      or Z7_MSC_VER_ORIGINAL with _MSC_VER >= 1910;
//   3) ARM64, or a non-clang compiler, or __ARM_NEON with a clang version
//      outside the range 170000..170001 (presumably because of the
//      clang-17.0.1 error quoted in a comment later in this file).
638*f6dc9357SAndroid Build Coastguard Worker   #if   defined(__ARM_FEATURE_AES) \
639*f6dc9357SAndroid Build Coastguard Worker      || defined(__ARM_FEATURE_CRYPTO)
640*f6dc9357SAndroid Build Coastguard Worker     #define USE_HW_AES
641*f6dc9357SAndroid Build Coastguard Worker   #else
642*f6dc9357SAndroid Build Coastguard Worker     #if  defined(MY_CPU_ARM64) \
643*f6dc9357SAndroid Build Coastguard Worker       || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
644*f6dc9357SAndroid Build Coastguard Worker       || defined(Z7_MSC_VER_ORIGINAL)
645*f6dc9357SAndroid Build Coastguard Worker     #if  defined(__ARM_FP) && \
646*f6dc9357SAndroid Build Coastguard Worker           (   defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
647*f6dc9357SAndroid Build Coastguard Worker            || defined(__GNUC__) && (__GNUC__ >= 6) \
648*f6dc9357SAndroid Build Coastguard Worker           ) \
649*f6dc9357SAndroid Build Coastguard Worker       || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
650*f6dc9357SAndroid Build Coastguard Worker     #if  defined(MY_CPU_ARM64) \
651*f6dc9357SAndroid Build Coastguard Worker       || !defined(Z7_CLANG_VERSION) \
652*f6dc9357SAndroid Build Coastguard Worker       || defined(__ARM_NEON) && \
653*f6dc9357SAndroid Build Coastguard Worker           (Z7_CLANG_VERSION < 170000 || \
654*f6dc9357SAndroid Build Coastguard Worker            Z7_CLANG_VERSION > 170001)
655*f6dc9357SAndroid Build Coastguard Worker       #define USE_HW_AES
656*f6dc9357SAndroid Build Coastguard Worker     #endif
657*f6dc9357SAndroid Build Coastguard Worker     #endif
658*f6dc9357SAndroid Build Coastguard Worker     #endif
659*f6dc9357SAndroid Build Coastguard Worker   #endif
660*f6dc9357SAndroid Build Coastguard Worker 
661*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_HW_AES
662*f6dc9357SAndroid Build Coastguard Worker 
663*f6dc9357SAndroid Build Coastguard Worker // #pragma message("=== AES HW === ")
664*f6dc9357SAndroid Build Coastguard Worker // __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES
665*f6dc9357SAndroid Build Coastguard Worker 
// Select ATTRIB_AES, the function attribute placed before each AES function
// by AES_FUNC_START2.
// For clang / GCC, when neither __ARM_FEATURE_AES nor __ARM_FEATURE_CRYPTO is
// predefined, a per-function __target__ attribute is used; the target string
// differs between ARM64 and 32-bit ARM and between clang and GCC.
// For other compilers (the existing comment below names _MSC_VER, arm32),
// _ARM_USE_NEW_NEON_INTRINSICS is defined instead.
666*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) || defined(__GNUC__)
667*f6dc9357SAndroid Build Coastguard Worker #if !defined(__ARM_FEATURE_AES) && \
668*f6dc9357SAndroid Build Coastguard Worker     !defined(__ARM_FEATURE_CRYPTO)
669*f6dc9357SAndroid Build Coastguard Worker   #ifdef MY_CPU_ARM64
670*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__)
671*f6dc9357SAndroid Build Coastguard Worker     #define ATTRIB_AES __attribute__((__target__("crypto")))
672*f6dc9357SAndroid Build Coastguard Worker #else
673*f6dc9357SAndroid Build Coastguard Worker     #define ATTRIB_AES __attribute__((__target__("+crypto")))
674*f6dc9357SAndroid Build Coastguard Worker #endif
675*f6dc9357SAndroid Build Coastguard Worker   #else
676*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__)
677*f6dc9357SAndroid Build Coastguard Worker     #define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
678*f6dc9357SAndroid Build Coastguard Worker #else
679*f6dc9357SAndroid Build Coastguard Worker     #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
680*f6dc9357SAndroid Build Coastguard Worker #endif
681*f6dc9357SAndroid Build Coastguard Worker   #endif
682*f6dc9357SAndroid Build Coastguard Worker #endif
683*f6dc9357SAndroid Build Coastguard Worker #else
684*f6dc9357SAndroid Build Coastguard Worker   // _MSC_VER
685*f6dc9357SAndroid Build Coastguard Worker   // for arm32
686*f6dc9357SAndroid Build Coastguard Worker   #define _ARM_USE_NEW_NEON_INTRINSICS
687*f6dc9357SAndroid Build Coastguard Worker #endif
688*f6dc9357SAndroid Build Coastguard Worker 
// Fall back to an empty attribute when no target attribute was selected above.
689*f6dc9357SAndroid Build Coastguard Worker #ifndef ATTRIB_AES
690*f6dc9357SAndroid Build Coastguard Worker   #define ATTRIB_AES
691*f6dc9357SAndroid Build Coastguard Worker #endif
692*f6dc9357SAndroid Build Coastguard Worker 
// Include the NEON intrinsics header.
// Z7_MSC_VER_ORIGINAL on ARM64 uses <arm64_neon.h>; everything else uses
// <arm_neon.h>, with two clang-specific adjustments made before the include:
//   - clang < 16 without __ARM_FEATURE_AES / __ARM_FEATURE_CRYPTO: both macros
//     are defined here and undefined again after the include
//     (tracked through Z7_ARM_FEATURE_CRYPTO_WAS_SET);
//   - clang with __ARM_ARCH < 8: __ARM_ARCH is redefined to 8; no code in this
//     section restores the previous value afterwards.
// The table in the comment below lists, per clang version, the conditions
// under which the AES intrinsics are available.
// NOTE(review): the Z7_DIAGNOSTIC_IGNORE_*_RESERVED_MACRO_IDENTIFIER macros
// are defined elsewhere; presumably they silence the warning about defining
// reserved identifiers - confirm.
693*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
694*f6dc9357SAndroid Build Coastguard Worker #include <arm64_neon.h>
695*f6dc9357SAndroid Build Coastguard Worker #else
696*f6dc9357SAndroid Build Coastguard Worker /*
697*f6dc9357SAndroid Build Coastguard Worker   clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
698*f6dc9357SAndroid Build Coastguard Worker   clang
699*f6dc9357SAndroid Build Coastguard Worker    3.8.1 : __ARM_NEON             :                    defined(__ARM_FEATURE_CRYPTO)
700*f6dc9357SAndroid Build Coastguard Worker    7.0.1 : __ARM_NEON             : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
701*f6dc9357SAndroid Build Coastguard Worker   11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
702*f6dc9357SAndroid Build Coastguard Worker   13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
703*f6dc9357SAndroid Build Coastguard Worker   16     : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8
704*f6dc9357SAndroid Build Coastguard Worker */
705*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && __clang_major__ < 16
706*f6dc9357SAndroid Build Coastguard Worker #if !defined(__ARM_FEATURE_AES) && \
707*f6dc9357SAndroid Build Coastguard Worker     !defined(__ARM_FEATURE_CRYPTO)
708*f6dc9357SAndroid Build Coastguard Worker //     #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
709*f6dc9357SAndroid Build Coastguard Worker     Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
710*f6dc9357SAndroid Build Coastguard Worker     #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
711*f6dc9357SAndroid Build Coastguard Worker // #if defined(__clang__) && __clang_major__ < 13
712*f6dc9357SAndroid Build Coastguard Worker     #define __ARM_FEATURE_CRYPTO 1
713*f6dc9357SAndroid Build Coastguard Worker // #else
714*f6dc9357SAndroid Build Coastguard Worker     #define __ARM_FEATURE_AES 1
715*f6dc9357SAndroid Build Coastguard Worker // #endif
716*f6dc9357SAndroid Build Coastguard Worker     Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
717*f6dc9357SAndroid Build Coastguard Worker #endif
718*f6dc9357SAndroid Build Coastguard Worker #endif // clang
719*f6dc9357SAndroid Build Coastguard Worker 
720*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__)
721*f6dc9357SAndroid Build Coastguard Worker 
722*f6dc9357SAndroid Build Coastguard Worker #if defined(__ARM_ARCH) && __ARM_ARCH < 8
723*f6dc9357SAndroid Build Coastguard Worker     Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
724*f6dc9357SAndroid Build Coastguard Worker //    #pragma message("#define __ARM_ARCH 8")
725*f6dc9357SAndroid Build Coastguard Worker     #undef  __ARM_ARCH
726*f6dc9357SAndroid Build Coastguard Worker     #define __ARM_ARCH 8
727*f6dc9357SAndroid Build Coastguard Worker     Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
728*f6dc9357SAndroid Build Coastguard Worker #endif
729*f6dc9357SAndroid Build Coastguard Worker 
730*f6dc9357SAndroid Build Coastguard Worker #endif // clang
731*f6dc9357SAndroid Build Coastguard Worker 
732*f6dc9357SAndroid Build Coastguard Worker #include <arm_neon.h>
733*f6dc9357SAndroid Build Coastguard Worker 
// Undo the temporary feature-macro definitions made before the include.
734*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
735*f6dc9357SAndroid Build Coastguard Worker     defined(__ARM_FEATURE_CRYPTO) && \
736*f6dc9357SAndroid Build Coastguard Worker     defined(__ARM_FEATURE_AES)
737*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
738*f6dc9357SAndroid Build Coastguard Worker     #undef __ARM_FEATURE_CRYPTO
739*f6dc9357SAndroid Build Coastguard Worker     #undef __ARM_FEATURE_AES
740*f6dc9357SAndroid Build Coastguard Worker     #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
741*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
742*f6dc9357SAndroid Build Coastguard Worker //    #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
743*f6dc9357SAndroid Build Coastguard Worker #endif
744*f6dc9357SAndroid Build Coastguard Worker 
745*f6dc9357SAndroid Build Coastguard Worker #endif // Z7_MSC_VER_ORIGINAL
746*f6dc9357SAndroid Build Coastguard Worker 
// v128: the 128-bit NEON vector type used for blocks and round keys below.
747*f6dc9357SAndroid Build Coastguard Worker typedef uint8x16_t v128;
748*f6dc9357SAndroid Build Coastguard Worker 
// Common header for the ARM AES entry points; the bodies refer to the
// parameters by these names (ivAes, data8, numBlocks).
749*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START(name) \
750*f6dc9357SAndroid Build Coastguard Worker     void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
751*f6dc9357SAndroid Build Coastguard Worker     // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)
752*f6dc9357SAndroid Build Coastguard Worker 
// AES_FUNC_START2(name): a prototype for `name`, then ATTRIB_AES and the
// header of its definition.
753*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START2(name) \
754*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name); \
755*f6dc9357SAndroid Build Coastguard Worker ATTRIB_AES \
756*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name)
757*f6dc9357SAndroid Build Coastguard Worker 
// Statement helpers. Each expansion already ends with ';', so call sites do
// not add one. The *_m forms operate on a local variable named `m`.
758*f6dc9357SAndroid Build Coastguard Worker #define MM_OP(op, dest, src)  dest = op(dest, src);
759*f6dc9357SAndroid Build Coastguard Worker #define MM_OP_m(op, src)      MM_OP(op, m, src)
760*f6dc9357SAndroid Build Coastguard Worker #define MM_OP1_m(op)          m = op(m);
761*f6dc9357SAndroid Build Coastguard Worker 
762*f6dc9357SAndroid Build Coastguard Worker #define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src)
763*f6dc9357SAndroid Build Coastguard Worker #define MM_XOR_m( src)        MM_XOR(m, src)
764*f6dc9357SAndroid Build Coastguard Worker 
// AES_E_m(k):    m = vaeseq_u8(m, k).
// AES_E_MC_m(k): the same, followed by m = vaesmcq_u8(m).
765*f6dc9357SAndroid Build Coastguard Worker #define AES_E_m(k)     MM_OP_m (vaeseq_u8, k)
766*f6dc9357SAndroid Build Coastguard Worker #define AES_E_MC_m(k)  AES_E_m (k)  MM_OP1_m(vaesmcq_u8)
767*f6dc9357SAndroid Build Coastguard Worker 
768*f6dc9357SAndroid Build Coastguard Worker 
// AesCbc_Encode_HW (ARM): AES-CBC encryption of numBlocks 16-byte blocks,
// in place in data8, one block at a time.
// Layout read through ivAes, in 16-byte units: [0] is the chaining value (IV),
// [1] begins with the UInt32 numRounds2, and the round keys start at [2].
// The final chaining value is written back to ivAes[0].
// Returns immediately when numBlocks is 0.
769*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Encode_HW)
770*f6dc9357SAndroid Build Coastguard Worker {
  // The do/while loop below runs at least once, so the empty case is rejected here.
771*f6dc9357SAndroid Build Coastguard Worker   if (numBlocks == 0)
772*f6dc9357SAndroid Build Coastguard Worker     return;
773*f6dc9357SAndroid Build Coastguard Worker   {
774*f6dc9357SAndroid Build Coastguard Worker   v128 * const p = (v128 *)(void *)ivAes;
775*f6dc9357SAndroid Build Coastguard Worker   v128 *data = (v128 *)(void *)data8;
776*f6dc9357SAndroid Build Coastguard Worker   v128 m = *p;
777*f6dc9357SAndroid Build Coastguard Worker   const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  // w is placed so that w[-2] .. w[2] are the last five round keys.
778*f6dc9357SAndroid Build Coastguard Worker   const v128 *w = p + (size_t)numRounds2 * 2;
  // k0 .. k9 are the first ten round keys. All ten are loaded up front, but
  // k6 .. k9 are used only under the numRounds2 checks inside the loop.
779*f6dc9357SAndroid Build Coastguard Worker   const v128 k0 = p[2];
780*f6dc9357SAndroid Build Coastguard Worker   const v128 k1 = p[3];
781*f6dc9357SAndroid Build Coastguard Worker   const v128 k2 = p[4];
782*f6dc9357SAndroid Build Coastguard Worker   const v128 k3 = p[5];
783*f6dc9357SAndroid Build Coastguard Worker   const v128 k4 = p[6];
784*f6dc9357SAndroid Build Coastguard Worker   const v128 k5 = p[7];
785*f6dc9357SAndroid Build Coastguard Worker   const v128 k6 = p[8];
786*f6dc9357SAndroid Build Coastguard Worker   const v128 k7 = p[9];
787*f6dc9357SAndroid Build Coastguard Worker   const v128 k8 = p[10];
788*f6dc9357SAndroid Build Coastguard Worker   const v128 k9 = p[11];
789*f6dc9357SAndroid Build Coastguard Worker   const v128 k_z4 = w[-2];
790*f6dc9357SAndroid Build Coastguard Worker   const v128 k_z3 = w[-1];
791*f6dc9357SAndroid Build Coastguard Worker   const v128 k_z2 = w[0];
792*f6dc9357SAndroid Build Coastguard Worker   const v128 k_z1 = w[1];
793*f6dc9357SAndroid Build Coastguard Worker   const v128 k_z0 = w[2];
794*f6dc9357SAndroid Build Coastguard Worker   // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle,
795*f6dc9357SAndroid Build Coastguard Worker   // because gcc/clang compilers are not good for that optimization.
796*f6dc9357SAndroid Build Coastguard Worker   do
797*f6dc9357SAndroid Build Coastguard Worker   {
    // CBC: XOR the input block into the running chaining value m.
798*f6dc9357SAndroid Build Coastguard Worker     MM_XOR_m (*data)
799*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k0)
800*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k1)
801*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k2)
802*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k3)
803*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k4)
804*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k5)
    // Extra rounds for longer key schedules: two more when numRounds2 >= 6,
    // and another two when numRounds2 is above 6.
    // NOTE(review): presumably numRounds2 is 5 / 6 / 7 for AES-128 / 192 / 256;
    // larger values are not handled by this unrolled form - confirm.
805*f6dc9357SAndroid Build Coastguard Worker     if (numRounds2 >= 6)
806*f6dc9357SAndroid Build Coastguard Worker     {
807*f6dc9357SAndroid Build Coastguard Worker       AES_E_MC_m (k6)
808*f6dc9357SAndroid Build Coastguard Worker       AES_E_MC_m (k7)
809*f6dc9357SAndroid Build Coastguard Worker       if (numRounds2 != 6)
810*f6dc9357SAndroid Build Coastguard Worker       {
811*f6dc9357SAndroid Build Coastguard Worker         AES_E_MC_m (k8)
812*f6dc9357SAndroid Build Coastguard Worker         AES_E_MC_m (k9)
813*f6dc9357SAndroid Build Coastguard Worker       }
814*f6dc9357SAndroid Build Coastguard Worker     }
    // Tail: three rounds with vaesmcq_u8, one vaeseq_u8 without it, and the
    // last round key applied with a plain XOR.
815*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k_z4)
816*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k_z3)
817*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (k_z2)
818*f6dc9357SAndroid Build Coastguard Worker     AES_E_m    (k_z1)
819*f6dc9357SAndroid Build Coastguard Worker     MM_XOR_m   (k_z0)
    // Store the output block; m stays as the chaining value for the next block.
820*f6dc9357SAndroid Build Coastguard Worker     *data++ = m;
821*f6dc9357SAndroid Build Coastguard Worker   }
822*f6dc9357SAndroid Build Coastguard Worker   while (--numBlocks);
  // Save the chaining value back into the IV slot.
823*f6dc9357SAndroid Build Coastguard Worker   *p = m;
824*f6dc9357SAndroid Build Coastguard Worker   }
825*f6dc9357SAndroid Build Coastguard Worker }
826*f6dc9357SAndroid Build Coastguard Worker 
827*f6dc9357SAndroid Build Coastguard Worker 
// Multi-way helpers for the ARM code.
// WOP_n(op) expands to op(m1, 1) ... op(m<n-1>, n-1); WOP_1 expands to nothing.
828*f6dc9357SAndroid Build Coastguard Worker #define WOP_1(op)
829*f6dc9357SAndroid Build Coastguard Worker #define WOP_2(op)   WOP_1 (op)  op (m1, 1)
830*f6dc9357SAndroid Build Coastguard Worker #define WOP_3(op)   WOP_2 (op)  op (m2, 2)
831*f6dc9357SAndroid Build Coastguard Worker #define WOP_4(op)   WOP_3 (op)  op (m3, 3)
832*f6dc9357SAndroid Build Coastguard Worker #define WOP_5(op)   WOP_4 (op)  op (m4, 4)
833*f6dc9357SAndroid Build Coastguard Worker #define WOP_6(op)   WOP_5 (op)  op (m5, 5)
834*f6dc9357SAndroid Build Coastguard Worker #define WOP_7(op)   WOP_6 (op)  op (m6, 6)
835*f6dc9357SAndroid Build Coastguard Worker #define WOP_8(op)   WOP_7 (op)  op (m7, 7)
836*f6dc9357SAndroid Build Coastguard Worker 
// NUM_WAYS blocks are handled per wide-loop iteration; WOP_M1 covers every
// way except way 0 and has to be the WOP_n that matches NUM_WAYS.
837*f6dc9357SAndroid Build Coastguard Worker   #define NUM_WAYS      8
838*f6dc9357SAndroid Build Coastguard Worker   #define WOP_M1    WOP_8
839*f6dc9357SAndroid Build Coastguard Worker 
// WOP(op): applies op to all ways, (m0, 0) first.
840*f6dc9357SAndroid Build Coastguard Worker #define WOP(op)  op (m0, 0)   WOP_M1(op)
841*f6dc9357SAndroid Build Coastguard Worker 
// Per-way operations of the form op(reg, ii): reg is the way's variable and
// ii its block index relative to `data`.
842*f6dc9357SAndroid Build Coastguard Worker #define DECLARE_VAR(reg, ii)  v128 reg;
843*f6dc9357SAndroid Build Coastguard Worker #define LOAD_data(  reg, ii)  reg = data[ii];
844*f6dc9357SAndroid Build Coastguard Worker #define STORE_data( reg, ii)  data[ii] = reg;
// XOR_data_M1: XORs reg with the block that precedes index ii.
845*f6dc9357SAndroid Build Coastguard Worker #if (NUM_WAYS > 1)
846*f6dc9357SAndroid Build Coastguard Worker #define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1])
847*f6dc9357SAndroid Build Coastguard Worker #endif
848*f6dc9357SAndroid Build Coastguard Worker 
// MM_OP_key uses a variable named `key` that must be in scope at the point of
// use; WOP_KEY below declares it.
849*f6dc9357SAndroid Build Coastguard Worker #define MM_OP_key(op, reg)  MM_OP (op, reg, key)
850*f6dc9357SAndroid Build Coastguard Worker 
// Single-block decrypt helpers operating on the local `m`.
851*f6dc9357SAndroid Build Coastguard Worker #define AES_D_m(k)      MM_OP_m (vaesdq_u8, k)
852*f6dc9357SAndroid Build Coastguard Worker #define AES_D_IMC_m(k)  AES_D_m (k)  MM_OP1_m (vaesimcq_u8)
853*f6dc9357SAndroid Build Coastguard Worker 
// Per-way round helpers; the ii argument is unused by these.
854*f6dc9357SAndroid Build Coastguard Worker #define AES_XOR(   reg, ii)  MM_OP_key (veorq_u8,  reg)
855*f6dc9357SAndroid Build Coastguard Worker #define AES_D(     reg, ii)  MM_OP_key (vaesdq_u8, reg)
856*f6dc9357SAndroid Build Coastguard Worker #define AES_E(     reg, ii)  MM_OP_key (vaeseq_u8, reg)
857*f6dc9357SAndroid Build Coastguard Worker 
858*f6dc9357SAndroid Build Coastguard Worker #define AES_D_IMC( reg, ii)  AES_D (reg, ii)  reg = vaesimcq_u8(reg);
859*f6dc9357SAndroid Build Coastguard Worker #define AES_E_MC(  reg, ii)  AES_E (reg, ii)  reg = vaesmcq_u8(reg);
860*f6dc9357SAndroid Build Coastguard Worker 
// CTR_START: adds `one` to `ctr` with vaddq_u64, then copies the counter into
// reg reinterpreted as bytes. CTR_END: XORs reg into data[ii].
// Both expect `ctr` and `one` (64-bit lane vectors) to be in scope.
861*f6dc9357SAndroid Build Coastguard Worker #define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one)  reg = vreinterpretq_u8_u64(ctr);
862*f6dc9357SAndroid Build Coastguard Worker #define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)
863*f6dc9357SAndroid Build Coastguard Worker 
// WOP_KEY(op, n): loads round key w[n] once into `key` and applies op to all
// ways inside its own brace scope.
864*f6dc9357SAndroid Build Coastguard Worker #define WOP_KEY(op, n) { \
865*f6dc9357SAndroid Build Coastguard Worker     const v128 key = w[n]; \
866*f6dc9357SAndroid Build Coastguard Worker     WOP(op) }
867*f6dc9357SAndroid Build Coastguard Worker 
// WIDE_LOOP_START / WIDE_LOOP_END bracket a body that handles NUM_WAYS blocks
// per iteration. They expect `data`, `dataEnd` and `numBlocks` in scope.
// dataEnd is set to data + numBlocks. When at least NUM_WAYS blocks exist it
// is lowered by NUM_WAYS for the duration of the loop, so `data <= dataEnd`
// means a full group remains, and it is restored afterwards.
868*f6dc9357SAndroid Build Coastguard Worker #define WIDE_LOOP_START  \
869*f6dc9357SAndroid Build Coastguard Worker     dataEnd = data + numBlocks;  \
870*f6dc9357SAndroid Build Coastguard Worker     if (numBlocks >= NUM_WAYS)  \
871*f6dc9357SAndroid Build Coastguard Worker     { dataEnd -= NUM_WAYS; do {  \
872*f6dc9357SAndroid Build Coastguard Worker 
873*f6dc9357SAndroid Build Coastguard Worker #define WIDE_LOOP_END  \
874*f6dc9357SAndroid Build Coastguard Worker     data += NUM_WAYS;  \
875*f6dc9357SAndroid Build Coastguard Worker     } while (data <= dataEnd);  \
876*f6dc9357SAndroid Build Coastguard Worker     dataEnd += NUM_WAYS; }  \
877*f6dc9357SAndroid Build Coastguard Worker 
// SINGLE_LOOP: header of the loop over the blocks left after the wide loop,
// one block per iteration.
878*f6dc9357SAndroid Build Coastguard Worker #define SINGLE_LOOP  \
879*f6dc9357SAndroid Build Coastguard Worker     for (; data < dataEnd; data++)
880*f6dc9357SAndroid Build Coastguard Worker 
881*f6dc9357SAndroid Build Coastguard Worker 
/* AesCbc_Decode_HW (ARM / ARM64 section of this file): AES-CBC decryption of
   the blocks at data8, built from the AES_* / WOP* / LOAD_data / STORE_data
   helper macros defined earlier in this file.

   The signature comes from AES_FUNC_START2 (defined outside this excerpt);
   the body relies on it providing ivAes, data8 and numBlocks.

   Layout read from ivAes, in 16-byte (v128) slots:
     slot 0     - the IV; read on entry and written back on exit;
     slot 1     - starts with a UInt32 whose value times 2 is the slot offset
                  of wStart from ivAes (presumably half the number of rounds;
                  TODO confirm against the key-setup code);
     slot 2 ... - key material; each block uses wStart[2] first and then walks
                  downwards two slots at a time, finishing with slots 3 and 2.

   Groups of NUM_WAYS blocks are handled by the wide loop; whatever is left is
   handled one block at a time by the single loop. */
882*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Decode_HW)
883*f6dc9357SAndroid Build Coastguard Worker {
884*f6dc9357SAndroid Build Coastguard Worker   v128 *p = (v128 *)(void *)ivAes;
885*f6dc9357SAndroid Build Coastguard Worker   v128 *data = (v128 *)(void *)data8;
886*f6dc9357SAndroid Build Coastguard Worker   v128 iv = *p;  // slot 0 of ivAes: current chaining value
887*f6dc9357SAndroid Build Coastguard Worker   const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;  // offset comes from the UInt32 at the start of slot 1
888*f6dc9357SAndroid Build Coastguard Worker   const v128 *dataEnd;
889*f6dc9357SAndroid Build Coastguard Worker   p += 2;  // p now marks slot 2; the key loops below stop when w reaches it
890*f6dc9357SAndroid Build Coastguard Worker 
891*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_START
892*f6dc9357SAndroid Build Coastguard Worker   {
893*f6dc9357SAndroid Build Coastguard Worker     const v128 *w = wStart;
894*f6dc9357SAndroid Build Coastguard Worker     WOP (DECLARE_VAR)  // presumably declares one state variable per way (m0 is used below) - TODO confirm
895*f6dc9357SAndroid Build Coastguard Worker     WOP (LOAD_data)
896*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_D_IMC, 2)  // first key applied is w[2], i.e. wStart[2]
897*f6dc9357SAndroid Build Coastguard Worker     do
898*f6dc9357SAndroid Build Coastguard Worker     {
899*f6dc9357SAndroid Build Coastguard Worker       WOP_KEY (AES_D_IMC, 1)
900*f6dc9357SAndroid Build Coastguard Worker       WOP_KEY (AES_D_IMC, 0)
901*f6dc9357SAndroid Build Coastguard Worker       w -= 2;  // two keys per iteration, walking towards p
902*f6dc9357SAndroid Build Coastguard Worker     }
903*f6dc9357SAndroid Build Coastguard Worker     while (w != p);
904*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_D,   1)  // here w == p, so the last two keys are p[1] and p[0]
905*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_XOR, 0)
906*f6dc9357SAndroid Build Coastguard Worker     MM_XOR (m0, iv)  // the first block of the group is chained with iv
907*f6dc9357SAndroid Build Coastguard Worker     WOP_M1 (XOR_data_M1)  // presumably chains each later block with the preceding input block still in data - TODO confirm
908*f6dc9357SAndroid Build Coastguard Worker     LOAD_data(iv, NUM_WAYS - 1)  // refresh iv before STORE_data runs; presumably from data[NUM_WAYS - 1], which the store then overwrites
909*f6dc9357SAndroid Build Coastguard Worker     WOP (STORE_data)
910*f6dc9357SAndroid Build Coastguard Worker   }
911*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_END
912*f6dc9357SAndroid Build Coastguard Worker 
913*f6dc9357SAndroid Build Coastguard Worker   SINGLE_LOOP
914*f6dc9357SAndroid Build Coastguard Worker   {
915*f6dc9357SAndroid Build Coastguard Worker     const v128 *w = wStart;
916*f6dc9357SAndroid Build Coastguard Worker     v128 m;  LOAD_data(m, 0)
917*f6dc9357SAndroid Build Coastguard Worker     AES_D_IMC_m (w[2])  // same key order as the wide loop, applied to the single state m
918*f6dc9357SAndroid Build Coastguard Worker     do
919*f6dc9357SAndroid Build Coastguard Worker     {
920*f6dc9357SAndroid Build Coastguard Worker       AES_D_IMC_m (w[1])
921*f6dc9357SAndroid Build Coastguard Worker       AES_D_IMC_m (w[0])
922*f6dc9357SAndroid Build Coastguard Worker       w -= 2;
923*f6dc9357SAndroid Build Coastguard Worker     }
924*f6dc9357SAndroid Build Coastguard Worker     while (w != p);
925*f6dc9357SAndroid Build Coastguard Worker     AES_D_m  (w[1])
926*f6dc9357SAndroid Build Coastguard Worker     MM_XOR_m (w[0])
927*f6dc9357SAndroid Build Coastguard Worker     MM_XOR_m (iv)
928*f6dc9357SAndroid Build Coastguard Worker     LOAD_data(iv, 0)  // refresh iv before the store below
929*f6dc9357SAndroid Build Coastguard Worker     STORE_data(m, 0)
930*f6dc9357SAndroid Build Coastguard Worker   }
931*f6dc9357SAndroid Build Coastguard Worker 
932*f6dc9357SAndroid Build Coastguard Worker   p[-2] = iv;  // p was advanced by 2, so this writes slot 0 of ivAes
933*f6dc9357SAndroid Build Coastguard Worker }
934*f6dc9357SAndroid Build Coastguard Worker 
935*f6dc9357SAndroid Build Coastguard Worker 
/* AesCtr_Code_HW (ARM / ARM64 section of this file): AES-CTR processing of
   the blocks at data8, built from the AES_* / WOP* / CTR_START / CTR_END
   helper macros defined earlier in this file.

   The signature comes from AES_FUNC_START2 (defined outside this excerpt);
   the body relies on it providing ivAes, data8 and numBlocks.

   Layout read from ivAes, in 16-byte (v128) slots:
     slot 0     - the counter, held in ctr as two 64-bit lanes; read on entry
                  and written back on exit;
     slot 1     - starts with a UInt32 whose value times 2 is the slot offset
                  of wEnd from ivAes (presumably half the number of rounds;
                  TODO confirm against the key-setup code);
     slot 2 ... - key material; each block walks upwards from slot 2 two slots
                  at a time until w == wEnd, then uses wEnd[0], wEnd[1] and
                  wEnd[2].

   How ctr is advanced and how the result is combined with data is inside
   CTR_START / CTR_END, which are not visible in this excerpt. */
936*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCtr_Code_HW)
937*f6dc9357SAndroid Build Coastguard Worker {
938*f6dc9357SAndroid Build Coastguard Worker   v128 *p = (v128 *)(void *)ivAes;
939*f6dc9357SAndroid Build Coastguard Worker   v128 *data = (v128 *)(void *)data8;
940*f6dc9357SAndroid Build Coastguard Worker   uint64x2_t ctr = vreinterpretq_u64_u8(*p);  // slot 0 of ivAes viewed as two 64-bit lanes
941*f6dc9357SAndroid Build Coastguard Worker   const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;  // offset comes from the UInt32 at the start of slot 1
942*f6dc9357SAndroid Build Coastguard Worker   const v128 *dataEnd;
943*f6dc9357SAndroid Build Coastguard Worker // Workaround for a bug in clang (<= 9): vsetq_lane_u64 apparently expands to
944*f6dc9357SAndroid Build Coastguard Worker // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
// and that cast is reported by -Wvector-conversion, so the warning is disabled.
// NOTE(review): there is no diagnostic push/pop around this pragma, so the
// suppression stays active for the remainder of the file.
945*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && (__clang_major__ <= 9)
946*f6dc9357SAndroid Build Coastguard Worker #pragma GCC diagnostic ignored "-Wvector-conversion"
947*f6dc9357SAndroid Build Coastguard Worker #endif
948*f6dc9357SAndroid Build Coastguard Worker   const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);  // lane 0 = 1, lane 1 = 0; not named again in this body, presumably used by CTR_START / CTR_END
949*f6dc9357SAndroid Build Coastguard Worker   p += 2;  // p now marks slot 2, where every key walk below starts
950*f6dc9357SAndroid Build Coastguard Worker 
951*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_START
952*f6dc9357SAndroid Build Coastguard Worker   {
953*f6dc9357SAndroid Build Coastguard Worker     const v128 *w = p;
954*f6dc9357SAndroid Build Coastguard Worker     WOP (DECLARE_VAR)
955*f6dc9357SAndroid Build Coastguard Worker     WOP (CTR_START)
956*f6dc9357SAndroid Build Coastguard Worker     do
957*f6dc9357SAndroid Build Coastguard Worker     {
958*f6dc9357SAndroid Build Coastguard Worker       WOP_KEY (AES_E_MC, 0)
959*f6dc9357SAndroid Build Coastguard Worker       WOP_KEY (AES_E_MC, 1)
960*f6dc9357SAndroid Build Coastguard Worker       w += 2;  // two keys per iteration, walking towards wEnd
961*f6dc9357SAndroid Build Coastguard Worker     }
962*f6dc9357SAndroid Build Coastguard Worker     while (w != wEnd);
963*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_E_MC, 0)  // here w == wEnd; the last three keys are wEnd[0], wEnd[1], wEnd[2]
964*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_E,    1)
965*f6dc9357SAndroid Build Coastguard Worker     WOP_KEY (AES_XOR,  2)
966*f6dc9357SAndroid Build Coastguard Worker     WOP (CTR_END)
967*f6dc9357SAndroid Build Coastguard Worker   }
968*f6dc9357SAndroid Build Coastguard Worker   WIDE_LOOP_END
969*f6dc9357SAndroid Build Coastguard Worker 
970*f6dc9357SAndroid Build Coastguard Worker   SINGLE_LOOP
971*f6dc9357SAndroid Build Coastguard Worker   {
972*f6dc9357SAndroid Build Coastguard Worker     const v128 *w = p;
973*f6dc9357SAndroid Build Coastguard Worker     v128 m;
974*f6dc9357SAndroid Build Coastguard Worker     CTR_START (m, 0)
975*f6dc9357SAndroid Build Coastguard Worker     do
976*f6dc9357SAndroid Build Coastguard Worker     {
977*f6dc9357SAndroid Build Coastguard Worker       AES_E_MC_m (w[0])  // same key order as the wide loop, applied to the single state m
978*f6dc9357SAndroid Build Coastguard Worker       AES_E_MC_m (w[1])
979*f6dc9357SAndroid Build Coastguard Worker       w += 2;
980*f6dc9357SAndroid Build Coastguard Worker     }
981*f6dc9357SAndroid Build Coastguard Worker     while (w != wEnd);
982*f6dc9357SAndroid Build Coastguard Worker     AES_E_MC_m (w[0])
983*f6dc9357SAndroid Build Coastguard Worker     AES_E_m    (w[1])
984*f6dc9357SAndroid Build Coastguard Worker     MM_XOR_m   (w[2])
985*f6dc9357SAndroid Build Coastguard Worker     CTR_END (m, 0)
986*f6dc9357SAndroid Build Coastguard Worker   }
987*f6dc9357SAndroid Build Coastguard Worker 
988*f6dc9357SAndroid Build Coastguard Worker   p[-2] = vreinterpretq_u8_u64(ctr);  // p was advanced by 2, so this writes the counter back to slot 0 of ivAes
989*f6dc9357SAndroid Build Coastguard Worker }
990*f6dc9357SAndroid Build Coastguard Worker 
991*f6dc9357SAndroid Build Coastguard Worker #endif // USE_HW_AES
992*f6dc9357SAndroid Build Coastguard Worker 
993*f6dc9357SAndroid Build Coastguard Worker #endif // MY_CPU_ARM_OR_ARM64
994*f6dc9357SAndroid Build Coastguard Worker 
/* Remove the helper macros listed below so that they are no longer defined
   past this point.
   NOTE(review): WIDE_LOOP_START, WIDE_LOOP_END and SINGLE_LOOP, defined above,
   are not in this list - confirm whether that is intentional. */
995*f6dc9357SAndroid Build Coastguard Worker #undef NUM_WAYS
996*f6dc9357SAndroid Build Coastguard Worker #undef WOP_M1
997*f6dc9357SAndroid Build Coastguard Worker #undef WOP
998*f6dc9357SAndroid Build Coastguard Worker #undef DECLARE_VAR
999*f6dc9357SAndroid Build Coastguard Worker #undef LOAD_data
1000*f6dc9357SAndroid Build Coastguard Worker #undef STORE_data
1001*f6dc9357SAndroid Build Coastguard Worker #undef USE_INTEL_AES
1002*f6dc9357SAndroid Build Coastguard Worker #undef USE_HW_AES
1003