1*f6dc9357SAndroid Build Coastguard Worker /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2*f6dc9357SAndroid Build Coastguard Worker Igor Pavlov : Public domain */
3*f6dc9357SAndroid Build Coastguard Worker
4*f6dc9357SAndroid Build Coastguard Worker #include "Precomp.h"
5*f6dc9357SAndroid Build Coastguard Worker
6*f6dc9357SAndroid Build Coastguard Worker #include "Aes.h"
7*f6dc9357SAndroid Build Coastguard Worker #include "CpuArch.h"
8*f6dc9357SAndroid Build Coastguard Worker
9*f6dc9357SAndroid Build Coastguard Worker #ifdef MY_CPU_X86_OR_AMD64
10*f6dc9357SAndroid Build Coastguard Worker
/*
  Compiler capability detection for the hardware AES code paths.

  Outputs of this section:
    USE_INTEL_AES       - the compiler can build the 128-bit AES intrinsics code.
    USE_INTEL_VAES      - the compiler can also build the 256-bit VAES code.
    ATTRIB_AES          - per-function target attribute (GCC/Clang only), defined
                          when AES is not already enabled globally (__AES__).
    ATTRIB_VAES         - same idea for the aes+vaes+avx+avx2 feature set.
    Z7_USE_AES_HW_STUB  - defined only in the MSVC branch when the compiler is
    Z7_USE_VAES_HW_STUB   too old for the corresponding intrinsics.
*/
#if defined(__INTEL_COMPILER)
  #if (__INTEL_COMPILER >= 1110)
    #define USE_INTEL_AES
    #if (__INTEL_COMPILER >= 1900)
      #define USE_INTEL_VAES
    #endif
  #endif
#elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
    #define USE_INTEL_AES
    #if !defined(__AES__)
      #define ATTRIB_AES __attribute__((__target__("aes")))
    #endif
  #if defined(__clang__) && (__clang_major__ >= 8) \
      || defined(__GNUC__) && (__GNUC__ >= 8)
    #define USE_INTEL_VAES
    #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
      #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
    #endif
  #endif
#elif defined(_MSC_VER)
  #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
    #define USE_INTEL_AES
    #if (_MSC_VER >= 1910)
      #define USE_INTEL_VAES
    #endif
  #endif
  /* The stub macros are only produced for MSVC builds. */
  #ifndef USE_INTEL_AES
    #define Z7_USE_AES_HW_STUB
  #endif
  #ifndef USE_INTEL_VAES
    #define Z7_USE_VAES_HW_STUB
  #endif
#endif

/* Debug switches: uncomment to force the stubs for any other compiler. */
#ifndef USE_INTEL_AES
  // #define Z7_USE_AES_HW_STUB    // for debug
#endif
#ifndef USE_INTEL_VAES
  // #define Z7_USE_VAES_HW_STUB   // for debug
#endif
52*f6dc9357SAndroid Build Coastguard Worker
53*f6dc9357SAndroid Build Coastguard Worker
54*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_INTEL_AES
55*f6dc9357SAndroid Build Coastguard Worker
56*f6dc9357SAndroid Build Coastguard Worker #include <wmmintrin.h>
57*f6dc9357SAndroid Build Coastguard Worker
/* Parameter element types, defined only when the VAES stub is requested and
   the VAES code itself is not built. */
#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

/* No per-function target attribute is needed unless one was chosen earlier. */
#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif

/* Common signature of every AES block function in this file. */
#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

/* Emits a prototype first, then the (optionally target-attributed) header of
   the definition; the function body must follow the macro invocation. */
#define AES_FUNC_START2(name) \
  AES_FUNC_START (name); \
  ATTRIB_AES \
  AES_FUNC_START (name)

/* dest = op(dest, src);  NOTE: the expansion already ends with ';',
   so these macros are written without a trailing semicolon at use sites. */
#define MM_OP(op, dest, src)  dest = op(dest, src);
/* Same, applied to the local variable that must be named `m`. */
#define MM_OP_m(op, src)      MM_OP(op, m, src)

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128, dest, src)

#if 1
// use aligned SSE load/store for data.
// It is required for our Aes functions, that data is aligned for 16-bytes.
// So we can use this branch of code.
// and compiler can use fused load-op SSE instructions:
//   xorps xmm0, XMMWORD PTR [rdx]
#define LOAD_128(pp)        (*(__m128i *)(void *)(pp))
#define STORE_128(pp, _v)    *(__m128i *)(void *)(pp) = _v
// Alternative to the direct access above: explicit aligned load/store intrinsics
// #define LOAD_128(pp)        _mm_load_si128(pp)
// #define STORE_128(pp, _v)   _mm_store_si128(pp, _v)
#else
// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
#define LOAD_128(pp)        _mm_loadu_si128(pp)
#define STORE_128(pp, _v)   _mm_storeu_si128(pp, _v)
#endif
99*f6dc9357SAndroid Build Coastguard Worker
AES_FUNC_START2(AesCbc_Encode_HW)100*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Encode_HW)
101*f6dc9357SAndroid Build Coastguard Worker {
102*f6dc9357SAndroid Build Coastguard Worker if (numBlocks == 0)
103*f6dc9357SAndroid Build Coastguard Worker return;
104*f6dc9357SAndroid Build Coastguard Worker {
105*f6dc9357SAndroid Build Coastguard Worker __m128i *p = (__m128i *)(void *)ivAes;
106*f6dc9357SAndroid Build Coastguard Worker __m128i *data = (__m128i *)(void *)data8;
107*f6dc9357SAndroid Build Coastguard Worker __m128i m = *p;
108*f6dc9357SAndroid Build Coastguard Worker const __m128i k0 = p[2];
109*f6dc9357SAndroid Build Coastguard Worker const __m128i k1 = p[3];
110*f6dc9357SAndroid Build Coastguard Worker const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
111*f6dc9357SAndroid Build Coastguard Worker do
112*f6dc9357SAndroid Build Coastguard Worker {
113*f6dc9357SAndroid Build Coastguard Worker UInt32 r = numRounds2;
114*f6dc9357SAndroid Build Coastguard Worker const __m128i *w = p + 4;
115*f6dc9357SAndroid Build Coastguard Worker __m128i temp = LOAD_128(data);
116*f6dc9357SAndroid Build Coastguard Worker MM_XOR (temp, k0)
117*f6dc9357SAndroid Build Coastguard Worker MM_XOR (m, temp)
118*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, k1)
119*f6dc9357SAndroid Build Coastguard Worker do
120*f6dc9357SAndroid Build Coastguard Worker {
121*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[0])
122*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[1])
123*f6dc9357SAndroid Build Coastguard Worker w += 2;
124*f6dc9357SAndroid Build Coastguard Worker }
125*f6dc9357SAndroid Build Coastguard Worker while (--r);
126*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenclast_si128, w[0])
127*f6dc9357SAndroid Build Coastguard Worker STORE_128(data, m);
128*f6dc9357SAndroid Build Coastguard Worker data++;
129*f6dc9357SAndroid Build Coastguard Worker }
130*f6dc9357SAndroid Build Coastguard Worker while (--numBlocks);
131*f6dc9357SAndroid Build Coastguard Worker *p = m;
132*f6dc9357SAndroid Build Coastguard Worker }
133*f6dc9357SAndroid Build Coastguard Worker }
134*f6dc9357SAndroid Build Coastguard Worker
135*f6dc9357SAndroid Build Coastguard Worker
/*
  Lane unrolling helpers.
  WOP_n(op) expands to  op(m1, 1) op(m2, 2) ... op(m<n-1>, n-1)
  (WOP_1 is empty).  Lane 0 is added by WOP() below.
*/
#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

/* Number of blocks handled per wide-loop iteration: 8 on AMD64, else 4. */
#ifdef MY_CPU_AMD64
  #define NUM_WAYS      8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS      4
  #define WOP_M1    WOP_4
#endif

/* WOP(op): apply op to every lane 0 .. NUM_WAYS-1.
   WOP_M1(op): apply op to lanes 1 .. NUM_WAYS-1 only. */
#define WOP(op)  op (m0, 0)  WOP_M1(op)

/* Per-lane operations; `data` is the caller's current block pointer. */
#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data_ii(ii)      LOAD_128(data + (ii))
#define LOAD_data(  reg, ii)  reg = LOAD_data_ii(ii);
#define STORE_data( reg, ii)  STORE_128(data + (ii), reg);
#if (NUM_WAYS > 1)
/* reg ^= data[ii - 1] : XOR with the block preceding lane ii */
#define XOR_data_M1(reg, ii)  MM_XOR (reg, LOAD_128(data + (ii- 1)))
#endif

/* reg = op(reg, key); `key` is the local declared by WOP_KEY. */
#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)   MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)   MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)   MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)   MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)   MM_OP_key (_mm_xor_si128,        reg)

/* CTR_START: advance the caller's `ctr` by `one`, then copy it into reg.
   CTR_END:   data[ii] ^= reg. */
#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  STORE_128(data + (ii), _mm_xor_si128(reg, \
                            LOAD_128 (data + (ii))));
/* Load round key w[n] once and apply op to every lane with it. */
#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op) }

/*
  WIDE_LOOP_START ... WIDE_LOOP_END wrap a body that consumes NUM_WAYS blocks
  per iteration; the body runs only while at least NUM_WAYS blocks remain.
  They use the caller's `data`, `dataEnd` and `numBlocks`.  On exit dataEnd
  again points one past the last block, so SINGLE_LOOP can finish the tail.
*/
#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \

/* Processes the remaining blocks one at a time. */
#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)
201*f6dc9357SAndroid Build Coastguard Worker
202*f6dc9357SAndroid Build Coastguard Worker
#ifdef USE_INTEL_VAES

/* 256-bit counterparts of the per-lane helpers: each lane register holds
   two 16-byte blocks. */
#define AVX_XOR(dest, src)  MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;

#if 1
// use unaligned AVX load/store for data.
// It is required for our Aes functions, that data is aligned for 16-bytes.
// But we need 32-bytes reading.
// So we use intrinsics for unaligned AVX load/store.
// notes for _mm256_storeu_si256:
// msvc2022: uses vmovdqu and keeps the order of instruction sequence.
// new gcc11 uses vmovdqu
// old gcc9 could use pair of instructions:
//   vmovups        %xmm7, -224(%rax)
//   vextracti128   $0x1, %ymm7, -208(%rax)
#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v);
#else
// use aligned AVX load/store for data.
// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes.
// msvc2022 uses vmovdqu still
// gcc uses vmovdqa (that requires 32-bytes alignment)
#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v;
#endif

/* ii indexes 32-byte units starting at the caller's `data`. */
#define AVX_LOAD_data(  reg, ii)  reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
#define AVX_STORE_data( reg, ii)  AVX_STORE((__m256i *)(void *)data + (ii), reg)
/*
AVX_XOR_data_M1() needs unaligned memory load, even if (data)
is aligned for 256-bits, because we read 32-bytes chunk that
crosses (data) position: from (data - 16bytes) to (data + 16bytes).
*/
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))

#define AVX_AES_DEC(      reg, ii)   MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)   MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)   MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)   MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)   MM_OP_key (_mm256_xor_si256,         reg)
/* Advance the caller's `ctr2` by `two`, then start the lane as ctr2 ^ key. */
#define AVX_CTR_START(reg, ii)  \
    MM_OP (_mm256_add_epi64, ctr2, two)  \
    reg = _mm256_xor_si256(ctr2, key);

/* 32-byte unit ii of data ^= reg */
#define AVX_CTR_END(reg, ii)  \
    AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg,  \
    AVX_LOAD ((__m256i *)(void *)data + (ii))));

/* Load 256-bit round key w[n] once and apply op to every lane with it. */
#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op) }

/* Capacity of the local keys[] array declared by WIDE_LOOP_START_AVX. */
#define NUM_AES_KEYS_MAX 15

/*
  WIDE_LOOP_START_AVX(OP) ... WIDE_LOOP_END_AVX(OP) wrap a body that consumes
  NUM_WAYS * 2 blocks per iteration.  Before the loop, each of the caller's
  `numRounds` 128-bit keys p[ii] is broadcast into both halves of keys[ii].
  OP is caller-supplied code inserted before the loop (START) and after it
  (END).  _mm256_zeroupper() is issued once the wide loop has finished.
  Uses the caller's `data`, `dataEnd`, `numBlocks`, `numRounds` and `p`.
*/
#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX];  \
    OP  \
    { UInt32 ii; for (ii = 0; ii < numRounds; ii++)  \
      keys[ii] = _mm256_broadcastsi128_si256(p[ii]); }  \
    dataEnd -= NUM_WAYS * 2;  \
    do {  \

#define WIDE_LOOP_END_AVX(OP)  \
    data += NUM_WAYS * 2;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS * 2;  \
    OP  \
    _mm256_zeroupper();  \
    }  \

/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC still can insert vzeroupper instruction. */

#endif
280*f6dc9357SAndroid Build Coastguard Worker
281*f6dc9357SAndroid Build Coastguard Worker
282*f6dc9357SAndroid Build Coastguard Worker
AES_FUNC_START2(AesCbc_Decode_HW)283*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Decode_HW)
284*f6dc9357SAndroid Build Coastguard Worker {
285*f6dc9357SAndroid Build Coastguard Worker __m128i *p = (__m128i *)(void *)ivAes;
286*f6dc9357SAndroid Build Coastguard Worker __m128i *data = (__m128i *)(void *)data8;
287*f6dc9357SAndroid Build Coastguard Worker __m128i iv = *p;
288*f6dc9357SAndroid Build Coastguard Worker const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
289*f6dc9357SAndroid Build Coastguard Worker const __m128i *dataEnd;
290*f6dc9357SAndroid Build Coastguard Worker p += 2;
291*f6dc9357SAndroid Build Coastguard Worker
292*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_START
293*f6dc9357SAndroid Build Coastguard Worker {
294*f6dc9357SAndroid Build Coastguard Worker const __m128i *w = wStart;
295*f6dc9357SAndroid Build Coastguard Worker WOP (DECLARE_VAR)
296*f6dc9357SAndroid Build Coastguard Worker WOP (LOAD_data)
297*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_XOR, 1)
298*f6dc9357SAndroid Build Coastguard Worker do
299*f6dc9357SAndroid Build Coastguard Worker {
300*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_DEC, 0)
301*f6dc9357SAndroid Build Coastguard Worker
302*f6dc9357SAndroid Build Coastguard Worker w--;
303*f6dc9357SAndroid Build Coastguard Worker }
304*f6dc9357SAndroid Build Coastguard Worker while (w != p);
305*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_DEC_LAST, 0)
306*f6dc9357SAndroid Build Coastguard Worker
307*f6dc9357SAndroid Build Coastguard Worker MM_XOR (m0, iv)
308*f6dc9357SAndroid Build Coastguard Worker WOP_M1 (XOR_data_M1)
309*f6dc9357SAndroid Build Coastguard Worker LOAD_data(iv, NUM_WAYS - 1)
310*f6dc9357SAndroid Build Coastguard Worker WOP (STORE_data)
311*f6dc9357SAndroid Build Coastguard Worker }
312*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_END
313*f6dc9357SAndroid Build Coastguard Worker
314*f6dc9357SAndroid Build Coastguard Worker SINGLE_LOOP
315*f6dc9357SAndroid Build Coastguard Worker {
316*f6dc9357SAndroid Build Coastguard Worker const __m128i *w = wStart - 1;
317*f6dc9357SAndroid Build Coastguard Worker __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
318*f6dc9357SAndroid Build Coastguard Worker
319*f6dc9357SAndroid Build Coastguard Worker do
320*f6dc9357SAndroid Build Coastguard Worker {
321*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdec_si128, w[1])
322*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdec_si128, w[0])
323*f6dc9357SAndroid Build Coastguard Worker w -= 2;
324*f6dc9357SAndroid Build Coastguard Worker }
325*f6dc9357SAndroid Build Coastguard Worker while (w != p);
326*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdec_si128, w[1])
327*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdeclast_si128, w[0])
328*f6dc9357SAndroid Build Coastguard Worker MM_XOR (m, iv)
329*f6dc9357SAndroid Build Coastguard Worker LOAD_data(iv, 0)
330*f6dc9357SAndroid Build Coastguard Worker STORE_data(m, 0)
331*f6dc9357SAndroid Build Coastguard Worker }
332*f6dc9357SAndroid Build Coastguard Worker
333*f6dc9357SAndroid Build Coastguard Worker p[-2] = iv;
334*f6dc9357SAndroid Build Coastguard Worker }
335*f6dc9357SAndroid Build Coastguard Worker
336*f6dc9357SAndroid Build Coastguard Worker
AES_FUNC_START2(AesCtr_Code_HW)337*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCtr_Code_HW)
338*f6dc9357SAndroid Build Coastguard Worker {
339*f6dc9357SAndroid Build Coastguard Worker __m128i *p = (__m128i *)(void *)ivAes;
340*f6dc9357SAndroid Build Coastguard Worker __m128i *data = (__m128i *)(void *)data8;
341*f6dc9357SAndroid Build Coastguard Worker __m128i ctr = *p;
342*f6dc9357SAndroid Build Coastguard Worker const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
343*f6dc9357SAndroid Build Coastguard Worker const __m128i *dataEnd;
344*f6dc9357SAndroid Build Coastguard Worker const __m128i one = _mm_cvtsi32_si128(1);
345*f6dc9357SAndroid Build Coastguard Worker
346*f6dc9357SAndroid Build Coastguard Worker p += 2;
347*f6dc9357SAndroid Build Coastguard Worker
348*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_START
349*f6dc9357SAndroid Build Coastguard Worker {
350*f6dc9357SAndroid Build Coastguard Worker const __m128i *w = p;
351*f6dc9357SAndroid Build Coastguard Worker UInt32 r = numRoundsMinus2;
352*f6dc9357SAndroid Build Coastguard Worker WOP (DECLARE_VAR)
353*f6dc9357SAndroid Build Coastguard Worker WOP (CTR_START)
354*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_XOR, 0)
355*f6dc9357SAndroid Build Coastguard Worker w += 1;
356*f6dc9357SAndroid Build Coastguard Worker do
357*f6dc9357SAndroid Build Coastguard Worker {
358*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_ENC, 0)
359*f6dc9357SAndroid Build Coastguard Worker w += 1;
360*f6dc9357SAndroid Build Coastguard Worker }
361*f6dc9357SAndroid Build Coastguard Worker while (--r);
362*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_ENC_LAST, 0)
363*f6dc9357SAndroid Build Coastguard Worker WOP (CTR_END)
364*f6dc9357SAndroid Build Coastguard Worker }
365*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_END
366*f6dc9357SAndroid Build Coastguard Worker
367*f6dc9357SAndroid Build Coastguard Worker SINGLE_LOOP
368*f6dc9357SAndroid Build Coastguard Worker {
369*f6dc9357SAndroid Build Coastguard Worker UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
370*f6dc9357SAndroid Build Coastguard Worker const __m128i *w = p;
371*f6dc9357SAndroid Build Coastguard Worker __m128i m;
372*f6dc9357SAndroid Build Coastguard Worker MM_OP (_mm_add_epi64, ctr, one)
373*f6dc9357SAndroid Build Coastguard Worker m = _mm_xor_si128 (ctr, p[0]);
374*f6dc9357SAndroid Build Coastguard Worker w += 1;
375*f6dc9357SAndroid Build Coastguard Worker do
376*f6dc9357SAndroid Build Coastguard Worker {
377*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[0])
378*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[1])
379*f6dc9357SAndroid Build Coastguard Worker w += 2;
380*f6dc9357SAndroid Build Coastguard Worker }
381*f6dc9357SAndroid Build Coastguard Worker while (--numRounds2);
382*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[0])
383*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenclast_si128, w[1])
384*f6dc9357SAndroid Build Coastguard Worker CTR_END (m, 0)
385*f6dc9357SAndroid Build Coastguard Worker }
386*f6dc9357SAndroid Build Coastguard Worker
387*f6dc9357SAndroid Build Coastguard Worker p[-2] = ctr;
388*f6dc9357SAndroid Build Coastguard Worker }
389*f6dc9357SAndroid Build Coastguard Worker
390*f6dc9357SAndroid Build Coastguard Worker
391*f6dc9357SAndroid Build Coastguard Worker
392*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_INTEL_VAES
393*f6dc9357SAndroid Build Coastguard Worker
394*f6dc9357SAndroid Build Coastguard Worker /*
395*f6dc9357SAndroid Build Coastguard Worker GCC before 2013-Jun:
396*f6dc9357SAndroid Build Coastguard Worker <immintrin.h>:
397*f6dc9357SAndroid Build Coastguard Worker #ifdef __AVX__
398*f6dc9357SAndroid Build Coastguard Worker #include <avxintrin.h>
399*f6dc9357SAndroid Build Coastguard Worker #endif
400*f6dc9357SAndroid Build Coastguard Worker GCC after 2013-Jun:
401*f6dc9357SAndroid Build Coastguard Worker <immintrin.h>:
402*f6dc9357SAndroid Build Coastguard Worker #include <avxintrin.h>
403*f6dc9357SAndroid Build Coastguard Worker CLANG 3.8+:
404*f6dc9357SAndroid Build Coastguard Worker {
405*f6dc9357SAndroid Build Coastguard Worker <immintrin.h>:
406*f6dc9357SAndroid Build Coastguard Worker #if !defined(_MSC_VER) || defined(__AVX__)
407*f6dc9357SAndroid Build Coastguard Worker #include <avxintrin.h>
408*f6dc9357SAndroid Build Coastguard Worker #endif
409*f6dc9357SAndroid Build Coastguard Worker
410*f6dc9357SAndroid Build Coastguard Worker if (the compiler is clang for Windows and if global arch is not set for __AVX__)
411*f6dc9357SAndroid Build Coastguard Worker [ if (defined(_MSC_VER) && !defined(__AVX__)) ]
412*f6dc9357SAndroid Build Coastguard Worker {
413*f6dc9357SAndroid Build Coastguard Worker <immintrin.h> doesn't include <avxintrin.h>
414*f6dc9357SAndroid Build Coastguard Worker and we have 2 ways to fix it:
415*f6dc9357SAndroid Build Coastguard Worker 1) we can define required __AVX__ before <immintrin.h>
416*f6dc9357SAndroid Build Coastguard Worker or
417*f6dc9357SAndroid Build Coastguard Worker 2) we can include <avxintrin.h> after <immintrin.h>
418*f6dc9357SAndroid Build Coastguard Worker }
419*f6dc9357SAndroid Build Coastguard Worker }
420*f6dc9357SAndroid Build Coastguard Worker
421*f6dc9357SAndroid Build Coastguard Worker If we include <avxintrin.h> manually for GCC/CLANG, it's
422*f6dc9357SAndroid Build Coastguard Worker required that <immintrin.h> must be included before <avxintrin.h>.
423*f6dc9357SAndroid Build Coastguard Worker */
424*f6dc9357SAndroid Build Coastguard Worker
425*f6dc9357SAndroid Build Coastguard Worker /*
426*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && defined(_MSC_VER)
427*f6dc9357SAndroid Build Coastguard Worker #define __AVX__
428*f6dc9357SAndroid Build Coastguard Worker #define __AVX2__
429*f6dc9357SAndroid Build Coastguard Worker #define __VAES__
430*f6dc9357SAndroid Build Coastguard Worker #endif
431*f6dc9357SAndroid Build Coastguard Worker */
432*f6dc9357SAndroid Build Coastguard Worker
433*f6dc9357SAndroid Build Coastguard Worker #include <immintrin.h>
434*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && defined(_MSC_VER)
435*f6dc9357SAndroid Build Coastguard Worker #if !defined(__AVX__)
436*f6dc9357SAndroid Build Coastguard Worker #include <avxintrin.h>
437*f6dc9357SAndroid Build Coastguard Worker #endif
438*f6dc9357SAndroid Build Coastguard Worker #if !defined(__AVX2__)
439*f6dc9357SAndroid Build Coastguard Worker #include <avx2intrin.h>
440*f6dc9357SAndroid Build Coastguard Worker #endif
441*f6dc9357SAndroid Build Coastguard Worker #if !defined(__VAES__)
442*f6dc9357SAndroid Build Coastguard Worker #include <vaesintrin.h>
443*f6dc9357SAndroid Build Coastguard Worker #endif
444*f6dc9357SAndroid Build Coastguard Worker #endif // __clang__ && _MSC_VER
445*f6dc9357SAndroid Build Coastguard Worker
/* No per-function target attribute is needed unless one was chosen earlier. */
#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif

/* Like AES_FUNC_START2, but the definition carries ATTRIB_VAES:
   emits a prototype, then the attributed header of the definition. */
#define VAES_FUNC_START2(name) \
  AES_FUNC_START (name); \
  ATTRIB_VAES \
  AES_FUNC_START (name)
454*f6dc9357SAndroid Build Coastguard Worker
/*
  AesCbc_Decode_HW_256: AES-CBC decryption, wide path on 256-bit vectors.

  The buffer behind ivAes is accessed in 16-byte units:
    [0]   : IV / chaining value (read on entry, written back on exit)
    [1]   : its first UInt32 is a round-count value S (named numRounds2
            elsewhere in this file); 2 * S AES rounds are performed
    [2..] : round keys used by the single-block tail loop
  data8 is decrypted in place.

  NOTE(review): the parameter list comes from the x86 AES_FUNC_START, and the
  helpers WIDE_LOOP_START_AVX / WIDE_LOOP_END_AVX / WOP / AVX_* / SINGLE_LOOP /
  LOAD_data_ii come from macros defined outside this section - confirm the
  details there.
*/
VAES_FUNC_START2(AesCbc_Decode_HW_256)455*f6dc9357SAndroid Build Coastguard Worker VAES_FUNC_START2 (AesCbc_Decode_HW_256)
456*f6dc9357SAndroid Build Coastguard Worker {
457*f6dc9357SAndroid Build Coastguard Worker __m128i *p = (__m128i *)(void *)ivAes;
458*f6dc9357SAndroid Build Coastguard Worker __m128i *data = (__m128i *)(void *)data8;
459*f6dc9357SAndroid Build Coastguard Worker __m128i iv = *p;
460*f6dc9357SAndroid Build Coastguard Worker const __m128i *dataEnd;
// numRounds = 2 * S + 1, where S is the UInt32 stored right after the IV.
461*f6dc9357SAndroid Build Coastguard Worker const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
// From here on p points at the first round key; p[-2] is the IV slot.
462*f6dc9357SAndroid Build Coastguard Worker p += 2;
463*f6dc9357SAndroid Build Coastguard Worker
// Wide path. `keys` is not declared in this body; presumably the
// WIDE_LOOP_START_AVX macro provides it as 256-bit round keys - TODO confirm.
464*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_START_AVX(;)
465*f6dc9357SAndroid Build Coastguard Worker {
// Start near the end of the key array: w[1] is keys[numRounds - 1].
466*f6dc9357SAndroid Build Coastguard Worker const __m256i *w = keys + numRounds - 2;
467*f6dc9357SAndroid Build Coastguard Worker
468*f6dc9357SAndroid Build Coastguard Worker WOP (AVX_DECLARE_VAR)
469*f6dc9357SAndroid Build Coastguard Worker WOP (AVX_LOAD_data)
// Initial key XOR with w[1] (the second macro argument looks like the index into w).
470*f6dc9357SAndroid Build Coastguard Worker AVX_WOP_KEY (AVX_AES_XOR, 1)
471*f6dc9357SAndroid Build Coastguard Worker
// Middle rounds: walk w downwards until it reaches keys (numRounds - 2 iterations).
472*f6dc9357SAndroid Build Coastguard Worker do
473*f6dc9357SAndroid Build Coastguard Worker {
474*f6dc9357SAndroid Build Coastguard Worker AVX_WOP_KEY (AVX_AES_DEC, 0)
475*f6dc9357SAndroid Build Coastguard Worker w--;
476*f6dc9357SAndroid Build Coastguard Worker }
477*f6dc9357SAndroid Build Coastguard Worker while (w != keys);
// Final round uses keys[0].
478*f6dc9357SAndroid Build Coastguard Worker AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
479*f6dc9357SAndroid Build Coastguard Worker
// CBC chaining: the first vector is XORed with { iv, first data block };
// the remaining vectors are handled by AVX_XOR_data_M1 (presumably XOR with
// the preceding ciphertext blocks - defined elsewhere).
480*f6dc9357SAndroid Build Coastguard Worker AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
481*f6dc9357SAndroid Build Coastguard Worker WOP_M1 (AVX_XOR_data_M1)
// The last ciphertext block of this batch becomes the next iv; it is read
// before the plaintext is stored over it.
482*f6dc9357SAndroid Build Coastguard Worker LOAD_data (iv, NUM_WAYS * 2 - 1)
483*f6dc9357SAndroid Build Coastguard Worker WOP (AVX_STORE_data)
484*f6dc9357SAndroid Build Coastguard Worker }
485*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_END_AVX(;)
486*f6dc9357SAndroid Build Coastguard Worker
// Tail: one 128-bit block per iteration using the 128-bit round keys at p.
487*f6dc9357SAndroid Build Coastguard Worker SINGLE_LOOP
488*f6dc9357SAndroid Build Coastguard Worker {
// p was advanced by 2, so (p + 1 - 2) is the UInt32 S after the IV;
// w = p + 2 * S - 2, hence w[2] is the last round key.
489*f6dc9357SAndroid Build Coastguard Worker const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
490*f6dc9357SAndroid Build Coastguard Worker __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
// Two rounds per iteration, moving towards p.
491*f6dc9357SAndroid Build Coastguard Worker do
492*f6dc9357SAndroid Build Coastguard Worker {
493*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdec_si128, w[1])
494*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdec_si128, w[0])
495*f6dc9357SAndroid Build Coastguard Worker w -= 2;
496*f6dc9357SAndroid Build Coastguard Worker }
497*f6dc9357SAndroid Build Coastguard Worker while (w != p);
// Last two rounds with p[1] and p[0].
498*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdec_si128, w[1])
499*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesdeclast_si128, w[0])
500*f6dc9357SAndroid Build Coastguard Worker
// CBC: XOR with the previous chaining value, then keep the current
// ciphertext as the next iv before it is overwritten with the plaintext.
501*f6dc9357SAndroid Build Coastguard Worker MM_XOR (m, iv)
502*f6dc9357SAndroid Build Coastguard Worker LOAD_data(iv, 0)
503*f6dc9357SAndroid Build Coastguard Worker STORE_data(m, 0)
504*f6dc9357SAndroid Build Coastguard Worker }
505*f6dc9357SAndroid Build Coastguard Worker
// Write the updated chaining value back into the IV slot of the context.
506*f6dc9357SAndroid Build Coastguard Worker p[-2] = iv;
507*f6dc9357SAndroid Build Coastguard Worker }
508*f6dc9357SAndroid Build Coastguard Worker
509*f6dc9357SAndroid Build Coastguard Worker
510*f6dc9357SAndroid Build Coastguard Worker /*
511*f6dc9357SAndroid Build Coastguard Worker SSE2: _mm_cvtsi32_si128 : movd
512*f6dc9357SAndroid Build Coastguard Worker AVX: _mm256_setr_m128i : vinsertf128
513*f6dc9357SAndroid Build Coastguard Worker AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm
514*f6dc9357SAndroid Build Coastguard Worker _mm256_extracti128_si256 : vextracti128
515*f6dc9357SAndroid Build Coastguard Worker _mm256_broadcastsi128_si256 : vbroadcasti128
516*f6dc9357SAndroid Build Coastguard Worker */
517*f6dc9357SAndroid Build Coastguard Worker
// AVX_CTR_LOOP_START: setup statements passed to WIDE_LOOP_START_AVX by the
// CTR function below. It builds
//   ctr2 = { ctr - one, ctr }   (two 128-bit halves)
//   two  = { one, one } + { one, one }, i.e. the value 2 in each 128-bit half.
// It expects the caller to have `ctr`, `one`, `ctr2` and `two` in scope.
518*f6dc9357SAndroid Build Coastguard Worker #define AVX_CTR_LOOP_START \
519*f6dc9357SAndroid Build Coastguard Worker ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
520*f6dc9357SAndroid Build Coastguard Worker two = _mm256_setr_m128i(one, one); \
521*f6dc9357SAndroid Build Coastguard Worker two = _mm256_add_epi64(two, two); \
522*f6dc9357SAndroid Build Coastguard Worker
523*f6dc9357SAndroid Build Coastguard Worker // two = _mm256_setr_epi64x(2, 0, 2, 0);
524*f6dc9357SAndroid Build Coastguard Worker
// AVX_CTR_LOOP_ENC: statements passed to WIDE_LOOP_END_AVX; copies the upper
// 128-bit half of ctr2 back into the 128-bit `ctr`.
525*f6dc9357SAndroid Build Coastguard Worker #define AVX_CTR_LOOP_ENC \
526*f6dc9357SAndroid Build Coastguard Worker ctr = _mm256_extracti128_si256 (ctr2, 1); \
527*f6dc9357SAndroid Build Coastguard Worker
/*
  AesCtr_Code_HW_256: AES-CTR processing, wide path on 256-bit vectors.

  The buffer behind ivAes is accessed in 16-byte units:
    [0]   : counter block (read on entry, written back on exit)
    [1]   : its first UInt32 is a round-count value S; 2 * S AES rounds are performed
    [2..] : round keys used by the single-block tail loop
  The counter is advanced with a 64-bit lane add of `one` (value 1 in the low
  32 bits, zero elsewhere), so no carry goes into the upper 64 bits.

  NOTE(review): the parameter list comes from the x86 AES_FUNC_START, and
  WIDE_LOOP_START_AVX / WIDE_LOOP_END_AVX / WOP / AVX_* / SINGLE_LOOP / CTR_END
  are macros defined outside this section - confirm the details there.
*/
VAES_FUNC_START2(AesCtr_Code_HW_256)528*f6dc9357SAndroid Build Coastguard Worker VAES_FUNC_START2 (AesCtr_Code_HW_256)
529*f6dc9357SAndroid Build Coastguard Worker {
530*f6dc9357SAndroid Build Coastguard Worker __m128i *p = (__m128i *)(void *)ivAes;
531*f6dc9357SAndroid Build Coastguard Worker __m128i *data = (__m128i *)(void *)data8;
532*f6dc9357SAndroid Build Coastguard Worker __m128i ctr = *p;
// numRounds = 2 * S + 1, where S is the UInt32 stored right after the counter.
533*f6dc9357SAndroid Build Coastguard Worker const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
534*f6dc9357SAndroid Build Coastguard Worker const __m128i *dataEnd;
535*f6dc9357SAndroid Build Coastguard Worker const __m128i one = _mm_cvtsi32_si128(1);
// ctr2 / two are initialised by AVX_CTR_LOOP_START inside the wide-loop macro.
536*f6dc9357SAndroid Build Coastguard Worker __m256i ctr2, two;
// From here on p points at the first round key; p[-2] is the counter slot.
537*f6dc9357SAndroid Build Coastguard Worker p += 2;
538*f6dc9357SAndroid Build Coastguard Worker
// Wide path. `keys` is not declared in this body; presumably the
// WIDE_LOOP_START_AVX macro provides it as 256-bit round keys - TODO confirm.
539*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
540*f6dc9357SAndroid Build Coastguard Worker {
541*f6dc9357SAndroid Build Coastguard Worker const __m256i *w = keys;
// r middle rounds + 1 last round = numRounds - 1 = 2 * S rounds in total.
542*f6dc9357SAndroid Build Coastguard Worker UInt32 r = numRounds - 2;
543*f6dc9357SAndroid Build Coastguard Worker WOP (AVX_DECLARE_VAR)
// Presumably produces the counter vectors and applies the first key w[0]
// (AVX_CTR_START is defined elsewhere).
544*f6dc9357SAndroid Build Coastguard Worker AVX_WOP_KEY (AVX_CTR_START, 0)
545*f6dc9357SAndroid Build Coastguard Worker
546*f6dc9357SAndroid Build Coastguard Worker w += 1;
547*f6dc9357SAndroid Build Coastguard Worker do
548*f6dc9357SAndroid Build Coastguard Worker {
549*f6dc9357SAndroid Build Coastguard Worker AVX_WOP_KEY (AVX_AES_ENC, 0)
550*f6dc9357SAndroid Build Coastguard Worker w += 1;
551*f6dc9357SAndroid Build Coastguard Worker }
552*f6dc9357SAndroid Build Coastguard Worker while (--r);
553*f6dc9357SAndroid Build Coastguard Worker AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)
554*f6dc9357SAndroid Build Coastguard Worker
// Presumably combines the encrypted counters with the data in place.
555*f6dc9357SAndroid Build Coastguard Worker WOP (AVX_CTR_END)
556*f6dc9357SAndroid Build Coastguard Worker }
// AVX_CTR_LOOP_ENC copies the upper half of ctr2 back into ctr for the tail loop.
557*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)
558*f6dc9357SAndroid Build Coastguard Worker
// Tail: one 128-bit block per iteration using the 128-bit round keys at p.
559*f6dc9357SAndroid Build Coastguard Worker SINGLE_LOOP
560*f6dc9357SAndroid Build Coastguard Worker {
// (p - 2 + 1) is the UInt32 S after the counter; the loop below runs S - 1 times.
561*f6dc9357SAndroid Build Coastguard Worker UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
562*f6dc9357SAndroid Build Coastguard Worker const __m128i *w = p;
563*f6dc9357SAndroid Build Coastguard Worker __m128i m;
// Increment the counter first, then encrypt the incremented value.
564*f6dc9357SAndroid Build Coastguard Worker MM_OP (_mm_add_epi64, ctr, one)
565*f6dc9357SAndroid Build Coastguard Worker m = _mm_xor_si128 (ctr, p[0]);
566*f6dc9357SAndroid Build Coastguard Worker w += 1;
// Two rounds per iteration.
567*f6dc9357SAndroid Build Coastguard Worker do
568*f6dc9357SAndroid Build Coastguard Worker {
569*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[0])
570*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[1])
571*f6dc9357SAndroid Build Coastguard Worker w += 2;
572*f6dc9357SAndroid Build Coastguard Worker }
573*f6dc9357SAndroid Build Coastguard Worker while (--numRounds2);
// Last two rounds: 2 * (S - 1) + 2 = 2 * S rounds in total.
574*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenc_si128, w[0])
575*f6dc9357SAndroid Build Coastguard Worker MM_OP_m (_mm_aesenclast_si128, w[1])
576*f6dc9357SAndroid Build Coastguard Worker CTR_END (m, 0)
577*f6dc9357SAndroid Build Coastguard Worker }
578*f6dc9357SAndroid Build Coastguard Worker
// Write the advanced counter back into the context.
579*f6dc9357SAndroid Build Coastguard Worker p[-2] = ctr;
580*f6dc9357SAndroid Build Coastguard Worker }
581*f6dc9357SAndroid Build Coastguard Worker
582*f6dc9357SAndroid Build Coastguard Worker #endif // USE_INTEL_VAES
583*f6dc9357SAndroid Build Coastguard Worker
584*f6dc9357SAndroid Build Coastguard Worker #else // USE_INTEL_AES
585*f6dc9357SAndroid Build Coastguard Worker
586*f6dc9357SAndroid Build Coastguard Worker /* no USE_INTEL_AES */
587*f6dc9357SAndroid Build Coastguard Worker
588*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_USE_AES_HW_STUB)
589*f6dc9357SAndroid Build Coastguard Worker // This file can be compiled with another C compiler,
590*f6dc9357SAndroid Build Coastguard Worker // or the asm version can be compiled instead,
591*f6dc9357SAndroid Build Coastguard Worker // so that real code is generated instead of these stub functions.
592*f6dc9357SAndroid Build Coastguard Worker // #if defined(_MSC_VER)
593*f6dc9357SAndroid Build Coastguard Worker #pragma message("AES HW_SW stub was used")
594*f6dc9357SAndroid Build Coastguard Worker // #endif
595*f6dc9357SAndroid Build Coastguard Worker
596*f6dc9357SAndroid Build Coastguard Worker #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
597*f6dc9357SAndroid Build Coastguard Worker #define AES_TYPE_keys UInt32
598*f6dc9357SAndroid Build Coastguard Worker #define AES_TYPE_data Byte
599*f6dc9357SAndroid Build Coastguard Worker #endif
600*f6dc9357SAndroid Build Coastguard Worker
// Stub variant of AES_FUNC_START: the head of a function
//   void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks)
601*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START(name) \
602*f6dc9357SAndroid Build Coastguard Worker void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \
603*f6dc9357SAndroid Build Coastguard Worker
// AES_COMPAT_STUB(name): declares `name` and defines `name_HW` as a thin
// wrapper that forwards all three arguments to `name` unchanged.
604*f6dc9357SAndroid Build Coastguard Worker #define AES_COMPAT_STUB(name) \
605*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START(name); \
606*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START(name ## _HW) \
607*f6dc9357SAndroid Build Coastguard Worker { name(p, data, numBlocks); }
608*f6dc9357SAndroid Build Coastguard Worker
// Defines AesCbc_Encode_HW, AesCbc_Decode_HW and AesCtr_Code_HW, each
// forwarding to the function of the same name without the _HW suffix.
609*f6dc9357SAndroid Build Coastguard Worker AES_COMPAT_STUB (AesCbc_Encode)
610*f6dc9357SAndroid Build Coastguard Worker AES_COMPAT_STUB (AesCbc_Decode)
611*f6dc9357SAndroid Build Coastguard Worker AES_COMPAT_STUB (AesCtr_Code)
612*f6dc9357SAndroid Build Coastguard Worker #endif // Z7_USE_AES_HW_STUB
613*f6dc9357SAndroid Build Coastguard Worker
614*f6dc9357SAndroid Build Coastguard Worker #endif // USE_INTEL_AES
615*f6dc9357SAndroid Build Coastguard Worker
616*f6dc9357SAndroid Build Coastguard Worker
617*f6dc9357SAndroid Build Coastguard Worker #ifndef USE_INTEL_VAES
618*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_USE_VAES_HW_STUB)
619*f6dc9357SAndroid Build Coastguard Worker // #if defined(_MSC_VER)
620*f6dc9357SAndroid Build Coastguard Worker #pragma message("VAES HW_SW stub was used")
621*f6dc9357SAndroid Build Coastguard Worker // #endif
622*f6dc9357SAndroid Build Coastguard Worker
// VAES_COMPAT_STUB(name): declares and defines `name_256` as a wrapper that
// forwards to `name`, casting the pointer arguments (via void *) to
// AES_TYPE_keys * and AES_TYPE_data *, the parameter types `name` is declared with.
623*f6dc9357SAndroid Build Coastguard Worker #define VAES_COMPAT_STUB(name) \
624*f6dc9357SAndroid Build Coastguard Worker void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
625*f6dc9357SAndroid Build Coastguard Worker void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
626*f6dc9357SAndroid Build Coastguard Worker { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }
627*f6dc9357SAndroid Build Coastguard Worker
// Defines AesCbc_Decode_HW_256 and AesCtr_Code_HW_256 on top of the
// corresponding non-_256 functions.
628*f6dc9357SAndroid Build Coastguard Worker VAES_COMPAT_STUB (AesCbc_Decode_HW)
629*f6dc9357SAndroid Build Coastguard Worker VAES_COMPAT_STUB (AesCtr_Code_HW)
630*f6dc9357SAndroid Build Coastguard Worker #endif
631*f6dc9357SAndroid Build Coastguard Worker #endif // ! USE_INTEL_VAES
632*f6dc9357SAndroid Build Coastguard Worker
633*f6dc9357SAndroid Build Coastguard Worker
634*f6dc9357SAndroid Build Coastguard Worker
635*f6dc9357SAndroid Build Coastguard Worker
636*f6dc9357SAndroid Build Coastguard Worker #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
637*f6dc9357SAndroid Build Coastguard Worker
638*f6dc9357SAndroid Build Coastguard Worker #if defined(__ARM_FEATURE_AES) \
639*f6dc9357SAndroid Build Coastguard Worker || defined(__ARM_FEATURE_CRYPTO)
640*f6dc9357SAndroid Build Coastguard Worker #define USE_HW_AES
641*f6dc9357SAndroid Build Coastguard Worker #else
642*f6dc9357SAndroid Build Coastguard Worker #if defined(MY_CPU_ARM64) \
643*f6dc9357SAndroid Build Coastguard Worker || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
644*f6dc9357SAndroid Build Coastguard Worker || defined(Z7_MSC_VER_ORIGINAL)
645*f6dc9357SAndroid Build Coastguard Worker #if defined(__ARM_FP) && \
646*f6dc9357SAndroid Build Coastguard Worker ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
647*f6dc9357SAndroid Build Coastguard Worker || defined(__GNUC__) && (__GNUC__ >= 6) \
648*f6dc9357SAndroid Build Coastguard Worker ) \
649*f6dc9357SAndroid Build Coastguard Worker || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
650*f6dc9357SAndroid Build Coastguard Worker #if defined(MY_CPU_ARM64) \
651*f6dc9357SAndroid Build Coastguard Worker || !defined(Z7_CLANG_VERSION) \
652*f6dc9357SAndroid Build Coastguard Worker || defined(__ARM_NEON) && \
653*f6dc9357SAndroid Build Coastguard Worker (Z7_CLANG_VERSION < 170000 || \
654*f6dc9357SAndroid Build Coastguard Worker Z7_CLANG_VERSION > 170001)
655*f6dc9357SAndroid Build Coastguard Worker #define USE_HW_AES
656*f6dc9357SAndroid Build Coastguard Worker #endif
657*f6dc9357SAndroid Build Coastguard Worker #endif
658*f6dc9357SAndroid Build Coastguard Worker #endif
659*f6dc9357SAndroid Build Coastguard Worker #endif
660*f6dc9357SAndroid Build Coastguard Worker
661*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_HW_AES
662*f6dc9357SAndroid Build Coastguard Worker
663*f6dc9357SAndroid Build Coastguard Worker // #pragma message("=== AES HW === ")
664*f6dc9357SAndroid Build Coastguard Worker // __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES
665*f6dc9357SAndroid Build Coastguard Worker
666*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) || defined(__GNUC__)
667*f6dc9357SAndroid Build Coastguard Worker #if !defined(__ARM_FEATURE_AES) && \
668*f6dc9357SAndroid Build Coastguard Worker !defined(__ARM_FEATURE_CRYPTO)
669*f6dc9357SAndroid Build Coastguard Worker #ifdef MY_CPU_ARM64
670*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__)
671*f6dc9357SAndroid Build Coastguard Worker #define ATTRIB_AES __attribute__((__target__("crypto")))
672*f6dc9357SAndroid Build Coastguard Worker #else
673*f6dc9357SAndroid Build Coastguard Worker #define ATTRIB_AES __attribute__((__target__("+crypto")))
674*f6dc9357SAndroid Build Coastguard Worker #endif
675*f6dc9357SAndroid Build Coastguard Worker #else
676*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__)
677*f6dc9357SAndroid Build Coastguard Worker #define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
678*f6dc9357SAndroid Build Coastguard Worker #else
679*f6dc9357SAndroid Build Coastguard Worker #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
680*f6dc9357SAndroid Build Coastguard Worker #endif
681*f6dc9357SAndroid Build Coastguard Worker #endif
682*f6dc9357SAndroid Build Coastguard Worker #endif
683*f6dc9357SAndroid Build Coastguard Worker #else
684*f6dc9357SAndroid Build Coastguard Worker // _MSC_VER
685*f6dc9357SAndroid Build Coastguard Worker // for arm32
686*f6dc9357SAndroid Build Coastguard Worker #define _ARM_USE_NEW_NEON_INTRINSICS
687*f6dc9357SAndroid Build Coastguard Worker #endif
688*f6dc9357SAndroid Build Coastguard Worker
689*f6dc9357SAndroid Build Coastguard Worker #ifndef ATTRIB_AES
690*f6dc9357SAndroid Build Coastguard Worker #define ATTRIB_AES
691*f6dc9357SAndroid Build Coastguard Worker #endif
692*f6dc9357SAndroid Build Coastguard Worker
693*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
694*f6dc9357SAndroid Build Coastguard Worker #include <arm64_neon.h>
695*f6dc9357SAndroid Build Coastguard Worker #else
696*f6dc9357SAndroid Build Coastguard Worker /*
697*f6dc9357SAndroid Build Coastguard Worker clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
698*f6dc9357SAndroid Build Coastguard Worker clang
699*f6dc9357SAndroid Build Coastguard Worker 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO)
700*f6dc9357SAndroid Build Coastguard Worker 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
701*f6dc9357SAndroid Build Coastguard Worker 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
702*f6dc9357SAndroid Build Coastguard Worker 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
703*f6dc9357SAndroid Build Coastguard Worker 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8
704*f6dc9357SAndroid Build Coastguard Worker */
705*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && __clang_major__ < 16
706*f6dc9357SAndroid Build Coastguard Worker #if !defined(__ARM_FEATURE_AES) && \
707*f6dc9357SAndroid Build Coastguard Worker !defined(__ARM_FEATURE_CRYPTO)
708*f6dc9357SAndroid Build Coastguard Worker // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
709*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
710*f6dc9357SAndroid Build Coastguard Worker #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
711*f6dc9357SAndroid Build Coastguard Worker // #if defined(__clang__) && __clang_major__ < 13
712*f6dc9357SAndroid Build Coastguard Worker #define __ARM_FEATURE_CRYPTO 1
713*f6dc9357SAndroid Build Coastguard Worker // #else
714*f6dc9357SAndroid Build Coastguard Worker #define __ARM_FEATURE_AES 1
715*f6dc9357SAndroid Build Coastguard Worker // #endif
716*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
717*f6dc9357SAndroid Build Coastguard Worker #endif
718*f6dc9357SAndroid Build Coastguard Worker #endif // clang
719*f6dc9357SAndroid Build Coastguard Worker
720*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__)
721*f6dc9357SAndroid Build Coastguard Worker
722*f6dc9357SAndroid Build Coastguard Worker #if defined(__ARM_ARCH) && __ARM_ARCH < 8
723*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
724*f6dc9357SAndroid Build Coastguard Worker // #pragma message("#define __ARM_ARCH 8")
725*f6dc9357SAndroid Build Coastguard Worker #undef __ARM_ARCH
726*f6dc9357SAndroid Build Coastguard Worker #define __ARM_ARCH 8
727*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
728*f6dc9357SAndroid Build Coastguard Worker #endif
729*f6dc9357SAndroid Build Coastguard Worker
730*f6dc9357SAndroid Build Coastguard Worker #endif // clang
731*f6dc9357SAndroid Build Coastguard Worker
732*f6dc9357SAndroid Build Coastguard Worker #include <arm_neon.h>
733*f6dc9357SAndroid Build Coastguard Worker
734*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
735*f6dc9357SAndroid Build Coastguard Worker defined(__ARM_FEATURE_CRYPTO) && \
736*f6dc9357SAndroid Build Coastguard Worker defined(__ARM_FEATURE_AES)
737*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
738*f6dc9357SAndroid Build Coastguard Worker #undef __ARM_FEATURE_CRYPTO
739*f6dc9357SAndroid Build Coastguard Worker #undef __ARM_FEATURE_AES
740*f6dc9357SAndroid Build Coastguard Worker #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
741*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
742*f6dc9357SAndroid Build Coastguard Worker // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
743*f6dc9357SAndroid Build Coastguard Worker #endif
744*f6dc9357SAndroid Build Coastguard Worker
745*f6dc9357SAndroid Build Coastguard Worker #endif // Z7_MSC_VER_ORIGINAL
746*f6dc9357SAndroid Build Coastguard Worker
// 128-bit vector type used for AES state, round keys and data blocks in the ARM code.
747*f6dc9357SAndroid Build Coastguard Worker typedef uint8x16_t v128;
748*f6dc9357SAndroid Build Coastguard Worker
// Head of every ARM AES function: ivAes is the context (IV / counter, round
// count, round keys), data8 is the buffer processed in place, numBlocks is
// the number of 16-byte blocks.
749*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START(name) \
750*f6dc9357SAndroid Build Coastguard Worker void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
751*f6dc9357SAndroid Build Coastguard Worker // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)
752*f6dc9357SAndroid Build Coastguard Worker
// Prototype followed by the head of the definition, with ATTRIB_AES
// (possibly empty) placed in front of the definition.
753*f6dc9357SAndroid Build Coastguard Worker #define AES_FUNC_START2(name) \
754*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name); \
755*f6dc9357SAndroid Build Coastguard Worker ATTRIB_AES \
756*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START (name)
757*f6dc9357SAndroid Build Coastguard Worker
// Statement helpers. Each expansion already ends with ';', so uses are written
// without a trailing semicolon. The *_m forms operate on a local named `m`.
758*f6dc9357SAndroid Build Coastguard Worker #define MM_OP(op, dest, src) dest = op(dest, src);
759*f6dc9357SAndroid Build Coastguard Worker #define MM_OP_m(op, src) MM_OP(op, m, src)
760*f6dc9357SAndroid Build Coastguard Worker #define MM_OP1_m(op) m = op(m);
761*f6dc9357SAndroid Build Coastguard Worker
// dest ^= src on 128-bit vectors.
762*f6dc9357SAndroid Build Coastguard Worker #define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src)
763*f6dc9357SAndroid Build Coastguard Worker #define MM_XOR_m( src) MM_XOR(m, src)
764*f6dc9357SAndroid Build Coastguard Worker
// AES_E_m: one vaeseq_u8 step on m with key k.
// AES_E_MC_m: the same step followed by vaesmcq_u8 (MixColumns).
765*f6dc9357SAndroid Build Coastguard Worker #define AES_E_m(k) MM_OP_m (vaeseq_u8, k)
766*f6dc9357SAndroid Build Coastguard Worker #define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8)
767*f6dc9357SAndroid Build Coastguard Worker
768*f6dc9357SAndroid Build Coastguard Worker
/*
  AesCbc_Encode_HW (ARM): AES-CBC encryption of numBlocks 16-byte blocks, in place.

  ivAes is accessed in 16-byte units:
    p[0]   : IV / chaining value (read on entry, updated on exit)
    p[1]   : its first UInt32 is numRounds2
    p[2..] : round keys
  Returns immediately when numBlocks == 0 (the context is left untouched).

  The round sequence below is fully unrolled: it performs 10, 12 or 14
  vaeseq_u8 steps for numRounds2 == 5, 6 or 7 respectively, followed by a
  final XOR with the last round key.
*/
769*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Encode_HW)
770*f6dc9357SAndroid Build Coastguard Worker {
// The do/while below always runs at least once, so zero blocks must exit here.
771*f6dc9357SAndroid Build Coastguard Worker if (numBlocks == 0)
772*f6dc9357SAndroid Build Coastguard Worker return;
773*f6dc9357SAndroid Build Coastguard Worker {
774*f6dc9357SAndroid Build Coastguard Worker v128 * const p = (v128 *)(void *)ivAes;
775*f6dc9357SAndroid Build Coastguard Worker v128 *data = (v128 *)(void *)data8;
// m carries the chaining value: the IV first, then each produced ciphertext block.
776*f6dc9357SAndroid Build Coastguard Worker v128 m = *p;
777*f6dc9357SAndroid Build Coastguard Worker const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
// w is positioned so that w[-2] .. w[2] are the last five round keys.
778*f6dc9357SAndroid Build Coastguard Worker const v128 *w = p + (size_t)numRounds2 * 2;
// The first ten round keys are loaded unconditionally; k6 .. k9 are only
// used for the larger round counts (see the branches in the loop).
779*f6dc9357SAndroid Build Coastguard Worker const v128 k0 = p[2];
780*f6dc9357SAndroid Build Coastguard Worker const v128 k1 = p[3];
781*f6dc9357SAndroid Build Coastguard Worker const v128 k2 = p[4];
782*f6dc9357SAndroid Build Coastguard Worker const v128 k3 = p[5];
783*f6dc9357SAndroid Build Coastguard Worker const v128 k4 = p[6];
784*f6dc9357SAndroid Build Coastguard Worker const v128 k5 = p[7];
785*f6dc9357SAndroid Build Coastguard Worker const v128 k6 = p[8];
786*f6dc9357SAndroid Build Coastguard Worker const v128 k7 = p[9];
787*f6dc9357SAndroid Build Coastguard Worker const v128 k8 = p[10];
788*f6dc9357SAndroid Build Coastguard Worker const v128 k9 = p[11];
789*f6dc9357SAndroid Build Coastguard Worker const v128 k_z4 = w[-2];
790*f6dc9357SAndroid Build Coastguard Worker const v128 k_z3 = w[-1];
791*f6dc9357SAndroid Build Coastguard Worker const v128 k_z2 = w[0];
792*f6dc9357SAndroid Build Coastguard Worker const v128 k_z1 = w[1];
793*f6dc9357SAndroid Build Coastguard Worker const v128 k_z0 = w[2];
794*f6dc9357SAndroid Build Coastguard Worker // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle,
795*f6dc9357SAndroid Build Coastguard Worker // because gcc/clang compilers are not good for that optimization.
796*f6dc9357SAndroid Build Coastguard Worker do
797*f6dc9357SAndroid Build Coastguard Worker {
// CBC: XOR the plaintext block into the previous ciphertext (or the IV).
798*f6dc9357SAndroid Build Coastguard Worker MM_XOR_m (*data)
799*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k0)
800*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k1)
801*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k2)
802*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k3)
803*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k4)
804*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k5)
// Two extra rounds when numRounds2 >= 6, and two more when it is above 6.
805*f6dc9357SAndroid Build Coastguard Worker if (numRounds2 >= 6)
806*f6dc9357SAndroid Build Coastguard Worker {
807*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k6)
808*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k7)
809*f6dc9357SAndroid Build Coastguard Worker if (numRounds2 != 6)
810*f6dc9357SAndroid Build Coastguard Worker {
811*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k8)
812*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k9)
813*f6dc9357SAndroid Build Coastguard Worker }
814*f6dc9357SAndroid Build Coastguard Worker }
815*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k_z4)
816*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k_z3)
817*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (k_z2)
// Last round: no MixColumns, then the final round key is applied by a plain XOR.
818*f6dc9357SAndroid Build Coastguard Worker AES_E_m (k_z1)
819*f6dc9357SAndroid Build Coastguard Worker MM_XOR_m (k_z0)
820*f6dc9357SAndroid Build Coastguard Worker *data++ = m;
821*f6dc9357SAndroid Build Coastguard Worker }
822*f6dc9357SAndroid Build Coastguard Worker while (--numBlocks);
// Store the last ciphertext block as the IV for the next call.
823*f6dc9357SAndroid Build Coastguard Worker *p = m;
824*f6dc9357SAndroid Build Coastguard Worker }
825*f6dc9357SAndroid Build Coastguard Worker }
826*f6dc9357SAndroid Build Coastguard Worker
827*f6dc9357SAndroid Build Coastguard Worker
// WOP_n(op) applies op(m_i, i) for i = 1 .. n - 1; WOP_1 is intentionally
// empty because element 0 is handled by WOP itself (see below).
828*f6dc9357SAndroid Build Coastguard Worker #define WOP_1(op)
829*f6dc9357SAndroid Build Coastguard Worker #define WOP_2(op) WOP_1 (op) op (m1, 1)
830*f6dc9357SAndroid Build Coastguard Worker #define WOP_3(op) WOP_2 (op) op (m2, 2)
831*f6dc9357SAndroid Build Coastguard Worker #define WOP_4(op) WOP_3 (op) op (m3, 3)
832*f6dc9357SAndroid Build Coastguard Worker #define WOP_5(op) WOP_4 (op) op (m4, 4)
833*f6dc9357SAndroid Build Coastguard Worker #define WOP_6(op) WOP_5 (op) op (m5, 5)
834*f6dc9357SAndroid Build Coastguard Worker #define WOP_7(op) WOP_6 (op) op (m6, 6)
835*f6dc9357SAndroid Build Coastguard Worker #define WOP_8(op) WOP_7 (op) op (m7, 7)
836*f6dc9357SAndroid Build Coastguard Worker
// Number of blocks processed per wide-loop iteration. WOP_M1 must name the
// WOP_n macro that matches NUM_WAYS.
837*f6dc9357SAndroid Build Coastguard Worker #define NUM_WAYS 8
838*f6dc9357SAndroid Build Coastguard Worker #define WOP_M1 WOP_8
839*f6dc9357SAndroid Build Coastguard Worker
// WOP(op): apply op to all NUM_WAYS variables m0 .. m7.
// WOP_M1(op) alone: apply op to all of them except m0.
840*f6dc9357SAndroid Build Coastguard Worker #define WOP(op) op (m0, 0) WOP_M1(op)
841*f6dc9357SAndroid Build Coastguard Worker
// Per-element operations meant to be passed to WOP / WOP_M1.
// `data` is the caller's v128 pointer to the current position.
842*f6dc9357SAndroid Build Coastguard Worker #define DECLARE_VAR(reg, ii) v128 reg;
843*f6dc9357SAndroid Build Coastguard Worker #define LOAD_data( reg, ii) reg = data[ii];
844*f6dc9357SAndroid Build Coastguard Worker #define STORE_data( reg, ii) data[ii] = reg;
845*f6dc9357SAndroid Build Coastguard Worker #if (NUM_WAYS > 1)
// reg ^= previous data block; used for CBC chaining of elements 1 .. NUM_WAYS - 1.
846*f6dc9357SAndroid Build Coastguard Worker #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
847*f6dc9357SAndroid Build Coastguard Worker #endif
848*f6dc9357SAndroid Build Coastguard Worker
// Applies op to reg with a local named `key`, which WOP_KEY declares.
849*f6dc9357SAndroid Build Coastguard Worker #define MM_OP_key(op, reg) MM_OP (op, reg, key)
850*f6dc9357SAndroid Build Coastguard Worker
// Single-block decrypt helpers operating on the local `m`.
851*f6dc9357SAndroid Build Coastguard Worker #define AES_D_m(k) MM_OP_m (vaesdq_u8, k)
852*f6dc9357SAndroid Build Coastguard Worker #define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8)
853*f6dc9357SAndroid Build Coastguard Worker
// Multi-way variants: the ii argument is unused; the key comes from `key`.
854*f6dc9357SAndroid Build Coastguard Worker #define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg)
855*f6dc9357SAndroid Build Coastguard Worker #define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg)
856*f6dc9357SAndroid Build Coastguard Worker #define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg)
857*f6dc9357SAndroid Build Coastguard Worker
858*f6dc9357SAndroid Build Coastguard Worker #define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg);
859*f6dc9357SAndroid Build Coastguard Worker #define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg);
860*f6dc9357SAndroid Build Coastguard Worker
// CTR_START: advance the caller's uint64x2_t `ctr` by `one`, then copy it into reg.
// CTR_END: data[ii] ^= reg.
861*f6dc9357SAndroid Build Coastguard Worker #define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr);
862*f6dc9357SAndroid Build Coastguard Worker #define CTR_END( reg, ii) MM_XOR (data[ii], reg)
863*f6dc9357SAndroid Build Coastguard Worker
// WOP_KEY(op, n): load round key w[n] once into `key` and apply op to all
// NUM_WAYS variables. Expects a pointer named `w` in the caller's scope.
864*f6dc9357SAndroid Build Coastguard Worker #define WOP_KEY(op, n) { \
865*f6dc9357SAndroid Build Coastguard Worker const v128 key = w[n]; \
866*f6dc9357SAndroid Build Coastguard Worker WOP(op) }
867*f6dc9357SAndroid Build Coastguard Worker
// WIDE_LOOP_START / WIDE_LOOP_END bracket a body that handles NUM_WAYS blocks
// at data[0 .. NUM_WAYS - 1]. The loop runs only while at least NUM_WAYS
// blocks remain: dataEnd is temporarily lowered by NUM_WAYS so the condition
// can be written as data <= dataEnd, and is restored afterwards.
// Expects `data`, `dataEnd` and `numBlocks` in the caller's scope.
868*f6dc9357SAndroid Build Coastguard Worker #define WIDE_LOOP_START \
869*f6dc9357SAndroid Build Coastguard Worker dataEnd = data + numBlocks; \
870*f6dc9357SAndroid Build Coastguard Worker if (numBlocks >= NUM_WAYS) \
871*f6dc9357SAndroid Build Coastguard Worker { dataEnd -= NUM_WAYS; do { \
872*f6dc9357SAndroid Build Coastguard Worker
873*f6dc9357SAndroid Build Coastguard Worker #define WIDE_LOOP_END \
874*f6dc9357SAndroid Build Coastguard Worker data += NUM_WAYS; \
875*f6dc9357SAndroid Build Coastguard Worker } while (data <= dataEnd); \
876*f6dc9357SAndroid Build Coastguard Worker dataEnd += NUM_WAYS; } \
877*f6dc9357SAndroid Build Coastguard Worker
// SINGLE_LOOP: handles the remaining blocks one at a time; it relies on
// dataEnd having been set by WIDE_LOOP_START.
878*f6dc9357SAndroid Build Coastguard Worker #define SINGLE_LOOP \
879*f6dc9357SAndroid Build Coastguard Worker for (; data < dataEnd; data++)
880*f6dc9357SAndroid Build Coastguard Worker
881*f6dc9357SAndroid Build Coastguard Worker
/*
  AesCbc_Decode_HW (ARM): AES-CBC decryption of numBlocks 16-byte blocks, in place.

  ivAes is accessed in 16-byte units:
    [0]   : IV / chaining value (read on entry, written back on exit)
    [1]   : its first UInt32 is the round-count value (numRounds2)
    [2..] : round keys
  Blocks are processed NUM_WAYS at a time while enough remain, then one by one.
  Round keys are consumed from the end of the array towards its start.
*/
882*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCbc_Decode_HW)
883*f6dc9357SAndroid Build Coastguard Worker {
884*f6dc9357SAndroid Build Coastguard Worker v128 *p = (v128 *)(void *)ivAes;
885*f6dc9357SAndroid Build Coastguard Worker v128 *data = (v128 *)(void *)data8;
886*f6dc9357SAndroid Build Coastguard Worker v128 iv = *p;
// Computed before p is advanced: wStart = context + 2 * numRounds2, so
// wStart[2] is the last round key.
887*f6dc9357SAndroid Build Coastguard Worker const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
888*f6dc9357SAndroid Build Coastguard Worker const v128 *dataEnd;
// p now points at the first round key; p[-2] is the IV slot.
889*f6dc9357SAndroid Build Coastguard Worker p += 2;
890*f6dc9357SAndroid Build Coastguard Worker
891*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_START
892*f6dc9357SAndroid Build Coastguard Worker {
893*f6dc9357SAndroid Build Coastguard Worker const v128 *w = wStart;
// Declares m0 .. m7 and loads NUM_WAYS ciphertext blocks into them.
894*f6dc9357SAndroid Build Coastguard Worker WOP (DECLARE_VAR)
895*f6dc9357SAndroid Build Coastguard Worker WOP (LOAD_data)
896*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_D_IMC, 2)
// Two rounds per iteration, walking w down to the first round key.
897*f6dc9357SAndroid Build Coastguard Worker do
898*f6dc9357SAndroid Build Coastguard Worker {
899*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_D_IMC, 1)
900*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_D_IMC, 0)
901*f6dc9357SAndroid Build Coastguard Worker w -= 2;
902*f6dc9357SAndroid Build Coastguard Worker }
903*f6dc9357SAndroid Build Coastguard Worker while (w != p);
// Last round without the inverse MixColumns, then XOR with the first round key.
904*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_D, 1)
905*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_XOR, 0)
// CBC chaining: m0 ^= iv, m_i ^= data[i - 1] for the others. The ciphertext
// is still in data[] at this point because the stores come last.
906*f6dc9357SAndroid Build Coastguard Worker MM_XOR (m0, iv)
907*f6dc9357SAndroid Build Coastguard Worker WOP_M1 (XOR_data_M1)
// The last ciphertext block of the batch becomes the next iv.
908*f6dc9357SAndroid Build Coastguard Worker LOAD_data(iv, NUM_WAYS - 1)
909*f6dc9357SAndroid Build Coastguard Worker WOP (STORE_data)
910*f6dc9357SAndroid Build Coastguard Worker }
911*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_END
912*f6dc9357SAndroid Build Coastguard Worker
// Tail: same round sequence for one block at a time.
913*f6dc9357SAndroid Build Coastguard Worker SINGLE_LOOP
914*f6dc9357SAndroid Build Coastguard Worker {
915*f6dc9357SAndroid Build Coastguard Worker const v128 *w = wStart;
916*f6dc9357SAndroid Build Coastguard Worker v128 m; LOAD_data(m, 0)
917*f6dc9357SAndroid Build Coastguard Worker AES_D_IMC_m (w[2])
918*f6dc9357SAndroid Build Coastguard Worker do
919*f6dc9357SAndroid Build Coastguard Worker {
920*f6dc9357SAndroid Build Coastguard Worker AES_D_IMC_m (w[1])
921*f6dc9357SAndroid Build Coastguard Worker AES_D_IMC_m (w[0])
922*f6dc9357SAndroid Build Coastguard Worker w -= 2;
923*f6dc9357SAndroid Build Coastguard Worker }
924*f6dc9357SAndroid Build Coastguard Worker while (w != p);
925*f6dc9357SAndroid Build Coastguard Worker AES_D_m (w[1])
926*f6dc9357SAndroid Build Coastguard Worker MM_XOR_m (w[0])
// XOR with the chaining value, then keep the current ciphertext as the
// next iv before overwriting it with the plaintext.
927*f6dc9357SAndroid Build Coastguard Worker MM_XOR_m (iv)
928*f6dc9357SAndroid Build Coastguard Worker LOAD_data(iv, 0)
929*f6dc9357SAndroid Build Coastguard Worker STORE_data(m, 0)
930*f6dc9357SAndroid Build Coastguard Worker }
931*f6dc9357SAndroid Build Coastguard Worker
// Write the updated chaining value back into the context.
932*f6dc9357SAndroid Build Coastguard Worker p[-2] = iv;
933*f6dc9357SAndroid Build Coastguard Worker }
934*f6dc9357SAndroid Build Coastguard Worker
935*f6dc9357SAndroid Build Coastguard Worker
/*
  AesCtr_Code_HW (ARM NEON variant).

  Judging by its name and by the CTR_START / CTR_END macros it uses, this is
  the AES counter-mode routine; those macros, AES_FUNC_START2, WIDE_LOOP_START,
  WIDE_LOOP_END, SINGLE_LOOP, WOP and WOP_KEY are defined outside this part of
  the file, so their exact expansion should be confirmed there.

  What the visible code establishes about the ivAes buffer (viewed as v128[]):
    [0]   - the counter block: loaded into `ctr` on entry, written back on exit.
    [1]   - its first UInt32 is a count n used to locate the end of the key
            schedule (wEnd = ivAes + 2 * n).
    [2..] - round keys, consumed two per loop iteration, followed by three more
            after the loop (2 * n + 1 keys in total).
  NOTE(review): the do/while loops compare with `!=` after advancing by 2, so
  the stored count is expected to be at least 2 - confirm against the key setup.
*/
936*f6dc9357SAndroid Build Coastguard Worker AES_FUNC_START2 (AesCtr_Code_HW)
937*f6dc9357SAndroid Build Coastguard Worker {
// ivAes and data8 are not declared in this block; presumably they are the
// parameters introduced by AES_FUNC_START2 - TODO confirm.
938*f6dc9357SAndroid Build Coastguard Worker v128 *p = (v128 *)(void *)ivAes;
939*f6dc9357SAndroid Build Coastguard Worker v128 *data = (v128 *)(void *)data8;
// Current counter block, viewed as two 64-bit lanes.
940*f6dc9357SAndroid Build Coastguard Worker uint64x2_t ctr = vreinterpretq_u64_u8(*p);
// wEnd is computed from the un-advanced p (before `p += 2` below).
941*f6dc9357SAndroid Build Coastguard Worker const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
// dataEnd is not assigned in the visible code; presumably the loop macros set it.
942*f6dc9357SAndroid Build Coastguard Worker const v128 *dataEnd;
943*f6dc9357SAndroid Build Coastguard Worker // the bug in clang:
944*f6dc9357SAndroid Build Coastguard Worker // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
// For clang <= 9 the -Wvector-conversion warning is silenced for the
// vsetq_lane_u64 call below.
945*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__) && (__clang_major__ <= 9)
946*f6dc9357SAndroid Build Coastguard Worker #pragma GCC diagnostic ignored "-Wvector-conversion"
947*f6dc9357SAndroid Build Coastguard Worker #endif
// Vector constant with lane 0 = 1 and lane 1 = 0; not referenced directly in
// this block, presumably used by CTR_START to step the counter - TODO confirm.
948*f6dc9357SAndroid Build Coastguard Worker const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
// Skip the counter block and the count word: p now points at the first round key.
949*f6dc9357SAndroid Build Coastguard Worker p += 2;
950*f6dc9357SAndroid Build Coastguard Worker
// Wide path: WOP applies the given macro once per way (presumably NUM_WAYS
// blocks per pass - see the WOP / WIDE_LOOP_* definitions).
951*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_START
952*f6dc9357SAndroid Build Coastguard Worker {
953*f6dc9357SAndroid Build Coastguard Worker const v128 *w = p;
954*f6dc9357SAndroid Build Coastguard Worker WOP (DECLARE_VAR)
955*f6dc9357SAndroid Build Coastguard Worker WOP (CTR_START)
// Walk the key schedule forward, two round keys per iteration, until wEnd.
956*f6dc9357SAndroid Build Coastguard Worker do
957*f6dc9357SAndroid Build Coastguard Worker {
958*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_E_MC, 0)
959*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_E_MC, 1)
960*f6dc9357SAndroid Build Coastguard Worker w += 2;
961*f6dc9357SAndroid Build Coastguard Worker }
962*f6dc9357SAndroid Build Coastguard Worker while (w != wEnd);
// Tail of the schedule: key 0 with AES_E_MC, key 1 with AES_E, then key 2 is
// applied with AES_XOR.
963*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_E_MC, 0)
964*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_E, 1)
965*f6dc9357SAndroid Build Coastguard Worker WOP_KEY (AES_XOR, 2)
966*f6dc9357SAndroid Build Coastguard Worker WOP (CTR_END)
967*f6dc9357SAndroid Build Coastguard Worker }
968*f6dc9357SAndroid Build Coastguard Worker WIDE_LOOP_END
969*f6dc9357SAndroid Build Coastguard Worker
// Single-block path: the same key sequence as above, applied to one vector `m`
// (presumably handles the blocks left over after the wide loop).
970*f6dc9357SAndroid Build Coastguard Worker SINGLE_LOOP
971*f6dc9357SAndroid Build Coastguard Worker {
972*f6dc9357SAndroid Build Coastguard Worker const v128 *w = p;
973*f6dc9357SAndroid Build Coastguard Worker v128 m;
974*f6dc9357SAndroid Build Coastguard Worker CTR_START (m, 0)
975*f6dc9357SAndroid Build Coastguard Worker do
976*f6dc9357SAndroid Build Coastguard Worker {
977*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (w[0])
978*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (w[1])
979*f6dc9357SAndroid Build Coastguard Worker w += 2;
980*f6dc9357SAndroid Build Coastguard Worker }
981*f6dc9357SAndroid Build Coastguard Worker while (w != wEnd);
982*f6dc9357SAndroid Build Coastguard Worker AES_E_MC_m (w[0])
983*f6dc9357SAndroid Build Coastguard Worker AES_E_m (w[1])
984*f6dc9357SAndroid Build Coastguard Worker MM_XOR_m (w[2])
985*f6dc9357SAndroid Build Coastguard Worker CTR_END (m, 0)
986*f6dc9357SAndroid Build Coastguard Worker }
987*f6dc9357SAndroid Build Coastguard Worker
// p was advanced by 2 above, so p[-2] is the counter slot read on entry:
// store the updated counter back for the next call.
988*f6dc9357SAndroid Build Coastguard Worker p[-2] = vreinterpretq_u8_u64(ctr);
989*f6dc9357SAndroid Build Coastguard Worker }
990*f6dc9357SAndroid Build Coastguard Worker
991*f6dc9357SAndroid Build Coastguard Worker #endif // USE_HW_AES
992*f6dc9357SAndroid Build Coastguard Worker
993*f6dc9357SAndroid Build Coastguard Worker #endif // MY_CPU_ARM_OR_ARM64
994*f6dc9357SAndroid Build Coastguard Worker
// Remove the helper and configuration macros listed below so that they are
// no longer defined after this point.
995*f6dc9357SAndroid Build Coastguard Worker #undef NUM_WAYS
996*f6dc9357SAndroid Build Coastguard Worker #undef WOP_M1
997*f6dc9357SAndroid Build Coastguard Worker #undef WOP
998*f6dc9357SAndroid Build Coastguard Worker #undef DECLARE_VAR
999*f6dc9357SAndroid Build Coastguard Worker #undef LOAD_data
1000*f6dc9357SAndroid Build Coastguard Worker #undef STORE_data
1001*f6dc9357SAndroid Build Coastguard Worker #undef USE_INTEL_AES
1002*f6dc9357SAndroid Build Coastguard Worker #undef USE_HW_AES
1003