#pragma once
/*
AVX implementation of sin, cos, sincos, exp and log

Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/

Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

(this is the zlib license)
*/

#include <ATen/native/cpu/Intrinsics.h>

/* The original source of this file has been modified. */
#if defined(CPU_CAPABILITY_AVX2)

#if defined(__GNUC__)
# define ALIGN32_BEG __attribute__((aligned(32)))
#elif defined(_WIN32)
# define ALIGN32_BEG __declspec(align(32))
#endif

typedef __m256 v8sf; // vector of 8 float (avx)
typedef __m256i v8si; // vector of 8 int (avx2)

/* declare some AVX constants -- why can't I figure a better way to do that? */
#define _PS256_CONST(Name, Val) \
  static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val) \
  static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
  static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
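
/* For reference (an illustrative note, not part of the original source):
   _PS256_CONST(1, 1.0f) simply expands to a 32-byte-aligned array of eight
   identical floats,

     static const ALIGN32_BEG float _ps256_1[8] =
       { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };

   which the functions below load as a vector with a reinterpreting
   dereference such as v8sf one = *(v8sf*)_ps256_1; (equivalent to an
   aligned _mm256_load_ps). */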

_PS256_CONST(1 , 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);

_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);


/* natural logarithm computed for 8 simultaneous floats
   returns NaN for x <= 0
*/
inline v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */

  // can be done with AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // this is again another AVX2 instruction
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);


  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
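
/* Minimal usage sketch for log256_ps (illustrative only, not part of the
   original source; the buffer names below are hypothetical). Eight lanes are
   processed per call, and non-positive inputs come back as NaN. */
#if 0
inline void log256_ps_example() {
  alignas(32) float in[8]  = {0.5f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, 32.0f, 64.0f};
  alignas(32) float out[8];
  _mm256_store_ps(out, log256_ps(_mm256_load_ps(in)));
  // out[i] ~ logf(in[i]); e.g. out[1] ~ 0.0f and out[2] ~ 0.6931472f
}
#endif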

_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);

inline v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp = _mm256_cvtepi32_ps(imm0);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
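
/* Scalar sketch of the range reduction used by exp256_ps above (illustrative
   only, not part of the original source): exp(x) is rewritten as
   2^n * exp(g) with n = floor(x*log2(e) + 0.5) and g = x - n*ln(2), where
   ln(2) is applied as the two constants C1 + C2 so the subtraction keeps
   extra precision, and exp(g) uses the same degree-5 polynomial. */
#if 0
#include <cmath>
inline float exp_scalar_reference(float x) {
  float n = std::floor(x * 1.44269504088896341f + 0.5f); // n ~ round(x*log2(e))
  float g = x - n * 0.693359375f - n * -2.12194440e-4f;  // g = x - n*(C1 + C2)
  float z = g * g;
  float y = 1.9875691500E-4f;
  y = y * g + 1.3981999507E-3f;
  y = y * g + 8.3334519073E-3f;
  y = y * g + 4.1665795894E-2f;
  y = y * g + 1.6666665459E-1f;
  y = y * g + 5.0000001201E-1f;
  y = y * z + g + 1.0f;                                   // exp(g) ~ 1 + g + g^2*P(g)
  return std::ldexp(y, (int)n);                           // scale by 2^n
}
#endif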

_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 8 sines at once using AVX intrinsics

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

*/
inline v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX, let's perform them using SSE2 directives
  */

  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instructions
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);

  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
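
/* Scalar sketch of the quadrant logic used above (illustrative only, not part
   of the original source). The octant index j chooses between the sine and
   cosine polynomials and drives the final sign flip, exactly as the masks do
   in the vector code. */
#if 0
#include <cmath>
inline float sin_scalar_reference(float x) {
  float sign = (x < 0.0f) ? -1.0f : 1.0f;
  x = std::fabs(x);
  int j = (int)(x * 1.27323954473516f);  // x * 4/Pi
  j = (j + 1) & ~1;                      // j = (j+1) & (~1), as in cephes
  float y = (float)j;
  if (j & 4) sign = -sign;               // octants 4..7 flip the sign
  // extended precision modular arithmetic: x = ((x - y*DP1) - y*DP2) - y*DP3
  x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f)
      - y * 3.77489497744594108e-8f;
  float z = x * x, r;
  if (j & 2) {                           // cosine polynomial
    r = ((2.443315711809948E-005f * z - 1.388731625493765E-003f) * z
         + 4.166664568298827E-002f) * z * z - 0.5f * z + 1.0f;
  } else {                               // sine polynomial
    r = ((-1.9515295891E-4f * z + 8.3321608736E-3f) * z
         - 1.6666654611E-1f) * z * x + x;
  }
  return sign * r;
}
#endif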

/* almost the same as sin256_ps */
inline v8sf cos256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  v8si imm0, imm2;

  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm2 = _mm256_sub_epi32(imm2, *(v8si*)_pi32_256_2);

  /* get the swap sign flag */
  imm0 = _mm256_andnot_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);

  v8sf sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}

/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them...
   it is almost as fast, and gives you a free cosine with your sine */
inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);

  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
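
/* Minimal usage sketch for sincos256_ps (illustrative only, not part of the
   original source; the buffer names below are hypothetical). One call yields
   both the sine and the cosine of eight floats. */
#if 0
inline void sincos256_ps_example() {
  alignas(32) float angles[8] = {0.0f, 0.25f, 0.5f, 0.75f, 1.0f, 1.5f, 2.0f, 3.0f};
  alignas(32) float s_out[8], c_out[8];
  v8sf s, c;
  sincos256_ps(_mm256_load_ps(angles), &s, &c);
  _mm256_store_ps(s_out, s);
  _mm256_store_ps(c_out, c);
  // s_out[i] ~ sinf(angles[i]) and c_out[i] ~ cosf(angles[i]) for each lane
}
#endif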

#endif // CPU_CAPABILITY_AVX2