#pragma once
/*
   AVX implementation of sin, cos, sincos, exp and log

   Based on "sse_mathfun.h", by Julien Pommier
   http://gruntthepeon.free.fr/ssemath/

   Copyright (C) 2012 Giovanni Garberoglio
   Interdisciplinary Laboratory for Computational Science (LISC)
   Fondazione Bruno Kessler and University of Trento
   via Sommarive, 18
   I-38123 Trento (Italy)

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

#include <ATen/native/cpu/Intrinsics.h>

/* The original source of this file has been modified. */
#if defined(CPU_CAPABILITY_AVX2)

#if defined(__GNUC__)
# define ALIGN32_BEG __attribute__((aligned(32)))
#elif defined(_WIN32)
# define ALIGN32_BEG __declspec(align(32))
#endif

typedef __m256  v8sf; // vector of 8 float (avx2)
typedef __m256i v8si; // vector of 8 int   (avx2)

/* declare some AVX constants -- why can't I figure a better way to do that? */
#define _PS256_CONST(Name, Val)                                            \
  static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val)                                            \
  static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val)                                 \
  static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
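
/* For example, _PS256_CONST(1, 1.0f) expands to a 32-byte-aligned
   "static const float _ps256_1[8]" filled with eight copies of 1.0f; the
   functions below pull such a constant into a register with a cast load,
   e.g. *(v8sf*)_ps256_1. */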

_PS256_CONST(1  , 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);

_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);


/* natural logarithm computed for 8 simultaneous floats;
   returns NaN for x <= 0
*/
inline v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */

  // can be done with AVX2
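  // shifting the raw float bits right by 23 drops the mantissa and leaves
  // the biased exponent in the low bits of each 32-bit lane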
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // another AVX2 integer instruction
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);


  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
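
/* Illustrative usage sketch (not part of the original header; the buffer
   names are made up for the example). The same load/compute/store pattern
   applies to exp256_ps, sin256_ps and cos256_ps below:

     ALIGN32_BEG float in[8]  = {0.5f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, 32.0f, 64.0f};
     ALIGN32_BEG float out[8];
     v8sf v = _mm256_load_ps(in);
     _mm256_store_ps(out, log256_ps(v));   // out[i] ~ logf(in[i])
*/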

_PS256_CONST(exp_hi,        88.3762626647949f);
_PS256_CONST(exp_lo,        -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);

inline v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
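  /* fx is now x/ln(2) + 0.5, so flooring it below yields n, the integer
     nearest to x/ln(2) */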

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp  = _mm256_cvtepi32_ps(imm0);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
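  /* add the exponent bias (127) to n and shift it into the exponent field
     (bits 30..23); reinterpreted as a float the result is 2^n for n in the
     normalized range */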
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}

_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
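/* the three DP constants above sum to -Pi/4; splitting the constant this way
   lets x - y*Pi/4 be computed with extended precision ("the magic pass" below) */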
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1,  8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0,  2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2,  4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 8 sines at once using AVX intrinsics

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (the special handling cephes
   applies to larger arguments is not reproduced here -- arguments over 8192
   do not return garbage, but the extra precision is missing).

   Note that sinf((float)M_PI) = 8.74e-8 is a surprising but correct result.

*/
inline v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which require AVX2;
    this file is only compiled when CPU_CAPABILITY_AVX2 is defined, so
    no SSE2 fallback is needed.
  */

  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instructions
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
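  /* j & 4 sits in bit 2; shifting it left by 29 moves it to bit 31, the
     float sign-bit position */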
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);

  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
inline v8sf cos256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  v8si imm0, imm2;

  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm2 = _mm256_sub_epi32(imm2, *(v8si*)_pi32_256_2);

  /* get the swap sign flag */
  imm0 =  _mm256_andnot_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);

  v8sf sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}

/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them...
   it is almost as fast, and gives you a free cosine with your sine */
inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynom selection mask for the sine*/
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);

  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 =  _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
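
/* Illustrative usage sketch (not part of the original header; the variable
   names are made up for the example): sine and cosine of 8 angles at once.

     ALIGN32_BEG float angles[8] = {0.f, 0.5f, 1.f, 1.5f, 2.f, 2.5f, 3.f, 3.5f};
     ALIGN32_BEG float s[8], c[8];
     v8sf vs, vc;
     sincos256_ps(_mm256_load_ps(angles), &vs, &vc);
     _mm256_store_ps(s, vs);   // s[i] ~ sinf(angles[i])
     _mm256_store_ps(c, vc);   // c[i] ~ cosf(angles[i])
*/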

#endif // CPU_CAPABILITY_AVX2