xref: /aosp_15_r20/external/llvm/lib/Target/X86/README-SSE.txt (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
2*9880d681SAndroid Build Coastguard Worker// Random ideas for the X86 backend: SSE-specific stuff.
3*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
4*9880d681SAndroid Build Coastguard Worker
5*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
6*9880d681SAndroid Build Coastguard Worker
7*9880d681SAndroid Build Coastguard WorkerSSE Variable shift can be custom lowered to something like this, which uses a
8*9880d681SAndroid Build Coastguard Workersmall table + unaligned load + shuffle instead of going through memory.
9*9880d681SAndroid Build Coastguard Worker
10*9880d681SAndroid Build Coastguard Worker__m128i_shift_right:
11*9880d681SAndroid Build Coastguard Worker	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
12*9880d681SAndroid Build Coastguard Worker	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
13*9880d681SAndroid Build Coastguard Worker
14*9880d681SAndroid Build Coastguard Worker...
15*9880d681SAndroid Build Coastguard Worker__m128i shift_right(__m128i value, unsigned long offset) {
16*9880d681SAndroid Build Coastguard Worker  return _mm_shuffle_epi8(value,
17*9880d681SAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
18*9880d681SAndroid Build Coastguard Worker}
19*9880d681SAndroid Build Coastguard Worker
20*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
21*9880d681SAndroid Build Coastguard Worker
22*9880d681SAndroid Build Coastguard WorkerSSE has instructions for doing operations on complex numbers, we should pattern
23*9880d681SAndroid Build Coastguard Workermatch them.   For example, this should turn into a horizontal add:
24*9880d681SAndroid Build Coastguard Worker
25*9880d681SAndroid Build Coastguard Workertypedef float __attribute__((vector_size(16))) v4f32;
26*9880d681SAndroid Build Coastguard Workerfloat f32(v4f32 A) {
27*9880d681SAndroid Build Coastguard Worker  return A[0]+A[1]+A[2]+A[3];
28*9880d681SAndroid Build Coastguard Worker}
29*9880d681SAndroid Build Coastguard Worker
30*9880d681SAndroid Build Coastguard WorkerInstead we get this:
31*9880d681SAndroid Build Coastguard Worker
32*9880d681SAndroid Build Coastguard Worker_f32:                                   ## @f32
33*9880d681SAndroid Build Coastguard Worker	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
34*9880d681SAndroid Build Coastguard Worker	addss	%xmm0, %xmm1
35*9880d681SAndroid Build Coastguard Worker	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
36*9880d681SAndroid Build Coastguard Worker	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
37*9880d681SAndroid Build Coastguard Worker	movaps	%xmm0, %xmm3
38*9880d681SAndroid Build Coastguard Worker	addss	%xmm1, %xmm3
39*9880d681SAndroid Build Coastguard Worker	movdqa	%xmm2, %xmm0
40*9880d681SAndroid Build Coastguard Worker	addss	%xmm3, %xmm0
41*9880d681SAndroid Build Coastguard Worker	ret
42*9880d681SAndroid Build Coastguard Worker
43*9880d681SAndroid Build Coastguard WorkerAlso, there are cases where some simple local SLP would improve codegen a bit.
44*9880d681SAndroid Build Coastguard Workercompiling this:
45*9880d681SAndroid Build Coastguard Worker
46*9880d681SAndroid Build Coastguard Worker_Complex float f32(_Complex float A, _Complex float B) {
47*9880d681SAndroid Build Coastguard Worker  return A+B;
48*9880d681SAndroid Build Coastguard Worker}
49*9880d681SAndroid Build Coastguard Worker
50*9880d681SAndroid Build Coastguard Workerinto:
51*9880d681SAndroid Build Coastguard Worker
52*9880d681SAndroid Build Coastguard Worker_f32:                                   ## @f32
53*9880d681SAndroid Build Coastguard Worker	movdqa	%xmm0, %xmm2
54*9880d681SAndroid Build Coastguard Worker	addss	%xmm1, %xmm2
55*9880d681SAndroid Build Coastguard Worker	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
56*9880d681SAndroid Build Coastguard Worker	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
57*9880d681SAndroid Build Coastguard Worker	addss	%xmm1, %xmm3
58*9880d681SAndroid Build Coastguard Worker	movaps	%xmm2, %xmm0
59*9880d681SAndroid Build Coastguard Worker	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
60*9880d681SAndroid Build Coastguard Worker	ret
61*9880d681SAndroid Build Coastguard Worker
62*9880d681SAndroid Build Coastguard Workerseems silly when it could just be one addps.
63*9880d681SAndroid Build Coastguard Worker
64*9880d681SAndroid Build Coastguard Worker
65*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
66*9880d681SAndroid Build Coastguard Worker
67*9880d681SAndroid Build Coastguard WorkerExpand libm rounding functions inline:  Significant speedups possible.
68*9880d681SAndroid Build Coastguard Workerhttp://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
69*9880d681SAndroid Build Coastguard Worker
70*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
71*9880d681SAndroid Build Coastguard Worker
72*9880d681SAndroid Build Coastguard WorkerWhen compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
73*9880d681SAndroid Build Coastguard Workerother fast SSE modes.
74*9880d681SAndroid Build Coastguard Worker
75*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
76*9880d681SAndroid Build Coastguard Worker
77*9880d681SAndroid Build Coastguard WorkerThink about doing i64 math in SSE regs on x86-32.
78*9880d681SAndroid Build Coastguard Worker
79*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
80*9880d681SAndroid Build Coastguard Worker
81*9880d681SAndroid Build Coastguard WorkerThis testcase should have no SSE instructions in it, and only one load from
82*9880d681SAndroid Build Coastguard Workera constant pool:
83*9880d681SAndroid Build Coastguard Worker
84*9880d681SAndroid Build Coastguard Workerdouble %test3(bool %B) {
85*9880d681SAndroid Build Coastguard Worker        %C = select bool %B, double 123.412, double 523.01123123
86*9880d681SAndroid Build Coastguard Worker        ret double %C
87*9880d681SAndroid Build Coastguard Worker}
88*9880d681SAndroid Build Coastguard Worker
89*9880d681SAndroid Build Coastguard WorkerCurrently, the select is being lowered, which prevents the dag combiner from
90*9880d681SAndroid Build Coastguard Workerturning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
91*9880d681SAndroid Build Coastguard Worker
92*9880d681SAndroid Build Coastguard WorkerThe pattern isel got this one right.
93*9880d681SAndroid Build Coastguard Worker
94*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
95*9880d681SAndroid Build Coastguard Worker
96*9880d681SAndroid Build Coastguard WorkerLower memcpy / memset to a series of SSE 128 bit move instructions when it's
97*9880d681SAndroid Build Coastguard Workerfeasible.
98*9880d681SAndroid Build Coastguard Worker
99*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
100*9880d681SAndroid Build Coastguard Worker
101*9880d681SAndroid Build Coastguard WorkerCodegen:
102*9880d681SAndroid Build Coastguard Worker  if (copysign(1.0, x) == copysign(1.0, y))
103*9880d681SAndroid Build Coastguard Workerinto:
104*9880d681SAndroid Build Coastguard Worker  if (((x^y) & mask) == 0)
105*9880d681SAndroid Build Coastguard Workerwhen using SSE.
106*9880d681SAndroid Build Coastguard Worker
107*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
108*9880d681SAndroid Build Coastguard Worker
109*9880d681SAndroid Build Coastguard WorkerUse movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
110*9880d681SAndroid Build Coastguard Workerof a v4sf value.
111*9880d681SAndroid Build Coastguard Worker
112*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
113*9880d681SAndroid Build Coastguard Worker
114*9880d681SAndroid Build Coastguard WorkerBetter codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
115*9880d681SAndroid Build Coastguard WorkerPerhaps use pxor / xorp* to clear an XMM register first?
116*9880d681SAndroid Build Coastguard Worker
117*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
118*9880d681SAndroid Build Coastguard Worker
119*9880d681SAndroid Build Coastguard WorkerExternal test Nurbs exposed some problems. Look for
120*9880d681SAndroid Build Coastguard Worker__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
121*9880d681SAndroid Build Coastguard Workeremits:
122*9880d681SAndroid Build Coastguard Worker
123*9880d681SAndroid Build Coastguard Worker        movaps    (%edx), %xmm2                                 #59.21
124*9880d681SAndroid Build Coastguard Worker        movaps    (%edx), %xmm5                                 #60.21
125*9880d681SAndroid Build Coastguard Worker        movaps    (%edx), %xmm4                                 #61.21
126*9880d681SAndroid Build Coastguard Worker        movaps    (%edx), %xmm3                                 #62.21
127*9880d681SAndroid Build Coastguard Worker        movl      40(%ecx), %ebp                                #69.49
128*9880d681SAndroid Build Coastguard Worker        shufps    $0, %xmm2, %xmm5                              #60.21
129*9880d681SAndroid Build Coastguard Worker        movl      100(%esp), %ebx                               #69.20
130*9880d681SAndroid Build Coastguard Worker        movl      (%ebx), %edi                                  #69.20
131*9880d681SAndroid Build Coastguard Worker        imull     %ebp, %edi                                    #69.49
132*9880d681SAndroid Build Coastguard Worker        addl      (%eax), %edi                                  #70.33
133*9880d681SAndroid Build Coastguard Worker        shufps    $85, %xmm2, %xmm4                             #61.21
134*9880d681SAndroid Build Coastguard Worker        shufps    $170, %xmm2, %xmm3                            #62.21
135*9880d681SAndroid Build Coastguard Worker        shufps    $255, %xmm2, %xmm2                            #63.21
136*9880d681SAndroid Build Coastguard Worker        lea       (%ebp,%ebp,2), %ebx                           #69.49
137*9880d681SAndroid Build Coastguard Worker        negl      %ebx                                          #69.49
138*9880d681SAndroid Build Coastguard Worker        lea       -3(%edi,%ebx), %ebx                           #70.33
139*9880d681SAndroid Build Coastguard Worker        shll      $4, %ebx                                      #68.37
140*9880d681SAndroid Build Coastguard Worker        addl      32(%ecx), %ebx                                #68.37
141*9880d681SAndroid Build Coastguard Worker        testb     $15, %bl                                      #91.13
142*9880d681SAndroid Build Coastguard Worker        jne       L_B1.24       # Prob 5%                       #91.13
143*9880d681SAndroid Build Coastguard Worker
144*9880d681SAndroid Build Coastguard WorkerThis is the llvm code after instruction scheduling:
145*9880d681SAndroid Build Coastguard Worker
146*9880d681SAndroid Build Coastguard Workercond_next140 (0xa910740, LLVM BB @0xa90beb0):
147*9880d681SAndroid Build Coastguard Worker	%reg1078 = MOV32ri -3
148*9880d681SAndroid Build Coastguard Worker	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
149*9880d681SAndroid Build Coastguard Worker	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
150*9880d681SAndroid Build Coastguard Worker	%reg1080 = IMUL32rr %reg1079, %reg1037
151*9880d681SAndroid Build Coastguard Worker	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
152*9880d681SAndroid Build Coastguard Worker	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
153*9880d681SAndroid Build Coastguard Worker	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
154*9880d681SAndroid Build Coastguard Worker	%reg1082 = SHL32ri %reg1038, 4
155*9880d681SAndroid Build Coastguard Worker	%reg1039 = ADD32rr %reg1036, %reg1082
156*9880d681SAndroid Build Coastguard Worker	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
157*9880d681SAndroid Build Coastguard Worker	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
158*9880d681SAndroid Build Coastguard Worker	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
159*9880d681SAndroid Build Coastguard Worker	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
160*9880d681SAndroid Build Coastguard Worker	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
161*9880d681SAndroid Build Coastguard Worker	%reg1040 = MOV32rr %reg1039
162*9880d681SAndroid Build Coastguard Worker	%reg1084 = AND32ri8 %reg1039, 15
163*9880d681SAndroid Build Coastguard Worker	CMP32ri8 %reg1084, 0
164*9880d681SAndroid Build Coastguard Worker	JE mbb<cond_next204,0xa914d30>
165*9880d681SAndroid Build Coastguard Worker
166*9880d681SAndroid Build Coastguard WorkerStill ok. After register allocation:
167*9880d681SAndroid Build Coastguard Worker
168*9880d681SAndroid Build Coastguard Workercond_next140 (0xa910740, LLVM BB @0xa90beb0):
169*9880d681SAndroid Build Coastguard Worker	%EAX = MOV32ri -3
170*9880d681SAndroid Build Coastguard Worker	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
171*9880d681SAndroid Build Coastguard Worker	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
172*9880d681SAndroid Build Coastguard Worker	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
173*9880d681SAndroid Build Coastguard Worker	%EDX = MOV32rm %EDX, 1, %NOREG, 40
174*9880d681SAndroid Build Coastguard Worker	IMUL32rr %EAX<def&use>, %EDX
175*9880d681SAndroid Build Coastguard Worker	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
176*9880d681SAndroid Build Coastguard Worker	%ESI = MOV32rm %ESI, 1, %NOREG, 0
177*9880d681SAndroid Build Coastguard Worker	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
178*9880d681SAndroid Build Coastguard Worker	%EAX = LEA32r %ESI, 1, %EAX, -3
179*9880d681SAndroid Build Coastguard Worker	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
180*9880d681SAndroid Build Coastguard Worker	%ESI = MOV32rm %ESI, 1, %NOREG, 32
181*9880d681SAndroid Build Coastguard Worker	%EDI = MOV32rr %EAX
182*9880d681SAndroid Build Coastguard Worker	SHL32ri %EDI<def&use>, 4
183*9880d681SAndroid Build Coastguard Worker	ADD32rr %EDI<def&use>, %ESI
184*9880d681SAndroid Build Coastguard Worker	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
185*9880d681SAndroid Build Coastguard Worker	%XMM1 = MOVAPSrr %XMM0
186*9880d681SAndroid Build Coastguard Worker	SHUFPSrr %XMM1<def&use>, %XMM1, 170
187*9880d681SAndroid Build Coastguard Worker	%XMM2 = MOVAPSrr %XMM0
188*9880d681SAndroid Build Coastguard Worker	SHUFPSrr %XMM2<def&use>, %XMM2, 0
189*9880d681SAndroid Build Coastguard Worker	%XMM3 = MOVAPSrr %XMM0
190*9880d681SAndroid Build Coastguard Worker	SHUFPSrr %XMM3<def&use>, %XMM3, 255
191*9880d681SAndroid Build Coastguard Worker	SHUFPSrr %XMM0<def&use>, %XMM0, 85
192*9880d681SAndroid Build Coastguard Worker	%EBX = MOV32rr %EDI
193*9880d681SAndroid Build Coastguard Worker	AND32ri8 %EBX<def&use>, 15
194*9880d681SAndroid Build Coastguard Worker	CMP32ri8 %EBX, 0
195*9880d681SAndroid Build Coastguard Worker	JE mbb<cond_next204,0xa914d30>
196*9880d681SAndroid Build Coastguard Worker
197*9880d681SAndroid Build Coastguard WorkerThis looks really bad. The problem is that shufps is a destructive opcode.
198*9880d681SAndroid Build Coastguard WorkerSince the same source register appears as operand two in more than one shufps
199*9880d681SAndroid Build Coastguard Workerop, it results in a number of copies. Note icc also suffers from the same
200*9880d681SAndroid Build Coastguard Workerproblem. Either the instruction selector should select pshufd, or the register
201*9880d681SAndroid Build Coastguard Workerallocator can make the two-address to three-address transformation.
202*9880d681SAndroid Build Coastguard Worker
203*9880d681SAndroid Build Coastguard WorkerIt also exposes some other problems. See MOV32ri -3 and the spills.
204*9880d681SAndroid Build Coastguard Worker
205*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
206*9880d681SAndroid Build Coastguard Worker
207*9880d681SAndroid Build Coastguard WorkerConsider:
208*9880d681SAndroid Build Coastguard Worker
209*9880d681SAndroid Build Coastguard Worker__m128 test(float a) {
210*9880d681SAndroid Build Coastguard Worker  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
211*9880d681SAndroid Build Coastguard Worker}
212*9880d681SAndroid Build Coastguard Worker
213*9880d681SAndroid Build Coastguard WorkerThis compiles into:
214*9880d681SAndroid Build Coastguard Worker
215*9880d681SAndroid Build Coastguard Workermovss 4(%esp), %xmm1
216*9880d681SAndroid Build Coastguard Workermulss %xmm1, %xmm1
217*9880d681SAndroid Build Coastguard Workerxorps %xmm0, %xmm0
218*9880d681SAndroid Build Coastguard Workermovss %xmm1, %xmm0
219*9880d681SAndroid Build Coastguard Workerret
220*9880d681SAndroid Build Coastguard Worker
221*9880d681SAndroid Build Coastguard WorkerBecause the movss load zeros the top 3 elements and mulss doesn't modify them,
222*9880d681SAndroid Build Coastguard Workerthe top elements of xmm1 are already zero'd.  We could compile this to:
223*9880d681SAndroid Build Coastguard Worker
224*9880d681SAndroid Build Coastguard Workermovss 4(%esp), %xmm0
225*9880d681SAndroid Build Coastguard Workermulss %xmm0, %xmm0
226*9880d681SAndroid Build Coastguard Workerret
227*9880d681SAndroid Build Coastguard Worker
228*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
229*9880d681SAndroid Build Coastguard Worker
230*9880d681SAndroid Build Coastguard WorkerHere's a sick and twisted idea.  Consider code like this:
231*9880d681SAndroid Build Coastguard Worker
232*9880d681SAndroid Build Coastguard Worker__m128 test(__m128 a) {
233*9880d681SAndroid Build Coastguard Worker  float b = *(float*)&a;
234*9880d681SAndroid Build Coastguard Worker  ...
235*9880d681SAndroid Build Coastguard Worker  return _mm_set_ps(0.0, 0.0, 0.0, b);
236*9880d681SAndroid Build Coastguard Worker}
237*9880d681SAndroid Build Coastguard Worker
238*9880d681SAndroid Build Coastguard WorkerThis might compile to this code:
239*9880d681SAndroid Build Coastguard Worker
240*9880d681SAndroid Build Coastguard Workermovaps c(%esp), %xmm1
241*9880d681SAndroid Build Coastguard Workerxorps %xmm0, %xmm0
242*9880d681SAndroid Build Coastguard Workermovss %xmm1, %xmm0
243*9880d681SAndroid Build Coastguard Workerret
244*9880d681SAndroid Build Coastguard Worker
245*9880d681SAndroid Build Coastguard WorkerNow consider if the ... code caused xmm1 to get spilled.  This might produce
246*9880d681SAndroid Build Coastguard Workerthis code:
247*9880d681SAndroid Build Coastguard Worker
248*9880d681SAndroid Build Coastguard Workermovaps c(%esp), %xmm1
249*9880d681SAndroid Build Coastguard Workermovaps %xmm1, c2(%esp)
250*9880d681SAndroid Build Coastguard Worker...
251*9880d681SAndroid Build Coastguard Worker
252*9880d681SAndroid Build Coastguard Workerxorps %xmm0, %xmm0
253*9880d681SAndroid Build Coastguard Workermovaps c2(%esp), %xmm1
254*9880d681SAndroid Build Coastguard Workermovss %xmm1, %xmm0
255*9880d681SAndroid Build Coastguard Workerret
256*9880d681SAndroid Build Coastguard Worker
257*9880d681SAndroid Build Coastguard WorkerHowever, since the reload is only used by these instructions, we could
258*9880d681SAndroid Build Coastguard Worker"fold" it into the uses, producing something like this:
259*9880d681SAndroid Build Coastguard Worker
260*9880d681SAndroid Build Coastguard Workermovaps c(%esp), %xmm1
261*9880d681SAndroid Build Coastguard Workermovaps %xmm1, c2(%esp)
262*9880d681SAndroid Build Coastguard Worker...
263*9880d681SAndroid Build Coastguard Worker
264*9880d681SAndroid Build Coastguard Workermovss c2(%esp), %xmm0
265*9880d681SAndroid Build Coastguard Workerret
266*9880d681SAndroid Build Coastguard Worker
267*9880d681SAndroid Build Coastguard Worker... saving two instructions.
268*9880d681SAndroid Build Coastguard Worker
269*9880d681SAndroid Build Coastguard WorkerThe basic idea is that a reload from a spill slot can, if only one 4-byte
270*9880d681SAndroid Build Coastguard Workerchunk is used, bring in 3 zeros and the one element instead of all 4 elements.
271*9880d681SAndroid Build Coastguard WorkerThis can be used to simplify a variety of shuffle operations, where the
272*9880d681SAndroid Build Coastguard Workerelements are fixed zeros.
273*9880d681SAndroid Build Coastguard Worker
274*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
275*9880d681SAndroid Build Coastguard Worker
276*9880d681SAndroid Build Coastguard WorkerThis code generates ugly code, probably due to costs being off or something:
277*9880d681SAndroid Build Coastguard Worker
278*9880d681SAndroid Build Coastguard Workerdefine void @test(float* %P, <4 x float>* %P2 ) {
279*9880d681SAndroid Build Coastguard Worker        %xFloat0.688 = load float* %P
280*9880d681SAndroid Build Coastguard Worker        %tmp = load <4 x float>* %P2
281*9880d681SAndroid Build Coastguard Worker        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
282*9880d681SAndroid Build Coastguard Worker        store <4 x float> %inFloat3.713, <4 x float>* %P2
283*9880d681SAndroid Build Coastguard Worker        ret void
284*9880d681SAndroid Build Coastguard Worker}
285*9880d681SAndroid Build Coastguard Worker
286*9880d681SAndroid Build Coastguard WorkerGenerates:
287*9880d681SAndroid Build Coastguard Worker
288*9880d681SAndroid Build Coastguard Worker_test:
289*9880d681SAndroid Build Coastguard Worker	movl	8(%esp), %eax
290*9880d681SAndroid Build Coastguard Worker	movaps	(%eax), %xmm0
291*9880d681SAndroid Build Coastguard Worker	pxor	%xmm1, %xmm1
292*9880d681SAndroid Build Coastguard Worker	movaps	%xmm0, %xmm2
293*9880d681SAndroid Build Coastguard Worker	shufps	$50, %xmm1, %xmm2
294*9880d681SAndroid Build Coastguard Worker	shufps	$132, %xmm2, %xmm0
295*9880d681SAndroid Build Coastguard Worker	movaps	%xmm0, (%eax)
296*9880d681SAndroid Build Coastguard Worker	ret
297*9880d681SAndroid Build Coastguard Worker
298*9880d681SAndroid Build Coastguard WorkerWould it be better to generate:
299*9880d681SAndroid Build Coastguard Worker
300*9880d681SAndroid Build Coastguard Worker_test:
301*9880d681SAndroid Build Coastguard Worker        movl 8(%esp), %ecx
302*9880d681SAndroid Build Coastguard Worker        movaps (%ecx), %xmm0
303*9880d681SAndroid Build Coastguard Worker	xor %eax, %eax
304*9880d681SAndroid Build Coastguard Worker        pinsrw $6, %eax, %xmm0
305*9880d681SAndroid Build Coastguard Worker        pinsrw $7, %eax, %xmm0
306*9880d681SAndroid Build Coastguard Worker        movaps %xmm0, (%ecx)
307*9880d681SAndroid Build Coastguard Worker        ret
308*9880d681SAndroid Build Coastguard Worker
309*9880d681SAndroid Build Coastguard Worker?
310*9880d681SAndroid Build Coastguard Worker
311*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
312*9880d681SAndroid Build Coastguard Worker
313*9880d681SAndroid Build Coastguard WorkerSome useful information in the Apple Altivec / SSE Migration Guide:
314*9880d681SAndroid Build Coastguard Worker
315*9880d681SAndroid Build Coastguard Workerhttp://developer.apple.com/documentation/Performance/Conceptual/
316*9880d681SAndroid Build Coastguard WorkerAccelerate_sse_migration/index.html
317*9880d681SAndroid Build Coastguard Worker
318*9880d681SAndroid Build Coastguard Workere.g. SSE select using and, andnot, or. Various SSE compare translations.
319*9880d681SAndroid Build Coastguard Worker
320*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
321*9880d681SAndroid Build Coastguard Worker
322*9880d681SAndroid Build Coastguard WorkerAdd hooks to commute some CMPP operations.
323*9880d681SAndroid Build Coastguard Worker
324*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
325*9880d681SAndroid Build Coastguard Worker
326*9880d681SAndroid Build Coastguard WorkerApply the same transformation that merges four float loads into a single
327*9880d681SAndroid Build Coastguard Worker128-bit load to loads from the constant pool.
328*9880d681SAndroid Build Coastguard Worker
329*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
330*9880d681SAndroid Build Coastguard Worker
331*9880d681SAndroid Build Coastguard WorkerFloating point max / min are commutable when -enable-unsafe-fp-math is
332*9880d681SAndroid Build Coastguard Workerspecified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
333*9880d681SAndroid Build Coastguard Workernodes which are selected to max / min instructions that are marked commutable.
334*9880d681SAndroid Build Coastguard Worker
335*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
336*9880d681SAndroid Build Coastguard Worker
337*9880d681SAndroid Build Coastguard WorkerWe should materialize vector constants like "all ones" and "signbit" with
338*9880d681SAndroid Build Coastguard Workercode like:
339*9880d681SAndroid Build Coastguard Worker
340*9880d681SAndroid Build Coastguard Worker     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
341*9880d681SAndroid Build Coastguard Worker
342*9880d681SAndroid Build Coastguard Workerand:
343*9880d681SAndroid Build Coastguard Worker     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
344*9880d681SAndroid Build Coastguard Worker     pslld   xmm1, 31     ; xmm1 = all 100000000000...
345*9880d681SAndroid Build Coastguard Worker
346*9880d681SAndroid Build Coastguard Workerinstead of using a load from the constant pool.  The latter is important for
347*9880d681SAndroid Build Coastguard WorkerABS/NEG/copysign etc.
348*9880d681SAndroid Build Coastguard Worker
349*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
350*9880d681SAndroid Build Coastguard Worker
351*9880d681SAndroid Build Coastguard WorkerThese functions:
352*9880d681SAndroid Build Coastguard Worker
353*9880d681SAndroid Build Coastguard Worker#include <xmmintrin.h>
354*9880d681SAndroid Build Coastguard Worker__m128i a;
355*9880d681SAndroid Build Coastguard Workervoid x(unsigned short n) {
356*9880d681SAndroid Build Coastguard Worker  a = _mm_slli_epi32 (a, n);
357*9880d681SAndroid Build Coastguard Worker}
358*9880d681SAndroid Build Coastguard Workervoid y(unsigned n) {
359*9880d681SAndroid Build Coastguard Worker  a = _mm_slli_epi32 (a, n);
360*9880d681SAndroid Build Coastguard Worker}
361*9880d681SAndroid Build Coastguard Worker
362*9880d681SAndroid Build Coastguard Workercompile to ( -O3 -static -fomit-frame-pointer):
363*9880d681SAndroid Build Coastguard Worker_x:
364*9880d681SAndroid Build Coastguard Worker        movzwl  4(%esp), %eax
365*9880d681SAndroid Build Coastguard Worker        movd    %eax, %xmm0
366*9880d681SAndroid Build Coastguard Worker        movaps  _a, %xmm1
367*9880d681SAndroid Build Coastguard Worker        pslld   %xmm0, %xmm1
368*9880d681SAndroid Build Coastguard Worker        movaps  %xmm1, _a
369*9880d681SAndroid Build Coastguard Worker        ret
370*9880d681SAndroid Build Coastguard Worker_y:
371*9880d681SAndroid Build Coastguard Worker        movd    4(%esp), %xmm0
372*9880d681SAndroid Build Coastguard Worker        movaps  _a, %xmm1
373*9880d681SAndroid Build Coastguard Worker        pslld   %xmm0, %xmm1
374*9880d681SAndroid Build Coastguard Worker        movaps  %xmm1, _a
375*9880d681SAndroid Build Coastguard Worker        ret
376*9880d681SAndroid Build Coastguard Worker
377*9880d681SAndroid Build Coastguard Worker"y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
378*9880d681SAndroid Build Coastguard Workerlike movd would be sufficient in both cases as the value is already zero
379*9880d681SAndroid Build Coastguard Workerextended in the 32-bit stack slot IIRC.  For signed short, it should also be
380*9880d681SAndroid Build Coastguard Workersafe, as a negative shift count would be undefined for pslld.
381*9880d681SAndroid Build Coastguard Worker
382*9880d681SAndroid Build Coastguard Worker
383*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
384*9880d681SAndroid Build Coastguard Worker
385*9880d681SAndroid Build Coastguard Worker#include <math.h>
386*9880d681SAndroid Build Coastguard Workerint t1(double d) { return signbit(d); }
387*9880d681SAndroid Build Coastguard Worker
388*9880d681SAndroid Build Coastguard WorkerThis currently compiles to:
389*9880d681SAndroid Build Coastguard Worker	subl	$12, %esp
390*9880d681SAndroid Build Coastguard Worker	movsd	16(%esp), %xmm0
391*9880d681SAndroid Build Coastguard Worker	movsd	%xmm0, (%esp)
392*9880d681SAndroid Build Coastguard Worker	movl	4(%esp), %eax
393*9880d681SAndroid Build Coastguard Worker	shrl	$31, %eax
394*9880d681SAndroid Build Coastguard Worker	addl	$12, %esp
395*9880d681SAndroid Build Coastguard Worker	ret
396*9880d681SAndroid Build Coastguard Worker
397*9880d681SAndroid Build Coastguard WorkerWe should use movmskp{s|d} instead.
398*9880d681SAndroid Build Coastguard Worker
399*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
400*9880d681SAndroid Build Coastguard Worker
401*9880d681SAndroid Build Coastguard WorkerCodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
402*9880d681SAndroid Build Coastguard Worker(aligned) vector load.  This functionality has a couple of problems.
403*9880d681SAndroid Build Coastguard Worker
404*9880d681SAndroid Build Coastguard Worker1. The code to infer alignment from loads of globals is in the X86 backend,
405*9880d681SAndroid Build Coastguard Worker   not the dag combiner.  This is because dagcombine2 needs to be able to see
406*9880d681SAndroid Build Coastguard Worker   through the X86ISD::Wrapper node, which DAGCombine can't really do.
407*9880d681SAndroid Build Coastguard Worker2. The code for turning 4 x load into a single vector load is target
408*9880d681SAndroid Build Coastguard Worker   independent and should be moved to the dag combiner.
409*9880d681SAndroid Build Coastguard Worker3. The code for turning 4 x load into a vector load can only handle a direct
410*9880d681SAndroid Build Coastguard Worker   load from a global or a direct load from the stack.  It should be generalized
411*9880d681SAndroid Build Coastguard Worker   to handle any load from P, P+4, P+8, P+12, where P can be anything.
412*9880d681SAndroid Build Coastguard Worker4. The alignment inference code cannot handle loads from globals in non-static
413*9880d681SAndroid Build Coastguard Worker   mode because it doesn't look through the extra dyld stub load.  If you try
414*9880d681SAndroid Build Coastguard Worker   vec_align.ll without -relocation-model=static, you'll see what I mean.
415*9880d681SAndroid Build Coastguard Worker
416*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
417*9880d681SAndroid Build Coastguard Worker
418*9880d681SAndroid Build Coastguard WorkerWe should lower store(fneg(load p), q) into an integer load+xor+store, which
419*9880d681SAndroid Build Coastguard Workereliminates a constant pool load.  For example, consider:
420*9880d681SAndroid Build Coastguard Worker
421*9880d681SAndroid Build Coastguard Workerdefine i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
422*9880d681SAndroid Build Coastguard Workerentry:
423*9880d681SAndroid Build Coastguard Worker %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
424*9880d681SAndroid Build Coastguard Worker %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
425*9880d681SAndroid Build Coastguard Worker ret i64 %tmp20
426*9880d681SAndroid Build Coastguard Worker}
427*9880d681SAndroid Build Coastguard Workerdeclare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
428*9880d681SAndroid Build Coastguard Worker
429*9880d681SAndroid Build Coastguard WorkerThis currently compiles to:
430*9880d681SAndroid Build Coastguard Worker
431*9880d681SAndroid Build Coastguard WorkerLCPI1_0:					#  <4 x float>
432*9880d681SAndroid Build Coastguard Worker	.long	2147483648	# float -0
433*9880d681SAndroid Build Coastguard Worker	.long	2147483648	# float -0
434*9880d681SAndroid Build Coastguard Worker	.long	2147483648	# float -0
435*9880d681SAndroid Build Coastguard Worker	.long	2147483648	# float -0
436*9880d681SAndroid Build Coastguard Worker_ccosf:
437*9880d681SAndroid Build Coastguard Worker	subl	$12, %esp
438*9880d681SAndroid Build Coastguard Worker	movss	16(%esp), %xmm0
439*9880d681SAndroid Build Coastguard Worker	movss	%xmm0, 4(%esp)
440*9880d681SAndroid Build Coastguard Worker	movss	20(%esp), %xmm0
441*9880d681SAndroid Build Coastguard Worker	xorps	LCPI1_0, %xmm0
442*9880d681SAndroid Build Coastguard Worker	movss	%xmm0, (%esp)
443*9880d681SAndroid Build Coastguard Worker	call	L_ccoshf$stub
444*9880d681SAndroid Build Coastguard Worker	addl	$12, %esp
445*9880d681SAndroid Build Coastguard Worker	ret
446*9880d681SAndroid Build Coastguard Worker
447*9880d681SAndroid Build Coastguard WorkerNote the load into xmm0, then xor (to negate), then store.  In PIC mode,
448*9880d681SAndroid Build Coastguard Workerthis code computes the pic base and does two loads to do the constant pool
449*9880d681SAndroid Build Coastguard Workerload, so the improvement is much bigger.
450*9880d681SAndroid Build Coastguard Worker
451*9880d681SAndroid Build Coastguard WorkerThe tricky part about this xform is that the argument load/store isn't exposed
452*9880d681SAndroid Build Coastguard Workeruntil post-legalize, and at that point, the fneg has been custom expanded into
453*9880d681SAndroid Build Coastguard Workeran X86 fxor.  This means that we need to handle this case in the x86 backend
454*9880d681SAndroid Build Coastguard Workerinstead of in target independent code.
455*9880d681SAndroid Build Coastguard Worker
456*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
457*9880d681SAndroid Build Coastguard Worker
458*9880d681SAndroid Build Coastguard WorkerNon-SSE4 insert into 16 x i8 is atrociously bad.
459*9880d681SAndroid Build Coastguard Worker
460*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
461*9880d681SAndroid Build Coastguard Worker
462*9880d681SAndroid Build Coastguard Worker<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
463*9880d681SAndroid Build Coastguard Workeris memory.
464*9880d681SAndroid Build Coastguard Worker
465*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
466*9880d681SAndroid Build Coastguard Worker
467*9880d681SAndroid Build Coastguard WorkerINSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
468*9880d681SAndroid Build Coastguard Workerany number of 0.0 simultaneously.  Currently we only use it for simple
469*9880d681SAndroid Build Coastguard Workerinsertions.
470*9880d681SAndroid Build Coastguard Worker
471*9880d681SAndroid Build Coastguard WorkerSee comments in LowerINSERT_VECTOR_ELT_SSE4.
472*9880d681SAndroid Build Coastguard Worker
473*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
474*9880d681SAndroid Build Coastguard Worker
475*9880d681SAndroid Build Coastguard WorkerOn a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
476*9880d681SAndroid Build Coastguard WorkerCustom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
477*9880d681SAndroid Build Coastguard Workerlegal; it'll just take a few extra patterns written in the .td file.
478*9880d681SAndroid Build Coastguard Worker
479*9880d681SAndroid Build Coastguard WorkerNote: this is not a code quality issue; the custom lowered code happens to be
480*9880d681SAndroid Build Coastguard Workerright, but we shouldn't have to custom lower anything.  This is probably related
481*9880d681SAndroid Build Coastguard Workerto <2 x i64> ops being so bad.
482*9880d681SAndroid Build Coastguard Worker
483*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
484*9880d681SAndroid Build Coastguard Worker
485*9880d681SAndroid Build Coastguard WorkerLLVM currently generates stack realignment code even when it is not actually
486*9880d681SAndroid Build Coastguard Workerneeded. The problem is that we need to know about stack alignment too early,
487*9880d681SAndroid Build Coastguard Workerbefore RA runs.
488*9880d681SAndroid Build Coastguard Worker
489*9880d681SAndroid Build Coastguard WorkerAt that point we don't know whether there will be a vector spill or not.
490*9880d681SAndroid Build Coastguard WorkerStack realignment logic is overly conservative here, but otherwise we can
491*9880d681SAndroid Build Coastguard Workerproduce unaligned loads/stores.
492*9880d681SAndroid Build Coastguard Worker
493*9880d681SAndroid Build Coastguard WorkerFixing this will require some huge RA changes.
494*9880d681SAndroid Build Coastguard Worker
495*9880d681SAndroid Build Coastguard WorkerTestcase:
496*9880d681SAndroid Build Coastguard Worker#include <emmintrin.h>
497*9880d681SAndroid Build Coastguard Worker
498*9880d681SAndroid Build Coastguard Workertypedef short vSInt16 __attribute__ ((__vector_size__ (16)));
499*9880d681SAndroid Build Coastguard Worker
500*9880d681SAndroid Build Coastguard Workerstatic const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
501*9880d681SAndroid Build Coastguard Worker- 22725, - 12873};;
502*9880d681SAndroid Build Coastguard Worker
503*9880d681SAndroid Build Coastguard WorkervSInt16 madd(vSInt16 b)
504*9880d681SAndroid Build Coastguard Worker{
505*9880d681SAndroid Build Coastguard Worker    return _mm_madd_epi16(a, b);
506*9880d681SAndroid Build Coastguard Worker}
507*9880d681SAndroid Build Coastguard Worker
508*9880d681SAndroid Build Coastguard WorkerGenerated code (x86-32, linux):
509*9880d681SAndroid Build Coastguard Workermadd:
510*9880d681SAndroid Build Coastguard Worker        pushl   %ebp
511*9880d681SAndroid Build Coastguard Worker        movl    %esp, %ebp
512*9880d681SAndroid Build Coastguard Worker        andl    $-16, %esp
513*9880d681SAndroid Build Coastguard Worker        movaps  .LCPI1_0, %xmm1
514*9880d681SAndroid Build Coastguard Worker        pmaddwd %xmm1, %xmm0
515*9880d681SAndroid Build Coastguard Worker        movl    %ebp, %esp
516*9880d681SAndroid Build Coastguard Worker        popl    %ebp
517*9880d681SAndroid Build Coastguard Worker        ret
518*9880d681SAndroid Build Coastguard Worker
519*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
520*9880d681SAndroid Build Coastguard Worker
521*9880d681SAndroid Build Coastguard WorkerConsider:
522*9880d681SAndroid Build Coastguard Worker#include <emmintrin.h>
523*9880d681SAndroid Build Coastguard Worker__m128 foo2 (float x) {
524*9880d681SAndroid Build Coastguard Worker return _mm_set_ps (0, 0, x, 0);
525*9880d681SAndroid Build Coastguard Worker}
526*9880d681SAndroid Build Coastguard Worker
527*9880d681SAndroid Build Coastguard WorkerIn x86-32 mode, we generate this spiffy code:
528*9880d681SAndroid Build Coastguard Worker
529*9880d681SAndroid Build Coastguard Worker_foo2:
530*9880d681SAndroid Build Coastguard Worker	movss	4(%esp), %xmm0
531*9880d681SAndroid Build Coastguard Worker	pshufd	$81, %xmm0, %xmm0
532*9880d681SAndroid Build Coastguard Worker	ret
533*9880d681SAndroid Build Coastguard Worker
534*9880d681SAndroid Build Coastguard WorkerIn x86-64 mode, we generate this code, which could be better:
535*9880d681SAndroid Build Coastguard Worker
536*9880d681SAndroid Build Coastguard Worker_foo2:
537*9880d681SAndroid Build Coastguard Worker	xorps	%xmm1, %xmm1
538*9880d681SAndroid Build Coastguard Worker	movss	%xmm0, %xmm1
539*9880d681SAndroid Build Coastguard Worker	pshufd	$81, %xmm1, %xmm0
540*9880d681SAndroid Build Coastguard Worker	ret
541*9880d681SAndroid Build Coastguard Worker
542*9880d681SAndroid Build Coastguard WorkerIn sse4 mode, we could use insertps to make both better.
543*9880d681SAndroid Build Coastguard Worker
544*9880d681SAndroid Build Coastguard WorkerHere's another testcase that could use insertps [mem]:
545*9880d681SAndroid Build Coastguard Worker
546*9880d681SAndroid Build Coastguard Worker#include <xmmintrin.h>
547*9880d681SAndroid Build Coastguard Workerextern float x2, x3;
548*9880d681SAndroid Build Coastguard Worker__m128 foo1 (float x1, float x4) {
549*9880d681SAndroid Build Coastguard Worker return _mm_set_ps (x2, x1, x3, x4);
550*9880d681SAndroid Build Coastguard Worker}
551*9880d681SAndroid Build Coastguard Worker
552*9880d681SAndroid Build Coastguard Workergcc mainline compiles it to:
553*9880d681SAndroid Build Coastguard Worker
554*9880d681SAndroid Build Coastguard Workerfoo1:
555*9880d681SAndroid Build Coastguard Worker       insertps        $0x10, x2(%rip), %xmm0
556*9880d681SAndroid Build Coastguard Worker       insertps        $0x10, x3(%rip), %xmm1
557*9880d681SAndroid Build Coastguard Worker       movaps  %xmm1, %xmm2
558*9880d681SAndroid Build Coastguard Worker       movlhps %xmm0, %xmm2
559*9880d681SAndroid Build Coastguard Worker       movaps  %xmm2, %xmm0
560*9880d681SAndroid Build Coastguard Worker       ret
561*9880d681SAndroid Build Coastguard Worker
562*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
563*9880d681SAndroid Build Coastguard Worker
564*9880d681SAndroid Build Coastguard WorkerWe compile vector multiply-by-constant into poor code:
565*9880d681SAndroid Build Coastguard Worker
566*9880d681SAndroid Build Coastguard Workerdefine <4 x i32> @f(<4 x i32> %i) nounwind  {
567*9880d681SAndroid Build Coastguard Worker	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
568*9880d681SAndroid Build Coastguard Worker	ret <4 x i32> %A
569*9880d681SAndroid Build Coastguard Worker}
570*9880d681SAndroid Build Coastguard Worker
571*9880d681SAndroid Build Coastguard WorkerOn targets without SSE4.1, this compiles into:
572*9880d681SAndroid Build Coastguard Worker
573*9880d681SAndroid Build Coastguard WorkerLCPI1_0:					##  <4 x i32>
574*9880d681SAndroid Build Coastguard Worker	.long	10
575*9880d681SAndroid Build Coastguard Worker	.long	10
576*9880d681SAndroid Build Coastguard Worker	.long	10
577*9880d681SAndroid Build Coastguard Worker	.long	10
578*9880d681SAndroid Build Coastguard Worker	.text
579*9880d681SAndroid Build Coastguard Worker	.align	4,0x90
580*9880d681SAndroid Build Coastguard Worker	.globl	_f
581*9880d681SAndroid Build Coastguard Worker_f:
582*9880d681SAndroid Build Coastguard Worker	pshufd	$3, %xmm0, %xmm1
583*9880d681SAndroid Build Coastguard Worker	movd	%xmm1, %eax
584*9880d681SAndroid Build Coastguard Worker	imull	LCPI1_0+12, %eax
585*9880d681SAndroid Build Coastguard Worker	movd	%eax, %xmm1
586*9880d681SAndroid Build Coastguard Worker	pshufd	$1, %xmm0, %xmm2
587*9880d681SAndroid Build Coastguard Worker	movd	%xmm2, %eax
588*9880d681SAndroid Build Coastguard Worker	imull	LCPI1_0+4, %eax
589*9880d681SAndroid Build Coastguard Worker	movd	%eax, %xmm2
590*9880d681SAndroid Build Coastguard Worker	punpckldq	%xmm1, %xmm2
591*9880d681SAndroid Build Coastguard Worker	movd	%xmm0, %eax
592*9880d681SAndroid Build Coastguard Worker	imull	LCPI1_0, %eax
593*9880d681SAndroid Build Coastguard Worker	movd	%eax, %xmm1
594*9880d681SAndroid Build Coastguard Worker	movhlps	%xmm0, %xmm0
595*9880d681SAndroid Build Coastguard Worker	movd	%xmm0, %eax
596*9880d681SAndroid Build Coastguard Worker	imull	LCPI1_0+8, %eax
597*9880d681SAndroid Build Coastguard Worker	movd	%eax, %xmm0
598*9880d681SAndroid Build Coastguard Worker	punpckldq	%xmm0, %xmm1
599*9880d681SAndroid Build Coastguard Worker	movaps	%xmm1, %xmm0
600*9880d681SAndroid Build Coastguard Worker	punpckldq	%xmm2, %xmm0
601*9880d681SAndroid Build Coastguard Worker	ret
602*9880d681SAndroid Build Coastguard Worker
603*9880d681SAndroid Build Coastguard WorkerIt would be better to synthesize integer vector multiplication by constants
604*9880d681SAndroid Build Coastguard Workerusing shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
605*9880d681SAndroid Build Coastguard Workersimple cases such as multiplication by powers of two would be better as
606*9880d681SAndroid Build Coastguard Workervector shifts than as multiplications.
607*9880d681SAndroid Build Coastguard Worker
608*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
609*9880d681SAndroid Build Coastguard Worker
610*9880d681SAndroid Build Coastguard WorkerWe compile this:
611*9880d681SAndroid Build Coastguard Worker
612*9880d681SAndroid Build Coastguard Worker__m128i
613*9880d681SAndroid Build Coastguard Workerfoo2 (char x)
614*9880d681SAndroid Build Coastguard Worker{
615*9880d681SAndroid Build Coastguard Worker  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
616*9880d681SAndroid Build Coastguard Worker}
617*9880d681SAndroid Build Coastguard Worker
618*9880d681SAndroid Build Coastguard Workerinto:
619*9880d681SAndroid Build Coastguard Worker	movl	$1, %eax
620*9880d681SAndroid Build Coastguard Worker	xorps	%xmm0, %xmm0
621*9880d681SAndroid Build Coastguard Worker	pinsrw	$2, %eax, %xmm0
622*9880d681SAndroid Build Coastguard Worker	movzbl	4(%esp), %eax
623*9880d681SAndroid Build Coastguard Worker	pinsrw	$3, %eax, %xmm0
624*9880d681SAndroid Build Coastguard Worker	movl	$256, %eax
625*9880d681SAndroid Build Coastguard Worker	pinsrw	$7, %eax, %xmm0
626*9880d681SAndroid Build Coastguard Worker	ret
627*9880d681SAndroid Build Coastguard Worker
628*9880d681SAndroid Build Coastguard Worker
629*9880d681SAndroid Build Coastguard Workergcc-4.2:
630*9880d681SAndroid Build Coastguard Worker	subl	$12, %esp
631*9880d681SAndroid Build Coastguard Worker	movzbl	16(%esp), %eax
632*9880d681SAndroid Build Coastguard Worker	movdqa	LC0, %xmm0
633*9880d681SAndroid Build Coastguard Worker	pinsrw	$3, %eax, %xmm0
634*9880d681SAndroid Build Coastguard Worker	addl	$12, %esp
635*9880d681SAndroid Build Coastguard Worker	ret
636*9880d681SAndroid Build Coastguard Worker	.const
637*9880d681SAndroid Build Coastguard Worker	.align 4
638*9880d681SAndroid Build Coastguard WorkerLC0:
639*9880d681SAndroid Build Coastguard Worker	.word	0
640*9880d681SAndroid Build Coastguard Worker	.word	0
641*9880d681SAndroid Build Coastguard Worker	.word	1
642*9880d681SAndroid Build Coastguard Worker	.word	0
643*9880d681SAndroid Build Coastguard Worker	.word	0
644*9880d681SAndroid Build Coastguard Worker	.word	0
645*9880d681SAndroid Build Coastguard Worker	.word	0
646*9880d681SAndroid Build Coastguard Worker	.word	256
647*9880d681SAndroid Build Coastguard Worker
648*9880d681SAndroid Build Coastguard WorkerWith SSE4, it should be
649*9880d681SAndroid Build Coastguard Worker      movdqa  .LC0(%rip), %xmm0
650*9880d681SAndroid Build Coastguard Worker      pinsrb  $6, %edi, %xmm0
651*9880d681SAndroid Build Coastguard Worker
652*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
653*9880d681SAndroid Build Coastguard Worker
654*9880d681SAndroid Build Coastguard WorkerWe should transform a shuffle of two vectors of constants into a single vector
655*9880d681SAndroid Build Coastguard Workerof constants. Likewise, insertelement of a constant into a vector of constants
656*9880d681SAndroid Build Coastguard Workershould result in a vector of constants, e.g. 2008-06-25-VecISelBug.ll.
657*9880d681SAndroid Build Coastguard Worker
658*9880d681SAndroid Build Coastguard WorkerWe compiled it to something horrible:
659*9880d681SAndroid Build Coastguard Worker
660*9880d681SAndroid Build Coastguard Worker	.align	4
661*9880d681SAndroid Build Coastguard WorkerLCPI1_1:					##  float
662*9880d681SAndroid Build Coastguard Worker	.long	1065353216	## float 1
663*9880d681SAndroid Build Coastguard Worker	.const
664*9880d681SAndroid Build Coastguard Worker
665*9880d681SAndroid Build Coastguard Worker	.align	4
666*9880d681SAndroid Build Coastguard WorkerLCPI1_0:					##  <4 x float>
667*9880d681SAndroid Build Coastguard Worker	.space	4
668*9880d681SAndroid Build Coastguard Worker	.long	1065353216	## float 1
669*9880d681SAndroid Build Coastguard Worker	.space	4
670*9880d681SAndroid Build Coastguard Worker	.long	1065353216	## float 1
671*9880d681SAndroid Build Coastguard Worker	.text
672*9880d681SAndroid Build Coastguard Worker	.align	4,0x90
673*9880d681SAndroid Build Coastguard Worker	.globl	_t
674*9880d681SAndroid Build Coastguard Worker_t:
675*9880d681SAndroid Build Coastguard Worker	xorps	%xmm0, %xmm0
676*9880d681SAndroid Build Coastguard Worker	movhps	LCPI1_0, %xmm0
677*9880d681SAndroid Build Coastguard Worker	movss	LCPI1_1, %xmm1
678*9880d681SAndroid Build Coastguard Worker	movaps	%xmm0, %xmm2
679*9880d681SAndroid Build Coastguard Worker	shufps	$2, %xmm1, %xmm2
680*9880d681SAndroid Build Coastguard Worker	shufps	$132, %xmm2, %xmm0
681*9880d681SAndroid Build Coastguard Worker	movaps	%xmm0, 0
682*9880d681SAndroid Build Coastguard Worker
683*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
684*9880d681SAndroid Build Coastguard Workerrdar://5907648
685*9880d681SAndroid Build Coastguard Worker
686*9880d681SAndroid Build Coastguard WorkerThis function:
687*9880d681SAndroid Build Coastguard Worker
688*9880d681SAndroid Build Coastguard Workerfloat foo(unsigned char x) {
689*9880d681SAndroid Build Coastguard Worker  return x;
690*9880d681SAndroid Build Coastguard Worker}
691*9880d681SAndroid Build Coastguard Worker
692*9880d681SAndroid Build Coastguard Workeris lowered to this IR:
693*9880d681SAndroid Build Coastguard Worker
694*9880d681SAndroid Build Coastguard Workerdefine float @foo(i8 zeroext  %x) nounwind  {
695*9880d681SAndroid Build Coastguard Worker	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
696*9880d681SAndroid Build Coastguard Worker	ret float %tmp12
697*9880d681SAndroid Build Coastguard Worker}
698*9880d681SAndroid Build Coastguard Worker
699*9880d681SAndroid Build Coastguard Workerwhich compiles to (x86-32):
700*9880d681SAndroid Build Coastguard Worker
701*9880d681SAndroid Build Coastguard Worker_foo:
702*9880d681SAndroid Build Coastguard Worker	subl	$4, %esp
703*9880d681SAndroid Build Coastguard Worker	movzbl	8(%esp), %eax
704*9880d681SAndroid Build Coastguard Worker	cvtsi2ss	%eax, %xmm0
705*9880d681SAndroid Build Coastguard Worker	movss	%xmm0, (%esp)
706*9880d681SAndroid Build Coastguard Worker	flds	(%esp)
707*9880d681SAndroid Build Coastguard Worker	addl	$4, %esp
708*9880d681SAndroid Build Coastguard Worker	ret
709*9880d681SAndroid Build Coastguard Worker
710*9880d681SAndroid Build Coastguard WorkerWe should be able to use:
711*9880d681SAndroid Build Coastguard Worker  cvtsi2ss 8(%esp), %xmm0
712*9880d681SAndroid Build Coastguard Workersince we know the stack slot is already zext'd.
713*9880d681SAndroid Build Coastguard Worker
714*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
715*9880d681SAndroid Build Coastguard Worker
716*9880d681SAndroid Build Coastguard WorkerConsider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
717*9880d681SAndroid Build Coastguard Workerwhen code size is critical. movlps is slower than movsd on core2 but it's one
718*9880d681SAndroid Build Coastguard Workerbyte shorter.
719*9880d681SAndroid Build Coastguard Worker
720*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
721*9880d681SAndroid Build Coastguard Worker
722*9880d681SAndroid Build Coastguard WorkerWe should use a dynamic programming based approach to tell when using FPStack
723*9880d681SAndroid Build Coastguard Workeroperations is cheaper than SSE.  SciMark montecarlo contains code like this
724*9880d681SAndroid Build Coastguard Workerfor example:
725*9880d681SAndroid Build Coastguard Worker
726*9880d681SAndroid Build Coastguard Workerdouble MonteCarlo_num_flops(int Num_samples) {
727*9880d681SAndroid Build Coastguard Worker    return ((double) Num_samples)* 4.0;
728*9880d681SAndroid Build Coastguard Worker}
729*9880d681SAndroid Build Coastguard Worker
730*9880d681SAndroid Build Coastguard WorkerIn fpstack mode, this compiles into:
731*9880d681SAndroid Build Coastguard Worker
732*9880d681SAndroid Build Coastguard WorkerLCPI1_0:
733*9880d681SAndroid Build Coastguard Worker	.long	1082130432	## float 4.000000e+00
734*9880d681SAndroid Build Coastguard Worker_MonteCarlo_num_flops:
735*9880d681SAndroid Build Coastguard Worker	subl	$4, %esp
736*9880d681SAndroid Build Coastguard Worker	movl	8(%esp), %eax
737*9880d681SAndroid Build Coastguard Worker	movl	%eax, (%esp)
738*9880d681SAndroid Build Coastguard Worker	fildl	(%esp)
739*9880d681SAndroid Build Coastguard Worker	fmuls	LCPI1_0
740*9880d681SAndroid Build Coastguard Worker	addl	$4, %esp
741*9880d681SAndroid Build Coastguard Worker	ret
742*9880d681SAndroid Build Coastguard Worker
743*9880d681SAndroid Build Coastguard WorkerIn SSE mode, it compiles into significantly slower code:
744*9880d681SAndroid Build Coastguard Worker
745*9880d681SAndroid Build Coastguard Worker_MonteCarlo_num_flops:
746*9880d681SAndroid Build Coastguard Worker	subl	$12, %esp
747*9880d681SAndroid Build Coastguard Worker	cvtsi2sd	16(%esp), %xmm0
748*9880d681SAndroid Build Coastguard Worker	mulsd	LCPI1_0, %xmm0
749*9880d681SAndroid Build Coastguard Worker	movsd	%xmm0, (%esp)
750*9880d681SAndroid Build Coastguard Worker	fldl	(%esp)
751*9880d681SAndroid Build Coastguard Worker	addl	$12, %esp
752*9880d681SAndroid Build Coastguard Worker	ret
753*9880d681SAndroid Build Coastguard Worker
754*9880d681SAndroid Build Coastguard WorkerThere are also other cases in scimark where using fpstack is better: for
755*9880d681SAndroid Build Coastguard Workerexample, it is cheaper to do fld1 than to load from a constant pool, so
756*9880d681SAndroid Build Coastguard Worker"load, add 1.0, store" is better done in the fp stack, etc.
757*9880d681SAndroid Build Coastguard Worker
758*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
759*9880d681SAndroid Build Coastguard Worker
760*9880d681SAndroid Build Coastguard WorkerThese should compile into the same code (PR6214). Perhaps instcombine should
761*9880d681SAndroid Build Coastguard Workercanonicalize the former into the latter?
762*9880d681SAndroid Build Coastguard Worker
763*9880d681SAndroid Build Coastguard Workerdefine float @foo(float %x) nounwind {
764*9880d681SAndroid Build Coastguard Worker  %t = bitcast float %x to i32
765*9880d681SAndroid Build Coastguard Worker  %s = and i32 %t, 2147483647
766*9880d681SAndroid Build Coastguard Worker  %d = bitcast i32 %s to float
767*9880d681SAndroid Build Coastguard Worker  ret float %d
768*9880d681SAndroid Build Coastguard Worker}
769*9880d681SAndroid Build Coastguard Worker
770*9880d681SAndroid Build Coastguard Workerdeclare float @fabsf(float %n)
771*9880d681SAndroid Build Coastguard Workerdefine float @bar(float %x) nounwind {
772*9880d681SAndroid Build Coastguard Worker  %d = call float @fabsf(float %x)
773*9880d681SAndroid Build Coastguard Worker  ret float %d
774*9880d681SAndroid Build Coastguard Worker}
775*9880d681SAndroid Build Coastguard Worker
776*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
777*9880d681SAndroid Build Coastguard Worker
778*9880d681SAndroid Build Coastguard WorkerThis IR (from PR6194):
779*9880d681SAndroid Build Coastguard Worker
780*9880d681SAndroid Build Coastguard Workertarget datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
781*9880d681SAndroid Build Coastguard Workertarget triple = "x86_64-apple-darwin10.0.0"
782*9880d681SAndroid Build Coastguard Worker
783*9880d681SAndroid Build Coastguard Worker%0 = type { double, double }
784*9880d681SAndroid Build Coastguard Worker%struct.float3 = type { float, float, float }
785*9880d681SAndroid Build Coastguard Worker
786*9880d681SAndroid Build Coastguard Workerdefine void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
787*9880d681SAndroid Build Coastguard Workerentry:
788*9880d681SAndroid Build Coastguard Worker  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
789*9880d681SAndroid Build Coastguard Worker  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
790*9880d681SAndroid Build Coastguard Worker  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
791*9880d681SAndroid Build Coastguard Worker  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
792*9880d681SAndroid Build Coastguard Worker  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
793*9880d681SAndroid Build Coastguard Worker  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
794*9880d681SAndroid Build Coastguard Worker  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
795*9880d681SAndroid Build Coastguard Worker  store float %tmp12, float* %tmp5
796*9880d681SAndroid Build Coastguard Worker  ret void
797*9880d681SAndroid Build Coastguard Worker}
798*9880d681SAndroid Build Coastguard Worker
799*9880d681SAndroid Build Coastguard WorkerCompiles to:
800*9880d681SAndroid Build Coastguard Worker
801*9880d681SAndroid Build Coastguard Worker_test:                                  ## @test
802*9880d681SAndroid Build Coastguard Worker	movd	%xmm0, %rax
803*9880d681SAndroid Build Coastguard Worker	shrq	$32, %rax
804*9880d681SAndroid Build Coastguard Worker	movl	%eax, 4(%rdi)
805*9880d681SAndroid Build Coastguard Worker	ret
806*9880d681SAndroid Build Coastguard Worker
807*9880d681SAndroid Build Coastguard WorkerThis would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
808*9880d681SAndroid Build Coastguard Workerdoing a shuffle from v[1] to v[0] then a float store.
809*9880d681SAndroid Build Coastguard Worker
810*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
811*9880d681SAndroid Build Coastguard Worker
812*9880d681SAndroid Build Coastguard Worker[UNSAFE FP]
813*9880d681SAndroid Build Coastguard Worker
814*9880d681SAndroid Build Coastguard Workervoid foo(double, double, double);
815*9880d681SAndroid Build Coastguard Workervoid norm(double x, double y, double z) {
816*9880d681SAndroid Build Coastguard Worker  double scale = __builtin_sqrt(x*x + y*y + z*z);
817*9880d681SAndroid Build Coastguard Worker  foo(x/scale, y/scale, z/scale);
818*9880d681SAndroid Build Coastguard Worker}
819*9880d681SAndroid Build Coastguard Worker
820*9880d681SAndroid Build Coastguard WorkerWe currently generate an sqrtsd and 3 divsd instructions. This is bad: fp div is
821*9880d681SAndroid Build Coastguard Workerslow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
822*9880d681SAndroid Build Coastguard Workerand emit 3 mulsd in place of the divs. This can be done as a target-independent
823*9880d681SAndroid Build Coastguard Workertransform.
824*9880d681SAndroid Build Coastguard Worker
825*9880d681SAndroid Build Coastguard WorkerIf we're dealing with floats instead of doubles we could even replace the sqrtss
826*9880d681SAndroid Build Coastguard Workerand inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
827*9880d681SAndroid Build Coastguard Workercost of reduced accuracy.
828*9880d681SAndroid Build Coastguard Worker
829*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
830*9880d681SAndroid Build Coastguard Worker
831*9880d681SAndroid Build Coastguard WorkerThis function should be matched to haddpd when the appropriate CPU is enabled:
832*9880d681SAndroid Build Coastguard Worker
833*9880d681SAndroid Build Coastguard Worker#include <x86intrin.h>
834*9880d681SAndroid Build Coastguard Workerdouble f (__m128d p) {
835*9880d681SAndroid Build Coastguard Worker  return p[0] + p[1];
836*9880d681SAndroid Build Coastguard Worker}
837*9880d681SAndroid Build Coastguard Worker
838*9880d681SAndroid Build Coastguard WorkerSimilarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
839*9880d681SAndroid Build Coastguard Workerturn into hsubpd also.
840*9880d681SAndroid Build Coastguard Worker
841*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
842*9880d681SAndroid Build Coastguard Worker
843*9880d681SAndroid Build Coastguard Workerdefine <2 x i32> @foo(<2 x double> %in) {
844*9880d681SAndroid Build Coastguard Worker  %x = fptosi <2 x double> %in to <2 x i32>
845*9880d681SAndroid Build Coastguard Worker  ret <2 x i32> %x
846*9880d681SAndroid Build Coastguard Worker}
847*9880d681SAndroid Build Coastguard Worker
848*9880d681SAndroid Build Coastguard WorkerShould compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
849*9880d681SAndroid Build Coastguard Worker
850*9880d681SAndroid Build Coastguard Worker//===---------------------------------------------------------------------===//
851