; xref: /aosp_15_r20/external/llvm/test/CodeGen/AMDGPU/fma-combine.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s

; Intrinsic declarations used by the tests below.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; The [[D]] variable below was used without a definition, which FileCheck
; rejects ("undefined variable"); define it from the fourth load (%gep.3).
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; The [[D]] variable below was used without a definition, which FileCheck
; rejects ("undefined variable"); define it from the fourth load (%gep.3).
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;

; FUNC-LABEL: {{^}}test_f32_interp:
; SI: v_mad_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI: v_mac_f32_e32 [[VR]], [[VT]], [[VX:v[0-9]]]
define void @test_f32_interp(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2,
                             float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f64_interp:
; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define void @test_f64_interp(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2,
                             double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
