/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <[email protected]>
 */


#include "util/detect.h"
#include "util/compiler.h"
#include "util/u_memory.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "util/format/u_format.h"

#include "translate.h"


#if DETECT_ARCH_X86 || DETECT_ARCH_X86_64

#include "rtasm/rtasm_x86sse.h"


#define X 0
#define Y 1
#define Z 2
#define W 3


struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID 1001

#define NUM_FLOAT_CONSTS 9
#define NUM_UNSIGNED_CONSTS 1

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_INV_4294967295,
   CONST_255,
   CONST_2147483648,
   /* float consts end */
   CONST_2147483647_INT,
};

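/* Each constant is replicated across all four SSE lanes, so a single
 * mulps/andps/orps against it applies the same factor to every channel.
 */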
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_FLOAT_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(1.0 / 4294967295.0),
   C(255.0),
   C(2147483648.0),
};

#undef C

static unsigned uconsts[NUM_UNSIGNED_CONSTS][4] = {
   {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff},
};

struct translate_sse
{
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   alignas(16) float consts[NUM_FLOAT_CONSTS][4];
   alignas(16) unsigned uconsts[NUM_UNSIGNED_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_FLOAT_CONSTS + NUM_UNSIGNED_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   bool use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};


static int
get_offset(const void *a, const void *b)
{
   return (const char *) b - (const char *) a;
}

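/* Return an XMM register loaded with the requested constant.  Constants
 * are cached in XMM2..XMM7 (XMM0/XMM1 are reserved as data scratch
 * registers by the emit helpers below); when all six cache slots are
 * taken, the last one is evicted and reused.
 */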
static struct x86_reg
get_const(struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if (p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for (i = 2; i < 8; ++i) {
      if (p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if (i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if (p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   const void *c;
   if (id < NUM_FLOAT_CONSTS)
      c = &p->consts[id][0];
   else
      c = &p->uconsts[id - NUM_FLOAT_CONSTS][0];
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI, get_offset(p, c)));

   return reg;
}


/* load the data in a SSE2 register, padding with zeros */
static bool
emit_load_sse2(struct translate_sse *p,
               struct x86_reg data, struct x86_reg src, unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch (size) {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      /* assemble the three bytes in tmp: byte 2 into bits 16..23,
       * then the low word on top
       */
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      /* low dword, then the zero-extended high word interleaved in */
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      /* low qword, then the high dword combined in */
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return false;
   }
   return true;
}


/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5
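/* Any value greater than 4 works here; 5 keeps it distinct from a real
 * channel count while still satisfying the "out_chans > n" tests below.
 */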


/* this function will load #chans float values, and will
 * pad the register with zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth
 * value will be padded with 1. Only pass this value if
 * chans < 4; otherwise results are undefined.
 */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}

/* This function behaves like emit_load_float32, but loads
 * 64-bit floating point numbers, converting them to 32-bit
 * ones.
 */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}

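/* On x86-64 a single 64-bit GPR move suffices; on 32-bit targets the
 * 8-byte move goes through an XMM register instead (movq with SSE2,
 * movlps otherwise).
 */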
static void
emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
           struct x86_reg dst_xmm, struct x86_reg src_gpr,
           struct x86_reg src_xmm)
{
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if (x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}


static void
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
            struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}


static void
emit_store64(struct translate_sse *p, struct x86_reg dst,
             struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}


static void
emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if (x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}


/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
            unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if (size < 8) {
      switch (size) {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
      unsigned i = 0;
      assert((size & 3) == 0);
      for (i = 0; i < size; i += 4) {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else {
      switch (size) {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}

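/* Convert a single attribute from a->input_format to a->output_format.
 * Four strategies are tried in order: an SSE path for float32 outputs,
 * an SSE2 path for 8-bit -> 16-bit channel widening, a GPR-based
 * channel-by-channel copy/swizzle when the channel layout already
 * matches, and finally an SSE2 fast path packing float32 RGBA down to
 * unorm8 (draw's EMIT_4UB / EMIT_4UB_BGRA).
 */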
static bool
translate_attr_convert(struct translate_sse *p,
                       const struct translate_element *a,
                       struct x86_reg src, struct x86_reg dst)
{
   const struct util_format_description *input_desc =
      util_format_description(a->input_format);
   const struct util_format_description *output_desc =
      util_format_description(a->output_format);
   unsigned i;
   bool id_swizzle = true;
   unsigned swizzle[4] =
      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
   unsigned needed_chans = 0;
   unsigned imms[2] = { 0, 0x3f800000 };

   if (a->output_format == PIPE_FORMAT_NONE
       || a->input_format == PIPE_FORMAT_NONE)
      return false;

   if (input_desc->channel[0].size & 7)
      return false;

   if (input_desc->colorspace != output_desc->colorspace)
      return false;

   for (i = 1; i < input_desc->nr_channels; ++i) {
      if (memcmp
          (&input_desc->channel[i], &input_desc->channel[0],
           sizeof(input_desc->channel[0])))
         return false;
   }

   for (i = 1; i < output_desc->nr_channels; ++i) {
      if (memcmp
          (&output_desc->channel[i], &output_desc->channel[0],
           sizeof(output_desc->channel[0]))) {
         return false;
      }
   }

   for (i = 0; i < output_desc->nr_channels; ++i) {
      if (output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if ((x86_target_caps(p->func) & X86_SSE) &&
       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg auxXMM;

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = false;
      }

      if (needed_chans > 0) {
         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return false;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch (input_desc->channel[0].size) {
            case 8:
               /* TODO: this may be inefficient due to CONST_IDENTITY being
                * used both as a float and an integer register.
                */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32:           /* we lose precision here */
               /* No unsigned conversion (except in AVX512F), so we check if
                * it's negative, and stick the high bit as a separate float
                * value in an aux register: */
               auxXMM = x86_make_reg(file_XMM, 1);
               /* aux = 0 */
               sse_xorps(p->func, auxXMM, auxXMM);
               /* aux = aux > data ? 0xffffffff : 0 */
               sse2_pcmpgtd(p->func, auxXMM, dataXMM);
               /* data = data & 0x7fffffff */
               sse_andps(p->func, dataXMM, get_const(p, CONST_2147483647_INT));
               /* aux = aux & 2147483648.0 */
               sse_andps(p->func, auxXMM, get_const(p, CONST_2147483648));
               break;
            default:
               return false;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].size == 32)
               /* add in the high bit's worth of float that we AND'd away */
               sse_addps(p->func, dataXMM, auxXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_4294967295);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return false;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch (input_desc->channel[0].size) {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32:           /* we lose precision here */
               break;
            default:
               return false;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if (input_desc->channel[0].size != 32
                && input_desc->channel[0].size != 64) {
               return false;
            }
            if (swizzle[3] == PIPE_SWIZZLE_1
                && input_desc->nr_channels <= 3) {
               swizzle[3] = PIPE_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch (input_desc->channel[0].size) {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans,
                                 input_desc->nr_channels);
               break;
            case 64:           /* we lose precision here */
               if (!(x86_target_caps(p->func) & X86_SSE2))
                  return false;
               emit_load_float64to32(p, dataXMM, src, needed_chans,
                                     input_desc->nr_channels);
               break;
            default:
               return false;
            }
            break;
         default:
            return false;
         }

         if (!id_swizzle) {
            sse_shufps(p->func, dataXMM, dataXMM,
                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
         }
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse_movups(p->func, dst, dataXMM);
      }
      else {
         if (output_desc->nr_channels >= 2
             && swizzle[0] < PIPE_SWIZZLE_0
             && swizzle[1] < PIPE_SWIZZLE_0) {
            sse_movlps(p->func, dst, dataXMM);
         }
         else {
            if (swizzle[0] < PIPE_SWIZZLE_0) {
               sse_movss(p->func, dst, dataXMM);
            }
            else {
               x86_mov_imm(p->func, dst,
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }

            if (output_desc->nr_channels >= 2) {
               if (swizzle[1] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (output_desc->nr_channels >= 4
                && swizzle[2] < PIPE_SWIZZLE_0
                && swizzle[3] < PIPE_SWIZZLE_0) {
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            }
            else {
               if (swizzle[2] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }

               if (output_desc->nr_channels >= 4) {
                  if (swizzle[3] < PIPE_SWIZZLE_0) {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else {
                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
         }
      }
      return true;
   }
   else if ((x86_target_caps(p->func) & X86_SSE2)
            && input_desc->channel[0].size == 8
            && output_desc->channel[0].size == 16
            && output_desc->channel[0].normalized ==
            input_desc->channel[0].normalized &&
            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = { 0, 1 };

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels) {
            swizzle[i] = i;
         }
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = false;
      }

      if (needed_chans > 0) {
         emit_load_sse2(p, dataXMM, src,
                        input_desc->channel[0].size *
                        input_desc->nr_channels >> 3);

         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if (output_desc->channel[0].normalized)
            imms[1] =
               (output_desc->channel[0].type ==
                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if (!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM,
                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse2_movq(p->func, dst, dataXMM);
      }
      else {
         if (swizzle[0] < PIPE_SWIZZLE_0) {
            if (output_desc->nr_channels >= 2
                && swizzle[1] < PIPE_SWIZZLE_0) {
               sse2_movd(p->func, dst, dataXMM);
            }
            else {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if (output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
            }
         }
         else {
            if (output_desc->nr_channels >= 2
                && swizzle[1] >= PIPE_SWIZZLE_0) {
               x86_mov_imm(p->func, dst,
                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }
            else {
               x86_mov16_imm(p->func, dst,
                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
               if (output_desc->nr_channels >= 2) {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (swizzle[2] < PIPE_SWIZZLE_0) {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] < PIPE_SWIZZLE_0) {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if (output_desc->nr_channels >= 4) {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
            else {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] >= PIPE_SWIZZLE_0) {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }
               else {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
                                imms[swizzle[2] - PIPE_SWIZZLE_0]);

                  if (output_desc->nr_channels >= 4) {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return true;
   }
   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
                    sizeof(output_desc->channel[0]))) {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;

      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
          && output_desc->nr_channels == 4
          && swizzle[0] == PIPE_SWIZZLE_W
          && swizzle[1] == PIPE_SWIZZLE_Z
          && swizzle[2] == PIPE_SWIZZLE_Y
          && swizzle[3] == PIPE_SWIZZLE_X) {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return true;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         switch (output_desc->channel[0].size) {
         case 8:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[0].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if (swizzle[i] == PIPE_SWIZZLE_0) {
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            }
            else {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned l = 0;
               unsigned h = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else {
               if (x86_target_caps(p->func) & X86_SSE) {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM,
                              x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp,
                          x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return false;
         }
      }
      return true;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if ((x86_target_caps(p->func) & X86_SSE2) &&
            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
             || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
      }

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return true;
   }

   return false;
}


static bool
translate_attr(struct translate_sse *p,
               const struct translate_element *a,
               struct x86_reg src, struct x86_reg dst)
{
   if (a->input_format == a->output_format) {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return true;
   }

   return translate_attr_convert(p, a, src, dst);
}

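/* Emit the per-run setup: for the linear path and for instanced buffer
 * variants, compute each variant's starting pointer (base_ptr + stride *
 * index) up front, clamping vertex indices to the buffer's max_index and
 * dividing the instance id by the instance divisor where needed.
 */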
static bool
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          * base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                * instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return true;
}

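/* Return a register holding the source pointer for the given buffer
 * variant: the instance-id pseudo buffer reads straight from the machine
 * struct, linear/instanced variants reuse the pointer precomputed by
 * init_inputs(), and indexed variants compute base_ptr + stride * elt
 * with the index clamped to max_index.
 */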
static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}

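/* Advance the inputs to the next vertex: the single-buffer linear path
 * keeps a running pointer in ESI (prefetching ahead), the multi-buffer
 * linear path bumps each variant's stored pointer by its stride, and the
 * indexed path simply steps to the next element.
 */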
static bool
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return true;
}

/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *       run_elts( struct translate *machine,
 *                 unsigned *elts,
 *                 unsigned count,
 *                 unsigned start_instance,
 *                 unsigned instance_id,
 *                 void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static bool
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return false;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return true;
}


static void
translate_sse_set_buffer(struct translate *translate,
                         unsigned buf,
                         const void *ptr, unsigned stride, unsigned max_index)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *) ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0)
      debug_printf("%s %d/%d: %p %d\n",
                   __func__, buf, p->nr_buffers, ptr, stride);
}


static void
translate_sse_release(struct translate *translate)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}


struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!util_get_cpu_caps()->has_sse)
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));
   memcpy(p->uconsts, uconsts, sizeof(uconsts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = true;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

fail:
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}
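
/* Typical usage, as a sketch (based on the translate interface used in
 * this file; the run() argument order follows build_vertex_emit() above,
 * and the example format/stride values are illustrative only):
 *
 *    struct translate_key key;
 *    memset(&key, 0, sizeof(key));
 *    key.output_stride = 4 * sizeof(float);
 *    key.nr_elements = 1;
 *    key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
 *    key.element[0].input_format = PIPE_FORMAT_R8G8B8A8_UNORM;
 *    key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 *    key.element[0].input_buffer = 0;
 *    key.element[0].input_offset = 0;
 *    key.element[0].instance_divisor = 0;
 *    key.element[0].output_offset = 0;
 *
 *    struct translate *t = translate_sse2_create(&key);
 *    if (t) {
 *       t->set_buffer(t, 0, verts, vertex_stride, max_index);
 *       t->run(t, start, count, 0, 0, output);    // start_instance = 0,
 *       t->release(t);                            // instance_id = 0
 *    }
 */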


#else

struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}

#endif