/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <[email protected]>
 */


#include "util/detect.h"
#include "util/compiler.h"
#include "util/u_memory.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "util/format/u_format.h"

#include "translate.h"


#if DETECT_ARCH_X86 || DETECT_ARCH_X86_64

#include "rtasm/rtasm_x86sse.h"


#define X    0
#define Y    1
#define Z    2
#define W    3


struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID  1001

#define NUM_FLOAT_CONSTS 9
#define NUM_UNSIGNED_CONSTS 1

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_INV_4294967295,
   CONST_255,
   CONST_2147483648,
   /* float consts end */
   CONST_2147483647_INT,
};

#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_FLOAT_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(1.0 / 4294967295.0),
   C(255.0),
   C(2147483648.0),
};

#undef C

static unsigned uconsts[NUM_UNSIGNED_CONSTS][4] = {
   {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff},
};
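
/* Both constant tables are copied into the translate_sse struct at
 * creation time (see translate_sse2_create), so the generated code can
 * address them as fixed displacements from the machine pointer kept in
 * EDI/RDI (get_offset() computes those displacements).
 */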

struct translate_sse
{
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   alignas(16) float consts[NUM_FLOAT_CONSTS][4];
   alignas(16) unsigned uconsts[NUM_UNSIGNED_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_FLOAT_CONSTS + NUM_UNSIGNED_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   bool use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};


static int
get_offset(const void *a, const void *b)
{
   return (const char *) b - (const char *) a;
}


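/* Return an XMM register holding the given constant, loading it on first
 * use.  Constants are cached in xmm2-xmm7; xmm0 and xmm1 are left free as
 * scratch registers for the load/convert helpers below.
 */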
static struct x86_reg
get_const(struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if (p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for (i = 2; i < 8; ++i) {
      if (p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if (i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if (p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   const void *c;
   if (id < NUM_FLOAT_CONSTS)
      c = &p->consts[id][0];
   else
      c = &p->uconsts[id - NUM_FLOAT_CONSTS][0];
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI, get_offset(p, c)));

   return reg;
}


/* Load the data into an XMM register using SSE2 instructions, padding
 * with zeros.
 */
static bool
emit_load_sse2(struct translate_sse *p,
               struct x86_reg data, struct x86_reg src, unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch (size) {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
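   /* Case 3 below assembles a 3-byte value in EAX: byte 2 lands in bits
    * 16-23, then a 16-bit move fills bits 0-15 without touching the
    * upper bits.
    */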
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return false;
   }
   return true;
}


/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5


/* This function loads <chans> float values and pads the register with
 * zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, the fourth value is padded
 * with 1.  Only pass CHANNELS_0001 when chans < 4; otherwise the
 * results are undefined.
 */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}

/* This function behaves like emit_load_float32, but loads 64-bit
 * floating point numbers and converts them to 32-bit ones.
 */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}


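/* Copy 64 bits between operands: a single GPR move on x86-64; on 32-bit
 * x86 the copy goes through an XMM register instead (SSE2 movq when
 * available, otherwise SSE movlps).
 */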
static void
emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
           struct x86_reg dst_xmm, struct x86_reg src_gpr,
           struct x86_reg src_xmm)
{
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if (x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}


static void
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
            struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}


static void
emit_store64(struct translate_sse *p, struct x86_reg dst,
             struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}


static void
emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if (x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}


/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
            unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if (size < 8) {
      switch (size) {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
      unsigned i = 0;
      assert((size & 3) == 0);
      for (i = 0; i < size; i += 4) {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else {
      switch (size) {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}

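/* Convert a single attribute between two different vertex formats.  The
 * paths below, in order: widen everything to float32 outputs via SSE,
 * widen 8-bit to 16-bit channels via SSE2, copy channels of identical
 * layout through GPRs (with optional swizzling), and finally a special
 * SSE2 path for float32 RGBA to unorm8 RGBA/BGRA.  Returns false when no
 * path applies.
 */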
static bool
translate_attr_convert(struct translate_sse *p,
                       const struct translate_element *a,
                       struct x86_reg src, struct x86_reg dst)
{
   const struct util_format_description *input_desc =
      util_format_description(a->input_format);
   const struct util_format_description *output_desc =
      util_format_description(a->output_format);
   unsigned i;
   bool id_swizzle = true;
   unsigned swizzle[4] =
      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
   unsigned needed_chans = 0;
   unsigned imms[2] = { 0, 0x3f800000 };

   if (a->output_format == PIPE_FORMAT_NONE
       || a->input_format == PIPE_FORMAT_NONE)
      return false;

   if (input_desc->channel[0].size & 7)
      return false;

   if (input_desc->colorspace != output_desc->colorspace)
      return false;

   for (i = 1; i < input_desc->nr_channels; ++i) {
      if (memcmp
          (&input_desc->channel[i], &input_desc->channel[0],
           sizeof(input_desc->channel[0])))
         return false;
   }

   for (i = 1; i < output_desc->nr_channels; ++i) {
      if (memcmp
          (&output_desc->channel[i], &output_desc->channel[0],
           sizeof(output_desc->channel[0]))) {
         return false;
      }
   }

   for (i = 0; i < output_desc->nr_channels; ++i) {
      if (output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if ((x86_target_caps(p->func) & X86_SSE) &&
       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg auxXMM;

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = false;
      }

      if (needed_chans > 0) {
         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return false;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch (input_desc->channel[0].size) {
            case 8:
               /* TODO: this may be inefficient due to the CONST_IDENTITY
                * register being used both as a float and an integer
                * register.
                */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32:           /* we lose precision here */
               /* No unsigned conversion (except in AVX512F), so we check if
                * it's negative, and stick the high bit as a separate float
                * value in an aux register: */
               auxXMM = x86_make_reg(file_XMM, 1);
               /* aux = 0 */
               sse_xorps(p->func, auxXMM, auxXMM);
               /* aux = aux > data ? 0xffffffff : 0 */
               sse2_pcmpgtd(p->func, auxXMM, dataXMM);
               /* data = data & 0x7fffffff */
               sse_andps(p->func, dataXMM, get_const(p, CONST_2147483647_INT));
               /* aux = aux & 2147483648.0 */
               sse_andps(p->func, auxXMM, get_const(p, CONST_2147483648));
               break;
            default:
               return false;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].size == 32)
               /* add in the high bit's worth of float that we AND'd away */
               sse_addps(p->func, dataXMM, auxXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_4294967295);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return false;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch (input_desc->channel[0].size) {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32:           /* we lose precision here */
               break;
            default:
               return false;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if (input_desc->channel[0].size != 32
                && input_desc->channel[0].size != 64) {
               return false;
            }
            if (swizzle[3] == PIPE_SWIZZLE_1
                && input_desc->nr_channels <= 3) {
               swizzle[3] = PIPE_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch (input_desc->channel[0].size) {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans,
                                 input_desc->nr_channels);
               break;
            case 64:           /* we lose precision here */
               if (!(x86_target_caps(p->func) & X86_SSE2))
                  return false;
               emit_load_float64to32(p, dataXMM, src, needed_chans,
                                     input_desc->nr_channels);
               break;
            default:
               return false;
            }
            break;
         default:
            return false;
         }

         if (!id_swizzle) {
            sse_shufps(p->func, dataXMM, dataXMM,
                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
         }
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse_movups(p->func, dst, dataXMM);
      }
      else {
         if (output_desc->nr_channels >= 2
             && swizzle[0] < PIPE_SWIZZLE_0
             && swizzle[1] < PIPE_SWIZZLE_0) {
            sse_movlps(p->func, dst, dataXMM);
         }
         else {
            if (swizzle[0] < PIPE_SWIZZLE_0) {
               sse_movss(p->func, dst, dataXMM);
            }
            else {
               x86_mov_imm(p->func, dst,
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }

            if (output_desc->nr_channels >= 2) {
               if (swizzle[1] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (output_desc->nr_channels >= 4
                && swizzle[2] < PIPE_SWIZZLE_0
                && swizzle[3] < PIPE_SWIZZLE_0) {
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            }
            else {
               if (swizzle[2] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }

               if (output_desc->nr_channels >= 4) {
                  if (swizzle[3] < PIPE_SWIZZLE_0) {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else {
                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
         }
      }
      return true;
   }
   else if ((x86_target_caps(p->func) & X86_SSE2)
            && input_desc->channel[0].size == 8
            && output_desc->channel[0].size == 16
            && output_desc->channel[0].normalized ==
            input_desc->channel[0].normalized &&
            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = { 0, 1 };

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels) {
            swizzle[i] = i;
         }
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = false;
      }

      if (needed_chans > 0) {
         emit_load_sse2(p, dataXMM, src,
                        input_desc->channel[0].size *
                        input_desc->nr_channels >> 3);

         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (input_desc->channel[0].normalized) {
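               /* Widen each snorm8 to snorm16 by bit replication: the
                * byte lands in the high 8 bits and shifted copies of its
                * magnitude bits fill the low bits, so that 0x7f maps to
                * 0x7fff and 0x80 maps to 0x8000.
                */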
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if (output_desc->channel[0].normalized)
            imms[1] =
               (output_desc->channel[0].type ==
                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if (!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM,
                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse2_movq(p->func, dst, dataXMM);
      }
      else {
         if (swizzle[0] < PIPE_SWIZZLE_0) {
            if (output_desc->nr_channels >= 2
                && swizzle[1] < PIPE_SWIZZLE_0) {
               sse2_movd(p->func, dst, dataXMM);
            }
            else {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if (output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
            }
         }
         else {
            if (output_desc->nr_channels >= 2
                && swizzle[1] >= PIPE_SWIZZLE_0) {
               x86_mov_imm(p->func, dst,
                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }
            else {
               x86_mov16_imm(p->func, dst,
                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
               if (output_desc->nr_channels >= 2) {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (swizzle[2] < PIPE_SWIZZLE_0) {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] < PIPE_SWIZZLE_0) {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if (output_desc->nr_channels >= 4) {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
            else {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] >= PIPE_SWIZZLE_0) {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }
               else {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
                                imms[swizzle[2] - PIPE_SWIZZLE_0]);

                  if (output_desc->nr_channels >= 4) {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return true;
   }
   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
                    sizeof(output_desc->channel[0]))) {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;

      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
          && output_desc->nr_channels == 4
          && swizzle[0] == PIPE_SWIZZLE_W
          && swizzle[1] == PIPE_SWIZZLE_Z
          && swizzle[2] == PIPE_SWIZZLE_Y
          && swizzle[3] == PIPE_SWIZZLE_X) {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return true;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         switch (output_desc->channel[0].size) {
         case 8:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[0].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if (swizzle[i] == PIPE_SWIZZLE_0) {
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            }
            else {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned l = 0;
               unsigned h = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return false;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else {
               if (x86_target_caps(p->func) & X86_SSE) {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM,
                              x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp,
                          x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return false;
         }
      }
      return true;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if ((x86_target_caps(p->func) & X86_SSE2) &&
            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
             || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
      }

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return true;
   }

   return false;
}


static bool
translate_attr(struct translate_sse *p,
               const struct translate_element *a,
               struct x86_reg src, struct x86_reg dst)
{
   if (a->input_format == a->output_format) {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return true;
   }

   return translate_attr_convert(p, a, src, dst);
}


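/* Emit code computing the initial source pointer for every buffer variant
 * that is not advanced per element: linear (non-indexed) runs and
 * instanced arrays.  For instanced arrays the index is
 * instance_id / divisor + start_instance; for linear runs it is the start
 * index clamped to the buffer's max_index.
 */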
static bool
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return true;
}


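/* Return a register or memory operand pointing at the current element's
 * data for the given buffer variant: the stored instance_id for
 * ELEMENT_BUFFER_INSTANCE_ID, a pointer precomputed by init_inputs() for
 * linear and instanced fetches, or base_ptr + stride * elt (clamped to
 * max_index) for indexed fetches.
 */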
static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                  get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                  get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}


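/* Advance the per-element source state after each output vertex: bump the
 * pointer kept in ESI (single-buffer linear case), bump every per-variant
 * pointer by its stride (multi-buffer linear case), or step to the next
 * index in the element list (indexed case).
 */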
static bool
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return true;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 *
 * Lots of hardcoding:
 *
 * EDI -- pointer to the translate_sse machine
 * ESI -- start index or pointer to the current element
 * EBX -- pointer to the current output vertex
 * ECX -- pointer to the current attribute's source data
 * EBP -- remaining vertex count
 */
static bool
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return false;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return true;
}


static void
translate_sse_set_buffer(struct translate *translate,
                         unsigned buf,
                         const void *ptr, unsigned stride, unsigned max_index)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *) ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0)
      debug_printf("%s %d/%d: %p %d\n",
                   __func__, buf, p->nr_buffers, ptr, stride);
}


static void
translate_sse_release(struct translate *translate)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}


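/* Typical usage, as a sketch (the exact prototypes live in translate.h;
 * the buffer layout and formats below are purely illustrative):
 *
 *    struct translate_key key;
 *    ... fill in key.element[], key.nr_elements, key.output_stride ...
 *    struct translate *t = translate_sse2_create(&key);
 *    if (t) {
 *       t->set_buffer(t, 0, vertex_data, vertex_stride, max_index);
 *       t->run(t, start, count, start_instance, instance_id, out);
 *       t->release(t);
 *    }
 */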
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!util_get_cpu_caps()->has_sse)
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));
   memcpy(p->uconsts, uconsts, sizeof(uconsts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = true;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}


#else

struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}

#endif