/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright Zoltan Herczeg ([email protected]). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
{
	return "x86" SLJIT_CPUINFO;
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - ESP
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - RSP
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From now on REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/

#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_FREG	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 5, 7, 6, 4, 3
};

static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
	0, 1, 2, 3, 4, 5, 6, 7, 0
};

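/* Registers between SLJIT_R3 and SLJIT_S3 have no hardware register assigned
   on x86-32 (see the zero entries in reg_map above); CHECK_EXTRA_REGS rewrites
   such operands into accesses to their dedicated stack slots. */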
#define CHECK_EXTRA_REGS(p, w, do) \
	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
		w = (2 * SSIZE_OF(sw)) + ((p) - SLJIT_R3) * SSIZE_OF(sw); \
		p = SLJIT_MEM1(SLJIT_SP); \
		do; \
	}

#else /* SLJIT_CONFIG_X86_32 */

#define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)

/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better used as a higher saved register. */
#ifndef _WIN64
/* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 6, 7, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 6, 7, 1, 0,  3,  2,  4, 5,  5,  6,  7, 3, 4, 2, 1
};
#else
/* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 2, 8, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 9, 10
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
	0, 0, 2, 0, 1,  3,  4, 5,  5,  6,  7, 7, 6, 3, 4, 1,  2
};
#endif

/* Args: xmm0-xmm3 */
static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
	0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4
};
/* low-map. freg_map & 0x7. */
static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
	0, 0, 1, 2, 3, 5, 6, 7, 0, 1,  2,  3,  4,  5,  6,  7, 4
};

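/* REX prefix bits: W selects 64 bit operand size, while R, X and B provide
   the fourth (high) bit of the ModR/M reg, SIB index and ModR/M rm / SIB base
   register fields respectively. */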
#define REX_W		0x48
#define REX_R		0x44
#define REX_X		0x42
#define REX_B		0x41
#define REX		0x40

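/* Range of a sign extended 32 bit immediate or displacement on x86-64. */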
#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#define U8(v)			((sljit_u8)(v))

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS		((sljit_uw)0x000010)
#define EX86_SHIFT_INS		((sljit_uw)0x000020)
#define EX86_BYTE_ARG		((sljit_uw)0x000040)
#define EX86_HALF_ARG		((sljit_uw)0x000080)
/* Size flags for both emit_x86_instruction and emit_vex_instruction: */
#define EX86_REX		((sljit_uw)0x000100)
#define EX86_NO_REXW		((sljit_uw)0x000200)
#define EX86_PREF_66		((sljit_uw)0x000400)
#define EX86_PREF_F2		((sljit_uw)0x000800)
#define EX86_PREF_F3		((sljit_uw)0x001000)
#define EX86_SSE2_OP1		((sljit_uw)0x002000)
#define EX86_SSE2_OP2		((sljit_uw)0x004000)
#define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
#define EX86_VEX_EXT		((sljit_uw)0x008000)
/* Op flags for emit_vex_instruction: */
#define VEX_OP_0F38		((sljit_uw)0x010000)
#define VEX_OP_0F3A		((sljit_uw)0x020000)
#define VEX_SSE2_OPV		((sljit_uw)0x040000)
#define VEX_AUTO_W		((sljit_uw)0x080000)
#define VEX_W			((sljit_uw)0x100000)
#define VEX_256			((sljit_uw)0x200000)

#define EX86_SELECT_66(op)	(((op) & SLJIT_32) ? 0 : EX86_PREF_66)
#define EX86_SELECT_F2_F3(op)	(((op) & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2)

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD			(/* BINARY */ 0 << 3)
#define ADD_EAX_i32		0x05
#define ADD_r_rm		0x03
#define ADD_rm_r		0x01
#define ADDSD_x_xm		0x58
#define ADC			(/* BINARY */ 2 << 3)
#define ADC_EAX_i32		0x15
#define ADC_r_rm		0x13
#define ADC_rm_r		0x11
#define AND			(/* BINARY */ 4 << 3)
#define AND_EAX_i32		0x25
#define AND_r_rm		0x23
#define AND_rm_r		0x21
#define ANDPD_x_xm		0x54
#define BSR_r_rm		(/* GROUP_0F */ 0xbd)
#define BSF_r_rm		(/* GROUP_0F */ 0xbc)
#define BSWAP_r			(/* GROUP_0F */ 0xc8)
#define CALL_i32		0xe8
#define CALL_rm			(/* GROUP_FF */ 2 << 3)
#define CDQ			0x99
#define CMOVE_r_rm		(/* GROUP_0F */ 0x44)
#define CMP			(/* BINARY */ 7 << 3)
#define CMP_EAX_i32		0x3d
#define CMP_r_rm		0x3b
#define CMP_rm_r		0x39
#define CMPS_x_xm		0xc2
#define CMPXCHG_rm_r		0xb1
#define CMPXCHG_rm8_r		0xb0
#define CVTPD2PS_x_xm		0x5a
#define CVTPS2PD_x_xm		0x5a
#define CVTSI2SD_x_rm		0x2a
#define CVTTSD2SI_r_xm		0x2c
#define DIV			(/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm		0x5e
#define EXTRACTPS_x_xm		0x17
#define FLDS			0xd9
#define FLDL			0xdd
#define FSTPS			0xd9
#define FSTPD			0xdd
#define INSERTPS_x_xm		0x21
#define INT3			0xcc
#define IDIV			(/* GROUP_F7 */ 7 << 3)
#define IMUL			(/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm		(/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8		0x6b
#define IMUL_r_rm_i32		0x69
#define JL_i8			0x7c
#define JE_i8			0x74
#define JNC_i8			0x73
#define JNE_i8			0x75
#define JMP_i8			0xeb
#define JMP_i32			0xe9
#define JMP_rm			(/* GROUP_FF */ 4 << 3)
#define LEA_r_m			0x8d
#define LOOP_i8			0xe2
#define LZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
#define MOV_r_rm		0x8b
#define MOV_r_i32		0xb8
#define MOV_rm_r		0x89
#define MOV_rm_i32		0xc7
#define MOV_rm8_i8		0xc6
#define MOV_rm8_r8		0x88
#define MOVAPS_x_xm		0x28
#define MOVAPS_xm_x		0x29
#define MOVD_x_rm		0x6e
#define MOVD_rm_x		0x7e
#define MOVDDUP_x_xm		0x12
#define MOVDQA_x_xm		0x6f
#define MOVDQA_xm_x		0x7f
#define MOVHLPS_x_x		0x12
#define MOVHPD_m_x		0x17
#define MOVHPD_x_m		0x16
#define MOVLHPS_x_x		0x16
#define MOVLPD_m_x		0x13
#define MOVLPD_x_m		0x12
#define MOVMSKPS_r_x		(/* GROUP_0F */ 0x50)
#define MOVQ_x_xm		(/* GROUP_0F */ 0x7e)
#define MOVSD_x_xm		0x10
#define MOVSD_xm_x		0x11
#define MOVSHDUP_x_xm		0x16
#define MOVSXD_r_rm		0x63
#define MOVSX_r_rm8		(/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16		(/* GROUP_0F */ 0xbf)
#define MOVUPS_x_xm		0x10
#define MOVZX_r_rm8		(/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16		(/* GROUP_0F */ 0xb7)
#define MUL			(/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm		0x59
#define NEG_rm			(/* GROUP_F7 */ 3 << 3)
#define NOP			0x90
#define NOT_rm			(/* GROUP_F7 */ 2 << 3)
#define OR			(/* BINARY */ 1 << 3)
#define OR_r_rm			0x0b
#define OR_EAX_i32		0x0d
#define OR_rm_r			0x09
#define OR_rm8_r8		0x08
#define ORPD_x_xm		0x56
#define PACKSSWB_x_xm		(/* GROUP_0F */ 0x63)
#define PAND_x_xm		0xdb
#define PCMPEQD_x_xm		0x76
#define PINSRB_x_rm_i8		0x20
#define PINSRW_x_rm_i8		0xc4
#define PINSRD_x_rm_i8		0x22
#define PEXTRB_rm_x_i8		0x14
#define PEXTRW_rm_x_i8		0x15
#define PEXTRD_rm_x_i8		0x16
#define PMOVMSKB_r_x		(/* GROUP_0F */ 0xd7)
#define PMOVSXBD_x_xm		0x21
#define PMOVSXBQ_x_xm		0x22
#define PMOVSXBW_x_xm		0x20
#define PMOVSXDQ_x_xm		0x25
#define PMOVSXWD_x_xm		0x23
#define PMOVSXWQ_x_xm		0x24
#define PMOVZXBD_x_xm		0x31
#define PMOVZXBQ_x_xm		0x32
#define PMOVZXBW_x_xm		0x30
#define PMOVZXDQ_x_xm		0x35
#define PMOVZXWD_x_xm		0x33
#define PMOVZXWQ_x_xm		0x34
#define POP_r			0x58
#define POP_rm			0x8f
#define POPF			0x9d
#define POR_x_xm		0xeb
#define PREFETCH		0x18
#define PSHUFB_x_xm		0x00
#define PSHUFD_x_xm		0x70
#define PSHUFLW_x_xm		0x70
#define PSRLDQ_x		0x73
#define PSLLD_x_i8		0x72
#define PSLLQ_x_i8		0x73
#define PUSH_i32		0x68
#define PUSH_r			0x50
#define PUSH_rm			(/* GROUP_FF */ 6 << 3)
#define PUSHF			0x9c
#define PXOR_x_xm		0xef
#define ROL			(/* SHIFT */ 0 << 3)
#define ROR			(/* SHIFT */ 1 << 3)
#define RET_near		0xc3
#define RET_i16			0xc2
#define SBB			(/* BINARY */ 3 << 3)
#define SBB_EAX_i32		0x1d
#define SBB_r_rm		0x1b
#define SBB_rm_r		0x19
#define SAR			(/* SHIFT */ 7 << 3)
#define SHL			(/* SHIFT */ 4 << 3)
#define SHLD			(/* GROUP_0F */ 0xa5)
#define SHRD			(/* GROUP_0F */ 0xad)
#define SHR			(/* SHIFT */ 5 << 3)
#define SHUFPS_x_xm		0xc6
#define SUB			(/* BINARY */ 5 << 3)
#define SUB_EAX_i32		0x2d
#define SUB_r_rm		0x2b
#define SUB_rm_r		0x29
#define SUBSD_x_xm		0x5c
#define TEST_EAX_i32		0xa9
#define TEST_rm_r		0x85
#define TZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
#define UCOMISD_x_xm		0x2e
#define UNPCKLPD_x_xm		0x14
#define UNPCKLPS_x_xm		0x14
#define VBROADCASTSD_x_xm	0x19
#define VBROADCASTSS_x_xm	0x18
#define VEXTRACTF128_x_ym	0x19
#define VEXTRACTI128_x_ym	0x39
#define VINSERTF128_y_y_xm	0x18
#define VINSERTI128_y_y_xm	0x38
#define VPBROADCASTB_x_xm	0x78
#define VPBROADCASTD_x_xm	0x58
#define VPBROADCASTQ_x_xm	0x59
#define VPBROADCASTW_x_xm	0x79
#define VPERMPD_y_ym		0x01
#define VPERMQ_y_ym		0x00
#define XCHG_EAX_r		0x90
#define XCHG_r_rm		0x87
#define XOR			(/* BINARY */ 6 << 3)
#define XOR_EAX_i32		0x35
#define XOR_r_rm		0x33
#define XOR_rm_r		0x31
#define XORPD_x_xm		0x57

#define GROUP_0F		0x0f
#define GROUP_66		0x66
#define GROUP_F3		0xf3
#define GROUP_F7		0xf7
#define GROUP_FF		0xff
#define GROUP_BINARY_81		0x81
#define GROUP_BINARY_83		0x83
#define GROUP_SHIFT_1		0xd1
#define GROUP_SHIFT_N		0xc1
#define GROUP_SHIFT_CL		0xd3
#define GROUP_LOCK		0xf0

#define MOD_REG			0xc0
#define MOD_DISP8		0x40

#define INC_SIZE(s)		(*inst++ = U8(s), compiler->size += (s))

#define PUSH_REG(r)		(*inst++ = U8(PUSH_r + (r)))
#define POP_REG(r)		(*inst++ = U8(POP_r + (r)))
#define RET()			(*inst++ = RET_near)
#define RET_I16(n)		(*inst++ = RET_i16, *inst++ = U8(n), *inst++ = 0)

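/* Marker bytes stored in place of an instruction length in the code buffer;
   sljit_generate_code treats any length below SLJIT_INST_CONST as plain
   machine code bytes to be copied. */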
#define SLJIT_INST_LABEL	255
#define SLJIT_INST_JUMP		254
#define SLJIT_INST_MOV_ADDR	253
#define SLJIT_INST_CONST	252

/* Multithreading does not affect these static variables, since they store
   built-in CPU features. Therefore they can safely be overwritten by different
   threads if they detect the CPU features at the same time. */
#define CPU_FEATURE_DETECTED		0x001
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
#define CPU_FEATURE_SSE2		0x002
#endif
#define CPU_FEATURE_SSE41		0x004
#define CPU_FEATURE_LZCNT		0x008
#define CPU_FEATURE_TZCNT		0x010
#define CPU_FEATURE_CMOV		0x020
#define CPU_FEATURE_AVX			0x040
#define CPU_FEATURE_AVX2		0x080
#define CPU_FEATURE_OSXSAVE		0x100

static sljit_u32 cpu_feature_list = 0;

#ifdef _WIN32_WCE
#include <cmnintrin.h>
#elif defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

/******************************************************/
/*    Unaligned-store functions                       */
/******************************************************/

static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}

/******************************************************/
/*    Utility functions                               */
/******************************************************/

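/* Executes a CPUID query: on entry info[0] holds the requested leaf (EAX)
   and info[2] the sub-leaf (ECX); on return info[] receives EAX, EBX, ECX
   and EDX in that order. */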
static void execute_cpu_id(sljit_u32 info[4])
{
#if defined(_MSC_VER) && _MSC_VER >= 1400

	__cpuidex((int*)info, (int)info[0], (int)info[2]);

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)

	/* AT&T syntax. */
	__asm__ (
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		"movl %0, %%esi\n"
		"movl (%%esi), %%eax\n"
		"movl 8(%%esi), %%ecx\n"
		"pushl %%ebx\n"
		"cpuid\n"
		"movl %%eax, (%%esi)\n"
		"movl %%ebx, 4(%%esi)\n"
		"popl %%ebx\n"
		"movl %%ecx, 8(%%esi)\n"
		"movl %%edx, 12(%%esi)\n"
#else /* !SLJIT_CONFIG_X86_32 */
		"movq %0, %%rsi\n"
		"movl (%%rsi), %%eax\n"
		"movl 8(%%rsi), %%ecx\n"
		"cpuid\n"
		"movl %%eax, (%%rsi)\n"
		"movl %%ebx, 4(%%rsi)\n"
		"movl %%ecx, 8(%%rsi)\n"
		"movl %%edx, 12(%%rsi)\n"
#endif /* SLJIT_CONFIG_X86_32 */
		:
		: "r" (info)
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		: "memory", "eax", "ecx", "edx", "esi"
#else /* !SLJIT_CONFIG_X86_32 */
		: "memory", "rax", "rbx", "rcx", "rdx", "rsi"
#endif /* SLJIT_CONFIG_X86_32 */
	);

#else /* _MSC_VER < 1400 */

	/* Intel syntax. */
	__asm {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		mov esi, info
		mov eax, [esi]
		mov ecx, [esi + 8]
		cpuid
		mov [esi], eax
		mov [esi + 4], ebx
		mov [esi + 8], ecx
		mov [esi + 12], edx
#else /* !SLJIT_CONFIG_X86_32 */
		mov rsi, info
		mov eax, [rsi]
		mov ecx, [rsi + 8]
		cpuid
		mov [rsi], eax
		mov [rsi + 4], ebx
		mov [rsi + 8], ecx
		mov [rsi + 12], edx
#endif /* SLJIT_CONFIG_X86_32 */
	}

#endif /* _MSC_VER && _MSC_VER >= 1400 */
}

static sljit_u32 execute_get_xcr0_low(void)
{
	sljit_u32 xcr0;

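	/* XGETBV with ECX = 0 reads the XCR0 register; only its low 32 bits,
	   which contain the SSE and AVX state enable flags, are needed here. */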
#if defined(_MSC_VER) && _MSC_VER >= 1400

	xcr0 = (sljit_u32)_xgetbv(0);

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)

	/* AT&T syntax. */
	__asm__ (
		"xorl %%ecx, %%ecx\n"
		"xgetbv\n"
		: "=a" (xcr0)
		:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		: "ecx", "edx"
#else /* !SLJIT_CONFIG_X86_32 */
		: "rcx", "rdx"
#endif /* SLJIT_CONFIG_X86_32 */
	);

#else /* _MSC_VER < 1400 */

	/* Intel syntax. */
	__asm {
		mov ecx, 0
		xgetbv
		mov xcr0, eax
	}

#endif /* _MSC_VER && _MSC_VER >= 1400 */
	return xcr0;
}

static void get_cpu_features(void)
{
	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
	sljit_u32 info[4] = {0};
	sljit_u32 max_id;

	execute_cpu_id(info);
	max_id = info[0];

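	/* Leaf 7, sub-leaf 0 (structured extended feature flags):
	   EBX bit 3 is BMI1 (which provides TZCNT), EBX bit 5 is AVX2. */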
	if (max_id >= 7) {
		info[0] = 7;
		info[2] = 0;
		execute_cpu_id(info);

		if (info[1] & 0x8)
			feature_list |= CPU_FEATURE_TZCNT;
		if (info[1] & 0x20)
			feature_list |= CPU_FEATURE_AVX2;
	}

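	/* Leaf 1 (basic feature flags): ECX bit 19 is SSE4.1, bit 27 is OSXSAVE,
	   bit 28 is AVX; EDX bit 26 is SSE2 and bit 15 is CMOV. */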
	if (max_id >= 1) {
		info[0] = 1;
		execute_cpu_id(info);

		if (info[2] & 0x80000)
			feature_list |= CPU_FEATURE_SSE41;
		if (info[2] & 0x8000000)
			feature_list |= CPU_FEATURE_OSXSAVE;
		if (info[2] & 0x10000000)
			feature_list |= CPU_FEATURE_AVX;
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
		if (info[3] & 0x4000000)
			feature_list |= CPU_FEATURE_SSE2;
#endif
		if (info[3] & 0x8000)
			feature_list |= CPU_FEATURE_CMOV;
	}

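	/* Extended leaf 0x80000001: ECX bit 5 is LZCNT (ABM). */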
	info[0] = 0x80000001;
	execute_cpu_id(info);

	if (info[2] & 0x20)
		feature_list |= CPU_FEATURE_LZCNT;

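	/* AVX and AVX2 are only usable if the OS has enabled saving the YMM
	   state, which is indicated by bit 2 of XCR0. */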
	if ((feature_list & CPU_FEATURE_OSXSAVE) && (execute_get_xcr0_low() & 0x4) == 0)
		feature_list &= ~(sljit_u32)(CPU_FEATURE_AVX | CPU_FEATURE_AVX2);

	cpu_feature_list = feature_list;
}

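/* Returns the second byte of the two byte (0x0f prefixed) near conditional
   jump opcode; subtracting 0x10 from it yields the one byte short form. */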
static sljit_u8 get_jump_code(sljit_uw type)
{
	switch (type) {
	case SLJIT_EQUAL:
	case SLJIT_ATOMIC_STORED:
	case SLJIT_F_EQUAL:
	case SLJIT_UNORDERED_OR_EQUAL:
		return 0x84 /* je */;

	case SLJIT_NOT_EQUAL:
	case SLJIT_ATOMIC_NOT_STORED:
	case SLJIT_F_NOT_EQUAL:
	case SLJIT_ORDERED_NOT_EQUAL:
		return 0x85 /* jne */;

	case SLJIT_LESS:
	case SLJIT_CARRY:
	case SLJIT_F_LESS:
	case SLJIT_UNORDERED_OR_LESS:
	case SLJIT_UNORDERED_OR_GREATER:
		return 0x82 /* jc */;

	case SLJIT_GREATER_EQUAL:
	case SLJIT_NOT_CARRY:
	case SLJIT_F_GREATER_EQUAL:
	case SLJIT_ORDERED_GREATER_EQUAL:
	case SLJIT_ORDERED_LESS_EQUAL:
		return 0x83 /* jae */;

	case SLJIT_GREATER:
	case SLJIT_F_GREATER:
	case SLJIT_ORDERED_LESS:
	case SLJIT_ORDERED_GREATER:
		return 0x87 /* jnbe */;

	case SLJIT_LESS_EQUAL:
	case SLJIT_F_LESS_EQUAL:
	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
	case SLJIT_UNORDERED_OR_LESS_EQUAL:
		return 0x86 /* jbe */;

	case SLJIT_SIG_LESS:
		return 0x8c /* jl */;

	case SLJIT_SIG_GREATER_EQUAL:
		return 0x8d /* jnl */;

	case SLJIT_SIG_GREATER:
		return 0x8f /* jnle */;

	case SLJIT_SIG_LESS_EQUAL:
		return 0x8e /* jle */;

	case SLJIT_OVERFLOW:
		return 0x80 /* jo */;

	case SLJIT_NOT_OVERFLOW:
		return 0x81 /* jno */;

	case SLJIT_UNORDERED:
	case SLJIT_ORDERED_EQUAL: /* NaN. */
		return 0x8a /* jp */;

	case SLJIT_ORDERED:
	case SLJIT_UNORDERED_OR_NOT_EQUAL: /* Not NaN. */
		return 0x8b /* jpo */;
	}
	return 0;
}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
static sljit_u8* detect_far_jump_type(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_sw executable_offset);
#else /* !SLJIT_CONFIG_X86_32 */
static sljit_u8* detect_far_jump_type(struct sljit_jump *jump, sljit_u8 *code_ptr);
static sljit_u8* generate_mov_addr_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset);
#endif /* SLJIT_CONFIG_X86_32 */

static sljit_u8* detect_near_jump_type(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset)
{
	sljit_uw type = jump->flags >> TYPE_SHIFT;
	sljit_s32 short_jump;
	sljit_uw label_addr;

	if (jump->flags & JUMP_ADDR)
		label_addr = jump->u.target - (sljit_uw)executable_offset;
	else
		label_addr = (sljit_uw)(code + jump->u.label->size);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if ((sljit_sw)(label_addr - (sljit_uw)(code_ptr + 6)) > HALFWORD_MAX || (sljit_sw)(label_addr - (sljit_uw)(code_ptr + 5)) < HALFWORD_MIN)
		return detect_far_jump_type(jump, code_ptr);
#endif /* SLJIT_CONFIG_X86_64 */

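	/* A short jump is two bytes long: a one byte opcode followed by an 8 bit
	   displacement measured from the end of the instruction. */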
	short_jump = (sljit_sw)(label_addr - (sljit_uw)(code_ptr + 2)) >= -0x80 && (sljit_sw)(label_addr - (sljit_uw)(code_ptr + 2)) <= 0x7f;

	if (type == SLJIT_JUMP) {
		if (short_jump)
			*code_ptr++ = JMP_i8;
		else
			*code_ptr++ = JMP_i32;
	} else if (type > SLJIT_JUMP) {
		short_jump = 0;
		*code_ptr++ = CALL_i32;
	} else if (short_jump) {
		*code_ptr++ = U8(get_jump_code(type) - 0x10);
	} else {
		*code_ptr++ = GROUP_0F;
		*code_ptr++ = get_jump_code(type);
	}

	jump->addr = (sljit_uw)code_ptr;

	if (short_jump) {
		jump->flags |= PATCH_MB;
		code_ptr += sizeof(sljit_s8);
	} else {
		jump->flags |= PATCH_MW;
		code_ptr += sizeof(sljit_s32);
	}

	return code_ptr;
}

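/* Patches the displacement or absolute address of an already emitted jump,
   call or mov_addr instruction once the final addresses are known. */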
static void generate_jump_or_mov_addr(struct sljit_jump *jump, sljit_sw executable_offset)
{
	sljit_uw flags = jump->flags;
	sljit_uw addr = (flags & JUMP_ADDR) ? jump->u.target : jump->u.label->u.addr;
	sljit_uw jump_addr = jump->addr;
	SLJIT_UNUSED_ARG(executable_offset);

	if (SLJIT_UNLIKELY(flags & JUMP_MOV_ADDR)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		sljit_unaligned_store_sw((void*)(jump_addr - sizeof(sljit_sw)), (sljit_sw)addr);
#else /* SLJIT_CONFIG_X86_32 */
		if (flags & PATCH_MD) {
			SLJIT_ASSERT(addr > HALFWORD_MAX);
			sljit_unaligned_store_sw((void*)(jump_addr - sizeof(sljit_sw)), (sljit_sw)addr);
			return;
		}

		if (flags & PATCH_MW) {
			addr -= (sljit_uw)SLJIT_ADD_EXEC_OFFSET((sljit_u8*)jump_addr, executable_offset);
			SLJIT_ASSERT((sljit_sw)addr <= HALFWORD_MAX && (sljit_sw)addr >= HALFWORD_MIN);
		} else {
			SLJIT_ASSERT(addr <= HALFWORD_MAX);
		}
		sljit_unaligned_store_s32((void*)(jump_addr - sizeof(sljit_s32)), (sljit_s32)addr);
#endif /* !SLJIT_CONFIG_X86_32 */
		return;
	}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (SLJIT_UNLIKELY(flags & PATCH_MD)) {
		SLJIT_ASSERT(!(flags & JUMP_ADDR));
		sljit_unaligned_store_sw((void*)jump_addr, (sljit_sw)addr);
		return;
	}
#endif /* SLJIT_CONFIG_X86_64 */

	addr -= (sljit_uw)SLJIT_ADD_EXEC_OFFSET((sljit_u8*)jump_addr, executable_offset);

	if (flags & PATCH_MB) {
		addr -= sizeof(sljit_s8);
		SLJIT_ASSERT((sljit_sw)addr <= 0x7f && (sljit_sw)addr >= -0x80);
		*(sljit_u8*)jump_addr = U8(addr);
		return;
	} else if (flags & PATCH_MW) {
		addr -= sizeof(sljit_s32);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		sljit_unaligned_store_sw((void*)jump_addr, (sljit_sw)addr);
#else /* !SLJIT_CONFIG_X86_32 */
		SLJIT_ASSERT((sljit_sw)addr <= HALFWORD_MAX && (sljit_sw)addr >= HALFWORD_MIN);
		sljit_unaligned_store_s32((void*)jump_addr, (sljit_s32)addr);
#endif /* SLJIT_CONFIG_X86_32 */
	}
}

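/* Estimates how many bytes can be saved by emitting short jump and address
   forms, and decreases the label sizes and compiler->size accordingly, so
   the second pass can allocate a tighter executable buffer. */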
reduce_code_size(struct sljit_compiler * compiler)757*22dc650dSSadaf Ebrahimi static void reduce_code_size(struct sljit_compiler *compiler)
758*22dc650dSSadaf Ebrahimi {
759*22dc650dSSadaf Ebrahimi 	struct sljit_label *label;
760*22dc650dSSadaf Ebrahimi 	struct sljit_jump *jump;
761*22dc650dSSadaf Ebrahimi 	sljit_uw next_label_size;
762*22dc650dSSadaf Ebrahimi 	sljit_uw next_jump_addr;
763*22dc650dSSadaf Ebrahimi 	sljit_uw next_min_addr;
764*22dc650dSSadaf Ebrahimi 	sljit_uw size_reduce = 0;
765*22dc650dSSadaf Ebrahimi 	sljit_sw diff;
766*22dc650dSSadaf Ebrahimi 	sljit_uw type;
767*22dc650dSSadaf Ebrahimi #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
768*22dc650dSSadaf Ebrahimi 	sljit_uw size_reduce_max;
769*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DEBUG */
770*22dc650dSSadaf Ebrahimi 
771*22dc650dSSadaf Ebrahimi 	label = compiler->labels;
772*22dc650dSSadaf Ebrahimi 	jump = compiler->jumps;
773*22dc650dSSadaf Ebrahimi 
774*22dc650dSSadaf Ebrahimi 	next_label_size = SLJIT_GET_NEXT_SIZE(label);
775*22dc650dSSadaf Ebrahimi 	next_jump_addr = SLJIT_GET_NEXT_ADDRESS(jump);
776*22dc650dSSadaf Ebrahimi 
777*22dc650dSSadaf Ebrahimi 	while (1) {
778*22dc650dSSadaf Ebrahimi 		next_min_addr = next_label_size;
779*22dc650dSSadaf Ebrahimi 		if (next_jump_addr < next_min_addr)
780*22dc650dSSadaf Ebrahimi 			next_min_addr = next_jump_addr;
781*22dc650dSSadaf Ebrahimi 
782*22dc650dSSadaf Ebrahimi 		if (next_min_addr == SLJIT_MAX_ADDRESS)
783*22dc650dSSadaf Ebrahimi 			break;
784*22dc650dSSadaf Ebrahimi 
785*22dc650dSSadaf Ebrahimi 		if (next_min_addr == next_label_size) {
786*22dc650dSSadaf Ebrahimi 			label->size -= size_reduce;
787*22dc650dSSadaf Ebrahimi 
788*22dc650dSSadaf Ebrahimi 			label = label->next;
789*22dc650dSSadaf Ebrahimi 			next_label_size = SLJIT_GET_NEXT_SIZE(label);
790*22dc650dSSadaf Ebrahimi 		}
791*22dc650dSSadaf Ebrahimi 
792*22dc650dSSadaf Ebrahimi 		if (next_min_addr != next_jump_addr)
793*22dc650dSSadaf Ebrahimi 			continue;
794*22dc650dSSadaf Ebrahimi 
795*22dc650dSSadaf Ebrahimi 		if (!(jump->flags & JUMP_MOV_ADDR)) {
796*22dc650dSSadaf Ebrahimi #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
797*22dc650dSSadaf Ebrahimi 			size_reduce_max = size_reduce + (((jump->flags >> TYPE_SHIFT) < SLJIT_JUMP) ? CJUMP_MAX_SIZE : JUMP_MAX_SIZE);
798*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DEBUG */
799*22dc650dSSadaf Ebrahimi 
800*22dc650dSSadaf Ebrahimi 			if (!(jump->flags & SLJIT_REWRITABLE_JUMP)) {
801*22dc650dSSadaf Ebrahimi 				if (jump->flags & JUMP_ADDR) {
802*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
803*22dc650dSSadaf Ebrahimi 					if (jump->u.target <= 0xffffffffl)
804*22dc650dSSadaf Ebrahimi 						size_reduce += sizeof(sljit_s32);
805*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
806*22dc650dSSadaf Ebrahimi 				} else {
807*22dc650dSSadaf Ebrahimi 					/* Unit size: instruction. */
808*22dc650dSSadaf Ebrahimi 					diff = (sljit_sw)jump->u.label->size - (sljit_sw)(jump->addr - size_reduce);
809*22dc650dSSadaf Ebrahimi 					type = jump->flags >> TYPE_SHIFT;
810*22dc650dSSadaf Ebrahimi 
811*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
812*22dc650dSSadaf Ebrahimi 					if (type == SLJIT_JUMP) {
813*22dc650dSSadaf Ebrahimi 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
814*22dc650dSSadaf Ebrahimi 							size_reduce += JUMP_MAX_SIZE - 2;
815*22dc650dSSadaf Ebrahimi 						else if (diff <= HALFWORD_MAX + 5 && diff >= HALFWORD_MIN + 5)
816*22dc650dSSadaf Ebrahimi 							size_reduce += JUMP_MAX_SIZE - 5;
817*22dc650dSSadaf Ebrahimi 					} else if (type < SLJIT_JUMP) {
818*22dc650dSSadaf Ebrahimi 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
819*22dc650dSSadaf Ebrahimi 							size_reduce += CJUMP_MAX_SIZE - 2;
820*22dc650dSSadaf Ebrahimi 						else if (diff <= HALFWORD_MAX + 6 && diff >= HALFWORD_MIN + 6)
821*22dc650dSSadaf Ebrahimi 							size_reduce += CJUMP_MAX_SIZE - 6;
822*22dc650dSSadaf Ebrahimi 					} else  {
823*22dc650dSSadaf Ebrahimi 						if (diff <= HALFWORD_MAX + 5 && diff >= HALFWORD_MIN + 5)
824*22dc650dSSadaf Ebrahimi 							size_reduce += JUMP_MAX_SIZE - 5;
825*22dc650dSSadaf Ebrahimi 					}
826*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
827*22dc650dSSadaf Ebrahimi 					if (type == SLJIT_JUMP) {
828*22dc650dSSadaf Ebrahimi 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
829*22dc650dSSadaf Ebrahimi 							size_reduce += JUMP_MAX_SIZE - 2;
830*22dc650dSSadaf Ebrahimi 					} else if (type < SLJIT_JUMP) {
831*22dc650dSSadaf Ebrahimi 						if (diff <= 0x7f + 2 && diff >= -0x80 + 2)
832*22dc650dSSadaf Ebrahimi 							size_reduce += CJUMP_MAX_SIZE - 2;
833*22dc650dSSadaf Ebrahimi 					}
834*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
835*22dc650dSSadaf Ebrahimi 				}
836*22dc650dSSadaf Ebrahimi 			}
837*22dc650dSSadaf Ebrahimi 
838*22dc650dSSadaf Ebrahimi #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
839*22dc650dSSadaf Ebrahimi 			jump->flags |= (size_reduce_max - size_reduce) << JUMP_SIZE_SHIFT;
840*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DEBUG */
841*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
842*22dc650dSSadaf Ebrahimi 		} else {
843*22dc650dSSadaf Ebrahimi #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
844*22dc650dSSadaf Ebrahimi 			size_reduce_max = size_reduce + 10;
845*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DEBUG */
846*22dc650dSSadaf Ebrahimi 
847*22dc650dSSadaf Ebrahimi 			if (!(jump->flags & JUMP_ADDR)) {
848*22dc650dSSadaf Ebrahimi 				diff = (sljit_sw)jump->u.label->size - (sljit_sw)(jump->addr - size_reduce - 3);
849*22dc650dSSadaf Ebrahimi 
850*22dc650dSSadaf Ebrahimi 				if (diff <= HALFWORD_MAX && diff >= HALFWORD_MIN)
851*22dc650dSSadaf Ebrahimi 					size_reduce += 3;
852*22dc650dSSadaf Ebrahimi 			} else if (jump->u.target <= 0xffffffffl)
853*22dc650dSSadaf Ebrahimi 				size_reduce += (jump->flags & MOV_ADDR_HI) ? 4 : 5;
854*22dc650dSSadaf Ebrahimi 
855*22dc650dSSadaf Ebrahimi #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
856*22dc650dSSadaf Ebrahimi 			jump->flags |= (size_reduce_max - size_reduce) << JUMP_SIZE_SHIFT;
857*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DEBUG */
858*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
859*22dc650dSSadaf Ebrahimi 		}
860*22dc650dSSadaf Ebrahimi 
861*22dc650dSSadaf Ebrahimi 		jump = jump->next;
862*22dc650dSSadaf Ebrahimi 		next_jump_addr = SLJIT_GET_NEXT_ADDRESS(jump);
863*22dc650dSSadaf Ebrahimi 	}
864*22dc650dSSadaf Ebrahimi 
865*22dc650dSSadaf Ebrahimi 	compiler->size -= size_reduce;
866*22dc650dSSadaf Ebrahimi }
867*22dc650dSSadaf Ebrahimi 
sljit_generate_code(struct sljit_compiler * compiler,sljit_s32 options,void * exec_allocator_data)868*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler, sljit_s32 options, void *exec_allocator_data)
869*22dc650dSSadaf Ebrahimi {
870*22dc650dSSadaf Ebrahimi 	struct sljit_memory_fragment *buf;
871*22dc650dSSadaf Ebrahimi 	sljit_u8 *code;
872*22dc650dSSadaf Ebrahimi 	sljit_u8 *code_ptr;
873*22dc650dSSadaf Ebrahimi 	sljit_u8 *buf_ptr;
874*22dc650dSSadaf Ebrahimi 	sljit_u8 *buf_end;
875*22dc650dSSadaf Ebrahimi 	sljit_u8 len;
876*22dc650dSSadaf Ebrahimi 	sljit_sw executable_offset;
877*22dc650dSSadaf Ebrahimi #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
878*22dc650dSSadaf Ebrahimi 	sljit_uw addr;
879*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DEBUG */
880*22dc650dSSadaf Ebrahimi 
881*22dc650dSSadaf Ebrahimi 	struct sljit_label *label;
882*22dc650dSSadaf Ebrahimi 	struct sljit_jump *jump;
883*22dc650dSSadaf Ebrahimi 	struct sljit_const *const_;
884*22dc650dSSadaf Ebrahimi 
885*22dc650dSSadaf Ebrahimi 	CHECK_ERROR_PTR();
886*22dc650dSSadaf Ebrahimi 	CHECK_PTR(check_sljit_generate_code(compiler));
887*22dc650dSSadaf Ebrahimi 
888*22dc650dSSadaf Ebrahimi 	reduce_code_size(compiler);
889*22dc650dSSadaf Ebrahimi 
890*22dc650dSSadaf Ebrahimi 	/* Second code generation pass. */
891*22dc650dSSadaf Ebrahimi 	code = (sljit_u8*)allocate_executable_memory(compiler->size, options, exec_allocator_data, &executable_offset);
892*22dc650dSSadaf Ebrahimi 	PTR_FAIL_WITH_EXEC_IF(code);
893*22dc650dSSadaf Ebrahimi 
894*22dc650dSSadaf Ebrahimi 	reverse_buf(compiler);
895*22dc650dSSadaf Ebrahimi 	buf = compiler->buf;
896*22dc650dSSadaf Ebrahimi 
897*22dc650dSSadaf Ebrahimi 	code_ptr = code;
898*22dc650dSSadaf Ebrahimi 	label = compiler->labels;
899*22dc650dSSadaf Ebrahimi 	jump = compiler->jumps;
900*22dc650dSSadaf Ebrahimi 	const_ = compiler->consts;
901*22dc650dSSadaf Ebrahimi 
902*22dc650dSSadaf Ebrahimi 	do {
903*22dc650dSSadaf Ebrahimi 		buf_ptr = buf->memory;
904*22dc650dSSadaf Ebrahimi 		buf_end = buf_ptr + buf->used_size;
905*22dc650dSSadaf Ebrahimi 		do {
906*22dc650dSSadaf Ebrahimi 			len = *buf_ptr++;
907*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(len > 0);
908*22dc650dSSadaf Ebrahimi 			if (len < SLJIT_INST_CONST) {
909*22dc650dSSadaf Ebrahimi 				/* The code is already generated. */
910*22dc650dSSadaf Ebrahimi 				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
911*22dc650dSSadaf Ebrahimi 				code_ptr += len;
912*22dc650dSSadaf Ebrahimi 				buf_ptr += len;
913*22dc650dSSadaf Ebrahimi 			} else {
914*22dc650dSSadaf Ebrahimi 				switch (len) {
915*22dc650dSSadaf Ebrahimi 				case SLJIT_INST_LABEL:
916*22dc650dSSadaf Ebrahimi 					label->u.addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
917*22dc650dSSadaf Ebrahimi 					label->size = (sljit_uw)(code_ptr - code);
918*22dc650dSSadaf Ebrahimi 					label = label->next;
919*22dc650dSSadaf Ebrahimi 					break;
920*22dc650dSSadaf Ebrahimi 				case SLJIT_INST_JUMP:
921*22dc650dSSadaf Ebrahimi #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
922*22dc650dSSadaf Ebrahimi 					addr = (sljit_uw)code_ptr;
923*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DEBUG */
924*22dc650dSSadaf Ebrahimi 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
925*22dc650dSSadaf Ebrahimi 						code_ptr = detect_near_jump_type(jump, code_ptr, code, executable_offset);
926*22dc650dSSadaf Ebrahimi 					else {
927*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
928*22dc650dSSadaf Ebrahimi 						code_ptr = detect_far_jump_type(jump, code_ptr, executable_offset);
929*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
930*22dc650dSSadaf Ebrahimi 						code_ptr = detect_far_jump_type(jump, code_ptr);
931*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
932*22dc650dSSadaf Ebrahimi 					}
933*22dc650dSSadaf Ebrahimi 
934*22dc650dSSadaf Ebrahimi 					SLJIT_ASSERT((sljit_uw)code_ptr - addr <= ((jump->flags >> JUMP_SIZE_SHIFT) & 0x1f));
935*22dc650dSSadaf Ebrahimi 					jump = jump->next;
936*22dc650dSSadaf Ebrahimi 					break;
937*22dc650dSSadaf Ebrahimi 				case SLJIT_INST_MOV_ADDR:
938*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
939*22dc650dSSadaf Ebrahimi 					code_ptr = generate_mov_addr_code(jump, code_ptr, code, executable_offset);
940*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
941*22dc650dSSadaf Ebrahimi 					jump->addr = (sljit_uw)code_ptr;
942*22dc650dSSadaf Ebrahimi 					jump = jump->next;
943*22dc650dSSadaf Ebrahimi 					break;
944*22dc650dSSadaf Ebrahimi 				default:
945*22dc650dSSadaf Ebrahimi 					SLJIT_ASSERT(len == SLJIT_INST_CONST);
946*22dc650dSSadaf Ebrahimi 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
947*22dc650dSSadaf Ebrahimi 					const_ = const_->next;
948*22dc650dSSadaf Ebrahimi 					break;
949*22dc650dSSadaf Ebrahimi 				}
950*22dc650dSSadaf Ebrahimi 			}
951*22dc650dSSadaf Ebrahimi 		} while (buf_ptr < buf_end);
952*22dc650dSSadaf Ebrahimi 
953*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(buf_ptr == buf_end);
954*22dc650dSSadaf Ebrahimi 		buf = buf->next;
955*22dc650dSSadaf Ebrahimi 	} while (buf);
956*22dc650dSSadaf Ebrahimi 
957*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(!label);
958*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(!jump);
959*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(!const_);
960*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
961*22dc650dSSadaf Ebrahimi 
962*22dc650dSSadaf Ebrahimi 	jump = compiler->jumps;
963*22dc650dSSadaf Ebrahimi 	while (jump) {
964*22dc650dSSadaf Ebrahimi 		generate_jump_or_mov_addr(jump, executable_offset);
965*22dc650dSSadaf Ebrahimi 		jump = jump->next;
966*22dc650dSSadaf Ebrahimi 	}
967*22dc650dSSadaf Ebrahimi 
968*22dc650dSSadaf Ebrahimi 	compiler->error = SLJIT_ERR_COMPILED;
969*22dc650dSSadaf Ebrahimi 	compiler->executable_offset = executable_offset;
970*22dc650dSSadaf Ebrahimi 	compiler->executable_size = (sljit_uw)(code_ptr - code);
971*22dc650dSSadaf Ebrahimi 
972*22dc650dSSadaf Ebrahimi 	code = (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
973*22dc650dSSadaf Ebrahimi 
974*22dc650dSSadaf Ebrahimi 	SLJIT_UPDATE_WX_FLAGS(code, (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset), 1);
975*22dc650dSSadaf Ebrahimi 	return (void*)code;
976*22dc650dSSadaf Ebrahimi }
977*22dc650dSSadaf Ebrahimi 
sljit_has_cpu_feature(sljit_s32 feature_type)978*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
979*22dc650dSSadaf Ebrahimi {
980*22dc650dSSadaf Ebrahimi 	switch (feature_type) {
981*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_FPU:
982*22dc650dSSadaf Ebrahimi #ifdef SLJIT_IS_FPU_AVAILABLE
983*22dc650dSSadaf Ebrahimi 		return (SLJIT_IS_FPU_AVAILABLE) != 0;
984*22dc650dSSadaf Ebrahimi #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
985*22dc650dSSadaf Ebrahimi 		if (cpu_feature_list == 0)
986*22dc650dSSadaf Ebrahimi 			get_cpu_features();
987*22dc650dSSadaf Ebrahimi 		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
988*22dc650dSSadaf Ebrahimi #else /* SLJIT_DETECT_SSE2 */
989*22dc650dSSadaf Ebrahimi 		return 1;
990*22dc650dSSadaf Ebrahimi #endif /* SLJIT_DETECT_SSE2 */
991*22dc650dSSadaf Ebrahimi 
992*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
993*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_VIRTUAL_REGISTERS:
994*22dc650dSSadaf Ebrahimi 		return 1;
995*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
996*22dc650dSSadaf Ebrahimi 
997*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_CLZ:
998*22dc650dSSadaf Ebrahimi 		if (cpu_feature_list == 0)
999*22dc650dSSadaf Ebrahimi 			get_cpu_features();
1000*22dc650dSSadaf Ebrahimi 
1001*22dc650dSSadaf Ebrahimi 		return (cpu_feature_list & CPU_FEATURE_LZCNT) ? 1 : 2;
1002*22dc650dSSadaf Ebrahimi 
1003*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_CTZ:
1004*22dc650dSSadaf Ebrahimi 		if (cpu_feature_list == 0)
1005*22dc650dSSadaf Ebrahimi 			get_cpu_features();
1006*22dc650dSSadaf Ebrahimi 
1007*22dc650dSSadaf Ebrahimi 		return (cpu_feature_list & CPU_FEATURE_TZCNT) ? 1 : 2;
1008*22dc650dSSadaf Ebrahimi 
1009*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_CMOV:
1010*22dc650dSSadaf Ebrahimi 		if (cpu_feature_list == 0)
1011*22dc650dSSadaf Ebrahimi 			get_cpu_features();
1012*22dc650dSSadaf Ebrahimi 		return (cpu_feature_list & CPU_FEATURE_CMOV) != 0;
1013*22dc650dSSadaf Ebrahimi 
1014*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_REV:
1015*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_ROT:
1016*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_PREFETCH:
1017*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_COPY_F32:
1018*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_COPY_F64:
1019*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_ATOMIC:
1020*22dc650dSSadaf Ebrahimi 		return 1;
1021*22dc650dSSadaf Ebrahimi 
1022*22dc650dSSadaf Ebrahimi #if !(defined SLJIT_IS_FPU_AVAILABLE) || SLJIT_IS_FPU_AVAILABLE
1023*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_AVX:
1024*22dc650dSSadaf Ebrahimi 		if (cpu_feature_list == 0)
1025*22dc650dSSadaf Ebrahimi 			get_cpu_features();
1026*22dc650dSSadaf Ebrahimi 		return (cpu_feature_list & CPU_FEATURE_AVX) != 0;
1027*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_AVX2:
1028*22dc650dSSadaf Ebrahimi 		if (cpu_feature_list == 0)
1029*22dc650dSSadaf Ebrahimi 			get_cpu_features();
1030*22dc650dSSadaf Ebrahimi 		return (cpu_feature_list & CPU_FEATURE_AVX2) != 0;
1031*22dc650dSSadaf Ebrahimi 	case SLJIT_HAS_SIMD:
1032*22dc650dSSadaf Ebrahimi 		if (cpu_feature_list == 0)
1033*22dc650dSSadaf Ebrahimi 			get_cpu_features();
1034*22dc650dSSadaf Ebrahimi 		return (cpu_feature_list & CPU_FEATURE_SSE41) != 0;
1035*22dc650dSSadaf Ebrahimi #endif /* SLJIT_IS_FPU_AVAILABLE */
1036*22dc650dSSadaf Ebrahimi 	default:
1037*22dc650dSSadaf Ebrahimi 		return 0;
1038*22dc650dSSadaf Ebrahimi 	}
1039*22dc650dSSadaf Ebrahimi }
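/* Note on sljit_has_cpu_feature() above (an informal reading, not authoritative):
   cpu_feature_list is filled lazily by get_cpu_features() on the first query and
   cached afterwards.  For SLJIT_HAS_CLZ / SLJIT_HAS_CTZ the value 1 appears to mean
   a native LZCNT/TZCNT is available, while 2 signals the slower BSR/BSF emulation
   path taken in emit_clz_ctz() below. */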
1040*22dc650dSSadaf Ebrahimi 
1041*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
1042*22dc650dSSadaf Ebrahimi {
1043*22dc650dSSadaf Ebrahimi 	switch (type) {
1044*22dc650dSSadaf Ebrahimi 	case SLJIT_ORDERED_EQUAL:
1045*22dc650dSSadaf Ebrahimi 	case SLJIT_UNORDERED_OR_NOT_EQUAL:
1046*22dc650dSSadaf Ebrahimi 		return 2;
1047*22dc650dSSadaf Ebrahimi 	}
1048*22dc650dSSadaf Ebrahimi 
1049*22dc650dSSadaf Ebrahimi 	return 0;
1050*22dc650dSSadaf Ebrahimi }
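/* Background for sljit_cmp_info() above (hedged summary): with UCOMISS/UCOMISD an
   unordered (NaN) operand sets ZF, PF and CF together, so "ordered equal" cannot be
   decided from ZF alone; it needs ZF set and PF clear, i.e. an extra parity-flag
   check.  The non-zero return value presumably tells callers that these two
   comparison types cost an additional instruction. */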
1051*22dc650dSSadaf Ebrahimi 
1052*22dc650dSSadaf Ebrahimi /* --------------------------------------------------------------------- */
1053*22dc650dSSadaf Ebrahimi /*  Operators                                                            */
1054*22dc650dSSadaf Ebrahimi /* --------------------------------------------------------------------- */
1055*22dc650dSSadaf Ebrahimi 
1056*22dc650dSSadaf Ebrahimi #define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))
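/* Illustration of the BINARY_OPCODE() packing (informal sketch; emit_cum_binary()
   below performs the matching unpacking for a group such as ADD):

       op_types   = BINARY_OPCODE(ADD);
       op_eax_imm = U8(op_types >> 24);          <- short EAX, imm32 form
       op_rm      = U8((op_types >> 16) & 0xff); <- reg, r/m form
       op_mr      = U8((op_types >> 8) & 0xff);  <- r/m, reg form
       op_imm     = U8(op_types & 0xff);         <- bits OR-ed into the ModR/M byte
                                                    by BINARY_IMM32 */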
1057*22dc650dSSadaf Ebrahimi 
1058*22dc650dSSadaf Ebrahimi #define BINARY_IMM32(op_imm, immw, arg, argw) \
1059*22dc650dSSadaf Ebrahimi 	do { \
1060*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1061*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst); \
1062*22dc650dSSadaf Ebrahimi 		*(inst + 1) |= (op_imm); \
1063*22dc650dSSadaf Ebrahimi 	} while (0)
1064*22dc650dSSadaf Ebrahimi 
1065*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1066*22dc650dSSadaf Ebrahimi 
1067*22dc650dSSadaf Ebrahimi #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1068*22dc650dSSadaf Ebrahimi 	do { \
1069*22dc650dSSadaf Ebrahimi 		if (IS_HALFWORD(immw) || compiler->mode32) { \
1070*22dc650dSSadaf Ebrahimi 			BINARY_IMM32(op_imm, immw, arg, argw); \
1071*22dc650dSSadaf Ebrahimi 		} \
1072*22dc650dSSadaf Ebrahimi 		else { \
1073*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_load_imm64(compiler, FAST_IS_REG(arg) ? TMP_REG2 : TMP_REG1, immw)); \
1074*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, FAST_IS_REG(arg) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
1075*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst); \
1076*22dc650dSSadaf Ebrahimi 			*inst = (op_mr); \
1077*22dc650dSSadaf Ebrahimi 		} \
1078*22dc650dSSadaf Ebrahimi 	} while (0)
1079*22dc650dSSadaf Ebrahimi 
1080*22dc650dSSadaf Ebrahimi #define BINARY_EAX_IMM(op_eax_imm, immw) \
1081*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1082*22dc650dSSadaf Ebrahimi 
1083*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
1084*22dc650dSSadaf Ebrahimi 
1085*22dc650dSSadaf Ebrahimi #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1086*22dc650dSSadaf Ebrahimi 	BINARY_IMM32(op_imm, immw, arg, argw)
1087*22dc650dSSadaf Ebrahimi 
1088*22dc650dSSadaf Ebrahimi #define BINARY_EAX_IMM(op_eax_imm, immw) \
1089*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1090*22dc650dSSadaf Ebrahimi 
1091*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1092*22dc650dSSadaf Ebrahimi 
1093*22dc650dSSadaf Ebrahimi static sljit_s32 emit_byte(struct sljit_compiler *compiler, sljit_u8 byte)
1094*22dc650dSSadaf Ebrahimi {
1095*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1096*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1097*22dc650dSSadaf Ebrahimi 	INC_SIZE(1);
1098*22dc650dSSadaf Ebrahimi 	*inst = byte;
1099*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1100*22dc650dSSadaf Ebrahimi }
1101*22dc650dSSadaf Ebrahimi 
1102*22dc650dSSadaf Ebrahimi static sljit_s32 emit_mov(struct sljit_compiler *compiler,
1103*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1104*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw);
1105*22dc650dSSadaf Ebrahimi 
1106*22dc650dSSadaf Ebrahimi #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
1107*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
1108*22dc650dSSadaf Ebrahimi 
1109*22dc650dSSadaf Ebrahimi static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
1110*22dc650dSSadaf Ebrahimi 	sljit_uw op,
1111*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
1112*22dc650dSSadaf Ebrahimi 
1113*22dc650dSSadaf Ebrahimi static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
1114*22dc650dSSadaf Ebrahimi 	sljit_uw op,
1115*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
1116*22dc650dSSadaf Ebrahimi 
1117*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
1118*22dc650dSSadaf Ebrahimi 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);
1119*22dc650dSSadaf Ebrahimi 
1120*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
1121*22dc650dSSadaf Ebrahimi 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
1122*22dc650dSSadaf Ebrahimi 
1123*22dc650dSSadaf Ebrahimi static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1124*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
1125*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w);
1126*22dc650dSSadaf Ebrahimi 
1127*22dc650dSSadaf Ebrahimi static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
1128*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_reg,
1129*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw);
1130*22dc650dSSadaf Ebrahimi 
1131*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
1132*22dc650dSSadaf Ebrahimi {
1133*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET)
1134*22dc650dSSadaf Ebrahimi 	/* Emit endbr32/endbr64 when CET is enabled.  */
1135*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
1136*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1137*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1138*22dc650dSSadaf Ebrahimi 	INC_SIZE(4);
1139*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_F3;
1140*22dc650dSSadaf Ebrahimi 	inst[1] = GROUP_0F;
1141*22dc650dSSadaf Ebrahimi 	inst[2] = 0x1e;
1142*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1143*22dc650dSSadaf Ebrahimi 	inst[3] = 0xfb;
1144*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
1145*22dc650dSSadaf Ebrahimi 	inst[3] = 0xfa;
1146*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1147*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_CET */
1148*22dc650dSSadaf Ebrahimi 	SLJIT_UNUSED_ARG(compiler);
1149*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_CET */
1150*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1151*22dc650dSSadaf Ebrahimi }
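/* The four bytes emitted above encode ENDBR32 (F3 0F 1E FB) on 32-bit targets and
   ENDBR64 (F3 0F 1E FA) on 64-bit targets.  Both decode as a NOP on CPUs without
   CET, and when SLJIT_CONFIG_X86_CET is not set the helper is a no-op, so callers
   can invoke it unconditionally. */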
1152*22dc650dSSadaf Ebrahimi 
1153*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
1154*22dc650dSSadaf Ebrahimi 
1155*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 emit_rdssp(struct sljit_compiler *compiler, sljit_s32 reg)
1156*22dc650dSSadaf Ebrahimi {
1157*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
1158*22dc650dSSadaf Ebrahimi 	sljit_s32 size;
1159*22dc650dSSadaf Ebrahimi 
1160*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1161*22dc650dSSadaf Ebrahimi 	size = 5;
1162*22dc650dSSadaf Ebrahimi #else
1163*22dc650dSSadaf Ebrahimi 	size = 4;
1164*22dc650dSSadaf Ebrahimi #endif
1165*22dc650dSSadaf Ebrahimi 
1166*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1167*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1168*22dc650dSSadaf Ebrahimi 	INC_SIZE(size);
1169*22dc650dSSadaf Ebrahimi 	*inst++ = GROUP_F3;
1170*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1171*22dc650dSSadaf Ebrahimi 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
1172*22dc650dSSadaf Ebrahimi #endif
1173*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
1174*22dc650dSSadaf Ebrahimi 	inst[1] = 0x1e;
1175*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1176*22dc650dSSadaf Ebrahimi 	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_lmap[reg]);
1177*22dc650dSSadaf Ebrahimi #else
1178*22dc650dSSadaf Ebrahimi 	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_map[reg]);
1179*22dc650dSSadaf Ebrahimi #endif
1180*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1181*22dc650dSSadaf Ebrahimi }
1182*22dc650dSSadaf Ebrahimi 
1183*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 emit_incssp(struct sljit_compiler *compiler, sljit_s32 reg)
1184*22dc650dSSadaf Ebrahimi {
1185*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
1186*22dc650dSSadaf Ebrahimi 	sljit_s32 size;
1187*22dc650dSSadaf Ebrahimi 
1188*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1189*22dc650dSSadaf Ebrahimi 	size = 5;
1190*22dc650dSSadaf Ebrahimi #else
1191*22dc650dSSadaf Ebrahimi 	size = 4;
1192*22dc650dSSadaf Ebrahimi #endif
1193*22dc650dSSadaf Ebrahimi 
1194*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1195*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1196*22dc650dSSadaf Ebrahimi 	INC_SIZE(size);
1197*22dc650dSSadaf Ebrahimi 	*inst++ = GROUP_F3;
1198*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1199*22dc650dSSadaf Ebrahimi 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
1200*22dc650dSSadaf Ebrahimi #endif
1201*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
1202*22dc650dSSadaf Ebrahimi 	inst[1] = 0xae;
1203*22dc650dSSadaf Ebrahimi 	inst[2] = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
1204*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1205*22dc650dSSadaf Ebrahimi }
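/* Informal note on the two helpers above: the bytes match RDSSPD/RDSSPQ
   (F3 [REX.W] 0F 1E with /1 in the ModR/M reg field) and INCSSPD/INCSSPQ
   (F3 [REX.W] 0F AE with /5), which read and advance the CET shadow stack
   pointer respectively. */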
1206*22dc650dSSadaf Ebrahimi 
1207*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
1208*22dc650dSSadaf Ebrahimi 
1209*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 cpu_has_shadow_stack(void)
1210*22dc650dSSadaf Ebrahimi {
1211*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
1212*22dc650dSSadaf Ebrahimi 	return _get_ssp() != 0;
1213*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
1214*22dc650dSSadaf Ebrahimi 	return 0;
1215*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
1216*22dc650dSSadaf Ebrahimi }
1217*22dc650dSSadaf Ebrahimi 
1218*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compiler,
1219*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1220*22dc650dSSadaf Ebrahimi {
1221*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
1222*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst, *jz_after_cmp_inst;
1223*22dc650dSSadaf Ebrahimi 	sljit_uw size_jz_after_cmp_inst;
1224*22dc650dSSadaf Ebrahimi 
1225*22dc650dSSadaf Ebrahimi 	sljit_uw size_before_rdssp_inst = compiler->size;
1226*22dc650dSSadaf Ebrahimi 
1227*22dc650dSSadaf Ebrahimi 	/* Generate "RDSSP TMP_REG1". */
1228*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_rdssp(compiler, TMP_REG1));
1229*22dc650dSSadaf Ebrahimi 
1230*22dc650dSSadaf Ebrahimi 	/* Load return address on shadow stack into TMP_REG1. */
1231*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(TMP_REG1), 0);
1232*22dc650dSSadaf Ebrahimi 
1233*22dc650dSSadaf Ebrahimi 	/* Compare return address against TMP_REG1. */
1234*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_cmp_binary (compiler, TMP_REG1, 0, src, srcw));
1235*22dc650dSSadaf Ebrahimi 
1236*22dc650dSSadaf Ebrahimi 	/* Generate JZ to skip shadow stack adjustment when shadow
1237*22dc650dSSadaf Ebrahimi 	   stack matches normal stack. */
1238*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1239*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1240*22dc650dSSadaf Ebrahimi 	INC_SIZE(2);
1241*22dc650dSSadaf Ebrahimi 	*inst++ = get_jump_code(SLJIT_EQUAL) - 0x10;
1242*22dc650dSSadaf Ebrahimi 	size_jz_after_cmp_inst = compiler->size;
1243*22dc650dSSadaf Ebrahimi 	jz_after_cmp_inst = inst;
1244*22dc650dSSadaf Ebrahimi 
1245*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1246*22dc650dSSadaf Ebrahimi 	/* REX_W is not necessary. */
1247*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
1248*22dc650dSSadaf Ebrahimi #endif
1249*22dc650dSSadaf Ebrahimi 	/* Load 1 into TMP_REG1. */
1250*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
1251*22dc650dSSadaf Ebrahimi 
1252*22dc650dSSadaf Ebrahimi 	/* Generate "INCSSP TMP_REG1". */
1253*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_incssp(compiler, TMP_REG1));
1254*22dc650dSSadaf Ebrahimi 
1255*22dc650dSSadaf Ebrahimi 	/* Jump back to "RDSSP TMP_REG1" to check shadow stack again. */
1256*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1257*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1258*22dc650dSSadaf Ebrahimi 	INC_SIZE(2);
1259*22dc650dSSadaf Ebrahimi 	inst[0] = JMP_i8;
1260*22dc650dSSadaf Ebrahimi 	inst[1] = size_before_rdssp_inst - compiler->size;
1261*22dc650dSSadaf Ebrahimi 
1262*22dc650dSSadaf Ebrahimi 	*jz_after_cmp_inst = compiler->size - size_jz_after_cmp_inst;
1263*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
1264*22dc650dSSadaf Ebrahimi 	SLJIT_UNUSED_ARG(compiler);
1265*22dc650dSSadaf Ebrahimi 	SLJIT_UNUSED_ARG(src);
1266*22dc650dSSadaf Ebrahimi 	SLJIT_UNUSED_ARG(srcw);
1267*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
1268*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1269*22dc650dSSadaf Ebrahimi }
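/* Rough sketch of the sequence emitted by adjust_shadow_stack() on CET + __SHSTK__
   builds (register names are illustrative only):

       retry:  rdssp  tmp          read the shadow stack pointer
               mov    tmp, [tmp]   return address stored on the shadow stack
               cmp    tmp, src     compare with the intended return address
               jz     done
               mov    tmp, 1
               incssp tmp          pop one shadow stack entry
               jmp    retry
       done:

   so the shadow stack is unwound until it matches the normal stack. */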
1270*22dc650dSSadaf Ebrahimi 
1271*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1272*22dc650dSSadaf Ebrahimi #include "sljitNativeX86_32.c"
1273*22dc650dSSadaf Ebrahimi #else
1274*22dc650dSSadaf Ebrahimi #include "sljitNativeX86_64.c"
1275*22dc650dSSadaf Ebrahimi #endif
1276*22dc650dSSadaf Ebrahimi 
1277*22dc650dSSadaf Ebrahimi static sljit_s32 emit_mov(struct sljit_compiler *compiler,
1278*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1279*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1280*22dc650dSSadaf Ebrahimi {
1281*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1282*22dc650dSSadaf Ebrahimi 
1283*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(src)) {
1284*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
1285*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1286*22dc650dSSadaf Ebrahimi 		*inst = MOV_rm_r;
1287*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1288*22dc650dSSadaf Ebrahimi 	}
1289*22dc650dSSadaf Ebrahimi 
1290*22dc650dSSadaf Ebrahimi 	if (src == SLJIT_IMM) {
1291*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(dst)) {
1292*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1293*22dc650dSSadaf Ebrahimi 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
1294*22dc650dSSadaf Ebrahimi #else
1295*22dc650dSSadaf Ebrahimi 			if (!compiler->mode32) {
1296*22dc650dSSadaf Ebrahimi 				if (NOT_HALFWORD(srcw))
1297*22dc650dSSadaf Ebrahimi 					return emit_load_imm64(compiler, dst, srcw);
1298*22dc650dSSadaf Ebrahimi 			}
1299*22dc650dSSadaf Ebrahimi 			else
1300*22dc650dSSadaf Ebrahimi 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, U8(MOV_r_i32 | reg_lmap[dst]), srcw);
1301*22dc650dSSadaf Ebrahimi #endif
1302*22dc650dSSadaf Ebrahimi 		}
1303*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1304*22dc650dSSadaf Ebrahimi 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
1305*22dc650dSSadaf Ebrahimi 			/* Immediate to memory move. Only the SLJIT_MOV operation copies
1306*22dc650dSSadaf Ebrahimi 			   an immediate directly into memory, so TMP_REG1 can be used. */
1307*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
1308*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1309*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
1310*22dc650dSSadaf Ebrahimi 			*inst = MOV_rm_r;
1311*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
1312*22dc650dSSadaf Ebrahimi 		}
1313*22dc650dSSadaf Ebrahimi #endif
1314*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
1315*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1316*22dc650dSSadaf Ebrahimi 		*inst = MOV_rm_i32;
1317*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1318*22dc650dSSadaf Ebrahimi 	}
1319*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst)) {
1320*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
1321*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1322*22dc650dSSadaf Ebrahimi 		*inst = MOV_r_rm;
1323*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1324*22dc650dSSadaf Ebrahimi 	}
1325*22dc650dSSadaf Ebrahimi 
1326*22dc650dSSadaf Ebrahimi 	/* Memory to memory move. Only the SLJIT_MOV operation copies
1327*22dc650dSSadaf Ebrahimi 	   data from memory to memory, so TMP_REG1 can be used. */
1328*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
1329*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1330*22dc650dSSadaf Ebrahimi 	*inst = MOV_r_rm;
1331*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1332*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1333*22dc650dSSadaf Ebrahimi 	*inst = MOV_rm_r;
1334*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1335*22dc650dSSadaf Ebrahimi }
1336*22dc650dSSadaf Ebrahimi 
1337*22dc650dSSadaf Ebrahimi static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
1338*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_reg,
1339*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1340*22dc650dSSadaf Ebrahimi {
1341*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1342*22dc650dSSadaf Ebrahimi 	sljit_uw size;
1343*22dc650dSSadaf Ebrahimi 
1344*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(type >= SLJIT_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);
1345*22dc650dSSadaf Ebrahimi 
1346*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1347*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1348*22dc650dSSadaf Ebrahimi 	INC_SIZE(2);
1349*22dc650dSSadaf Ebrahimi 	inst[0] = U8(get_jump_code((sljit_uw)type ^ 0x1) - 0x10);
1350*22dc650dSSadaf Ebrahimi 
1351*22dc650dSSadaf Ebrahimi 	size = compiler->size;
1352*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, dst_reg, 0, src, srcw);
1353*22dc650dSSadaf Ebrahimi 
1354*22dc650dSSadaf Ebrahimi 	inst[1] = U8(compiler->size - size);
1355*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1356*22dc650dSSadaf Ebrahimi }
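/* emit_cmov_generic() above is the fallback used when CMOV is unavailable: it emits
   the inverted condition as a short Jcc that skips the following MOV, so the MOV
   executes only when the original condition holds.  For SLJIT_EQUAL with a register
   or memory source the bytes are roughly (a sketch, not an exact listing):

       75 xx     jne  skip          inverted condition, 8-bit displacement
       8B ..     mov  dst_reg, src
     skip:
*/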
1357*22dc650dSSadaf Ebrahimi 
1358*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
1359*22dc650dSSadaf Ebrahimi {
1360*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
1361*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1362*22dc650dSSadaf Ebrahimi 	sljit_uw size;
1363*22dc650dSSadaf Ebrahimi #endif
1364*22dc650dSSadaf Ebrahimi 
1365*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
1366*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op0(compiler, op));
1367*22dc650dSSadaf Ebrahimi 
1368*22dc650dSSadaf Ebrahimi 	switch (GET_OPCODE(op)) {
1369*22dc650dSSadaf Ebrahimi 	case SLJIT_BREAKPOINT:
1370*22dc650dSSadaf Ebrahimi 		return emit_byte(compiler, INT3);
1371*22dc650dSSadaf Ebrahimi 	case SLJIT_NOP:
1372*22dc650dSSadaf Ebrahimi 		return emit_byte(compiler, NOP);
1373*22dc650dSSadaf Ebrahimi 	case SLJIT_LMUL_UW:
1374*22dc650dSSadaf Ebrahimi 	case SLJIT_LMUL_SW:
1375*22dc650dSSadaf Ebrahimi 	case SLJIT_DIVMOD_UW:
1376*22dc650dSSadaf Ebrahimi 	case SLJIT_DIVMOD_SW:
1377*22dc650dSSadaf Ebrahimi 	case SLJIT_DIV_UW:
1378*22dc650dSSadaf Ebrahimi 	case SLJIT_DIV_SW:
1379*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1380*22dc650dSSadaf Ebrahimi #ifdef _WIN64
1381*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(
1382*22dc650dSSadaf Ebrahimi 			reg_map[SLJIT_R0] == 0
1383*22dc650dSSadaf Ebrahimi 			&& reg_map[SLJIT_R1] == 2
1384*22dc650dSSadaf Ebrahimi 			&& reg_map[TMP_REG1] > 7);
1385*22dc650dSSadaf Ebrahimi #else
1386*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(
1387*22dc650dSSadaf Ebrahimi 			reg_map[SLJIT_R0] == 0
1388*22dc650dSSadaf Ebrahimi 			&& reg_map[SLJIT_R1] < 7
1389*22dc650dSSadaf Ebrahimi 			&& reg_map[TMP_REG1] == 2);
1390*22dc650dSSadaf Ebrahimi #endif
1391*22dc650dSSadaf Ebrahimi 		compiler->mode32 = op & SLJIT_32;
1392*22dc650dSSadaf Ebrahimi #endif
1393*22dc650dSSadaf Ebrahimi 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
1394*22dc650dSSadaf Ebrahimi 
1395*22dc650dSSadaf Ebrahimi 		op = GET_OPCODE(op);
1396*22dc650dSSadaf Ebrahimi 		if ((op | 0x2) == SLJIT_DIV_UW) {
1397*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
1398*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
1399*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
1400*22dc650dSSadaf Ebrahimi #else
1401*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1402*22dc650dSSadaf Ebrahimi #endif
1403*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
1404*22dc650dSSadaf Ebrahimi 			*inst = XOR_r_rm;
1405*22dc650dSSadaf Ebrahimi 		}
1406*22dc650dSSadaf Ebrahimi 
1407*22dc650dSSadaf Ebrahimi 		if ((op | 0x2) == SLJIT_DIV_SW) {
1408*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
1409*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
1410*22dc650dSSadaf Ebrahimi #endif
1411*22dc650dSSadaf Ebrahimi 
1412*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1413*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, CDQ));
1414*22dc650dSSadaf Ebrahimi #else
1415*22dc650dSSadaf Ebrahimi 			if (!compiler->mode32) {
1416*22dc650dSSadaf Ebrahimi 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1417*22dc650dSSadaf Ebrahimi 				FAIL_IF(!inst);
1418*22dc650dSSadaf Ebrahimi 				INC_SIZE(2);
1419*22dc650dSSadaf Ebrahimi 				inst[0] = REX_W;
1420*22dc650dSSadaf Ebrahimi 				inst[1] = CDQ;
1421*22dc650dSSadaf Ebrahimi 			} else
1422*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_byte(compiler, CDQ));
1423*22dc650dSSadaf Ebrahimi #endif
1424*22dc650dSSadaf Ebrahimi 		}
1425*22dc650dSSadaf Ebrahimi 
1426*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1427*22dc650dSSadaf Ebrahimi 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
1428*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1429*22dc650dSSadaf Ebrahimi 		INC_SIZE(2);
1430*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_F7;
1431*22dc650dSSadaf Ebrahimi 		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
1432*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
1433*22dc650dSSadaf Ebrahimi #ifdef _WIN64
1434*22dc650dSSadaf Ebrahimi 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
1435*22dc650dSSadaf Ebrahimi #else /* !_WIN64 */
1436*22dc650dSSadaf Ebrahimi 		size = (!compiler->mode32) ? 3 : 2;
1437*22dc650dSSadaf Ebrahimi #endif /* _WIN64 */
1438*22dc650dSSadaf Ebrahimi 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1439*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1440*22dc650dSSadaf Ebrahimi 		INC_SIZE(size);
1441*22dc650dSSadaf Ebrahimi #ifdef _WIN64
1442*22dc650dSSadaf Ebrahimi 		if (!compiler->mode32)
1443*22dc650dSSadaf Ebrahimi 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
1444*22dc650dSSadaf Ebrahimi 		else if (op >= SLJIT_DIVMOD_UW)
1445*22dc650dSSadaf Ebrahimi 			*inst++ = REX_B;
1446*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_F7;
1447*22dc650dSSadaf Ebrahimi 		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
1448*22dc650dSSadaf Ebrahimi #else /* !_WIN64 */
1449*22dc650dSSadaf Ebrahimi 		if (!compiler->mode32)
1450*22dc650dSSadaf Ebrahimi 			*inst++ = REX_W;
1451*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_F7;
1452*22dc650dSSadaf Ebrahimi 		inst[1] = MOD_REG | reg_map[SLJIT_R1];
1453*22dc650dSSadaf Ebrahimi #endif /* _WIN64 */
1454*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1455*22dc650dSSadaf Ebrahimi 		switch (op) {
1456*22dc650dSSadaf Ebrahimi 		case SLJIT_LMUL_UW:
1457*22dc650dSSadaf Ebrahimi 			inst[1] |= MUL;
1458*22dc650dSSadaf Ebrahimi 			break;
1459*22dc650dSSadaf Ebrahimi 		case SLJIT_LMUL_SW:
1460*22dc650dSSadaf Ebrahimi 			inst[1] |= IMUL;
1461*22dc650dSSadaf Ebrahimi 			break;
1462*22dc650dSSadaf Ebrahimi 		case SLJIT_DIVMOD_UW:
1463*22dc650dSSadaf Ebrahimi 		case SLJIT_DIV_UW:
1464*22dc650dSSadaf Ebrahimi 			inst[1] |= DIV;
1465*22dc650dSSadaf Ebrahimi 			break;
1466*22dc650dSSadaf Ebrahimi 		case SLJIT_DIVMOD_SW:
1467*22dc650dSSadaf Ebrahimi 		case SLJIT_DIV_SW:
1468*22dc650dSSadaf Ebrahimi 			inst[1] |= IDIV;
1469*22dc650dSSadaf Ebrahimi 			break;
1470*22dc650dSSadaf Ebrahimi 		}
1471*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
1472*22dc650dSSadaf Ebrahimi 		if (op <= SLJIT_DIVMOD_SW)
1473*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1474*22dc650dSSadaf Ebrahimi #else
1475*22dc650dSSadaf Ebrahimi 		if (op >= SLJIT_DIV_UW)
1476*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
1477*22dc650dSSadaf Ebrahimi #endif
1478*22dc650dSSadaf Ebrahimi 		break;
1479*22dc650dSSadaf Ebrahimi 	case SLJIT_ENDBR:
1480*22dc650dSSadaf Ebrahimi 		return emit_endbranch(compiler);
1481*22dc650dSSadaf Ebrahimi 	case SLJIT_SKIP_FRAMES_BEFORE_RETURN:
1482*22dc650dSSadaf Ebrahimi 		return skip_frames_before_return(compiler);
1483*22dc650dSSadaf Ebrahimi 	}
1484*22dc650dSSadaf Ebrahimi 
1485*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1486*22dc650dSSadaf Ebrahimi }
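/* Note on the LMUL/DIV cases above (a summary of standard x86 behaviour, not of
   every branch): the one-operand MUL/IMUL/DIV/IDIV forms implicitly use EDX:EAX
   (RDX:RAX), which is why the SLJIT_R0/SLJIT_R1/TMP_REG1 register mappings are
   asserted and why the second operand is shuffled through TMP_REG1 on some targets.
   For unsigned division the high half is zeroed with a self-XOR; for signed
   division CDQ (or REX.W CDQ, i.e. CQO) sign-extends EAX/RAX into EDX/RDX first. */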
1487*22dc650dSSadaf Ebrahimi 
1488*22dc650dSSadaf Ebrahimi static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
1489*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1490*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1491*22dc650dSSadaf Ebrahimi {
1492*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1493*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r;
1494*22dc650dSSadaf Ebrahimi 
1495*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1496*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 0;
1497*22dc650dSSadaf Ebrahimi #endif
1498*22dc650dSSadaf Ebrahimi 
1499*22dc650dSSadaf Ebrahimi 	if (src == SLJIT_IMM) {
1500*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(dst)) {
1501*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1502*22dc650dSSadaf Ebrahimi 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
1503*22dc650dSSadaf Ebrahimi #else
1504*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1505*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
1506*22dc650dSSadaf Ebrahimi 			*inst = MOV_rm_i32;
1507*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
1508*22dc650dSSadaf Ebrahimi #endif
1509*22dc650dSSadaf Ebrahimi 		}
1510*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
1511*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1512*22dc650dSSadaf Ebrahimi 		*inst = MOV_rm8_i8;
1513*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1514*22dc650dSSadaf Ebrahimi 	}
1515*22dc650dSSadaf Ebrahimi 
1516*22dc650dSSadaf Ebrahimi 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1517*22dc650dSSadaf Ebrahimi 
1518*22dc650dSSadaf Ebrahimi 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
1519*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1520*22dc650dSSadaf Ebrahimi 		if (reg_map[src] >= 4) {
1521*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(dst_r == TMP_REG1);
1522*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
1523*22dc650dSSadaf Ebrahimi 		} else
1524*22dc650dSSadaf Ebrahimi 			dst_r = src;
1525*22dc650dSSadaf Ebrahimi #else
1526*22dc650dSSadaf Ebrahimi 		dst_r = src;
1527*22dc650dSSadaf Ebrahimi #endif
1528*22dc650dSSadaf Ebrahimi 	} else {
1529*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1530*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(src) && reg_map[src] >= 4) {
1531*22dc650dSSadaf Ebrahimi 			/* Both src and dst are registers. */
1532*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(FAST_IS_REG(dst));
1533*22dc650dSSadaf Ebrahimi 
1534*22dc650dSSadaf Ebrahimi 			if (src == dst && !sign) {
1535*22dc650dSSadaf Ebrahimi 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
1536*22dc650dSSadaf Ebrahimi 				FAIL_IF(!inst);
1537*22dc650dSSadaf Ebrahimi 				*(inst + 1) |= AND;
1538*22dc650dSSadaf Ebrahimi 				return SLJIT_SUCCESS;
1539*22dc650dSSadaf Ebrahimi 			}
1540*22dc650dSSadaf Ebrahimi 
1541*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
1542*22dc650dSSadaf Ebrahimi 			src = TMP_REG1;
1543*22dc650dSSadaf Ebrahimi 			srcw = 0;
1544*22dc650dSSadaf Ebrahimi 		}
1545*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1546*22dc650dSSadaf Ebrahimi 
1547*22dc650dSSadaf Ebrahimi 		/* src is either a memory address or a register with reg_map[src] < 4 on x86_32 architectures. */
1548*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, dst_r, src, srcw));
1549*22dc650dSSadaf Ebrahimi 	}
1550*22dc650dSSadaf Ebrahimi 
1551*22dc650dSSadaf Ebrahimi 	if (dst & SLJIT_MEM) {
1552*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
1553*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1554*22dc650dSSadaf Ebrahimi 		*inst = MOV_rm8_r8;
1555*22dc650dSSadaf Ebrahimi 	}
1556*22dc650dSSadaf Ebrahimi 
1557*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1558*22dc650dSSadaf Ebrahimi }
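/* Why reg_map[src] >= 4 forces a detour through TMP_REG1 above: without a REX prefix
   only EAX, ECX, EDX and EBX have encodable low-byte registers (AL/CL/DL/BL), so on
   x86-32 a value held in ESP/EBP/ESI/EDI must first be copied to a low register.
   The AND 0xff special case zero-extends such a register in place without any
   copy at all. */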
1559*22dc650dSSadaf Ebrahimi 
1560*22dc650dSSadaf Ebrahimi static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
1561*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1562*22dc650dSSadaf Ebrahimi {
1563*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1564*22dc650dSSadaf Ebrahimi 
1565*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1566*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
1567*22dc650dSSadaf Ebrahimi #endif
1568*22dc650dSSadaf Ebrahimi 
1569*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
1570*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1571*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
1572*22dc650dSSadaf Ebrahimi 	inst[1] = PREFETCH;
1573*22dc650dSSadaf Ebrahimi 
1574*22dc650dSSadaf Ebrahimi 	if (op == SLJIT_PREFETCH_L1)
1575*22dc650dSSadaf Ebrahimi 		inst[2] |= (1 << 3);
1576*22dc650dSSadaf Ebrahimi 	else if (op == SLJIT_PREFETCH_L2)
1577*22dc650dSSadaf Ebrahimi 		inst[2] |= (2 << 3);
1578*22dc650dSSadaf Ebrahimi 	else if (op == SLJIT_PREFETCH_L3)
1579*22dc650dSSadaf Ebrahimi 		inst[2] |= (3 << 3);
1580*22dc650dSSadaf Ebrahimi 
1581*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1582*22dc650dSSadaf Ebrahimi }
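/* The prefetch opcode emitted above is 0F 18, and the hint is selected by the reg
   field of the ModR/M byte: /1 = prefetcht0 (L1), /2 = prefetcht1 (L2),
   /3 = prefetcht2 (L3), while the default /0 form is prefetchnta, presumably used
   for SLJIT_PREFETCH_ONCE. */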
1583*22dc650dSSadaf Ebrahimi 
1584*22dc650dSSadaf Ebrahimi static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
1585*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1586*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1587*22dc650dSSadaf Ebrahimi {
1588*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1589*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r;
1590*22dc650dSSadaf Ebrahimi 
1591*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1592*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 0;
1593*22dc650dSSadaf Ebrahimi #endif
1594*22dc650dSSadaf Ebrahimi 
1595*22dc650dSSadaf Ebrahimi 	if (src == SLJIT_IMM) {
1596*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(dst)) {
1597*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1598*22dc650dSSadaf Ebrahimi 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
1599*22dc650dSSadaf Ebrahimi #else
1600*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
1601*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
1602*22dc650dSSadaf Ebrahimi 			*inst = MOV_rm_i32;
1603*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
1604*22dc650dSSadaf Ebrahimi #endif
1605*22dc650dSSadaf Ebrahimi 		}
1606*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
1607*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1608*22dc650dSSadaf Ebrahimi 		*inst = MOV_rm_i32;
1609*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1610*22dc650dSSadaf Ebrahimi 	}
1611*22dc650dSSadaf Ebrahimi 
1612*22dc650dSSadaf Ebrahimi 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1613*22dc650dSSadaf Ebrahimi 
1614*22dc650dSSadaf Ebrahimi 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
1615*22dc650dSSadaf Ebrahimi 		dst_r = src;
1616*22dc650dSSadaf Ebrahimi 	else
1617*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, dst_r, src, srcw));
1618*22dc650dSSadaf Ebrahimi 
1619*22dc650dSSadaf Ebrahimi 	if (dst & SLJIT_MEM) {
1620*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
1621*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1622*22dc650dSSadaf Ebrahimi 		*inst = MOV_rm_r;
1623*22dc650dSSadaf Ebrahimi 	}
1624*22dc650dSSadaf Ebrahimi 
1625*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1626*22dc650dSSadaf Ebrahimi }
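/* In emit_mov_half() above, the EX86_PREF_66 flag requests the 0x66 operand-size
   prefix, turning the regular 32-bit MOV forms into their 16-bit variants for the
   stores; loads use MOVSX/MOVZX r32, r/m16 instead. */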
1627*22dc650dSSadaf Ebrahimi 
1628*22dc650dSSadaf Ebrahimi static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
1629*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1630*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1631*22dc650dSSadaf Ebrahimi {
1632*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1633*22dc650dSSadaf Ebrahimi 
1634*22dc650dSSadaf Ebrahimi 	if (dst == src && dstw == srcw) {
1635*22dc650dSSadaf Ebrahimi 		/* Same input and output */
1636*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1637*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1638*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_F7;
1639*22dc650dSSadaf Ebrahimi 		inst[1] |= opcode;
1640*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1641*22dc650dSSadaf Ebrahimi 	}
1642*22dc650dSSadaf Ebrahimi 
1643*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst)) {
1644*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, 0, src, srcw);
1645*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
1646*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1647*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_F7;
1648*22dc650dSSadaf Ebrahimi 		inst[1] |= opcode;
1649*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1650*22dc650dSSadaf Ebrahimi 	}
1651*22dc650dSSadaf Ebrahimi 
1652*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1653*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1654*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1655*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_F7;
1656*22dc650dSSadaf Ebrahimi 	inst[1] |= opcode;
1657*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1658*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1659*22dc650dSSadaf Ebrahimi }
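/* emit_unary() above relies on the 0xF7 opcode group: the actual operation is chosen
   by the /digit in the ModR/M reg field (NOT is /2 and NEG is /3 in the base
   instruction set), so the opcode argument is presumably already the reg-field value
   shifted into position (e.g. 2 << 3 or 3 << 3) and is simply OR-ed into inst[1]. */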
1660*22dc650dSSadaf Ebrahimi 
1661*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1662*22dc650dSSadaf Ebrahimi static const sljit_sw emit_clz_arg = 32 + 31;
1663*22dc650dSSadaf Ebrahimi static const sljit_sw emit_ctz_arg = 32;
1664*22dc650dSSadaf Ebrahimi #endif
1665*22dc650dSSadaf Ebrahimi 
1666*22dc650dSSadaf Ebrahimi static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
1667*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1668*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1669*22dc650dSSadaf Ebrahimi {
1670*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1671*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r;
1672*22dc650dSSadaf Ebrahimi 	sljit_sw max;
1673*22dc650dSSadaf Ebrahimi 
1674*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(cpu_feature_list != 0);
1675*22dc650dSSadaf Ebrahimi 
1676*22dc650dSSadaf Ebrahimi 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1677*22dc650dSSadaf Ebrahimi 
1678*22dc650dSSadaf Ebrahimi 	if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
1679*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, (is_clz ? LZCNT_r_rm : TZCNT_r_rm) | EX86_PREF_F3, dst_r, src, srcw));
1680*22dc650dSSadaf Ebrahimi 
1681*22dc650dSSadaf Ebrahimi 		if (dst & SLJIT_MEM)
1682*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1683*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1684*22dc650dSSadaf Ebrahimi 	}
1685*22dc650dSSadaf Ebrahimi 
1686*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, dst_r, src, srcw));
1687*22dc650dSSadaf Ebrahimi 
1688*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1689*22dc650dSSadaf Ebrahimi 	max = is_clz ? (32 + 31) : 32;
1690*22dc650dSSadaf Ebrahimi 
1691*22dc650dSSadaf Ebrahimi 	if (cpu_feature_list & CPU_FEATURE_CMOV) {
1692*22dc650dSSadaf Ebrahimi 		if (dst_r != TMP_REG1) {
1693*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, max);
1694*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
1695*22dc650dSSadaf Ebrahimi 		}
1696*22dc650dSSadaf Ebrahimi 		else
1697*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), is_clz ? (sljit_sw)&emit_clz_arg : (sljit_sw)&emit_ctz_arg);
1698*22dc650dSSadaf Ebrahimi 
1699*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1700*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_0F;
1701*22dc650dSSadaf Ebrahimi 		inst[1] = CMOVE_r_rm;
1702*22dc650dSSadaf Ebrahimi 	}
1703*22dc650dSSadaf Ebrahimi 	else
1704*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
1705*22dc650dSSadaf Ebrahimi 
1706*22dc650dSSadaf Ebrahimi 	if (is_clz) {
1707*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
1708*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1709*22dc650dSSadaf Ebrahimi 		*(inst + 1) |= XOR;
1710*22dc650dSSadaf Ebrahimi 	}
1711*22dc650dSSadaf Ebrahimi #else
1712*22dc650dSSadaf Ebrahimi 	if (is_clz)
1713*22dc650dSSadaf Ebrahimi 		max = compiler->mode32 ? (32 + 31) : (64 + 63);
1714*22dc650dSSadaf Ebrahimi 	else
1715*22dc650dSSadaf Ebrahimi 		max = compiler->mode32 ? 32 : 64;
1716*22dc650dSSadaf Ebrahimi 
1717*22dc650dSSadaf Ebrahimi 	if (cpu_feature_list & CPU_FEATURE_CMOV) {
1718*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
1719*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, dst_r, TMP_REG2, 0));
1720*22dc650dSSadaf Ebrahimi 	} else
1721*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
1722*22dc650dSSadaf Ebrahimi 
1723*22dc650dSSadaf Ebrahimi 	if (is_clz) {
1724*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, max >> 1, dst_r, 0);
1725*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1726*22dc650dSSadaf Ebrahimi 		*(inst + 1) |= XOR;
1727*22dc650dSSadaf Ebrahimi 	}
1728*22dc650dSSadaf Ebrahimi #endif
1729*22dc650dSSadaf Ebrahimi 
1730*22dc650dSSadaf Ebrahimi 	if (dst & SLJIT_MEM)
1731*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1732*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1733*22dc650dSSadaf Ebrahimi }
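/* Worked example for the BSR/BSF fallback above (32-bit case, no LZCNT/TZCNT):
   BSR returns the index of the highest set bit and leaves the destination undefined
   for a zero input, so the CMOV (or its generic emulation) preloads 32 + 31 when ZF
   is set.  The final XOR with 31 converts the bit index into a leading-zero count:

       x = 1          ->  bsr = 0       ->  0 ^ 31  = 31  = clz(1)
       x = 0x80000000 ->  bsr = 31      ->  31 ^ 31 = 0   = clz(x)
       x = 0          ->  preloaded 63  ->  63 ^ 31 = 32  = clz(0)

   For CTZ the preloaded value is simply 32 (or 64) and no XOR is needed. */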
1734*22dc650dSSadaf Ebrahimi 
1735*22dc650dSSadaf Ebrahimi static sljit_s32 emit_bswap(struct sljit_compiler *compiler,
1736*22dc650dSSadaf Ebrahimi 	sljit_s32 op,
1737*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1738*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1739*22dc650dSSadaf Ebrahimi {
1740*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
1741*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1742*22dc650dSSadaf Ebrahimi 	sljit_uw size;
1743*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1744*22dc650dSSadaf Ebrahimi 	sljit_u8 rex = 0;
1745*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
1746*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_is_ereg = op & SLJIT_32;
1747*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1748*22dc650dSSadaf Ebrahimi 
1749*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1750*22dc650dSSadaf Ebrahimi 	if (op == SLJIT_REV_U32 || op == SLJIT_REV_S32)
1751*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 1;
1752*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
1753*22dc650dSSadaf Ebrahimi 	op &= ~SLJIT_32;
1754*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1755*22dc650dSSadaf Ebrahimi 
1756*22dc650dSSadaf Ebrahimi 	if (src != dst_r) {
1757*22dc650dSSadaf Ebrahimi 		/* Only the lower 16 bits are read for eregs. */
1758*22dc650dSSadaf Ebrahimi 		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
1759*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_mov_half(compiler, 0, dst_r, 0, src, srcw));
1760*22dc650dSSadaf Ebrahimi 		else
1761*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst_r, 0, src, srcw);
1762*22dc650dSSadaf Ebrahimi 	}
1763*22dc650dSSadaf Ebrahimi 
1764*22dc650dSSadaf Ebrahimi 	size = 2;
1765*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1766*22dc650dSSadaf Ebrahimi 	if (!compiler->mode32)
1767*22dc650dSSadaf Ebrahimi 		rex = REX_W;
1768*22dc650dSSadaf Ebrahimi 
1769*22dc650dSSadaf Ebrahimi 	if (reg_map[dst_r] >= 8)
1770*22dc650dSSadaf Ebrahimi 		rex |= REX_B;
1771*22dc650dSSadaf Ebrahimi 
1772*22dc650dSSadaf Ebrahimi 	if (rex != 0)
1773*22dc650dSSadaf Ebrahimi 		size++;
1774*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1775*22dc650dSSadaf Ebrahimi 
1776*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
1777*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
1778*22dc650dSSadaf Ebrahimi 	INC_SIZE(size);
1779*22dc650dSSadaf Ebrahimi 
1780*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1781*22dc650dSSadaf Ebrahimi 	if (rex != 0)
1782*22dc650dSSadaf Ebrahimi 		*inst++ = rex;
1783*22dc650dSSadaf Ebrahimi 
1784*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
1785*22dc650dSSadaf Ebrahimi 	inst[1] = BSWAP_r | reg_lmap[dst_r];
1786*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
1787*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
1788*22dc650dSSadaf Ebrahimi 	inst[1] = BSWAP_r | reg_map[dst_r];
1789*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1790*22dc650dSSadaf Ebrahimi 
1791*22dc650dSSadaf Ebrahimi 	if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16) {
1792*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1793*22dc650dSSadaf Ebrahimi 		size = compiler->mode32 ? 16 : 48;
1794*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
1795*22dc650dSSadaf Ebrahimi 		size = 16;
1796*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1797*22dc650dSSadaf Ebrahimi 
1798*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, (sljit_sw)size, dst_r, 0);
1799*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1800*22dc650dSSadaf Ebrahimi 		if (op == SLJIT_REV_U16)
1801*22dc650dSSadaf Ebrahimi 			inst[1] |= SHR;
1802*22dc650dSSadaf Ebrahimi 		else
1803*22dc650dSSadaf Ebrahimi 			inst[1] |= SAR;
1804*22dc650dSSadaf Ebrahimi 	}
1805*22dc650dSSadaf Ebrahimi 
1806*22dc650dSSadaf Ebrahimi 	if (dst & SLJIT_MEM) {
1807*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1808*22dc650dSSadaf Ebrahimi 		if (dst_is_ereg)
1809*22dc650dSSadaf Ebrahimi 			op = SLJIT_REV;
1810*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1811*22dc650dSSadaf Ebrahimi 		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
1812*22dc650dSSadaf Ebrahimi 			return emit_mov_half(compiler, 0, dst, dstw, TMP_REG1, 0);
1813*22dc650dSSadaf Ebrahimi 
1814*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1815*22dc650dSSadaf Ebrahimi 	}
1816*22dc650dSSadaf Ebrahimi 
1817*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1818*22dc650dSSadaf Ebrahimi 	if (op == SLJIT_REV_S32) {
1819*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
1820*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1821*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
1822*22dc650dSSadaf Ebrahimi 		*inst = MOVSXD_r_rm;
1823*22dc650dSSadaf Ebrahimi 	}
1824*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1825*22dc650dSSadaf Ebrahimi 
1826*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1827*22dc650dSSadaf Ebrahimi }
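/* Worked example for the 16-bit reversal above: BSWAP always reverses the whole
   register, so the two interesting bytes end up in the upper half and are shifted
   back down by 16 (or 48 in 64-bit mode), using SHR for the unsigned variant and
   SAR for the sign-extending one:

       0x0000ABCD  --bswap-->  0xCDAB0000  --shr 16-->  0x0000CDAB
*/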
1828*22dc650dSSadaf Ebrahimi 
1829*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
1830*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1831*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
1832*22dc650dSSadaf Ebrahimi {
1833*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1834*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_is_ereg = 0;
1835*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
1836*22dc650dSSadaf Ebrahimi 	sljit_s32 op_flags = GET_ALL_FLAGS(op);
1837*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1838*22dc650dSSadaf Ebrahimi 
1839*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
1840*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
1841*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(dst, dstw);
1842*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src, srcw);
1843*22dc650dSSadaf Ebrahimi 
1844*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
1845*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src, srcw, (void)0);
1846*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1847*22dc650dSSadaf Ebrahimi 	compiler->mode32 = op_flags & SLJIT_32;
1848*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1849*22dc650dSSadaf Ebrahimi 
1850*22dc650dSSadaf Ebrahimi 	op = GET_OPCODE(op);
1851*22dc650dSSadaf Ebrahimi 
1852*22dc650dSSadaf Ebrahimi 	if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
1853*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1854*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
1855*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1856*22dc650dSSadaf Ebrahimi 
1857*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(src) && src == dst) {
1858*22dc650dSSadaf Ebrahimi 			if (!TYPE_CAST_NEEDED(op))
1859*22dc650dSSadaf Ebrahimi 				return SLJIT_SUCCESS;
1860*22dc650dSSadaf Ebrahimi 		}
1861*22dc650dSSadaf Ebrahimi 
1862*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1863*22dc650dSSadaf Ebrahimi 		if (op_flags & SLJIT_32) {
1864*22dc650dSSadaf Ebrahimi 			if (src & SLJIT_MEM) {
1865*22dc650dSSadaf Ebrahimi 				if (op == SLJIT_MOV_S32)
1866*22dc650dSSadaf Ebrahimi 					op = SLJIT_MOV_U32;
1867*22dc650dSSadaf Ebrahimi 			}
1868*22dc650dSSadaf Ebrahimi 			else if (src == SLJIT_IMM) {
1869*22dc650dSSadaf Ebrahimi 				if (op == SLJIT_MOV_U32)
1870*22dc650dSSadaf Ebrahimi 					op = SLJIT_MOV_S32;
1871*22dc650dSSadaf Ebrahimi 			}
1872*22dc650dSSadaf Ebrahimi 		}
1873*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1874*22dc650dSSadaf Ebrahimi 
1875*22dc650dSSadaf Ebrahimi 		if (src == SLJIT_IMM) {
1876*22dc650dSSadaf Ebrahimi 			switch (op) {
1877*22dc650dSSadaf Ebrahimi 			case SLJIT_MOV_U8:
1878*22dc650dSSadaf Ebrahimi 				srcw = (sljit_u8)srcw;
1879*22dc650dSSadaf Ebrahimi 				break;
1880*22dc650dSSadaf Ebrahimi 			case SLJIT_MOV_S8:
1881*22dc650dSSadaf Ebrahimi 				srcw = (sljit_s8)srcw;
1882*22dc650dSSadaf Ebrahimi 				break;
1883*22dc650dSSadaf Ebrahimi 			case SLJIT_MOV_U16:
1884*22dc650dSSadaf Ebrahimi 				srcw = (sljit_u16)srcw;
1885*22dc650dSSadaf Ebrahimi 				break;
1886*22dc650dSSadaf Ebrahimi 			case SLJIT_MOV_S16:
1887*22dc650dSSadaf Ebrahimi 				srcw = (sljit_s16)srcw;
1888*22dc650dSSadaf Ebrahimi 				break;
1889*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1890*22dc650dSSadaf Ebrahimi 			case SLJIT_MOV_U32:
1891*22dc650dSSadaf Ebrahimi 				srcw = (sljit_u32)srcw;
1892*22dc650dSSadaf Ebrahimi 				break;
1893*22dc650dSSadaf Ebrahimi 			case SLJIT_MOV_S32:
1894*22dc650dSSadaf Ebrahimi 				srcw = (sljit_s32)srcw;
1895*22dc650dSSadaf Ebrahimi 				break;
1896*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1897*22dc650dSSadaf Ebrahimi 			}
1898*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1899*22dc650dSSadaf Ebrahimi 			if (SLJIT_UNLIKELY(dst_is_ereg))
1900*22dc650dSSadaf Ebrahimi 				return emit_mov(compiler, dst, dstw, src, srcw);
1901*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1902*22dc650dSSadaf Ebrahimi 		}
1903*22dc650dSSadaf Ebrahimi 
1904*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1905*22dc650dSSadaf Ebrahimi 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
1906*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
1907*22dc650dSSadaf Ebrahimi 			dst = TMP_REG1;
1908*22dc650dSSadaf Ebrahimi 		}
1909*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1910*22dc650dSSadaf Ebrahimi 
1911*22dc650dSSadaf Ebrahimi 		switch (op) {
1912*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV:
1913*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_P:
1914*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1915*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_U32:
1916*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_S32:
1917*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV32:
1918*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1919*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst, dstw, src, srcw);
1920*22dc650dSSadaf Ebrahimi 			break;
1921*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_U8:
1922*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
1923*22dc650dSSadaf Ebrahimi 			break;
1924*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_S8:
1925*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
1926*22dc650dSSadaf Ebrahimi 			break;
1927*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_U16:
1928*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
1929*22dc650dSSadaf Ebrahimi 			break;
1930*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_S16:
1931*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
1932*22dc650dSSadaf Ebrahimi 			break;
1933*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1934*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_U32:
1935*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
1936*22dc650dSSadaf Ebrahimi 			break;
1937*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV_S32:
1938*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
1939*22dc650dSSadaf Ebrahimi 			break;
1940*22dc650dSSadaf Ebrahimi 		case SLJIT_MOV32:
1941*22dc650dSSadaf Ebrahimi 			compiler->mode32 = 1;
1942*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst, dstw, src, srcw);
1943*22dc650dSSadaf Ebrahimi 			compiler->mode32 = 0;
1944*22dc650dSSadaf Ebrahimi 			break;
1945*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
1946*22dc650dSSadaf Ebrahimi 		}
1947*22dc650dSSadaf Ebrahimi 
1948*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1949*22dc650dSSadaf Ebrahimi 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
1950*22dc650dSSadaf Ebrahimi 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
1951*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1952*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
1953*22dc650dSSadaf Ebrahimi 	}
1954*22dc650dSSadaf Ebrahimi 
1955*22dc650dSSadaf Ebrahimi 	switch (op) {
1956*22dc650dSSadaf Ebrahimi 	case SLJIT_CLZ:
1957*22dc650dSSadaf Ebrahimi 	case SLJIT_CTZ:
1958*22dc650dSSadaf Ebrahimi 		return emit_clz_ctz(compiler, (op == SLJIT_CLZ), dst, dstw, src, srcw);
1959*22dc650dSSadaf Ebrahimi 	case SLJIT_REV:
1960*22dc650dSSadaf Ebrahimi 	case SLJIT_REV_U16:
1961*22dc650dSSadaf Ebrahimi 	case SLJIT_REV_S16:
1962*22dc650dSSadaf Ebrahimi 	case SLJIT_REV_U32:
1963*22dc650dSSadaf Ebrahimi 	case SLJIT_REV_S32:
1964*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1965*22dc650dSSadaf Ebrahimi 		if (dst_is_ereg)
1966*22dc650dSSadaf Ebrahimi 			op |= SLJIT_32;
1967*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
1968*22dc650dSSadaf Ebrahimi 		return emit_bswap(compiler, op, dst, dstw, src, srcw);
1969*22dc650dSSadaf Ebrahimi 	}
1970*22dc650dSSadaf Ebrahimi 
1971*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
1972*22dc650dSSadaf Ebrahimi }
1973*22dc650dSSadaf Ebrahimi 
1974*22dc650dSSadaf Ebrahimi static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1975*22dc650dSSadaf Ebrahimi 	sljit_u32 op_types,
1976*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
1977*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
1978*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
1979*22dc650dSSadaf Ebrahimi {
1980*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
1981*22dc650dSSadaf Ebrahimi 	sljit_u8 op_eax_imm = U8(op_types >> 24);
1982*22dc650dSSadaf Ebrahimi 	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
1983*22dc650dSSadaf Ebrahimi 	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
1984*22dc650dSSadaf Ebrahimi 	sljit_u8 op_imm = U8(op_types & 0xff);
1985*22dc650dSSadaf Ebrahimi 
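	/* The packed op_types word is assumed to follow the BINARY_OPCODE()
	   layout used by the callers in sljit_emit_op2(). Taking ADD as an
	   illustrative example (the byte values are the usual x86 encodings):

	     bits 31..24  op_eax_imm  0x05  ADD EAX, imm32 (short form)
	     bits 23..16  op_rm       0x03  ADD r, r/m
	     bits 15..8   op_mr       0x01  ADD r/m, r
	     bits  7..0   op_imm            group-1 /digit for the 0x81/0x83
	                                    immediate forms (/0 for ADD)

	   BINARY_EAX_IMM() and BINARY_IMM() below select among these encodings
	   based on the operand pattern. */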
1986*22dc650dSSadaf Ebrahimi 	if (dst == src1 && dstw == src1w) {
1987*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
1988*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1989*22dc650dSSadaf Ebrahimi 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1990*22dc650dSSadaf Ebrahimi #else
1991*22dc650dSSadaf Ebrahimi 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1992*22dc650dSSadaf Ebrahimi #endif
1993*22dc650dSSadaf Ebrahimi 				BINARY_EAX_IMM(op_eax_imm, src2w);
1994*22dc650dSSadaf Ebrahimi 			}
1995*22dc650dSSadaf Ebrahimi 			else {
1996*22dc650dSSadaf Ebrahimi 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1997*22dc650dSSadaf Ebrahimi 			}
1998*22dc650dSSadaf Ebrahimi 		}
1999*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(dst)) {
2000*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
2001*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2002*22dc650dSSadaf Ebrahimi 			*inst = op_rm;
2003*22dc650dSSadaf Ebrahimi 		}
2004*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(src2)) {
2005*22dc650dSSadaf Ebrahimi 			/* Special exception for sljit_emit_op_flags. */
2006*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
2007*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2008*22dc650dSSadaf Ebrahimi 			*inst = op_mr;
2009*22dc650dSSadaf Ebrahimi 		}
2010*22dc650dSSadaf Ebrahimi 		else {
2011*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
2012*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
2013*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2014*22dc650dSSadaf Ebrahimi 			*inst = op_mr;
2015*22dc650dSSadaf Ebrahimi 		}
2016*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2017*22dc650dSSadaf Ebrahimi 	}
2018*22dc650dSSadaf Ebrahimi 
2019*22dc650dSSadaf Ebrahimi 	/* Only for cumulative operations. */
2020*22dc650dSSadaf Ebrahimi 	if (dst == src2 && dstw == src2w) {
2021*22dc650dSSadaf Ebrahimi 		if (src1 == SLJIT_IMM) {
2022*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2023*22dc650dSSadaf Ebrahimi 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
2024*22dc650dSSadaf Ebrahimi #else
2025*22dc650dSSadaf Ebrahimi 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
2026*22dc650dSSadaf Ebrahimi #endif
2027*22dc650dSSadaf Ebrahimi 				BINARY_EAX_IMM(op_eax_imm, src1w);
2028*22dc650dSSadaf Ebrahimi 			}
2029*22dc650dSSadaf Ebrahimi 			else {
2030*22dc650dSSadaf Ebrahimi 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
2031*22dc650dSSadaf Ebrahimi 			}
2032*22dc650dSSadaf Ebrahimi 		}
2033*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(dst)) {
2034*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
2035*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2036*22dc650dSSadaf Ebrahimi 			*inst = op_rm;
2037*22dc650dSSadaf Ebrahimi 		}
2038*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(src1)) {
2039*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
2040*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2041*22dc650dSSadaf Ebrahimi 			*inst = op_mr;
2042*22dc650dSSadaf Ebrahimi 		}
2043*22dc650dSSadaf Ebrahimi 		else {
2044*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2045*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
2046*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2047*22dc650dSSadaf Ebrahimi 			*inst = op_mr;
2048*22dc650dSSadaf Ebrahimi 		}
2049*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2050*22dc650dSSadaf Ebrahimi 	}
2051*22dc650dSSadaf Ebrahimi 
2052*22dc650dSSadaf Ebrahimi 	/* General version. */
2053*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst)) {
2054*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, 0, src1, src1w);
2055*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2056*22dc650dSSadaf Ebrahimi 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
2057*22dc650dSSadaf Ebrahimi 		}
2058*22dc650dSSadaf Ebrahimi 		else {
2059*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
2060*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2061*22dc650dSSadaf Ebrahimi 			*inst = op_rm;
2062*22dc650dSSadaf Ebrahimi 		}
2063*22dc650dSSadaf Ebrahimi 	}
2064*22dc650dSSadaf Ebrahimi 	else {
2065*22dc650dSSadaf Ebrahimi 		/* This version requires fewer memory writes. */
2066*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2067*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2068*22dc650dSSadaf Ebrahimi 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
2069*22dc650dSSadaf Ebrahimi 		}
2070*22dc650dSSadaf Ebrahimi 		else {
2071*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2072*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2073*22dc650dSSadaf Ebrahimi 			*inst = op_rm;
2074*22dc650dSSadaf Ebrahimi 		}
2075*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2076*22dc650dSSadaf Ebrahimi 	}
2077*22dc650dSSadaf Ebrahimi 
2078*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2079*22dc650dSSadaf Ebrahimi }
2080*22dc650dSSadaf Ebrahimi 
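/* Cumulative here effectively means commutative: the callers in
   sljit_emit_op2() route ADD, ADC, AND, OR and XOR through emit_cum_binary(),
   so the dst == src2 case above may simply operate in place with the operands
   swapped. SUB and SBB go through emit_non_cum_binary() below, which has no
   such branch and, in its general path, also refuses to build the result
   directly in dst when dst aliases src2. */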
2081*22dc650dSSadaf Ebrahimi static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
2082*22dc650dSSadaf Ebrahimi 	sljit_u32 op_types,
2083*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
2084*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2085*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2086*22dc650dSSadaf Ebrahimi {
2087*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2088*22dc650dSSadaf Ebrahimi 	sljit_u8 op_eax_imm = U8(op_types >> 24);
2089*22dc650dSSadaf Ebrahimi 	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
2090*22dc650dSSadaf Ebrahimi 	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
2091*22dc650dSSadaf Ebrahimi 	sljit_u8 op_imm = U8(op_types & 0xff);
2092*22dc650dSSadaf Ebrahimi 
2093*22dc650dSSadaf Ebrahimi 	if (dst == src1 && dstw == src1w) {
2094*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2095*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2096*22dc650dSSadaf Ebrahimi 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2097*22dc650dSSadaf Ebrahimi #else
2098*22dc650dSSadaf Ebrahimi 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
2099*22dc650dSSadaf Ebrahimi #endif
2100*22dc650dSSadaf Ebrahimi 				BINARY_EAX_IMM(op_eax_imm, src2w);
2101*22dc650dSSadaf Ebrahimi 			}
2102*22dc650dSSadaf Ebrahimi 			else {
2103*22dc650dSSadaf Ebrahimi 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
2104*22dc650dSSadaf Ebrahimi 			}
2105*22dc650dSSadaf Ebrahimi 		}
2106*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(dst)) {
2107*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
2108*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2109*22dc650dSSadaf Ebrahimi 			*inst = op_rm;
2110*22dc650dSSadaf Ebrahimi 		}
2111*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(src2)) {
2112*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
2113*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2114*22dc650dSSadaf Ebrahimi 			*inst = op_mr;
2115*22dc650dSSadaf Ebrahimi 		}
2116*22dc650dSSadaf Ebrahimi 		else {
2117*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
2118*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
2119*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2120*22dc650dSSadaf Ebrahimi 			*inst = op_mr;
2121*22dc650dSSadaf Ebrahimi 		}
2122*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2123*22dc650dSSadaf Ebrahimi 	}
2124*22dc650dSSadaf Ebrahimi 
2125*22dc650dSSadaf Ebrahimi 	/* General version. */
2126*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst) && dst != src2) {
2127*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, 0, src1, src1w);
2128*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2129*22dc650dSSadaf Ebrahimi 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
2130*22dc650dSSadaf Ebrahimi 		}
2131*22dc650dSSadaf Ebrahimi 		else {
2132*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
2133*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2134*22dc650dSSadaf Ebrahimi 			*inst = op_rm;
2135*22dc650dSSadaf Ebrahimi 		}
2136*22dc650dSSadaf Ebrahimi 	}
2137*22dc650dSSadaf Ebrahimi 	else {
2138*22dc650dSSadaf Ebrahimi 		/* This version requires fewer memory writes. */
2139*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2140*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2141*22dc650dSSadaf Ebrahimi 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
2142*22dc650dSSadaf Ebrahimi 		}
2143*22dc650dSSadaf Ebrahimi 		else {
2144*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2145*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2146*22dc650dSSadaf Ebrahimi 			*inst = op_rm;
2147*22dc650dSSadaf Ebrahimi 		}
2148*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2149*22dc650dSSadaf Ebrahimi 	}
2150*22dc650dSSadaf Ebrahimi 
2151*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2152*22dc650dSSadaf Ebrahimi }
2153*22dc650dSSadaf Ebrahimi 
2154*22dc650dSSadaf Ebrahimi static sljit_s32 emit_mul(struct sljit_compiler *compiler,
2155*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
2156*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2157*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2158*22dc650dSSadaf Ebrahimi {
2159*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2160*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2161*22dc650dSSadaf Ebrahimi 
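	/* Three IMUL encodings are used below; the byte values are the standard
	   x86 ones and are listed only as an illustration:

	     0F AF /r    IMUL r, r/m          (IMUL_r_rm, no immediate)
	     6B /r ib    IMUL r, r/m, imm8    (IMUL_r_rm_i8, sign-extended)
	     69 /r id    IMUL r, r/m, imm32   (IMUL_r_rm_i32)

	   The imm8 form is chosen whenever the constant fits in [-128, 127]; on
	   64-bit targets a constant outside the signed 32-bit range is first
	   loaded into TMP_REG2 with emit_load_imm64(). */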
2162*22dc650dSSadaf Ebrahimi 	/* Register destination. */
2163*22dc650dSSadaf Ebrahimi 	if (dst_r == src1 && src2 != SLJIT_IMM) {
2164*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
2165*22dc650dSSadaf Ebrahimi 	} else if (dst_r == src2 && src1 != SLJIT_IMM) {
2166*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src1, src1w));
2167*22dc650dSSadaf Ebrahimi 	} else if (src1 == SLJIT_IMM) {
2168*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2169*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
2170*22dc650dSSadaf Ebrahimi 			src2 = dst_r;
2171*22dc650dSSadaf Ebrahimi 			src2w = 0;
2172*22dc650dSSadaf Ebrahimi 		}
2173*22dc650dSSadaf Ebrahimi 
2174*22dc650dSSadaf Ebrahimi 		if (src1w <= 127 && src1w >= -128) {
2175*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2176*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2177*22dc650dSSadaf Ebrahimi 			*inst = IMUL_r_rm_i8;
2178*22dc650dSSadaf Ebrahimi 
2179*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, U8(src1w)));
2180*22dc650dSSadaf Ebrahimi 		}
2181*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2182*22dc650dSSadaf Ebrahimi 		else {
2183*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2184*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2185*22dc650dSSadaf Ebrahimi 			*inst = IMUL_r_rm_i32;
2186*22dc650dSSadaf Ebrahimi 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2187*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2188*22dc650dSSadaf Ebrahimi 			INC_SIZE(4);
2189*22dc650dSSadaf Ebrahimi 			sljit_unaligned_store_sw(inst, src1w);
2190*22dc650dSSadaf Ebrahimi 		}
2191*22dc650dSSadaf Ebrahimi #else
2192*22dc650dSSadaf Ebrahimi 		else if (IS_HALFWORD(src1w)) {
2193*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
2194*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2195*22dc650dSSadaf Ebrahimi 			*inst = IMUL_r_rm_i32;
2196*22dc650dSSadaf Ebrahimi 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2197*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2198*22dc650dSSadaf Ebrahimi 			INC_SIZE(4);
2199*22dc650dSSadaf Ebrahimi 			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
2200*22dc650dSSadaf Ebrahimi 		}
2201*22dc650dSSadaf Ebrahimi 		else {
2202*22dc650dSSadaf Ebrahimi 			if (dst_r != src2)
2203*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
2204*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
2205*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
2206*22dc650dSSadaf Ebrahimi 		}
2207*22dc650dSSadaf Ebrahimi #endif
2208*22dc650dSSadaf Ebrahimi 	}
2209*22dc650dSSadaf Ebrahimi 	else if (src2 == SLJIT_IMM) {
2210*22dc650dSSadaf Ebrahimi 		/* Note: src1 is NOT immediate. */
2211*22dc650dSSadaf Ebrahimi 
2212*22dc650dSSadaf Ebrahimi 		if (src2w <= 127 && src2w >= -128) {
2213*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2214*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2215*22dc650dSSadaf Ebrahimi 			*inst = IMUL_r_rm_i8;
2216*22dc650dSSadaf Ebrahimi 
2217*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, U8(src2w)));
2218*22dc650dSSadaf Ebrahimi 		}
2219*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2220*22dc650dSSadaf Ebrahimi 		else {
2221*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2222*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2223*22dc650dSSadaf Ebrahimi 			*inst = IMUL_r_rm_i32;
2224*22dc650dSSadaf Ebrahimi 
2225*22dc650dSSadaf Ebrahimi 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2226*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2227*22dc650dSSadaf Ebrahimi 			INC_SIZE(4);
2228*22dc650dSSadaf Ebrahimi 			sljit_unaligned_store_sw(inst, src2w);
2229*22dc650dSSadaf Ebrahimi 		}
2230*22dc650dSSadaf Ebrahimi #else
2231*22dc650dSSadaf Ebrahimi 		else if (IS_HALFWORD(src2w)) {
2232*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
2233*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2234*22dc650dSSadaf Ebrahimi 			*inst = IMUL_r_rm_i32;
2235*22dc650dSSadaf Ebrahimi 
2236*22dc650dSSadaf Ebrahimi 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
2237*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2238*22dc650dSSadaf Ebrahimi 			INC_SIZE(4);
2239*22dc650dSSadaf Ebrahimi 			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
2240*22dc650dSSadaf Ebrahimi 		} else {
2241*22dc650dSSadaf Ebrahimi 			if (dst_r != src1)
2242*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
2243*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2244*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
2245*22dc650dSSadaf Ebrahimi 		}
2246*22dc650dSSadaf Ebrahimi #endif
2247*22dc650dSSadaf Ebrahimi 	} else {
2248*22dc650dSSadaf Ebrahimi 		/* Neither argument is immediate. */
2249*22dc650dSSadaf Ebrahimi 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
2250*22dc650dSSadaf Ebrahimi 			dst_r = TMP_REG1;
2251*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
2252*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
2253*22dc650dSSadaf Ebrahimi 	}
2254*22dc650dSSadaf Ebrahimi 
2255*22dc650dSSadaf Ebrahimi 	if (dst & SLJIT_MEM)
2256*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2257*22dc650dSSadaf Ebrahimi 
2258*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2259*22dc650dSSadaf Ebrahimi }
2260*22dc650dSSadaf Ebrahimi 
2261*22dc650dSSadaf Ebrahimi static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
2262*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
2263*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2264*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2265*22dc650dSSadaf Ebrahimi {
2266*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2267*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r, done = 0;
2268*22dc650dSSadaf Ebrahimi 
2269*22dc650dSSadaf Ebrahimi 	/* These cases are better left to the normal code path. */
2270*22dc650dSSadaf Ebrahimi 	if (dst == src1 && dstw == src1w)
2271*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
2272*22dc650dSSadaf Ebrahimi 	if (dst == src2 && dstw == src2w)
2273*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
2274*22dc650dSSadaf Ebrahimi 
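	/* LEA is used as a flag-preserving add here: the address [src1 + src2]
	   or [src1 + imm] is computed into dst_r without touching EFLAGS, which
	   is why sljit_emit_op2() only tries this helper when the operation does
	   not request flags. A rough sketch of the intent (register choices are
	   arbitrary):

	       lea ecx, [eax + ebx]     ; ecx = eax + ebx, flags unchanged
	       lea ecx, [eax + 12345]   ; ecx = eax + 12345
	*/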
2275*22dc650dSSadaf Ebrahimi 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2276*22dc650dSSadaf Ebrahimi 
2277*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(src1)) {
2278*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(src2)) {
2279*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
2280*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2281*22dc650dSSadaf Ebrahimi 			*inst = LEA_r_m;
2282*22dc650dSSadaf Ebrahimi 			done = 1;
2283*22dc650dSSadaf Ebrahimi 		}
2284*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2285*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src2w))) {
2286*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
2287*22dc650dSSadaf Ebrahimi #else
2288*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2289*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
2290*22dc650dSSadaf Ebrahimi #endif
2291*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2292*22dc650dSSadaf Ebrahimi 			*inst = LEA_r_m;
2293*22dc650dSSadaf Ebrahimi 			done = 1;
2294*22dc650dSSadaf Ebrahimi 		}
2295*22dc650dSSadaf Ebrahimi 	}
2296*22dc650dSSadaf Ebrahimi 	else if (FAST_IS_REG(src2)) {
2297*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2298*22dc650dSSadaf Ebrahimi 		if (src1 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src1w))) {
2299*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
2300*22dc650dSSadaf Ebrahimi #else
2301*22dc650dSSadaf Ebrahimi 		if (src1 == SLJIT_IMM) {
2302*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
2303*22dc650dSSadaf Ebrahimi #endif
2304*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2305*22dc650dSSadaf Ebrahimi 			*inst = LEA_r_m;
2306*22dc650dSSadaf Ebrahimi 			done = 1;
2307*22dc650dSSadaf Ebrahimi 		}
2308*22dc650dSSadaf Ebrahimi 	}
2309*22dc650dSSadaf Ebrahimi 
2310*22dc650dSSadaf Ebrahimi 	if (done) {
2311*22dc650dSSadaf Ebrahimi 		if (dst_r == TMP_REG1)
2312*22dc650dSSadaf Ebrahimi 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2313*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2314*22dc650dSSadaf Ebrahimi 	}
2315*22dc650dSSadaf Ebrahimi 	return SLJIT_ERR_UNSUPPORTED;
2316*22dc650dSSadaf Ebrahimi }
2317*22dc650dSSadaf Ebrahimi 
2318*22dc650dSSadaf Ebrahimi static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
2319*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2320*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2321*22dc650dSSadaf Ebrahimi {
2322*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2323*22dc650dSSadaf Ebrahimi 
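	/* CMP performs the same subtraction as SUB but only updates the flags,
	   which is why sljit_emit_op2u() maps a flag-only SLJIT_SUB to this
	   helper. The short CMP EAX, imm32 form (CMP_EAX_i32) is preferred when
	   the first operand is SLJIT_R0 and the immediate does not fit in a
	   sign-extended byte. */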
2324*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2325*22dc650dSSadaf Ebrahimi 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2326*22dc650dSSadaf Ebrahimi #else
2327*22dc650dSSadaf Ebrahimi 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
2328*22dc650dSSadaf Ebrahimi #endif
2329*22dc650dSSadaf Ebrahimi 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
2330*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2331*22dc650dSSadaf Ebrahimi 	}
2332*22dc650dSSadaf Ebrahimi 
2333*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(src1)) {
2334*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2335*22dc650dSSadaf Ebrahimi 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
2336*22dc650dSSadaf Ebrahimi 		}
2337*22dc650dSSadaf Ebrahimi 		else {
2338*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2339*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2340*22dc650dSSadaf Ebrahimi 			*inst = CMP_r_rm;
2341*22dc650dSSadaf Ebrahimi 		}
2342*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2343*22dc650dSSadaf Ebrahimi 	}
2344*22dc650dSSadaf Ebrahimi 
2345*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(src2) && src1 != SLJIT_IMM) {
2346*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2347*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2348*22dc650dSSadaf Ebrahimi 		*inst = CMP_rm_r;
2349*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2350*22dc650dSSadaf Ebrahimi 	}
2351*22dc650dSSadaf Ebrahimi 
2352*22dc650dSSadaf Ebrahimi 	if (src2 == SLJIT_IMM) {
2353*22dc650dSSadaf Ebrahimi 		if (src1 == SLJIT_IMM) {
2354*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2355*22dc650dSSadaf Ebrahimi 			src1 = TMP_REG1;
2356*22dc650dSSadaf Ebrahimi 			src1w = 0;
2357*22dc650dSSadaf Ebrahimi 		}
2358*22dc650dSSadaf Ebrahimi 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
2359*22dc650dSSadaf Ebrahimi 	}
2360*22dc650dSSadaf Ebrahimi 	else {
2361*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2362*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2363*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2364*22dc650dSSadaf Ebrahimi 		*inst = CMP_r_rm;
2365*22dc650dSSadaf Ebrahimi 	}
2366*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2367*22dc650dSSadaf Ebrahimi }
2368*22dc650dSSadaf Ebrahimi 
2369*22dc650dSSadaf Ebrahimi static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
2370*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2371*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2372*22dc650dSSadaf Ebrahimi {
2373*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2374*22dc650dSSadaf Ebrahimi 
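	/* TEST computes src1 & src2 and only updates the flags, so a flag-only
	   SLJIT_AND from sljit_emit_op2u() ends up here. Unlike the 0x83 group
	   used for CMP above, TEST has no sign-extended imm8 form, so immediate
	   operands are encoded through the F7 group (GROUP_F7) or, on 64-bit
	   targets, loaded into a temporary register when they do not fit in a
	   signed 32-bit value. */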
2375*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2376*22dc650dSSadaf Ebrahimi 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
2377*22dc650dSSadaf Ebrahimi #else
2378*22dc650dSSadaf Ebrahimi 	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
2379*22dc650dSSadaf Ebrahimi #endif
2380*22dc650dSSadaf Ebrahimi 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
2381*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2382*22dc650dSSadaf Ebrahimi 	}
2383*22dc650dSSadaf Ebrahimi 
2384*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2385*22dc650dSSadaf Ebrahimi 	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
2386*22dc650dSSadaf Ebrahimi #else
2387*22dc650dSSadaf Ebrahimi 	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128)) {
2388*22dc650dSSadaf Ebrahimi #endif
2389*22dc650dSSadaf Ebrahimi 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
2390*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2391*22dc650dSSadaf Ebrahimi 	}
2392*22dc650dSSadaf Ebrahimi 
2393*22dc650dSSadaf Ebrahimi 	if (src1 != SLJIT_IMM) {
2394*22dc650dSSadaf Ebrahimi 		if (src2 == SLJIT_IMM) {
2395*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2396*22dc650dSSadaf Ebrahimi 			if (IS_HALFWORD(src2w) || compiler->mode32) {
2397*22dc650dSSadaf Ebrahimi 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2398*22dc650dSSadaf Ebrahimi 				FAIL_IF(!inst);
2399*22dc650dSSadaf Ebrahimi 				*inst = GROUP_F7;
2400*22dc650dSSadaf Ebrahimi 			} else {
2401*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_load_imm64(compiler, FAST_IS_REG(src1) ? TMP_REG2 : TMP_REG1, src2w));
2402*22dc650dSSadaf Ebrahimi 				inst = emit_x86_instruction(compiler, 1, FAST_IS_REG(src1) ? TMP_REG2 : TMP_REG1, 0, src1, src1w);
2403*22dc650dSSadaf Ebrahimi 				FAIL_IF(!inst);
2404*22dc650dSSadaf Ebrahimi 				*inst = TEST_rm_r;
2405*22dc650dSSadaf Ebrahimi 			}
2406*22dc650dSSadaf Ebrahimi #else
2407*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
2408*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2409*22dc650dSSadaf Ebrahimi 			*inst = GROUP_F7;
2410*22dc650dSSadaf Ebrahimi #endif
2411*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2412*22dc650dSSadaf Ebrahimi 		}
2413*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(src1)) {
2414*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
2415*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2416*22dc650dSSadaf Ebrahimi 			*inst = TEST_rm_r;
2417*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2418*22dc650dSSadaf Ebrahimi 		}
2419*22dc650dSSadaf Ebrahimi 	}
2420*22dc650dSSadaf Ebrahimi 
2421*22dc650dSSadaf Ebrahimi 	if (src2 != SLJIT_IMM) {
2422*22dc650dSSadaf Ebrahimi 		if (src1 == SLJIT_IMM) {
2423*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2424*22dc650dSSadaf Ebrahimi 			if (IS_HALFWORD(src1w) || compiler->mode32) {
2425*22dc650dSSadaf Ebrahimi 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
2426*22dc650dSSadaf Ebrahimi 				FAIL_IF(!inst);
2427*22dc650dSSadaf Ebrahimi 				*inst = GROUP_F7;
2428*22dc650dSSadaf Ebrahimi 			}
2429*22dc650dSSadaf Ebrahimi 			else {
2430*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src1w));
2431*22dc650dSSadaf Ebrahimi 				inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2432*22dc650dSSadaf Ebrahimi 				FAIL_IF(!inst);
2433*22dc650dSSadaf Ebrahimi 				*inst = TEST_rm_r;
2434*22dc650dSSadaf Ebrahimi 			}
2435*22dc650dSSadaf Ebrahimi #else
2436*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
2437*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2438*22dc650dSSadaf Ebrahimi 			*inst = GROUP_F7;
2439*22dc650dSSadaf Ebrahimi #endif
2440*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2441*22dc650dSSadaf Ebrahimi 		}
2442*22dc650dSSadaf Ebrahimi 		else if (FAST_IS_REG(src2)) {
2443*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
2444*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2445*22dc650dSSadaf Ebrahimi 			*inst = TEST_rm_r;
2446*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2447*22dc650dSSadaf Ebrahimi 		}
2448*22dc650dSSadaf Ebrahimi 	}
2449*22dc650dSSadaf Ebrahimi 
2450*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2451*22dc650dSSadaf Ebrahimi 	if (src2 == SLJIT_IMM) {
2452*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2453*22dc650dSSadaf Ebrahimi 		if (IS_HALFWORD(src2w) || compiler->mode32) {
2454*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2455*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2456*22dc650dSSadaf Ebrahimi 			*inst = GROUP_F7;
2457*22dc650dSSadaf Ebrahimi 		}
2458*22dc650dSSadaf Ebrahimi 		else {
2459*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
2460*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
2461*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2462*22dc650dSSadaf Ebrahimi 			*inst = TEST_rm_r;
2463*22dc650dSSadaf Ebrahimi 		}
2464*22dc650dSSadaf Ebrahimi #else
2465*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
2466*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2467*22dc650dSSadaf Ebrahimi 		*inst = GROUP_F7;
2468*22dc650dSSadaf Ebrahimi #endif
2469*22dc650dSSadaf Ebrahimi 	}
2470*22dc650dSSadaf Ebrahimi 	else {
2471*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
2472*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2473*22dc650dSSadaf Ebrahimi 		*inst = TEST_rm_r;
2474*22dc650dSSadaf Ebrahimi 	}
2475*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2476*22dc650dSSadaf Ebrahimi }
2477*22dc650dSSadaf Ebrahimi 
2478*22dc650dSSadaf Ebrahimi static sljit_s32 emit_shift(struct sljit_compiler *compiler,
2479*22dc650dSSadaf Ebrahimi 	sljit_u8 mode,
2480*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
2481*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2482*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2483*22dc650dSSadaf Ebrahimi {
2484*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2485*22dc650dSSadaf Ebrahimi 	sljit_s32 mode32;
2486*22dc650dSSadaf Ebrahimi #endif
2487*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2488*22dc650dSSadaf Ebrahimi 
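	/* Variable x86 shifts take their count only in CL, exposed by sljit as
	   SLJIT_PREF_SHIFT_REG, so most of the work below is getting the count
	   into ecx while preserving its previous value (ecx may also be the
	   destination or part of an addressing mode). The fully generic fallback
	   at the end of this function roughly produces:

	       mov   tmp, src1          ; value to be shifted
	       save  ecx                ; to TMP_REG2, or to the stack on 32 bit
	       mov   ecx, src2          ; shift count
	       shl   tmp, cl            ; or shr/sar/rol/ror, selected by 'mode'
	       restore ecx
	       mov   dst, tmp
	*/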
2489*22dc650dSSadaf Ebrahimi 	if (src2 == SLJIT_IMM || src2 == SLJIT_PREF_SHIFT_REG) {
2490*22dc650dSSadaf Ebrahimi 		if (dst == src1 && dstw == src1w) {
2491*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2492*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2493*22dc650dSSadaf Ebrahimi 			inst[1] |= mode;
2494*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2495*22dc650dSSadaf Ebrahimi 		}
2496*22dc650dSSadaf Ebrahimi 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2497*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2498*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2499*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2500*22dc650dSSadaf Ebrahimi 			inst[1] |= mode;
2501*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2502*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2503*22dc650dSSadaf Ebrahimi 		}
2504*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(dst)) {
2505*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2506*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2507*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
2508*22dc650dSSadaf Ebrahimi 			inst[1] |= mode;
2509*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2510*22dc650dSSadaf Ebrahimi 		}
2511*22dc650dSSadaf Ebrahimi 
2512*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2513*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2514*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2515*22dc650dSSadaf Ebrahimi 		inst[1] |= mode;
2516*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2517*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2518*22dc650dSSadaf Ebrahimi 	}
2519*22dc650dSSadaf Ebrahimi 
2520*22dc650dSSadaf Ebrahimi 	if (dst == SLJIT_PREF_SHIFT_REG) {
2521*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2522*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2523*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2524*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2525*22dc650dSSadaf Ebrahimi 		inst[1] |= mode;
2526*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2527*22dc650dSSadaf Ebrahimi 	}
2528*22dc650dSSadaf Ebrahimi 
2529*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2530*22dc650dSSadaf Ebrahimi 		if (src1 != dst)
2531*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst, 0, src1, src1w);
2532*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2533*22dc650dSSadaf Ebrahimi 		mode32 = compiler->mode32;
2534*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
2535*22dc650dSSadaf Ebrahimi #endif
2536*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2537*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2538*22dc650dSSadaf Ebrahimi 		compiler->mode32 = mode32;
2539*22dc650dSSadaf Ebrahimi #endif
2540*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2541*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2542*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2543*22dc650dSSadaf Ebrahimi 		inst[1] |= mode;
2544*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2545*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
2546*22dc650dSSadaf Ebrahimi #endif
2547*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2548*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2549*22dc650dSSadaf Ebrahimi 		compiler->mode32 = mode32;
2550*22dc650dSSadaf Ebrahimi #endif
2551*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2552*22dc650dSSadaf Ebrahimi 	}
2553*22dc650dSSadaf Ebrahimi 
2554*22dc650dSSadaf Ebrahimi 	/* This case is complex since ecx itself may be used for
2555*22dc650dSSadaf Ebrahimi 	   addressing, and that case must be supported as well. */
2556*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2557*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2558*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2559*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
2560*22dc650dSSadaf Ebrahimi 	mode32 = compiler->mode32;
2561*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 0;
2562*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2563*22dc650dSSadaf Ebrahimi 	compiler->mode32 = mode32;
2564*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
2565*22dc650dSSadaf Ebrahimi 
2566*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2567*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2568*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
2569*22dc650dSSadaf Ebrahimi 	inst[1] |= mode;
2570*22dc650dSSadaf Ebrahimi 
2571*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2572*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2573*22dc650dSSadaf Ebrahimi #else
2574*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 0;
2575*22dc650dSSadaf Ebrahimi 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2576*22dc650dSSadaf Ebrahimi 	compiler->mode32 = mode32;
2577*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
2578*22dc650dSSadaf Ebrahimi 
2579*22dc650dSSadaf Ebrahimi 	if (dst != TMP_REG1)
2580*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2581*22dc650dSSadaf Ebrahimi 
2582*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2583*22dc650dSSadaf Ebrahimi }
2584*22dc650dSSadaf Ebrahimi 
2585*22dc650dSSadaf Ebrahimi static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2586*22dc650dSSadaf Ebrahimi 	sljit_u8 mode, sljit_s32 set_flags,
2587*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
2588*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2589*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2590*22dc650dSSadaf Ebrahimi {
2591*22dc650dSSadaf Ebrahimi 	/* The CPU does not set flags if the shift count is 0. */
2592*22dc650dSSadaf Ebrahimi 	if (src2 == SLJIT_IMM) {
2593*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2594*22dc650dSSadaf Ebrahimi 		src2w &= compiler->mode32 ? 0x1f : 0x3f;
2595*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
2596*22dc650dSSadaf Ebrahimi 		src2w &= 0x1f;
2597*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
2598*22dc650dSSadaf Ebrahimi 		if (src2w != 0)
2599*22dc650dSSadaf Ebrahimi 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2600*22dc650dSSadaf Ebrahimi 
2601*22dc650dSSadaf Ebrahimi 		if (!set_flags)
2602*22dc650dSSadaf Ebrahimi 			return emit_mov(compiler, dst, dstw, src1, src1w);
2603*22dc650dSSadaf Ebrahimi 		/* OR dst, src, 0 */
2604*22dc650dSSadaf Ebrahimi 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2605*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2606*22dc650dSSadaf Ebrahimi 	}
2607*22dc650dSSadaf Ebrahimi 
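	/* For a variable count the shift leaves the flags untouched whenever the
	   masked count happens to be 0 at run time, so an explicit comparison is
	   added as well: src1 is compared with 0 before the shift when dst is in
	   memory (for a zero count src1 is also the result), and the value left
	   in dst is compared with 0 afterwards when dst is a register. */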
2608*22dc650dSSadaf Ebrahimi 	if (!set_flags)
2609*22dc650dSSadaf Ebrahimi 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2610*22dc650dSSadaf Ebrahimi 
2611*22dc650dSSadaf Ebrahimi 	if (!FAST_IS_REG(dst))
2612*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2613*22dc650dSSadaf Ebrahimi 
2614*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2615*22dc650dSSadaf Ebrahimi 
2616*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst))
2617*22dc650dSSadaf Ebrahimi 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2618*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2619*22dc650dSSadaf Ebrahimi }
2620*22dc650dSSadaf Ebrahimi 
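/* A typical call from an embedder, shown only as a hypothetical illustration
   (the operand choices are arbitrary):

       sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_IMM, 42);

   Since the op above requests no flags, the SLJIT_ADD case first tries the
   flag-preserving LEA form via emit_lea_binary() and only falls back to a
   real ADD when that helper rejects the operand combination. */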
2621*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2622*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
2623*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2624*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2625*22dc650dSSadaf Ebrahimi {
2626*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
2627*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op2(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w));
2628*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(dst, dstw);
2629*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src1, src1w);
2630*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src2, src2w);
2631*22dc650dSSadaf Ebrahimi 
2632*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2633*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2634*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2635*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2636*22dc650dSSadaf Ebrahimi 	compiler->mode32 = op & SLJIT_32;
2637*22dc650dSSadaf Ebrahimi #endif
2638*22dc650dSSadaf Ebrahimi 
2639*22dc650dSSadaf Ebrahimi 	switch (GET_OPCODE(op)) {
2640*22dc650dSSadaf Ebrahimi 	case SLJIT_ADD:
2641*22dc650dSSadaf Ebrahimi 		if (!HAS_FLAGS(op)) {
2642*22dc650dSSadaf Ebrahimi 			if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2643*22dc650dSSadaf Ebrahimi 				return compiler->error;
2644*22dc650dSSadaf Ebrahimi 		}
2645*22dc650dSSadaf Ebrahimi 		return emit_cum_binary(compiler, BINARY_OPCODE(ADD),
2646*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2647*22dc650dSSadaf Ebrahimi 	case SLJIT_ADDC:
2648*22dc650dSSadaf Ebrahimi 		return emit_cum_binary(compiler, BINARY_OPCODE(ADC),
2649*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2650*22dc650dSSadaf Ebrahimi 	case SLJIT_SUB:
2651*22dc650dSSadaf Ebrahimi 		if (src1 == SLJIT_IMM && src1w == 0)
2652*22dc650dSSadaf Ebrahimi 			return emit_unary(compiler, NEG_rm, dst, dstw, src2, src2w);
2653*22dc650dSSadaf Ebrahimi 
2654*22dc650dSSadaf Ebrahimi 		if (!HAS_FLAGS(op)) {
2655*22dc650dSSadaf Ebrahimi 			if (src2 == SLJIT_IMM && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2656*22dc650dSSadaf Ebrahimi 				return compiler->error;
2657*22dc650dSSadaf Ebrahimi 			if (FAST_IS_REG(dst) && src2 == dst) {
2658*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, 0, dst, 0, src1, src1w));
2659*22dc650dSSadaf Ebrahimi 				return emit_unary(compiler, NEG_rm, dst, 0, dst, 0);
2660*22dc650dSSadaf Ebrahimi 			}
2661*22dc650dSSadaf Ebrahimi 		}
2662*22dc650dSSadaf Ebrahimi 
2663*22dc650dSSadaf Ebrahimi 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
2664*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2665*22dc650dSSadaf Ebrahimi 	case SLJIT_SUBC:
2666*22dc650dSSadaf Ebrahimi 		return emit_non_cum_binary(compiler, BINARY_OPCODE(SBB),
2667*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2668*22dc650dSSadaf Ebrahimi 	case SLJIT_MUL:
2669*22dc650dSSadaf Ebrahimi 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2670*22dc650dSSadaf Ebrahimi 	case SLJIT_AND:
2671*22dc650dSSadaf Ebrahimi 		return emit_cum_binary(compiler, BINARY_OPCODE(AND),
2672*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2673*22dc650dSSadaf Ebrahimi 	case SLJIT_OR:
2674*22dc650dSSadaf Ebrahimi 		return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2675*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2676*22dc650dSSadaf Ebrahimi 	case SLJIT_XOR:
2677*22dc650dSSadaf Ebrahimi 		if (!HAS_FLAGS(op)) {
2678*22dc650dSSadaf Ebrahimi 			if (src2 == SLJIT_IMM && src2w == -1)
2679*22dc650dSSadaf Ebrahimi 				return emit_unary(compiler, NOT_rm, dst, dstw, src1, src1w);
2680*22dc650dSSadaf Ebrahimi 			if (src1 == SLJIT_IMM && src1w == -1)
2681*22dc650dSSadaf Ebrahimi 				return emit_unary(compiler, NOT_rm, dst, dstw, src2, src2w);
2682*22dc650dSSadaf Ebrahimi 		}
2683*22dc650dSSadaf Ebrahimi 
2684*22dc650dSSadaf Ebrahimi 		return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
2685*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2686*22dc650dSSadaf Ebrahimi 	case SLJIT_SHL:
2687*22dc650dSSadaf Ebrahimi 	case SLJIT_MSHL:
2688*22dc650dSSadaf Ebrahimi 		return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2689*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2690*22dc650dSSadaf Ebrahimi 	case SLJIT_LSHR:
2691*22dc650dSSadaf Ebrahimi 	case SLJIT_MLSHR:
2692*22dc650dSSadaf Ebrahimi 		return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2693*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2694*22dc650dSSadaf Ebrahimi 	case SLJIT_ASHR:
2695*22dc650dSSadaf Ebrahimi 	case SLJIT_MASHR:
2696*22dc650dSSadaf Ebrahimi 		return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2697*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2698*22dc650dSSadaf Ebrahimi 	case SLJIT_ROTL:
2699*22dc650dSSadaf Ebrahimi 		return emit_shift_with_flags(compiler, ROL, 0,
2700*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2701*22dc650dSSadaf Ebrahimi 	case SLJIT_ROTR:
2702*22dc650dSSadaf Ebrahimi 		return emit_shift_with_flags(compiler, ROR, 0,
2703*22dc650dSSadaf Ebrahimi 			dst, dstw, src1, src1w, src2, src2w);
2704*22dc650dSSadaf Ebrahimi 	}
2705*22dc650dSSadaf Ebrahimi 
2706*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2707*22dc650dSSadaf Ebrahimi }
2708*22dc650dSSadaf Ebrahimi 
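/* sljit_emit_op2u() evaluates an operation only for its flag effects. SUB and
   AND have dedicated flag-only encodings (CMP and TEST), so they are routed to
   emit_cmp_binary() and emit_test_binary(); every other opcode is simply
   emitted with TMP_REG1 as a discarded destination. */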
2709*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op,
2710*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2711*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2712*22dc650dSSadaf Ebrahimi {
2713*22dc650dSSadaf Ebrahimi 	sljit_s32 opcode = GET_OPCODE(op);
2714*22dc650dSSadaf Ebrahimi 
2715*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
2716*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
2717*22dc650dSSadaf Ebrahimi 
2718*22dc650dSSadaf Ebrahimi 	if (opcode != SLJIT_SUB && opcode != SLJIT_AND) {
2719*22dc650dSSadaf Ebrahimi 		SLJIT_SKIP_CHECKS(compiler);
2720*22dc650dSSadaf Ebrahimi 		return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w);
2721*22dc650dSSadaf Ebrahimi 	}
2722*22dc650dSSadaf Ebrahimi 
2723*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src1, src1w);
2724*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src2, src2w);
2725*22dc650dSSadaf Ebrahimi 
2726*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2727*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2728*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2729*22dc650dSSadaf Ebrahimi 	compiler->mode32 = op & SLJIT_32;
2730*22dc650dSSadaf Ebrahimi #endif
2731*22dc650dSSadaf Ebrahimi 
2732*22dc650dSSadaf Ebrahimi 	if (opcode == SLJIT_SUB)
2733*22dc650dSSadaf Ebrahimi 		return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2734*22dc650dSSadaf Ebrahimi 
2735*22dc650dSSadaf Ebrahimi 	return emit_test_binary(compiler, src1, src1w, src2, src2w);
2736*22dc650dSSadaf Ebrahimi }
2737*22dc650dSSadaf Ebrahimi 
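/* Only SLJIT_MULADD is handled here: the product of src1 and src2 is computed
   into TMP_REG1 with emit_mul() and then added to dst_reg with a plain ADD,
   so dst_reg is expected to already hold the accumulator value. */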
2738*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2r(struct sljit_compiler *compiler, sljit_s32 op,
2739*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_reg,
2740*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
2741*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
2742*22dc650dSSadaf Ebrahimi {
2743*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2744*22dc650dSSadaf Ebrahimi 	sljit_sw dstw = 0;
2745*22dc650dSSadaf Ebrahimi 
2746*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
2747*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op2r(compiler, op, dst_reg, src1, src1w, src2, src2w));
2748*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src1, src1w);
2749*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src2, src2w);
2750*22dc650dSSadaf Ebrahimi 
2751*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst_reg, dstw, (void)0);
2752*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2753*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2754*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2755*22dc650dSSadaf Ebrahimi 	compiler->mode32 = op & SLJIT_32;
2756*22dc650dSSadaf Ebrahimi #endif
2757*22dc650dSSadaf Ebrahimi 
2758*22dc650dSSadaf Ebrahimi 	switch (GET_OPCODE(op)) {
2759*22dc650dSSadaf Ebrahimi 	case SLJIT_MULADD:
2760*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_mul(compiler, TMP_REG1, 0, src1, src1w, src2, src2w));
2761*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst_reg, dstw);
2762*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
2763*22dc650dSSadaf Ebrahimi 		*inst = ADD_rm_r;
2764*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
2765*22dc650dSSadaf Ebrahimi 	}
2766*22dc650dSSadaf Ebrahimi 
2767*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2768*22dc650dSSadaf Ebrahimi }
2769*22dc650dSSadaf Ebrahimi 
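/* Emits a funnel shift: roughly dst = (src1 << n) | (src2 >> (WIDTH - n)) for
   the left variants (SHL/MSHL) and the mirrored form for the right variants,
   which maps onto the x86 SHLD/SHRD instructions. When src1 and src2 are the
   same register the operation degenerates into a plain rotate and is handed
   to emit_shift() early below. */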
2770*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
2771*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_reg,
2772*22dc650dSSadaf Ebrahimi 	sljit_s32 src1_reg,
2773*22dc650dSSadaf Ebrahimi 	sljit_s32 src2_reg,
2774*22dc650dSSadaf Ebrahimi 	sljit_s32 src3, sljit_sw src3w)
2775*22dc650dSSadaf Ebrahimi {
2776*22dc650dSSadaf Ebrahimi 	sljit_s32 is_rotate, is_left, move_src1;
2777*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
2778*22dc650dSSadaf Ebrahimi 	sljit_sw src1w = 0;
2779*22dc650dSSadaf Ebrahimi 	sljit_sw dstw = 0;
2780*22dc650dSSadaf Ebrahimi 	/* The whole register must be saved even for 32 bit operations. */
2781*22dc650dSSadaf Ebrahimi 	sljit_u8 restore_ecx = 0;
2782*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2783*22dc650dSSadaf Ebrahimi 	sljit_sw src2w = 0;
2784*22dc650dSSadaf Ebrahimi 	sljit_s32 restore_sp4 = 0;
2785*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
2786*22dc650dSSadaf Ebrahimi 
2787*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
2788*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w));
2789*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src3, src3w);
2790*22dc650dSSadaf Ebrahimi 
2791*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst_reg, dstw, (void)0);
2792*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src3, src3w, (void)0);
2793*22dc650dSSadaf Ebrahimi 
2794*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2795*22dc650dSSadaf Ebrahimi 	compiler->mode32 = op & SLJIT_32;
2796*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
2797*22dc650dSSadaf Ebrahimi 
2798*22dc650dSSadaf Ebrahimi 	if (src3 == SLJIT_IMM) {
2799*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2800*22dc650dSSadaf Ebrahimi 		src3w &= 0x1f;
2801*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
2802*22dc650dSSadaf Ebrahimi 		src3w &= (op & SLJIT_32) ? 0x1f : 0x3f;
2803*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
2804*22dc650dSSadaf Ebrahimi 
2805*22dc650dSSadaf Ebrahimi 		if (src3w == 0)
2806*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2807*22dc650dSSadaf Ebrahimi 	}
2808*22dc650dSSadaf Ebrahimi 
2809*22dc650dSSadaf Ebrahimi 	is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL);
2810*22dc650dSSadaf Ebrahimi 
2811*22dc650dSSadaf Ebrahimi 	is_rotate = (src1_reg == src2_reg);
2812*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src1_reg, src1w, (void)0);
2813*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src2_reg, src2w, (void)0);
2814*22dc650dSSadaf Ebrahimi 
2815*22dc650dSSadaf Ebrahimi 	if (is_rotate)
2816*22dc650dSSadaf Ebrahimi 		return emit_shift(compiler, is_left ? ROL : ROR, dst_reg, dstw, src1_reg, src1w, src3, src3w);
2817*22dc650dSSadaf Ebrahimi 
2818*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2819*22dc650dSSadaf Ebrahimi 	if (src2_reg & SLJIT_MEM) {
2820*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src2_reg, src2w);
2821*22dc650dSSadaf Ebrahimi 		src2_reg = TMP_REG1;
2822*22dc650dSSadaf Ebrahimi 	}
2823*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
2824*22dc650dSSadaf Ebrahimi 
2825*22dc650dSSadaf Ebrahimi 	if (dst_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && (src3 != SLJIT_PREF_SHIFT_REG || src1_reg != SLJIT_PREF_SHIFT_REG)) {
2826*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2827*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2828*22dc650dSSadaf Ebrahimi 		src1_reg = TMP_REG1;
2829*22dc650dSSadaf Ebrahimi 		src1w = 0;
2830*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
2831*22dc650dSSadaf Ebrahimi 		if (src2_reg != TMP_REG1) {
2832*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2833*22dc650dSSadaf Ebrahimi 			src1_reg = TMP_REG1;
2834*22dc650dSSadaf Ebrahimi 			src1w = 0;
2835*22dc650dSSadaf Ebrahimi 		} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
2836*22dc650dSSadaf Ebrahimi 			restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
2837*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
2838*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
2839*22dc650dSSadaf Ebrahimi 			src1_reg = restore_sp4;
2840*22dc650dSSadaf Ebrahimi 			src1w = 0;
2841*22dc650dSSadaf Ebrahimi 		} else {
2842*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
2843*22dc650dSSadaf Ebrahimi 			restore_sp4 = src1_reg;
2844*22dc650dSSadaf Ebrahimi 		}
2845*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
2846*22dc650dSSadaf Ebrahimi 
2847*22dc650dSSadaf Ebrahimi 		if (src3 != SLJIT_PREF_SHIFT_REG)
2848*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
2849*22dc650dSSadaf Ebrahimi 	} else {
2850*22dc650dSSadaf Ebrahimi 		if (src2_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
2851*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2852*22dc650dSSadaf Ebrahimi 			compiler->mode32 = 0;
2853*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
2854*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2855*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2856*22dc650dSSadaf Ebrahimi 			compiler->mode32 = op & SLJIT_32;
2857*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
2858*22dc650dSSadaf Ebrahimi 			src2_reg = TMP_REG1;
2859*22dc650dSSadaf Ebrahimi 			restore_ecx = 1;
2860*22dc650dSSadaf Ebrahimi 		}
2861*22dc650dSSadaf Ebrahimi 
2862*22dc650dSSadaf Ebrahimi 		move_src1 = 0;
2863*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2864*22dc650dSSadaf Ebrahimi 		if (dst_reg != src1_reg) {
2865*22dc650dSSadaf Ebrahimi 			if (dst_reg != src3) {
2866*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2867*22dc650dSSadaf Ebrahimi 				src1_reg = dst_reg;
2868*22dc650dSSadaf Ebrahimi 				src1w = 0;
2869*22dc650dSSadaf Ebrahimi 			} else
2870*22dc650dSSadaf Ebrahimi 				move_src1 = 1;
2871*22dc650dSSadaf Ebrahimi 		}
2872*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
2873*22dc650dSSadaf Ebrahimi 		if (dst_reg & SLJIT_MEM) {
2874*22dc650dSSadaf Ebrahimi 			if (src2_reg != TMP_REG1) {
2875*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
2876*22dc650dSSadaf Ebrahimi 				src1_reg = TMP_REG1;
2877*22dc650dSSadaf Ebrahimi 				src1w = 0;
2878*22dc650dSSadaf Ebrahimi 			} else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) {
2879*22dc650dSSadaf Ebrahimi 				restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0;
2880*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0);
2881*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w);
2882*22dc650dSSadaf Ebrahimi 				src1_reg = restore_sp4;
2883*22dc650dSSadaf Ebrahimi 				src1w = 0;
2884*22dc650dSSadaf Ebrahimi 			} else {
2885*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0);
2886*22dc650dSSadaf Ebrahimi 				restore_sp4 = src1_reg;
2887*22dc650dSSadaf Ebrahimi 			}
2888*22dc650dSSadaf Ebrahimi 		} else if (dst_reg != src1_reg) {
2889*22dc650dSSadaf Ebrahimi 			if (dst_reg != src3) {
2890*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2891*22dc650dSSadaf Ebrahimi 				src1_reg = dst_reg;
2892*22dc650dSSadaf Ebrahimi 				src1w = 0;
2893*22dc650dSSadaf Ebrahimi 			} else
2894*22dc650dSSadaf Ebrahimi 				move_src1 = 1;
2895*22dc650dSSadaf Ebrahimi 		}
2896*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
2897*22dc650dSSadaf Ebrahimi 
2898*22dc650dSSadaf Ebrahimi 		if (src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
2899*22dc650dSSadaf Ebrahimi 			if (!restore_ecx) {
2900*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2901*22dc650dSSadaf Ebrahimi 				compiler->mode32 = 0;
2902*22dc650dSSadaf Ebrahimi 				EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2903*22dc650dSSadaf Ebrahimi 				compiler->mode32 = op & SLJIT_32;
2904*22dc650dSSadaf Ebrahimi 				restore_ecx = 1;
2905*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
2906*22dc650dSSadaf Ebrahimi 				if (src1_reg != TMP_REG1 && src2_reg != TMP_REG1) {
2907*22dc650dSSadaf Ebrahimi 					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2908*22dc650dSSadaf Ebrahimi 					restore_ecx = 1;
2909*22dc650dSSadaf Ebrahimi 				} else {
2910*22dc650dSSadaf Ebrahimi 					EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2911*22dc650dSSadaf Ebrahimi 					restore_ecx = 2;
2912*22dc650dSSadaf Ebrahimi 				}
2913*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
2914*22dc650dSSadaf Ebrahimi 			}
2915*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
2916*22dc650dSSadaf Ebrahimi 		}
2917*22dc650dSSadaf Ebrahimi 
2918*22dc650dSSadaf Ebrahimi 		if (move_src1) {
2919*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w);
2920*22dc650dSSadaf Ebrahimi 			src1_reg = dst_reg;
2921*22dc650dSSadaf Ebrahimi 			src1w = 0;
2922*22dc650dSSadaf Ebrahimi 		}
2923*22dc650dSSadaf Ebrahimi 	}
2924*22dc650dSSadaf Ebrahimi 
2925*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, 2, src2_reg, 0, src1_reg, src1w);
2926*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
2927*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
2928*22dc650dSSadaf Ebrahimi 
2929*22dc650dSSadaf Ebrahimi 	if (src3 == SLJIT_IMM) {
2930*22dc650dSSadaf Ebrahimi 		inst[1] = U8((is_left ? SHLD : SHRD) - 1);
2931*22dc650dSSadaf Ebrahimi 
2932*22dc650dSSadaf Ebrahimi 		/* The immediate argument is emitted separately. */
2933*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_byte(compiler, U8(src3w)));
2934*22dc650dSSadaf Ebrahimi 	} else
2935*22dc650dSSadaf Ebrahimi 		inst[1] = U8(is_left ? SHLD : SHRD);
2936*22dc650dSSadaf Ebrahimi 
2937*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2938*22dc650dSSadaf Ebrahimi 	if (restore_ecx) {
2939*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
2940*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2941*22dc650dSSadaf Ebrahimi 	}
2942*22dc650dSSadaf Ebrahimi 
2943*22dc650dSSadaf Ebrahimi 	if (src1_reg != dst_reg) {
2944*22dc650dSSadaf Ebrahimi 		compiler->mode32 = op & SLJIT_32;
2945*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, dst_reg, dstw, src1_reg, 0);
2946*22dc650dSSadaf Ebrahimi 	}
2947*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
2948*22dc650dSSadaf Ebrahimi 	if (restore_ecx)
2949*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, restore_ecx == 1 ? TMP_REG1 : SLJIT_MEM1(SLJIT_SP), 0);
2950*22dc650dSSadaf Ebrahimi 
2951*22dc650dSSadaf Ebrahimi 	if (src1_reg != dst_reg)
2952*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, dst_reg, dstw, src1_reg, 0);
2953*22dc650dSSadaf Ebrahimi 
2954*22dc650dSSadaf Ebrahimi 	if (restore_sp4)
2955*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, restore_sp4, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32));
2956*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
2957*22dc650dSSadaf Ebrahimi 
2958*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2959*22dc650dSSadaf Ebrahimi }
2960*22dc650dSSadaf Ebrahimi 
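/* Single-source operations: fast return, the shadow stack adjustment that
   precedes it, and the prefetch hints are all dispatched from here. */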
2961*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *compiler, sljit_s32 op,
2962*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
2963*22dc650dSSadaf Ebrahimi {
2964*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
2965*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op_src(compiler, op, src, srcw));
2966*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src, srcw);
2967*22dc650dSSadaf Ebrahimi 
2968*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src, srcw, (void)0);
2969*22dc650dSSadaf Ebrahimi 
2970*22dc650dSSadaf Ebrahimi 	switch (op) {
2971*22dc650dSSadaf Ebrahimi 	case SLJIT_FAST_RETURN:
2972*22dc650dSSadaf Ebrahimi 		return emit_fast_return(compiler, src, srcw);
2973*22dc650dSSadaf Ebrahimi 	case SLJIT_SKIP_FRAMES_BEFORE_FAST_RETURN:
2974*22dc650dSSadaf Ebrahimi 		/* Don't adjust shadow stack if it isn't enabled.  */
2975*22dc650dSSadaf Ebrahimi 		if (!cpu_has_shadow_stack ())
2976*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
2977*22dc650dSSadaf Ebrahimi 		return adjust_shadow_stack(compiler, src, srcw);
2978*22dc650dSSadaf Ebrahimi 	case SLJIT_PREFETCH_L1:
2979*22dc650dSSadaf Ebrahimi 	case SLJIT_PREFETCH_L2:
2980*22dc650dSSadaf Ebrahimi 	case SLJIT_PREFETCH_L3:
2981*22dc650dSSadaf Ebrahimi 	case SLJIT_PREFETCH_ONCE:
2982*22dc650dSSadaf Ebrahimi 		return emit_prefetch(compiler, op, src, srcw);
2983*22dc650dSSadaf Ebrahimi 	}
2984*22dc650dSSadaf Ebrahimi 
2985*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
2986*22dc650dSSadaf Ebrahimi }
2987*22dc650dSSadaf Ebrahimi 
2988*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op,
2989*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw)
2990*22dc650dSSadaf Ebrahimi {
2991*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
2992*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw));
2993*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(dst, dstw);
2994*22dc650dSSadaf Ebrahimi 
2995*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2996*22dc650dSSadaf Ebrahimi 
2997*22dc650dSSadaf Ebrahimi 	switch (op) {
2998*22dc650dSSadaf Ebrahimi 	case SLJIT_FAST_ENTER:
2999*22dc650dSSadaf Ebrahimi 		return emit_fast_enter(compiler, dst, dstw);
3000*22dc650dSSadaf Ebrahimi 	case SLJIT_GET_RETURN_ADDRESS:
3001*22dc650dSSadaf Ebrahimi 		return sljit_emit_get_return_address(compiler, dst, dstw);
3002*22dc650dSSadaf Ebrahimi 	}
3003*22dc650dSSadaf Ebrahimi 
3004*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3005*22dc650dSSadaf Ebrahimi }
3006*22dc650dSSadaf Ebrahimi 
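/* Maps an abstract sljit register to its hardware encoding; -1 means the
   register is not backed by a machine register in this configuration
   (e.g. SLJIT_R3-SLJIT_R8 on x86-32). */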
3007*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 type, sljit_s32 reg)
3008*22dc650dSSadaf Ebrahimi {
3009*22dc650dSSadaf Ebrahimi 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
3010*22dc650dSSadaf Ebrahimi 
3011*22dc650dSSadaf Ebrahimi 	if (type == SLJIT_GP_REGISTER) {
3012*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3013*22dc650dSSadaf Ebrahimi 		if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
3014*22dc650dSSadaf Ebrahimi 			return -1;
3015*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
3016*22dc650dSSadaf Ebrahimi 		return reg_map[reg];
3017*22dc650dSSadaf Ebrahimi 	}
3018*22dc650dSSadaf Ebrahimi 
3019*22dc650dSSadaf Ebrahimi 	if (type != SLJIT_FLOAT_REGISTER && type != SLJIT_SIMD_REG_128 && type != SLJIT_SIMD_REG_256 && type != SLJIT_SIMD_REG_512)
3020*22dc650dSSadaf Ebrahimi 		return -1;
3021*22dc650dSSadaf Ebrahimi 
3022*22dc650dSSadaf Ebrahimi 	return freg_map[reg];
3023*22dc650dSSadaf Ebrahimi }
3024*22dc650dSSadaf Ebrahimi 
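/* Copies 'size' bytes of raw machine code verbatim into the instruction
   stream. Illustrative use (the two bytes below encode RDTSC):

     sljit_u8 rdtsc[] = { 0x0f, 0x31 };
     sljit_emit_op_custom(compiler, rdtsc, 2);
*/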
3025*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
3026*22dc650dSSadaf Ebrahimi 	void *instruction, sljit_u32 size)
3027*22dc650dSSadaf Ebrahimi {
3028*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3029*22dc650dSSadaf Ebrahimi 
3030*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3031*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
3032*22dc650dSSadaf Ebrahimi 
3033*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
3034*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
3035*22dc650dSSadaf Ebrahimi 	INC_SIZE(size);
3036*22dc650dSSadaf Ebrahimi 	SLJIT_MEMCPY(inst, instruction, size);
3037*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3038*22dc650dSSadaf Ebrahimi }
3039*22dc650dSSadaf Ebrahimi 
3040*22dc650dSSadaf Ebrahimi /* --------------------------------------------------------------------- */
3041*22dc650dSSadaf Ebrahimi /*  Floating point operators                                             */
3042*22dc650dSSadaf Ebrahimi /* --------------------------------------------------------------------- */
3043*22dc650dSSadaf Ebrahimi 
3044*22dc650dSSadaf Ebrahimi /* Up to 3 words of alignment padding + 4 constants of 16 bytes each. */
3045*22dc650dSSadaf Ebrahimi static sljit_u32 sse2_data[3 + (4 * 4)];
3046*22dc650dSSadaf Ebrahimi static sljit_u32 *sse2_buffer;
3047*22dc650dSSadaf Ebrahimi 
3048*22dc650dSSadaf Ebrahimi static void init_compiler(void)
3049*22dc650dSSadaf Ebrahimi {
3050*22dc650dSSadaf Ebrahimi 	get_cpu_features();
3051*22dc650dSSadaf Ebrahimi 
3052*22dc650dSSadaf Ebrahimi 	/* Align to 16 bytes. */
3053*22dc650dSSadaf Ebrahimi 	sse2_buffer = (sljit_u32*)(((sljit_uw)sse2_data + 15) & ~(sljit_uw)0xf);
3054*22dc650dSSadaf Ebrahimi 
3055*22dc650dSSadaf Ebrahimi 	/* Single precision constants (each constant is 16 bytes long). */
3056*22dc650dSSadaf Ebrahimi 	sse2_buffer[0] = 0x80000000;
3057*22dc650dSSadaf Ebrahimi 	sse2_buffer[4] = 0x7fffffff;
3058*22dc650dSSadaf Ebrahimi 	/* Double precision constants (each constant is 16 bytes long). */
3059*22dc650dSSadaf Ebrahimi 	sse2_buffer[8] = 0;
3060*22dc650dSSadaf Ebrahimi 	sse2_buffer[9] = 0x80000000;
3061*22dc650dSSadaf Ebrahimi 	sse2_buffer[12] = 0xffffffff;
3062*22dc650dSSadaf Ebrahimi 	sse2_buffer[13] = 0x7fffffff;
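	/* In effect the 16-byte-aligned buffer now holds four 16-byte masks
	   (offsets are in 32-bit words; unlisted words stay zero):
	     sse2_buffer + 0  : 0x80000000             - float sign bit (NEG_F32)
	     sse2_buffer + 4  : 0x7fffffff             - float magnitude mask (ABS_F32)
	     sse2_buffer + 8  : 0x00000000 0x80000000  - double sign bit (NEG_F64)
	     sse2_buffer + 12 : 0xffffffff 0x7fffffff  - double magnitude mask (ABS_F64) */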
3063*22dc650dSSadaf Ebrahimi }
3064*22dc650dSSadaf Ebrahimi 
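/* Emits a two-byte (0F-escape) SSE/SSE2 instruction: the low eight bits of
   'op' hold the second opcode byte, the remaining bits carry prefix and
   operand-mode flags understood by emit_x86_instruction. */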
3065*22dc650dSSadaf Ebrahimi static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
3066*22dc650dSSadaf Ebrahimi 	sljit_uw op,
3067*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
3068*22dc650dSSadaf Ebrahimi {
3069*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst = emit_x86_instruction(compiler, 2 | (op & ~(sljit_uw)0xff), dst, 0, src, srcw);
3070*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
3071*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
3072*22dc650dSSadaf Ebrahimi 	inst[1] = op & 0xff;
3073*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3074*22dc650dSSadaf Ebrahimi }
3075*22dc650dSSadaf Ebrahimi 
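/* Same idea for the three-byte opcode maps: VEX_OP_0F38 / VEX_OP_0F3A select
   whether the escape sequence is 0F 38 or 0F 3A, and the low byte of 'op' is
   the opcode itself. */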
3076*22dc650dSSadaf Ebrahimi static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
3077*22dc650dSSadaf Ebrahimi 	sljit_uw op,
3078*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
3079*22dc650dSSadaf Ebrahimi {
3080*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3081*22dc650dSSadaf Ebrahimi 
3082*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT((op & EX86_SSE2) && ((op & VEX_OP_0F38) || (op & VEX_OP_0F3A)));
3083*22dc650dSSadaf Ebrahimi 
3084*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, 3 | (op & ~((sljit_uw)0xff | VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw);
3085*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
3086*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
3087*22dc650dSSadaf Ebrahimi 	inst[1] = U8((op & VEX_OP_0F38) ? 0x38 : 0x3A);
3088*22dc650dSSadaf Ebrahimi 	inst[2] = op & 0xff;
3089*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3090*22dc650dSSadaf Ebrahimi }
3091*22dc650dSSadaf Ebrahimi 
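/* Scalar FP moves: the F3 prefix selects MOVSS (single precision), F2 selects
   MOVSD (double precision); this helper pair is reused throughout the
   floating point code below. */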
3092*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
3093*22dc650dSSadaf Ebrahimi 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
3094*22dc650dSSadaf Ebrahimi {
3095*22dc650dSSadaf Ebrahimi 	return emit_groupf(compiler, MOVSD_x_xm | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw);
3096*22dc650dSSadaf Ebrahimi }
3097*22dc650dSSadaf Ebrahimi 
3098*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
3099*22dc650dSSadaf Ebrahimi 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
3100*22dc650dSSadaf Ebrahimi {
3101*22dc650dSSadaf Ebrahimi 	return emit_groupf(compiler, MOVSD_xm_x | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw);
3102*22dc650dSSadaf Ebrahimi }
3103*22dc650dSSadaf Ebrahimi 
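/* Float-to-integer conversion uses the truncating CVTTSS2SI / CVTTSD2SI
   forms; on x86-64, mode32 is cleared so a 64-bit destination gets a REX.W
   prefix when SLJIT_CONV_SW_FROM_F64 is requested. */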
3104*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
3105*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
3106*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
3107*22dc650dSSadaf Ebrahimi {
3108*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r;
3109*22dc650dSSadaf Ebrahimi 
3110*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3111*22dc650dSSadaf Ebrahimi 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
3112*22dc650dSSadaf Ebrahimi 
3113*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3114*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
3115*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
3116*22dc650dSSadaf Ebrahimi #endif
3117*22dc650dSSadaf Ebrahimi 
3118*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw));
3119*22dc650dSSadaf Ebrahimi 
3120*22dc650dSSadaf Ebrahimi 	if (dst & SLJIT_MEM)
3121*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3122*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3123*22dc650dSSadaf Ebrahimi }
3124*22dc650dSSadaf Ebrahimi 
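/* Integer-to-float conversion (CVTSI2SS / CVTSI2SD). Immediates are first
   materialized in TMP_REG1 because the instruction has no immediate form. */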
3125*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
3126*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
3127*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
3128*22dc650dSSadaf Ebrahimi {
3129*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
3130*22dc650dSSadaf Ebrahimi 
3131*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src, srcw, (void)0);
3132*22dc650dSSadaf Ebrahimi 
3133*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3134*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
3135*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
3136*22dc650dSSadaf Ebrahimi #endif
3137*22dc650dSSadaf Ebrahimi 
3138*22dc650dSSadaf Ebrahimi 	if (src == SLJIT_IMM) {
3139*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3140*22dc650dSSadaf Ebrahimi 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
3141*22dc650dSSadaf Ebrahimi 			srcw = (sljit_s32)srcw;
3142*22dc650dSSadaf Ebrahimi #endif
3143*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
3144*22dc650dSSadaf Ebrahimi 		src = TMP_REG1;
3145*22dc650dSSadaf Ebrahimi 		srcw = 0;
3146*22dc650dSSadaf Ebrahimi 	}
3147*22dc650dSSadaf Ebrahimi 
3148*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw));
3149*22dc650dSSadaf Ebrahimi 
3150*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3151*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3152*22dc650dSSadaf Ebrahimi #endif
3153*22dc650dSSadaf Ebrahimi 	if (dst_r == TMP_FREG)
3154*22dc650dSSadaf Ebrahimi 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3155*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3156*22dc650dSSadaf Ebrahimi }
3157*22dc650dSSadaf Ebrahimi 
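/* Floating point compare. Most conditions map directly onto the flags set by
   UCOMISS/UCOMISD, sometimes with the operands swapped. SLJIT_ORDERED_EQUAL
   is special: an unordered compare also sets ZF, so an EQ mask is built with
   CMPEQSS/CMPEQSD first and the final flags come from comparing that mask
   with itself. */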
3158*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
3159*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
3160*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
3161*22dc650dSSadaf Ebrahimi {
3162*22dc650dSSadaf Ebrahimi 	switch (GET_FLAG_TYPE(op)) {
3163*22dc650dSSadaf Ebrahimi 	case SLJIT_ORDERED_EQUAL:
3164*22dc650dSSadaf Ebrahimi 		/* Also: SLJIT_UNORDERED_OR_NOT_EQUAL */
3165*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3166*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, CMPS_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w));
3167*22dc650dSSadaf Ebrahimi 
3168*22dc650dSSadaf Ebrahimi 		/* EQ */
3169*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_byte(compiler, 0));
3170*22dc650dSSadaf Ebrahimi 
3171*22dc650dSSadaf Ebrahimi 		src1 = TMP_FREG;
3172*22dc650dSSadaf Ebrahimi 		src2 = TMP_FREG;
3173*22dc650dSSadaf Ebrahimi 		src2w = 0;
3174*22dc650dSSadaf Ebrahimi 		break;
3175*22dc650dSSadaf Ebrahimi 
3176*22dc650dSSadaf Ebrahimi 	case SLJIT_ORDERED_LESS:
3177*22dc650dSSadaf Ebrahimi 	case SLJIT_UNORDERED_OR_GREATER:
3178*22dc650dSSadaf Ebrahimi 		/* Also: SLJIT_UNORDERED_OR_GREATER_EQUAL, SLJIT_ORDERED_LESS_EQUAL  */
3179*22dc650dSSadaf Ebrahimi 		if (!FAST_IS_REG(src2)) {
3180*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
3181*22dc650dSSadaf Ebrahimi 			src2 = TMP_FREG;
3182*22dc650dSSadaf Ebrahimi 		}
3183*22dc650dSSadaf Ebrahimi 
3184*22dc650dSSadaf Ebrahimi 		return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w);
3185*22dc650dSSadaf Ebrahimi 	}
3186*22dc650dSSadaf Ebrahimi 
3187*22dc650dSSadaf Ebrahimi 	if (!FAST_IS_REG(src1)) {
3188*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3189*22dc650dSSadaf Ebrahimi 		src1 = TMP_FREG;
3190*22dc650dSSadaf Ebrahimi 	}
3191*22dc650dSSadaf Ebrahimi 
3192*22dc650dSSadaf Ebrahimi 	return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w);
3193*22dc650dSSadaf Ebrahimi }
3194*22dc650dSSadaf Ebrahimi 
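/* Unary floating point operations. MOV is a plain load/store; the precision
   conversions use CVTPS2PD/CVTPD2PS; NEG and ABS xor or and the value with a
   sign mask, either built in a register with PCMPEQD plus a shift or taken
   from the sse2_buffer constants above. */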
3195*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
3196*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
3197*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
3198*22dc650dSSadaf Ebrahimi {
3199*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r;
3200*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3201*22dc650dSSadaf Ebrahimi 
3202*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3203*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3204*22dc650dSSadaf Ebrahimi #endif
3205*22dc650dSSadaf Ebrahimi 
3206*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3207*22dc650dSSadaf Ebrahimi 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
3208*22dc650dSSadaf Ebrahimi 
3209*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
3210*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(dst))
3211*22dc650dSSadaf Ebrahimi 			return emit_sse2_load(compiler, op & SLJIT_32, dst, src, srcw);
3212*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(src))
3213*22dc650dSSadaf Ebrahimi 			return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, src);
3214*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3215*22dc650dSSadaf Ebrahimi 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3216*22dc650dSSadaf Ebrahimi 	}
3217*22dc650dSSadaf Ebrahimi 
3218*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
3219*22dc650dSSadaf Ebrahimi 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
3220*22dc650dSSadaf Ebrahimi 		if (FAST_IS_REG(src)) {
3221*22dc650dSSadaf Ebrahimi 			/* We overwrite the high bits of the source register. From SLJIT's
3222*22dc650dSSadaf Ebrahimi 			   point of view this is not an issue.
3223*22dc650dSSadaf Ebrahimi 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
3224*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0));
3225*22dc650dSSadaf Ebrahimi 		} else {
3226*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_32), TMP_FREG, src, srcw));
3227*22dc650dSSadaf Ebrahimi 			src = TMP_FREG;
3228*22dc650dSSadaf Ebrahimi 		}
3229*22dc650dSSadaf Ebrahimi 
3230*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0));
3231*22dc650dSSadaf Ebrahimi 		if (dst_r == TMP_FREG)
3232*22dc650dSSadaf Ebrahimi 			return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3233*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3234*22dc650dSSadaf Ebrahimi 	}
3235*22dc650dSSadaf Ebrahimi 
3236*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst)) {
3237*22dc650dSSadaf Ebrahimi 		dst_r = (dst == src) ? TMP_FREG : dst;
3238*22dc650dSSadaf Ebrahimi 
3239*22dc650dSSadaf Ebrahimi 		if (src & SLJIT_MEM)
3240*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3241*22dc650dSSadaf Ebrahimi 
3242*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, PCMPEQD_x_xm | EX86_PREF_66 | EX86_SSE2, dst_r, dst_r, 0));
3243*22dc650dSSadaf Ebrahimi 
3244*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP2, 0, 0, dst_r, 0);
		FAIL_IF(!inst);
3245*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_0F;
3246*22dc650dSSadaf Ebrahimi 		/* Same as PSRLD_x / PSRLQ_x */
3247*22dc650dSSadaf Ebrahimi 		inst[1] = (op & SLJIT_32) ? PSLLD_x_i8 : PSLLQ_x_i8;
3248*22dc650dSSadaf Ebrahimi 
3249*22dc650dSSadaf Ebrahimi 		if (GET_OPCODE(op) == SLJIT_ABS_F64) {
3250*22dc650dSSadaf Ebrahimi 			inst[2] |= 2 << 3;
3251*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, 1));
3252*22dc650dSSadaf Ebrahimi 		} else {
3253*22dc650dSSadaf Ebrahimi 			inst[2] |= 6 << 3;
3254*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, ((op & SLJIT_32) ? 31 : 63)));
3255*22dc650dSSadaf Ebrahimi 		}
3256*22dc650dSSadaf Ebrahimi 
3257*22dc650dSSadaf Ebrahimi 		if (dst_r != TMP_FREG)
3258*22dc650dSSadaf Ebrahimi 			dst_r = (src & SLJIT_MEM) ? TMP_FREG : src;
3259*22dc650dSSadaf Ebrahimi 		return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_NEG_F64 ? XORPD_x_xm : ANDPD_x_xm) | EX86_SSE2, dst, dst_r, 0);
3260*22dc650dSSadaf Ebrahimi 	}
3261*22dc650dSSadaf Ebrahimi 
3262*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
3263*22dc650dSSadaf Ebrahimi 
3264*22dc650dSSadaf Ebrahimi 	switch (GET_OPCODE(op)) {
3265*22dc650dSSadaf Ebrahimi 	case SLJIT_NEG_F64:
3266*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, XORPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3267*22dc650dSSadaf Ebrahimi 		break;
3268*22dc650dSSadaf Ebrahimi 
3269*22dc650dSSadaf Ebrahimi 	case SLJIT_ABS_F64:
3270*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12)));
3271*22dc650dSSadaf Ebrahimi 		break;
3272*22dc650dSSadaf Ebrahimi 	}
3273*22dc650dSSadaf Ebrahimi 
3274*22dc650dSSadaf Ebrahimi 	return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3275*22dc650dSSadaf Ebrahimi }
3276*22dc650dSSadaf Ebrahimi 
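/* Binary floating point operations (ADDSD/SUBSD/MULSD/DIVSD and their single
   precision forms). The result is accumulated in dst when dst is a register;
   ADD and MUL are commutative, so dst == src2 is handled by swapping the
   operands, otherwise TMP_FREG is used and stored at the end. */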
3277*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
3278*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
3279*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
3280*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
3281*22dc650dSSadaf Ebrahimi {
3282*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r;
3283*22dc650dSSadaf Ebrahimi 
3284*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3285*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
3286*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(dst, dstw);
3287*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src1, src1w);
3288*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src2, src2w);
3289*22dc650dSSadaf Ebrahimi 
3290*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3291*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3292*22dc650dSSadaf Ebrahimi #endif
3293*22dc650dSSadaf Ebrahimi 
3294*22dc650dSSadaf Ebrahimi 	if (FAST_IS_REG(dst)) {
3295*22dc650dSSadaf Ebrahimi 		dst_r = dst;
3296*22dc650dSSadaf Ebrahimi 		if (dst == src1)
3297*22dc650dSSadaf Ebrahimi 			; /* Do nothing here. */
3298*22dc650dSSadaf Ebrahimi 		else if (dst == src2 && (GET_OPCODE(op) == SLJIT_ADD_F64 || GET_OPCODE(op) == SLJIT_MUL_F64)) {
3299*22dc650dSSadaf Ebrahimi 			/* Swap arguments. */
3300*22dc650dSSadaf Ebrahimi 			src2 = src1;
3301*22dc650dSSadaf Ebrahimi 			src2w = src1w;
3302*22dc650dSSadaf Ebrahimi 		} else if (dst != src2)
3303*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src1, src1w));
3304*22dc650dSSadaf Ebrahimi 		else {
3305*22dc650dSSadaf Ebrahimi 			dst_r = TMP_FREG;
3306*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3307*22dc650dSSadaf Ebrahimi 		}
3308*22dc650dSSadaf Ebrahimi 	} else {
3309*22dc650dSSadaf Ebrahimi 		dst_r = TMP_FREG;
3310*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3311*22dc650dSSadaf Ebrahimi 	}
3312*22dc650dSSadaf Ebrahimi 
3313*22dc650dSSadaf Ebrahimi 	switch (GET_OPCODE(op)) {
3314*22dc650dSSadaf Ebrahimi 	case SLJIT_ADD_F64:
3315*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3316*22dc650dSSadaf Ebrahimi 		break;
3317*22dc650dSSadaf Ebrahimi 
3318*22dc650dSSadaf Ebrahimi 	case SLJIT_SUB_F64:
3319*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, SUBSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3320*22dc650dSSadaf Ebrahimi 		break;
3321*22dc650dSSadaf Ebrahimi 
3322*22dc650dSSadaf Ebrahimi 	case SLJIT_MUL_F64:
3323*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, MULSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3324*22dc650dSSadaf Ebrahimi 		break;
3325*22dc650dSSadaf Ebrahimi 
3326*22dc650dSSadaf Ebrahimi 	case SLJIT_DIV_F64:
3327*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, DIVSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
3328*22dc650dSSadaf Ebrahimi 		break;
3329*22dc650dSSadaf Ebrahimi 	}
3330*22dc650dSSadaf Ebrahimi 
3331*22dc650dSSadaf Ebrahimi 	if (dst_r != dst)
3332*22dc650dSSadaf Ebrahimi 		return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
3333*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3334*22dc650dSSadaf Ebrahimi }
3335*22dc650dSSadaf Ebrahimi 
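/* SLJIT_COPYSIGN: in effect dst = src1 ^ ((src1 ^ src2) & sign_mask), i.e.
   the magnitude of src1 combined with the sign of src2, using the sign masks
   prepared in sse2_buffer. */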
3336*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op,
3337*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_freg,
3338*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
3339*22dc650dSSadaf Ebrahimi 	sljit_s32 src2, sljit_sw src2w)
3340*22dc650dSSadaf Ebrahimi {
3341*22dc650dSSadaf Ebrahimi 	sljit_uw pref;
3342*22dc650dSSadaf Ebrahimi 
3343*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3344*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_fop2r(compiler, op, dst_freg, src1, src1w, src2, src2w));
3345*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src1, src1w);
3346*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src2, src2w);
3347*22dc650dSSadaf Ebrahimi 
3348*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3349*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3350*22dc650dSSadaf Ebrahimi #endif
3351*22dc650dSSadaf Ebrahimi 
3352*22dc650dSSadaf Ebrahimi 	if (dst_freg == src1) {
3353*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
3354*22dc650dSSadaf Ebrahimi 		pref = EX86_SELECT_66(op) | EX86_SSE2;
3355*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, TMP_FREG, src1, src1w));
3356*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3357*22dc650dSSadaf Ebrahimi 		return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, TMP_FREG, 0);
3358*22dc650dSSadaf Ebrahimi 	}
3359*22dc650dSSadaf Ebrahimi 
3360*22dc650dSSadaf Ebrahimi 	if (src1 & SLJIT_MEM) {
3361*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
3362*22dc650dSSadaf Ebrahimi 		src1 = TMP_FREG;
3363*22dc650dSSadaf Ebrahimi 		src1w = 0;
3364*22dc650dSSadaf Ebrahimi 	}
3365*22dc650dSSadaf Ebrahimi 
3366*22dc650dSSadaf Ebrahimi 	if (dst_freg != src2)
3367*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_freg, src2, src2w));
3368*22dc650dSSadaf Ebrahimi 
3369*22dc650dSSadaf Ebrahimi 	pref = EX86_SELECT_66(op) | EX86_SSE2;
3370*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w));
3371*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
3372*22dc650dSSadaf Ebrahimi 	return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w);
3373*22dc650dSSadaf Ebrahimi }
3374*22dc650dSSadaf Ebrahimi 
3375*22dc650dSSadaf Ebrahimi /* --------------------------------------------------------------------- */
3376*22dc650dSSadaf Ebrahimi /*  Conditional instructions                                             */
3377*22dc650dSSadaf Ebrahimi /* --------------------------------------------------------------------- */
3378*22dc650dSSadaf Ebrahimi 
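/* Labels and jumps only record bookkeeping here: a one-byte placeholder
   (SLJIT_INST_LABEL / SLJIT_INST_JUMP) goes into the code buffer and the
   worst case instruction size is reserved; the real branches are encoded
   later, when the code is generated. */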
3379*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
3380*22dc650dSSadaf Ebrahimi {
3381*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3382*22dc650dSSadaf Ebrahimi 	struct sljit_label *label;
3383*22dc650dSSadaf Ebrahimi 
3384*22dc650dSSadaf Ebrahimi 	CHECK_ERROR_PTR();
3385*22dc650dSSadaf Ebrahimi 	CHECK_PTR(check_sljit_emit_label(compiler));
3386*22dc650dSSadaf Ebrahimi 
3387*22dc650dSSadaf Ebrahimi 	if (compiler->last_label && compiler->last_label->size == compiler->size)
3388*22dc650dSSadaf Ebrahimi 		return compiler->last_label;
3389*22dc650dSSadaf Ebrahimi 
3390*22dc650dSSadaf Ebrahimi 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
3391*22dc650dSSadaf Ebrahimi 	PTR_FAIL_IF(!label);
3392*22dc650dSSadaf Ebrahimi 	set_label(label, compiler);
3393*22dc650dSSadaf Ebrahimi 
3394*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1);
3395*22dc650dSSadaf Ebrahimi 	PTR_FAIL_IF(!inst);
3396*22dc650dSSadaf Ebrahimi 	inst[0] = SLJIT_INST_LABEL;
3397*22dc650dSSadaf Ebrahimi 
3398*22dc650dSSadaf Ebrahimi 	return label;
3399*22dc650dSSadaf Ebrahimi }
3400*22dc650dSSadaf Ebrahimi 
3401*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
3402*22dc650dSSadaf Ebrahimi {
3403*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3404*22dc650dSSadaf Ebrahimi 	struct sljit_jump *jump;
3405*22dc650dSSadaf Ebrahimi 
3406*22dc650dSSadaf Ebrahimi 	CHECK_ERROR_PTR();
3407*22dc650dSSadaf Ebrahimi 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
3408*22dc650dSSadaf Ebrahimi 
3409*22dc650dSSadaf Ebrahimi 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
3410*22dc650dSSadaf Ebrahimi 	PTR_FAIL_IF_NULL(jump);
3411*22dc650dSSadaf Ebrahimi 	set_jump(jump, compiler, (sljit_u32)((type & SLJIT_REWRITABLE_JUMP) | ((type & 0xff) << TYPE_SHIFT)));
3412*22dc650dSSadaf Ebrahimi 	type &= 0xff;
3413*22dc650dSSadaf Ebrahimi 
3414*22dc650dSSadaf Ebrahimi 	jump->addr = compiler->size;
3415*22dc650dSSadaf Ebrahimi 	/* Worst case size. */
3416*22dc650dSSadaf Ebrahimi 	compiler->size += (type >= SLJIT_JUMP) ? JUMP_MAX_SIZE : CJUMP_MAX_SIZE;
3417*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1);
3418*22dc650dSSadaf Ebrahimi 	PTR_FAIL_IF_NULL(inst);
3419*22dc650dSSadaf Ebrahimi 
3420*22dc650dSSadaf Ebrahimi 	inst[0] = SLJIT_INST_JUMP;
3421*22dc650dSSadaf Ebrahimi 	return jump;
3422*22dc650dSSadaf Ebrahimi }
3423*22dc650dSSadaf Ebrahimi 
3424*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
3425*22dc650dSSadaf Ebrahimi {
3426*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3427*22dc650dSSadaf Ebrahimi 	struct sljit_jump *jump;
3428*22dc650dSSadaf Ebrahimi 
3429*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3430*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
3431*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src, srcw);
3432*22dc650dSSadaf Ebrahimi 
3433*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src, srcw, (void)0);
3434*22dc650dSSadaf Ebrahimi 
3435*22dc650dSSadaf Ebrahimi 	if (src == SLJIT_IMM) {
3436*22dc650dSSadaf Ebrahimi 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
3437*22dc650dSSadaf Ebrahimi 		FAIL_IF_NULL(jump);
3438*22dc650dSSadaf Ebrahimi 		set_jump(jump, compiler, (sljit_u32)(JUMP_ADDR | (type << TYPE_SHIFT)));
3439*22dc650dSSadaf Ebrahimi 		jump->u.target = (sljit_uw)srcw;
3440*22dc650dSSadaf Ebrahimi 
3441*22dc650dSSadaf Ebrahimi 		jump->addr = compiler->size;
3442*22dc650dSSadaf Ebrahimi 		/* Worst case size. */
3443*22dc650dSSadaf Ebrahimi 		compiler->size += JUMP_MAX_SIZE;
3444*22dc650dSSadaf Ebrahimi 		inst = (sljit_u8*)ensure_buf(compiler, 1);
3445*22dc650dSSadaf Ebrahimi 		FAIL_IF_NULL(inst);
3446*22dc650dSSadaf Ebrahimi 
3447*22dc650dSSadaf Ebrahimi 		inst[0] = SLJIT_INST_JUMP;
3448*22dc650dSSadaf Ebrahimi 	} else {
3449*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3450*22dc650dSSadaf Ebrahimi 		/* REX_W is not necessary (src is not immediate). */
3451*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 1;
3452*22dc650dSSadaf Ebrahimi #endif
3453*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
3454*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
3455*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_FF;
3456*22dc650dSSadaf Ebrahimi 		inst[1] = U8(inst[1] | ((type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm));
3457*22dc650dSSadaf Ebrahimi 	}
3458*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3459*22dc650dSSadaf Ebrahimi }
3460*22dc650dSSadaf Ebrahimi 
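/* Materializes a condition flag as a 0/1 value: SETcc into a byte register
   followed by MOVZX, or OR-ed directly into dst for SLJIT_OR when the flags
   of the OR itself are not needed. On x86-32 only registers with an
   addressable low byte can take SETcc directly, hence the reg_map checks. */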
3461*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
3462*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw,
3463*22dc650dSSadaf Ebrahimi 	sljit_s32 type)
3464*22dc650dSSadaf Ebrahimi {
3465*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3466*22dc650dSSadaf Ebrahimi 	sljit_u8 cond_set;
3467*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3468*22dc650dSSadaf Ebrahimi 	sljit_s32 reg;
3469*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3470*22dc650dSSadaf Ebrahimi 	/* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
3471*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_save = dst;
3472*22dc650dSSadaf Ebrahimi 	sljit_sw dstw_save = dstw;
3473*22dc650dSSadaf Ebrahimi 
3474*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3475*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
3476*22dc650dSSadaf Ebrahimi 
3477*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(dst, dstw);
3478*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
3479*22dc650dSSadaf Ebrahimi 
3480*22dc650dSSadaf Ebrahimi 	/* setcc = jcc + 0x10. */
3481*22dc650dSSadaf Ebrahimi 	cond_set = U8(get_jump_code((sljit_uw)type) + 0x10);
3482*22dc650dSSadaf Ebrahimi 
3483*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3484*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) {
3485*22dc650dSSadaf Ebrahimi 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
3486*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
3487*22dc650dSSadaf Ebrahimi 		INC_SIZE(4 + 3);
3488*22dc650dSSadaf Ebrahimi 		/* Set low register to conditional flag. */
3489*22dc650dSSadaf Ebrahimi 		inst[0] = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
3490*22dc650dSSadaf Ebrahimi 		inst[1] = GROUP_0F;
3491*22dc650dSSadaf Ebrahimi 		inst[2] = cond_set;
3492*22dc650dSSadaf Ebrahimi 		inst[3] = MOD_REG | reg_lmap[TMP_REG1];
3493*22dc650dSSadaf Ebrahimi 		inst[4] = U8(REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B));
3494*22dc650dSSadaf Ebrahimi 		inst[5] = OR_rm8_r8;
3495*22dc650dSSadaf Ebrahimi 		inst[6] = U8(MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst]);
3496*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3497*22dc650dSSadaf Ebrahimi 	}
3498*22dc650dSSadaf Ebrahimi 
3499*22dc650dSSadaf Ebrahimi 	reg = (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG1;
3500*22dc650dSSadaf Ebrahimi 
3501*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
3502*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
3503*22dc650dSSadaf Ebrahimi 	INC_SIZE(4 + 4);
3504*22dc650dSSadaf Ebrahimi 	/* Set low register to conditional flag. */
3505*22dc650dSSadaf Ebrahimi 	inst[0] = (reg_map[reg] <= 7) ? REX : REX_B;
3506*22dc650dSSadaf Ebrahimi 	inst[1] = GROUP_0F;
3507*22dc650dSSadaf Ebrahimi 	inst[2] = cond_set;
3508*22dc650dSSadaf Ebrahimi 	inst[3] = MOD_REG | reg_lmap[reg];
3509*22dc650dSSadaf Ebrahimi 	inst[4] = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
3510*22dc650dSSadaf Ebrahimi 	/* The movzx instruction does not affect flags. */
3511*22dc650dSSadaf Ebrahimi 	inst[5] = GROUP_0F;
3512*22dc650dSSadaf Ebrahimi 	inst[6] = MOVZX_r_rm8;
3513*22dc650dSSadaf Ebrahimi 	inst[7] = U8(MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg]);
3514*22dc650dSSadaf Ebrahimi 
3515*22dc650dSSadaf Ebrahimi 	if (reg != TMP_REG1)
3516*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3517*22dc650dSSadaf Ebrahimi 
3518*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) < SLJIT_ADD) {
3519*22dc650dSSadaf Ebrahimi 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
3520*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3521*22dc650dSSadaf Ebrahimi 	}
3522*22dc650dSSadaf Ebrahimi 
3523*22dc650dSSadaf Ebrahimi 	SLJIT_SKIP_CHECKS(compiler);
3524*22dc650dSSadaf Ebrahimi 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
3525*22dc650dSSadaf Ebrahimi 
3526*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
3527*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(reg_map[TMP_REG1] < 4);
3528*22dc650dSSadaf Ebrahimi 
3529*22dc650dSSadaf Ebrahimi 	/* The SLJIT_CONFIG_X86_32 code path starts here. */
3530*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
3531*22dc650dSSadaf Ebrahimi 		/* Low byte is accessible. */
3532*22dc650dSSadaf Ebrahimi 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
3533*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
3534*22dc650dSSadaf Ebrahimi 		INC_SIZE(3 + 3);
3535*22dc650dSSadaf Ebrahimi 		/* Set low byte to conditional flag. */
3536*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_0F;
3537*22dc650dSSadaf Ebrahimi 		inst[1] = cond_set;
3538*22dc650dSSadaf Ebrahimi 		inst[2] = U8(MOD_REG | reg_map[dst]);
3539*22dc650dSSadaf Ebrahimi 
3540*22dc650dSSadaf Ebrahimi 		inst[3] = GROUP_0F;
3541*22dc650dSSadaf Ebrahimi 		inst[4] = MOVZX_r_rm8;
3542*22dc650dSSadaf Ebrahimi 		inst[5] = U8(MOD_REG | (reg_map[dst] << 3) | reg_map[dst]);
3543*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3544*22dc650dSSadaf Ebrahimi 	}
3545*22dc650dSSadaf Ebrahimi 
3546*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
3547*22dc650dSSadaf Ebrahimi 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 2);
3548*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
3549*22dc650dSSadaf Ebrahimi 		INC_SIZE(3 + 2);
3550*22dc650dSSadaf Ebrahimi 
3551*22dc650dSSadaf Ebrahimi 		/* Set low byte to conditional flag. */
3552*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_0F;
3553*22dc650dSSadaf Ebrahimi 		inst[1] = cond_set;
3554*22dc650dSSadaf Ebrahimi 		inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
3555*22dc650dSSadaf Ebrahimi 
3556*22dc650dSSadaf Ebrahimi 		inst[3] = OR_rm8_r8;
3557*22dc650dSSadaf Ebrahimi 		inst[4] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[dst]);
3558*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3559*22dc650dSSadaf Ebrahimi 	}
3560*22dc650dSSadaf Ebrahimi 
3561*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
3562*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
3563*22dc650dSSadaf Ebrahimi 	INC_SIZE(3 + 3);
3564*22dc650dSSadaf Ebrahimi 	/* Set low byte to conditional flag. */
3565*22dc650dSSadaf Ebrahimi 	inst[0] = GROUP_0F;
3566*22dc650dSSadaf Ebrahimi 	inst[1] = cond_set;
3567*22dc650dSSadaf Ebrahimi 	inst[2] = U8(MOD_REG | reg_map[TMP_REG1]);
3568*22dc650dSSadaf Ebrahimi 
3569*22dc650dSSadaf Ebrahimi 	inst[3] = GROUP_0F;
3570*22dc650dSSadaf Ebrahimi 	inst[4] = MOVZX_r_rm8;
3571*22dc650dSSadaf Ebrahimi 	inst[5] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[TMP_REG1]);
3572*22dc650dSSadaf Ebrahimi 
3573*22dc650dSSadaf Ebrahimi 	if (GET_OPCODE(op) < SLJIT_ADD)
3574*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
3575*22dc650dSSadaf Ebrahimi 
3576*22dc650dSSadaf Ebrahimi 	SLJIT_SKIP_CHECKS(compiler);
3577*22dc650dSSadaf Ebrahimi 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
3578*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3579*22dc650dSSadaf Ebrahimi }
3580*22dc650dSSadaf Ebrahimi 
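/* Floating point conditional move. There is no FP cmov, so dst is preloaded
   with src2 and a short jump with the inverted condition skips the load of
   src1; inst[1] is patched afterwards with the number of bytes to skip. */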
3581*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fselect(struct sljit_compiler *compiler, sljit_s32 type,
3582*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_freg,
3583*22dc650dSSadaf Ebrahimi 	sljit_s32 src1, sljit_sw src1w,
3584*22dc650dSSadaf Ebrahimi 	sljit_s32 src2_freg)
3585*22dc650dSSadaf Ebrahimi {
3586*22dc650dSSadaf Ebrahimi 	sljit_u8* inst;
3587*22dc650dSSadaf Ebrahimi 	sljit_uw size;
3588*22dc650dSSadaf Ebrahimi 
3589*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3590*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_fselect(compiler, type, dst_freg, src1, src1w, src2_freg));
3591*22dc650dSSadaf Ebrahimi 
3592*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src1, src1w);
3593*22dc650dSSadaf Ebrahimi 
3594*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3595*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3596*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3597*22dc650dSSadaf Ebrahimi 
3598*22dc650dSSadaf Ebrahimi 	if (dst_freg != src2_freg) {
3599*22dc650dSSadaf Ebrahimi 		if (dst_freg == src1) {
3600*22dc650dSSadaf Ebrahimi 			src1 = src2_freg;
3601*22dc650dSSadaf Ebrahimi 			src1w = 0;
3602*22dc650dSSadaf Ebrahimi 			type ^= 0x1;
3603*22dc650dSSadaf Ebrahimi 		} else
3604*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_sse2_load(compiler, type & SLJIT_32, dst_freg, src2_freg, 0));
3605*22dc650dSSadaf Ebrahimi 	}
3606*22dc650dSSadaf Ebrahimi 
3607*22dc650dSSadaf Ebrahimi 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
3608*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
3609*22dc650dSSadaf Ebrahimi 	INC_SIZE(2);
3610*22dc650dSSadaf Ebrahimi 	inst[0] = U8(get_jump_code((sljit_uw)(type & ~SLJIT_32) ^ 0x1) - 0x10);
3611*22dc650dSSadaf Ebrahimi 
3612*22dc650dSSadaf Ebrahimi 	size = compiler->size;
3613*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_sse2_load(compiler, type & SLJIT_32, dst_freg, src1, src1w));
3614*22dc650dSSadaf Ebrahimi 
3615*22dc650dSSadaf Ebrahimi 	inst[1] = U8(compiler->size - size);
3616*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
3617*22dc650dSSadaf Ebrahimi }
3618*22dc650dSSadaf Ebrahimi 
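/* SIMD register/memory moves. reg_size 4 selects 16 byte (XMM) moves,
   reg_size 5 selects 32 byte (YMM) moves and requires AVX2. Aligned accesses
   use MOVAPS/MOVDQA style encodings, unaligned ones MOVUPS/MOVDQU; the VEX
   encoding is used for 256-bit moves and, when SLJIT_ENTER_USE_VEX is set,
   for 128-bit ones as well. */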
3619*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
3620*22dc650dSSadaf Ebrahimi 	sljit_s32 freg,
3621*22dc650dSSadaf Ebrahimi 	sljit_s32 srcdst, sljit_sw srcdstw)
3622*22dc650dSSadaf Ebrahimi {
3623*22dc650dSSadaf Ebrahimi 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3624*22dc650dSSadaf Ebrahimi 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3625*22dc650dSSadaf Ebrahimi 	sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);
3626*22dc650dSSadaf Ebrahimi 	sljit_uw op;
3627*22dc650dSSadaf Ebrahimi 
3628*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3629*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw));
3630*22dc650dSSadaf Ebrahimi 
3631*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
3632*22dc650dSSadaf Ebrahimi 
3633*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3634*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3635*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3636*22dc650dSSadaf Ebrahimi 
3637*22dc650dSSadaf Ebrahimi 	switch (reg_size) {
3638*22dc650dSSadaf Ebrahimi 	case 4:
3639*22dc650dSSadaf Ebrahimi 		op = EX86_SSE2;
3640*22dc650dSSadaf Ebrahimi 		break;
3641*22dc650dSSadaf Ebrahimi 	case 5:
3642*22dc650dSSadaf Ebrahimi 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
3643*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
3644*22dc650dSSadaf Ebrahimi 		op = EX86_SSE2 | VEX_256;
3645*22dc650dSSadaf Ebrahimi 		break;
3646*22dc650dSSadaf Ebrahimi 	default:
3647*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
3648*22dc650dSSadaf Ebrahimi 	}
3649*22dc650dSSadaf Ebrahimi 
3650*22dc650dSSadaf Ebrahimi 	if (!(srcdst & SLJIT_MEM))
3651*22dc650dSSadaf Ebrahimi 		alignment = reg_size;
3652*22dc650dSSadaf Ebrahimi 
3653*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_FLOAT) {
3654*22dc650dSSadaf Ebrahimi 		if (elem_size == 2 || elem_size == 3) {
3655*22dc650dSSadaf Ebrahimi 			op |= alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm;
3656*22dc650dSSadaf Ebrahimi 
3657*22dc650dSSadaf Ebrahimi 			if (elem_size == 3)
3658*22dc650dSSadaf Ebrahimi 				op |= EX86_PREF_66;
3659*22dc650dSSadaf Ebrahimi 
3660*22dc650dSSadaf Ebrahimi 			if (type & SLJIT_SIMD_STORE)
3661*22dc650dSSadaf Ebrahimi 				op += 1;
3662*22dc650dSSadaf Ebrahimi 		} else
3663*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
3664*22dc650dSSadaf Ebrahimi 	} else {
3665*22dc650dSSadaf Ebrahimi 		op |= ((type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm)
3666*22dc650dSSadaf Ebrahimi 			| (alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3);
3667*22dc650dSSadaf Ebrahimi 	}
3668*22dc650dSSadaf Ebrahimi 
3669*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_TEST)
3670*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3671*22dc650dSSadaf Ebrahimi 
3672*22dc650dSSadaf Ebrahimi 	if ((op & VEX_256) || ((cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX)))
3673*22dc650dSSadaf Ebrahimi 		return emit_vex_instruction(compiler, op, freg, 0, srcdst, srcdstw);
3674*22dc650dSSadaf Ebrahimi 
3675*22dc650dSSadaf Ebrahimi 	return emit_groupf(compiler, op, freg, srcdst, srcdstw);
3676*22dc650dSSadaf Ebrahimi }
3677*22dc650dSSadaf Ebrahimi 
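/* Broadcasts a scalar (register, memory or immediate) into every lane. With
   AVX2 the VPBROADCAST / VBROADCASTS forms are used; otherwise the splat is
   built from SSE shuffles (SHUFPS, MOVDDUP, PSHUFLW/PSHUFD, or PSHUFB with a
   zero mask for bytes). Immediates of 0 and -1 become PXOR / PCMPEQD. */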
3678*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type,
3679*22dc650dSSadaf Ebrahimi 	sljit_s32 freg,
3680*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
3681*22dc650dSSadaf Ebrahimi {
3682*22dc650dSSadaf Ebrahimi 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3683*22dc650dSSadaf Ebrahimi 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3684*22dc650dSSadaf Ebrahimi 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
3685*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3686*22dc650dSSadaf Ebrahimi 	sljit_u8 opcode = 0;
3687*22dc650dSSadaf Ebrahimi 	sljit_uw op;
3688*22dc650dSSadaf Ebrahimi 
3689*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3690*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw));
3691*22dc650dSSadaf Ebrahimi 
3692*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src, srcw);
3693*22dc650dSSadaf Ebrahimi 
3694*22dc650dSSadaf Ebrahimi 	if (!(type & SLJIT_SIMD_FLOAT)) {
3695*22dc650dSSadaf Ebrahimi 		CHECK_EXTRA_REGS(src, srcw, (void)0);
3696*22dc650dSSadaf Ebrahimi 	}
3697*22dc650dSSadaf Ebrahimi 
3698*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3699*22dc650dSSadaf Ebrahimi 	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : (elem_size > 2))
3700*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
3701*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
3702*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3703*22dc650dSSadaf Ebrahimi 
3704*22dc650dSSadaf Ebrahimi 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
3705*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
3706*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
3707*22dc650dSSadaf Ebrahimi 
3708*22dc650dSSadaf Ebrahimi 	if (reg_size != 4 && (reg_size != 5 || !(cpu_feature_list & CPU_FEATURE_AVX2)))
3709*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
3710*22dc650dSSadaf Ebrahimi 
3711*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_TEST)
3712*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3713*22dc650dSSadaf Ebrahimi 
3714*22dc650dSSadaf Ebrahimi 	if (reg_size == 5)
3715*22dc650dSSadaf Ebrahimi 		use_vex = 1;
3716*22dc650dSSadaf Ebrahimi 
3717*22dc650dSSadaf Ebrahimi 	if (use_vex && src != SLJIT_IMM) {
3718*22dc650dSSadaf Ebrahimi 		op = 0;
3719*22dc650dSSadaf Ebrahimi 
3720*22dc650dSSadaf Ebrahimi 		switch (elem_size) {
3721*22dc650dSSadaf Ebrahimi 		case 0:
3722*22dc650dSSadaf Ebrahimi 			if (cpu_feature_list & CPU_FEATURE_AVX2)
3723*22dc650dSSadaf Ebrahimi 				op = VPBROADCASTB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3724*22dc650dSSadaf Ebrahimi 			break;
3725*22dc650dSSadaf Ebrahimi 		case 1:
3726*22dc650dSSadaf Ebrahimi 			if (cpu_feature_list & CPU_FEATURE_AVX2)
3727*22dc650dSSadaf Ebrahimi 				op = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3728*22dc650dSSadaf Ebrahimi 			break;
3729*22dc650dSSadaf Ebrahimi 		case 2:
3730*22dc650dSSadaf Ebrahimi 			if (type & SLJIT_SIMD_FLOAT) {
3731*22dc650dSSadaf Ebrahimi 				if ((cpu_feature_list & CPU_FEATURE_AVX2) || ((cpu_feature_list & CPU_FEATURE_AVX) && (src & SLJIT_MEM)))
3732*22dc650dSSadaf Ebrahimi 					op = VBROADCASTSS_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3733*22dc650dSSadaf Ebrahimi 			} else if (cpu_feature_list & CPU_FEATURE_AVX2)
3734*22dc650dSSadaf Ebrahimi 				op = VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3735*22dc650dSSadaf Ebrahimi 			break;
3736*22dc650dSSadaf Ebrahimi 		default:
3737*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3738*22dc650dSSadaf Ebrahimi 			if (!(type & SLJIT_SIMD_FLOAT)) {
3739*22dc650dSSadaf Ebrahimi 				if (cpu_feature_list & CPU_FEATURE_AVX2)
3740*22dc650dSSadaf Ebrahimi 					op = VPBROADCASTQ_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3741*22dc650dSSadaf Ebrahimi 				break;
3742*22dc650dSSadaf Ebrahimi 			}
3743*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3744*22dc650dSSadaf Ebrahimi 
3745*22dc650dSSadaf Ebrahimi 			if (reg_size == 5)
3746*22dc650dSSadaf Ebrahimi 				op = VBROADCASTSD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
3747*22dc650dSSadaf Ebrahimi 			break;
3748*22dc650dSSadaf Ebrahimi 		}
3749*22dc650dSSadaf Ebrahimi 
3750*22dc650dSSadaf Ebrahimi 		if (op != 0) {
3751*22dc650dSSadaf Ebrahimi 			if (!(src & SLJIT_MEM) && !(type & SLJIT_SIMD_FLOAT)) {
3752*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3753*22dc650dSSadaf Ebrahimi 				if (elem_size >= 3)
3754*22dc650dSSadaf Ebrahimi 					compiler->mode32 = 0;
3755*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3756*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw));
3757*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3758*22dc650dSSadaf Ebrahimi 				compiler->mode32 = 1;
3759*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3760*22dc650dSSadaf Ebrahimi 				src = freg;
3761*22dc650dSSadaf Ebrahimi 				srcw = 0;
3762*22dc650dSSadaf Ebrahimi 			}
3763*22dc650dSSadaf Ebrahimi 
3764*22dc650dSSadaf Ebrahimi 			if (reg_size == 5)
3765*22dc650dSSadaf Ebrahimi 				op |= VEX_256;
3766*22dc650dSSadaf Ebrahimi 
3767*22dc650dSSadaf Ebrahimi 			return emit_vex_instruction(compiler, op, freg, 0, src, srcw);
3768*22dc650dSSadaf Ebrahimi 		}
3769*22dc650dSSadaf Ebrahimi 	}
3770*22dc650dSSadaf Ebrahimi 
3771*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_FLOAT) {
3772*22dc650dSSadaf Ebrahimi 		if (src == SLJIT_IMM) {
3773*22dc650dSSadaf Ebrahimi 			if (use_vex)
3774*22dc650dSSadaf Ebrahimi 				return emit_vex_instruction(compiler, XORPD_x_xm | (reg_size == 5 ? VEX_256 : 0) | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
3775*22dc650dSSadaf Ebrahimi 
3776*22dc650dSSadaf Ebrahimi 			return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0);
3777*22dc650dSSadaf Ebrahimi 		}
3778*22dc650dSSadaf Ebrahimi 
3779*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(reg_size == 4);
3780*22dc650dSSadaf Ebrahimi 
3781*22dc650dSSadaf Ebrahimi 		if (use_vex) {
3782*22dc650dSSadaf Ebrahimi 			if (elem_size == 3)
3783*22dc650dSSadaf Ebrahimi 				return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, src, srcw);
3784*22dc650dSSadaf Ebrahimi 
3785*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(!(src & SLJIT_MEM));
3786*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
3787*22dc650dSSadaf Ebrahimi 			return emit_byte(compiler, 0);
3788*22dc650dSSadaf Ebrahimi 		}
3789*22dc650dSSadaf Ebrahimi 
3790*22dc650dSSadaf Ebrahimi 		if (elem_size == 2 && freg != src) {
3791*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_sse2_load(compiler, 1, freg, src, srcw));
3792*22dc650dSSadaf Ebrahimi 			src = freg;
3793*22dc650dSSadaf Ebrahimi 			srcw = 0;
3794*22dc650dSSadaf Ebrahimi 		}
3795*22dc650dSSadaf Ebrahimi 
3796*22dc650dSSadaf Ebrahimi 		op = (elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm) | (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2;
3797*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, op, freg, src, srcw));
3798*22dc650dSSadaf Ebrahimi 
3799*22dc650dSSadaf Ebrahimi 		if (elem_size == 2)
3800*22dc650dSSadaf Ebrahimi 			return emit_byte(compiler, 0);
3801*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3802*22dc650dSSadaf Ebrahimi 	}
3803*22dc650dSSadaf Ebrahimi 
3804*22dc650dSSadaf Ebrahimi 	if (src == SLJIT_IMM) {
3805*22dc650dSSadaf Ebrahimi 		if (elem_size == 0) {
3806*22dc650dSSadaf Ebrahimi 			srcw = (sljit_u8)srcw;
3807*22dc650dSSadaf Ebrahimi 			srcw |= srcw << 8;
3808*22dc650dSSadaf Ebrahimi 			srcw |= srcw << 16;
3809*22dc650dSSadaf Ebrahimi 			elem_size = 2;
3810*22dc650dSSadaf Ebrahimi 		} else if (elem_size == 1) {
3811*22dc650dSSadaf Ebrahimi 			srcw = (sljit_u16)srcw;
3812*22dc650dSSadaf Ebrahimi 			srcw |= srcw << 16;
3813*22dc650dSSadaf Ebrahimi 			elem_size = 2;
3814*22dc650dSSadaf Ebrahimi 		}
3815*22dc650dSSadaf Ebrahimi 
3816*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3817*22dc650dSSadaf Ebrahimi 		if (elem_size == 2 && (sljit_s32)srcw == -1)
3818*22dc650dSSadaf Ebrahimi 			srcw = -1;
3819*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3820*22dc650dSSadaf Ebrahimi 
3821*22dc650dSSadaf Ebrahimi 		if (srcw == 0 || srcw == -1) {
3822*22dc650dSSadaf Ebrahimi 			if (use_vex)
3823*22dc650dSSadaf Ebrahimi 				return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
3824*22dc650dSSadaf Ebrahimi 
3825*22dc650dSSadaf Ebrahimi 			return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
3826*22dc650dSSadaf Ebrahimi 		}
3827*22dc650dSSadaf Ebrahimi 
3828*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3829*22dc650dSSadaf Ebrahimi 		if (elem_size == 3)
3830*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
3831*22dc650dSSadaf Ebrahimi 		else
3832*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3833*22dc650dSSadaf Ebrahimi 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
3834*22dc650dSSadaf Ebrahimi 
3835*22dc650dSSadaf Ebrahimi 		src = TMP_REG1;
3836*22dc650dSSadaf Ebrahimi 		srcw = 0;
3837*22dc650dSSadaf Ebrahimi 
3838*22dc650dSSadaf Ebrahimi 	}
3839*22dc650dSSadaf Ebrahimi 
3840*22dc650dSSadaf Ebrahimi 	op = 2;
3841*22dc650dSSadaf Ebrahimi 	opcode = MOVD_x_rm;
3842*22dc650dSSadaf Ebrahimi 
3843*22dc650dSSadaf Ebrahimi 	switch (elem_size) {
3844*22dc650dSSadaf Ebrahimi 	case 0:
3845*22dc650dSSadaf Ebrahimi 		if (!FAST_IS_REG(src)) {
3846*22dc650dSSadaf Ebrahimi 			opcode = 0x3a /* Prefix of PINSRB_x_rm_i8. */;
3847*22dc650dSSadaf Ebrahimi 			op = 3;
3848*22dc650dSSadaf Ebrahimi 		}
3849*22dc650dSSadaf Ebrahimi 		break;
3850*22dc650dSSadaf Ebrahimi 	case 1:
3851*22dc650dSSadaf Ebrahimi 		if (!FAST_IS_REG(src))
3852*22dc650dSSadaf Ebrahimi 			opcode = PINSRW_x_rm_i8;
3853*22dc650dSSadaf Ebrahimi 		break;
3854*22dc650dSSadaf Ebrahimi 	case 2:
3855*22dc650dSSadaf Ebrahimi 		break;
3856*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3857*22dc650dSSadaf Ebrahimi 	case 3:
3858*22dc650dSSadaf Ebrahimi 		/* MOVQ */
3859*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
3860*22dc650dSSadaf Ebrahimi 		break;
3861*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3862*22dc650dSSadaf Ebrahimi 	}
3863*22dc650dSSadaf Ebrahimi 
3864*22dc650dSSadaf Ebrahimi 	if (use_vex) {
3865*22dc650dSSadaf Ebrahimi 		if (opcode != MOVD_x_rm) {
3866*22dc650dSSadaf Ebrahimi 			op = (opcode == 0x3a) ? (PINSRB_x_rm_i8 | VEX_OP_0F3A) : opcode;
3867*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1 | VEX_SSE2_OPV, freg, freg, src, srcw));
3868*22dc650dSSadaf Ebrahimi 		} else
3869*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw));
3870*22dc650dSSadaf Ebrahimi 	} else {
3871*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw);
3872*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
3873*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_0F;
3874*22dc650dSSadaf Ebrahimi 		inst[1] = opcode;
3875*22dc650dSSadaf Ebrahimi 
3876*22dc650dSSadaf Ebrahimi 		if (op == 3) {
3877*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(opcode == 0x3a);
3878*22dc650dSSadaf Ebrahimi 			inst[2] = PINSRB_x_rm_i8;
3879*22dc650dSSadaf Ebrahimi 		}
3880*22dc650dSSadaf Ebrahimi 	}
3881*22dc650dSSadaf Ebrahimi 
3882*22dc650dSSadaf Ebrahimi 	if (use_vex && elem_size >= 2) {
3883*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3884*22dc650dSSadaf Ebrahimi 		op = VPBROADCASTD_x_xm;
3885*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
3886*22dc650dSSadaf Ebrahimi 		op = (elem_size == 3) ? VPBROADCASTQ_x_xm : VPBROADCASTD_x_xm;
3887*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
3888*22dc650dSSadaf Ebrahimi 		return emit_vex_instruction(compiler, op | ((reg_size == 5) ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
3889*22dc650dSSadaf Ebrahimi 	}
3890*22dc650dSSadaf Ebrahimi 
3891*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(reg_size == 4);
3892*22dc650dSSadaf Ebrahimi 
3893*22dc650dSSadaf Ebrahimi 	if (opcode != MOVD_x_rm)
3894*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_byte(compiler, 0));
3895*22dc650dSSadaf Ebrahimi 
3896*22dc650dSSadaf Ebrahimi 	switch (elem_size) {
3897*22dc650dSSadaf Ebrahimi 	case 0:
3898*22dc650dSSadaf Ebrahimi 		if (use_vex) {
3899*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, TMP_FREG, 0));
3900*22dc650dSSadaf Ebrahimi 			return emit_vex_instruction(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, TMP_FREG, 0);
3901*22dc650dSSadaf Ebrahimi 		}
3902*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
3903*22dc650dSSadaf Ebrahimi 		return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
3904*22dc650dSSadaf Ebrahimi 	case 1:
3905*22dc650dSSadaf Ebrahimi 		if (use_vex)
3906*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, freg, 0));
3907*22dc650dSSadaf Ebrahimi 		else
3908*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, freg, 0));
3909*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_byte(compiler, 0));
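		/* Word 0 is now copied into the low four words, so dword 0 holds the
		   16-bit value twice; the PSHUFD below spreads dword 0 everywhere. */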
3910*22dc650dSSadaf Ebrahimi 		/* fallthrough */
3911*22dc650dSSadaf Ebrahimi 	default:
3912*22dc650dSSadaf Ebrahimi 		if (use_vex)
3913*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0));
3914*22dc650dSSadaf Ebrahimi 		else
3915*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
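		/* Immediate 0 selects dword 0 for all four positions, broadcasting it. */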
3916*22dc650dSSadaf Ebrahimi 		return emit_byte(compiler, 0);
3917*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3918*22dc650dSSadaf Ebrahimi 	case 3:
3919*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 1;
3920*22dc650dSSadaf Ebrahimi 		if (use_vex)
3921*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0));
3922*22dc650dSSadaf Ebrahimi 		else
3923*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
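		/* 0x44 = dword order 0,1,0,1: duplicates the low 64-bit element. */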
3924*22dc650dSSadaf Ebrahimi 		return emit_byte(compiler, 0x44);
3925*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3926*22dc650dSSadaf Ebrahimi 	}
3927*22dc650dSSadaf Ebrahimi }
3928*22dc650dSSadaf Ebrahimi 
3929*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type,
3930*22dc650dSSadaf Ebrahimi 	sljit_s32 freg, sljit_s32 lane_index,
3931*22dc650dSSadaf Ebrahimi 	sljit_s32 srcdst, sljit_sw srcdstw)
3932*22dc650dSSadaf Ebrahimi {
3933*22dc650dSSadaf Ebrahimi 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
3934*22dc650dSSadaf Ebrahimi 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
3935*22dc650dSSadaf Ebrahimi 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
3936*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
3937*22dc650dSSadaf Ebrahimi 	sljit_u8 opcode = 0;
3938*22dc650dSSadaf Ebrahimi 	sljit_uw op;
3939*22dc650dSSadaf Ebrahimi 	sljit_s32 freg_orig = freg;
3940*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3941*22dc650dSSadaf Ebrahimi 	sljit_s32 srcdst_is_ereg = 0;
3942*22dc650dSSadaf Ebrahimi 	sljit_s32 srcdst_orig = 0;
3943*22dc650dSSadaf Ebrahimi 	sljit_sw srcdstw_orig = 0;
3944*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
3945*22dc650dSSadaf Ebrahimi 
3946*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
3947*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw));
3948*22dc650dSSadaf Ebrahimi 
3949*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
3950*22dc650dSSadaf Ebrahimi 
3951*22dc650dSSadaf Ebrahimi 	if (reg_size == 5) {
3952*22dc650dSSadaf Ebrahimi 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
3953*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
3954*22dc650dSSadaf Ebrahimi 		use_vex = 1;
3955*22dc650dSSadaf Ebrahimi 	} else if (reg_size != 4)
3956*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
3957*22dc650dSSadaf Ebrahimi 
3958*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
3959*22dc650dSSadaf Ebrahimi 	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : elem_size > 2)
3960*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
3961*22dc650dSSadaf Ebrahimi #else /* SLJIT_CONFIG_X86_32 */
3962*22dc650dSSadaf Ebrahimi 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
3963*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
3964*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
3965*22dc650dSSadaf Ebrahimi 
3966*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_TEST)
3967*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
3968*22dc650dSSadaf Ebrahimi 
3969*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3970*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
3971*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
3972*22dc650dSSadaf Ebrahimi 	if (!(type & SLJIT_SIMD_FLOAT)) {
3973*22dc650dSSadaf Ebrahimi 		CHECK_EXTRA_REGS(srcdst, srcdstw, srcdst_is_ereg = 1);
3974*22dc650dSSadaf Ebrahimi 
3975*22dc650dSSadaf Ebrahimi 		if ((type & SLJIT_SIMD_STORE) && ((srcdst_is_ereg && elem_size < 2) || (elem_size == 0 && (type & SLJIT_SIMD_LANE_SIGNED) && FAST_IS_REG(srcdst) && reg_map[srcdst] >= 4))) {
3976*22dc650dSSadaf Ebrahimi 			srcdst_orig = srcdst;
3977*22dc650dSSadaf Ebrahimi 			srcdstw_orig = srcdstw;
3978*22dc650dSSadaf Ebrahimi 			srcdst = TMP_REG1;
3979*22dc650dSSadaf Ebrahimi 			srcdstw = 0;
3980*22dc650dSSadaf Ebrahimi 		}
3981*22dc650dSSadaf Ebrahimi 	}
3982*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3983*22dc650dSSadaf Ebrahimi 
3984*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_LANE_ZERO) {
3985*22dc650dSSadaf Ebrahimi 		if (lane_index == 0) {
3986*22dc650dSSadaf Ebrahimi 			if (!(type & SLJIT_SIMD_FLOAT)) {
3987*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
3988*22dc650dSSadaf Ebrahimi 				if (elem_size == 3) {
3989*22dc650dSSadaf Ebrahimi 					compiler->mode32 = 0;
3990*22dc650dSSadaf Ebrahimi 					elem_size = 2;
3991*22dc650dSSadaf Ebrahimi 				}
3992*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
3993*22dc650dSSadaf Ebrahimi 				if (srcdst == SLJIT_IMM) {
3994*22dc650dSSadaf Ebrahimi 					if (elem_size == 0)
3995*22dc650dSSadaf Ebrahimi 						srcdstw = (sljit_u8)srcdstw;
3996*22dc650dSSadaf Ebrahimi 					else if (elem_size == 1)
3997*22dc650dSSadaf Ebrahimi 						srcdstw = (sljit_u16)srcdstw;
3998*22dc650dSSadaf Ebrahimi 
3999*22dc650dSSadaf Ebrahimi 					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
4000*22dc650dSSadaf Ebrahimi 					srcdst = TMP_REG1;
4001*22dc650dSSadaf Ebrahimi 					srcdstw = 0;
4002*22dc650dSSadaf Ebrahimi 					elem_size = 2;
4003*22dc650dSSadaf Ebrahimi 				}
4004*22dc650dSSadaf Ebrahimi 
4005*22dc650dSSadaf Ebrahimi 				if (elem_size == 2) {
4006*22dc650dSSadaf Ebrahimi 					if (use_vex)
4007*22dc650dSSadaf Ebrahimi 						return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
4008*22dc650dSSadaf Ebrahimi 					return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw);
4009*22dc650dSSadaf Ebrahimi 				}
4010*22dc650dSSadaf Ebrahimi 			} else if (srcdst & SLJIT_MEM) {
4011*22dc650dSSadaf Ebrahimi 				SLJIT_ASSERT(elem_size == 2 || elem_size == 3);
4012*22dc650dSSadaf Ebrahimi 
4013*22dc650dSSadaf Ebrahimi 				if (use_vex)
4014*22dc650dSSadaf Ebrahimi 					return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw);
4015*22dc650dSSadaf Ebrahimi 				return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw);
4016*22dc650dSSadaf Ebrahimi 			} else if (elem_size == 3) {
4017*22dc650dSSadaf Ebrahimi 				if (use_vex)
4018*22dc650dSSadaf Ebrahimi 					return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0);
4019*22dc650dSSadaf Ebrahimi 				return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0);
4020*22dc650dSSadaf Ebrahimi 			} else if (use_vex) {
4021*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_vex_instruction(compiler, XORPD_x_xm | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, TMP_FREG, 0));
4022*22dc650dSSadaf Ebrahimi 				return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F3 | EX86_SSE2 | VEX_SSE2_OPV, freg, TMP_FREG, srcdst, 0);
4023*22dc650dSSadaf Ebrahimi 			}
4024*22dc650dSSadaf Ebrahimi 		}
4025*22dc650dSSadaf Ebrahimi 
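		/* With 256-bit registers a lane in the upper 128-bit half is built in
		   TMP_FREG and merged back later; (1 << (4 - elem_size)) is the number
		   of elements per 128-bit half. */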
4026*22dc650dSSadaf Ebrahimi 		if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
4027*22dc650dSSadaf Ebrahimi 			freg = TMP_FREG;
4028*22dc650dSSadaf Ebrahimi 			lane_index -= (1 << (4 - elem_size));
4029*22dc650dSSadaf Ebrahimi 		} else if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) {
4030*22dc650dSSadaf Ebrahimi 			if (use_vex)
4031*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, TMP_FREG, srcdst, srcdstw));
4032*22dc650dSSadaf Ebrahimi 			else
4033*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_sse2_load(compiler, elem_size == 2, TMP_FREG, srcdst, srcdstw));
4034*22dc650dSSadaf Ebrahimi 			srcdst = TMP_FREG;
4035*22dc650dSSadaf Ebrahimi 			srcdstw = 0;
4036*22dc650dSSadaf Ebrahimi 		}
4037*22dc650dSSadaf Ebrahimi 
4038*22dc650dSSadaf Ebrahimi 		op = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0)
4039*22dc650dSSadaf Ebrahimi 			| ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | EX86_SSE2;
4040*22dc650dSSadaf Ebrahimi 
4041*22dc650dSSadaf Ebrahimi 		if (use_vex)
4042*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, op | (reg_size == 5 ? VEX_256 : 0) | VEX_SSE2_OPV, freg, freg, freg, 0));
4043*22dc650dSSadaf Ebrahimi 		else
4044*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, op, freg, freg, 0));
4045*22dc650dSSadaf Ebrahimi 	} else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
4046*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
4047*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_byte(compiler, 1));
4048*22dc650dSSadaf Ebrahimi 
4049*22dc650dSSadaf Ebrahimi 		freg = TMP_FREG;
4050*22dc650dSSadaf Ebrahimi 		lane_index -= (1 << (4 - elem_size));
4051*22dc650dSSadaf Ebrahimi 	}
4052*22dc650dSSadaf Ebrahimi 
4053*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_FLOAT) {
4054*22dc650dSSadaf Ebrahimi 		if (elem_size == 3) {
4055*22dc650dSSadaf Ebrahimi 			if (srcdst & SLJIT_MEM) {
4056*22dc650dSSadaf Ebrahimi 				if (type & SLJIT_SIMD_STORE)
4057*22dc650dSSadaf Ebrahimi 					op = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x;
4058*22dc650dSSadaf Ebrahimi 				else
4059*22dc650dSSadaf Ebrahimi 					op = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m;
4060*22dc650dSSadaf Ebrahimi 
4061*22dc650dSSadaf Ebrahimi 				/* VEX prefix clears upper bits of the target register. */
4062*22dc650dSSadaf Ebrahimi 				if (use_vex && ((type & SLJIT_SIMD_STORE) || reg_size == 4 || freg == TMP_FREG))
4063*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2
4064*22dc650dSSadaf Ebrahimi 						| ((type & SLJIT_SIMD_STORE) ? 0 : VEX_SSE2_OPV), freg, (type & SLJIT_SIMD_STORE) ? 0 : freg, srcdst, srcdstw));
4065*22dc650dSSadaf Ebrahimi 				else
4066*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_groupf(compiler, op | EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw));
4067*22dc650dSSadaf Ebrahimi 
4068*22dc650dSSadaf Ebrahimi 				/* In case of store, freg is not TMP_FREG. */
4069*22dc650dSSadaf Ebrahimi 			} else if (type & SLJIT_SIMD_STORE) {
4070*22dc650dSSadaf Ebrahimi 				if (lane_index == 1) {
4071*22dc650dSSadaf Ebrahimi 					if (use_vex)
4072*22dc650dSSadaf Ebrahimi 						return emit_vex_instruction(compiler, MOVHLPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, freg, 0);
4073*22dc650dSSadaf Ebrahimi 					return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, freg, 0);
4074*22dc650dSSadaf Ebrahimi 				}
4075*22dc650dSSadaf Ebrahimi 				if (use_vex)
4076*22dc650dSSadaf Ebrahimi 					return emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, srcdst, srcdst, freg, 0);
4077*22dc650dSSadaf Ebrahimi 				return emit_sse2_load(compiler, 0, srcdst, freg, 0);
4078*22dc650dSSadaf Ebrahimi 			} else if (use_vex && (reg_size == 4 || freg == TMP_FREG)) {
4079*22dc650dSSadaf Ebrahimi 				if (lane_index == 1)
4080*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_vex_instruction(compiler, MOVLHPS_x_x | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, srcdst, 0));
4081*22dc650dSSadaf Ebrahimi 				else
4082*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_vex_instruction(compiler, MOVSD_x_xm | EX86_PREF_F2 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, srcdst, 0));
4083*22dc650dSSadaf Ebrahimi 			} else {
4084*22dc650dSSadaf Ebrahimi 				if (lane_index == 1)
4085*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, freg, srcdst, 0));
4086*22dc650dSSadaf Ebrahimi 				else
4087*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_sse2_load(compiler, 0, freg, srcdst, 0));
4088*22dc650dSSadaf Ebrahimi 			}
4089*22dc650dSSadaf Ebrahimi 		} else if (type & SLJIT_SIMD_STORE) {
4090*22dc650dSSadaf Ebrahimi 			if (lane_index == 0) {
4091*22dc650dSSadaf Ebrahimi 				if (use_vex)
4092*22dc650dSSadaf Ebrahimi 					return emit_vex_instruction(compiler, ((srcdst & SLJIT_MEM) ? MOVSD_xm_x : MOVSD_x_xm) | EX86_PREF_F3 | EX86_SSE2
4093*22dc650dSSadaf Ebrahimi 						| ((srcdst & SLJIT_MEM) ? 0 : VEX_SSE2_OPV), freg, ((srcdst & SLJIT_MEM) ? 0 : freg), srcdst, srcdstw);
4094*22dc650dSSadaf Ebrahimi 				return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg);
4095*22dc650dSSadaf Ebrahimi 			}
4096*22dc650dSSadaf Ebrahimi 
4097*22dc650dSSadaf Ebrahimi 			if (srcdst & SLJIT_MEM) {
4098*22dc650dSSadaf Ebrahimi 				if (use_vex)
4099*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_vex_instruction(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, srcdst, srcdstw));
4100*22dc650dSSadaf Ebrahimi 				else
4101*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
4102*22dc650dSSadaf Ebrahimi 				return emit_byte(compiler, U8(lane_index));
4103*22dc650dSSadaf Ebrahimi 			}
4104*22dc650dSSadaf Ebrahimi 
4105*22dc650dSSadaf Ebrahimi 			if (use_vex) {
4106*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0));
4107*22dc650dSSadaf Ebrahimi 				return emit_byte(compiler, U8(lane_index));
4108*22dc650dSSadaf Ebrahimi 			}
4109*22dc650dSSadaf Ebrahimi 
4110*22dc650dSSadaf Ebrahimi 			if (srcdst == freg)
4111*22dc650dSSadaf Ebrahimi 				op = SHUFPS_x_xm | EX86_SSE2;
4112*22dc650dSSadaf Ebrahimi 			else {
4113*22dc650dSSadaf Ebrahimi 				switch (lane_index) {
4114*22dc650dSSadaf Ebrahimi 				case 1:
4115*22dc650dSSadaf Ebrahimi 					op = MOVSHDUP_x_xm | EX86_PREF_F3 | EX86_SSE2;
4116*22dc650dSSadaf Ebrahimi 					break;
4117*22dc650dSSadaf Ebrahimi 				case 2:
4118*22dc650dSSadaf Ebrahimi 					op = MOVHLPS_x_x | EX86_SSE2;
4119*22dc650dSSadaf Ebrahimi 					break;
4120*22dc650dSSadaf Ebrahimi 				default:
4121*22dc650dSSadaf Ebrahimi 					SLJIT_ASSERT(lane_index == 3);
4122*22dc650dSSadaf Ebrahimi 					op = PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2;
4123*22dc650dSSadaf Ebrahimi 					break;
4124*22dc650dSSadaf Ebrahimi 				}
4125*22dc650dSSadaf Ebrahimi 			}
4126*22dc650dSSadaf Ebrahimi 
4127*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, op, srcdst, freg, 0));
4128*22dc650dSSadaf Ebrahimi 
4129*22dc650dSSadaf Ebrahimi 			op &= 0xff;
4130*22dc650dSSadaf Ebrahimi 			if (op == SHUFPS_x_xm || op == PSHUFD_x_xm)
4131*22dc650dSSadaf Ebrahimi 				return emit_byte(compiler, U8(lane_index));
4132*22dc650dSSadaf Ebrahimi 
4133*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
4134*22dc650dSSadaf Ebrahimi 		} else {
4135*22dc650dSSadaf Ebrahimi 			if (lane_index != 0 || (srcdst & SLJIT_MEM)) {
4136*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
4137*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_byte(compiler, U8(lane_index << 4)));
4138*22dc650dSSadaf Ebrahimi 			} else
4139*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst));
4140*22dc650dSSadaf Ebrahimi 		}
4141*22dc650dSSadaf Ebrahimi 
4142*22dc650dSSadaf Ebrahimi 		if (freg != TMP_FREG || (type & SLJIT_SIMD_STORE))
4143*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
4144*22dc650dSSadaf Ebrahimi 
4145*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(reg_size == 5);
4146*22dc650dSSadaf Ebrahimi 
4147*22dc650dSSadaf Ebrahimi 		if (type & SLJIT_SIMD_LANE_ZERO) {
4148*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
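			/* 0x4e = qword order 2,3,0,1: swaps the 128-bit halves, so the lane
			   assembled in the low half of TMP_FREG lands in the upper half of
			   freg_orig. */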
4149*22dc650dSSadaf Ebrahimi 			return emit_byte(compiler, 0x4e);
4150*22dc650dSSadaf Ebrahimi 		}
4151*22dc650dSSadaf Ebrahimi 
4152*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, VINSERTF128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
4153*22dc650dSSadaf Ebrahimi 		return emit_byte(compiler, 1);
4154*22dc650dSSadaf Ebrahimi 	}
4155*22dc650dSSadaf Ebrahimi 
4156*22dc650dSSadaf Ebrahimi 	if (srcdst == SLJIT_IMM) {
4157*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
4158*22dc650dSSadaf Ebrahimi 		srcdst = TMP_REG1;
4159*22dc650dSSadaf Ebrahimi 		srcdstw = 0;
4160*22dc650dSSadaf Ebrahimi 	}
4161*22dc650dSSadaf Ebrahimi 
4162*22dc650dSSadaf Ebrahimi 	op = 3;
4163*22dc650dSSadaf Ebrahimi 
4164*22dc650dSSadaf Ebrahimi 	switch (elem_size) {
4165*22dc650dSSadaf Ebrahimi 	case 0:
4166*22dc650dSSadaf Ebrahimi 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRB_rm_x_i8 : PINSRB_x_rm_i8;
4167*22dc650dSSadaf Ebrahimi 		break;
4168*22dc650dSSadaf Ebrahimi 	case 1:
4169*22dc650dSSadaf Ebrahimi 		if (!(type & SLJIT_SIMD_STORE)) {
4170*22dc650dSSadaf Ebrahimi 			op = 2;
4171*22dc650dSSadaf Ebrahimi 			opcode = PINSRW_x_rm_i8;
4172*22dc650dSSadaf Ebrahimi 		} else
4173*22dc650dSSadaf Ebrahimi 			opcode = PEXTRW_rm_x_i8;
4174*22dc650dSSadaf Ebrahimi 		break;
4175*22dc650dSSadaf Ebrahimi 	case 2:
4176*22dc650dSSadaf Ebrahimi 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
4177*22dc650dSSadaf Ebrahimi 		break;
4178*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4179*22dc650dSSadaf Ebrahimi 	case 3:
4180*22dc650dSSadaf Ebrahimi 		/* PINSRQ / PEXTRQ */
4181*22dc650dSSadaf Ebrahimi 		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
4182*22dc650dSSadaf Ebrahimi 		compiler->mode32 = 0;
4183*22dc650dSSadaf Ebrahimi 		break;
4184*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4185*22dc650dSSadaf Ebrahimi 	}
4186*22dc650dSSadaf Ebrahimi 
4187*22dc650dSSadaf Ebrahimi 	if (use_vex && (type & SLJIT_SIMD_STORE)) {
4188*22dc650dSSadaf Ebrahimi 		op = opcode | ((op == 3) ? VEX_OP_0F3A : 0);
4189*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, op | EX86_PREF_66 | VEX_AUTO_W | EX86_SSE2_OP1 | VEX_SSE2_OPV, freg, 0, srcdst, srcdstw));
4190*22dc650dSSadaf Ebrahimi 	} else {
4191*22dc650dSSadaf Ebrahimi 		inst = emit_x86_instruction(compiler, op | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
4192*22dc650dSSadaf Ebrahimi 		FAIL_IF(!inst);
4193*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_0F;
4194*22dc650dSSadaf Ebrahimi 
4195*22dc650dSSadaf Ebrahimi 		if (op == 3) {
4196*22dc650dSSadaf Ebrahimi 			inst[1] = 0x3a;
4197*22dc650dSSadaf Ebrahimi 			inst[2] = opcode;
4198*22dc650dSSadaf Ebrahimi 		} else
4199*22dc650dSSadaf Ebrahimi 			inst[1] = opcode;
4200*22dc650dSSadaf Ebrahimi 	}
4201*22dc650dSSadaf Ebrahimi 
4202*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_byte(compiler, U8(lane_index)));
4203*22dc650dSSadaf Ebrahimi 
4204*22dc650dSSadaf Ebrahimi 	if (!(type & SLJIT_SIMD_LANE_SIGNED) || (srcdst & SLJIT_MEM)) {
4205*22dc650dSSadaf Ebrahimi 		if (freg == TMP_FREG && !(type & SLJIT_SIMD_STORE)) {
4206*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(reg_size == 5);
4207*22dc650dSSadaf Ebrahimi 
4208*22dc650dSSadaf Ebrahimi 			if (type & SLJIT_SIMD_LANE_ZERO) {
4209*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
4210*22dc650dSSadaf Ebrahimi 				return emit_byte(compiler, 0x4e);
4211*22dc650dSSadaf Ebrahimi 			}
4212*22dc650dSSadaf Ebrahimi 
4213*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, VINSERTI128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
4214*22dc650dSSadaf Ebrahimi 			return emit_byte(compiler, 1);
4215*22dc650dSSadaf Ebrahimi 		}
4216*22dc650dSSadaf Ebrahimi 
4217*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4218*22dc650dSSadaf Ebrahimi 		if (srcdst_orig & SLJIT_MEM)
4219*22dc650dSSadaf Ebrahimi 			return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
4220*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
4221*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4222*22dc650dSSadaf Ebrahimi 	}
4223*22dc650dSSadaf Ebrahimi 
4224*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4225*22dc650dSSadaf Ebrahimi 	if (elem_size >= 3)
4226*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4227*22dc650dSSadaf Ebrahimi 
4228*22dc650dSSadaf Ebrahimi 	compiler->mode32 = (type & SLJIT_32);
4229*22dc650dSSadaf Ebrahimi 
4230*22dc650dSSadaf Ebrahimi 	op = 2;
4231*22dc650dSSadaf Ebrahimi 
4232*22dc650dSSadaf Ebrahimi 	if (elem_size == 0)
4233*22dc650dSSadaf Ebrahimi 		op |= EX86_REX;
4234*22dc650dSSadaf Ebrahimi 
4235*22dc650dSSadaf Ebrahimi 	if (elem_size == 2) {
4236*22dc650dSSadaf Ebrahimi 		if (type & SLJIT_32)
4237*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
4238*22dc650dSSadaf Ebrahimi 
4239*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(!(compiler->mode32));
4240*22dc650dSSadaf Ebrahimi 		op = 1;
4241*22dc650dSSadaf Ebrahimi 	}
4242*22dc650dSSadaf Ebrahimi 
4243*22dc650dSSadaf Ebrahimi 	inst = emit_x86_instruction(compiler, op, srcdst, 0, srcdst, 0);
4244*22dc650dSSadaf Ebrahimi 	FAIL_IF(!inst);
4245*22dc650dSSadaf Ebrahimi 
4246*22dc650dSSadaf Ebrahimi 	if (op != 1) {
4247*22dc650dSSadaf Ebrahimi 		inst[0] = GROUP_0F;
4248*22dc650dSSadaf Ebrahimi 		inst[1] = U8((elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16);
4249*22dc650dSSadaf Ebrahimi 	} else
4250*22dc650dSSadaf Ebrahimi 		inst[0] = MOVSXD_r_rm;
4251*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_64 */
4252*22dc650dSSadaf Ebrahimi 	if (elem_size >= 2)
4253*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4254*22dc650dSSadaf Ebrahimi 
4255*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16,
4256*22dc650dSSadaf Ebrahimi 		(srcdst_orig != 0 && FAST_IS_REG(srcdst_orig)) ? srcdst_orig : srcdst, srcdst, 0));
4257*22dc650dSSadaf Ebrahimi 
4258*22dc650dSSadaf Ebrahimi 	if (srcdst_orig & SLJIT_MEM)
4259*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
4260*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4261*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
4262*22dc650dSSadaf Ebrahimi }
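
/* Illustrative sketch only (not part of this emitter): reading lane 3 of a
   128-bit vector of 8-bit elements into a general register. The type macro
   names SLJIT_SIMD_REG_128 and SLJIT_SIMD_ELEM_8 are assumed from sljitLir.h:

	sljit_s32 type = SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8;
	sljit_emit_simd_lane_mov(compiler, type, SLJIT_FR0, 3, SLJIT_R0, 0);

   With SLJIT_SIMD_STORE the lane is copied out of SLJIT_FR0 (the PEXTRB path
   above); without it, the value in SLJIT_R0 is inserted into lane 3 instead. */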
4263*22dc650dSSadaf Ebrahimi 
4264*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type,
4265*22dc650dSSadaf Ebrahimi 	sljit_s32 freg,
4266*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_s32 src_lane_index)
4267*22dc650dSSadaf Ebrahimi {
4268*22dc650dSSadaf Ebrahimi 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4269*22dc650dSSadaf Ebrahimi 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4270*22dc650dSSadaf Ebrahimi 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
4271*22dc650dSSadaf Ebrahimi 	sljit_uw pref;
4272*22dc650dSSadaf Ebrahimi 	sljit_u8 byte;
4273*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4274*22dc650dSSadaf Ebrahimi 	sljit_s32 opcode3 = TMP_REG1;
4275*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86_32 */
4276*22dc650dSSadaf Ebrahimi 	sljit_s32 opcode3 = SLJIT_S0;
4277*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
4278*22dc650dSSadaf Ebrahimi 
4279*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
4280*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index));
4281*22dc650dSSadaf Ebrahimi 
4282*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4283*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
4284*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4285*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(reg_map[opcode3] == 3);
4286*22dc650dSSadaf Ebrahimi 
4287*22dc650dSSadaf Ebrahimi 	if (reg_size == 5) {
4288*22dc650dSSadaf Ebrahimi 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4289*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4290*22dc650dSSadaf Ebrahimi 		use_vex = 1;
4291*22dc650dSSadaf Ebrahimi 	} else if (reg_size != 4)
4292*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
4293*22dc650dSSadaf Ebrahimi 
4294*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_FLOAT) {
4295*22dc650dSSadaf Ebrahimi 		pref = 0;
4296*22dc650dSSadaf Ebrahimi 		byte = U8(src_lane_index);
4297*22dc650dSSadaf Ebrahimi 
4298*22dc650dSSadaf Ebrahimi 		if (elem_size == 3) {
4299*22dc650dSSadaf Ebrahimi 			if (type & SLJIT_SIMD_TEST)
4300*22dc650dSSadaf Ebrahimi 				return SLJIT_SUCCESS;
4301*22dc650dSSadaf Ebrahimi 
4302*22dc650dSSadaf Ebrahimi 			if (reg_size == 5) {
4303*22dc650dSSadaf Ebrahimi 				if (src_lane_index == 0)
4304*22dc650dSSadaf Ebrahimi 					return emit_vex_instruction(compiler, VBROADCASTSD_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4305*22dc650dSSadaf Ebrahimi 
4306*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4307*22dc650dSSadaf Ebrahimi 
4308*22dc650dSSadaf Ebrahimi 				byte = U8(byte | (byte << 2));
4309*22dc650dSSadaf Ebrahimi 				return emit_byte(compiler, U8(byte | (byte << 4)));
4310*22dc650dSSadaf Ebrahimi 			}
4311*22dc650dSSadaf Ebrahimi 
4312*22dc650dSSadaf Ebrahimi 			if (src_lane_index == 0) {
4313*22dc650dSSadaf Ebrahimi 				if (use_vex)
4314*22dc650dSSadaf Ebrahimi 					return emit_vex_instruction(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, 0, src, 0);
4315*22dc650dSSadaf Ebrahimi 				return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, src, 0);
4316*22dc650dSSadaf Ebrahimi 			}
4317*22dc650dSSadaf Ebrahimi 
4318*22dc650dSSadaf Ebrahimi 			/* The 66 prefix changes SHUFPS_x_xm into SHUFPD_x_xm. */
4319*22dc650dSSadaf Ebrahimi 			pref = EX86_PREF_66;
4320*22dc650dSSadaf Ebrahimi 		} else if (elem_size != 2)
4321*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4322*22dc650dSSadaf Ebrahimi 		else if (type & SLJIT_SIMD_TEST)
4323*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
4324*22dc650dSSadaf Ebrahimi 
4325*22dc650dSSadaf Ebrahimi 		if (reg_size == 5) {
4326*22dc650dSSadaf Ebrahimi 			SLJIT_ASSERT(elem_size == 2);
4327*22dc650dSSadaf Ebrahimi 
4328*22dc650dSSadaf Ebrahimi 			if (src_lane_index == 0)
4329*22dc650dSSadaf Ebrahimi 				return emit_vex_instruction(compiler, VBROADCASTSS_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4330*22dc650dSSadaf Ebrahimi 
4331*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4332*22dc650dSSadaf Ebrahimi 
4333*22dc650dSSadaf Ebrahimi 			byte = 0x44;
4334*22dc650dSSadaf Ebrahimi 			if (src_lane_index >= 4) {
4335*22dc650dSSadaf Ebrahimi 				byte = 0xee;
4336*22dc650dSSadaf Ebrahimi 				src_lane_index -= 4;
4337*22dc650dSSadaf Ebrahimi 			}
4338*22dc650dSSadaf Ebrahimi 
4339*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, byte));
4340*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | VEX_256 | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0));
4341*22dc650dSSadaf Ebrahimi 			byte = U8(src_lane_index);
4342*22dc650dSSadaf Ebrahimi 		} else if (use_vex) {
4343*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
4344*22dc650dSSadaf Ebrahimi 		} else {
4345*22dc650dSSadaf Ebrahimi 			if (freg != src)
4346*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, freg, src, 0));
4347*22dc650dSSadaf Ebrahimi 
4348*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, freg, freg, 0));
4349*22dc650dSSadaf Ebrahimi 		}
4350*22dc650dSSadaf Ebrahimi 
4351*22dc650dSSadaf Ebrahimi 		if (elem_size == 2) {
4352*22dc650dSSadaf Ebrahimi 			byte = U8(byte | (byte << 2));
4353*22dc650dSSadaf Ebrahimi 			byte = U8(byte | (byte << 4));
4354*22dc650dSSadaf Ebrahimi 		} else
4355*22dc650dSSadaf Ebrahimi 			byte = U8(byte | (byte << 1));
4356*22dc650dSSadaf Ebrahimi 
4357*22dc650dSSadaf Ebrahimi 		return emit_byte(compiler, U8(byte));
4358*22dc650dSSadaf Ebrahimi 	}
4359*22dc650dSSadaf Ebrahimi 
4360*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_TEST)
4361*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4362*22dc650dSSadaf Ebrahimi 
4363*22dc650dSSadaf Ebrahimi 	if (elem_size == 0) {
4364*22dc650dSSadaf Ebrahimi 		if (reg_size == 5 && src_lane_index >= 16) {
4365*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4366*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, src_lane_index >= 24 ? 0xff : 0xaa));
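			/* 0xaa/0xff select qword 2/3 (the half that holds the lane) for
			   every position; the lane is then found at (src_lane_index & 0x7)
			   in the low qword. */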
4367*22dc650dSSadaf Ebrahimi 			src_lane_index &= 0x7;
4368*22dc650dSSadaf Ebrahimi 			src = freg;
4369*22dc650dSSadaf Ebrahimi 		}
4370*22dc650dSSadaf Ebrahimi 
4371*22dc650dSSadaf Ebrahimi 		if (src_lane_index != 0 || (freg != src && (!(cpu_feature_list & CPU_FEATURE_AVX2) || !use_vex))) {
4372*22dc650dSSadaf Ebrahimi 			pref = 0;
4373*22dc650dSSadaf Ebrahimi 
4374*22dc650dSSadaf Ebrahimi 			if ((src_lane_index & 0x3) == 0) {
4375*22dc650dSSadaf Ebrahimi 				pref = EX86_PREF_66;
4376*22dc650dSSadaf Ebrahimi 				byte = U8(src_lane_index >> 2);
4377*22dc650dSSadaf Ebrahimi 			} else if (src_lane_index < 8 && (src_lane_index & 0x1) == 0) {
4378*22dc650dSSadaf Ebrahimi 				pref = EX86_PREF_F2;
4379*22dc650dSSadaf Ebrahimi 				byte = U8(src_lane_index >> 1);
4380*22dc650dSSadaf Ebrahimi 			} else {
4381*22dc650dSSadaf Ebrahimi 				if (!use_vex) {
4382*22dc650dSSadaf Ebrahimi 					if (freg != src)
4383*22dc650dSSadaf Ebrahimi 						FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
4384*22dc650dSSadaf Ebrahimi 
4385*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0));
4386*22dc650dSSadaf Ebrahimi 				} else
4387*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0));
4388*22dc650dSSadaf Ebrahimi 
4389*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_byte(compiler, U8(src_lane_index)));
4390*22dc650dSSadaf Ebrahimi 			}
4391*22dc650dSSadaf Ebrahimi 
4392*22dc650dSSadaf Ebrahimi 			if (pref != 0) {
4393*22dc650dSSadaf Ebrahimi 				if (use_vex)
4394*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, 0, src, 0));
4395*22dc650dSSadaf Ebrahimi 				else
4396*22dc650dSSadaf Ebrahimi 					FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
4397*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_byte(compiler, byte));
4398*22dc650dSSadaf Ebrahimi 			}
4399*22dc650dSSadaf Ebrahimi 
4400*22dc650dSSadaf Ebrahimi 			src = freg;
4401*22dc650dSSadaf Ebrahimi 		}
4402*22dc650dSSadaf Ebrahimi 
4403*22dc650dSSadaf Ebrahimi 		if (use_vex && (cpu_feature_list & CPU_FEATURE_AVX2))
4404*22dc650dSSadaf Ebrahimi 			return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
4405*22dc650dSSadaf Ebrahimi 
4406*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(reg_size == 4);
4407*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
4408*22dc650dSSadaf Ebrahimi 		return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
4409*22dc650dSSadaf Ebrahimi 	}
4410*22dc650dSSadaf Ebrahimi 
4411*22dc650dSSadaf Ebrahimi 	if ((cpu_feature_list & CPU_FEATURE_AVX2) && use_vex && src_lane_index == 0 && elem_size <= 3) {
4412*22dc650dSSadaf Ebrahimi 		switch (elem_size) {
4413*22dc650dSSadaf Ebrahimi 		case 1:
4414*22dc650dSSadaf Ebrahimi 			pref = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4415*22dc650dSSadaf Ebrahimi 			break;
4416*22dc650dSSadaf Ebrahimi 		case 2:
4417*22dc650dSSadaf Ebrahimi 			pref = VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4418*22dc650dSSadaf Ebrahimi 			break;
4419*22dc650dSSadaf Ebrahimi 		default:
4420*22dc650dSSadaf Ebrahimi 			pref = VPBROADCASTQ_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
4421*22dc650dSSadaf Ebrahimi 			break;
4422*22dc650dSSadaf Ebrahimi 		}
4423*22dc650dSSadaf Ebrahimi 
4424*22dc650dSSadaf Ebrahimi 		if (reg_size == 5)
4425*22dc650dSSadaf Ebrahimi 			pref |= VEX_256;
4426*22dc650dSSadaf Ebrahimi 
4427*22dc650dSSadaf Ebrahimi 		return emit_vex_instruction(compiler, pref, freg, 0, src, 0);
4428*22dc650dSSadaf Ebrahimi 	}
4429*22dc650dSSadaf Ebrahimi 
4430*22dc650dSSadaf Ebrahimi 	if (reg_size == 5) {
4431*22dc650dSSadaf Ebrahimi 		switch (elem_size) {
4432*22dc650dSSadaf Ebrahimi 		case 1:
4433*22dc650dSSadaf Ebrahimi 			byte = U8(src_lane_index & 0x3);
4434*22dc650dSSadaf Ebrahimi 			src_lane_index >>= 2;
4435*22dc650dSSadaf Ebrahimi 			pref = PSHUFLW_x_xm | VEX_256 | ((src_lane_index & 1) == 0 ? EX86_PREF_F2 : EX86_PREF_F3) | EX86_SSE2;
4436*22dc650dSSadaf Ebrahimi 			break;
4437*22dc650dSSadaf Ebrahimi 		case 2:
4438*22dc650dSSadaf Ebrahimi 			byte = U8(src_lane_index & 0x3);
4439*22dc650dSSadaf Ebrahimi 			src_lane_index >>= 1;
4440*22dc650dSSadaf Ebrahimi 			pref = PSHUFD_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2;
4441*22dc650dSSadaf Ebrahimi 			break;
4442*22dc650dSSadaf Ebrahimi 		case 3:
4443*22dc650dSSadaf Ebrahimi 			pref = 0;
4444*22dc650dSSadaf Ebrahimi 			break;
4445*22dc650dSSadaf Ebrahimi 		default:
4446*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
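			/* 0x44 (qwords 0,1,0,1) replicates the low 128 bits; 0xee
			   (qwords 2,3,2,3) replicates the high 128 bits. */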
4447*22dc650dSSadaf Ebrahimi 			return emit_byte(compiler, U8(src_lane_index == 0 ? 0x44 : 0xee));
4448*22dc650dSSadaf Ebrahimi 		}
4449*22dc650dSSadaf Ebrahimi 
4450*22dc650dSSadaf Ebrahimi 		if (pref != 0) {
4451*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, pref, freg, 0, src, 0));
4452*22dc650dSSadaf Ebrahimi 			byte = U8(byte | (byte << 2));
4453*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
4454*22dc650dSSadaf Ebrahimi 
4455*22dc650dSSadaf Ebrahimi 			if (src_lane_index == 0)
4456*22dc650dSSadaf Ebrahimi 				return emit_vex_instruction(compiler, VPBROADCASTQ_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
4457*22dc650dSSadaf Ebrahimi 
4458*22dc650dSSadaf Ebrahimi 			src = freg;
4459*22dc650dSSadaf Ebrahimi 		}
4460*22dc650dSSadaf Ebrahimi 
4461*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
4462*22dc650dSSadaf Ebrahimi 		byte = U8(src_lane_index);
4463*22dc650dSSadaf Ebrahimi 		byte = U8(byte | (byte << 2));
4464*22dc650dSSadaf Ebrahimi 		return emit_byte(compiler, U8(byte | (byte << 4)));
4465*22dc650dSSadaf Ebrahimi 	}
4466*22dc650dSSadaf Ebrahimi 
4467*22dc650dSSadaf Ebrahimi 	switch (elem_size) {
4468*22dc650dSSadaf Ebrahimi 	case 1:
4469*22dc650dSSadaf Ebrahimi 		byte = U8(src_lane_index & 0x3);
4470*22dc650dSSadaf Ebrahimi 		src_lane_index >>= 1;
4471*22dc650dSSadaf Ebrahimi 		pref = (src_lane_index & 2) == 0 ? EX86_PREF_F2 : EX86_PREF_F3;
4472*22dc650dSSadaf Ebrahimi 
4473*22dc650dSSadaf Ebrahimi 		if (use_vex)
4474*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, 0, src, 0));
4475*22dc650dSSadaf Ebrahimi 		else
4476*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
4477*22dc650dSSadaf Ebrahimi 		byte = U8(byte | (byte << 2));
4478*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
4479*22dc650dSSadaf Ebrahimi 
4480*22dc650dSSadaf Ebrahimi 		if ((cpu_feature_list & CPU_FEATURE_AVX2) && use_vex && pref == EX86_PREF_F2)
4481*22dc650dSSadaf Ebrahimi 			return emit_vex_instruction(compiler, VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
4482*22dc650dSSadaf Ebrahimi 
4483*22dc650dSSadaf Ebrahimi 		src = freg;
4484*22dc650dSSadaf Ebrahimi 		/* fallthrough */
4485*22dc650dSSadaf Ebrahimi 	case 2:
4486*22dc650dSSadaf Ebrahimi 		byte = U8(src_lane_index);
4487*22dc650dSSadaf Ebrahimi 		byte = U8(byte | (byte << 2));
4488*22dc650dSSadaf Ebrahimi 		break;
4489*22dc650dSSadaf Ebrahimi 	default:
4490*22dc650dSSadaf Ebrahimi 		byte = U8(src_lane_index << 1);
4491*22dc650dSSadaf Ebrahimi 		byte = U8(byte | (byte << 2) | 0x4);
4492*22dc650dSSadaf Ebrahimi 		break;
4493*22dc650dSSadaf Ebrahimi 	}
4494*22dc650dSSadaf Ebrahimi 
4495*22dc650dSSadaf Ebrahimi 	if (use_vex)
4496*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, 0, src, 0));
4497*22dc650dSSadaf Ebrahimi 	else
4498*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
4499*22dc650dSSadaf Ebrahimi 	return emit_byte(compiler, U8(byte | (byte << 4)));
4500*22dc650dSSadaf Ebrahimi }
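
/* Illustrative sketch only: broadcasting lane 1 of a 128-bit register of
   32-bit integer elements into every lane of the same register. The macro
   names SLJIT_SIMD_REG_128 and SLJIT_SIMD_ELEM_32 are assumed from sljitLir.h:

	sljit_emit_simd_lane_replicate(compiler,
		SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32, SLJIT_FR0, SLJIT_FR0, 1);

   On this backend the call above maps to PSHUFD (or VPSHUFD with VEX) with
   immediate 0x55, which selects dword 1 for all four positions. */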
4501*22dc650dSSadaf Ebrahimi 
4502*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
4503*22dc650dSSadaf Ebrahimi 	sljit_s32 freg,
4504*22dc650dSSadaf Ebrahimi 	sljit_s32 src, sljit_sw srcw)
4505*22dc650dSSadaf Ebrahimi {
4506*22dc650dSSadaf Ebrahimi 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4507*22dc650dSSadaf Ebrahimi 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4508*22dc650dSSadaf Ebrahimi 	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
4509*22dc650dSSadaf Ebrahimi 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
4510*22dc650dSSadaf Ebrahimi 	sljit_u8 opcode;
4511*22dc650dSSadaf Ebrahimi 
4512*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
4513*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
4514*22dc650dSSadaf Ebrahimi 
4515*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(src, srcw);
4516*22dc650dSSadaf Ebrahimi 
4517*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4518*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
4519*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4520*22dc650dSSadaf Ebrahimi 
4521*22dc650dSSadaf Ebrahimi 	if (reg_size == 5) {
4522*22dc650dSSadaf Ebrahimi 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4523*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4524*22dc650dSSadaf Ebrahimi 		use_vex = 1;
4525*22dc650dSSadaf Ebrahimi 	} else if (reg_size != 4)
4526*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
4527*22dc650dSSadaf Ebrahimi 
4528*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_FLOAT) {
4529*22dc650dSSadaf Ebrahimi 		if (elem_size != 2 || elem2_size != 3)
4530*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4531*22dc650dSSadaf Ebrahimi 
4532*22dc650dSSadaf Ebrahimi 		if (type & SLJIT_SIMD_TEST)
4533*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
4534*22dc650dSSadaf Ebrahimi 
4535*22dc650dSSadaf Ebrahimi 		if (use_vex)
4536*22dc650dSSadaf Ebrahimi 			return emit_vex_instruction(compiler, CVTPS2PD_x_xm | ((reg_size == 5) ? VEX_256 : 0) | EX86_SSE2, freg, 0, src, srcw);
4537*22dc650dSSadaf Ebrahimi 		return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, freg, src, srcw);
4538*22dc650dSSadaf Ebrahimi 	}
4539*22dc650dSSadaf Ebrahimi 
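	/* The (elem_size, elem2_size) pair selects the PMOVSX/PMOVZX form below;
	   e.g. unsigned 8-bit -> 32-bit elements use PMOVZXBD, which widens the
	   low four bytes of src into four zero-extended dwords (eight bytes for
	   the 256-bit VEX form). */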
4540*22dc650dSSadaf Ebrahimi 	switch (elem_size) {
4541*22dc650dSSadaf Ebrahimi 	case 0:
4542*22dc650dSSadaf Ebrahimi 		if (elem2_size == 1)
4543*22dc650dSSadaf Ebrahimi 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBW_x_xm : PMOVZXBW_x_xm;
4544*22dc650dSSadaf Ebrahimi 		else if (elem2_size == 2)
4545*22dc650dSSadaf Ebrahimi 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBD_x_xm : PMOVZXBD_x_xm;
4546*22dc650dSSadaf Ebrahimi 		else if (elem2_size == 3)
4547*22dc650dSSadaf Ebrahimi 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBQ_x_xm : PMOVZXBQ_x_xm;
4548*22dc650dSSadaf Ebrahimi 		else
4549*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4550*22dc650dSSadaf Ebrahimi 		break;
4551*22dc650dSSadaf Ebrahimi 	case 1:
4552*22dc650dSSadaf Ebrahimi 		if (elem2_size == 2)
4553*22dc650dSSadaf Ebrahimi 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWD_x_xm : PMOVZXWD_x_xm;
4554*22dc650dSSadaf Ebrahimi 		else if (elem2_size == 3)
4555*22dc650dSSadaf Ebrahimi 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWQ_x_xm : PMOVZXWQ_x_xm;
4556*22dc650dSSadaf Ebrahimi 		else
4557*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4558*22dc650dSSadaf Ebrahimi 		break;
4559*22dc650dSSadaf Ebrahimi 	case 2:
4560*22dc650dSSadaf Ebrahimi 		if (elem2_size == 3)
4561*22dc650dSSadaf Ebrahimi 			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXDQ_x_xm : PMOVZXDQ_x_xm;
4562*22dc650dSSadaf Ebrahimi 		else
4563*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4564*22dc650dSSadaf Ebrahimi 		break;
4565*22dc650dSSadaf Ebrahimi 	default:
4566*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
4567*22dc650dSSadaf Ebrahimi 	}
4568*22dc650dSSadaf Ebrahimi 
4569*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_TEST)
4570*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4571*22dc650dSSadaf Ebrahimi 
4572*22dc650dSSadaf Ebrahimi 	if (use_vex)
4573*22dc650dSSadaf Ebrahimi 		return emit_vex_instruction(compiler, opcode | ((reg_size == 5) ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw);
4574*22dc650dSSadaf Ebrahimi 	return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw);
4575*22dc650dSSadaf Ebrahimi }
4576*22dc650dSSadaf Ebrahimi 
4577*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
4578*22dc650dSSadaf Ebrahimi 	sljit_s32 freg,
4579*22dc650dSSadaf Ebrahimi 	sljit_s32 dst, sljit_sw dstw)
4580*22dc650dSSadaf Ebrahimi {
4581*22dc650dSSadaf Ebrahimi 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4582*22dc650dSSadaf Ebrahimi 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4583*22dc650dSSadaf Ebrahimi 	sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX);
4584*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_r;
4585*22dc650dSSadaf Ebrahimi 	sljit_uw op;
4586*22dc650dSSadaf Ebrahimi 	sljit_u8 *inst;
4587*22dc650dSSadaf Ebrahimi 
4588*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
4589*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
4590*22dc650dSSadaf Ebrahimi 
4591*22dc650dSSadaf Ebrahimi 	ADJUST_LOCAL_OFFSET(dst, dstw);
4592*22dc650dSSadaf Ebrahimi 
4593*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
4594*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4595*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
4596*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4597*22dc650dSSadaf Ebrahimi 
4598*22dc650dSSadaf Ebrahimi 	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
4599*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
4600*22dc650dSSadaf Ebrahimi 
4601*22dc650dSSadaf Ebrahimi 	if (reg_size == 4) {
4602*22dc650dSSadaf Ebrahimi 		if (type & SLJIT_SIMD_TEST)
4603*22dc650dSSadaf Ebrahimi 			return SLJIT_SUCCESS;
4604*22dc650dSSadaf Ebrahimi 
4605*22dc650dSSadaf Ebrahimi 		op = EX86_PREF_66 | EX86_SSE2_OP2;
4606*22dc650dSSadaf Ebrahimi 
4607*22dc650dSSadaf Ebrahimi 		switch (elem_size) {
4608*22dc650dSSadaf Ebrahimi 		case 1:
4609*22dc650dSSadaf Ebrahimi 			if (use_vex)
4610*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, freg, 0));
4611*22dc650dSSadaf Ebrahimi 			else
4612*22dc650dSSadaf Ebrahimi 				FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0));
4613*22dc650dSSadaf Ebrahimi 			freg = TMP_FREG;
4614*22dc650dSSadaf Ebrahimi 			break;
4615*22dc650dSSadaf Ebrahimi 		case 2:
4616*22dc650dSSadaf Ebrahimi 			op = EX86_SSE2_OP2;
4617*22dc650dSSadaf Ebrahimi 			break;
4618*22dc650dSSadaf Ebrahimi 		}
4619*22dc650dSSadaf Ebrahimi 
4620*22dc650dSSadaf Ebrahimi 		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
4621*22dc650dSSadaf Ebrahimi 		op |= (elem_size < 2) ? PMOVMSKB_r_x : MOVMSKPS_r_x;
4622*22dc650dSSadaf Ebrahimi 
4623*22dc650dSSadaf Ebrahimi 		if (use_vex)
4624*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, freg, 0));
4625*22dc650dSSadaf Ebrahimi 		else
4626*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_groupf(compiler, op, dst_r, freg, 0));
4627*22dc650dSSadaf Ebrahimi 
4628*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4629*22dc650dSSadaf Ebrahimi 		compiler->mode32 = type & SLJIT_32;
4630*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4631*22dc650dSSadaf Ebrahimi 
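		/* For 16-bit elements PACKSSWB left the eight sign bytes in the upper
		   half of TMP_FREG (the lower half may be junk in the non-VEX case),
		   so drop the low eight bits of the byte mask. */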
4632*22dc650dSSadaf Ebrahimi 		if (elem_size == 1) {
4633*22dc650dSSadaf Ebrahimi 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 8, dst_r, 0);
4634*22dc650dSSadaf Ebrahimi 			FAIL_IF(!inst);
4635*22dc650dSSadaf Ebrahimi 			inst[1] |= SHR;
4636*22dc650dSSadaf Ebrahimi 		}
4637*22dc650dSSadaf Ebrahimi 
4638*22dc650dSSadaf Ebrahimi 		if (dst_r == TMP_REG1)
4639*22dc650dSSadaf Ebrahimi 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
4640*22dc650dSSadaf Ebrahimi 
4641*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4642*22dc650dSSadaf Ebrahimi 	}
4643*22dc650dSSadaf Ebrahimi 
4644*22dc650dSSadaf Ebrahimi 	if (reg_size != 5 || !(cpu_feature_list & CPU_FEATURE_AVX2))
4645*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
4646*22dc650dSSadaf Ebrahimi 
4647*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_TEST)
4648*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4649*22dc650dSSadaf Ebrahimi 
4650*22dc650dSSadaf Ebrahimi 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
4651*22dc650dSSadaf Ebrahimi 
4652*22dc650dSSadaf Ebrahimi 	if (elem_size == 1) {
4653*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
4654*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_byte(compiler, 1));
4655*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0));
4656*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x | EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0));
4657*22dc650dSSadaf Ebrahimi 	} else {
4658*22dc650dSSadaf Ebrahimi 		op = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2;
4659*22dc650dSSadaf Ebrahimi 
4660*22dc650dSSadaf Ebrahimi 		if (elem_size == 0)
4661*22dc650dSSadaf Ebrahimi 			op = PMOVMSKB_r_x | VEX_256 | EX86_PREF_66 | EX86_SSE2_OP2;
4662*22dc650dSSadaf Ebrahimi 		else if (elem_size == 3)
4663*22dc650dSSadaf Ebrahimi 			op |= EX86_PREF_66;
4664*22dc650dSSadaf Ebrahimi 
4665*22dc650dSSadaf Ebrahimi 		FAIL_IF(emit_vex_instruction(compiler, op, dst_r, 0, freg, 0));
4666*22dc650dSSadaf Ebrahimi 	}
4667*22dc650dSSadaf Ebrahimi 
4668*22dc650dSSadaf Ebrahimi 	if (dst_r == TMP_REG1) {
4669*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4670*22dc650dSSadaf Ebrahimi 		compiler->mode32 = type & SLJIT_32;
4671*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4672*22dc650dSSadaf Ebrahimi 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
4673*22dc650dSSadaf Ebrahimi 	}
4674*22dc650dSSadaf Ebrahimi 
4675*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
4676*22dc650dSSadaf Ebrahimi }
4677*22dc650dSSadaf Ebrahimi 
4678*22dc650dSSadaf Ebrahimi static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
4679*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_freg, sljit_s32 src_freg)
4680*22dc650dSSadaf Ebrahimi {
4681*22dc650dSSadaf Ebrahimi 	sljit_uw op = ((type & SLJIT_SIMD_FLOAT) ? MOVAPS_x_xm : MOVDQA_x_xm) | EX86_SSE2;
4682*22dc650dSSadaf Ebrahimi 
4683*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(SLJIT_SIMD_GET_REG_SIZE(type) == 4);
4684*22dc650dSSadaf Ebrahimi 
4685*22dc650dSSadaf Ebrahimi 	if (!(type & SLJIT_SIMD_FLOAT) || SLJIT_SIMD_GET_ELEM_SIZE(type) == 3)
4686*22dc650dSSadaf Ebrahimi 		op |= EX86_PREF_66;
4687*22dc650dSSadaf Ebrahimi 
4688*22dc650dSSadaf Ebrahimi 	return emit_groupf(compiler, op, dst_freg, src_freg, 0);
4689*22dc650dSSadaf Ebrahimi }
4690*22dc650dSSadaf Ebrahimi 
4691*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
4692*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
4693*22dc650dSSadaf Ebrahimi {
4694*22dc650dSSadaf Ebrahimi 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
4695*22dc650dSSadaf Ebrahimi 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
4696*22dc650dSSadaf Ebrahimi 	sljit_uw op = 0;
4697*22dc650dSSadaf Ebrahimi 
4698*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
4699*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
4700*22dc650dSSadaf Ebrahimi 
4701*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
4702*22dc650dSSadaf Ebrahimi 	compiler->mode32 = 1;
4703*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_64 */
4704*22dc650dSSadaf Ebrahimi 
4705*22dc650dSSadaf Ebrahimi 	if (reg_size == 5) {
4706*22dc650dSSadaf Ebrahimi 		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
4707*22dc650dSSadaf Ebrahimi 			return SLJIT_ERR_UNSUPPORTED;
4708*22dc650dSSadaf Ebrahimi 	} else if (reg_size != 4)
4709*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
4710*22dc650dSSadaf Ebrahimi 
4711*22dc650dSSadaf Ebrahimi 	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
4712*22dc650dSSadaf Ebrahimi 		return SLJIT_ERR_UNSUPPORTED;
4713*22dc650dSSadaf Ebrahimi 
4714*22dc650dSSadaf Ebrahimi 	switch (SLJIT_SIMD_GET_OPCODE(type)) {
4715*22dc650dSSadaf Ebrahimi 	case SLJIT_SIMD_OP2_AND:
4716*22dc650dSSadaf Ebrahimi 		op = (type & SLJIT_SIMD_FLOAT) ? ANDPD_x_xm : PAND_x_xm;
4717*22dc650dSSadaf Ebrahimi 
4718*22dc650dSSadaf Ebrahimi 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4719*22dc650dSSadaf Ebrahimi 			op |= EX86_PREF_66;
4720*22dc650dSSadaf Ebrahimi 		break;
4721*22dc650dSSadaf Ebrahimi 	case SLJIT_SIMD_OP2_OR:
4722*22dc650dSSadaf Ebrahimi 		op = (type & SLJIT_SIMD_FLOAT) ? ORPD_x_xm : POR_x_xm;
4723*22dc650dSSadaf Ebrahimi 
4724*22dc650dSSadaf Ebrahimi 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4725*22dc650dSSadaf Ebrahimi 			op |= EX86_PREF_66;
4726*22dc650dSSadaf Ebrahimi 		break;
4727*22dc650dSSadaf Ebrahimi 	case SLJIT_SIMD_OP2_XOR:
4728*22dc650dSSadaf Ebrahimi 		op = (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm;
4729*22dc650dSSadaf Ebrahimi 
4730*22dc650dSSadaf Ebrahimi 		if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
4731*22dc650dSSadaf Ebrahimi 			op |= EX86_PREF_66;
4732*22dc650dSSadaf Ebrahimi 		break;
4733*22dc650dSSadaf Ebrahimi 	}
4734*22dc650dSSadaf Ebrahimi 
4735*22dc650dSSadaf Ebrahimi 	if (type & SLJIT_SIMD_TEST)
4736*22dc650dSSadaf Ebrahimi 		return SLJIT_SUCCESS;
4737*22dc650dSSadaf Ebrahimi 
4738*22dc650dSSadaf Ebrahimi 	if (reg_size == 5 || ((cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX))) {
4739*22dc650dSSadaf Ebrahimi 		if (reg_size == 5)
4740*22dc650dSSadaf Ebrahimi 			op |= VEX_256;
4741*22dc650dSSadaf Ebrahimi 
4742*22dc650dSSadaf Ebrahimi 		return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0);
4743*22dc650dSSadaf Ebrahimi 	}
4744*22dc650dSSadaf Ebrahimi 
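	/* Non-VEX SSE logic ops are destructive (dst op= src). AND/OR/XOR are
	   commutative, so when dst_freg aliases src2_freg the operands are simply
	   swapped instead of copying through a temporary. */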
4745*22dc650dSSadaf Ebrahimi 	if (dst_freg != src1_freg) {
4746*22dc650dSSadaf Ebrahimi 		if (dst_freg == src2_freg)
4747*22dc650dSSadaf Ebrahimi 			src2_freg = src1_freg;
4748*22dc650dSSadaf Ebrahimi 		else
4749*22dc650dSSadaf Ebrahimi 			FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg));
4750*22dc650dSSadaf Ebrahimi 	}
4751*22dc650dSSadaf Ebrahimi 
4752*22dc650dSSadaf Ebrahimi 	FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0));
4753*22dc650dSSadaf Ebrahimi 	return SLJIT_SUCCESS;
4754*22dc650dSSadaf Ebrahimi }
4755*22dc650dSSadaf Ebrahimi 
4756*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
4757*22dc650dSSadaf Ebrahimi 	sljit_s32 dst_reg,
4758*22dc650dSSadaf Ebrahimi 	sljit_s32 mem_reg)
4759*22dc650dSSadaf Ebrahimi {
4760*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
4761*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));
4762*22dc650dSSadaf Ebrahimi 
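	/* A naturally aligned, register-sized or smaller load is already atomic
	   on x86, so the load half is an ordinary MOV; the compare-and-swap is
	   provided by sljit_emit_atomic_store (CMPXCHG) below. */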
4763*22dc650dSSadaf Ebrahimi 	SLJIT_SKIP_CHECKS(compiler);
4764*22dc650dSSadaf Ebrahimi 	return sljit_emit_op1(compiler, op, dst_reg, 0, SLJIT_MEM1(mem_reg), 0);
4765*22dc650dSSadaf Ebrahimi }
4766*22dc650dSSadaf Ebrahimi 
4767*22dc650dSSadaf Ebrahimi SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op,
4768*22dc650dSSadaf Ebrahimi 	sljit_s32 src_reg,
4769*22dc650dSSadaf Ebrahimi 	sljit_s32 mem_reg,
4770*22dc650dSSadaf Ebrahimi 	sljit_s32 temp_reg)
4771*22dc650dSSadaf Ebrahimi {
4772*22dc650dSSadaf Ebrahimi 	sljit_uw pref;
4773*22dc650dSSadaf Ebrahimi 	sljit_s32 free_reg = TMP_REG1;
4774*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4775*22dc650dSSadaf Ebrahimi 	sljit_sw srcw = 0;
4776*22dc650dSSadaf Ebrahimi 	sljit_sw tempw = 0;
4777*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
4778*22dc650dSSadaf Ebrahimi 
4779*22dc650dSSadaf Ebrahimi 	CHECK_ERROR();
4780*22dc650dSSadaf Ebrahimi 	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
4781*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(src_reg, srcw, (void)0);
4782*22dc650dSSadaf Ebrahimi 	CHECK_EXTRA_REGS(temp_reg, tempw, (void)0);
4783*22dc650dSSadaf Ebrahimi 
4784*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(FAST_IS_REG(src_reg) || src_reg == SLJIT_MEM1(SLJIT_SP));
4785*22dc650dSSadaf Ebrahimi 	SLJIT_ASSERT(FAST_IS_REG(temp_reg) || temp_reg == SLJIT_MEM1(SLJIT_SP));
4786*22dc650dSSadaf Ebrahimi 
4787*22dc650dSSadaf Ebrahimi 	op = GET_OPCODE(op);
4788*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
4789*22dc650dSSadaf Ebrahimi 	if ((src_reg & SLJIT_MEM) || (op == SLJIT_MOV_U8 && reg_map[src_reg] >= 4)) {
4790*22dc650dSSadaf Ebrahimi 		/* Src is a virtual register, or its low byte is not accessible. */
4791*22dc650dSSadaf Ebrahimi 		SLJIT_ASSERT(src_reg != SLJIT_R1);
4792*22dc650dSSadaf Ebrahimi 		free_reg = src_reg;
4793*22dc650dSSadaf Ebrahimi 
4794*22dc650dSSadaf Ebrahimi 		EMIT_MOV(compiler, TMP_REG1, 0, src_reg, srcw);
4795*22dc650dSSadaf Ebrahimi 		src_reg = TMP_REG1;
4796*22dc650dSSadaf Ebrahimi 
4797*22dc650dSSadaf Ebrahimi 		if (mem_reg == src_reg)
4798*22dc650dSSadaf Ebrahimi 			mem_reg = TMP_REG1;
4799*22dc650dSSadaf Ebrahimi 	}
4800*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86_32 */
4801*22dc650dSSadaf Ebrahimi 
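	/* Move the expected value into R0 for CMPXCHG; the registers it
	   displaces are saved so they can be restored after the exchange,
	   and src_reg/mem_reg are redirected if they referred to R0. */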
	if (temp_reg != SLJIT_R0) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		compiler->mode32 = 0;

		EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
		EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, 0);

		if (src_reg == SLJIT_R0)
			src_reg = free_reg;
		if (mem_reg == SLJIT_R0)
			mem_reg = free_reg;
#else /* !SLJIT_CONFIG_X86_64 */
		if (src_reg == TMP_REG1 && mem_reg == SLJIT_R0 && (free_reg & SLJIT_MEM)) {
			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R1, 0);
			EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_R0, 0);
			EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);

			mem_reg = SLJIT_R1;
			free_reg = SLJIT_R1;
		} else {
			EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
			EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);

			if (src_reg == SLJIT_R0)
				src_reg = free_reg;
			if (mem_reg == SLJIT_R0)
				mem_reg = free_reg;
		}
#endif /* SLJIT_CONFIG_X86_64 */
	}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = op != SLJIT_MOV && op != SLJIT_MOV_P;
#endif /* SLJIT_CONFIG_X86_64 */

	/* Lock prefix. */
	FAIL_IF(emit_byte(compiler, GROUP_LOCK));

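	/* Operand-size handling: the 16-bit cmpxchg needs the 0x66 prefix; on
	   x86-64 the 8-bit form is emitted with a REX prefix so the low byte
	   of any source register can be encoded. */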
	pref = 0;
	if (op == SLJIT_MOV_U16)
		pref = EX86_HALF_ARG | EX86_PREF_66;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (op == SLJIT_MOV_U8)
		pref = EX86_REX;
#endif /* SLJIT_CONFIG_X86_64 */

	FAIL_IF(emit_groupf(compiler, (op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r) | pref, src_reg, SLJIT_MEM1(mem_reg), 0));

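	/* Undo the register shuffling: restore the original R0 and, on x86-32,
	   any register that was spilled to the stack. The outcome of the
	   exchange is reported through the status flags set by CMPXCHG. */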
	if (temp_reg != SLJIT_R0) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		compiler->mode32 = 0;
		return emit_mov(compiler, SLJIT_R0, 0, TMP_REG1, 0);
#else /* !SLJIT_CONFIG_X86_64 */
		EMIT_MOV(compiler, SLJIT_R0, 0, free_reg, 0);
		if (free_reg != TMP_REG1)
			return emit_mov(compiler, free_reg, 0, (free_reg == SLJIT_R1) ? SLJIT_MEM1(SLJIT_SP) : TMP_REG1, 0);
#endif /* SLJIT_CONFIG_X86_64 */
	}
	return SLJIT_SUCCESS;
}

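/* Computes the address SLJIT_SP + offset into dst, using LEA when the
   offset is non-zero and a plain MOV otherwise; offsets that do not fit
   in a signed 32-bit immediate are first loaded into TMP_REG1 on x86-64. */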
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
{
	CHECK_ERROR();
	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (NOT_HALFWORD(offset)) {
		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
		return compiler->error;
#else
		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
#endif
	}
#endif

	if (offset != 0)
		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
}

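/* Emits a patchable constant: the value is materialized with a full-width
   immediate move and the instruction is tagged with SLJIT_INST_CONST so
   that sljit_set_const can rewrite the immediate in place later. */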
SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
{
	sljit_u8 *inst;
	struct sljit_const *const_;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_s32 reg;
#endif

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
	PTR_FAIL_IF(!const_);
	set_const(const_, compiler);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if (emit_load_imm64(compiler, reg, init_value))
		return NULL;
#else
	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
		return NULL;
#endif

	inst = (sljit_u8*)ensure_buf(compiler, 1);
	PTR_FAIL_IF(!inst);

	inst[0] = SLJIT_INST_CONST;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (dst & SLJIT_MEM)
		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
			return NULL;
#endif

	return const_;
}

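/* Emits a placeholder address load (immediate 0) and records a jump entry
   so the real target address can be patched in during code generation;
   on x86-64 registers r8-r15 are flagged with MOV_ADDR_HI. */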
SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_mov_addr(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw)
{
	struct sljit_jump *jump;
	sljit_u8 *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_s32 reg;
#endif /* SLJIT_CONFIG_X86_64 */

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_mov_addr(compiler, dst, dstw));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
	PTR_FAIL_IF(!jump);
	set_mov_addr(jump, compiler, 0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;

	PTR_FAIL_IF(emit_load_imm64(compiler, reg, 0));
	jump->addr = compiler->size;

	if (reg_map[reg] >= 8)
		jump->flags |= MOV_ADDR_HI;
#else /* !SLJIT_CONFIG_X86_64 */
	PTR_FAIL_IF(emit_mov(compiler, dst, dstw, SLJIT_IMM, 0));
#endif /* SLJIT_CONFIG_X86_64 */

	inst = (sljit_u8*)ensure_buf(compiler, 1);
	PTR_FAIL_IF(!inst);

	inst[0] = SLJIT_INST_MOV_ADDR;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (dst & SLJIT_MEM)
		PTR_FAIL_IF(emit_mov(compiler, dst, dstw, TMP_REG1, 0));
#endif /* SLJIT_CONFIG_X86_64 */

	return jump;
}

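/* Patches a jump target in place after code generation: x86-32 stores a
   new relative displacement, x86-64 stores the absolute 64-bit address.
   SLJIT_UPDATE_WX_FLAGS toggles write access around the store for W^X
   executable allocators. */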
SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
{
	SLJIT_UNUSED_ARG(executable_offset);

	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 0);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_unaligned_store_sw((void*)addr, (sljit_sw)(new_target - (addr + 4) - (sljit_uw)executable_offset));
#else
	sljit_unaligned_store_sw((void*)addr, (sljit_sw)new_target);
#endif
	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 1);
}

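/* Rewrites the immediate that was emitted by sljit_emit_const. */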
SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
{
	SLJIT_UNUSED_ARG(executable_offset);

	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 0);
	sljit_unaligned_store_sw((void*)addr, new_constant);
	SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_sw)), 1);
}