xref: /aosp_15_r20/external/pcre/src/pcre2_jit_simd_inc.h (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi                     This module by Zoltan Herczeg
10*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
11*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2019 University of Cambridge
12*22dc650dSSadaf Ebrahimi 
13*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
14*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
15*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
16*22dc650dSSadaf Ebrahimi 
17*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
18*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
19*22dc650dSSadaf Ebrahimi 
20*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
21*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
22*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
23*22dc650dSSadaf Ebrahimi 
24*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
25*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
26*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
27*22dc650dSSadaf Ebrahimi 
28*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
39*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
40*22dc650dSSadaf Ebrahimi */
41*22dc650dSSadaf Ebrahimi 
42*22dc650dSSadaf Ebrahimi #if !(defined SUPPORT_VALGRIND)
43*22dc650dSSadaf Ebrahimi 
44*22dc650dSSadaf Ebrahimi #if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
45*22dc650dSSadaf Ebrahimi      || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
46*22dc650dSSadaf Ebrahimi      || (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))
47*22dc650dSSadaf Ebrahimi 
/* Selects the instruction sequence emitted by
fast_forward_char_pair_sse2_compare() for matching one or two characters. */
typedef enum {
  /* One character: a single equality compare against cmp1. */
  vector_compare_match1,
  /* Two characters for which is_powerof2(char1 ^ char2) holds: the differing
  bit is ORed into the data first, then a single compare against cmp1. */
  vector_compare_match1i,
  /* Two unrelated characters: two equality compares whose results are ORed. */
  vector_compare_match2,
} vector_compare_type;
53*22dc650dSSadaf Ebrahimi 
54*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
max_fast_forward_char_pair_offset(void)55*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
56*22dc650dSSadaf Ebrahimi {
57*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
58*22dc650dSSadaf Ebrahimi /* The AVX2 code path is currently disabled. */
59*22dc650dSSadaf Ebrahimi /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */
60*22dc650dSSadaf Ebrahimi return 15;
61*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
62*22dc650dSSadaf Ebrahimi /* The AVX2 code path is currently disabled. */
63*22dc650dSSadaf Ebrahimi /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */
64*22dc650dSSadaf Ebrahimi return 7;
65*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
66*22dc650dSSadaf Ebrahimi /* The AVX2 code path is currently disabled. */
67*22dc650dSSadaf Ebrahimi /* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 7 : 3; */
68*22dc650dSSadaf Ebrahimi return 3;
69*22dc650dSSadaf Ebrahimi #else
70*22dc650dSSadaf Ebrahimi #error "Unsupported unit width"
71*22dc650dSSadaf Ebrahimi #endif
72*22dc650dSSadaf Ebrahimi }
73*22dc650dSSadaf Ebrahimi #else /* !SLJIT_CONFIG_X86 */
max_fast_forward_char_pair_offset(void)74*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
75*22dc650dSSadaf Ebrahimi {
76*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
77*22dc650dSSadaf Ebrahimi return 15;
78*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
79*22dc650dSSadaf Ebrahimi return 7;
80*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
81*22dc650dSSadaf Ebrahimi return 3;
82*22dc650dSSadaf Ebrahimi #else
83*22dc650dSSadaf Ebrahimi #error "Unsupported unit width"
84*22dc650dSSadaf Ebrahimi #endif
85*22dc650dSSadaf Ebrahimi }
86*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86 */
87*22dc650dSSadaf Ebrahimi 
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Emits code that jumps when the code unit held in reg is not a trailing
unit of a multi-unit UTF character, i.e. when it can start a character.

  compiler  the sljit compiler the code is emitted into
  reg       register holding the code unit; it is overwritten with the
            masked value

Returns the emitted jump; the caller is responsible for setting its target. */
static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
/* UTF-8 continuation bytes have the form 10xxxxxx. */
OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
#elif PCRE2_CODE_UNIT_WIDTH == 16
/* UTF-16 low (trailing) surrogates are in the range 0xdc00-0xdfff. */
OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
#else
#error "Unknown code width"
#endif
}
#endif
102*22dc650dSSadaf Ebrahimi 
#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X || SLJIT_CONFIG_LOONGARCH_64 */
104*22dc650dSSadaf Ebrahimi 
105*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
106*22dc650dSSadaf Ebrahimi 
character_to_int32(PCRE2_UCHAR chr)107*22dc650dSSadaf Ebrahimi static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
108*22dc650dSSadaf Ebrahimi {
109*22dc650dSSadaf Ebrahimi sljit_u32 value = chr;
110*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
111*22dc650dSSadaf Ebrahimi #define SIMD_COMPARE_TYPE_INDEX 0
112*22dc650dSSadaf Ebrahimi return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
113*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
114*22dc650dSSadaf Ebrahimi #define SIMD_COMPARE_TYPE_INDEX 1
115*22dc650dSSadaf Ebrahimi return (sljit_s32)((value << 16) | value);
116*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
117*22dc650dSSadaf Ebrahimi #define SIMD_COMPARE_TYPE_INDEX 2
118*22dc650dSSadaf Ebrahimi return (sljit_s32)(value);
119*22dc650dSSadaf Ebrahimi #else
120*22dc650dSSadaf Ebrahimi #error "Unsupported unit width"
121*22dc650dSSadaf Ebrahimi #endif
122*22dc650dSSadaf Ebrahimi }
123*22dc650dSSadaf Ebrahimi 
/* Emits one step of the vector "lane equals char1 or char2" sequence as a raw
four byte instruction. Callers invoke it with step = 0, 1, 2, 3 in that order;
depending on compare_type and reg_type some steps emit nothing:

  match1                 step 2: PCMPEQ dst, cmp1
  match1i                step 0: POR    dst, cmp2
                         step 2: PCMPEQ dst, cmp1
  match2, 128 bit        step 0: MOVDQA tmp, dst
                         step 1: PCMPEQ dst, cmp1
                         step 2: PCMPEQ tmp, cmp2
                         step 3: POR    dst, tmp
  match2, VEX encoded    step 0: PCMPEQ tmp, dst, cmp2 (three operand form,
                                 so no register copy is needed)
                         step 1: PCMPEQ dst, cmp1
                         step 3: POR    dst, tmp

  compiler      the sljit compiler the instruction is emitted into
  compare_type  which of the sequences above is used
  reg_type      SLJIT_SIMD_REG_128 selects the 66 0F prefix; any other value
                selects the two byte VEX prefix C5 FD
  step          0..3 (asserted)
  dst_ind       machine register index of the data / result register
  cmp1_ind      machine register index of the first comparison vector
  cmp2_ind      machine register index of the second comparison vector
                (or of the bit mask for match1i)
  tmp_ind       machine register index of a scratch vector register

The register indexes are ORed directly into the ModRM byte. */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u8 instruction[4];

if (reg_type == SLJIT_SIMD_REG_128)
  {
  instruction[0] = 0x66;
  instruction[1] = 0x0f;
  }
else
  {
  /* Two byte VEX prefix. */
  instruction[0] = 0xc5;
  instruction[1] = 0xfd;
  }

SLJIT_ASSERT(step >= 0 && step <= 3);

if (compare_type != vector_compare_match2)
  {
  if (step == 0)
    {
    if (compare_type == vector_compare_match1i)
      {
      /* POR xmm1, xmm2/m128 */
      /* VEX.vvvv (bits 3..6 of this byte) holds the first source register
      in inverted form; 0xfd has it all ones, so XOR-ing in dst_ind << 3
      selects dst as the first source. */
      if (reg_type == SLJIT_SIMD_REG_256)
        instruction[1] ^= (dst_ind << 3);

      /* Prefix is filled. */
      instruction[2] = 0xeb;
      instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
      sljit_emit_op_custom(compiler, instruction, 4);
      }
    return;
    }

  /* Steps 1 and 3 emit nothing for single compare sequences. */
  if (step != 2)
    return;

  /* PCMPEQB/W/D xmm1, xmm2/m128 */
  if (reg_type == SLJIT_SIMD_REG_256)
    instruction[1] ^= (dst_ind << 3);

  /* Prefix is filled. */
  instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
  instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;
  }

if (reg_type == SLJIT_SIMD_REG_256)
  {
  /* The compare against cmp2 is emitted at step 0 in this mode. */
  if (step == 2)
    return;

  if (step == 0)
    {
    /* Reuse case 2 below with dst as the VEX first source, which replaces
    the MOVDQA + PCMPEQ pair of the 128 bit sequence. */
    step = 2;
    instruction[1] ^= (dst_ind << 3);
    }
  }

switch (step)
  {
  case 0:
  SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);

  /* MOVDQA xmm1, xmm2/m128 */
  /* Prefix is filled. */
  instruction[2] = 0x6f;
  instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;

  case 1:
  /* PCMPEQB/W/D xmm1, xmm2/m128 */
  if (reg_type == SLJIT_SIMD_REG_256)
    instruction[1] ^= (dst_ind << 3);

  /* Prefix is filled. */
  instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
  instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;

  case 2:
  /* PCMPEQB/W/D xmm1, xmm2/m128 */
  /* Prefix is filled. */
  instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
  instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;

  case 3:
  /* POR xmm1, xmm2/m128 */
  if (reg_type == SLJIT_SIMD_REG_256)
    instruction[1] ^= (dst_ind << 3);

  /* Prefix is filled. */
  instruction[2] = 0xeb;
  instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
  sljit_emit_op_custom(compiler, instruction, 4);
  return;
  }
}
230*22dc650dSSadaf Ebrahimi 
231*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
232*22dc650dSSadaf Ebrahimi 
fast_forward_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2,sljit_s32 offset)233*22dc650dSSadaf Ebrahimi static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
234*22dc650dSSadaf Ebrahimi {
235*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
236*22dc650dSSadaf Ebrahimi sljit_u8 instruction[8];
237*22dc650dSSadaf Ebrahimi /* The AVX2 code path is currently disabled. */
238*22dc650dSSadaf Ebrahimi /* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
239*22dc650dSSadaf Ebrahimi sljit_s32 reg_type = SLJIT_SIMD_REG_128;
240*22dc650dSSadaf Ebrahimi sljit_s32 value;
241*22dc650dSSadaf Ebrahimi struct sljit_label *start;
242*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
243*22dc650dSSadaf Ebrahimi struct sljit_label *restart;
244*22dc650dSSadaf Ebrahimi #endif
245*22dc650dSSadaf Ebrahimi struct sljit_jump *quit;
246*22dc650dSSadaf Ebrahimi struct sljit_jump *partial_quit[2];
247*22dc650dSSadaf Ebrahimi vector_compare_type compare_type = vector_compare_match1;
248*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
249*22dc650dSSadaf Ebrahimi sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
250*22dc650dSSadaf Ebrahimi sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
251*22dc650dSSadaf Ebrahimi sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
252*22dc650dSSadaf Ebrahimi sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
253*22dc650dSSadaf Ebrahimi sljit_u32 bit = 0;
254*22dc650dSSadaf Ebrahimi int i;
255*22dc650dSSadaf Ebrahimi 
256*22dc650dSSadaf Ebrahimi SLJIT_UNUSED_ARG(offset);
257*22dc650dSSadaf Ebrahimi 
258*22dc650dSSadaf Ebrahimi if (char1 != char2)
259*22dc650dSSadaf Ebrahimi   {
260*22dc650dSSadaf Ebrahimi   bit = char1 ^ char2;
261*22dc650dSSadaf Ebrahimi   compare_type = vector_compare_match1i;
262*22dc650dSSadaf Ebrahimi 
263*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit))
264*22dc650dSSadaf Ebrahimi     {
265*22dc650dSSadaf Ebrahimi     bit = 0;
266*22dc650dSSadaf Ebrahimi     compare_type = vector_compare_match2;
267*22dc650dSSadaf Ebrahimi     }
268*22dc650dSSadaf Ebrahimi   }
269*22dc650dSSadaf Ebrahimi 
270*22dc650dSSadaf Ebrahimi partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
271*22dc650dSSadaf Ebrahimi if (common->mode == PCRE2_JIT_COMPLETE)
272*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, partial_quit[0]);
273*22dc650dSSadaf Ebrahimi 
274*22dc650dSSadaf Ebrahimi /* First part (unaligned start) */
275*22dc650dSSadaf Ebrahimi value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
276*22dc650dSSadaf Ebrahimi sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
277*22dc650dSSadaf Ebrahimi 
278*22dc650dSSadaf Ebrahimi if (char1 != char2)
279*22dc650dSSadaf Ebrahimi   sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
280*22dc650dSSadaf Ebrahimi 
281*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
282*22dc650dSSadaf Ebrahimi 
283*22dc650dSSadaf Ebrahimi sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);
284*22dc650dSSadaf Ebrahimi 
285*22dc650dSSadaf Ebrahimi if (char1 != char2)
286*22dc650dSSadaf Ebrahimi   sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
287*22dc650dSSadaf Ebrahimi 
288*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
289*22dc650dSSadaf Ebrahimi restart = LABEL();
290*22dc650dSSadaf Ebrahimi #endif
291*22dc650dSSadaf Ebrahimi 
292*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
293*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
294*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
295*22dc650dSSadaf Ebrahimi 
296*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
297*22dc650dSSadaf Ebrahimi sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
298*22dc650dSSadaf Ebrahimi 
299*22dc650dSSadaf Ebrahimi for (i = 0; i < 4; i++)
300*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
301*22dc650dSSadaf Ebrahimi 
302*22dc650dSSadaf Ebrahimi sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
303*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
304*22dc650dSSadaf Ebrahimi OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
305*22dc650dSSadaf Ebrahimi 
306*22dc650dSSadaf Ebrahimi quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
307*22dc650dSSadaf Ebrahimi 
308*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
309*22dc650dSSadaf Ebrahimi 
310*22dc650dSSadaf Ebrahimi /* Second part (aligned) */
311*22dc650dSSadaf Ebrahimi start = LABEL();
312*22dc650dSSadaf Ebrahimi 
313*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
314*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
315*22dc650dSSadaf Ebrahimi 
316*22dc650dSSadaf Ebrahimi partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
317*22dc650dSSadaf Ebrahimi if (common->mode == PCRE2_JIT_COMPLETE)
318*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, partial_quit[1]);
319*22dc650dSSadaf Ebrahimi 
320*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
321*22dc650dSSadaf Ebrahimi sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
322*22dc650dSSadaf Ebrahimi for (i = 0; i < 4; i++)
323*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
324*22dc650dSSadaf Ebrahimi 
325*22dc650dSSadaf Ebrahimi sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
326*22dc650dSSadaf Ebrahimi CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
327*22dc650dSSadaf Ebrahimi 
328*22dc650dSSadaf Ebrahimi JUMPHERE(quit);
329*22dc650dSSadaf Ebrahimi 
330*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(tmp1_reg_ind < 8);
331*22dc650dSSadaf Ebrahimi /* BSF r32, r/m32 */
332*22dc650dSSadaf Ebrahimi instruction[0] = 0x0f;
333*22dc650dSSadaf Ebrahimi instruction[1] = 0xbc;
334*22dc650dSSadaf Ebrahimi instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
335*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 3);
336*22dc650dSSadaf Ebrahimi 
337*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
338*22dc650dSSadaf Ebrahimi 
339*22dc650dSSadaf Ebrahimi if (common->mode != PCRE2_JIT_COMPLETE)
340*22dc650dSSadaf Ebrahimi   {
341*22dc650dSSadaf Ebrahimi   JUMPHERE(partial_quit[0]);
342*22dc650dSSadaf Ebrahimi   JUMPHERE(partial_quit[1]);
343*22dc650dSSadaf Ebrahimi   OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
344*22dc650dSSadaf Ebrahimi   SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
345*22dc650dSSadaf Ebrahimi   }
346*22dc650dSSadaf Ebrahimi else
347*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
348*22dc650dSSadaf Ebrahimi 
349*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
350*22dc650dSSadaf Ebrahimi if (common->utf && offset > 0)
351*22dc650dSSadaf Ebrahimi   {
352*22dc650dSSadaf Ebrahimi   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
353*22dc650dSSadaf Ebrahimi 
354*22dc650dSSadaf Ebrahimi   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
355*22dc650dSSadaf Ebrahimi 
356*22dc650dSSadaf Ebrahimi   quit = jump_if_utf_char_start(compiler, TMP1);
357*22dc650dSSadaf Ebrahimi 
358*22dc650dSSadaf Ebrahimi   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
359*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
360*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
361*22dc650dSSadaf Ebrahimi   JUMPTO(SLJIT_JUMP, restart);
362*22dc650dSSadaf Ebrahimi 
363*22dc650dSSadaf Ebrahimi   JUMPHERE(quit);
364*22dc650dSSadaf Ebrahimi   }
365*22dc650dSSadaf Ebrahimi #endif
366*22dc650dSSadaf Ebrahimi }
367*22dc650dSSadaf Ebrahimi 
368*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
369*22dc650dSSadaf Ebrahimi 
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)370*22dc650dSSadaf Ebrahimi static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
371*22dc650dSSadaf Ebrahimi {
372*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
373*22dc650dSSadaf Ebrahimi sljit_u8 instruction[8];
374*22dc650dSSadaf Ebrahimi /* The AVX2 code path is currently disabled. */
375*22dc650dSSadaf Ebrahimi /* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
376*22dc650dSSadaf Ebrahimi sljit_s32 reg_type = SLJIT_SIMD_REG_128;
377*22dc650dSSadaf Ebrahimi sljit_s32 value;
378*22dc650dSSadaf Ebrahimi struct sljit_label *start;
379*22dc650dSSadaf Ebrahimi struct sljit_jump *quit;
380*22dc650dSSadaf Ebrahimi jump_list *not_found = NULL;
381*22dc650dSSadaf Ebrahimi vector_compare_type compare_type = vector_compare_match1;
382*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
383*22dc650dSSadaf Ebrahimi sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
384*22dc650dSSadaf Ebrahimi sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
385*22dc650dSSadaf Ebrahimi sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
386*22dc650dSSadaf Ebrahimi sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
387*22dc650dSSadaf Ebrahimi sljit_u32 bit = 0;
388*22dc650dSSadaf Ebrahimi int i;
389*22dc650dSSadaf Ebrahimi 
390*22dc650dSSadaf Ebrahimi if (char1 != char2)
391*22dc650dSSadaf Ebrahimi   {
392*22dc650dSSadaf Ebrahimi   bit = char1 ^ char2;
393*22dc650dSSadaf Ebrahimi   compare_type = vector_compare_match1i;
394*22dc650dSSadaf Ebrahimi 
395*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit))
396*22dc650dSSadaf Ebrahimi     {
397*22dc650dSSadaf Ebrahimi     bit = 0;
398*22dc650dSSadaf Ebrahimi     compare_type = vector_compare_match2;
399*22dc650dSSadaf Ebrahimi     }
400*22dc650dSSadaf Ebrahimi   }
401*22dc650dSSadaf Ebrahimi 
402*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
403*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
404*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
405*22dc650dSSadaf Ebrahimi 
406*22dc650dSSadaf Ebrahimi /* First part (unaligned start) */
407*22dc650dSSadaf Ebrahimi 
408*22dc650dSSadaf Ebrahimi value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
409*22dc650dSSadaf Ebrahimi sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
410*22dc650dSSadaf Ebrahimi 
411*22dc650dSSadaf Ebrahimi if (char1 != char2)
412*22dc650dSSadaf Ebrahimi   sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
413*22dc650dSSadaf Ebrahimi 
414*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
415*22dc650dSSadaf Ebrahimi 
416*22dc650dSSadaf Ebrahimi sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);
417*22dc650dSSadaf Ebrahimi 
418*22dc650dSSadaf Ebrahimi if (char1 != char2)
419*22dc650dSSadaf Ebrahimi   sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
420*22dc650dSSadaf Ebrahimi 
421*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
422*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
423*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
424*22dc650dSSadaf Ebrahimi 
425*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
426*22dc650dSSadaf Ebrahimi sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
427*22dc650dSSadaf Ebrahimi 
428*22dc650dSSadaf Ebrahimi for (i = 0; i < 4; i++)
429*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
430*22dc650dSSadaf Ebrahimi 
431*22dc650dSSadaf Ebrahimi sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
432*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
433*22dc650dSSadaf Ebrahimi OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
434*22dc650dSSadaf Ebrahimi 
435*22dc650dSSadaf Ebrahimi quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
436*22dc650dSSadaf Ebrahimi 
437*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
438*22dc650dSSadaf Ebrahimi 
439*22dc650dSSadaf Ebrahimi /* Second part (aligned) */
440*22dc650dSSadaf Ebrahimi start = LABEL();
441*22dc650dSSadaf Ebrahimi 
442*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
443*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
444*22dc650dSSadaf Ebrahimi 
445*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
446*22dc650dSSadaf Ebrahimi 
447*22dc650dSSadaf Ebrahimi value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
448*22dc650dSSadaf Ebrahimi sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
449*22dc650dSSadaf Ebrahimi 
450*22dc650dSSadaf Ebrahimi for (i = 0; i < 4; i++)
451*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
452*22dc650dSSadaf Ebrahimi 
453*22dc650dSSadaf Ebrahimi sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
454*22dc650dSSadaf Ebrahimi CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
455*22dc650dSSadaf Ebrahimi 
456*22dc650dSSadaf Ebrahimi JUMPHERE(quit);
457*22dc650dSSadaf Ebrahimi 
458*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(tmp1_reg_ind < 8);
459*22dc650dSSadaf Ebrahimi /* BSF r32, r/m32 */
460*22dc650dSSadaf Ebrahimi instruction[0] = 0x0f;
461*22dc650dSSadaf Ebrahimi instruction[1] = 0xbc;
462*22dc650dSSadaf Ebrahimi instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
463*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 3);
464*22dc650dSSadaf Ebrahimi 
465*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
466*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
467*22dc650dSSadaf Ebrahimi 
468*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
469*22dc650dSSadaf Ebrahimi return not_found;
470*22dc650dSSadaf Ebrahimi }
471*22dc650dSSadaf Ebrahimi 
472*22dc650dSSadaf Ebrahimi #ifndef _WIN64
473*22dc650dSSadaf Ebrahimi 
474*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
475*22dc650dSSadaf Ebrahimi 
/* Emits JIT code which advances STR_PTR to the next subject position where
   the code unit at offset offs1 matches char1a or char1b and the code unit
   at offset offs2 matches char2a or char2b. When STR_END is reached first,
   the generated code jumps to common->failed_match.

   Arguments:
     common          JIT compiler state
     offs1, offs2    code unit offsets of the two characters; the asserts
                     below require offs1 > offs2 >= 0
     char1a, char1b  accepted code units at offs1 (equal if there is only one)
     char2a, char2b  accepted code units at offs2 (equal if there is only one)

   Only PCRE2_JIT_COMPLETE mode is supported (asserted below). */
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
/* The AVX2 code path is currently disabled. */
/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
sljit_s32 reg_type = SLJIT_SIMD_REG_128;
sljit_s32 value;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
/* Byte distance between the two inspected positions. */
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
/* Machine register numbers, needed by the hand-encoded instructions below.
   data1/data2 hold the subject data for offs1/offs2, cmpNa/cmpNb hold the
   broadcast comparison values, tmp1/tmp2 are scratch registers. */
sljit_s32 data1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
sljit_s32 data2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
sljit_s32 cmp1a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
sljit_s32 cmp2a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
sljit_s32 cmp1b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR4);
sljit_s32 cmp2b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR5);
sljit_s32 tmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR6);
sljit_s32 tmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_TMP_FR0);
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);
SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  /* STR_END is saved in TMP3 (restored at the end of this function) and is
     apparently limited to the stored match end plus offs1 + 1 code units
     when that is lower. */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

/* From here on STR_PTR addresses the offs1 position; offs1 is subtracted
   again just before returning. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* Select the comparison method for the first character:
     match1   a single value
     match1i  two values which differ in one bit: the data is compared to
              (char | bit), the bit itself goes to the second register
     match2   two unrelated values, each needs its own comparison. */
if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
    }
  }

/* Move the 32 bit comparison values into lane zero of FR2 (and FR4). */
value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, TMP1, 0);

if (char1a != char1b)
  sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR4, 0, TMP2, 0);

/* Same selection for the second character. */
if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
    }
  }

sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR3, 0, TMP1, 0);

if (char2a != char2b)
  sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR5, 0, TMP2, 0);

/* Replicate lane zero into every 32 bit lane of the comparison registers. */
sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
if (char1a != char1b)
  sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR4, SLJIT_FR4, 0);

sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR3, SLJIT_FR3, 0);
if (char2a != char2b)
  sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR5, SLJIT_FR5, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* The UTF check at the end jumps back here when the position found is not
   the start of a character. */
restart = LABEL();
#endif

/* TMP1 = address of the offs2 position, TMP2 = unaligned STR_PTR. STR_PTR
   itself is rounded down to the vector size for the aligned load. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);

/* When the offs2 position is not below the aligned address, the second
   vector is derived from the first one by a byte shift (jump[0] path)
   instead of a second memory load. */
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

if (reg_type == SLJIT_SIMD_REG_256)
  {
  if (diff != 16)
    {
    /* PSLLDQ ymm1, ymm2, imm8 */
    instruction[0] = 0xc5;
    instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));
    instruction[2] = 0x73;
    instruction[3] = 0xc0 | (7 << 3) | data1_ind;
    instruction[4] = diff & 0xf;
    sljit_emit_op_custom(compiler, instruction, 5);
    }

  instruction[0] = 0xc4;
  instruction[1] = 0xe3;
  if (diff < 16)
    {
    /* VINSERTI128 xmm1, xmm2, xmm3/m128 */
    /* instruction[0] = 0xc4; */
    /* instruction[1] = 0xe3; */
    instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));
    instruction[3] = 0x38;
    SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR) <= 7);
    instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
    instruction[5] = (sljit_u8)(16 - diff);
    instruction[6] = 1;
    sljit_emit_op_custom(compiler, instruction, 7);
    }
  else
    {
    /* VPERM2I128 xmm1, xmm2, xmm3/m128 */
    /* instruction[0] = 0xc4; */
    /* instruction[1] = 0xe3; */
    value = (diff == 16) ? data1_ind : data2_ind;
    instruction[2] = (sljit_u8)(0x7d ^ (value << 3));
    instruction[3] = 0x46;
    instruction[4] = 0xc0 | (data2_ind << 3) | value;
    instruction[5] = 0x08;
    sljit_emit_op_custom(compiler, instruction, 6);
    }
  }
else
  {
  /* Copy data1 to data2, then shift data2 left by diff bytes. */

  /* MOVDQA xmm1, xmm2/m128 */
  instruction[0] = 0x66;
  instruction[1] = 0x0f;
  instruction[2] = 0x6f;
  instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
  sljit_emit_op_custom(compiler, instruction, 4);

  /* PSLLDQ xmm1, imm8 */
  /* instruction[0] = 0x66; */
  /* instruction[1] = 0x0f; */
  instruction[2] = 0x73;
  instruction[3] = 0xc0 | (7 << 3) | data2_ind;
  instruction[4] = diff;
  sljit_emit_op_custom(compiler, instruction, 5);
  }

JUMPHERE(jump[1]);

/* TMP2 = byte offset of the original STR_PTR inside the aligned vector. */
value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);

/* The comparison of each character is emitted as four interleaved steps. */
for (i = 0; i < 4; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  }

/* A position is a candidate only if both comparisons succeeded there. The
   per-byte result is collected into the bits of TMP1. */
sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

/* A candidate was found in the first (partial) vector. */
jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* No candidate: return to the aligned address for the main loop. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);

for (i = 0; i < 4; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
  }

sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* The instruction below is encoded without a REX prefix, so the register
   number must fit into three bits. */
SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
sljit_emit_op_custom(compiler, instruction, 3);

/* TMP1 is now the index of the lowest set bit, i.e. the byte offset of the
   first candidate. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  /* Check the code unit at the start position (STR_PTR - offs1). If it does
     not start a character, advance by one code unit and search again. */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

/* Undo the initial offs1 adjustment. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
738*22dc650dSSadaf Ebrahimi 
739*22dc650dSSadaf Ebrahimi #endif /* !_WIN64 */
740*22dc650dSSadaf Ebrahimi 
741*22dc650dSSadaf Ebrahimi #undef SIMD_COMPARE_TYPE_INDEX
742*22dc650dSSadaf Ebrahimi 
743*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_X86 */
744*22dc650dSSadaf Ebrahimi 
745*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))
746*22dc650dSSadaf Ebrahimi 
747*22dc650dSSadaf Ebrahimi #include <arm_neon.h>
748*22dc650dSSadaf Ebrahimi 
/* Packs up to four 8 bit character values into a single unsigned int, so
   that they can be handed to a helper function as one immediate argument:
   the members of c are filled in and the combined value is read from x. */
typedef union {
  unsigned int x;
  struct { unsigned char c1, c2, c3, c4; } c;
} int_char;
753*22dc650dSSadaf Ebrahimi 
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Returns non-zero when the code unit at s cannot start a character:
   a UTF-8 continuation byte (bit pattern 10xxxxxx) in 8 bit mode, or a
   code unit in the range 0xdc00-0xdfff (UTF-16 low surrogate) in 16 bit
   mode. */
static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
return (*s & 0xc0) == 0x80;
#elif PCRE2_CODE_UNIT_WIDTH == 16
return (*s & 0xfc00) == 0xdc00;
#else
#error "Unknown code width"
#endif
}
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
766*22dc650dSSadaf Ebrahimi 
/* Code unit width dependent aliases for the NEON vector type and intrinsics.
   VECTOR_FACTOR is the number of code units in one 128 bit vector. The
   quad_word union gives access to the same 128 bits either as code units
   (mem) or as two 64 bit halves (dw). */
#if PCRE2_CODE_UNIT_WIDTH == 8
# define VECTOR_FACTOR 16
# define vect_t uint8x16_t
# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))
# define VCEQQ vceqq_u8
# define VORRQ vorrq_u8
# define VST1Q vst1q_u8
# define VDUPQ vdupq_n_u8
# define VEXTQ vextq_u8
# define VANDQ vandq_u8
typedef union {
       uint8_t mem[16];
       uint64_t dw[2];
} quad_word;
#elif PCRE2_CODE_UNIT_WIDTH == 16
# define VECTOR_FACTOR 8
# define vect_t uint16x8_t
# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))
# define VCEQQ vceqq_u16
# define VORRQ vorrq_u16
# define VST1Q vst1q_u16
# define VDUPQ vdupq_n_u16
# define VEXTQ vextq_u16
# define VANDQ vandq_u16
typedef union {
       uint16_t mem[8];
       uint64_t dw[2];
} quad_word;
#else
# define VECTOR_FACTOR 4
# define vect_t uint32x4_t
# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))
# define VCEQQ vceqq_u32
# define VORRQ vorrq_u32
# define VST1Q vst1q_u32
# define VDUPQ vdupq_n_u32
# define VEXTQ vextq_u32
# define VANDQ vandq_u32
typedef union {
       uint32_t mem[4];
       uint64_t dw[2];
} quad_word;
#endif
810*22dc650dSSadaf Ebrahimi 
811*22dc650dSSadaf Ebrahimi #define FFCS
812*22dc650dSSadaf Ebrahimi #include "pcre2_jit_neon_inc.h"
813*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
814*22dc650dSSadaf Ebrahimi # define FF_UTF
815*22dc650dSSadaf Ebrahimi # include "pcre2_jit_neon_inc.h"
816*22dc650dSSadaf Ebrahimi # undef FF_UTF
817*22dc650dSSadaf Ebrahimi #endif
818*22dc650dSSadaf Ebrahimi #undef FFCS
819*22dc650dSSadaf Ebrahimi 
820*22dc650dSSadaf Ebrahimi #define FFCS_2
821*22dc650dSSadaf Ebrahimi #include "pcre2_jit_neon_inc.h"
822*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
823*22dc650dSSadaf Ebrahimi # define FF_UTF
824*22dc650dSSadaf Ebrahimi # include "pcre2_jit_neon_inc.h"
825*22dc650dSSadaf Ebrahimi # undef FF_UTF
826*22dc650dSSadaf Ebrahimi #endif
827*22dc650dSSadaf Ebrahimi #undef FFCS_2
828*22dc650dSSadaf Ebrahimi 
829*22dc650dSSadaf Ebrahimi #define FFCS_MASK
830*22dc650dSSadaf Ebrahimi #include "pcre2_jit_neon_inc.h"
831*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
832*22dc650dSSadaf Ebrahimi # define FF_UTF
833*22dc650dSSadaf Ebrahimi # include "pcre2_jit_neon_inc.h"
834*22dc650dSSadaf Ebrahimi # undef FF_UTF
835*22dc650dSSadaf Ebrahimi #endif
836*22dc650dSSadaf Ebrahimi #undef FFCS_MASK
837*22dc650dSSadaf Ebrahimi 
838*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
839*22dc650dSSadaf Ebrahimi 
/* Emits JIT code which calls one of the NEON helper functions to fast
   forward STR_PTR to the next occurrence of char1 or char2.

   Arguments:
     common  JIT compiler state
     char1   first accepted code unit
     char2   second accepted code unit (equal to char1 if there is only one)
     offset  passed to the helper in SLJIT_R2; the *_utf helper variants are
             used only when common->utf is set and offset > 0

   The helper receives STR_END in SLJIT_R0, the address of the saved STR_PTR
   (LOCALS0) in SLJIT_R1 and the packed character values in SLJIT_R4. A zero
   return value is treated as "not found": in PCRE2_JIT_COMPLETE mode the
   generated code jumps to common->failed_match.

   NOTE(review): the characters are stored in unsigned char members of
   int_char, so values above 0xff are truncated - confirm that this path is
   only relied upon for 8 bit code units. */
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
int_char ic;
PCRE2_UCHAR mask;
sljit_sw helper_addr;
struct sljit_jump *partial_quit, *quit;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
int use_utf_helper = common->utf && offset > 0;
#endif

/* Only c1 and c2 are assigned below. Clear the whole value first, so the
   immediate emitted into the generated code contains no indeterminate
   bytes. */
ic.x = 0;

/* Save temporary registers. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);

/* Prepare function arguments */
OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);

/* Select the helper and the character encoding it expects:
     ffcs       one character
     ffcs_mask  two characters which differ in a single bit: c1 is the
                character with that bit set, c2 is the bit itself
     ffcs_2     two unrelated characters. */
if (char1 == char2)
  {
  ic.c.c1 = char1;
  ic.c.c2 = char2;
  helper_addr = SLJIT_FUNC_ADDR(ffcs);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (use_utf_helper)
    helper_addr = SLJIT_FUNC_ADDR(ffcs_utf);
#endif
  }
else
  {
  mask = char1 ^ char2;
  if (is_powerof2(mask))
    {
    ic.c.c1 = char1 | mask;
    ic.c.c2 = mask;
    helper_addr = SLJIT_FUNC_ADDR(ffcs_mask);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (use_utf_helper)
      helper_addr = SLJIT_FUNC_ADDR(ffcs_mask_utf);
#endif
    }
  else
    {
    ic.c.c1 = char1;
    ic.c.c2 = char2;
    helper_addr = SLJIT_FUNC_ADDR(ffcs_2);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (use_utf_helper)
      helper_addr = SLJIT_FUNC_ADDR(ffcs_2_utf);
#endif
    }
  }

OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                 SLJIT_IMM, helper_addr);

/* Restore registers. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);

/* Check return value. */
partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit);

/* Fast forward STR_PTR to the result of memchr. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
if (common->mode != PCRE2_JIT_COMPLETE)
  {
  /* In the other modes a zero return value does not fail the match. That
     path continues with the STR_PTR value reloaded from LOCALS0, which is
     presumably limited to STR_END by the SELECT below. */
  quit = CMP(SLJIT_NOT_ZERO, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
  JUMPHERE(partial_quit);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  JUMPHERE(quit);
  }
}
932*22dc650dSSadaf Ebrahimi 
/* Comparison methods understood by fast_forward_char_pair_compare():
     compare_match1   the data is compared to a single value
     compare_match1i  a bit mask is OR-ed into the data before it is
                      compared to a single value
     compare_match2   the data is compared to two values and the results
                      are OR-ed together. */
typedef enum {
  compare_match1,
  compare_match1i,
  compare_match2
} compare_type;
938*22dc650dSSadaf Ebrahimi 
fast_forward_char_pair_compare(compare_type ctype,vect_t dst,vect_t cmp1,vect_t cmp2)939*22dc650dSSadaf Ebrahimi static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
940*22dc650dSSadaf Ebrahimi {
941*22dc650dSSadaf Ebrahimi if (ctype == compare_match2)
942*22dc650dSSadaf Ebrahimi   {
943*22dc650dSSadaf Ebrahimi   vect_t tmp = dst;
944*22dc650dSSadaf Ebrahimi   dst = VCEQQ(dst, cmp1);
945*22dc650dSSadaf Ebrahimi   tmp = VCEQQ(tmp, cmp2);
946*22dc650dSSadaf Ebrahimi   dst = VORRQ(dst, tmp);
947*22dc650dSSadaf Ebrahimi   return dst;
948*22dc650dSSadaf Ebrahimi   }
949*22dc650dSSadaf Ebrahimi 
950*22dc650dSSadaf Ebrahimi if (ctype == compare_match1i)
951*22dc650dSSadaf Ebrahimi   dst = VORRQ(dst, cmp2);
952*22dc650dSSadaf Ebrahimi dst = VCEQQ(dst, cmp1);
953*22dc650dSSadaf Ebrahimi return dst;
954*22dc650dSSadaf Ebrahimi }
955*22dc650dSSadaf Ebrahimi 
max_fast_forward_char_pair_offset(void)956*22dc650dSSadaf Ebrahimi static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
957*22dc650dSSadaf Ebrahimi {
958*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
959*22dc650dSSadaf Ebrahimi return 15;
960*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
961*22dc650dSSadaf Ebrahimi return 7;
962*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
963*22dc650dSSadaf Ebrahimi return 3;
964*22dc650dSSadaf Ebrahimi #else
965*22dc650dSSadaf Ebrahimi #error "Unsupported unit width"
966*22dc650dSSadaf Ebrahimi #endif
967*22dc650dSSadaf Ebrahimi }
968*22dc650dSSadaf Ebrahimi 
969*22dc650dSSadaf Ebrahimi /* ARM doesn't have a shift left across lanes. */
shift_left_n_lanes(vect_t a,sljit_u8 n)970*22dc650dSSadaf Ebrahimi static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
971*22dc650dSSadaf Ebrahimi {
972*22dc650dSSadaf Ebrahimi vect_t zero = VDUPQ(0);
973*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
974*22dc650dSSadaf Ebrahimi /* VEXTQ takes an immediate as last argument. */
975*22dc650dSSadaf Ebrahimi #define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);
976*22dc650dSSadaf Ebrahimi switch (n)
977*22dc650dSSadaf Ebrahimi   {
978*22dc650dSSadaf Ebrahimi   C(1); C(2); C(3);
979*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 32
980*22dc650dSSadaf Ebrahimi   C(4); C(5); C(6); C(7);
981*22dc650dSSadaf Ebrahimi # if PCRE2_CODE_UNIT_WIDTH != 16
982*22dc650dSSadaf Ebrahimi   C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
983*22dc650dSSadaf Ebrahimi # endif
984*22dc650dSSadaf Ebrahimi #endif
985*22dc650dSSadaf Ebrahimi   default:
986*22dc650dSSadaf Ebrahimi     /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
987*22dc650dSSadaf Ebrahimi        happen. The return is still here for compilers to not warn. */
988*22dc650dSSadaf Ebrahimi     return a;
989*22dc650dSSadaf Ebrahimi   }
990*22dc650dSSadaf Ebrahimi }
991*22dc650dSSadaf Ebrahimi 
992*22dc650dSSadaf Ebrahimi #define FFCPS
993*22dc650dSSadaf Ebrahimi #define FFCPS_DIFF1
994*22dc650dSSadaf Ebrahimi #define FFCPS_CHAR1A2A
995*22dc650dSSadaf Ebrahimi 
996*22dc650dSSadaf Ebrahimi #define FFCPS_0
997*22dc650dSSadaf Ebrahimi #include "pcre2_jit_neon_inc.h"
998*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
999*22dc650dSSadaf Ebrahimi # define FF_UTF
1000*22dc650dSSadaf Ebrahimi # include "pcre2_jit_neon_inc.h"
1001*22dc650dSSadaf Ebrahimi # undef FF_UTF
1002*22dc650dSSadaf Ebrahimi #endif
1003*22dc650dSSadaf Ebrahimi #undef FFCPS_0
1004*22dc650dSSadaf Ebrahimi 
1005*22dc650dSSadaf Ebrahimi #undef FFCPS_CHAR1A2A
1006*22dc650dSSadaf Ebrahimi 
1007*22dc650dSSadaf Ebrahimi #define FFCPS_1
1008*22dc650dSSadaf Ebrahimi #include "pcre2_jit_neon_inc.h"
1009*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1010*22dc650dSSadaf Ebrahimi # define FF_UTF
1011*22dc650dSSadaf Ebrahimi # include "pcre2_jit_neon_inc.h"
1012*22dc650dSSadaf Ebrahimi # undef FF_UTF
1013*22dc650dSSadaf Ebrahimi #endif
1014*22dc650dSSadaf Ebrahimi #undef FFCPS_1
1015*22dc650dSSadaf Ebrahimi 
1016*22dc650dSSadaf Ebrahimi #undef FFCPS_DIFF1
1017*22dc650dSSadaf Ebrahimi 
1018*22dc650dSSadaf Ebrahimi #define FFCPS_DEFAULT
1019*22dc650dSSadaf Ebrahimi #include "pcre2_jit_neon_inc.h"
1020*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1021*22dc650dSSadaf Ebrahimi # define FF_UTF
1022*22dc650dSSadaf Ebrahimi # include "pcre2_jit_neon_inc.h"
1023*22dc650dSSadaf Ebrahimi # undef FF_UTF
1024*22dc650dSSadaf Ebrahimi #endif
1025*22dc650dSSadaf Ebrahimi #undef FFCPS
1026*22dc650dSSadaf Ebrahimi 
1027*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1028*22dc650dSSadaf Ebrahimi 
fast_forward_char_pair_simd(compiler_common * common,sljit_s32 offs1,PCRE2_UCHAR char1a,PCRE2_UCHAR char1b,sljit_s32 offs2,PCRE2_UCHAR char2a,PCRE2_UCHAR char2b)1029*22dc650dSSadaf Ebrahimi static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1030*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1031*22dc650dSSadaf Ebrahimi {
1032*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
1033*22dc650dSSadaf Ebrahimi sljit_u32 diff = IN_UCHARS(offs1 - offs2);
1034*22dc650dSSadaf Ebrahimi struct sljit_jump *partial_quit;
1035*22dc650dSSadaf Ebrahimi int_char ic;
1036*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1037*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
1038*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(compiler->scratches == 5);
1039*22dc650dSSadaf Ebrahimi 
1040*22dc650dSSadaf Ebrahimi /* Save temporary register STR_PTR. */
1041*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
1042*22dc650dSSadaf Ebrahimi 
1043*22dc650dSSadaf Ebrahimi /* Prepare arguments for the function call. */
1044*22dc650dSSadaf Ebrahimi if (common->match_end_ptr == 0)
1045*22dc650dSSadaf Ebrahimi    OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
1046*22dc650dSSadaf Ebrahimi else
1047*22dc650dSSadaf Ebrahimi   {
1048*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1049*22dc650dSSadaf Ebrahimi   OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1050*22dc650dSSadaf Ebrahimi 
1051*22dc650dSSadaf Ebrahimi   OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
1052*22dc650dSSadaf Ebrahimi   SELECT(SLJIT_LESS, SLJIT_R0, STR_END, 0, SLJIT_R0);
1053*22dc650dSSadaf Ebrahimi   }
1054*22dc650dSSadaf Ebrahimi 
1055*22dc650dSSadaf Ebrahimi GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
1056*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
1057*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
1058*22dc650dSSadaf Ebrahimi ic.c.c1 = char1a;
1059*22dc650dSSadaf Ebrahimi ic.c.c2 = char1b;
1060*22dc650dSSadaf Ebrahimi ic.c.c3 = char2a;
1061*22dc650dSSadaf Ebrahimi ic.c.c4 = char2b;
1062*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);
1063*22dc650dSSadaf Ebrahimi 
1064*22dc650dSSadaf Ebrahimi if (diff == 1) {
1065*22dc650dSSadaf Ebrahimi   if (char1a == char1b && char2a == char2b) {
1066*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1067*22dc650dSSadaf Ebrahimi     if (common->utf)
1068*22dc650dSSadaf Ebrahimi       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1069*22dc650dSSadaf Ebrahimi                        SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
1070*22dc650dSSadaf Ebrahimi     else
1071*22dc650dSSadaf Ebrahimi #endif
1072*22dc650dSSadaf Ebrahimi       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1073*22dc650dSSadaf Ebrahimi                        SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
1074*22dc650dSSadaf Ebrahimi   } else {
1075*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1076*22dc650dSSadaf Ebrahimi     if (common->utf)
1077*22dc650dSSadaf Ebrahimi       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1078*22dc650dSSadaf Ebrahimi                        SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
1079*22dc650dSSadaf Ebrahimi     else
1080*22dc650dSSadaf Ebrahimi #endif
1081*22dc650dSSadaf Ebrahimi       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1082*22dc650dSSadaf Ebrahimi                        SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
1083*22dc650dSSadaf Ebrahimi   }
1084*22dc650dSSadaf Ebrahimi } else {
1085*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1086*22dc650dSSadaf Ebrahimi   if (common->utf)
1087*22dc650dSSadaf Ebrahimi     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1088*22dc650dSSadaf Ebrahimi                      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
1089*22dc650dSSadaf Ebrahimi   else
1090*22dc650dSSadaf Ebrahimi #endif
1091*22dc650dSSadaf Ebrahimi     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1092*22dc650dSSadaf Ebrahimi                      SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
1093*22dc650dSSadaf Ebrahimi }
1094*22dc650dSSadaf Ebrahimi 
1095*22dc650dSSadaf Ebrahimi /* Restore STR_PTR register. */
1096*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
1097*22dc650dSSadaf Ebrahimi 
1098*22dc650dSSadaf Ebrahimi /* Check return value. */
1099*22dc650dSSadaf Ebrahimi partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
1100*22dc650dSSadaf Ebrahimi add_jump(compiler, &common->failed_match, partial_quit);
1101*22dc650dSSadaf Ebrahimi 
1102*22dc650dSSadaf Ebrahimi /* Fast forward STR_PTR to the result of memchr. */
1103*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
1104*22dc650dSSadaf Ebrahimi 
1105*22dc650dSSadaf Ebrahimi JUMPHERE(partial_quit);
1106*22dc650dSSadaf Ebrahimi }
1107*22dc650dSSadaf Ebrahimi 
1108*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */
1109*22dc650dSSadaf Ebrahimi 
1110*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
1111*22dc650dSSadaf Ebrahimi 
1112*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
1113*22dc650dSSadaf Ebrahimi #define VECTOR_ELEMENT_SIZE 0
1114*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
1115*22dc650dSSadaf Ebrahimi #define VECTOR_ELEMENT_SIZE 1
1116*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
1117*22dc650dSSadaf Ebrahimi #define VECTOR_ELEMENT_SIZE 2
1118*22dc650dSSadaf Ebrahimi #else
1119*22dc650dSSadaf Ebrahimi #error "Unsupported unit width"
1120*22dc650dSSadaf Ebrahimi #endif
1121*22dc650dSSadaf Ebrahimi 
/* Emits one 6 byte custom instruction that loads a vector register from memory.

  compiler   sljit compiler that receives the instruction
  vlbb       selects the low opcode byte: 0x07 when TRUE, 0x06 when FALSE
             (presumably VLBB versus VL - confirm against the ISA reference)
  dst_vreg   destination vector register field
  base_reg   base register field (machine register index)
  index_reg  index register field (machine register index, 0 in all visible
             callers) */
static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
  sljit_s32 base_reg, sljit_s32 index_reg)
{
sljit_u16 halfwords[3];
int opcode_low;

if (vlbb)
  opcode_low = 0x07;
else
  opcode_low = 0x06;

halfwords[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
halfwords[1] = (sljit_u16)(base_reg << 12);
halfwords[2] = (sljit_u16)((0x8 << 8) | opcode_low);

sljit_emit_op_custom(compiler, halfwords, 6);
}
1133*22dc650dSSadaf Ebrahimi 
#if PCRE2_CODE_UNIT_WIDTH == 32

/* Emits the code that fills every element of vector register dst_vreg with chr.
The work is split into two steps so that a caller can interleave the steps of
several registers:

  step 0  chr < 0x7fff: VREPI with chr as the immediate (the job is finished)
          otherwise:    chr is moved into tmp_general_reg and inserted into
                        the vector with VLVG
  step 1  chr < 0x7fff: nothing is emitted
          otherwise:    VREP replicates the inserted element

tmp_general_reg is only overwritten when chr >= 0x7fff. */
static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
  PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
{
sljit_u16 halfwords[3];

SLJIT_ASSERT(step >= 0 && step <= 1);

if (chr < 0x7fff)
  {
  if (step != 1)
    {
    /* VREPI */
    halfwords[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
    halfwords[1] = (sljit_u16)chr;
    halfwords[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
    sljit_emit_op_custom(compiler, halfwords, 6);
    }
  return;
  }

if (step != 0)
  {
  /* VREP */
  halfwords[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
  halfwords[1] = 0;
  halfwords[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
  sljit_emit_op_custom(compiler, halfwords, 6);
  return;
  }

/* Large value, first step: go through a general purpose register. */
OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);

/* VLVG */
halfwords[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
halfwords[1] = 0;
halfwords[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
sljit_emit_op_custom(compiler, halfwords, 6);
}

#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
1176*22dc650dSSadaf Ebrahimi 
/* Emits one step of the vector comparison sequence. All *_ind arguments are
vector register numbers placed into the instruction encodings.

  step 0  vector_compare_match2:  VCEQ  tmp_ind = (dst_ind == cmp2_ind)
          vector_compare_match1i: VO    dst_ind |= cmp2_ind
          any other type:         nothing is emitted
  step 1  any type:               VCEQ  dst_ind = (dst_ind == cmp1_ind)
  step 2  vector_compare_match2:  VO    dst_ind |= tmp_ind
          any other type:         nothing is emitted

Despite the "sse2" in its name this variant emits s390x vector instructions. */
static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u16 halfwords[3];

SLJIT_ASSERT(step >= 0 && step <= 2);

switch (step)
  {
  case 0:
  if (compare_type == vector_compare_match2)
    {
    /* VCEQ */
    halfwords[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
    halfwords[1] = (sljit_u16)(cmp2_ind << 12);
    halfwords[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
    sljit_emit_op_custom(compiler, halfwords, 6);
    }
  else if (compare_type == vector_compare_match1i)
    {
    /* VO */
    halfwords[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
    halfwords[1] = (sljit_u16)(cmp2_ind << 12);
    halfwords[2] = (sljit_u16)((0xe << 8) | 0x6a);
    sljit_emit_op_custom(compiler, halfwords, 6);
    }
  break;

  case 1:
  /* VCEQ */
  halfwords[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  halfwords[1] = (sljit_u16)(cmp1_ind << 12);
  halfwords[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
  sljit_emit_op_custom(compiler, halfwords, 6);
  break;

  case 2:
  if (compare_type == vector_compare_match2)
    {
    /* VO */
    halfwords[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
    halfwords[1] = (sljit_u16)(tmp_ind << 12);
    halfwords[2] = (sljit_u16)((0xe << 8) | 0x6a);
    sljit_emit_op_custom(compiler, halfwords, 6);
    }
  break;

  default:
  break;
  }
}
1226*22dc650dSSadaf Ebrahimi 
1227*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
1228*22dc650dSSadaf Ebrahimi 
fast_forward_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2,sljit_s32 offset)1229*22dc650dSSadaf Ebrahimi static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1230*22dc650dSSadaf Ebrahimi {
1231*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
1232*22dc650dSSadaf Ebrahimi sljit_u16 instruction[3];
1233*22dc650dSSadaf Ebrahimi struct sljit_label *start;
1234*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1235*22dc650dSSadaf Ebrahimi struct sljit_label *restart;
1236*22dc650dSSadaf Ebrahimi #endif
1237*22dc650dSSadaf Ebrahimi struct sljit_jump *quit;
1238*22dc650dSSadaf Ebrahimi struct sljit_jump *partial_quit[2];
1239*22dc650dSSadaf Ebrahimi vector_compare_type compare_type = vector_compare_match1;
1240*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1241*22dc650dSSadaf Ebrahimi sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1242*22dc650dSSadaf Ebrahimi sljit_s32 data_ind = 0;
1243*22dc650dSSadaf Ebrahimi sljit_s32 tmp_ind = 1;
1244*22dc650dSSadaf Ebrahimi sljit_s32 cmp1_ind = 2;
1245*22dc650dSSadaf Ebrahimi sljit_s32 cmp2_ind = 3;
1246*22dc650dSSadaf Ebrahimi sljit_s32 zero_ind = 4;
1247*22dc650dSSadaf Ebrahimi sljit_u32 bit = 0;
1248*22dc650dSSadaf Ebrahimi int i;
1249*22dc650dSSadaf Ebrahimi 
1250*22dc650dSSadaf Ebrahimi SLJIT_UNUSED_ARG(offset);
1251*22dc650dSSadaf Ebrahimi 
1252*22dc650dSSadaf Ebrahimi if (char1 != char2)
1253*22dc650dSSadaf Ebrahimi   {
1254*22dc650dSSadaf Ebrahimi   bit = char1 ^ char2;
1255*22dc650dSSadaf Ebrahimi   compare_type = vector_compare_match1i;
1256*22dc650dSSadaf Ebrahimi 
1257*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit))
1258*22dc650dSSadaf Ebrahimi     {
1259*22dc650dSSadaf Ebrahimi     bit = 0;
1260*22dc650dSSadaf Ebrahimi     compare_type = vector_compare_match2;
1261*22dc650dSSadaf Ebrahimi     }
1262*22dc650dSSadaf Ebrahimi   }
1263*22dc650dSSadaf Ebrahimi 
1264*22dc650dSSadaf Ebrahimi partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1265*22dc650dSSadaf Ebrahimi if (common->mode == PCRE2_JIT_COMPLETE)
1266*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, partial_quit[0]);
1267*22dc650dSSadaf Ebrahimi 
1268*22dc650dSSadaf Ebrahimi /* First part (unaligned start) */
1269*22dc650dSSadaf Ebrahimi 
1270*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1271*22dc650dSSadaf Ebrahimi 
1272*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 32
1273*22dc650dSSadaf Ebrahimi 
1274*22dc650dSSadaf Ebrahimi /* VREPI */
1275*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1276*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)(char1 | bit);
1277*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1278*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1279*22dc650dSSadaf Ebrahimi 
1280*22dc650dSSadaf Ebrahimi if (char1 != char2)
1281*22dc650dSSadaf Ebrahimi   {
1282*22dc650dSSadaf Ebrahimi   /* VREPI */
1283*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1284*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1285*22dc650dSSadaf Ebrahimi   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1286*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1287*22dc650dSSadaf Ebrahimi   }
1288*22dc650dSSadaf Ebrahimi 
1289*22dc650dSSadaf Ebrahimi #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1290*22dc650dSSadaf Ebrahimi 
1291*22dc650dSSadaf Ebrahimi for (int i = 0; i < 2; i++)
1292*22dc650dSSadaf Ebrahimi   {
1293*22dc650dSSadaf Ebrahimi   replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);
1294*22dc650dSSadaf Ebrahimi 
1295*22dc650dSSadaf Ebrahimi   if (char1 != char2)
1296*22dc650dSSadaf Ebrahimi     replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
1297*22dc650dSSadaf Ebrahimi   }
1298*22dc650dSSadaf Ebrahimi 
1299*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1300*22dc650dSSadaf Ebrahimi 
1301*22dc650dSSadaf Ebrahimi if (compare_type == vector_compare_match2)
1302*22dc650dSSadaf Ebrahimi   {
1303*22dc650dSSadaf Ebrahimi   /* VREPI */
1304*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1305*22dc650dSSadaf Ebrahimi   instruction[1] = 0;
1306*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1307*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1308*22dc650dSSadaf Ebrahimi   }
1309*22dc650dSSadaf Ebrahimi 
1310*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1311*22dc650dSSadaf Ebrahimi restart = LABEL();
1312*22dc650dSSadaf Ebrahimi #endif
1313*22dc650dSSadaf Ebrahimi 
1314*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1315*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1316*22dc650dSSadaf Ebrahimi 
1317*22dc650dSSadaf Ebrahimi if (compare_type != vector_compare_match2)
1318*22dc650dSSadaf Ebrahimi   {
1319*22dc650dSSadaf Ebrahimi   if (compare_type == vector_compare_match1i)
1320*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1321*22dc650dSSadaf Ebrahimi 
1322*22dc650dSSadaf Ebrahimi   /* VFEE */
1323*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1324*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1325*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1326*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1327*22dc650dSSadaf Ebrahimi   }
1328*22dc650dSSadaf Ebrahimi else
1329*22dc650dSSadaf Ebrahimi   {
1330*22dc650dSSadaf Ebrahimi   for (i = 0; i < 3; i++)
1331*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1332*22dc650dSSadaf Ebrahimi 
1333*22dc650dSSadaf Ebrahimi   /* VFENE */
1334*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1335*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1336*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1337*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1338*22dc650dSSadaf Ebrahimi   }
1339*22dc650dSSadaf Ebrahimi 
1340*22dc650dSSadaf Ebrahimi /* VLGVB */
1341*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1342*22dc650dSSadaf Ebrahimi instruction[1] = 7;
1343*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1344*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1345*22dc650dSSadaf Ebrahimi 
1346*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1347*22dc650dSSadaf Ebrahimi quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1348*22dc650dSSadaf Ebrahimi 
1349*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1350*22dc650dSSadaf Ebrahimi 
1351*22dc650dSSadaf Ebrahimi /* Second part (aligned) */
1352*22dc650dSSadaf Ebrahimi start = LABEL();
1353*22dc650dSSadaf Ebrahimi 
1354*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1355*22dc650dSSadaf Ebrahimi 
1356*22dc650dSSadaf Ebrahimi partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1357*22dc650dSSadaf Ebrahimi if (common->mode == PCRE2_JIT_COMPLETE)
1358*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, partial_quit[1]);
1359*22dc650dSSadaf Ebrahimi 
1360*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1361*22dc650dSSadaf Ebrahimi 
1362*22dc650dSSadaf Ebrahimi if (compare_type != vector_compare_match2)
1363*22dc650dSSadaf Ebrahimi   {
1364*22dc650dSSadaf Ebrahimi   if (compare_type == vector_compare_match1i)
1365*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1366*22dc650dSSadaf Ebrahimi 
1367*22dc650dSSadaf Ebrahimi   /* VFEE */
1368*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1369*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1370*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1371*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1372*22dc650dSSadaf Ebrahimi   }
1373*22dc650dSSadaf Ebrahimi else
1374*22dc650dSSadaf Ebrahimi   {
1375*22dc650dSSadaf Ebrahimi   for (i = 0; i < 3; i++)
1376*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1377*22dc650dSSadaf Ebrahimi 
1378*22dc650dSSadaf Ebrahimi   /* VFENE */
1379*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1380*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1381*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1382*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1383*22dc650dSSadaf Ebrahimi   }
1384*22dc650dSSadaf Ebrahimi 
1385*22dc650dSSadaf Ebrahimi sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1386*22dc650dSSadaf Ebrahimi JUMPTO(SLJIT_OVERFLOW, start);
1387*22dc650dSSadaf Ebrahimi 
1388*22dc650dSSadaf Ebrahimi /* VLGVB */
1389*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1390*22dc650dSSadaf Ebrahimi instruction[1] = 7;
1391*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1392*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1393*22dc650dSSadaf Ebrahimi 
1394*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1395*22dc650dSSadaf Ebrahimi 
1396*22dc650dSSadaf Ebrahimi JUMPHERE(quit);
1397*22dc650dSSadaf Ebrahimi 
1398*22dc650dSSadaf Ebrahimi if (common->mode != PCRE2_JIT_COMPLETE)
1399*22dc650dSSadaf Ebrahimi   {
1400*22dc650dSSadaf Ebrahimi   JUMPHERE(partial_quit[0]);
1401*22dc650dSSadaf Ebrahimi   JUMPHERE(partial_quit[1]);
1402*22dc650dSSadaf Ebrahimi   OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
1403*22dc650dSSadaf Ebrahimi   SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
1404*22dc650dSSadaf Ebrahimi   }
1405*22dc650dSSadaf Ebrahimi else
1406*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1407*22dc650dSSadaf Ebrahimi 
1408*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1409*22dc650dSSadaf Ebrahimi if (common->utf && offset > 0)
1410*22dc650dSSadaf Ebrahimi   {
1411*22dc650dSSadaf Ebrahimi   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1412*22dc650dSSadaf Ebrahimi 
1413*22dc650dSSadaf Ebrahimi   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
1414*22dc650dSSadaf Ebrahimi 
1415*22dc650dSSadaf Ebrahimi   quit = jump_if_utf_char_start(compiler, TMP1);
1416*22dc650dSSadaf Ebrahimi 
1417*22dc650dSSadaf Ebrahimi   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1418*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1419*22dc650dSSadaf Ebrahimi 
1420*22dc650dSSadaf Ebrahimi   OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1421*22dc650dSSadaf Ebrahimi   JUMPTO(SLJIT_JUMP, restart);
1422*22dc650dSSadaf Ebrahimi 
1423*22dc650dSSadaf Ebrahimi   JUMPHERE(quit);
1424*22dc650dSSadaf Ebrahimi   }
1425*22dc650dSSadaf Ebrahimi #endif
1426*22dc650dSSadaf Ebrahimi }
1427*22dc650dSSadaf Ebrahimi 
1428*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
1429*22dc650dSSadaf Ebrahimi 
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)1430*22dc650dSSadaf Ebrahimi static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
1431*22dc650dSSadaf Ebrahimi {
1432*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
1433*22dc650dSSadaf Ebrahimi sljit_u16 instruction[3];
1434*22dc650dSSadaf Ebrahimi struct sljit_label *start;
1435*22dc650dSSadaf Ebrahimi struct sljit_jump *quit;
1436*22dc650dSSadaf Ebrahimi jump_list *not_found = NULL;
1437*22dc650dSSadaf Ebrahimi vector_compare_type compare_type = vector_compare_match1;
1438*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1439*22dc650dSSadaf Ebrahimi sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);
1440*22dc650dSSadaf Ebrahimi sljit_s32 data_ind = 0;
1441*22dc650dSSadaf Ebrahimi sljit_s32 tmp_ind = 1;
1442*22dc650dSSadaf Ebrahimi sljit_s32 cmp1_ind = 2;
1443*22dc650dSSadaf Ebrahimi sljit_s32 cmp2_ind = 3;
1444*22dc650dSSadaf Ebrahimi sljit_s32 zero_ind = 4;
1445*22dc650dSSadaf Ebrahimi sljit_u32 bit = 0;
1446*22dc650dSSadaf Ebrahimi int i;
1447*22dc650dSSadaf Ebrahimi 
1448*22dc650dSSadaf Ebrahimi if (char1 != char2)
1449*22dc650dSSadaf Ebrahimi   {
1450*22dc650dSSadaf Ebrahimi   bit = char1 ^ char2;
1451*22dc650dSSadaf Ebrahimi   compare_type = vector_compare_match1i;
1452*22dc650dSSadaf Ebrahimi 
1453*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit))
1454*22dc650dSSadaf Ebrahimi     {
1455*22dc650dSSadaf Ebrahimi     bit = 0;
1456*22dc650dSSadaf Ebrahimi     compare_type = vector_compare_match2;
1457*22dc650dSSadaf Ebrahimi     }
1458*22dc650dSSadaf Ebrahimi   }
1459*22dc650dSSadaf Ebrahimi 
1460*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1461*22dc650dSSadaf Ebrahimi 
1462*22dc650dSSadaf Ebrahimi /* First part (unaligned start) */
1463*22dc650dSSadaf Ebrahimi 
1464*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1465*22dc650dSSadaf Ebrahimi 
1466*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 32
1467*22dc650dSSadaf Ebrahimi 
1468*22dc650dSSadaf Ebrahimi /* VREPI */
1469*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1470*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)(char1 | bit);
1471*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1472*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1473*22dc650dSSadaf Ebrahimi 
1474*22dc650dSSadaf Ebrahimi if (char1 != char2)
1475*22dc650dSSadaf Ebrahimi   {
1476*22dc650dSSadaf Ebrahimi   /* VREPI */
1477*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1478*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1479*22dc650dSSadaf Ebrahimi   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1480*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1481*22dc650dSSadaf Ebrahimi   }
1482*22dc650dSSadaf Ebrahimi 
1483*22dc650dSSadaf Ebrahimi #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1484*22dc650dSSadaf Ebrahimi 
1485*22dc650dSSadaf Ebrahimi for (int i = 0; i < 2; i++)
1486*22dc650dSSadaf Ebrahimi   {
1487*22dc650dSSadaf Ebrahimi   replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1488*22dc650dSSadaf Ebrahimi 
1489*22dc650dSSadaf Ebrahimi   if (char1 != char2)
1490*22dc650dSSadaf Ebrahimi     replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1491*22dc650dSSadaf Ebrahimi   }
1492*22dc650dSSadaf Ebrahimi 
1493*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1494*22dc650dSSadaf Ebrahimi 
1495*22dc650dSSadaf Ebrahimi if (compare_type == vector_compare_match2)
1496*22dc650dSSadaf Ebrahimi   {
1497*22dc650dSSadaf Ebrahimi   /* VREPI */
1498*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1499*22dc650dSSadaf Ebrahimi   instruction[1] = 0;
1500*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1501*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1502*22dc650dSSadaf Ebrahimi   }
1503*22dc650dSSadaf Ebrahimi 
1504*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1505*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1506*22dc650dSSadaf Ebrahimi 
1507*22dc650dSSadaf Ebrahimi if (compare_type != vector_compare_match2)
1508*22dc650dSSadaf Ebrahimi   {
1509*22dc650dSSadaf Ebrahimi   if (compare_type == vector_compare_match1i)
1510*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1511*22dc650dSSadaf Ebrahimi 
1512*22dc650dSSadaf Ebrahimi   /* VFEE */
1513*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1514*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1515*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1516*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1517*22dc650dSSadaf Ebrahimi   }
1518*22dc650dSSadaf Ebrahimi else
1519*22dc650dSSadaf Ebrahimi   {
1520*22dc650dSSadaf Ebrahimi   for (i = 0; i < 3; i++)
1521*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1522*22dc650dSSadaf Ebrahimi 
1523*22dc650dSSadaf Ebrahimi   /* VFENE */
1524*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1525*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1526*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1527*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1528*22dc650dSSadaf Ebrahimi   }
1529*22dc650dSSadaf Ebrahimi 
1530*22dc650dSSadaf Ebrahimi /* VLGVB */
1531*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1532*22dc650dSSadaf Ebrahimi instruction[1] = 7;
1533*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1534*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1535*22dc650dSSadaf Ebrahimi 
1536*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1537*22dc650dSSadaf Ebrahimi quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1538*22dc650dSSadaf Ebrahimi 
1539*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1540*22dc650dSSadaf Ebrahimi 
1541*22dc650dSSadaf Ebrahimi /* Second part (aligned) */
1542*22dc650dSSadaf Ebrahimi start = LABEL();
1543*22dc650dSSadaf Ebrahimi 
1544*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1545*22dc650dSSadaf Ebrahimi 
1546*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1547*22dc650dSSadaf Ebrahimi 
1548*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1549*22dc650dSSadaf Ebrahimi 
1550*22dc650dSSadaf Ebrahimi if (compare_type != vector_compare_match2)
1551*22dc650dSSadaf Ebrahimi   {
1552*22dc650dSSadaf Ebrahimi   if (compare_type == vector_compare_match1i)
1553*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1554*22dc650dSSadaf Ebrahimi 
1555*22dc650dSSadaf Ebrahimi   /* VFEE */
1556*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1557*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1558*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1559*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1560*22dc650dSSadaf Ebrahimi   }
1561*22dc650dSSadaf Ebrahimi else
1562*22dc650dSSadaf Ebrahimi   {
1563*22dc650dSSadaf Ebrahimi   for (i = 0; i < 3; i++)
1564*22dc650dSSadaf Ebrahimi     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1565*22dc650dSSadaf Ebrahimi 
1566*22dc650dSSadaf Ebrahimi   /* VFENE */
1567*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1568*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1569*22dc650dSSadaf Ebrahimi   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1570*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1571*22dc650dSSadaf Ebrahimi   }
1572*22dc650dSSadaf Ebrahimi 
1573*22dc650dSSadaf Ebrahimi sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1574*22dc650dSSadaf Ebrahimi JUMPTO(SLJIT_OVERFLOW, start);
1575*22dc650dSSadaf Ebrahimi 
1576*22dc650dSSadaf Ebrahimi /* VLGVB */
1577*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1578*22dc650dSSadaf Ebrahimi instruction[1] = 7;
1579*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1580*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1581*22dc650dSSadaf Ebrahimi 
1582*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1583*22dc650dSSadaf Ebrahimi 
1584*22dc650dSSadaf Ebrahimi JUMPHERE(quit);
1585*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1586*22dc650dSSadaf Ebrahimi 
1587*22dc650dSSadaf Ebrahimi return not_found;
1588*22dc650dSSadaf Ebrahimi }
1589*22dc650dSSadaf Ebrahimi 
1590*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1591*22dc650dSSadaf Ebrahimi 
fast_forward_char_pair_simd(compiler_common * common,sljit_s32 offs1,PCRE2_UCHAR char1a,PCRE2_UCHAR char1b,sljit_s32 offs2,PCRE2_UCHAR char2a,PCRE2_UCHAR char2b)1592*22dc650dSSadaf Ebrahimi static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1593*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1594*22dc650dSSadaf Ebrahimi {
1595*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
1596*22dc650dSSadaf Ebrahimi sljit_u16 instruction[3];
1597*22dc650dSSadaf Ebrahimi struct sljit_label *start;
1598*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1599*22dc650dSSadaf Ebrahimi struct sljit_label *restart;
1600*22dc650dSSadaf Ebrahimi #endif
1601*22dc650dSSadaf Ebrahimi struct sljit_jump *quit;
1602*22dc650dSSadaf Ebrahimi struct sljit_jump *jump[2];
1603*22dc650dSSadaf Ebrahimi vector_compare_type compare1_type = vector_compare_match1;
1604*22dc650dSSadaf Ebrahimi vector_compare_type compare2_type = vector_compare_match1;
1605*22dc650dSSadaf Ebrahimi sljit_u32 bit1 = 0;
1606*22dc650dSSadaf Ebrahimi sljit_u32 bit2 = 0;
1607*22dc650dSSadaf Ebrahimi sljit_s32 diff = IN_UCHARS(offs2 - offs1);
1608*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1609*22dc650dSSadaf Ebrahimi sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
1610*22dc650dSSadaf Ebrahimi sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1611*22dc650dSSadaf Ebrahimi sljit_s32 data1_ind = 0;
1612*22dc650dSSadaf Ebrahimi sljit_s32 data2_ind = 1;
1613*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_ind = 2;
1614*22dc650dSSadaf Ebrahimi sljit_s32 tmp2_ind = 3;
1615*22dc650dSSadaf Ebrahimi sljit_s32 cmp1a_ind = 4;
1616*22dc650dSSadaf Ebrahimi sljit_s32 cmp1b_ind = 5;
1617*22dc650dSSadaf Ebrahimi sljit_s32 cmp2a_ind = 6;
1618*22dc650dSSadaf Ebrahimi sljit_s32 cmp2b_ind = 7;
1619*22dc650dSSadaf Ebrahimi sljit_s32 zero_ind = 8;
1620*22dc650dSSadaf Ebrahimi int i;
1621*22dc650dSSadaf Ebrahimi 
1622*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1623*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
1624*22dc650dSSadaf Ebrahimi SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);
1625*22dc650dSSadaf Ebrahimi 
1626*22dc650dSSadaf Ebrahimi if (char1a != char1b)
1627*22dc650dSSadaf Ebrahimi   {
1628*22dc650dSSadaf Ebrahimi   bit1 = char1a ^ char1b;
1629*22dc650dSSadaf Ebrahimi   compare1_type = vector_compare_match1i;
1630*22dc650dSSadaf Ebrahimi 
1631*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit1))
1632*22dc650dSSadaf Ebrahimi     {
1633*22dc650dSSadaf Ebrahimi     bit1 = 0;
1634*22dc650dSSadaf Ebrahimi     compare1_type = vector_compare_match2;
1635*22dc650dSSadaf Ebrahimi     }
1636*22dc650dSSadaf Ebrahimi   }
1637*22dc650dSSadaf Ebrahimi 
1638*22dc650dSSadaf Ebrahimi if (char2a != char2b)
1639*22dc650dSSadaf Ebrahimi   {
1640*22dc650dSSadaf Ebrahimi   bit2 = char2a ^ char2b;
1641*22dc650dSSadaf Ebrahimi   compare2_type = vector_compare_match1i;
1642*22dc650dSSadaf Ebrahimi 
1643*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit2))
1644*22dc650dSSadaf Ebrahimi     {
1645*22dc650dSSadaf Ebrahimi     bit2 = 0;
1646*22dc650dSSadaf Ebrahimi     compare2_type = vector_compare_match2;
1647*22dc650dSSadaf Ebrahimi     }
1648*22dc650dSSadaf Ebrahimi   }
1649*22dc650dSSadaf Ebrahimi 
1650*22dc650dSSadaf Ebrahimi /* Initialize. */
1651*22dc650dSSadaf Ebrahimi if (common->match_end_ptr != 0)
1652*22dc650dSSadaf Ebrahimi   {
1653*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1654*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
1655*22dc650dSSadaf Ebrahimi   OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1656*22dc650dSSadaf Ebrahimi 
1657*22dc650dSSadaf Ebrahimi   OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
1658*22dc650dSSadaf Ebrahimi   SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
1659*22dc650dSSadaf Ebrahimi   }
1660*22dc650dSSadaf Ebrahimi 
1661*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1662*22dc650dSSadaf Ebrahimi add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1663*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1664*22dc650dSSadaf Ebrahimi 
1665*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 32
1666*22dc650dSSadaf Ebrahimi 
1667*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1668*22dc650dSSadaf Ebrahimi 
1669*22dc650dSSadaf Ebrahimi /* VREPI */
1670*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
1671*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)(char1a | bit1);
1672*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1673*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1674*22dc650dSSadaf Ebrahimi 
1675*22dc650dSSadaf Ebrahimi if (char1a != char1b)
1676*22dc650dSSadaf Ebrahimi   {
1677*22dc650dSSadaf Ebrahimi   /* VREPI */
1678*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
1679*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
1680*22dc650dSSadaf Ebrahimi   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1681*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1682*22dc650dSSadaf Ebrahimi   }
1683*22dc650dSSadaf Ebrahimi 
1684*22dc650dSSadaf Ebrahimi /* VREPI */
1685*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
1686*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)(char2a | bit2);
1687*22dc650dSSadaf Ebrahimi /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1688*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1689*22dc650dSSadaf Ebrahimi 
1690*22dc650dSSadaf Ebrahimi if (char2a != char2b)
1691*22dc650dSSadaf Ebrahimi   {
1692*22dc650dSSadaf Ebrahimi   /* VREPI */
1693*22dc650dSSadaf Ebrahimi   instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
1694*22dc650dSSadaf Ebrahimi   instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
1695*22dc650dSSadaf Ebrahimi   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1696*22dc650dSSadaf Ebrahimi   sljit_emit_op_custom(compiler, instruction, 6);
1697*22dc650dSSadaf Ebrahimi   }
1698*22dc650dSSadaf Ebrahimi 
1699*22dc650dSSadaf Ebrahimi #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1700*22dc650dSSadaf Ebrahimi 
1701*22dc650dSSadaf Ebrahimi for (int i = 0; i < 2; i++)
1702*22dc650dSSadaf Ebrahimi   {
1703*22dc650dSSadaf Ebrahimi   replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);
1704*22dc650dSSadaf Ebrahimi 
1705*22dc650dSSadaf Ebrahimi   if (char1a != char1b)
1706*22dc650dSSadaf Ebrahimi     replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);
1707*22dc650dSSadaf Ebrahimi 
1708*22dc650dSSadaf Ebrahimi   replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);
1709*22dc650dSSadaf Ebrahimi 
1710*22dc650dSSadaf Ebrahimi   if (char2a != char2b)
1711*22dc650dSSadaf Ebrahimi     replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
1712*22dc650dSSadaf Ebrahimi   }
1713*22dc650dSSadaf Ebrahimi 
1714*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1715*22dc650dSSadaf Ebrahimi 
1716*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1717*22dc650dSSadaf Ebrahimi 
1718*22dc650dSSadaf Ebrahimi /* VREPI */
1719*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1720*22dc650dSSadaf Ebrahimi instruction[1] = 0;
1721*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1722*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1723*22dc650dSSadaf Ebrahimi 
1724*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1725*22dc650dSSadaf Ebrahimi restart = LABEL();
1726*22dc650dSSadaf Ebrahimi #endif
1727*22dc650dSSadaf Ebrahimi 
1728*22dc650dSSadaf Ebrahimi jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1729*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
1730*22dc650dSSadaf Ebrahimi jump[1] = JUMP(SLJIT_JUMP);
1731*22dc650dSSadaf Ebrahimi JUMPHERE(jump[0]);
1732*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
1733*22dc650dSSadaf Ebrahimi JUMPHERE(jump[1]);
1734*22dc650dSSadaf Ebrahimi 
1735*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
1736*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);
1737*22dc650dSSadaf Ebrahimi 
1738*22dc650dSSadaf Ebrahimi for (i = 0; i < 3; i++)
1739*22dc650dSSadaf Ebrahimi   {
1740*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1741*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1742*22dc650dSSadaf Ebrahimi   }
1743*22dc650dSSadaf Ebrahimi 
1744*22dc650dSSadaf Ebrahimi /* VN */
1745*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1746*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)(data2_ind << 12);
1747*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1748*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1749*22dc650dSSadaf Ebrahimi 
1750*22dc650dSSadaf Ebrahimi /* VFENE */
1751*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1752*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1753*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1754*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1755*22dc650dSSadaf Ebrahimi 
1756*22dc650dSSadaf Ebrahimi /* VLGVB */
1757*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
1758*22dc650dSSadaf Ebrahimi instruction[1] = 7;
1759*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1760*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1761*22dc650dSSadaf Ebrahimi 
1762*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1763*22dc650dSSadaf Ebrahimi quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1764*22dc650dSSadaf Ebrahimi 
1765*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1766*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);
1767*22dc650dSSadaf Ebrahimi 
1768*22dc650dSSadaf Ebrahimi /* Main loop. */
1769*22dc650dSSadaf Ebrahimi start = LABEL();
1770*22dc650dSSadaf Ebrahimi 
1771*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1772*22dc650dSSadaf Ebrahimi add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1773*22dc650dSSadaf Ebrahimi 
1774*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
1775*22dc650dSSadaf Ebrahimi load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);
1776*22dc650dSSadaf Ebrahimi 
1777*22dc650dSSadaf Ebrahimi for (i = 0; i < 3; i++)
1778*22dc650dSSadaf Ebrahimi   {
1779*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1780*22dc650dSSadaf Ebrahimi   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1781*22dc650dSSadaf Ebrahimi   }
1782*22dc650dSSadaf Ebrahimi 
1783*22dc650dSSadaf Ebrahimi /* VN */
1784*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1785*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)(data2_ind << 12);
1786*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1787*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1788*22dc650dSSadaf Ebrahimi 
1789*22dc650dSSadaf Ebrahimi /* VFENE */
1790*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1791*22dc650dSSadaf Ebrahimi instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1792*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1793*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1794*22dc650dSSadaf Ebrahimi 
1795*22dc650dSSadaf Ebrahimi sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1796*22dc650dSSadaf Ebrahimi JUMPTO(SLJIT_OVERFLOW, start);
1797*22dc650dSSadaf Ebrahimi 
1798*22dc650dSSadaf Ebrahimi /* VLGVB */
1799*22dc650dSSadaf Ebrahimi instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
1800*22dc650dSSadaf Ebrahimi instruction[1] = 7;
1801*22dc650dSSadaf Ebrahimi instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1802*22dc650dSSadaf Ebrahimi sljit_emit_op_custom(compiler, instruction, 6);
1803*22dc650dSSadaf Ebrahimi 
1804*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1805*22dc650dSSadaf Ebrahimi 
1806*22dc650dSSadaf Ebrahimi JUMPHERE(quit);
1807*22dc650dSSadaf Ebrahimi 
1808*22dc650dSSadaf Ebrahimi add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1809*22dc650dSSadaf Ebrahimi 
1810*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1811*22dc650dSSadaf Ebrahimi if (common->utf)
1812*22dc650dSSadaf Ebrahimi   {
1813*22dc650dSSadaf Ebrahimi   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1814*22dc650dSSadaf Ebrahimi 
1815*22dc650dSSadaf Ebrahimi   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
1816*22dc650dSSadaf Ebrahimi 
1817*22dc650dSSadaf Ebrahimi   quit = jump_if_utf_char_start(compiler, TMP1);
1818*22dc650dSSadaf Ebrahimi 
1819*22dc650dSSadaf Ebrahimi   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1820*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1821*22dc650dSSadaf Ebrahimi 
1822*22dc650dSSadaf Ebrahimi   /* TMP1 contains diff. */
1823*22dc650dSSadaf Ebrahimi   OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1824*22dc650dSSadaf Ebrahimi   OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1825*22dc650dSSadaf Ebrahimi   JUMPTO(SLJIT_JUMP, restart);
1826*22dc650dSSadaf Ebrahimi 
1827*22dc650dSSadaf Ebrahimi   JUMPHERE(quit);
1828*22dc650dSSadaf Ebrahimi   }
1829*22dc650dSSadaf Ebrahimi #endif
1830*22dc650dSSadaf Ebrahimi 
1831*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1832*22dc650dSSadaf Ebrahimi 
1833*22dc650dSSadaf Ebrahimi if (common->match_end_ptr != 0)
1834*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
1835*22dc650dSSadaf Ebrahimi }
1836*22dc650dSSadaf Ebrahimi 
1837*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_S390X */
1838*22dc650dSSadaf Ebrahimi 
1839*22dc650dSSadaf Ebrahimi #if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)
1840*22dc650dSSadaf Ebrahimi 
1841*22dc650dSSadaf Ebrahimi #ifdef __linux__
1842*22dc650dSSadaf Ebrahimi /* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */
1843*22dc650dSSadaf Ebrahimi #include <sys/auxv.h>
1844*22dc650dSSadaf Ebrahimi #define LOONGARCH_HWCAP_LSX  (1 << 4)
1845*22dc650dSSadaf Ebrahimi #define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)
1846*22dc650dSSadaf Ebrahimi #else
1847*22dc650dSSadaf Ebrahimi #define HAS_LSX_SUPPORT 0
1848*22dc650dSSadaf Ebrahimi #endif
1849*22dc650dSSadaf Ebrahimi 
1850*22dc650dSSadaf Ebrahimi typedef sljit_ins sljit_u32;
1851*22dc650dSSadaf Ebrahimi 
/* Masks of the immediate fields; each field starts at bit 10 of the
instruction word. */
#define SI12_IMM_MASK   0x003ffc00
#define UI5_IMM_MASK    0x00007c00
#define UI2_IMM_MASK    0x00000c00

/* Register operand encoders: the destination register is placed at bit 0, the
first source at bit 5 and the second source at bit 10. The argument is
parenthesized so that an expression (e.g. a | b) is converted and shifted as a
whole. */
#define VD(vd)      ((sljit_ins)(vd) << 0)
#define VJ(vj)      ((sljit_ins)(vj) << 5)
#define VK(vk)      ((sljit_ins)(vk) << 10)
#define RD_V(rd)    ((sljit_ins)(rd) << 0)
#define RJ_V(rj)    ((sljit_ins)(rj) << 5)

/* Immediate operand encoders: shift the value into place and drop any bits
outside the field. */
#define IMM_SI12(imm)   (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)
#define IMM_UI5(imm)    (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)
#define IMM_UI2(imm)    (((sljit_ins)(imm) << 10) & UI2_IMM_MASK)

/* LSX opcodes. */
#define VLD           0x2c000000
#define VOR_V         0x71268000
#define VAND_V        0x71260000
#define VBSLL_V       0x728e0000
#define VMSKLTZ_B     0x729c4000
#define VPICKVE2GR_WU 0x72f3e000

/* Opcodes whose encoding depends on the element size, chosen to match the
code unit width. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define VREPLGR2VR  0x729f0000
#define VSEQ        0x70000000
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VREPLGR2VR  0x729f0400
#define VSEQ        0x70008000
#else
#define VREPLGR2VR  0x729f0800
#define VSEQ        0x70010000
#endif
1884*22dc650dSSadaf Ebrahimi 
/* Emits the LSX instructions that turn the data in vector register dst_ind
into a per-element comparison result.

  compare_type  vector_compare_match2 compares against both cmp1_ind and
                cmp2_ind and ORs the results; vector_compare_match1i first ORs
                the data with cmp2_ind and then compares with cmp1_ind; any
                other type compares with cmp1_ind only
  dst_ind       vector register holding the data; receives the result
  cmp1_ind      vector register with the first comparison value
  cmp2_ind      vector register with the second comparison value or bit mask
  tmp_ind       scratch vector register, written only for match2 */
static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
if (compare_type == vector_compare_match2)
  {
  /* VBSLL.V vd, vj, ui5 - a byte shift by zero, i.e. tmp = dst */
  push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));

  /* VSEQ.B/H/W vd, vj, vk - dst = (dst == cmp1) */
  push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));

  /* VSEQ.B/H/W vd, vj, vk - tmp = (tmp == cmp2) */
  push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));

  /* VOR.V vd, vj, vk - dst |= tmp */
  push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));
  }
else
  {
  if (compare_type == vector_compare_match1i)
    {
    /* VOR.V vd, vj, vk - dst |= cmp2 */
    push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));
    }

  /* VSEQ.B/H/W vd, vj, vk - dst = (dst == cmp1) */
  push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));
  }
}
1914*22dc650dSSadaf Ebrahimi 
1915*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT
1916*22dc650dSSadaf Ebrahimi 
fast_forward_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2,sljit_s32 offset)1917*22dc650dSSadaf Ebrahimi static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1918*22dc650dSSadaf Ebrahimi {
1919*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
1920*22dc650dSSadaf Ebrahimi struct sljit_label *start;
1921*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1922*22dc650dSSadaf Ebrahimi struct sljit_label *restart;
1923*22dc650dSSadaf Ebrahimi #endif
1924*22dc650dSSadaf Ebrahimi struct sljit_jump *quit;
1925*22dc650dSSadaf Ebrahimi struct sljit_jump *partial_quit[2];
1926*22dc650dSSadaf Ebrahimi vector_compare_type compare_type = vector_compare_match1;
1927*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
1928*22dc650dSSadaf Ebrahimi sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1929*22dc650dSSadaf Ebrahimi sljit_s32 data_ind = 0;
1930*22dc650dSSadaf Ebrahimi sljit_s32 tmp_ind = 1;
1931*22dc650dSSadaf Ebrahimi sljit_s32 cmp1_ind = 2;
1932*22dc650dSSadaf Ebrahimi sljit_s32 cmp2_ind = 3;
1933*22dc650dSSadaf Ebrahimi sljit_u32 bit = 0;
1934*22dc650dSSadaf Ebrahimi 
1935*22dc650dSSadaf Ebrahimi SLJIT_UNUSED_ARG(offset);
1936*22dc650dSSadaf Ebrahimi 
1937*22dc650dSSadaf Ebrahimi if (char1 != char2)
1938*22dc650dSSadaf Ebrahimi   {
1939*22dc650dSSadaf Ebrahimi   bit = char1 ^ char2;
1940*22dc650dSSadaf Ebrahimi   compare_type = vector_compare_match1i;
1941*22dc650dSSadaf Ebrahimi 
1942*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit))
1943*22dc650dSSadaf Ebrahimi     {
1944*22dc650dSSadaf Ebrahimi     bit = 0;
1945*22dc650dSSadaf Ebrahimi     compare_type = vector_compare_match2;
1946*22dc650dSSadaf Ebrahimi     }
1947*22dc650dSSadaf Ebrahimi   }
1948*22dc650dSSadaf Ebrahimi 
1949*22dc650dSSadaf Ebrahimi partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1950*22dc650dSSadaf Ebrahimi if (common->mode == PCRE2_JIT_COMPLETE)
1951*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, partial_quit[0]);
1952*22dc650dSSadaf Ebrahimi 
1953*22dc650dSSadaf Ebrahimi /* First part (unaligned start) */
1954*22dc650dSSadaf Ebrahimi 
1955*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
1956*22dc650dSSadaf Ebrahimi 
1957*22dc650dSSadaf Ebrahimi /* VREPLGR2VR.B/H/W vd, rj */
1958*22dc650dSSadaf Ebrahimi push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
1959*22dc650dSSadaf Ebrahimi 
1960*22dc650dSSadaf Ebrahimi if (char1 != char2)
1961*22dc650dSSadaf Ebrahimi   {
1962*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
1963*22dc650dSSadaf Ebrahimi 
1964*22dc650dSSadaf Ebrahimi   /* VREPLGR2VR.B/H/W vd, rj */
1965*22dc650dSSadaf Ebrahimi   push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
1966*22dc650dSSadaf Ebrahimi   }
1967*22dc650dSSadaf Ebrahimi 
1968*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
1969*22dc650dSSadaf Ebrahimi 
1970*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1971*22dc650dSSadaf Ebrahimi restart = LABEL();
1972*22dc650dSSadaf Ebrahimi #endif
1973*22dc650dSSadaf Ebrahimi 
1974*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
1975*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1976*22dc650dSSadaf Ebrahimi 
1977*22dc650dSSadaf Ebrahimi /* VLD vd, rj, si12 */
1978*22dc650dSSadaf Ebrahimi push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
1979*22dc650dSSadaf Ebrahimi fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1980*22dc650dSSadaf Ebrahimi 
1981*22dc650dSSadaf Ebrahimi /* VMSKLTZ.B vd, vj */
1982*22dc650dSSadaf Ebrahimi push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
1983*22dc650dSSadaf Ebrahimi 
1984*22dc650dSSadaf Ebrahimi /* VPICKVE2GR.WU rd, vj, ui2 */
1985*22dc650dSSadaf Ebrahimi push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
1986*22dc650dSSadaf Ebrahimi 
1987*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1988*22dc650dSSadaf Ebrahimi OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
1989*22dc650dSSadaf Ebrahimi 
1990*22dc650dSSadaf Ebrahimi quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
1991*22dc650dSSadaf Ebrahimi 
1992*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1993*22dc650dSSadaf Ebrahimi 
1994*22dc650dSSadaf Ebrahimi /* Second part (aligned) */
1995*22dc650dSSadaf Ebrahimi start = LABEL();
1996*22dc650dSSadaf Ebrahimi 
1997*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1998*22dc650dSSadaf Ebrahimi 
1999*22dc650dSSadaf Ebrahimi partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
2000*22dc650dSSadaf Ebrahimi if (common->mode == PCRE2_JIT_COMPLETE)
2001*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, partial_quit[1]);
2002*22dc650dSSadaf Ebrahimi 
2003*22dc650dSSadaf Ebrahimi /* VLD vd, rj, si12 */
2004*22dc650dSSadaf Ebrahimi push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2005*22dc650dSSadaf Ebrahimi fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2006*22dc650dSSadaf Ebrahimi 
2007*22dc650dSSadaf Ebrahimi /* VMSKLTZ.B vd, vj */
2008*22dc650dSSadaf Ebrahimi push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2009*22dc650dSSadaf Ebrahimi 
2010*22dc650dSSadaf Ebrahimi /* VPICKVE2GR.WU rd, vj, ui2 */
2011*22dc650dSSadaf Ebrahimi push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2012*22dc650dSSadaf Ebrahimi 
2013*22dc650dSSadaf Ebrahimi CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
2014*22dc650dSSadaf Ebrahimi 
2015*22dc650dSSadaf Ebrahimi JUMPHERE(quit);
2016*22dc650dSSadaf Ebrahimi 
2017*22dc650dSSadaf Ebrahimi /* CTZ.W rd, rj */
2018*22dc650dSSadaf Ebrahimi push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
2019*22dc650dSSadaf Ebrahimi 
2020*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
2021*22dc650dSSadaf Ebrahimi 
2022*22dc650dSSadaf Ebrahimi if (common->mode != PCRE2_JIT_COMPLETE)
2023*22dc650dSSadaf Ebrahimi   {
2024*22dc650dSSadaf Ebrahimi   JUMPHERE(partial_quit[0]);
2025*22dc650dSSadaf Ebrahimi   JUMPHERE(partial_quit[1]);
2026*22dc650dSSadaf Ebrahimi   OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
2027*22dc650dSSadaf Ebrahimi   SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
2028*22dc650dSSadaf Ebrahimi   }
2029*22dc650dSSadaf Ebrahimi else
2030*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2031*22dc650dSSadaf Ebrahimi 
2032*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2033*22dc650dSSadaf Ebrahimi if (common->utf && offset > 0)
2034*22dc650dSSadaf Ebrahimi   {
2035*22dc650dSSadaf Ebrahimi   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
2036*22dc650dSSadaf Ebrahimi 
2037*22dc650dSSadaf Ebrahimi   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
2038*22dc650dSSadaf Ebrahimi 
2039*22dc650dSSadaf Ebrahimi   quit = jump_if_utf_char_start(compiler, TMP1);
2040*22dc650dSSadaf Ebrahimi 
2041*22dc650dSSadaf Ebrahimi   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
2042*22dc650dSSadaf Ebrahimi   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2043*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
2044*22dc650dSSadaf Ebrahimi   JUMPTO(SLJIT_JUMP, restart);
2045*22dc650dSSadaf Ebrahimi 
2046*22dc650dSSadaf Ebrahimi   JUMPHERE(quit);
2047*22dc650dSSadaf Ebrahimi   }
2048*22dc650dSSadaf Ebrahimi #endif
2049*22dc650dSSadaf Ebrahimi }
2050*22dc650dSSadaf Ebrahimi 
2051*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT
2052*22dc650dSSadaf Ebrahimi 
fast_requested_char_simd(compiler_common * common,PCRE2_UCHAR char1,PCRE2_UCHAR char2)2053*22dc650dSSadaf Ebrahimi static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
2054*22dc650dSSadaf Ebrahimi {
2055*22dc650dSSadaf Ebrahimi DEFINE_COMPILER;
2056*22dc650dSSadaf Ebrahimi struct sljit_label *start;
2057*22dc650dSSadaf Ebrahimi struct sljit_jump *quit;
2058*22dc650dSSadaf Ebrahimi jump_list *not_found = NULL;
2059*22dc650dSSadaf Ebrahimi vector_compare_type compare_type = vector_compare_match1;
2060*22dc650dSSadaf Ebrahimi sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
2061*22dc650dSSadaf Ebrahimi sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
2062*22dc650dSSadaf Ebrahimi sljit_s32 data_ind = 0;
2063*22dc650dSSadaf Ebrahimi sljit_s32 tmp_ind = 1;
2064*22dc650dSSadaf Ebrahimi sljit_s32 cmp1_ind = 2;
2065*22dc650dSSadaf Ebrahimi sljit_s32 cmp2_ind = 3;
2066*22dc650dSSadaf Ebrahimi sljit_u32 bit = 0;
2067*22dc650dSSadaf Ebrahimi 
2068*22dc650dSSadaf Ebrahimi if (char1 != char2)
2069*22dc650dSSadaf Ebrahimi   {
2070*22dc650dSSadaf Ebrahimi   bit = char1 ^ char2;
2071*22dc650dSSadaf Ebrahimi   compare_type = vector_compare_match1i;
2072*22dc650dSSadaf Ebrahimi 
2073*22dc650dSSadaf Ebrahimi   if (!is_powerof2(bit))
2074*22dc650dSSadaf Ebrahimi     {
2075*22dc650dSSadaf Ebrahimi     bit = 0;
2076*22dc650dSSadaf Ebrahimi     compare_type = vector_compare_match2;
2077*22dc650dSSadaf Ebrahimi     }
2078*22dc650dSSadaf Ebrahimi   }
2079*22dc650dSSadaf Ebrahimi 
2080*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2081*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
2082*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
2083*22dc650dSSadaf Ebrahimi 
2084*22dc650dSSadaf Ebrahimi /* First part (unaligned start) */
2085*22dc650dSSadaf Ebrahimi 
2086*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
2087*22dc650dSSadaf Ebrahimi 
2088*22dc650dSSadaf Ebrahimi /* VREPLGR2VR vd, rj */
2089*22dc650dSSadaf Ebrahimi push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
2090*22dc650dSSadaf Ebrahimi 
2091*22dc650dSSadaf Ebrahimi if (char1 != char2)
2092*22dc650dSSadaf Ebrahimi   {
2093*22dc650dSSadaf Ebrahimi   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
2094*22dc650dSSadaf Ebrahimi   /* VREPLGR2VR vd, rj */
2095*22dc650dSSadaf Ebrahimi   push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
2096*22dc650dSSadaf Ebrahimi   }
2097*22dc650dSSadaf Ebrahimi 
2098*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
2099*22dc650dSSadaf Ebrahimi OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
2100*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2101*22dc650dSSadaf Ebrahimi 
2102*22dc650dSSadaf Ebrahimi /* VLD vd, rj, si12 */
2103*22dc650dSSadaf Ebrahimi push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2104*22dc650dSSadaf Ebrahimi fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2105*22dc650dSSadaf Ebrahimi 
2106*22dc650dSSadaf Ebrahimi /* VMSKLTZ.B vd, vj */
2107*22dc650dSSadaf Ebrahimi push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2108*22dc650dSSadaf Ebrahimi 
2109*22dc650dSSadaf Ebrahimi /* VPICKVE2GR.WU rd, vj, ui2 */
2110*22dc650dSSadaf Ebrahimi push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2111*22dc650dSSadaf Ebrahimi 
2112*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2113*22dc650dSSadaf Ebrahimi OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
2114*22dc650dSSadaf Ebrahimi 
2115*22dc650dSSadaf Ebrahimi quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
2116*22dc650dSSadaf Ebrahimi 
2117*22dc650dSSadaf Ebrahimi OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2118*22dc650dSSadaf Ebrahimi 
2119*22dc650dSSadaf Ebrahimi /* Second part (aligned) */
2120*22dc650dSSadaf Ebrahimi start = LABEL();
2121*22dc650dSSadaf Ebrahimi 
2122*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
2123*22dc650dSSadaf Ebrahimi 
2124*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2125*22dc650dSSadaf Ebrahimi 
2126*22dc650dSSadaf Ebrahimi /* VLD vd, rj, si12 */
2127*22dc650dSSadaf Ebrahimi push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2128*22dc650dSSadaf Ebrahimi fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2129*22dc650dSSadaf Ebrahimi 
2130*22dc650dSSadaf Ebrahimi /* VMSKLTZ.B vd, vj */
2131*22dc650dSSadaf Ebrahimi push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2132*22dc650dSSadaf Ebrahimi 
2133*22dc650dSSadaf Ebrahimi /* VPICKVE2GR.WU rd, vj, ui2 */
2134*22dc650dSSadaf Ebrahimi push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2135*22dc650dSSadaf Ebrahimi 
2136*22dc650dSSadaf Ebrahimi CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
2137*22dc650dSSadaf Ebrahimi 
2138*22dc650dSSadaf Ebrahimi JUMPHERE(quit);
2139*22dc650dSSadaf Ebrahimi 
2140*22dc650dSSadaf Ebrahimi /* CTZ.W rd, rj */
2141*22dc650dSSadaf Ebrahimi push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
2142*22dc650dSSadaf Ebrahimi 
2143*22dc650dSSadaf Ebrahimi OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
2144*22dc650dSSadaf Ebrahimi add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2145*22dc650dSSadaf Ebrahimi 
2146*22dc650dSSadaf Ebrahimi OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
2147*22dc650dSSadaf Ebrahimi return not_found;
2148*22dc650dSSadaf Ebrahimi }
2149*22dc650dSSadaf Ebrahimi 
2150*22dc650dSSadaf Ebrahimi #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD HAS_LSX_SUPPORT
2151*22dc650dSSadaf Ebrahimi 
/* Emits LoongArch LSX code which moves STR_PTR forward to the next subject
position where the code unit at offset offs1 is char1a or char1b and, at the
same time, the code unit at offset offs2 is char2a or char2b.

Arguments:
  common      the JIT compiler state (must be in PCRE2_JIT_COMPLETE mode)
  offs1       offset of the first character, must be greater than offs2
  char1a/b    the two accepted values at offs1 (equal when only one is accepted)
  offs2       offset of the second character
  char2a/b    the two accepted values at offs2 (equal when only one is accepted)

The generated code jumps to common->failed_match when STR_END is reached
without finding a candidate. TMP1, TMP2 and vector registers 0-7 are used as
scratch. When common->match_end_ptr is set, STR_END is temporarily lowered
and TMP3 keeps the original value, which is restored on the fall-through
(candidate found) path. */

static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
/* Byte distance between the two inspected characters. */
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
/* Vector register assignment. */
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  /* Limit the search: STR_END becomes the smaller of STR_END and
  (stored match end + offs1 + 1). The original STR_END is kept in TMP3. */
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

/* From here on STR_PTR addresses the character at offs1. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* Prepare the comparison constants of the first character. When the two
accepted values differ in a single bit, that bit is forced on in the
constant and passed separately (vector_compare_match1i); otherwise both
values are passed (vector_compare_match2). */
if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));

if (char1a != char1b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));
  }

/* Same preparation for the second character. */
if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));

if (char2a != char2b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

/* First part (unaligned start). TMP1 = address of the second character,
TMP2 = misalignment of STR_PTR, STR_PTR is rounded down to 16 bytes. */
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));

/* If the second character lies inside the aligned block just loaded, data2
is derived from data1 by a byte shift instead of a second load. */
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

/* VBSLL.V vd, vj, ui5 */
push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));

JUMPHERE(jump[1]);

fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);

/* A candidate requires both characters to match, so the two results are
intersected here exactly as in the main loop below. */

/* VAND.V vd, vj, vk */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

/* No candidate in the first block: return to the aligned address. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));

fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);

/* VAND.V vd, vj, vk */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* The index of the lowest set mask bit is the byte offset of the candidate. */

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  /* The candidate is accepted only if the code unit at the match start
  passes jump_if_utf_char_start; otherwise step one code unit forward and
  search again. */
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

/* Undo the initial offs1 adjustment of STR_PTR. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}
2352*22dc650dSSadaf Ebrahimi 
2353*22dc650dSSadaf Ebrahimi #endif /* SLJIT_CONFIG_LOONGARCH_64 */
2354*22dc650dSSadaf Ebrahimi 
2355*22dc650dSSadaf Ebrahimi #endif /* !SUPPORT_VALGRIND */
2356