Occurrences of vs5 in /aosp_15_r20/external/XNNPACK/src/f32-velu/gen/:
velu-scalar-rr2-lut16-p3-x6.c  (xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6)
   94: float vs5 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx5] + ven5);  [local]
  123: vs5 = 0.0f;
  151: vt5 *= vs5;
  152: vs5 -= vone;
  171: const float ve5 = (vp5 + vs5) * valpha;
velu-scalar-rr2-p6-x6.c  (xnn_f32_velu_ukernel__scalar_rr2_p6_x6)
   74: float vs5 = uint32_as_float(float_as_uint32(vn5) << 23);  [local]
  112: vs5 = 0.0f;
  161: vt5 *= vs5;
  162: vs5 -= vone;
  181: const float ve5 = (vp5 + vs5) * valpha;
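Note: in the two scalar files above, vs5 holds the per-lane scale s = 2^n, rebuilt by shifting the rounded exponent into the float exponent field (or fetched from xnn_table_exp2minus_k_over_16 in the lut16 variant), zeroed on saturation, and finally combined as (vp5 + (vs5 - vone)) * valpha. The single-element C sketch below is illustrative only: rintf, a truncated Taylor series, and a rough -17.0f cutoff stand in for the kernels' magic-bias rounding, minimax/LUT coefficients, and exact saturation constant.

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Illustrative single-lane sketch of the scalar ELU negative branch traced above.
 * z is the (already prescaled) non-positive input; names and constants here are
 * placeholders, not XNNPACK's. */
static float elu_negative_sketch(float z, float alpha) {
  float n = rintf(z * 1.4426950408889634f);   /* n = round(z * log2(e)); the kernels round via a magic-bias add */
  float t = z - n * 0.6931471805599453f;      /* reduced argument; the "rr2" kernels split ln2 into hi/lo parts */
  float s;                                    /* s = 2^n */

  if (z > -17.0f) {
    /* Rebuild 2^n by placing the biased exponent, the same idea as
     * uint32_as_float(float_as_uint32(vn5) << 23) in the p6 kernels. */
    const uint32_t s_bits = (uint32_t) ((int32_t) n + 127) << 23;
    memcpy(&s, &s_bits, sizeof(s));
  } else {
    s = 0.0f;   /* saturation, mirroring the "vs5 = 0.0f;" hits: the result is -alpha */
    t = 0.0f;
  }

  /* (e^t - 1 - t) / t via a truncated Taylor series; the real kernels use
   * degree-6 minimax coefficients (p6) or a 16-entry table plus a degree-3
   * polynomial (lut16-p3). */
  float p = t * (1.0f/2.0f + t * (1.0f/6.0f + t * (1.0f/24.0f + t * (1.0f/120.0f))));
  const float st = t * s;   /* vt5 *= vs5 */
  p = p * st + st;          /* p is now ~ s * (e^t - 1) */
  s -= 1.0f;                /* vs5 -= vone */
  return (p + s) * alpha;   /* ve5 = (vp5 + vs5) * valpha = alpha * (e^z - 1) */
}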
velu-avx512f-rr1-lut16-p3-perm-x96.c  (xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x96)
   85: __m512 vs5 = _mm512_castsi512_ps(_mm512_add_epi32(vl5, ven5));  [local]
  113: vt5 = _mm512_mul_ps(vt5, vs5);
  120: vs5 = _mm512_fmsub_ps(vs5, valpha, valpha);
  140: __m512 vy5 = _mm512_fmadd_ps(vp5, valpha, vs5);
velu-wasm-rr2-lut16-p3-x6.c  (xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x6)
   94: float vs5 = uint32_as_float(xnn_table_exp2minus_k_over_16[vidx5] + ven5);  [local]
  127: vt5 *= vs5;
  128: vs5 -= vone;
  147: const float ve5 = (vp5 + vs5) * valpha;
velu-wasm-rr2-p6-x6.c  (xnn_f32_velu_ukernel__wasm_rr2_p6_x6)
   74: float vs5 = uint32_as_float(float_as_uint32(vn5) << 23);  [local]
  137: vt5 *= vs5;
  138: vs5 -= vone;
  157: const float ve5 = (vp5 + vs5) * valpha;
velu-avx2-rr1-lut4-p4-perm-x48.c  (xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48)
   90: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  118: vt5 = _mm256_mul_ps(vt5, vs5);
  130: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  143: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut16-p3-gather-x48.c  (xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48)
   98: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  119: vt5 = _mm256_mul_ps(vt5, vs5);
  131: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  144: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut8-p4-perm-x48.c  (xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48)
   90: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  118: vt5 = _mm256_mul_ps(vt5, vs5);
  130: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  143: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx512f-rr1-p6-x96.c  (xnn_f32_velu_ukernel__avx512f_rr1_p6_x96)
   74: __m512 vs5 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn5), 23));  [local]
  123: vt5 = _mm512_mul_ps(vt5, vs5);
  130: vs5 = _mm512_fmsub_ps(vs5, valpha, valpha);
  150: __m512 vy5 = _mm512_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-p6-x48.c  (xnn_f32_velu_ukernel__avx2_rr1_p6_x48)
   72: __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23));  [local]
  121: vt5 = _mm256_mul_ps(vt5, vs5);
  133: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  146: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
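Note: in the rr1 AVX2/AVX-512 kernels above and below, the last two vs5 hits show the tail being fused: fmsub(vs5, valpha, valpha) computes alpha*(s - 1), and the following fmadd(vp5, valpha, vs5) adds alpha*p, which matches the non-FMA form (vp5 + (vs5 - vone)) * valpha up to rounding. The small comparison below is illustrative only (helper names are mine, not XNNPACK's; compile with -mavx2 -mfma).

#include <immintrin.h>

/* Both helpers compute alpha * (p + s - 1); the fused form is the pattern in
 * the rr1 kernels, the separate sub/add/mul form is the pattern in the scalar
 * and plain-AVX rr2 kernels. */
static inline __m256 elu_tail_fma(__m256 vp, __m256 vs, __m256 valpha) {
  vs = _mm256_fmsub_ps(vs, valpha, valpha);   /* alpha*s - alpha = alpha*(s - 1) */
  return _mm256_fmadd_ps(vp, valpha, vs);     /* alpha*p + alpha*(s - 1)         */
}

static inline __m256 elu_tail_separate(__m256 vp, __m256 vs, __m256 valpha) {
  vs = _mm256_sub_ps(vs, _mm256_set1_ps(1.0f));          /* vs5 -= vone          */
  return _mm256_mul_ps(_mm256_add_ps(vp, vs), valpha);   /* (vp5 + vs5) * valpha */
}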
velu-avx512f-rr1-lut16-p3-perm-x112.c  (xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x112)
   90: __m512 vs5 = _mm512_castsi512_ps(_mm512_add_epi32(vl5, ven5));  [local]
  122: vt5 = _mm512_mul_ps(vt5, vs5);
  131: vs5 = _mm512_fmsub_ps(vs5, valpha, valpha);
  153: __m512 vy5 = _mm512_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut4-p4-perm-x56.c  (xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56)
   96: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  128: vt5 = _mm256_mul_ps(vt5, vs5);
  142: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  157: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut8-p4-perm-x56.c  (xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56)
   96: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  128: vt5 = _mm256_mul_ps(vt5, vs5);
  142: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  157: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx512f-rr1-p6-x112.c  (xnn_f32_velu_ukernel__avx512f_rr1_p6_x112)
   77: __m512 vs5 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn5), 23));  [local]
  133: vt5 = _mm512_mul_ps(vt5, vs5);
  142: vs5 = _mm512_fmsub_ps(vs5, valpha, valpha);
  164: __m512 vy5 = _mm512_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut16-p3-gather-x56.c  (xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56)
  105: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  129: vt5 = _mm256_mul_ps(vt5, vs5);
  143: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  158: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-p6-x56.c  (xnn_f32_velu_ukernel__avx2_rr1_p6_x56)
   75: __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23));  [local]
  131: vt5 = _mm256_mul_ps(vt5, vs5);
  145: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  160: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-p6-x64.c  (xnn_f32_velu_ukernel__avx2_rr1_p6_x64)
   78: __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23));  [local]
  141: vt5 = _mm256_mul_ps(vt5, vs5);
  157: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  174: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx512f-rr1-p6-x128.c  (xnn_f32_velu_ukernel__avx512f_rr1_p6_x128)
   80: __m512 vs5 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(vn5), 23));  [local]
  143: vt5 = _mm512_mul_ps(vt5, vs5);
  154: vs5 = _mm512_fmsub_ps(vs5, valpha, valpha);
  178: __m512 vy5 = _mm512_fmadd_ps(vp5, valpha, vs5);
velu-avx512f-rr1-lut16-p3-perm-x128.c  (xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128)
   95: __m512 vs5 = _mm512_castsi512_ps(_mm512_add_epi32(vl5, ven5));  [local]
  131: vt5 = _mm512_mul_ps(vt5, vs5);
  142: vs5 = _mm512_fmsub_ps(vs5, valpha, valpha);
  166: __m512 vy5 = _mm512_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut16-p3-gather-x64.c  (xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64)
  112: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  139: vt5 = _mm256_mul_ps(vt5, vs5);
  155: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  172: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut8-p4-perm-x64.c  (xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64)
  102: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  138: vt5 = _mm256_mul_ps(vt5, vs5);
  154: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  171: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-lut16-p3-gather-x72.c  (xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72)
  119: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  149: vt5 = _mm256_mul_ps(vt5, vs5);
  167: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  186: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx2-rr1-p6-x72.c  (xnn_f32_velu_ukernel__avx2_rr1_p6_x72)
   81: __m256 vs5 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn5), 23));  [local]
  151: vt5 = _mm256_mul_ps(vt5, vs5);
  169: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  188: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);
velu-avx-rr2-p6-x48.c  (xnn_f32_velu_ukernel__avx_rr2_p6_x48)
   94: __m256 vs5 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs5_lo), vs5_hi, 1);  [local]
  148: vt5 = _mm256_mul_ps(vt5, vs5);
  149: vs5 = _mm256_sub_ps(vs5, vone);
  168: const __m256 ve5 = _mm256_mul_ps(_mm256_add_ps(vp5, vs5), valpha);
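Note: plain AVX has no 256-bit integer shift, so the rr2 kernel above assembles vs5 from two 128-bit halves (vs5_lo, vs5_hi) and keeps the unfused sub/add/mul tail. The listing does not show how those halves are built; the sketch below is a guess at that step, assuming the same shift-into-exponent trick applied per 128-bit lane (helper name is mine, not XNNPACK's; compile with -mavx).

#include <immintrin.h>

/* Hypothetical reconstruction of the vs5_lo/vs5_hi step: shift the (magic-
 * biased) per-lane exponents in 128-bit halves, then re-join them, matching
 * the _mm256_insertf128_ps(...) declaration seen at line 94 above. */
static inline __m256 scale_from_biased_n_avx(__m256 vn) {
  const __m128i vn_lo = _mm_castps_si128(_mm256_castps256_ps128(vn));
  const __m128i vn_hi = _mm_castps_si128(_mm256_extractf128_ps(vn, 1));
  const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(vn_lo, 23));
  const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(vn_hi, 23));
  return _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo), vs_hi, 1);
}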
velu-avx2-rr1-lut4-p4-perm-x72.c  (xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72)
  108: __m256 vs5 = _mm256_castsi256_ps(_mm256_add_epi32(vl5, ven5));  [local]
  148: vt5 = _mm256_mul_ps(vt5, vs5);
  166: vs5 = _mm256_fmsub_ps(vs5, valpha, valpha);
  185: const __m256 ve5 = _mm256_fmadd_ps(vp5, valpha, vs5);