Lines Matching +full:18 +full:- +full:ibl

3 // This source code is licensed under the BSD-style license found in the
18 #include <xnnpack/intrinsics-polyfill.h>
48 const __m128 vscale = _mm_load_ps(params->sse.scale); in xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4()
49 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4()
50 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4()
135 for (k -= 9; k > 8; k -= 8) { in xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4()
312 c -= 4; in xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4()
350 } while (--output_pixels != 0); in xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4()
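
Note: the "9p8x" in the avgpool hits above names a multipass schedule: the first pass accumulates 9 pooling elements, the loop at hit 135 folds in 8 more per pass while more than 8 remain, and a final pass of 1-8 elements applies the scale and min/max constants loaded at hits 48-50. A minimal pass-count helper (my own sketch, not part of XNNPACK):

    #include <stddef.h>

    // Passes a 9p8x pooling kernel makes over kernel_elements inputs:
    // 9 in the first pass, 8 per middle pass, then a final 1-8 element pass.
    static size_t num_9p8x_passes(size_t kernel_elements) {
      if (kernel_elements <= 9) return 1;     // the single-pass 9x kernel case
      return 2 + (kernel_elements - 10) / 8;  // first + middle passes + final
    }
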
370 const __m128 vscale = _mm_load_ps(params->sse.scale); in xnn_f32_avgpool_minmax_ukernel_9x__sse_c4()
371 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_avgpool_minmax_ukernel_9x__sse_c4()
372 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_avgpool_minmax_ukernel_9x__sse_c4()
482 c -= 4; in xnn_f32_avgpool_minmax_ukernel_9x__sse_c4()
519 } while (--output_pixels != 0); in xnn_f32_avgpool_minmax_ukernel_9x__sse_c4()
545 const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
548 …const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
560 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
561 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
564 const size_t input_y2 = output_y * 2 + 2 - input_padding_top; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
603 // viMx0 = ( iM0c2, iM0c1, iM0c0, --- ) in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
611 for (; iw >= 4; iw -= 4) { in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
1143 i0 = (const float*) ((uintptr_t) i0 - input_width_increment); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
1144 i1 = (const float*) ((uintptr_t) i1 - input_width_increment); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
1145 i2 = (const float*) ((uintptr_t) i2 - input_width_increment); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
1146 i3 = (const float*) ((uintptr_t) i3 - input_width_increment); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
1147 i4 = (const float*) ((uintptr_t) i4 - input_width_increment); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
1179 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv_minmax_ukernel_up8x25__sse()
1180 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv_minmax_ukernel_up8x25__sse()
1272 const float* i18 = input[18]; in xnn_f32_dwconv_minmax_ukernel_up8x25__sse()
1311 for (; c >= 8; c -= 8) { in xnn_f32_dwconv_minmax_ukernel_up8x25__sse()
1553 for (; c >= 4; c -= 4) { in xnn_f32_dwconv_minmax_ukernel_up8x25__sse()
1834 } while (--output_width != 0); in xnn_f32_dwconv_minmax_ukernel_up8x25__sse()
1852 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv_minmax_ukernel_up8x3__sse()
1853 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv_minmax_ukernel_up8x3__sse()
1874 for (; c >= 8; c -= 8) { in xnn_f32_dwconv_minmax_ukernel_up8x3__sse()
1918 for (; c >= 4; c -= 4) { in xnn_f32_dwconv_minmax_ukernel_up8x3__sse()
1979 } while (--output_width != 0); in xnn_f32_dwconv_minmax_ukernel_up8x3__sse()
1997 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv_minmax_ukernel_up8x4__sse()
1998 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv_minmax_ukernel_up8x4__sse()
2024 for (; c >= 8; c -= 8) { in xnn_f32_dwconv_minmax_ukernel_up8x4__sse()
2077 for (; c >= 4; c -= 4) { in xnn_f32_dwconv_minmax_ukernel_up8x4__sse()
2148 } while (--output_width != 0); in xnn_f32_dwconv_minmax_ukernel_up8x4__sse()
2166 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv_minmax_ukernel_up8x9__sse()
2167 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv_minmax_ukernel_up8x9__sse()
2218 for (; c >= 8; c -= 8) { in xnn_f32_dwconv_minmax_ukernel_up8x9__sse()
2316 for (; c >= 4; c -= 4) { in xnn_f32_dwconv_minmax_ukernel_up8x9__sse()
2437 } while (--output_width != 0); in xnn_f32_dwconv_minmax_ukernel_up8x9__sse()
2455 const __m128 vmask = _mm_load_ps((const float*) params->sse.mask); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2()
2456 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2()
2457 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2()
2509 for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) { in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2()
2704 i0 = (const float*) ((uintptr_t) i2 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2()
2705 i1 = (const float*) ((uintptr_t) i3 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2()
2732 const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2733 const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2734 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2735 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2750 const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2760 size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2771 for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) { in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2889 i0 = (const float*) ((uintptr_t) i2 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2894 output_height -= 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
2895 padded_input_height -= 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3()
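
Note: the output_height expression at hit 2760 is the stride-2 convolution size formula in disguise: with integer division, (padded_input_height - 3 + 2) / 2 equals floor((padded_input_height - 3) / 2) + 1. As a worked instance (my arithmetic, not from the file): padded_input_height = 10 gives (10 - 3 + 2) / 2 = 4 output rows, and hits 2894-2895 then retire one output row and two padded input rows per iteration of the outer row loop.
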
2914 const __m128 vmask = _mm_load_ps((const float*) params->sse.mask); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
2915 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
2916 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
2936 const __m128 vk32 = _mm_load1_ps(weights + 18); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
3009 for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) { in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
3436 w -= 4 * sizeof(float); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
3659 i0 = (const float*) ((uintptr_t) i4 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
3660 i1 = (const float*) ((uintptr_t) i5 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
3693 const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3694 const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3695 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3696 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3716 const __m128 vk32 = _mm_load1_ps(weights + 18); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3725 const uint32_t padding_top_less_1 = padding_top - 1; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3729 const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3739 …const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ +… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3745 size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3815 for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) { in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
3996 // Last block has 1-8 pixels to process. in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
4151 i0 = (const float*) ((uintptr_t) i4 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
4152 i1 = (const float*) ((uintptr_t) i5 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
4153 i2 = (const float*) ((uintptr_t) i6 - input_decrement); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
4183 const __m128 vmask = _mm_load_ps((const float*) params->sse.mask); in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4184 const __m128 vmultiplier = _mm_load_ps(params->sse.multiplier); in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4185 const __m128 voutput_min = _mm_load_ps(params->sse.output_min); in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4186 const __m128 voutput_max = _mm_load_ps(params->sse.output_max); in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4208 n -= 4 * sizeof(float); in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4243 channels -= 4; in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4253 n -= 4 * sizeof(float); in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4272 channels -= 1; in xnn_f32_gavgpool_cw_ukernel__sse_x4()
4297 const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float); in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
4327 for (rows -= 7; rows > 7; rows -= 7) { in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
4394 const __m128 vscale = _mm_load_ps(params->sse.scale); in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
4395 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
4396 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
4434 channels -= 4; in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
4509 const __m128 vscale = _mm_load_ps(params->sse.scale); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4()
4510 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4()
4511 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4()
4545 channels -= 4; in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4()
4621 k -= sizeof(float); in xnn_f32_gemm_minmax_ukernel_1x8__sse_load1()
4624 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_gemm_minmax_ukernel_1x8__sse_load1()
4628 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_gemm_minmax_ukernel_1x8__sse_load1()
4637 a0 = (const float*) ((uintptr_t) a0 - kc); in xnn_f32_gemm_minmax_ukernel_1x8__sse_load1()
4639 nc -= 8; in xnn_f32_gemm_minmax_ukernel_1x8__sse_load1()
4718 for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4776 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4780 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4787 a2 = (const float*) ((uintptr_t) a2 - kc); in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4790 a3 = (const float*) ((uintptr_t) a3 - kc); in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4793 a0 = (const float*) ((uintptr_t) a0 - kc); in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4796 a1 = (const float*) ((uintptr_t) a1 - kc); in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4798 nc -= 2; in xnn_f32_gemm_minmax_ukernel_4x2c4__sse()
4888 k -= sizeof(float); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
4891 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
4901 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
4925 a3 = (const float*) ((uintptr_t) a3 - kc); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
4926 a2 = (const float*) ((uintptr_t) a2 - kc); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
4927 a1 = (const float*) ((uintptr_t) a1 - kc); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
4928 a0 = (const float*) ((uintptr_t) a0 - kc); in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
4930 nc -= 8; in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1()
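
Note: the pointer arithmetic at hits 4925-4928 and 4930 is the tail of the standard GEMM microkernel loop: after the k loop walks kc bytes of each A row, the row pointers are rewound by kc so the same A block can be replayed against the next 8-column panel of packed weights. A rough shape of that outer loop (illustrative, not the XNNPACK signature):

    // Sketch of the minmax GEMM microkernel's n loop, under the assumption
    // that nc counts remaining output columns and kc is in bytes.
    do {
      // ... load accumulators from packed weights, run the k loop, clamp ...
      if (nc >= 8) {
        // store a full 8-wide panel of C, then rewind A for the next panel
        a0 = (const float*) ((uintptr_t) a0 - kc);
        nc -= 8;
      } else {
        // store the 1-7 column remainder and finish
        nc = 0;
      }
    } while (nc != 0);
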
4993 for (; p >= 8; p -= 8) { in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5072 for (; p >= 4; p -= 4) { in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5155 // result = (1 - alpha_h) * (1 - alpha_v) * top_left + in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5156 // alpha_h * (1 - alpha_v) * top_right + in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5157 // (1 - alpha_h) * alpha_v * bottom_left + in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5161 // result = left + alpha_h * (right - left), in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5163 // left = top_left + alpha_v * (bottom_left - top_left), in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5164 // right = top_right + alpha_v * (bottom_right - top_right). in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5171 const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset); in xnn_f32_ibilinear_chw_ukernel__sse_p8() local
5175 const __m128 vblbr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl); in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5178 // left_diff = bottom_left - top_left in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5179 // right_diff = bottom_right - top_right in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5187 *output++ = l + alphah * (r - l); in xnn_f32_ibilinear_chw_ukernel__sse_p8()
5192 } while (--channels != 0); in xnn_f32_ibilinear_chw_ukernel__sse_p8()
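
Note: the comment block at hits 5155-5164 factors the four-term bilinear blend into two vertical lerps plus one horizontal lerp, which is what both the vector body and the scalar tail at hit 5187 compute. A scalar reference of that factorization (a sketch; the names follow the comment, not the XNNPACK API):

    // Two-step bilinear interpolation: lerp each edge vertically by alpha_v,
    // then lerp between the edges horizontally by alpha_h.
    static float bilinear_two_lerp(float top_left, float top_right,
                                   float bottom_left, float bottom_right,
                                   float alpha_h, float alpha_v) {
      const float left  = top_left  + alpha_v * (bottom_left  - top_left);
      const float right = top_right + alpha_v * (bottom_right - top_right);
      return left + alpha_h * (right - left);
    }
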
5222 for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { in xnn_f32_ibilinear_ukernel__sse_c8()
5256 for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { in xnn_f32_ibilinear_ukernel__sse_c8()
5307 } while (--output_pixels != 0); in xnn_f32_ibilinear_ukernel__sse_c8()
5363 k -= sizeof(float); in xnn_f32_igemm_minmax_ukernel_1x8__sse_load1()
5365 p -= 1 * sizeof(void*); in xnn_f32_igemm_minmax_ukernel_1x8__sse_load1()
5368 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_igemm_minmax_ukernel_1x8__sse_load1()
5372 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_igemm_minmax_ukernel_1x8__sse_load1()
5381 a = (const float**restrict) ((uintptr_t) a - ks); in xnn_f32_igemm_minmax_ukernel_1x8__sse_load1()
5382 nc -= 8; in xnn_f32_igemm_minmax_ukernel_1x8__sse_load1()
5483 for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { in xnn_f32_igemm_minmax_ukernel_4x2c4__sse()
5528 p -= 4 * sizeof(void*); in xnn_f32_igemm_minmax_ukernel_4x2c4__sse()
5539 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_igemm_minmax_ukernel_4x2c4__sse()
5543 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_igemm_minmax_ukernel_4x2c4__sse()
5557 a = (const float**restrict) ((uintptr_t) a - ks); in xnn_f32_igemm_minmax_ukernel_4x2c4__sse()
5558 nc -= 2; in xnn_f32_igemm_minmax_ukernel_4x2c4__sse()
5669 k -= sizeof(float); in xnn_f32_igemm_minmax_ukernel_4x8__sse_load1()
5671 p -= 4 * sizeof(void*); in xnn_f32_igemm_minmax_ukernel_4x8__sse_load1()
5674 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_igemm_minmax_ukernel_4x8__sse_load1()
5684 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_igemm_minmax_ukernel_4x8__sse_load1()
5708 a = (const float**restrict) ((uintptr_t) a - ks); in xnn_f32_igemm_minmax_ukernel_4x8__sse_load1()
5709 nc -= 8; in xnn_f32_igemm_minmax_ukernel_4x8__sse_load1()
5770 const __m128 voutput_max = _mm_load_ps(params->sse.max); in xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4()
5771 const __m128 voutput_min = _mm_load_ps(params->sse.min); in xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4()
5819 for (; c >= 4; c -= 4) { in xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4()
5894 for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { in xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4()
5935 for (; c >= 4; c -= 4) { in xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4()
6001 } while (--output_pixels != 0); in xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4()
6022 const __m128 voutput_min = _mm_load_ps(params->sse.min); in xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4()
6023 const __m128 voutput_max = _mm_load_ps(params->sse.max); in xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4()
6108 for (k -= 9; k > 8; k -= 8) { in xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4()
6288 c -= 4; in xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4()
6326 } while (--output_pixels != 0); in xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4()
6347 const __m128 voutput_min = _mm_load_ps(params->sse.min); in xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4()
6348 const __m128 voutput_max = _mm_load_ps(params->sse.max); in xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4()
6461 c -= 4; in xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4()
6498 } while (--output_pixels != 0); in xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4()
6514 for (; n >= 64; n -= 64) { in xnn_f32_rmax_ukernel__sse()
6527 for (; n >= 16; n -= 16) { in xnn_f32_rmax_ukernel__sse()
6538 n -= 4; in xnn_f32_rmax_ukernel__sse()
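
Note: the rmax hits show the reduction narrowing from 64 floats per iteration to 16 to 4 before a scalar tail; collapsing the final 4-lane maximum is conventionally done with a shuffle-and-max idiom like the following (an illustration of the technique, not a quote of the kernel body):

    #include <xmmintrin.h>

    // Horizontal maximum of the four lanes of an SSE register.
    static float hmax_ps(__m128 v) {
      v = _mm_max_ps(v, _mm_movehl_ps(v, v));         // fold lanes 2,3 onto 0,1
      v = _mm_max_ss(v, _mm_shuffle_ps(v, v, 0x55));  // fold lane 1 onto lane 0
      return _mm_cvtss_f32(v);
    }
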
6559 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6560 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6561 size_t output_decrement = output_stride * nc - 32 * sizeof(float); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6598 } while (--nnz != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6625 } while (--n != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6626 output = (float*restrict) ((uintptr_t) output - output_decrement); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6628 mc -= 32 * sizeof(float); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6656 } while (--nnz != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6671 } while (--n != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6672 output = (float*restrict) ((uintptr_t) output - output_decrement); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6694 } while (--nnz != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6703 } while (--n != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6704 output = (float*restrict) ((uintptr_t) output - output_decrement); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6723 } while (--nnz != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6729 } while (--n != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6730 output = (float*restrict) ((uintptr_t) output - output_decrement); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6751 } while (--nnz != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6757 } while (--n != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6758 output = (float*restrict) ((uintptr_t) output - output_decrement); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6777 } while (--nnz != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6783 } while (--n != 0); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6784 output = (float*restrict) ((uintptr_t) output - output_decrement); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
6803 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vadd_minmax_ukernel__sse_x8()
6804 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vadd_minmax_ukernel__sse_x8()
6806 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vadd_minmax_ukernel__sse_x8()
6829 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vadd_minmax_ukernel__sse_x8()
6873 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vaddc_minmax_ukernel__sse_x8()
6874 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vaddc_minmax_ukernel__sse_x8()
6877 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vaddc_minmax_ukernel__sse_x8()
6896 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vaddc_minmax_ukernel__sse_x8()
6936 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vdiv_minmax_ukernel__sse_x8()
6937 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vdiv_minmax_ukernel__sse_x8()
6939 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vdiv_minmax_ukernel__sse_x8()
6962 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vdiv_minmax_ukernel__sse_x8()
7006 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vdivc_minmax_ukernel__sse_x8()
7007 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vdivc_minmax_ukernel__sse_x8()
7010 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vdivc_minmax_ukernel__sse_x8()
7029 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vdivc_minmax_ukernel__sse_x8()
7070 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vmax_ukernel__sse_x8()
7088 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vmax_ukernel__sse_x8()
7130 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vmaxc_ukernel__sse_x8()
7144 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vmaxc_ukernel__sse_x8()
7181 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vmin_ukernel__sse_x8()
7199 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vmin_ukernel__sse_x8()
7241 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vminc_ukernel__sse_x8()
7255 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vminc_ukernel__sse_x8()
7291 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vmul_minmax_ukernel__sse_x8()
7292 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vmul_minmax_ukernel__sse_x8()
7294 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vmul_minmax_ukernel__sse_x8()
7317 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vmul_minmax_ukernel__sse_x8()
7361 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vmulc_minmax_ukernel__sse_x8()
7362 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vmulc_minmax_ukernel__sse_x8()
7365 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vmulc_minmax_ukernel__sse_x8()
7384 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vmulc_minmax_ukernel__sse_x8()
7424 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vrdivc_minmax_ukernel__sse_x8()
7425 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vrdivc_minmax_ukernel__sse_x8()
7428 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vrdivc_minmax_ukernel__sse_x8()
7447 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vrdivc_minmax_ukernel__sse_x8()
7487 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vrsubc_minmax_ukernel__sse_x8()
7488 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vrsubc_minmax_ukernel__sse_x8()
7491 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vrsubc_minmax_ukernel__sse_x8()
7510 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vrsubc_minmax_ukernel__sse_x8()
7551 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vsqrdiff_ukernel__sse_x8()
7571 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vsqrdiff_ukernel__sse_x8()
7615 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vsqrdiffc_ukernel__sse_x8()
7631 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vsqrdiffc_ukernel__sse_x8()
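
Note: vsqrdiff computes the elementwise squared difference, which is why its hits show no min/max parameter loads; per lane the operation is just (a - b) * (a - b). A one-register sketch of the operation the name denotes:

    #include <xmmintrin.h>

    // Squared difference of two SSE vectors: (a - b)^2 per lane.
    static __m128 vsqrdiff(__m128 a, __m128 b) {
      const __m128 d = _mm_sub_ps(a, b);
      return _mm_mul_ps(d, d);
    }
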
7669 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vsub_minmax_ukernel__sse_x8()
7670 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vsub_minmax_ukernel__sse_x8()
7672 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vsub_minmax_ukernel__sse_x8()
7695 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vsub_minmax_ukernel__sse_x8()
7739 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vsubc_minmax_ukernel__sse_x8()
7740 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vsubc_minmax_ukernel__sse_x8()
7743 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vsubc_minmax_ukernel__sse_x8()
7762 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vsubc_minmax_ukernel__sse_x8()
7800 const __m128 vy_min = _mm_load_ps(params->sse.min); in xnn_f32_vclamp_ukernel__sse_x8()
7801 const __m128 vy_max = _mm_load_ps(params->sse.max); in xnn_f32_vclamp_ukernel__sse_x8()
7803 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vclamp_ukernel__sse_x8()
7818 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vclamp_ukernel__sse_x8()
7853 const __m128 vsixth = _mm_load_ps(params->sse.sixth); in xnn_f32_vhswish_ukernel__sse_x8()
7854 const __m128 vhalf = _mm_load_ps(params->sse.half); in xnn_f32_vhswish_ukernel__sse_x8()
7855 const __m128 vone = _mm_load_ps(params->sse.one); in xnn_f32_vhswish_ukernel__sse_x8()
7858 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vhswish_ukernel__sse_x8()
7882 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vhswish_ukernel__sse_x8()
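
Note: the sixth/half/one constants at hits 7853-7855 are the usual hard-swish parameterization: scale by 1/6, shift by 1/2, clamp to [0, 1], multiply back into the input. Scalar equivalent (a sketch assuming the standard hard-swish definition, not a quote of the kernel body):

    // hswish(x) = x * max(0, min(1, x/6 + 1/2)) = x * relu6(x + 3) / 6
    static float hswish(float x) {
      float t = x * (1.0f / 6.0f) + 0.5f;
      t = t < 0.0f ? 0.0f : (t > 1.0f ? 1.0f : t);
      return x * t;
    }
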
7921 const __m128 vslope = _mm_load_ps(params->sse.slope); in xnn_f32_vlrelu_ukernel__sse_x8()
7923 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vlrelu_ukernel__sse_x8()
7940 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vlrelu_ukernel__sse_x8()
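
Note: likewise, the vslope constant at hit 7921 is a LeakyReLU slope: negative inputs are scaled by it, non-negative inputs pass through unchanged (a sketch of the standard definition):

    // lrelu(x) = x for x >= 0, x * slope for x < 0
    static float lrelu(float x, float slope) {
      return x >= 0.0f ? x : x * slope;
    }
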
7988 const size_t input_increment = input_stride * 2 - channels; in xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x()
7989 const size_t output_increment = output_stride * 2 - channels; in xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x()
7991 const __m128 vmin = _mm_load_ps(params->sse.min); in xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x()
7992 const __m128 vmax = _mm_load_ps(params->sse.max); in xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x()
8001 for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { in xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x()
8087 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vsqrt_ukernel__sse_sqrt_x4()
8119 const __m128 vnonsign_mask = _mm_load_ps(params->sse.nonsign_mask); in xnn_f32_vabs_ukernel__sse_x8()
8120 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vabs_ukernel__sse_x8()
8132 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vabs_ukernel__sse_x8()
8164 const __m128 vsign_mask = _mm_load_ps(params->sse.sign_mask); in xnn_f32_vneg_ukernel__sse_x8()
8165 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vneg_ukernel__sse_x8()
8177 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vneg_ukernel__sse_x8()
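
Note: the masks at hits 8119 and 8164 are the classic float bit tricks: absolute value clears the sign bit with an AND, negation flips it with an XOR. Sketch, assuming nonsign_mask holds 0x7FFFFFFF and sign_mask holds 0x80000000 in every lane:

    #include <xmmintrin.h>

    static __m128 vabs(__m128 x, __m128 nonsign_mask) { return _mm_and_ps(x, nonsign_mask); }
    static __m128 vneg(__m128 x, __m128 sign_mask)    { return _mm_xor_ps(x, sign_mask); }
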
8209 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) { in xnn_f32_vsqr_ukernel__sse_x8()
8221 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) { in xnn_f32_vsqr_ukernel__sse_x8()
8268 for (; k >= 4; k -= 4) { in xnn_x32_packx_ukernel_4x__sse()
8314 } while (--k != 0); in xnn_x32_packx_ukernel_4x__sse()
8332 …const size_t input_vreset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; in xnn_x32_transposec_ukernel__4x4_sse()
8333 …const size_t output_vreset = tile_height * output_stride - round_down_po2(block_height, 2) * sizeo… in xnn_x32_transposec_ukernel__4x4_sse()
8357 for (; bh >= 4; bh -= 4) { in xnn_x32_transposec_ukernel__4x4_sse()
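
Note: the loop at hit 8357 walks the block in 4-row strips; with SSE, the 4x4 float tile transpose itself is conventionally expressed with the _MM_TRANSPOSE4_PS macro from <xmmintrin.h> (shown here as an illustration of the tile primitive, not a quote of the kernel):

    #include <xmmintrin.h>

    // Transpose a 4x4 tile of floats held in four SSE registers, in place.
    static void transpose4x4(__m128* r0, __m128* r1, __m128* r2, __m128* r3) {
      _MM_TRANSPOSE4_PS(*r0, *r1, *r2, *r3);  // expands to unpacks + moves
    }
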