/aosp_15_r20/external/XNNPACK/src/x8-zip/ |
xm-neon.c | in xnn_x8_zip_xm_ukernel__neon():
    47  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 0);
    50  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 1);
    53  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 0);
    56  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 1);
    59  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 0);
    62  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 1);
    65  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 0);
    68  vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 1);
    90  uint32x2_t vxyzw0 = vreinterpret_u32_u16(vxyzw_lo.val[0]);
    91  uint32x2_t vxyzw1 = vreinterpret_u32_u16(vxyzw_lo.val[1]);
    [all …]
|
/aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/x8zip/ |
xm-neon.c | in pytorch_qnnp_x8zip_xm__neon():
    53  vreinterpret_u32_u16(vxyzw_lo.val[0]),
    59  vreinterpret_u32_u16(vxyzw_lo.val[0]),
    65  vreinterpret_u32_u16(vxyzw_lo.val[1]),
    71  vreinterpret_u32_u16(vxyzw_lo.val[1]),
    77  vreinterpret_u32_u16(vxyzw_hi.val[0]),
    83  vreinterpret_u32_u16(vxyzw_hi.val[0]),
    89  vreinterpret_u32_u16(vxyzw_hi.val[1]),
    95  vreinterpret_u32_u16(vxyzw_hi.val[1]),
   123  uint32x2_t vxyzw0 = vreinterpret_u32_u16(vxyzw_lo.val[0]);
   124  uint32x2_t vxyzw1 = vreinterpret_u32_u16(vxyzw_lo.val[1]);
    [all …]
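Note: both xm-neon.c entries above (XNNPACK and its QNNPACK ancestor) hinge on the same trick: after zipping bytes and then 16-bit pairs, each group of four interleaved bytes occupies one 32-bit lane, so vreinterpret_u32_u16 plus vst1_lane_u32 writes a whole group per store. A minimal sketch of the pattern, with a hypothetical helper fixed at four input streams (the real xm kernels handle a variable stream count):

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch: interleave the first four bytes of rows x, y, z, w into
    // [x0 y0 z0 w0][x1 y1 z1 w1][x2 y2 z2 w2][x3 y3 z3 w3]
    // (little-endian lane layout, as these kernels assume).
    static void zip4_u8x8_lo(const uint8_t* x, const uint8_t* y,
                             const uint8_t* z, const uint8_t* w,
                             uint8_t* output) {
      // Zip bytes: vxy.val[0] = [x0 y0 x1 y1 ...], vzw.val[0] = [z0 w0 z1 w1 ...].
      const uint8x8x2_t vxy = vzip_u8(vld1_u8(x), vld1_u8(y));
      const uint8x8x2_t vzw = vzip_u8(vld1_u8(z), vld1_u8(w));
      // Zip 16-bit pairs: vxyzw_lo.val[0] = bytes [x0 y0 z0 w0 x1 y1 z1 w1].
      const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]),
                                             vreinterpret_u16_u8(vzw.val[0]));
      // Each 4-byte group is one 32-bit lane; write it with a single lane
      // store, exactly the shape of the hits above.
      vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 0); output += 4;
      vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 1); output += 4;
      vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 0); output += 4;
      vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 1);
    }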
|
/aosp_15_r20/external/libaom/aom_dsp/arm/ |
transpose_neon.h | in transpose_arrays_u8_8x16():
    111  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
    112                             vreinterpret_u32_u16(w5.val[0]));
    113  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
    114                             vreinterpret_u32_u16(w5.val[1]));
    115  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
    116                              vreinterpret_u32_u16(w13.val[0]));
    117  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
    118                              vreinterpret_u32_u16(w13.val[1]));
    133  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
    134                vreinterpret_u32_u16(w5.val[0]));
    [all …]
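Note: the vzip_u32(vreinterpret_u32_u16(...)) calls above are the final widening stage of a byte-matrix transpose: zip bytes, zip 16-bit pairs, then zip 32-bit quads. A self-contained sketch of the three-stage idea on an 8x8 block (hypothetical function; libaom's transpose_arrays_u8_8x16 extends this to 8x16):

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch: transpose an 8x8 block of bytes, so out[j][i] == in[i][j].
    static void transpose_u8_8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
      // Stage 1: zip bytes from adjacent row pairs.
      const uint8x8x2_t a0 = vzip_u8(vld1_u8(in[0]), vld1_u8(in[1]));
      const uint8x8x2_t a1 = vzip_u8(vld1_u8(in[2]), vld1_u8(in[3]));
      const uint8x8x2_t a2 = vzip_u8(vld1_u8(in[4]), vld1_u8(in[5]));
      const uint8x8x2_t a3 = vzip_u8(vld1_u8(in[6]), vld1_u8(in[7]));
      // Stage 2: zip 16-bit pairs; each u16 now holds one column fragment
      // from two rows.
      const uint16x4x2_t b0 = vzip_u16(vreinterpret_u16_u8(a0.val[0]),
                                       vreinterpret_u16_u8(a1.val[0]));
      const uint16x4x2_t b1 = vzip_u16(vreinterpret_u16_u8(a0.val[1]),
                                       vreinterpret_u16_u8(a1.val[1]));
      const uint16x4x2_t b2 = vzip_u16(vreinterpret_u16_u8(a2.val[0]),
                                       vreinterpret_u16_u8(a3.val[0]));
      const uint16x4x2_t b3 = vzip_u16(vreinterpret_u16_u8(a2.val[1]),
                                       vreinterpret_u16_u8(a3.val[1]));
      // Stage 3: zip 32-bit quads, the step seen in the hits above; each
      // result half is now a complete output row (a full input column).
      const uint32x2x2_t c0 = vzip_u32(vreinterpret_u32_u16(b0.val[0]),
                                       vreinterpret_u32_u16(b2.val[0]));
      const uint32x2x2_t c1 = vzip_u32(vreinterpret_u32_u16(b0.val[1]),
                                       vreinterpret_u32_u16(b2.val[1]));
      const uint32x2x2_t c2 = vzip_u32(vreinterpret_u32_u16(b1.val[0]),
                                       vreinterpret_u32_u16(b3.val[0]));
      const uint32x2x2_t c3 = vzip_u32(vreinterpret_u32_u16(b1.val[1]),
                                       vreinterpret_u32_u16(b3.val[1]));
      vst1_u8(out[0], vreinterpret_u8_u32(c0.val[0]));
      vst1_u8(out[1], vreinterpret_u8_u32(c0.val[1]));
      vst1_u8(out[2], vreinterpret_u8_u32(c1.val[0]));
      vst1_u8(out[3], vreinterpret_u8_u32(c1.val[1]));
      vst1_u8(out[4], vreinterpret_u8_u32(c2.val[0]));
      vst1_u8(out[5], vreinterpret_u8_u32(c2.val[1]));
      vst1_u8(out[6], vreinterpret_u8_u32(c3.val[0]));
      vst1_u8(out[7], vreinterpret_u8_u32(c3.val[1]));
    }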
|
/aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/ |
CpuTransposeKernel.cpp | in transpose_8bit_elements():
    130  const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_…
    131  const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_…
    132  const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_…
    133  const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_…
  in transpose_16bit_elements():
    257  const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_…
    258  const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_…
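Note: CpuTransposeKernel.cpp uses vtrn rather than vzip: vtrn_u16 transposes 2x2 blocks of 16-bit elements, and vtrn_u32 then transposes 2x2 blocks of 32-bit pairs. A minimal sketch of the 16-bit case (hypothetical helper; gemmlowp's output_neon.h further down uses the same vtrn_u32 step):

    #include <arm_neon.h>

    // Sketch: in-place transpose of a 4x4 block of 16-bit elements held in
    // four uint16x4_t rows, so r[j] ends up holding input column j.
    static void transpose_u16_4x4(uint16x4_t r[4]) {
      // Stage 1: transpose 2x2 blocks of 16-bit elements.
      const uint16x4x2_t t01 = vtrn_u16(r[0], r[1]);
      const uint16x4x2_t t23 = vtrn_u16(r[2], r[3]);
      // Stage 2: transpose 2x2 blocks of 32-bit pairs, as in the hits above.
      const uint32x2x2_t s02 = vtrn_u32(vreinterpret_u32_u16(t01.val[0]),
                                        vreinterpret_u32_u16(t23.val[0]));
      const uint32x2x2_t s13 = vtrn_u32(vreinterpret_u32_u16(t01.val[1]),
                                        vreinterpret_u32_u16(t23.val[1]));
      r[0] = vreinterpret_u16_u32(s02.val[0]);
      r[1] = vreinterpret_u16_u32(s13.val[0]);
      r[2] = vreinterpret_u16_u32(s02.val[1]);
      r[3] = vreinterpret_u16_u32(s13.val[1]);
    }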
|
/aosp_15_r20/external/XNNPACK/src/x16-transposec/gen/ |
8x8-reuse-multi-zip-neon.c | in xnn_x16_transposec_ukernel__8x8_reuse_multi_zip_neon():
    180  vst1_lane_u32((void*) o7, vreinterpret_u32_u16(v7_low), 0); o7 += 2;
    181  vst1_lane_u32((void*) o6, vreinterpret_u32_u16(v6_low), 0); o6 += 2;
    182  vst1_lane_u32((void*) o5, vreinterpret_u32_u16(v5_low), 0); o5 += 2;
    183  vst1_lane_u32((void*) o4, vreinterpret_u32_u16(v4_low), 0); o4 += 2;
    184  vst1_lane_u32((void*) o3, vreinterpret_u32_u16(v3_low), 0); o3 += 2;
    185  vst1_lane_u32((void*) o2, vreinterpret_u32_u16(v2_low), 0); o2 += 2;
    186  vst1_lane_u32((void*) o1, vreinterpret_u32_u16(v1_low), 0); o1 += 2;
    187  vst1_lane_u32((void*) o0, vreinterpret_u32_u16(v0_low), 0); o0 += 2;
|
8x8-reuse-switch-zip-neon.c | in xnn_x16_transposec_ukernel__8x8_reuse_switch_zip_neon():
    186  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v7_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    188  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v6_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    190  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v5_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    192  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v4_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    194  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v3_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    196  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v2_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    198  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v1_low), 0);
    200  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
8x8-reuse-dec-zip-neon.c | in xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon():
    202  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v7_low), 0);
    206  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v6_low), 0);
    210  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v5_low), 0);
    214  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v4_low), 0);
    218  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    222  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    226  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    230  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
8x8-multi-switch-zip-neon.c | in xnn_x16_transposec_ukernel__8x8_multi_switch_zip_neon():
    188  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v7_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    190  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v6_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    192  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v5_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    194  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v4_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    196  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v3_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    198  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v2_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    200  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v1_low), 0);
    202  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
8x8-reuse-mov-zip-neon.c | in xnn_x16_transposec_ukernel__8x8_reuse_mov_zip_neon():
    216  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v7_low), 0);
    221  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v6_low), 0);
    226  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v5_low), 0);
    231  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v4_low), 0);
    236  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    241  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    246  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    251  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
8x8-multi-dec-zip-neon.c | in xnn_x16_transposec_ukernel__8x8_multi_dec_zip_neon():
    204  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v7_low), 0);
    208  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v6_low), 0);
    212  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v5_low), 0);
    216  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v4_low), 0);
    220  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    224  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    228  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    232  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
8x8-multi-mov-zip-neon.c | in xnn_x16_transposec_ukernel__8x8_multi_mov_zip_neon():
    218  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v7_low), 0);
    223  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v6_low), 0);
    228  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v5_low), 0);
    233  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v4_low), 0);
    238  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    243  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    248  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    253  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
4x4-reuse-multi-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_reuse_multi_zip_neon():
     97  vst1_lane_u32((void*) o3, vreinterpret_u32_u16(v3_low), 0); o3 += 2;
     98  vst1_lane_u32((void*) o2, vreinterpret_u32_u16(v2_low), 0); o2 += 2;
     99  vst1_lane_u32((void*) o1, vreinterpret_u32_u16(v1_low), 0); o1 += 2;
    100  vst1_lane_u32((void*) o0, vreinterpret_u32_u16(v0_low), 0); o0 += 2;
|
4x4-reuse-switch-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_reuse_switch_zip_neon():
    101  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v3_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    103  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v2_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    105  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v1_low), 0);
    107  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
4x4-multi-multi-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_multi_multi_zip_neon():
     99  vst1_lane_u32((void*) o3, vreinterpret_u32_u16(v3_low), 0); o3 += 2;
    100  vst1_lane_u32((void*) o2, vreinterpret_u32_u16(v2_low), 0); o2 += 2;
    101  vst1_lane_u32((void*) o1, vreinterpret_u32_u16(v1_low), 0); o1 += 2;
    102  vst1_lane_u32((void*) o0, vreinterpret_u32_u16(v0_low), 0); o0 += 2;
|
4x4-reuse-dec-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_reuse_dec_zip_neon():
    101  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    105  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    109  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    113  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
4x4-multi-switch-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_multi_switch_zip_neon():
    103  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v3_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    105  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v2_low), 0); oN = (uint16_t*) ((uintptr_t) oN + min…
    107  vst1_lane_u32((void*) oN, vreinterpret_u32_u16(v1_low), 0);
    109  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
4x4-multi-dec-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_multi_dec_zip_neon():
    103  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    107  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    111  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    115  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
4x4-reuse-mov-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_reuse_mov_zip_neon():
    104  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    109  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    114  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    119  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
|
4x4-multi-mov-zip-neon.c | in xnn_x16_transposec_ukernel__4x4_multi_mov_zip_neon():
    106  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0);
    111  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0);
    116  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0);
    121  vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2;
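Note: every x16-transposec hit above is the same residual-column store: the low two 16-bit elements of a vector go out as one 32-bit lane, and the uint16_t output pointer advances by two elements (4 bytes), which is why the kernels write o += 2. A minimal sketch (hypothetical helper name):

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch: store v[0] and v[1] with a single 32-bit lane store.
    // Reinterpreting uint16x4_t as uint32x2_t makes lane 0 the pair
    // (v[0], v[1]) on little-endian targets, as these kernels assume.
    static uint16_t* store2_u16(uint16_t* o, uint16x4_t v) {
      vst1_lane_u32((void*) o, vreinterpret_u32_u16(v), 0);
      return o + 2;  // two uint16_t elements, i.e. the 4 bytes just written
    }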
|
/aosp_15_r20/external/libgav1/src/dsp/arm/ |
common_neon.h | in Load2():
    236  vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
  in Store2():
    384  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
  in Transpose4x4():
    782  vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
    787  vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
    805  vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
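Note: libgav1 wraps the reinterpret in Load2/Store2 helpers that move two uint16_t values through one 32-bit lane of a uint16x4_t. Simplified sketches in that spirit (hypothetical signatures: the real helpers template over the lane index, and Store2 goes through a ValueToMem wrapper; memcpy plays that role here):

    #include <arm_neon.h>
    #include <stdint.h>
    #include <string.h>

    // Sketch: load two uint16_t from buf into lanes 0..1 of val.
    static uint16x4_t load2_lane0(const void* buf, uint16x4_t val) {
      uint32_t temp;
      memcpy(&temp, buf, sizeof(temp));  // safe unaligned 32-bit read
      return vreinterpret_u16_u32(
          vld1_lane_u32(&temp, vreinterpret_u32_u16(val), 0));
    }

    // Sketch: store lanes 0..1 of val as two uint16_t at buf.
    static void store2_lane0(void* buf, uint16x4_t val) {
      const uint32_t temp = vget_lane_u32(vreinterpret_u32_u16(val), 0);
      memcpy(buf, &temp, sizeof(temp));  // safe unaligned 32-bit write
    }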
|
/aosp_15_r20/external/gemmlowp/internal/ |
output_neon.h |
    675  c[0] = vtrn_u32(vreinterpret_u32_u16(b[0].val[0]),
    676                  vreinterpret_u32_u16(b[2].val[0]));
    677  c[1] = vtrn_u32(vreinterpret_u32_u16(b[1].val[0]),
    678                  vreinterpret_u32_u16(b[3].val[0]));
    679  c[2] = vtrn_u32(vreinterpret_u32_u16(b[0].val[1]),
    680                  vreinterpret_u32_u16(b[2].val[1]));
    681  c[3] = vtrn_u32(vreinterpret_u32_u16(b[1].val[1]),
    682                  vreinterpret_u32_u16(b[3].val[1]));
|
/aosp_15_r20/external/XNNPACK/src/bf16-gemm/gen/ |
5x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip():
    387  vst1_lane_u32((void*) c0, vreinterpret_u32_u16(vout0x0123), 0); c0 += 2;
    388  vst1_lane_u32((void*) c1, vreinterpret_u32_u16(vout1x0123), 0); c1 += 2;
    389  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    390  vst1_lane_u32((void*) c3, vreinterpret_u32_u16(vout3x0123), 0); c3 += 2;
    391  vst1_lane_u32((void*) c4, vreinterpret_u32_u16(vout4x0123), 0); c4 += 2;
|
5x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland():
    387  vst1_lane_u32((void*) c0, vreinterpret_u32_u16(vout0x0123), 0); c0 += 2;
    388  vst1_lane_u32((void*) c1, vreinterpret_u32_u16(vout1x0123), 0); c1 += 2;
    389  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    390  vst1_lane_u32((void*) c3, vreinterpret_u32_u16(vout3x0123), 0); c3 += 2;
    391  vst1_lane_u32((void*) c4, vreinterpret_u32_u16(vout4x0123), 0); c4 += 2;
|
4x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland():
    331  vst1_lane_u32((void*) c0, vreinterpret_u32_u16(vout0x0123), 0); c0 += 2;
    332  vst1_lane_u32((void*) c1, vreinterpret_u32_u16(vout1x0123), 0); c1 += 2;
    333  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    334  vst1_lane_u32((void*) c3, vreinterpret_u32_u16(vout3x0123), 0); c3 += 2;
|
4x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip():
    331  vst1_lane_u32((void*) c0, vreinterpret_u32_u16(vout0x0123), 0); c0 += 2;
    332  vst1_lane_u32((void*) c1, vreinterpret_u32_u16(vout1x0123), 0); c1 += 2;
    333  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    334  vst1_lane_u32((void*) c3, vreinterpret_u32_u16(vout3x0123), 0); c3 += 2;
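Note: in the bf16 GEMM kernels, vout*x0123 holds bfloat16 results as raw uint16_t bit patterns, so a pair of output columns per row can be written with one 32-bit lane store. A minimal sketch (hypothetical helper: it makes bf16 by truncating f32 to its high 16 bits, whereas the "zip"/"shland" kernel variants name two ways of extracting those bits, and real kernels may also round):

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch: narrow four f32 accumulators to bf16 bit patterns and store the
    // first two results of the row, as in the residual path of the kernels above.
    static void store2_f32_as_bf16(uint16_t* c, float32x4_t vacc) {
      // bf16 truncation: keep the high 16 bits of each f32 (no rounding here).
      const uint16x4_t vout = vshrn_n_u32(vreinterpretq_u32_f32(vacc), 16);
      vst1_lane_u32((void*) c, vreinterpret_u32_u16(vout), 0);  // writes c[0], c[1]
    }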
|