xref: /aosp_15_r20/external/XNNPACK/src/x32-transposec/sse2.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1*4bdc9457SAndroid Build Coastguard Worker// Copyright 2021 Google LLC
2*4bdc9457SAndroid Build Coastguard Worker//
3*4bdc9457SAndroid Build Coastguard Worker// This source code is licensed under the BSD-style license found in the
4*4bdc9457SAndroid Build Coastguard Worker// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker$import math
6*4bdc9457SAndroid Build Coastguard Worker$assert IN_PTRS in ["MULTI", "REUSE"]
7*4bdc9457SAndroid Build Coastguard Worker$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV"]
8*4bdc9457SAndroid Build Coastguard Worker$assert SIZE in [8, 16, 32, 64]
9*4bdc9457SAndroid Build Coastguard Worker$TILE_SIZE = int(128/SIZE)
10*4bdc9457SAndroid Build Coastguard Worker$NUM_ITERS = int(math.log2(TILE_SIZE))
11*4bdc9457SAndroid Build Coastguard Worker
12*4bdc9457SAndroid Build Coastguard Worker#include <immintrin.h>
13*4bdc9457SAndroid Build Coastguard Worker
14*4bdc9457SAndroid Build Coastguard Worker#include <assert.h>
15*4bdc9457SAndroid Build Coastguard Worker
16*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/common.h>
17*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/math.h>
18*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/transpose.h>
19*4bdc9457SAndroid Build Coastguard Worker#include <xnnpack/unaligned.h>
20*4bdc9457SAndroid Build Coastguard Worker
21*4bdc9457SAndroid Build Coastguard Worker
22*4bdc9457SAndroid Build Coastguard Workervoid xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_sse2(
23*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* input,
24*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* output,
25*4bdc9457SAndroid Build Coastguard Worker    size_t input_stride,
26*4bdc9457SAndroid Build Coastguard Worker    size_t output_stride,
27*4bdc9457SAndroid Build Coastguard Worker    size_t block_width,
28*4bdc9457SAndroid Build Coastguard Worker    size_t block_height) XNN_OOB_READS
29*4bdc9457SAndroid Build Coastguard Worker{
30*4bdc9457SAndroid Build Coastguard Worker  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
31*4bdc9457SAndroid Build Coastguard Worker  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));
32*4bdc9457SAndroid Build Coastguard Worker
33*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_height = ${TILE_SIZE};
34*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_width = ${TILE_SIZE};
35*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
36*4bdc9457SAndroid Build Coastguard Worker  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
37*4bdc9457SAndroid Build Coastguard Worker  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
38*4bdc9457SAndroid Build Coastguard Worker  $if IN_PTRS == "MULTI":
39*4bdc9457SAndroid Build Coastguard Worker    const size_t input_offset = tile_height * input_stride;
40*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS == "MOV":
41*4bdc9457SAndroid Build Coastguard Worker    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
42*4bdc9457SAndroid Build Coastguard Worker  $else:
43*4bdc9457SAndroid Build Coastguard Worker    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);
44*4bdc9457SAndroid Build Coastguard Worker
45*4bdc9457SAndroid Build Coastguard Worker  $if IN_PTRS == "MULTI":
46*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* i0 = input;
47*4bdc9457SAndroid Build Coastguard Worker    $for N in range(1, TILE_SIZE):
48*4bdc9457SAndroid Build Coastguard Worker      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
49*4bdc9457SAndroid Build Coastguard Worker  $else:
50*4bdc9457SAndroid Build Coastguard Worker    const uint${SIZE}_t* i0 = input;
51*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS == "MULTI":
52*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
53*4bdc9457SAndroid Build Coastguard Worker    $for N in range(1, TILE_SIZE):
54*4bdc9457SAndroid Build Coastguard Worker      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
55*4bdc9457SAndroid Build Coastguard Worker  $elif OUT_PTRS == "SWITCH":
56*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
57*4bdc9457SAndroid Build Coastguard Worker  $else:
58*4bdc9457SAndroid Build Coastguard Worker    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
59*4bdc9457SAndroid Build Coastguard Worker  $if OUT_PTRS == "MOV":
60*4bdc9457SAndroid Build Coastguard Worker    const size_t minus_output_stride = -output_stride;
61*4bdc9457SAndroid Build Coastguard Worker  $elif OUT_PTRS == "SWITCH" and SIZE != 64:
62*4bdc9457SAndroid Build Coastguard Worker    const size_t minus_output_stride = -output_stride;
63*4bdc9457SAndroid Build Coastguard Worker
64*4bdc9457SAndroid Build Coastguard Worker  do {
65*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS == "MULTI":
66*4bdc9457SAndroid Build Coastguard Worker      if XNN_UNPREDICTABLE(block_width < 2) {
67*4bdc9457SAndroid Build Coastguard Worker        o1 = o0;
68*4bdc9457SAndroid Build Coastguard Worker      }
69*4bdc9457SAndroid Build Coastguard Worker      $for N in range(2, TILE_SIZE, 2):
70*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width <= ${N}) {
71*4bdc9457SAndroid Build Coastguard Worker          o${N} = o0;
72*4bdc9457SAndroid Build Coastguard Worker        }
73*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
74*4bdc9457SAndroid Build Coastguard Worker          o${N+1} = o0;
75*4bdc9457SAndroid Build Coastguard Worker        }
76*4bdc9457SAndroid Build Coastguard Worker    $elif OUT_PTRS == "MOV":
77*4bdc9457SAndroid Build Coastguard Worker      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
78*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_stride = rem * output_stride;
79*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_offset = oN_stride + tile_hbytes;
80*4bdc9457SAndroid Build Coastguard Worker    $elif OUT_PTRS == "SWITCH":
81*4bdc9457SAndroid Build Coastguard Worker      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
82*4bdc9457SAndroid Build Coastguard Worker      const size_t oN_stride = rem * output_stride;
83*4bdc9457SAndroid Build Coastguard Worker    size_t bh = block_height;
84*4bdc9457SAndroid Build Coastguard Worker    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
85*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE):
86*4bdc9457SAndroid Build Coastguard Worker        $if IN_PTRS == "REUSE":
87*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i0);
88*4bdc9457SAndroid Build Coastguard Worker          i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
89*4bdc9457SAndroid Build Coastguard Worker        $else:
90*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i${N});
91*4bdc9457SAndroid Build Coastguard Worker          i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);
92*4bdc9457SAndroid Build Coastguard Worker
93*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE >> 1):
94*4bdc9457SAndroid Build Coastguard Worker        const __m128i v${NUM_ITERS-1}_${N*2} = _mm_unpacklo_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1});
95*4bdc9457SAndroid Build Coastguard Worker        const __m128i v${NUM_ITERS-1}_${N*2+1} = _mm_unpackhi_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1});
96*4bdc9457SAndroid Build Coastguard Worker
97*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>=2:
98*4bdc9457SAndroid Build Coastguard Worker        $for N in range(0, TILE_SIZE, 4):
99*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS-2}_${N} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2});
100*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS-2}_${N+1} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2});
101*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS-2}_${N+2} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3});
102*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS-2}_${N+3} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3});
103*4bdc9457SAndroid Build Coastguard Worker
104*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>=3:
105*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, TILE_SIZE, 8):
106*4bdc9457SAndroid Build Coastguard Worker          $for N in range(0, 4):
107*4bdc9457SAndroid Build Coastguard Worker            const __m128i v${NUM_ITERS-3}_${M+2*N} = _mm_unpacklo_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4});
108*4bdc9457SAndroid Build Coastguard Worker            const __m128i v${NUM_ITERS-3}_${M+2*N+1} = _mm_unpackhi_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4});
109*4bdc9457SAndroid Build Coastguard Worker
110*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>=4:
111*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE >> 1):
112*4bdc9457SAndroid Build Coastguard Worker          const __m128i v0_${N*2} = _mm_unpacklo_epi64(v1_${N}, v1_${N+8});
113*4bdc9457SAndroid Build Coastguard Worker          const __m128i v0_${N*2+1} = _mm_unpackhi_epi64(v1_${N}, v1_${N+8});
114*4bdc9457SAndroid Build Coastguard Worker
115*4bdc9457SAndroid Build Coastguard Worker      $if OUT_PTRS == "SWITCH":
116*4bdc9457SAndroid Build Coastguard Worker        uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
117*4bdc9457SAndroid Build Coastguard Worker        switch (rem) {
118*4bdc9457SAndroid Build Coastguard Worker          $for N in reversed(range(2, TILE_SIZE)):
119*4bdc9457SAndroid Build Coastguard Worker            case ${N}:
120*4bdc9457SAndroid Build Coastguard Worker              _mm_storeu_si128((__m128i*) oN, v0_${N});
121*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
122*4bdc9457SAndroid Build Coastguard Worker          case 1:
123*4bdc9457SAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) oN, v0_1);
124*4bdc9457SAndroid Build Coastguard Worker          case 0:
125*4bdc9457SAndroid Build Coastguard Worker            _mm_storeu_si128((__m128i*) o, v0_0);
126*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
127*4bdc9457SAndroid Build Coastguard Worker            break;
128*4bdc9457SAndroid Build Coastguard Worker          default:
129*4bdc9457SAndroid Build Coastguard Worker            XNN_UNREACHABLE;
130*4bdc9457SAndroid Build Coastguard Worker        }
131*4bdc9457SAndroid Build Coastguard Worker      $elif OUT_PTRS == "MOV":
132*4bdc9457SAndroid Build Coastguard Worker        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
133*4bdc9457SAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) o, v0_${TILE_SIZE-1});
134*4bdc9457SAndroid Build Coastguard Worker        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
135*4bdc9457SAndroid Build Coastguard Worker        $for N in reversed(range(2, TILE_SIZE, 2)):
136*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
137*4bdc9457SAndroid Build Coastguard Worker            o = oN;
138*4bdc9457SAndroid Build Coastguard Worker          }
139*4bdc9457SAndroid Build Coastguard Worker          _mm_storeu_si128((__m128i*) o, v0_${N});
140*4bdc9457SAndroid Build Coastguard Worker          oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
141*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
142*4bdc9457SAndroid Build Coastguard Worker            o = oN;
143*4bdc9457SAndroid Build Coastguard Worker          }
144*4bdc9457SAndroid Build Coastguard Worker          _mm_storeu_si128((__m128i*) o, v0_${N-1});
145*4bdc9457SAndroid Build Coastguard Worker          oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
146*4bdc9457SAndroid Build Coastguard Worker        if XNN_UNPREDICTABLE(block_width > 1) {
147*4bdc9457SAndroid Build Coastguard Worker          o = oN;
148*4bdc9457SAndroid Build Coastguard Worker        }
149*4bdc9457SAndroid Build Coastguard Worker        _mm_storeu_si128((__m128i*) o, v0_0);
150*4bdc9457SAndroid Build Coastguard Worker      $else:
151*4bdc9457SAndroid Build Coastguard Worker        $for N in reversed(range(TILE_SIZE)):
152*4bdc9457SAndroid Build Coastguard Worker          _mm_storeu_si128((__m128i*) o${N}, v0_${N});
153*4bdc9457SAndroid Build Coastguard Worker          o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
154*4bdc9457SAndroid Build Coastguard Worker    }
155*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS == "MOV":
156*4bdc9457SAndroid Build Coastguard Worker      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
157*4bdc9457SAndroid Build Coastguard Worker    if (bh != 0) {
158*4bdc9457SAndroid Build Coastguard Worker      $if IN_PTRS == "REUSE":
159*4bdc9457SAndroid Build Coastguard Worker        const __m128i v${NUM_ITERS}_0 = _mm_loadu_si128((const __m128i*) i0);
160*4bdc9457SAndroid Build Coastguard Worker        $for N in range(1, TILE_SIZE - 1, 2):
161*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
162*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh < ${N+1}) {
163*4bdc9457SAndroid Build Coastguard Worker            i${N} = i${N-1};
164*4bdc9457SAndroid Build Coastguard Worker          }
165*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i${N});
166*4bdc9457SAndroid Build Coastguard Worker          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
167*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
168*4bdc9457SAndroid Build Coastguard Worker            i${N+1} = i${N};
169*4bdc9457SAndroid Build Coastguard Worker          }
170*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS}_${N+1} = _mm_loadu_si128((const __m128i*) i${N+1});
171*4bdc9457SAndroid Build Coastguard Worker      $else:
172*4bdc9457SAndroid Build Coastguard Worker        const __m128i v${NUM_ITERS}_0 = _mm_loadu_si128((const __m128i*) i0);
173*4bdc9457SAndroid Build Coastguard Worker        $for N in range(1, TILE_SIZE - 1, 2):
174*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh < ${N+1}) {
175*4bdc9457SAndroid Build Coastguard Worker            i${N} = i0;
176*4bdc9457SAndroid Build Coastguard Worker          }
177*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS}_${N} = _mm_loadu_si128((const __m128i*) i${N});
178*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
179*4bdc9457SAndroid Build Coastguard Worker            i${N+1} = i0;
180*4bdc9457SAndroid Build Coastguard Worker          }
181*4bdc9457SAndroid Build Coastguard Worker          const __m128i v${NUM_ITERS}_${N+1} = _mm_loadu_si128((const __m128i*) i${N+1});
182*4bdc9457SAndroid Build Coastguard Worker      const __m128i v${NUM_ITERS}_${TILE_SIZE-1} = _mm_undefined_si128();
183*4bdc9457SAndroid Build Coastguard Worker
184*4bdc9457SAndroid Build Coastguard Worker      $CONST = "const "
185*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS == 1:
186*4bdc9457SAndroid Build Coastguard Worker        $CONST = ""
187*4bdc9457SAndroid Build Coastguard Worker      $for N in range(TILE_SIZE >> 1):
188*4bdc9457SAndroid Build Coastguard Worker        ${CONST}__m128i v${NUM_ITERS-1}_${N*2} = _mm_unpacklo_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1});
189*4bdc9457SAndroid Build Coastguard Worker        ${CONST}__m128i v${NUM_ITERS-1}_${N*2+1} = _mm_unpackhi_epi${SIZE}(v${NUM_ITERS}_${N*2}, v${NUM_ITERS}_${N*2+1});
190*4bdc9457SAndroid Build Coastguard Worker
191*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS == 2:
192*4bdc9457SAndroid Build Coastguard Worker        $CONST = ""
193*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>=2:
194*4bdc9457SAndroid Build Coastguard Worker        $for N in range(0, TILE_SIZE, 4):
195*4bdc9457SAndroid Build Coastguard Worker          ${CONST}__m128i v${NUM_ITERS-2}_${N} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2});
196*4bdc9457SAndroid Build Coastguard Worker          ${CONST}__m128i v${NUM_ITERS-2}_${N+1} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N}, v${NUM_ITERS-1}_${N+2});
197*4bdc9457SAndroid Build Coastguard Worker          ${CONST}__m128i v${NUM_ITERS-2}_${N+2} = _mm_unpacklo_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3});
198*4bdc9457SAndroid Build Coastguard Worker          ${CONST}__m128i v${NUM_ITERS-2}_${N+3} = _mm_unpackhi_epi${SIZE*2}(v${NUM_ITERS-1}_${N+1}, v${NUM_ITERS-1}_${N+3});
199*4bdc9457SAndroid Build Coastguard Worker
200*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS == 3:
201*4bdc9457SAndroid Build Coastguard Worker        $CONST = ""
202*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>=3:
203*4bdc9457SAndroid Build Coastguard Worker        $for M in range(0, TILE_SIZE, 8):
204*4bdc9457SAndroid Build Coastguard Worker          $for N in range(0, 4):
205*4bdc9457SAndroid Build Coastguard Worker            ${CONST}__m128i v${NUM_ITERS-3}_${M+2*N} = _mm_unpacklo_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4});
206*4bdc9457SAndroid Build Coastguard Worker            ${CONST}__m128i v${NUM_ITERS-3}_${M+2*N+1} = _mm_unpackhi_epi${SIZE*4}(v${NUM_ITERS-2}_${M+N}, v${NUM_ITERS-2}_${M+N+4});
207*4bdc9457SAndroid Build Coastguard Worker
208*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>=4:
209*4bdc9457SAndroid Build Coastguard Worker        $for N in range(TILE_SIZE >> 1):
210*4bdc9457SAndroid Build Coastguard Worker          __m128i v0_${N*2} = _mm_unpacklo_epi64(v1_${N}, v1_${N+8});
211*4bdc9457SAndroid Build Coastguard Worker          __m128i v0_${N*2+1} = _mm_unpackhi_epi64(v1_${N}, v1_${N+8});
212*4bdc9457SAndroid Build Coastguard Worker
213*4bdc9457SAndroid Build Coastguard Worker      if (bh & ${TILE_SIZE>>1}) {
214*4bdc9457SAndroid Build Coastguard Worker        $if OUT_PTRS == "SWITCH":
215*4bdc9457SAndroid Build Coastguard Worker          uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
216*4bdc9457SAndroid Build Coastguard Worker          switch (rem) {
217*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE)):
218*4bdc9457SAndroid Build Coastguard Worker              case ${N}:
219*4bdc9457SAndroid Build Coastguard Worker                _mm_storel_epi64((__m128i*) oN, v0_${N});
220*4bdc9457SAndroid Build Coastguard Worker                oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
221*4bdc9457SAndroid Build Coastguard Worker            case 1:
222*4bdc9457SAndroid Build Coastguard Worker              _mm_storel_epi64((__m128i*) oN, v0_1);
223*4bdc9457SAndroid Build Coastguard Worker            case 0:
224*4bdc9457SAndroid Build Coastguard Worker              _mm_storel_epi64((__m128i*) o, v0_0);
225*4bdc9457SAndroid Build Coastguard Worker              break;
226*4bdc9457SAndroid Build Coastguard Worker            default:
227*4bdc9457SAndroid Build Coastguard Worker              XNN_UNREACHABLE;
228*4bdc9457SAndroid Build Coastguard Worker          }
229*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS > 1:
230*4bdc9457SAndroid Build Coastguard Worker            o += ${TILE_SIZE>>1};
231*4bdc9457SAndroid Build Coastguard Worker        $elif OUT_PTRS == "MOV":
232*4bdc9457SAndroid Build Coastguard Worker          o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
233*4bdc9457SAndroid Build Coastguard Worker          _mm_storel_epi64((__m128i*) o, v0_${TILE_SIZE-1});
234*4bdc9457SAndroid Build Coastguard Worker          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
235*4bdc9457SAndroid Build Coastguard Worker          $for N in reversed(range(2, TILE_SIZE, 2)):
236*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > ${N+1}) {
237*4bdc9457SAndroid Build Coastguard Worker              o = oN;
238*4bdc9457SAndroid Build Coastguard Worker            }
239*4bdc9457SAndroid Build Coastguard Worker            _mm_storel_epi64((__m128i*) o, v0_${N});
240*4bdc9457SAndroid Build Coastguard Worker            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
241*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
242*4bdc9457SAndroid Build Coastguard Worker              o = oN;
243*4bdc9457SAndroid Build Coastguard Worker            }
244*4bdc9457SAndroid Build Coastguard Worker            _mm_storel_epi64((__m128i*) o, v0_${N-1});
245*4bdc9457SAndroid Build Coastguard Worker            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
246*4bdc9457SAndroid Build Coastguard Worker          if XNN_UNPREDICTABLE(block_width > 1) {
247*4bdc9457SAndroid Build Coastguard Worker            o = oN;
248*4bdc9457SAndroid Build Coastguard Worker          }
249*4bdc9457SAndroid Build Coastguard Worker          _mm_storel_epi64((__m128i*) o, v0_0);
250*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS > 1:
251*4bdc9457SAndroid Build Coastguard Worker            o += ${TILE_SIZE>>1};
252*4bdc9457SAndroid Build Coastguard Worker        $else:
253*4bdc9457SAndroid Build Coastguard Worker          $for N in reversed(range(TILE_SIZE)):
254*4bdc9457SAndroid Build Coastguard Worker            _mm_storel_epi64((__m128i*) o${N}, v0_${N});
255*4bdc9457SAndroid Build Coastguard Worker            $if NUM_ITERS>1:
256*4bdc9457SAndroid Build Coastguard Worker              o${N} += ${TILE_SIZE>>1};
257*4bdc9457SAndroid Build Coastguard Worker        $if NUM_ITERS > 1:
258*4bdc9457SAndroid Build Coastguard Worker          $for N in range(TILE_SIZE):
259*4bdc9457SAndroid Build Coastguard Worker            v0_${N} = _mm_unpackhi_epi64(v0_${N}, v0_${N});
260*4bdc9457SAndroid Build Coastguard Worker      }
261*4bdc9457SAndroid Build Coastguard Worker
262*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>1:
263*4bdc9457SAndroid Build Coastguard Worker        if (bh & ${TILE_SIZE>>2}) {
264*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
265*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
266*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
267*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
268*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
269*4bdc9457SAndroid Build Coastguard Worker                  unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_${N}));
270*4bdc9457SAndroid Build Coastguard Worker                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
271*4bdc9457SAndroid Build Coastguard Worker              case 1:
272*4bdc9457SAndroid Build Coastguard Worker                unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_1));
273*4bdc9457SAndroid Build Coastguard Worker              case 0:
274*4bdc9457SAndroid Build Coastguard Worker                unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0));
275*4bdc9457SAndroid Build Coastguard Worker                break;
276*4bdc9457SAndroid Build Coastguard Worker              default:
277*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
278*4bdc9457SAndroid Build Coastguard Worker            }
279*4bdc9457SAndroid Build Coastguard Worker            $if NUM_ITERS > 2:
280*4bdc9457SAndroid Build Coastguard Worker              o += ${TILE_SIZE>>2};
281*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS == "MOV":
282*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
283*4bdc9457SAndroid Build Coastguard Worker            unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1}));
284*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
285*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
286*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
287*4bdc9457SAndroid Build Coastguard Worker                o = oN;
288*4bdc9457SAndroid Build Coastguard Worker              }
289*4bdc9457SAndroid Build Coastguard Worker              unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_${N}));
290*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
291*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
292*4bdc9457SAndroid Build Coastguard Worker                o = oN;
293*4bdc9457SAndroid Build Coastguard Worker              }
294*4bdc9457SAndroid Build Coastguard Worker              unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_${N-1}));
295*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
296*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
297*4bdc9457SAndroid Build Coastguard Worker              o = oN;
298*4bdc9457SAndroid Build Coastguard Worker            }
299*4bdc9457SAndroid Build Coastguard Worker            unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0));
300*4bdc9457SAndroid Build Coastguard Worker            $if NUM_ITERS > 2:
301*4bdc9457SAndroid Build Coastguard Worker              o += ${TILE_SIZE>>2};
302*4bdc9457SAndroid Build Coastguard Worker          $else:
303*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
304*4bdc9457SAndroid Build Coastguard Worker              unaligned_store_u32(o${N}, (uint32_t) _mm_cvtsi128_si32(v0_${N}));
305*4bdc9457SAndroid Build Coastguard Worker              $if NUM_ITERS>2:
306*4bdc9457SAndroid Build Coastguard Worker                o${N} += ${TILE_SIZE>>2};
307*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS > 2:
308*4bdc9457SAndroid Build Coastguard Worker            $for N in range(TILE_SIZE):
309*4bdc9457SAndroid Build Coastguard Worker              v0_${N} = _mm_srli_epi64(v0_${N}, 32);
310*4bdc9457SAndroid Build Coastguard Worker        }
311*4bdc9457SAndroid Build Coastguard Worker      $if NUM_ITERS>2:
312*4bdc9457SAndroid Build Coastguard Worker        if (bh & ${TILE_SIZE>>3}) {
313*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
314*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
315*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
316*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
317*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
318*4bdc9457SAndroid Build Coastguard Worker                  unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_${N}));
319*4bdc9457SAndroid Build Coastguard Worker                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
320*4bdc9457SAndroid Build Coastguard Worker              case 1:
321*4bdc9457SAndroid Build Coastguard Worker                unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_1));
322*4bdc9457SAndroid Build Coastguard Worker              case 0:
323*4bdc9457SAndroid Build Coastguard Worker                unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0));
324*4bdc9457SAndroid Build Coastguard Worker                break;
325*4bdc9457SAndroid Build Coastguard Worker              default:
326*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
327*4bdc9457SAndroid Build Coastguard Worker            }
328*4bdc9457SAndroid Build Coastguard Worker            $if NUM_ITERS>3:
329*4bdc9457SAndroid Build Coastguard Worker              o += ${TILE_SIZE>>3};
330*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS == "MOV":
331*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
332*4bdc9457SAndroid Build Coastguard Worker            unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1}));
333*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
334*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
335*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
336*4bdc9457SAndroid Build Coastguard Worker                o = oN;
337*4bdc9457SAndroid Build Coastguard Worker              }
338*4bdc9457SAndroid Build Coastguard Worker              unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${N}));
339*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
340*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
341*4bdc9457SAndroid Build Coastguard Worker                o = oN;
342*4bdc9457SAndroid Build Coastguard Worker              }
343*4bdc9457SAndroid Build Coastguard Worker              unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_${N-1}));
344*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
345*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
346*4bdc9457SAndroid Build Coastguard Worker              o = oN;
347*4bdc9457SAndroid Build Coastguard Worker            }
348*4bdc9457SAndroid Build Coastguard Worker            unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0));
349*4bdc9457SAndroid Build Coastguard Worker            $if NUM_ITERS > 3:
350*4bdc9457SAndroid Build Coastguard Worker              o += ${TILE_SIZE>>3};
351*4bdc9457SAndroid Build Coastguard Worker          $else:
352*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(TILE_SIZE)):
353*4bdc9457SAndroid Build Coastguard Worker              unaligned_store_u16(o${N}, (uint16_t) _mm_cvtsi128_si32(v0_${N}));
354*4bdc9457SAndroid Build Coastguard Worker              $if NUM_ITERS>3:
355*4bdc9457SAndroid Build Coastguard Worker                o${N} += ${TILE_SIZE>>3};
356*4bdc9457SAndroid Build Coastguard Worker          $if NUM_ITERS>3:
357*4bdc9457SAndroid Build Coastguard Worker            $for N in range(TILE_SIZE):
358*4bdc9457SAndroid Build Coastguard Worker              v0_${N} = _mm_srli_epi32(v0_${N}, 16);
359*4bdc9457SAndroid Build Coastguard Worker        }
360*4bdc9457SAndroid Build Coastguard Worker      $if SIZE == 8:
361*4bdc9457SAndroid Build Coastguard Worker        if (bh & 1) {
362*4bdc9457SAndroid Build Coastguard Worker          $if OUT_PTRS == "SWITCH":
363*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
364*4bdc9457SAndroid Build Coastguard Worker            switch (rem) {
365*4bdc9457SAndroid Build Coastguard Worker              $for N in reversed(range(2, TILE_SIZE)):
366*4bdc9457SAndroid Build Coastguard Worker                case ${N}:
367*4bdc9457SAndroid Build Coastguard Worker                  *oN = (uint8_t) _mm_cvtsi128_si32(v0_${N});
368*4bdc9457SAndroid Build Coastguard Worker                  oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
369*4bdc9457SAndroid Build Coastguard Worker              case 1:
370*4bdc9457SAndroid Build Coastguard Worker                *oN = (uint8_t) _mm_cvtsi128_si32(v0_1);
371*4bdc9457SAndroid Build Coastguard Worker              case 0:
372*4bdc9457SAndroid Build Coastguard Worker                *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
373*4bdc9457SAndroid Build Coastguard Worker                break;
374*4bdc9457SAndroid Build Coastguard Worker              default:
375*4bdc9457SAndroid Build Coastguard Worker                XNN_UNREACHABLE;
376*4bdc9457SAndroid Build Coastguard Worker            }
377*4bdc9457SAndroid Build Coastguard Worker          $elif OUT_PTRS == "MOV":
378*4bdc9457SAndroid Build Coastguard Worker            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
379*4bdc9457SAndroid Build Coastguard Worker            *o = (uint8_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1});
380*4bdc9457SAndroid Build Coastguard Worker            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
381*4bdc9457SAndroid Build Coastguard Worker            $for N in reversed(range(2, TILE_SIZE, 2)):
382*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
383*4bdc9457SAndroid Build Coastguard Worker                o = oN;
384*4bdc9457SAndroid Build Coastguard Worker              }
385*4bdc9457SAndroid Build Coastguard Worker              *o = (uint8_t) _mm_cvtsi128_si32(v0_${N});
386*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
387*4bdc9457SAndroid Build Coastguard Worker              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
388*4bdc9457SAndroid Build Coastguard Worker                o = oN;
389*4bdc9457SAndroid Build Coastguard Worker              }
390*4bdc9457SAndroid Build Coastguard Worker              *o = (uint8_t) _mm_cvtsi128_si32(v0_${N-1});
391*4bdc9457SAndroid Build Coastguard Worker              oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
392*4bdc9457SAndroid Build Coastguard Worker            if XNN_UNPREDICTABLE(block_width > 1) {
393*4bdc9457SAndroid Build Coastguard Worker              o = oN;
394*4bdc9457SAndroid Build Coastguard Worker            }
395*4bdc9457SAndroid Build Coastguard Worker            *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
396*4bdc9457SAndroid Build Coastguard Worker        }
397*4bdc9457SAndroid Build Coastguard Worker    }
398*4bdc9457SAndroid Build Coastguard Worker
399*4bdc9457SAndroid Build Coastguard Worker    $if IN_PTRS == "MULTI":
400*4bdc9457SAndroid Build Coastguard Worker      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
401*4bdc9457SAndroid Build Coastguard Worker      $for N in range(1, TILE_SIZE):
402*4bdc9457SAndroid Build Coastguard Worker        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
403*4bdc9457SAndroid Build Coastguard Worker    $else:
404*4bdc9457SAndroid Build Coastguard Worker      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
405*4bdc9457SAndroid Build Coastguard Worker    $if OUT_PTRS == "MULTI":
406*4bdc9457SAndroid Build Coastguard Worker      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
407*4bdc9457SAndroid Build Coastguard Worker      $for N in range(1, TILE_SIZE):
408*4bdc9457SAndroid Build Coastguard Worker        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
409*4bdc9457SAndroid Build Coastguard Worker    $else:
410*4bdc9457SAndroid Build Coastguard Worker      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
411*4bdc9457SAndroid Build Coastguard Worker    block_width = doz(block_width, tile_width);
412*4bdc9457SAndroid Build Coastguard Worker  } while (block_width != 0);
413*4bdc9457SAndroid Build Coastguard Worker}
414