1// Copyright 2022 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert SAMPLE_TILE >= 1 7#include <assert.h> 8#include <stddef.h> 9#include <stdint.h> 10 11#include <xnnpack/math.h> 12#include <xnnpack/fft.h> 13 14 15void xnn_cs16_bfly4_ukernel__scalar_x${SAMPLE_TILE}( 16 size_t samples, 17 int16_t* data, 18 const size_t stride, 19 const int16_t* twiddle) 20{ 21 const int16_t* tw1 = twiddle; 22 const int16_t* tw2 = twiddle; 23 const int16_t* tw3 = twiddle; 24 int16_t* data0 = data; 25 int16_t* data1 = data + samples * 2; 26 int16_t* data2 = data + samples * 4; 27 int16_t* data3 = data + samples * 6; 28 29 assert(samples != 0); 30 assert(data != NULL); 31 assert(stride != 0); 32 assert(twiddle != NULL); 33 34 $if SAMPLE_TILE > 1: 35 for (; samples >= ${SAMPLE_TILE}; samples -= ${SAMPLE_TILE}) { 36 $for C in range(SAMPLE_TILE): 37 int32_t vout0r${C} = (int32_t) data0[${C * 2 + 0}]; 38 int32_t vout0i${C} = (int32_t) data0[${C * 2 + 1}]; 39 $for C in range(SAMPLE_TILE): 40 int32_t vout1r${C} = (int32_t) data1[${C * 2 + 0}]; 41 int32_t vout1i${C} = (int32_t) data1[${C * 2 + 1}]; 42 $for C in range(SAMPLE_TILE): 43 int32_t vout2r${C} = (int32_t) data2[${C * 2 + 0}]; 44 int32_t vout2i${C} = (int32_t) data2[${C * 2 + 1}]; 45 $for C in range(SAMPLE_TILE): 46 int32_t vout3r${C} = (int32_t) data3[${C * 2 + 0}]; 47 int32_t vout3i${C} = (int32_t) data3[${C * 2 + 1}]; 48 49 $for C in range(SAMPLE_TILE): 50 const int32_t vtw1r${C} = (const int32_t) tw1[0]; 51 const int32_t vtw1i${C} = (const int32_t) tw1[1]; 52 tw1 += stride * 2; 53 $for C in range(SAMPLE_TILE): 54 const int32_t vtw2r${C} = (const int32_t) tw2[0]; 55 const int32_t vtw2i${C} = (const int32_t) tw2[1]; 56 tw2 += stride * 4; 57 $for C in range(SAMPLE_TILE): 58 const int32_t vtw3r${C} = (const int32_t) tw3[0]; 59 const int32_t vtw3i${C} = (const int32_t) tw3[1]; 60 tw3 += stride * 6; 61 62 // Note 32767 / 4 = 8191. Should be 8192. 63 $for C in range(SAMPLE_TILE): 64 vout0r${C} = math_asr_s32(vout0r${C} * 8191 + 16384, 15); 65 $for C in range(SAMPLE_TILE): 66 vout0i${C} = math_asr_s32(vout0i${C} * 8191 + 16384, 15); 67 $for C in range(SAMPLE_TILE): 68 vout1r${C} = math_asr_s32(vout1r${C} * 8191 + 16384, 15); 69 $for C in range(SAMPLE_TILE): 70 vout1i${C} = math_asr_s32(vout1i${C} * 8191 + 16384, 15); 71 $for C in range(SAMPLE_TILE): 72 vout2r${C} = math_asr_s32(vout2r${C} * 8191 + 16384, 15); 73 $for C in range(SAMPLE_TILE): 74 vout2i${C} = math_asr_s32(vout2i${C} * 8191 + 16384, 15); 75 $for C in range(SAMPLE_TILE): 76 vout3r${C} = math_asr_s32(vout3r${C} * 8191 + 16384, 15); 77 $for C in range(SAMPLE_TILE): 78 vout3i${C} = math_asr_s32(vout3i${C} * 8191 + 16384, 15); 79 80 $for C in range(SAMPLE_TILE): 81 const int32_t vtmp0r${C} = math_asr_s32(vout1r${C} * vtw1r${C} - vout1i${C} * vtw1i${C} + 16384, 15); 82 $for C in range(SAMPLE_TILE): 83 const int32_t vtmp0i${C} = math_asr_s32(vout1r${C} * vtw1i${C} + vout1i${C} * vtw1r${C} + 16384, 15); 84 $for C in range(SAMPLE_TILE): 85 const int32_t vtmp1r${C} = math_asr_s32(vout2r${C} * vtw2r${C} - vout2i${C} * vtw2i${C} + 16384, 15); 86 $for C in range(SAMPLE_TILE): 87 const int32_t vtmp1i${C} = math_asr_s32(vout2r${C} * vtw2i${C} + vout2i${C} * vtw2r${C} + 16384, 15); 88 $for C in range(SAMPLE_TILE): 89 const int32_t vtmp2r${C} = math_asr_s32(vout3r${C} * vtw3r${C} - vout3i${C} * vtw3i${C} + 16384, 15); 90 $for C in range(SAMPLE_TILE): 91 const int32_t vtmp2i${C} = math_asr_s32(vout3r${C} * vtw3i${C} + vout3i${C} * vtw3r${C} + 16384, 15); 92 93 $for C in range(SAMPLE_TILE): 94 const int32_t vtmp5r${C} = vout0r${C} - vtmp1r${C}; 95 $for C in range(SAMPLE_TILE): 96 const int32_t vtmp5i${C} = vout0i${C} - vtmp1i${C}; 97 $for C in range(SAMPLE_TILE): 98 vout0r${C} += vtmp1r${C}; 99 $for C in range(SAMPLE_TILE): 100 vout0i${C} += vtmp1i${C}; 101 $for C in range(SAMPLE_TILE): 102 const int32_t vtmp3r${C} = vtmp0r${C} + vtmp2r${C}; 103 $for C in range(SAMPLE_TILE): 104 const int32_t vtmp3i${C} = vtmp0i${C} + vtmp2i${C}; 105 $for C in range(SAMPLE_TILE): 106 const int32_t vtmp4r${C} = vtmp0r${C} - vtmp2r${C}; 107 $for C in range(SAMPLE_TILE): 108 const int32_t vtmp4i${C} = vtmp0i${C} - vtmp2i${C}; 109 $for C in range(SAMPLE_TILE): 110 vout2r${C} = vout0r${C} - vtmp3r${C}; 111 $for C in range(SAMPLE_TILE): 112 vout2i${C} = vout0i${C} - vtmp3i${C}; 113 $for C in range(SAMPLE_TILE): 114 vout0r${C} += vtmp3r${C}; 115 $for C in range(SAMPLE_TILE): 116 vout0i${C} += vtmp3i${C}; 117 $for C in range(SAMPLE_TILE): 118 vout1r${C} = vtmp5r${C} + vtmp4i${C}; 119 $for C in range(SAMPLE_TILE): 120 vout1i${C} = vtmp5i${C} - vtmp4r${C}; 121 $for C in range(SAMPLE_TILE): 122 vout3r${C} = vtmp5r${C} - vtmp4i${C}; 123 $for C in range(SAMPLE_TILE): 124 vout3i${C} = vtmp5i${C} + vtmp4r${C}; 125 126 $for C in range(SAMPLE_TILE): 127 data0[${C * 2 + 0}] = (int16_t) vout0r${C}; 128 data0[${C * 2 + 1}] = (int16_t) vout0i${C}; 129 data0 += ${SAMPLE_TILE} * 2; 130 $for C in range(SAMPLE_TILE): 131 data1[${C * 2 + 0}] = (int16_t) vout1r${C}; 132 data1[${C * 2 + 1}] = (int16_t) vout1i${C}; 133 data1 += ${SAMPLE_TILE} * 2; 134 $for C in range(SAMPLE_TILE): 135 data2[${C * 2 + 0}] = (int16_t) vout2r${C}; 136 data2[${C * 2 + 1}] = (int16_t) vout2i${C}; 137 data2 += ${SAMPLE_TILE} * 2; 138 $for C in range(SAMPLE_TILE): 139 data3[${C * 2 + 0}] = (int16_t) vout3r${C}; 140 data3[${C * 2 + 1}] = (int16_t) vout3i${C}; 141 data3 += ${SAMPLE_TILE} * 2; 142 } 143 144 if XNN_UNLIKELY(samples != 0) { 145 do { 146 int32_t vout0r = (int32_t) data0[0]; 147 int32_t vout0i = (int32_t) data0[1]; 148 int32_t vout1r = (int32_t) data1[0]; 149 int32_t vout1i = (int32_t) data1[1]; 150 int32_t vout2r = (int32_t) data2[0]; 151 int32_t vout2i = (int32_t) data2[1]; 152 int32_t vout3r = (int32_t) data3[0]; 153 int32_t vout3i = (int32_t) data3[1]; 154 155 const int32_t vtw1r = (const int32_t) tw1[0]; 156 const int32_t vtw1i = (const int32_t) tw1[1]; 157 const int32_t vtw2r = (const int32_t) tw2[0]; 158 const int32_t vtw2i = (const int32_t) tw2[1]; 159 const int32_t vtw3r = (const int32_t) tw3[0]; 160 const int32_t vtw3i = (const int32_t) tw3[1]; 161 tw1 += stride * 2; 162 tw2 += stride * 4; 163 tw3 += stride * 6; 164 165 // Note 32767 / 4 = 8191. Should be 8192. 166 vout0r = math_asr_s32(vout0r * 8191 + 16384, 15); 167 vout0i = math_asr_s32(vout0i * 8191 + 16384, 15); 168 vout1r = math_asr_s32(vout1r * 8191 + 16384, 15); 169 vout1i = math_asr_s32(vout1i * 8191 + 16384, 15); 170 vout2r = math_asr_s32(vout2r * 8191 + 16384, 15); 171 vout2i = math_asr_s32(vout2i * 8191 + 16384, 15); 172 vout3r = math_asr_s32(vout3r * 8191 + 16384, 15); 173 vout3i = math_asr_s32(vout3i * 8191 + 16384, 15); 174 175 const int32_t vtmp0r = math_asr_s32(vout1r * vtw1r - vout1i * vtw1i + 16384, 15); 176 const int32_t vtmp0i = math_asr_s32(vout1r * vtw1i + vout1i * vtw1r + 16384, 15); 177 const int32_t vtmp1r = math_asr_s32(vout2r * vtw2r - vout2i * vtw2i + 16384, 15); 178 const int32_t vtmp1i = math_asr_s32(vout2r * vtw2i + vout2i * vtw2r + 16384, 15); 179 const int32_t vtmp2r = math_asr_s32(vout3r * vtw3r - vout3i * vtw3i + 16384, 15); 180 const int32_t vtmp2i = math_asr_s32(vout3r * vtw3i + vout3i * vtw3r + 16384, 15); 181 182 const int32_t vtmp5r = vout0r - vtmp1r; 183 const int32_t vtmp5i = vout0i - vtmp1i; 184 vout0r += vtmp1r; 185 vout0i += vtmp1i; 186 const int32_t vtmp3r = vtmp0r + vtmp2r; 187 const int32_t vtmp3i = vtmp0i + vtmp2i; 188 const int32_t vtmp4r = vtmp0r - vtmp2r; 189 const int32_t vtmp4i = vtmp0i - vtmp2i; 190 vout2r = vout0r - vtmp3r; 191 vout2i = vout0i - vtmp3i; 192 193 vout0r += vtmp3r; 194 vout0i += vtmp3i; 195 196 vout1r = vtmp5r + vtmp4i; 197 vout1i = vtmp5i - vtmp4r; 198 vout3r = vtmp5r - vtmp4i; 199 vout3i = vtmp5i + vtmp4r; 200 201 data0[0] = (int16_t) vout0r; 202 data0[1] = (int16_t) vout0i; 203 data1[0] = (int16_t) vout1r; 204 data1[1] = (int16_t) vout1i; 205 data2[0] = (int16_t) vout2r; 206 data2[1] = (int16_t) vout2i; 207 data3[0] = (int16_t) vout3r; 208 data3[1] = (int16_t) vout3i; 209 data0 += 2; 210 data1 += 2; 211 data2 += 2; 212 data3 += 2; 213 } while(--samples != 0); 214 } 215} 216