xref: /aosp_15_r20/external/XNNPACK/src/f32-dwconv2d-chw/3x3p1-scalar.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert ROW_TILE >= 1
7$assert ACCUMULATORS >= 1
8#include <assert.h>
9
10#include <xnnpack/dwconv.h>
11#include <xnnpack/math.h>
12
13
// Scalar 3x3 depthwise-convolution micro-kernel template (one pixel per inner
// iteration, ROW_TILE output rows per outer iteration, ACCUMULATORS partial
// sums per output value).
//
// The kernel writes one output row per input row and one output value per
// input pixel. Borders are handled as follows:
//   - top:    the row above the first input row is read from `zero`
//             (padding_top must be 1);
//   - bottom: rows past the last input row are read from `zero`;
//   - left:   the column left of the first pixel is the constant 0.0f;
//   - right:  the last pixel of each row is computed without the third
//             kernel column.
//
// Parameters:
//   input_height - number of rows to process; must be non-zero.
//   input_width  - length of one row in BYTES; must be a non-zero multiple
//                  of sizeof(float). It is also used as the row stride of
//                  both `input` and `output`.
//   input        - first input row.
//   weights      - 10 floats: bias, then the 3x3 taps in row-major order.
//   zero         - row that stands in for rows outside the image; up to
//                  input_width bytes are read from it (presumably filled with
//                  zeros by the caller - confirm at the call site).
//   output       - first output row.
//   padding_top  - must be 1.
//   params       - supplies the scalar.min / scalar.max clamping bounds.
14void xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_${ROW_TILE}x1${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}(
15    size_t input_height,
16    size_t input_width,
17    const float* input,
18    const float* weights,
19    const float* zero,
20    float* output,
21    uint32_t padding_top,
22    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
23{
24  assert(input_height != 0);
25  assert(input_width != 0);
26  assert(input_width % sizeof(float) == 0);
27  assert(padding_top == 1);
28
  // Output clamping bounds.
29  const float vmin = params->scalar.min;
30  const float vmax = params->scalar.max;
31
  // weights[0] is the bias; vkRC is the tap at kernel row R, kernel column C.
32  const float vbias = weights[0];
33  const float vk00 = weights[1];
34  const float vk01 = weights[2];
35  const float vk02 = weights[3];
36  const float vk10 = weights[4];
37  const float vk11 = weights[5];
38  const float vk12 = weights[6];
39  const float vk20 = weights[7];
40  const float vk21 = weights[8];
41  const float vk22 = weights[9];
42
  // Input row pointers: i0 is the (padding) row above the first output row,
  // i1 is the first real input row, and each following pointer is one row
  // (input_width bytes) further down.
43  const float* i0 = zero;
44  const float* i1 = input;
45  $for M in range(2, 2 + ROW_TILE):
46    const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
47
  // Output row pointers, one per row of the tile, input_width bytes apart.
48  float* o0 = output;
49  $for M in range(1, ROW_TILE):
50    float* o${M} = (float*) ((uintptr_t) o${M-1} + input_width);
51
52  size_t output_height = input_height;
53  do {
    // Rows below the remaining image are read from `zero`; the output pointer
    // of a row that does not exist is aliased to the previous row's pointer so
    // its store stays inside valid output memory.
54    $for M in range(2, 2 + ROW_TILE):
55      if XNN_UNPREDICTABLE(output_height < ${M}) {
56        i${M} = zero;
57        $if M <= ROW_TILE:
58          o${M-1} = o${M-2};
59      }
60
    // Sliding 3-column window per input row: x0 = left, x1 = center,
    // x2 = right. The left column starts as 0.0f (left padding).
61    $for M in range(2 + ROW_TILE):
62      float vi${M}x0 = 0.0f;
63
    // Preload the first pixel of every row as the window center.
64    $for M in range(2 + ROW_TILE):
65      float vi${M}x1 = *i${M}++;
66
    // w counts the remaining pixels of the row in bytes. This loop handles
    // every pixel except the last one.
67    size_t w = input_width;
68    for (; w > 1 * sizeof(float); w -= 1 * sizeof(float)) {
      // Load the right-hand neighbour of the current pixel.
69      $for M in range(2 + ROW_TILE):
70        const float vi${M}x2 = *i${M}++;
71
      // Left kernel column; the bias seeds accumulator p0.
72      $for K in range(3):
73        $for M in range(ROW_TILE):
74          $if K == 0:
75            float vo${M}p0 = vbias + vi${M+K}x0 * vk${K}0;
76          $elif K < ACCUMULATORS:
77            float vo${M}p${K} = vi${M+K}x0 * vk${K}0;
78          $else:
79            vo${M}p${K % ACCUMULATORS} += vi${M+K}x0 * vk${K}0;
80
      // Slide the window: the center becomes the left column.
81      $for M in range(2 + ROW_TILE):
82          vi${M}x0 = vi${M}x1;
83
      // Center kernel column.
84      $for K in range(3):
85        $for M in range(ROW_TILE):
86          $if K+3 < ACCUMULATORS:
87            float vo${M}p${K+3} = vi${M+K}x1 * vk${K}1;
88          $else:
89            vo${M}p${(K+3) % ACCUMULATORS} += vi${M+K}x1 * vk${K}1;
90
      // Slide the window: the right column becomes the center.
91      $for M in range(2 + ROW_TILE):
92        vi${M}x1 = vi${M}x2;
93
      // Right kernel column.
94      $for K in range(3):
95        $for M in range(ROW_TILE):
96          vo${M}p${(K+6) % ACCUMULATORS} += vi${M+K}x2 * vk${K}2;
97
98      $if ACCUMULATORS > 1:
        // Fold the partial accumulators pairwise into accumulator p0.
99        $ACC_SLICE = 1
100        $while ACC_SLICE < ACCUMULATORS:
101          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
102            $if A + ACC_SLICE < ACCUMULATORS:
103              $for M in range(ROW_TILE):
104                vo${M}p${A} += vo${M}p${A + ACC_SLICE};
105          $ACC_SLICE *= 2
106
      // Clamp to [vmin, vmax]: lower bound first, then upper bound.
107      $for M in range(ROW_TILE):
108        float vo${M} = math_max_f32(vo${M}p0, vmin);
109
110      $for M in range(ROW_TILE):
111        vo${M} = math_min_f32(vo${M}, vmax);
112
      // Store from the highest row of the tile down to row 0, so that when
      // output pointers were aliased above, the lowest row's value is the one
      // written last.
113      $for M in reversed(range(ROW_TILE)):
114        *o${M}++ = vo${M};
115    }
116    // Always process the last pixel separately to account for right edge.
117    assert(w == 1 * sizeof(float));
118    {
      // Left kernel column; the bias seeds accumulator p0.
119      $for K in range(3):
120        $for M in range(ROW_TILE):
121          $if K == 0:
122            float vo${M}p0 = vbias + vi${M+K}x0 * vk${K}0;
123          $elif K < ACCUMULATORS:
124            float vo${M}p${K} = vi${M+K}x0 * vk${K}0;
125          $else:
126            vo${M}p${K % ACCUMULATORS} += vi${M+K}x0 * vk${K}0;
127
      // Center kernel column. There is no right-column term here: the pixel to
      // the right of the last one is padding and contributes nothing.
128      $for K in range(3):
129        $for M in range(ROW_TILE):
130          $if K+3 < ACCUMULATORS:
131            float vo${M}p${K+3} = vi${M+K}x1 * vk${K}1;
132          $else:
133            vo${M}p${(K+3) % ACCUMULATORS} += vi${M+K}x1 * vk${K}1;
134
135      $if ACCUMULATORS > 1:
        // Fold the partial accumulators pairwise into accumulator p0.
136        $ACC_SLICE = 1
137        $while ACC_SLICE < ACCUMULATORS:
138          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
139            $if A + ACC_SLICE < ACCUMULATORS:
140              $for M in range(ROW_TILE):
141                vo${M}p${A} += vo${M}p${A + ACC_SLICE};
142          $ACC_SLICE *= 2
143
      // Clamp to [vmin, vmax]: lower bound first, then upper bound.
144      $for M in range(ROW_TILE):
145        float vo${M} = math_max_f32(vo${M}p0, vmin);
146
147      $for M in range(ROW_TILE):
148        vo${M} = math_min_f32(vo${M}, vmax);
149
      // Same highest-row-first store order as in the main pixel loop.
150      $for M in reversed(range(ROW_TILE)):
151        *o${M}++ = vo${M};
152    }
153
    // Every input pointer has advanced by exactly one row (input_width bytes)
    // during this pass, so subtracting input_width rewinds a pointer to the
    // start of the row it just traversed. The new i0 is the last row that was
    // centered in this pass. With a single-row tile the remaining pointers
    // already sit on the rows needed next; with a larger tile they are
    // rebuilt from the rewound pointers.
154    i0 = (const float*) ((uintptr_t) i${ROW_TILE} - input_width);
155    $if ROW_TILE > 1:
156      i1 = (const float*) ((uintptr_t) i${ROW_TILE+1} - input_width);
157      $for M in range(2, 2 + ROW_TILE):
158        i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
159
    // The last output pointer of the tile has advanced to the start of the
    // row that follows it; the next tile begins there.
160    $if ROW_TILE > 1:
161      o0 = o${ROW_TILE - 1};
162      $for M in range(1, ROW_TILE):
163        o${M} = (float*) ((uintptr_t) o${M-1} + input_width);
164
    // Consume the rows just produced. With a single-row tile the decrement is
    // folded into the loop condition below. NOTE(review): doz() is defined
    // outside this file - presumably a subtraction that saturates at zero, so
    // a partial final tile ends the loop; confirm against its definition.
165    $if ROW_TILE > 1:
166      output_height = doz(output_height, ${ROW_TILE});
167  } while (${"--" if ROW_TILE == 1 else ""}output_height != 0);
168}
169