xref: /aosp_15_r20/external/XNNPACK/src/f32-dwconv2d-chw/3x3s2p1-scalar.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert ROW_TILE >= 1
7$assert ACCUMULATORS >= 1
8#include <assert.h>
9
10#include <xnnpack/dwconv.h>
11#include <xnnpack/math.h>
12
13
// Template for a scalar float depthwise-convolution micro-kernel with a 3x3
// filter. Each pass of the outer loop produces ROW_TILE output rows, each
// iteration of the inner loop produces one output pixel per row, and every
// output pixel is summed through ACCUMULATORS partial sums.
//
// Geometry, as implemented below:
//   - every output pixel consumes two new input columns, and every pass moves
//     the row pointers down by 2 * ROW_TILE rows (stride 2 in both directions);
//   - the column to the left of the first input pixel is taken as 0.0f;
//   - when a row holds an odd number of floats, the last output pixel is
//     computed from two columns only (the third-column taps are left out);
//   - a top padding row and any rows past the bottom are read from `zero`.
//
// Parameters:
//   input_height  number of input rows; must be non-zero.
//   input_width   size of one input row in BYTES: a non-zero multiple of
//                 sizeof(float). Also used as the byte distance between rows.
//   input         first input row.
//   weights       10 floats: the bias, then the 3x3 taps in row-major order.
//   zero          row substituted for padding rows. It is indexed and advanced
//                 like any other row, so it has to be readable for a full row.
//                 NOTE(review): presumably filled with zeros - confirm with the
//                 caller.
//   output        destination; output rows are written one after another.
//   padding_top   0 or 1.
//   params        supplies the clamp bounds scalar.min and scalar.max.
//
// NOTE(review): the decimal fused to the front of each line in this copy counts
// up one per line and looks like line-number residue from a code browser
// rather than template text - confirm against the upstream file.
14void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_${ROW_TILE}x1${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}(
15    size_t input_height,
16    size_t input_width,
17    const float* input,
18    const float* weights,
19    const float* zero,
20    float* output,
21    uint32_t padding_top,
22    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
23{
24  assert(input_height != 0);
25  assert(input_width != 0);
26  assert(input_width % sizeof(float) == 0);
  // padding_top is uint32_t, so the next check can never fail.
27  assert(padding_top >= 0);
28  assert(padding_top <= 1);
29
  // Output clamp bounds.
30  const float vmin = params->scalar.min;
31  const float vmax = params->scalar.max;
32
  // Bias followed by the filter taps; vkRC is the tap for filter row R and
  // filter column C.
33  const float vbias = weights[0];
34  const float vk00 = weights[1];
35  const float vk01 = weights[2];
36  const float vk02 = weights[3];
37  const float vk10 = weights[4];
38  const float vk11 = weights[5];
39  const float vk12 = weights[6];
40  const float vk20 = weights[7];
41  const float vk21 = weights[8];
42  const float vk22 = weights[9];
43
  // Size of one output row in bytes: (input_width + sizeof(float)) / 2 rounded
  // down to a whole number of floats. Only needed to space the output pointers
  // of a multi-row tile.
44  $if ROW_TILE > 1:
45    const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
46
  // -padding_top is an all-ones 32-bit mask when padding_top is 1 and zero
  // otherwise, so i0 is moved back by one row only when there is top padding.
  // That makes i1 the first real input row in the padded case; i0 itself is
  // then redirected to the zero row.
47  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
48  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
49  if XNN_UNPREDICTABLE(padding_top != 0) {
50    i0 = zero;
51  }
  // Remaining row pointers, each one row (input_width bytes) after the
  // previous; a tile reads 2 * ROW_TILE + 1 input rows in total.
52  $for M in range(2, 1 + 2 * ROW_TILE):
53    const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
54
  // One output pointer per row of the tile.
55  float* o0 = output;
56  $for M in range(1, ROW_TILE):
57    float* o${M} = (float*) ((uintptr_t) o${M-1} + output_width);
58
  // Rows still to be consumed, counting the top padding and one bottom padding
  // row, and the number of output rows that yields: (padded height - 1) / 2.
59  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
60  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
61  do {
    // Rows that fall past the remaining padded height are read from the zero
    // row. When that also leaves an output row of the tile without a
    // destination, its pointer is aliased to the previous output row; stores
    // below run from the highest row index down, so the lower row's value is
    // the one that ends up in the shared location.
62    $for M in range(2, 1 + 2 * ROW_TILE):
63      if XNN_UNPREDICTABLE(padded_input_height < ${2 + M}) {
64        i${M} = zero;
65        $if M % 2 == 1:
66          o${(M - 1) // 2} = o${(M - 1) // 2 - 1};
67      }
68
    // Left-hand column of the 3-wide window; 0.0f at the start of every row
    // stands in for the column before the first input pixel.
69    $for M in range(1 + 2 * ROW_TILE):
70      float vi${M}x0 = 0.0f;
71
    // w counts the input bytes left in the row. Each iteration consumes two
    // floats per row and emits one output pixel per row of the tile.
72    size_t w = input_width;
73    for (; w >= 2 * sizeof(float); w -= 2 * sizeof(float)) {
      // Middle column of the window.
74      $for M in range(1 + 2 * ROW_TILE):
75        const float vi${M}x1 = i${M}[0];
76
      // Left-column taps. Output row M of the tile reads input rows 2*M,
      // 2*M + 1 and 2*M + 2. Partial sum 0 starts from the bias; partial sums
      // with an index below ACCUMULATORS are declared by their first product,
      // later products are added into an existing partial sum.
77      $for K in range(3):
78        $for M in range(ROW_TILE):
79          $if K == 0:
80            float vo${M}p0 = vbias + vi${2*M+K}x0 * vk${K}0;
81          $elif K < ACCUMULATORS:
82            float vo${M}p${K} = vi${2*M+K}x0 * vk${K}0;
83          $else:
84            vo${M}p${K % ACCUMULATORS} += vi${2*M+K}x0 * vk${K}0;
85
      // Right-hand column of the window, then step every row pointer past the
      // two floats consumed by this iteration.
86      $for M in range(1 + 2 * ROW_TILE):
87        const float vi${M}x2 = i${M}[1];
88        i${M} += 2;
89
      // Middle-column taps; partial-sum indices continue from 3.
90      $for K in range(3):
91        $for M in range(ROW_TILE):
92          $if K + 3 < ACCUMULATORS:
93            float vo${M}p${K+3} = vi${2*M+K}x1 * vk${K}1;
94          $else:
95            vo${M}p${(K+3) % ACCUMULATORS} += vi${2*M+K}x1 * vk${K}1;
96
      // The right-hand column becomes the left-hand column of the next window.
97      $for M in range(1 + 2 * ROW_TILE):
98        vi${M}x0 = vi${M}x2;
99
      // Right-column taps; always added into an already declared partial sum.
100      $for K in range(3):
101        $for M in range(ROW_TILE):
102          vo${M}p${(K+6) % ACCUMULATORS} += vi${2*M+K}x2 * vk${K}2;
103
      // Fold the partial sums pairwise, doubling the distance each round,
      // until the total sits in partial sum 0.
104      $if ACCUMULATORS > 1:
105        $ACC_SLICE = 1
106        $while ACC_SLICE < ACCUMULATORS:
107          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
108            $if A + ACC_SLICE < ACCUMULATORS:
109              $for M in range(ROW_TILE):
110                vo${M}p${A} += vo${M}p${A + ACC_SLICE};
111          $ACC_SLICE *= 2
112
      // Clamp from below, then from above.
113      $for M in range(ROW_TILE):
114        float vo${M} = math_max_f32(vo${M}p0, vmin);
115
116      $for M in range(ROW_TILE):
117        vo${M} = math_min_f32(vo${M}, vmax);
118
      // Store from the highest row index down (see the aliasing note at the
      // top of the outer loop).
119      $for M in reversed(range(ROW_TILE)):
120        *o${M}++ = vo${M};
121    }
122    // Potentially process the last pixel.
    // Reached with w == sizeof(float) when the row holds an odd number of
    // floats: only the left and middle columns contribute, the third-column
    // taps are left out.
123    assert(w <= 1 * sizeof(float));
124    if (w != 0) {
125      $for M in range(1 + 2 * ROW_TILE):
126        const float vi${M}x1 = *i${M}++;
127
      // Left-column taps, same partial-sum layout as in the main loop.
128      $for K in range(3):
129        $for M in range(ROW_TILE):
130          $if K == 0:
131            float vo${M}p0 = vbias + vi${2*M+K}x0 * vk${K}0;
132          $elif K < ACCUMULATORS:
133            float vo${M}p${K} = vi${2*M+K}x0 * vk${K}0;
134          $else:
135            vo${M}p${K % ACCUMULATORS} += vi${2*M+K}x0 * vk${K}0;
136
      // Middle-column taps.
137      $for K in range(3):
138        $for M in range(ROW_TILE):
139          $if K + 3 < ACCUMULATORS:
140            float vo${M}p${K+3} = vi${2*M+K}x1 * vk${K}1;
141          $else:
142            vo${M}p${(K+3) % ACCUMULATORS} += vi${2*M+K}x1 * vk${K}1;
143
      // Same pairwise fold of the partial sums as in the main loop.
144      $if ACCUMULATORS > 1:
145        $ACC_SLICE = 1
146        $while ACC_SLICE < ACCUMULATORS:
147          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
148            $if A + ACC_SLICE < ACCUMULATORS:
149              $for M in range(ROW_TILE):
150                vo${M}p${A} += vo${M}p${A + ACC_SLICE};
151          $ACC_SLICE *= 2
152
      // Clamp and store, as in the main loop.
153      $for M in range(ROW_TILE):
154        float vo${M} = math_max_f32(vo${M}p0, vmin);
155
156      $for M in range(ROW_TILE):
157        vo${M} = math_min_f32(vo${M}, vmax);
158
159      $for M in reversed(range(ROW_TILE)):
160        *o${M}++ = vo${M};
161    }
162
    // Every row pointer has moved forward by one full row during the pass, so
    // the last two pointers of the tile now sit where the next tile's first
    // two rows start; the rest are rebuilt one row apart from there.
163    i0 = (const float*) ((uintptr_t) i${2 * ROW_TILE - 1});
164    i1 = (const float*) ((uintptr_t) i${2 * ROW_TILE});
165    $for M in range(2, 1 + 2 * ROW_TILE):
166      i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
167
    // The last output pointer of the tile has advanced to the end of its row,
    // which is where the next tile's first output row begins.
168    $if ROW_TILE > 1:
169      o0 = o${ROW_TILE - 1};
170      $for M in range(1, ROW_TILE):
171        o${M} = (float*) ((uintptr_t) o${M-1} + output_width);
172
    // Account for the rows just processed. doz() is declared outside this
    // file. NOTE(review): presumably a subtraction that saturates at zero, so
    // a partial final tile still terminates the loop - confirm.
173    $if ROW_TILE > 1:
174      output_height = doz(output_height, ${ROW_TILE});
175      padded_input_height = doz(padded_input_height, ${ROW_TILE * 2});
176    $else:
177      output_height -= 1;
178      padded_input_height -= 2;
179  } while (output_height != 0);
180}
181