// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/3x3p1-scalar.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1(size_t input_height,size_t input_width,const float * input,const float * weights,const float * zero,float * output,uint32_t padding_top,const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1(
17 size_t input_height,
18 size_t input_width,
19 const float* input,
20 const float* weights,
21 const float* zero,
22 float* output,
23 uint32_t padding_top,
24 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
25 {
26 assert(input_height != 0);
27 assert(input_width != 0);
28 assert(input_width % sizeof(float) == 0);
29 assert(padding_top == 1);
30
31 const float vmin = params->scalar.min;
32 const float vmax = params->scalar.max;
33
34 const float vbias = weights[0];
35 const float vk00 = weights[1];
36 const float vk01 = weights[2];
37 const float vk02 = weights[3];
38 const float vk10 = weights[4];
39 const float vk11 = weights[5];
40 const float vk12 = weights[6];
41 const float vk20 = weights[7];
42 const float vk21 = weights[8];
43 const float vk22 = weights[9];
44
45 const float* i0 = zero;
46 const float* i1 = input;
47 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
48 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
49 const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
50
51 float* o0 = output;
52 float* o1 = (float*) ((uintptr_t) o0 + input_width);
53 float* o2 = (float*) ((uintptr_t) o1 + input_width);
54
55 size_t output_height = input_height;
56 do {
57 if XNN_UNPREDICTABLE(output_height < 2) {
58 i2 = zero;
59 o1 = o0;
60 }
61 if XNN_UNPREDICTABLE(output_height < 3) {
62 i3 = zero;
63 o2 = o1;
64 }
65 if XNN_UNPREDICTABLE(output_height < 4) {
66 i4 = zero;
67 }
68
69 float vi0x0 = 0.0f;
70 float vi1x0 = 0.0f;
71 float vi2x0 = 0.0f;
72 float vi3x0 = 0.0f;
73 float vi4x0 = 0.0f;
74
75 float vi0x1 = *i0++;
76 float vi1x1 = *i1++;
77 float vi2x1 = *i2++;
78 float vi3x1 = *i3++;
79 float vi4x1 = *i4++;
80
81 size_t w = input_width;
82 for (; w > 1 * sizeof(float); w -= 1 * sizeof(float)) {
83 const float vi0x2 = *i0++;
84 const float vi1x2 = *i1++;
85 const float vi2x2 = *i2++;
86 const float vi3x2 = *i3++;
87 const float vi4x2 = *i4++;
88
89 float vo0p0 = vbias + vi0x0 * vk00;
90 float vo1p0 = vbias + vi1x0 * vk00;
91 float vo2p0 = vbias + vi2x0 * vk00;
92 vo0p0 += vi1x0 * vk10;
93 vo1p0 += vi2x0 * vk10;
94 vo2p0 += vi3x0 * vk10;
95 vo0p0 += vi2x0 * vk20;
96 vo1p0 += vi3x0 * vk20;
97 vo2p0 += vi4x0 * vk20;
98
99 vi0x0 = vi0x1;
100 vi1x0 = vi1x1;
101 vi2x0 = vi2x1;
102 vi3x0 = vi3x1;
103 vi4x0 = vi4x1;
104
105 vo0p0 += vi0x1 * vk01;
106 vo1p0 += vi1x1 * vk01;
107 vo2p0 += vi2x1 * vk01;
108 vo0p0 += vi1x1 * vk11;
109 vo1p0 += vi2x1 * vk11;
110 vo2p0 += vi3x1 * vk11;
111 vo0p0 += vi2x1 * vk21;
112 vo1p0 += vi3x1 * vk21;
113 vo2p0 += vi4x1 * vk21;
114
115 vi0x1 = vi0x2;
116 vi1x1 = vi1x2;
117 vi2x1 = vi2x2;
118 vi3x1 = vi3x2;
119 vi4x1 = vi4x2;
120
121 vo0p0 += vi0x2 * vk02;
122 vo1p0 += vi1x2 * vk02;
123 vo2p0 += vi2x2 * vk02;
124 vo0p0 += vi1x2 * vk12;
125 vo1p0 += vi2x2 * vk12;
126 vo2p0 += vi3x2 * vk12;
127 vo0p0 += vi2x2 * vk22;
128 vo1p0 += vi3x2 * vk22;
129 vo2p0 += vi4x2 * vk22;
130
131
132 float vo0 = math_max_f32(vo0p0, vmin);
133 float vo1 = math_max_f32(vo1p0, vmin);
134 float vo2 = math_max_f32(vo2p0, vmin);
135
136 vo0 = math_min_f32(vo0, vmax);
137 vo1 = math_min_f32(vo1, vmax);
138 vo2 = math_min_f32(vo2, vmax);
139
140 *o2++ = vo2;
141 *o1++ = vo1;
142 *o0++ = vo0;
143 }
144 // Always process the last pixel separately to account for right edge.
145 assert(w == 1 * sizeof(float));
146 {
147 float vo0p0 = vbias + vi0x0 * vk00;
148 float vo1p0 = vbias + vi1x0 * vk00;
149 float vo2p0 = vbias + vi2x0 * vk00;
150 vo0p0 += vi1x0 * vk10;
151 vo1p0 += vi2x0 * vk10;
152 vo2p0 += vi3x0 * vk10;
153 vo0p0 += vi2x0 * vk20;
154 vo1p0 += vi3x0 * vk20;
155 vo2p0 += vi4x0 * vk20;
156
157 vo0p0 += vi0x1 * vk01;
158 vo1p0 += vi1x1 * vk01;
159 vo2p0 += vi2x1 * vk01;
160 vo0p0 += vi1x1 * vk11;
161 vo1p0 += vi2x1 * vk11;
162 vo2p0 += vi3x1 * vk11;
163 vo0p0 += vi2x1 * vk21;
164 vo1p0 += vi3x1 * vk21;
165 vo2p0 += vi4x1 * vk21;
166
167
168 float vo0 = math_max_f32(vo0p0, vmin);
169 float vo1 = math_max_f32(vo1p0, vmin);
170 float vo2 = math_max_f32(vo2p0, vmin);
171
172 vo0 = math_min_f32(vo0, vmax);
173 vo1 = math_min_f32(vo1, vmax);
174 vo2 = math_min_f32(vo2, vmax);
175
176 *o2++ = vo2;
177 *o1++ = vo1;
178 *o0++ = vo0;
179 }
180
181 i0 = (const float*) ((uintptr_t) i3 - input_width);
182 i1 = (const float*) ((uintptr_t) i4 - input_width);
183 i2 = (const float*) ((uintptr_t) i1 + input_width);
184 i3 = (const float*) ((uintptr_t) i2 + input_width);
185 i4 = (const float*) ((uintptr_t) i3 + input_width);
186
187 o0 = o2;
188 o1 = (float*) ((uintptr_t) o0 + input_width);
189 o2 = (float*) ((uintptr_t) o1 + input_width);
190
191 output_height = doz(output_height, 3);
192 } while (output_height != 0);
193 }
194