// Auto-generated file. Do not edit!
//   Template: src/f32-ppmm/scalar.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xnnpack/math.h>
#include <xnnpack/ppmm.h>

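// 4x4 scalar f32 PPMM micro-kernel (matrix multiplication on pre-packed
// inputs) with min/max clamping: it produces an mr-row tile of C four columns
// at a time from activations packed 4 values per k-step (a) and weights packed
// 4 values per k-step (w), then clamps the results to the range given in params.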
void xnn_f32_ppmm_minmax_ukernel_4x4__scalar(
  size_t mr,
  size_t nc,
  size_t kc,
  const float*restrict a,
  const float*restrict w,
  float*restrict c,
  size_t cm_stride,
  size_t cn_stride,
  const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);

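  // Set up one output pointer per row. When mr < 4, the unused row pointers
  // alias the last valid row, so their stores become harmless duplicate writes.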
  float* c0 = c;
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    c3 = c2;
  }

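  // Outer loop: produce the output 4 columns at a time until nc is exhausted.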
  do {
    float vacc0x0 = w[0];
    float vacc0x1 = w[1];
    float vacc0x2 = w[2];
    float vacc0x3 = w[3];
    float vacc1x0 = vacc0x0;
    float vacc1x1 = vacc0x1;
    float vacc1x2 = vacc0x2;
    float vacc1x3 = vacc0x3;
    float vacc2x0 = vacc0x0;
    float vacc2x1 = vacc0x1;
    float vacc2x2 = vacc0x2;
    float vacc2x3 = vacc0x3;
    float vacc3x0 = vacc0x0;
    float vacc3x1 = vacc0x1;
    float vacc3x2 = vacc0x2;
    float vacc3x3 = vacc0x3;
    w += 4;

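    // Inner loop over kc bytes: each step reads 4 packed activations (one per
    // row) and 4 packed weights (one per column) and performs 16 multiply-adds.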
    size_t k = kc;
    do {
      const float va0 = a[0];
      const float va1 = a[1];
      const float va2 = a[2];
      const float va3 = a[3];
      a += 4;

      const float vb0 = w[0];
      const float vb1 = w[1];
      const float vb2 = w[2];
      const float vb3 = w[3];
      w += 4;

      vacc0x0 += va0 * vb0;
      vacc1x0 += va1 * vb0;
      vacc2x0 += va2 * vb0;
      vacc3x0 += va3 * vb0;
      vacc0x1 += va0 * vb1;
      vacc1x1 += va1 * vb1;
      vacc2x1 += va2 * vb1;
      vacc3x1 += va3 * vb1;
      vacc0x2 += va0 * vb2;
      vacc1x2 += va1 * vb2;
      vacc2x2 += va2 * vb2;
      vacc3x2 += va3 * vb2;
      vacc0x3 += va0 * vb3;
      vacc1x3 += va1 * vb3;
      vacc2x3 += va2 * vb3;
      vacc3x3 += va3 * vb3;

      k -= sizeof(float);
    } while (k != 0);

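    // Clamp the accumulators to the [min, max] output range from the params.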
    const float vmax = params->scalar.max;
    vacc0x0 = math_min_f32(vacc0x0, vmax);
    vacc1x0 = math_min_f32(vacc1x0, vmax);
    vacc2x0 = math_min_f32(vacc2x0, vmax);
    vacc3x0 = math_min_f32(vacc3x0, vmax);
    vacc0x1 = math_min_f32(vacc0x1, vmax);
    vacc1x1 = math_min_f32(vacc1x1, vmax);
    vacc2x1 = math_min_f32(vacc2x1, vmax);
    vacc3x1 = math_min_f32(vacc3x1, vmax);
    vacc0x2 = math_min_f32(vacc0x2, vmax);
    vacc1x2 = math_min_f32(vacc1x2, vmax);
    vacc2x2 = math_min_f32(vacc2x2, vmax);
    vacc3x2 = math_min_f32(vacc3x2, vmax);
    vacc0x3 = math_min_f32(vacc0x3, vmax);
    vacc1x3 = math_min_f32(vacc1x3, vmax);
    vacc2x3 = math_min_f32(vacc2x3, vmax);
    vacc3x3 = math_min_f32(vacc3x3, vmax);

    const float vmin = params->scalar.min;
    vacc0x0 = math_max_f32(vacc0x0, vmin);
    vacc1x0 = math_max_f32(vacc1x0, vmin);
    vacc2x0 = math_max_f32(vacc2x0, vmin);
    vacc3x0 = math_max_f32(vacc3x0, vmin);
    vacc0x1 = math_max_f32(vacc0x1, vmin);
    vacc1x1 = math_max_f32(vacc1x1, vmin);
    vacc2x1 = math_max_f32(vacc2x1, vmin);
    vacc3x1 = math_max_f32(vacc3x1, vmin);
    vacc0x2 = math_max_f32(vacc0x2, vmin);
    vacc1x2 = math_max_f32(vacc1x2, vmin);
    vacc2x2 = math_max_f32(vacc2x2, vmin);
    vacc3x2 = math_max_f32(vacc3x2, vmin);
    vacc0x3 = math_max_f32(vacc0x3, vmin);
    vacc1x3 = math_max_f32(vacc1x3, vmin);
    vacc2x3 = math_max_f32(vacc2x3, vmin);
    vacc3x3 = math_max_f32(vacc3x3, vmin);

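    // Common case: at least 4 columns remain, so store the full 4x4 tile and
    // advance to the next column block.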
    if XNN_LIKELY(nc >= 4) {
      c3[0] = vacc3x0;
      c3[1] = vacc3x1;
      c3[2] = vacc3x2;
      c3[3] = vacc3x3;
      c2[0] = vacc2x0;
      c2[1] = vacc2x1;
      c2[2] = vacc2x2;
      c2[3] = vacc2x3;
      c1[0] = vacc1x0;
      c1[1] = vacc1x1;
      c1[2] = vacc1x2;
      c1[3] = vacc1x3;
      c0[0] = vacc0x0;
      c0[1] = vacc0x1;
      c0[2] = vacc0x2;
      c0[3] = vacc0x3;

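      // Rewind the packed activations for the next column block: the inner
      // loop advanced a by 4 floats per k-step, i.e. kc * 4 bytes in total.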
      a = (const float*) ((uintptr_t) a - kc * 4);

      c3 = (float*) ((uintptr_t) c3 + cn_stride);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);

      nc -= 4;
    } else {
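      // Remainder: 1-3 columns are left. Store 2 columns if bit 1 of nc is set
      // (shifting the upper accumulators down), then 1 column if bit 0 is set.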
      if (nc & 2) {
        c3[0] = vacc3x0;
        c3[1] = vacc3x1;
        c2[0] = vacc2x0;
        c2[1] = vacc2x1;
        c1[0] = vacc1x0;
        c1[1] = vacc1x1;
        c0[0] = vacc0x0;
        c0[1] = vacc0x1;

        vacc3x0 = vacc3x2;
        vacc2x0 = vacc2x2;
        vacc1x0 = vacc1x2;
        vacc0x0 = vacc0x2;

        c3 += 2;
        c2 += 2;
        c1 += 2;
        c0 += 2;
      }
      if (nc & 1) {
        *c3 = vacc3x0;
        *c2 = vacc2x0;
        *c1 = vacc1x0;
        *c0 = vacc0x0;
      }

      nc = 0;
    }
  } while (nc != 0);
}