/*
 * jdsample-neon.c - upsampling (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"

#include <arm_neon.h>


/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            |         |         |         |
 *            | p0   p1 | p2   p3 | p4   p5 |
 *            |         |         |         |
 *            +---------+---------+---------+
 *
 * Samples s0-s2 were created by averaging the original pixel component values
 * centered at positions p0-p5 above.  To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each row.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1.  For example:
 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
 * When computing the first and last pixel component values in the row, there
 * is no adjacent sample to blend, so:
 *     p0(upsampled) = s0
 *     p5(upsampled) = s2
 */
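
/* For reference, a minimal scalar sketch of the blend above.  It mirrors the
 * arithmetic of the vector code below, including the +2 rounding term and the
 * +1 ordered dithering bias.  The helper name is illustrative only (not part
 * of the library), and the block is compiled out.
 */
#if 0
static void h2v1_fancy_upsample_row(JSAMPROW inptr, JSAMPROW outptr,
                                    JDIMENSION downsampled_width)
{
  JDIMENSION i;

  /* p0(upsampled) = s0 */
  outptr[0] = (JSAMPLE)GETJSAMPLE(inptr[0]);
  for (i = 1; i < downsampled_width; i++) {
    int left = GETJSAMPLE(inptr[i - 1]), right = GETJSAMPLE(inptr[i]);
    /* 3/4 * left sample + 1/4 * right sample (+2 rounds the right-shift) */
    outptr[2 * i - 1] = (JSAMPLE)((left * 3 + right + 2) >> 2);
    /* 3/4 * right sample + 1/4 * left sample (+1 is the dithering bias) */
    outptr[2 * i] = (JSAMPLE)((right * 3 + left + 1) >> 2);
  }
  /* p5(upsampled) = s2 */
  outptr[2 * downsampled_width - 1] =
    (JSAMPLE)GETJSAMPLE(inptr[downsampled_width - 1]);
}
#endif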

void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    /* First pixel component value in this row of the original image */
    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);

    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
     * For p1: containing sample = s0, nearest neighboring sample = s1
     * For p2: containing sample = s1, nearest neighboring sample = s0
     */
    uint8x16_t s0 = vld1q_u8(inptr);
    uint8x16_t s1 = vld1q_u8(inptr + 1);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s1_add_3s0_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
    uint16x8_t s1_add_3s0_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
    uint16x8_t s0_add_3s1_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
    uint16x8_t s0_add_3s1_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
    /* Add ordered dithering bias to odd pixel values. */
    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

    /* The offset is initially 1, because the first pixel component has already
     * been stored.  However, in subsequent iterations of the SIMD loop, this
     * offset is (2 * colctr - 1) to stay within the bounds of the sample
     * buffers without having to resort to a slow scalar tail case for the last
     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
     * in jmemmgr.c for more details.
     */
    unsigned outptr_offset = 1;
    uint8x16x2_t output_pixels;

    /* We use software pipelining to maximise performance.  The code indented
     * an extra two spaces begins the next iteration of the loop.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {

        s0 = vld1q_u8(inptr + colctr - 1);
        s1 = vld1q_u8(inptr + colctr);

      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                         vrshrn_n_u16(s1_add_3s0_h, 2));
      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                         vshrn_n_u16(s0_add_3s1_h, 2));

        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
         * denote low half and high half respectively.
         */
        s1_add_3s0_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
        s1_add_3s0_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
        s0_add_3s1_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
        s0_add_3s1_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
        /* Add ordered dithering bias to odd pixel values. */
        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

      /* Store pixel component values to memory. */
      vst2q_u8(outptr + outptr_offset, output_pixels);
      outptr_offset = 2 * colctr - 1;
    }

    /* Complete the last iteration of the loop. */

    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                       vrshrn_n_u16(s1_add_3s0_h, 2));
    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                       vshrn_n_u16(s0_add_3s1_h, 2));
    /* Store pixel component values to memory. */
    vst2q_u8(outptr + outptr_offset, output_pixels);

    /* Last pixel component value in this row of the original image */
    outptr[2 * downsampled_width - 1] =
      GETJSAMPLE(inptr[downsampled_width - 1]);
  }
}


/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            | p0   p1 | p2   p3 | p4   p5 |
 *       sA   |         |         |         |
 *            | p6   p7 | p8   p9 | p10  p11|
 *            +---------+---------+---------+
 *            | p12  p13| p14  p15| p16  p17|
 *       sB   |         |         |         |
 *            | p18  p19| p20  p21| p22  p23|
 *            +---------+---------+---------+
 *            | p24  p25| p26  p27| p28  p29|
 *       sC   |         |         |         |
 *            | p30  p31| p32  p33| p34  p35|
 *            +---------+---------+---------+
 *
 * Samples s0A-s2C were created by averaging the original pixel component
 * values centered at positions p0-p35 above.  To approximate one of those
 * original pixel component values, we proportionally blend the sample
 * containing the pixel center with the nearest neighboring samples in each
 * row, column, and diagonal.
 *
 * An upsampled pixel component value is computed by first blending the sample
 * containing the pixel center with the nearest neighboring samples in the
 * same column, in the ratio 3:1, and then blending each column sum with the
 * nearest neighboring column sum, in the ratio 3:1.  For example:
 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
 *                      1/4 * (3/4 * s0B + 1/4 * s0A)
 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
 * When computing the first and last pixel component values in the row, there
 * is no horizontally adjacent sample to blend, so:
 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
 * When computing the first and last pixel component values in the column,
 * there is no vertically adjacent sample to blend, so:
 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
 * When computing the corner pixel component values, there is no adjacent
 * sample to blend, so:
 *     p0(upsampled) = s0A
 *     p35(upsampled) = s2C
 */
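
/* For reference, a minimal scalar sketch of the two-step blend above, for one
 * output row.  in_near is the input row containing this output row's pixel
 * centers; in_far is the vertically adjacent input row (above for the upper
 * output row, below for the lower one).  The arithmetic, including the +8
 * rounding term and the +7 dithering bias, matches the vector code below.
 * The helper name is illustrative only, and the block is compiled out.
 */
#if 0
static void h2v2_fancy_upsample_row(JSAMPROW in_near, JSAMPROW in_far,
                                    JSAMPROW outptr,
                                    JDIMENSION downsampled_width)
{
  int thiscolsum, lastcolsum;
  JDIMENSION i;

  /* Step 1 (vertical blend) for the first column; there is no horizontally
   * adjacent column sum, so the sum is simply scaled to the same fixed-point
   * range before the right-shift.
   */
  lastcolsum = GETJSAMPLE(in_near[0]) * 3 + GETJSAMPLE(in_far[0]);
  outptr[0] = (JSAMPLE)((lastcolsum * 4 + 8) >> 4);
  for (i = 1; i < downsampled_width; i++) {
    thiscolsum = GETJSAMPLE(in_near[i]) * 3 + GETJSAMPLE(in_far[i]);
    /* Step 2: blend the column sums horizontally, again in the ratio 3:1
     * (+7 is the dithering bias; +8 rounds the right-shift).
     */
    outptr[2 * i - 1] = (JSAMPLE)((lastcolsum * 3 + thiscolsum + 7) >> 4);
    outptr[2 * i] = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
    lastcolsum = thiscolsum;
  }
  outptr[2 * downsampled_width - 1] = (JSAMPLE)((lastcolsum * 4 + 7) >> 4);
}
#endif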

void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t seven_u16 = vdupq_n_u16(7);
  const uint8x8_t three_u8 = vdup_n_u8(3);
  const uint16x8_t three_u16 = vdupq_n_u16(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* First pixel component value in this row of the original image */
    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

    /* Step 1: Blend samples vertically in columns s0 and s1.
     * Leave the divide by 4 until the end, when it can be done for both
     * dimensions at once, right-shifting by 4.
     */

    /* Load and compute s0colsum0 and s0colsum1. */
    uint8x16_t s0A = vld1q_u8(inptr0);
    uint8x16_t s0B = vld1q_u8(inptr1);
    uint8x16_t s0C = vld1q_u8(inptr2);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
                                      vget_high_u8(s0B), three_u8);
    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
                                      vget_high_u8(s0B), three_u8);
    /* Load and compute s1colsum0 and s1colsum1. */
    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
                                      vget_high_u8(s1B), three_u8);
    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
                                      vget_high_u8(s1B), three_u8);

    /* Step 2: Blend the already-blended columns. */

    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
    /* Add ordered dithering bias to odd pixel values. */
    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
    uint8x16x2_t output_pixels0 = { {
      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
    } };
    uint8x16x2_t output_pixels1 = { {
      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
    } };

    /* Store pixel component values to memory.
     * The minimum size of the output buffer for each row is 64 bytes => no
     * need to worry about buffer overflow here.  See "Creation of 2-D sample
     * arrays" in jmemmgr.c for more details.
     */
    vst2q_u8(outptr0 + 1, output_pixels0);
    vst2q_u8(outptr1 + 1, output_pixels1);

    /* The first pixel of the image shifted our loads and stores by one byte.
     * We have to re-align on a 32-byte boundary at some point before the end
     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
     * bounds of the sample buffers without having to resort to a slow scalar
     * tail case for the last (downsampled_width % 16) samples.  See "Creation
     * of 2-D sample arrays" in jmemmgr.c for more details.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
      /* Step 1: Blend samples vertically in columns s0 and s1. */

      /* Load and compute s0colsum0 and s0colsum1. */
      s0A = vld1q_u8(inptr0 + colctr - 1);
      s0B = vld1q_u8(inptr1 + colctr - 1);
      s0C = vld1q_u8(inptr2 + colctr - 1);
      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
                             three_u8);
      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
                             three_u8);
      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
                             three_u8);
      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
                             three_u8);
      /* Load and compute s1colsum0 and s1colsum1. */
      s1A = vld1q_u8(inptr0 + colctr);
      s1B = vld1q_u8(inptr1 + colctr);
      s1C = vld1q_u8(inptr2 + colctr);
      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
                             three_u8);
      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
                             three_u8);
      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
                             three_u8);
      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
                             three_u8);

      /* Step 2: Blend the already-blended columns. */

      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
      /* Add ordered dithering bias to odd pixel values. */
      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                          vshrn_n_u16(output0_p1_h, 4));
      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                          vrshrn_n_u16(output0_p2_h, 4));
      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                          vshrn_n_u16(output1_p1_h, 4));
      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                          vrshrn_n_u16(output1_p2_h, 4));
      /* Store pixel component values to memory. */
      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
    }

    /* Last pixel component value in this row of the original image */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}


/* The diagram below shows a column of samples produced by h1v2 downsampling
 * (or by losslessly rotating or transposing an h2v1-downsampled image.)
 *
 *            +---------+
 *            |   p0    |
 *     sA     |         |
 *            |   p1    |
 *            +---------+
 *            |   p2    |
 *     sB     |         |
 *            |   p3    |
 *            +---------+
 *            |   p4    |
 *     sC     |         |
 *            |   p5    |
 *            +---------+
 *
 * Samples sA-sC were created by averaging the original pixel component values
 * centered at positions p0-p5 above.  To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each
 * column.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1.  For example:
 *     p1(upsampled) = 3/4 * sA + 1/4 * sB
 *     p2(upsampled) = 3/4 * sB + 1/4 * sA
 * When computing the first and last pixel component values in the column,
 * there is no adjacent sample to blend, so:
 *     p0(upsampled) = sA
 *     p5(upsampled) = sC
 */
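
/* For reference, a minimal scalar sketch of the vertical blend above, for one
 * pair of output rows.  The arithmetic, including the +2 rounding term and
 * the +1 dithering bias, matches the vector code below.  The helper name is
 * illustrative only, and the block is compiled out.
 */
#if 0
static void h1v2_fancy_upsample_rows(JSAMPROW in_above, JSAMPROW in_this,
                                     JSAMPROW in_below, JSAMPROW out0,
                                     JSAMPROW out1,
                                     JDIMENSION downsampled_width)
{
  JDIMENSION i;

  for (i = 0; i < downsampled_width; i++) {
    int above = GETJSAMPLE(in_above[i]);
    int self = GETJSAMPLE(in_this[i]);
    int below = GETJSAMPLE(in_below[i]);
    /* Upper output row: 3/4 * sB + 1/4 * sA (+1 is the dithering bias) */
    out0[i] = (JSAMPLE)((self * 3 + above + 1) >> 2);
    /* Lower output row: 3/4 * sB + 1/4 * sC (+2 rounds the right-shift) */
    out1[i] = (JSAMPLE)((self * 3 + below + 2) >> 2);
  }
}
#endif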

void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];
    inrow++;

    /* The size of the input and output buffers is always a multiple of 32
     * bytes => no need to worry about buffer overflow when reading/writing
     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
     * details.
     */
    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
      /* Load samples. */
      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
      /* Blend samples vertically. */
      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
                                      vget_high_u8(sB), three_u8);
      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
                                      vget_high_u8(sB), three_u8);
      /* Add ordered dithering bias to pixel values in even output rows. */
      colsum0_l = vaddq_u16(colsum0_l, one_u16);
      colsum0_h = vaddq_u16(colsum0_h, one_u16);
      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
                                              vshrn_n_u16(colsum0_h, 2));
      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
                                              vrshrn_n_u16(colsum1_h, 2));
      /* Store pixel component values to memory. */
      vst1q_u8(outptr0 + colctr, output_pixels0);
      vst1q_u8(outptr1 + colctr, output_pixels1);
    }
  }
}


/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            |         |         |
 *            | p0   p1 | p2   p3 |
 *            |         |         |
 *            +---------+---------+
 *
 * Samples s0 and s1 were created by averaging the original pixel component
 * values centered at positions p0-p3 above.  To approximate those original
 * pixel component values, we duplicate the samples horizontally:
 *     p0(upsampled) = p1(upsampled) = s0
 *     p2(upsampled) = p3(upsampled) = s1
 */
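
/* For reference, a minimal scalar sketch of the horizontal duplication above.
 * The helper name is illustrative only, and the block is compiled out.
 */
#if 0
static void h2v1_upsample_row(JSAMPROW inptr, JSAMPROW outptr,
                              JDIMENSION output_width)
{
  JDIMENSION i;

  for (i = 0; 2 * i < output_width; i++)
    outptr[2 * i] = outptr[2 * i + 1] = inptr[i];
}
#endif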

void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
      uint8x16_t samples = vld1q_u8(inptr + colctr);
      /* Duplicate the samples.  The store operation below interleaves them so
       * that adjacent pixel component values take on the same sample value,
       * per above.
       */
      uint8x16x2_t output_pixels = { { samples, samples } };
      /* Store pixel component values to memory.
       * Due to the way sample buffers are allocated, we don't need to worry
       * about tail cases when output_width is not a multiple of 32.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(outptr + 2 * colctr, output_pixels);
    }
  }
}


/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            | p0   p1 | p2   p3 |
 *       sA   |         |         |
 *            | p4   p5 | p6   p7 |
 *            +---------+---------+
 *            | p8   p9 | p10  p11|
 *       sB   |         |         |
 *            | p12  p13| p14  p15|
 *            +---------+---------+
 *
 * Samples s0A-s1B were created by averaging the original pixel component
 * values centered at positions p0-p15 above.  To approximate those original
 * pixel component values, we duplicate the samples both horizontally and
 * vertically:
 *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
 *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
 *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
 *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
 */
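
/* For reference, a minimal scalar sketch of the horizontal and vertical
 * duplication above.  The helper name is illustrative only, and the block is
 * compiled out.
 */
#if 0
static void h2v2_upsample_rows(JSAMPROW inptr, JSAMPROW outptr0,
                               JSAMPROW outptr1, JDIMENSION output_width)
{
  JDIMENSION i;

  for (i = 0; 2 * i < output_width; i++) {
    /* Duplicate each sample into two columns of both output rows. */
    outptr0[2 * i] = outptr0[2 * i + 1] = inptr[i];
    outptr1[2 * i] = outptr1[2 * i + 1] = inptr[i];
  }
}
#endif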

void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
      uint8x16_t samples = vld1q_u8(inptr + colctr);
      /* Duplicate the samples.  The store operation below interleaves them so
       * that adjacent pixel component values take on the same sample value,
       * per above.
       */
      uint8x16x2_t output_pixels = { { samples, samples } };
      /* Store pixel component values for both output rows to memory.
       * Due to the way sample buffers are allocated, we don't need to worry
       * about tail cases when output_width is not a multiple of 32.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
    }
  }
}