1*dfc6aa5cSAndroid Build Coastguard Worker /*
2*dfc6aa5cSAndroid Build Coastguard Worker * jcsample-neon.c - downsampling (Arm Neon)
3*dfc6aa5cSAndroid Build Coastguard Worker *
4*dfc6aa5cSAndroid Build Coastguard Worker * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5*dfc6aa5cSAndroid Build Coastguard Worker *
6*dfc6aa5cSAndroid Build Coastguard Worker * This software is provided 'as-is', without any express or implied
7*dfc6aa5cSAndroid Build Coastguard Worker * warranty. In no event will the authors be held liable for any damages
8*dfc6aa5cSAndroid Build Coastguard Worker * arising from the use of this software.
9*dfc6aa5cSAndroid Build Coastguard Worker *
10*dfc6aa5cSAndroid Build Coastguard Worker * Permission is granted to anyone to use this software for any purpose,
11*dfc6aa5cSAndroid Build Coastguard Worker * including commercial applications, and to alter it and redistribute it
12*dfc6aa5cSAndroid Build Coastguard Worker * freely, subject to the following restrictions:
13*dfc6aa5cSAndroid Build Coastguard Worker *
14*dfc6aa5cSAndroid Build Coastguard Worker * 1. The origin of this software must not be misrepresented; you must not
15*dfc6aa5cSAndroid Build Coastguard Worker * claim that you wrote the original software. If you use this software
16*dfc6aa5cSAndroid Build Coastguard Worker * in a product, an acknowledgment in the product documentation would be
17*dfc6aa5cSAndroid Build Coastguard Worker * appreciated but is not required.
18*dfc6aa5cSAndroid Build Coastguard Worker * 2. Altered source versions must be plainly marked as such, and must not be
19*dfc6aa5cSAndroid Build Coastguard Worker * misrepresented as being the original software.
20*dfc6aa5cSAndroid Build Coastguard Worker * 3. This notice may not be removed or altered from any source distribution.
21*dfc6aa5cSAndroid Build Coastguard Worker */
22*dfc6aa5cSAndroid Build Coastguard Worker
23*dfc6aa5cSAndroid Build Coastguard Worker #define JPEG_INTERNALS
24*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jinclude.h"
25*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jpeglib.h"
26*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimd.h"
27*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jdct.h"
28*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimddct.h"
29*dfc6aa5cSAndroid Build Coastguard Worker #include "../jsimd.h"
30*dfc6aa5cSAndroid Build Coastguard Worker #include "align.h"
31*dfc6aa5cSAndroid Build Coastguard Worker
32*dfc6aa5cSAndroid Build Coastguard Worker #include <arm_neon.h>
33*dfc6aa5cSAndroid Build Coastguard Worker
34*dfc6aa5cSAndroid Build Coastguard Worker
35*dfc6aa5cSAndroid Build Coastguard Worker ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
36*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
37*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
38*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
39*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
40*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
41*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
42*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
43*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
44*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
45*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
46*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
47*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
48*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
49*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
50*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
51*dfc6aa5cSAndroid Build Coastguard Worker 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
52*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
53*dfc6aa5cSAndroid Build Coastguard Worker 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
54*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
55*dfc6aa5cSAndroid Build Coastguard Worker 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
56*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
57*dfc6aa5cSAndroid Build Coastguard Worker 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
58*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
59*dfc6aa5cSAndroid Build Coastguard Worker 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
60*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
61*dfc6aa5cSAndroid Build Coastguard Worker 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
62*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
63*dfc6aa5cSAndroid Build Coastguard Worker 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
64*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
65*dfc6aa5cSAndroid Build Coastguard Worker 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
66*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
67*dfc6aa5cSAndroid Build Coastguard Worker 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
68*dfc6aa5cSAndroid Build Coastguard Worker };
69*dfc6aa5cSAndroid Build Coastguard Worker
70*dfc6aa5cSAndroid Build Coastguard Worker
71*dfc6aa5cSAndroid Build Coastguard Worker /* Downsample pixel values of a single component.
72*dfc6aa5cSAndroid Build Coastguard Worker * This version handles the common case of 2:1 horizontal and 1:1 vertical,
73*dfc6aa5cSAndroid Build Coastguard Worker * without smoothing.
74*dfc6aa5cSAndroid Build Coastguard Worker */
75*dfc6aa5cSAndroid Build Coastguard Worker
jsimd_h2v1_downsample_neon(JDIMENSION image_width,int max_v_samp_factor,JDIMENSION v_samp_factor,JDIMENSION width_in_blocks,JSAMPARRAY input_data,JSAMPARRAY output_data)76*dfc6aa5cSAndroid Build Coastguard Worker void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
77*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION v_samp_factor,
78*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION width_in_blocks,
79*dfc6aa5cSAndroid Build Coastguard Worker JSAMPARRAY input_data, JSAMPARRAY output_data)
80*dfc6aa5cSAndroid Build Coastguard Worker {
81*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW inptr, outptr;
82*dfc6aa5cSAndroid Build Coastguard Worker /* Load expansion mask to pad remaining elements of last DCT block. */
83*dfc6aa5cSAndroid Build Coastguard Worker const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
84*dfc6aa5cSAndroid Build Coastguard Worker const uint8x16_t expand_mask =
85*dfc6aa5cSAndroid Build Coastguard Worker vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
86*dfc6aa5cSAndroid Build Coastguard Worker /* Load bias pattern (alternating every pixel.) */
87*dfc6aa5cSAndroid Build Coastguard Worker /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
88*dfc6aa5cSAndroid Build Coastguard Worker const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
89*dfc6aa5cSAndroid Build Coastguard Worker unsigned i, outrow;
90*dfc6aa5cSAndroid Build Coastguard Worker
91*dfc6aa5cSAndroid Build Coastguard Worker for (outrow = 0; outrow < v_samp_factor; outrow++) {
92*dfc6aa5cSAndroid Build Coastguard Worker outptr = output_data[outrow];
93*dfc6aa5cSAndroid Build Coastguard Worker inptr = input_data[outrow];
94*dfc6aa5cSAndroid Build Coastguard Worker
95*dfc6aa5cSAndroid Build Coastguard Worker /* Downsample all but the last DCT block of pixels. */
96*dfc6aa5cSAndroid Build Coastguard Worker for (i = 0; i < width_in_blocks - 1; i++) {
97*dfc6aa5cSAndroid Build Coastguard Worker uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
98*dfc6aa5cSAndroid Build Coastguard Worker /* Add adjacent pixel values, widen to 16-bit, and add bias. */
99*dfc6aa5cSAndroid Build Coastguard Worker uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
100*dfc6aa5cSAndroid Build Coastguard Worker /* Divide total by 2 and narrow to 8-bit. */
101*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
102*dfc6aa5cSAndroid Build Coastguard Worker /* Store samples to memory. */
103*dfc6aa5cSAndroid Build Coastguard Worker vst1_u8(outptr + i * DCTSIZE, samples_u8);
104*dfc6aa5cSAndroid Build Coastguard Worker }
105*dfc6aa5cSAndroid Build Coastguard Worker
106*dfc6aa5cSAndroid Build Coastguard Worker /* Load pixels in last DCT block into a table. */
107*dfc6aa5cSAndroid Build Coastguard Worker uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
108*dfc6aa5cSAndroid Build Coastguard Worker #if defined(__aarch64__) || defined(_M_ARM64)
109*dfc6aa5cSAndroid Build Coastguard Worker /* Pad the empty elements with the value of the last pixel. */
110*dfc6aa5cSAndroid Build Coastguard Worker pixels = vqtbl1q_u8(pixels, expand_mask);
111*dfc6aa5cSAndroid Build Coastguard Worker #else
112*dfc6aa5cSAndroid Build Coastguard Worker uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
113*dfc6aa5cSAndroid Build Coastguard Worker pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
114*dfc6aa5cSAndroid Build Coastguard Worker vtbl2_u8(table, vget_high_u8(expand_mask)));
115*dfc6aa5cSAndroid Build Coastguard Worker #endif
116*dfc6aa5cSAndroid Build Coastguard Worker /* Add adjacent pixel values, widen to 16-bit, and add bias. */
117*dfc6aa5cSAndroid Build Coastguard Worker uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
118*dfc6aa5cSAndroid Build Coastguard Worker /* Divide total by 2, narrow to 8-bit, and store. */
119*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
120*dfc6aa5cSAndroid Build Coastguard Worker vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
121*dfc6aa5cSAndroid Build Coastguard Worker }
122*dfc6aa5cSAndroid Build Coastguard Worker }
123*dfc6aa5cSAndroid Build Coastguard Worker
124*dfc6aa5cSAndroid Build Coastguard Worker
125*dfc6aa5cSAndroid Build Coastguard Worker /* Downsample pixel values of a single component.
126*dfc6aa5cSAndroid Build Coastguard Worker * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
127*dfc6aa5cSAndroid Build Coastguard Worker * without smoothing.
128*dfc6aa5cSAndroid Build Coastguard Worker */
129*dfc6aa5cSAndroid Build Coastguard Worker
jsimd_h2v2_downsample_neon(JDIMENSION image_width,int max_v_samp_factor,JDIMENSION v_samp_factor,JDIMENSION width_in_blocks,JSAMPARRAY input_data,JSAMPARRAY output_data)130*dfc6aa5cSAndroid Build Coastguard Worker void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
131*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION v_samp_factor,
132*dfc6aa5cSAndroid Build Coastguard Worker JDIMENSION width_in_blocks,
133*dfc6aa5cSAndroid Build Coastguard Worker JSAMPARRAY input_data, JSAMPARRAY output_data)
134*dfc6aa5cSAndroid Build Coastguard Worker {
135*dfc6aa5cSAndroid Build Coastguard Worker JSAMPROW inptr0, inptr1, outptr;
136*dfc6aa5cSAndroid Build Coastguard Worker /* Load expansion mask to pad remaining elements of last DCT block. */
137*dfc6aa5cSAndroid Build Coastguard Worker const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
138*dfc6aa5cSAndroid Build Coastguard Worker const uint8x16_t expand_mask =
139*dfc6aa5cSAndroid Build Coastguard Worker vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
140*dfc6aa5cSAndroid Build Coastguard Worker /* Load bias pattern (alternating every pixel.) */
141*dfc6aa5cSAndroid Build Coastguard Worker /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
142*dfc6aa5cSAndroid Build Coastguard Worker const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
143*dfc6aa5cSAndroid Build Coastguard Worker unsigned i, outrow;
144*dfc6aa5cSAndroid Build Coastguard Worker
145*dfc6aa5cSAndroid Build Coastguard Worker for (outrow = 0; outrow < v_samp_factor; outrow++) {
146*dfc6aa5cSAndroid Build Coastguard Worker outptr = output_data[outrow];
147*dfc6aa5cSAndroid Build Coastguard Worker inptr0 = input_data[outrow];
148*dfc6aa5cSAndroid Build Coastguard Worker inptr1 = input_data[outrow + 1];
149*dfc6aa5cSAndroid Build Coastguard Worker
150*dfc6aa5cSAndroid Build Coastguard Worker /* Downsample all but the last DCT block of pixels. */
151*dfc6aa5cSAndroid Build Coastguard Worker for (i = 0; i < width_in_blocks - 1; i++) {
152*dfc6aa5cSAndroid Build Coastguard Worker uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
153*dfc6aa5cSAndroid Build Coastguard Worker uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
154*dfc6aa5cSAndroid Build Coastguard Worker /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
155*dfc6aa5cSAndroid Build Coastguard Worker uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
156*dfc6aa5cSAndroid Build Coastguard Worker /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
157*dfc6aa5cSAndroid Build Coastguard Worker */
158*dfc6aa5cSAndroid Build Coastguard Worker samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
159*dfc6aa5cSAndroid Build Coastguard Worker /* Divide total by 4 and narrow to 8-bit. */
160*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
161*dfc6aa5cSAndroid Build Coastguard Worker /* Store samples to memory and increment pointers. */
162*dfc6aa5cSAndroid Build Coastguard Worker vst1_u8(outptr + i * DCTSIZE, samples_u8);
163*dfc6aa5cSAndroid Build Coastguard Worker }
164*dfc6aa5cSAndroid Build Coastguard Worker
165*dfc6aa5cSAndroid Build Coastguard Worker /* Load pixels in last DCT block into a table. */
166*dfc6aa5cSAndroid Build Coastguard Worker uint8x16_t pixels_r0 =
167*dfc6aa5cSAndroid Build Coastguard Worker vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
168*dfc6aa5cSAndroid Build Coastguard Worker uint8x16_t pixels_r1 =
169*dfc6aa5cSAndroid Build Coastguard Worker vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
170*dfc6aa5cSAndroid Build Coastguard Worker #if defined(__aarch64__) || defined(_M_ARM64)
171*dfc6aa5cSAndroid Build Coastguard Worker /* Pad the empty elements with the value of the last pixel. */
172*dfc6aa5cSAndroid Build Coastguard Worker pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
173*dfc6aa5cSAndroid Build Coastguard Worker pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
174*dfc6aa5cSAndroid Build Coastguard Worker #else
175*dfc6aa5cSAndroid Build Coastguard Worker uint8x8x2_t table_r0 =
176*dfc6aa5cSAndroid Build Coastguard Worker { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
177*dfc6aa5cSAndroid Build Coastguard Worker uint8x8x2_t table_r1 =
178*dfc6aa5cSAndroid Build Coastguard Worker { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
179*dfc6aa5cSAndroid Build Coastguard Worker pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
180*dfc6aa5cSAndroid Build Coastguard Worker vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
181*dfc6aa5cSAndroid Build Coastguard Worker pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
182*dfc6aa5cSAndroid Build Coastguard Worker vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
183*dfc6aa5cSAndroid Build Coastguard Worker #endif
184*dfc6aa5cSAndroid Build Coastguard Worker /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
185*dfc6aa5cSAndroid Build Coastguard Worker uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
186*dfc6aa5cSAndroid Build Coastguard Worker /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
187*dfc6aa5cSAndroid Build Coastguard Worker samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
188*dfc6aa5cSAndroid Build Coastguard Worker /* Divide total by 4, narrow to 8-bit, and store. */
189*dfc6aa5cSAndroid Build Coastguard Worker uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
190*dfc6aa5cSAndroid Build Coastguard Worker vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
191*dfc6aa5cSAndroid Build Coastguard Worker }
192*dfc6aa5cSAndroid Build Coastguard Worker }
193