xref: /aosp_15_r20/external/libjpeg-turbo/simd/arm/jcgryext-neon.c (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1 /*
2  * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
3  *
4  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
5  *
6  * This software is provided 'as-is', without any express or implied
7  * warranty.  In no event will the authors be held liable for any damages
8  * arising from the use of this software.
9  *
10  * Permission is granted to anyone to use this software for any purpose,
11  * including commercial applications, and to alter it and redistribute it
12  * freely, subject to the following restrictions:
13  *
14  * 1. The origin of this software must not be misrepresented; you must not
15  *    claim that you wrote the original software. If you use this software
16  *    in a product, an acknowledgment in the product documentation would be
17  *    appreciated but is not required.
18  * 2. Altered source versions must be plainly marked as such, and must not be
19  *    misrepresented as being the original software.
20  * 3. This notice may not be removed or altered from any source distribution.
21  */
22 
23 /* This file is included by jcgray-neon.c */
24 
25 
26 /* RGB -> Grayscale conversion is defined by the following equation:
27  *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
28  *
29  * Avoid floating point arithmetic by using shifted integer constants:
30  *    0.29899597 = 19595 * 2^-16
31  *    0.58700561 = 38470 * 2^-16
32  *    0.11399841 =  7471 * 2^-16
33  * These constants are defined in jcgray-neon.c
34  *
35  * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
36  */
37 
jsimd_rgb_gray_convert_neon(JDIMENSION image_width,JSAMPARRAY input_buf,JSAMPIMAGE output_buf,JDIMENSION output_row,int num_rows)38 void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
39                                  JSAMPIMAGE output_buf, JDIMENSION output_row,
40                                  int num_rows)
41 {
42   JSAMPROW inptr;
43   JSAMPROW outptr;
44   /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
45   ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
46 
47   while (--num_rows >= 0) {
48     inptr = *input_buf++;
49     outptr = output_buf[0][output_row];
50     output_row++;
51 
52     int cols_remaining = image_width;
53     for (; cols_remaining > 0; cols_remaining -= 16) {
54 
55       /* To prevent buffer overread by the vector load instructions, the last
56        * (image_width % 16) columns of data are first memcopied to a temporary
57        * buffer large enough to accommodate the vector load.
58        */
59       if (cols_remaining < 16) {
60         memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
61         inptr = tmp_buf;
62       }
63 
64 #if RGB_PIXELSIZE == 4
65       uint8x16x4_t input_pixels = vld4q_u8(inptr);
66 #else
67       uint8x16x3_t input_pixels = vld3q_u8(inptr);
68 #endif
69       uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
70       uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
71       uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
72       uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
73       uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
74       uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
75 
76       /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
77       uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
78       uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
79       uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
80       uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
81       y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
82       y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
83       y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
84       y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
85       y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
86       y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
87       y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
88       y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
89 
90       /* Descale Y values (rounding right shift) and narrow to 16-bit. */
91       uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
92                                     vrshrn_n_u32(y_lh, 16));
93       uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
94                                     vrshrn_n_u32(y_hh, 16));
95 
96       /* Narrow Y values to 8-bit and store to memory.  Buffer overwrite is
97        * permitted up to the next multiple of ALIGN_SIZE bytes.
98        */
99       vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
100 
101       /* Increment pointers. */
102       inptr += (16 * RGB_PIXELSIZE);
103       outptr += 16;
104     }
105   }
106 }
107