/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
12*fb1b10abSAndroid Build Coastguard Worker
13*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
14*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/ppc/types_vsx.h"
15*fb1b10abSAndroid Build Coastguard Worker
/* Table of 16-bit bias terms defined in another translation unit.
 * vpx_mbpost_proc_down_vsx() loads eight consecutive entries starting at
 * index (row & 127) and adds them in before the final >> 4. */
extern const int16_t vpx_rv[];
17*fb1b10abSAndroid Build Coastguard Worker
18*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A,
19*fb1b10abSAndroid Build Coastguard Worker 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B,
20*fb1b10abSAndroid Build Coastguard Worker 0x1C, 0x1D, 0x1E, 0x1F };
21*fb1b10abSAndroid Build Coastguard Worker
22*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
23*fb1b10abSAndroid Build Coastguard Worker 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,
24*fb1b10abSAndroid Build Coastguard Worker 0x1C, 0x1D, 0x1E, 0x1F };
25*fb1b10abSAndroid Build Coastguard Worker
apply_filter(uint8x16_t ctx[4],uint8x16_t v,uint8x16_t filter)26*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v,
27*fb1b10abSAndroid Build Coastguard Worker uint8x16_t filter) {
28*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]);
29*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]);
30*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t k3 = vec_avg(k1, k2);
31*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t f_a = vec_max(vec_absd(v, ctx[0]), vec_absd(v, ctx[1]));
32*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t f_b = vec_max(vec_absd(v, ctx[2]), vec_absd(v, ctx[3]));
33*fb1b10abSAndroid Build Coastguard Worker const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter);
34*fb1b10abSAndroid Build Coastguard Worker return vec_sel(v, vec_avg(k3, v), mask);
35*fb1b10abSAndroid Build Coastguard Worker }
36*fb1b10abSAndroid Build Coastguard Worker
vert_ctx(uint8x16_t ctx[4],int col,uint8_t * src,int stride)37*fb1b10abSAndroid Build Coastguard Worker static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src,
38*fb1b10abSAndroid Build Coastguard Worker int stride) {
39*fb1b10abSAndroid Build Coastguard Worker ctx[0] = vec_vsx_ld(col - 2 * stride, src);
40*fb1b10abSAndroid Build Coastguard Worker ctx[1] = vec_vsx_ld(col - stride, src);
41*fb1b10abSAndroid Build Coastguard Worker ctx[2] = vec_vsx_ld(col + stride, src);
42*fb1b10abSAndroid Build Coastguard Worker ctx[3] = vec_vsx_ld(col + 2 * stride, src);
43*fb1b10abSAndroid Build Coastguard Worker }
44*fb1b10abSAndroid Build Coastguard Worker
horz_ctx(uint8x16_t ctx[4],uint8x16_t left_ctx,uint8x16_t v,uint8x16_t right_ctx)45*fb1b10abSAndroid Build Coastguard Worker static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx,
46*fb1b10abSAndroid Build Coastguard Worker uint8x16_t v, uint8x16_t right_ctx) {
47*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,
48*fb1b10abSAndroid Build Coastguard Worker 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
49*fb1b10abSAndroid Build Coastguard Worker 0x1A, 0x1B, 0x1C, 0x1D };
50*fb1b10abSAndroid Build Coastguard Worker
51*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
52*fb1b10abSAndroid Build Coastguard Worker 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
53*fb1b10abSAndroid Build Coastguard Worker 0x1B, 0x1C, 0x1D, 0x1E };
54*fb1b10abSAndroid Build Coastguard Worker
55*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
56*fb1b10abSAndroid Build Coastguard Worker 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C,
57*fb1b10abSAndroid Build Coastguard Worker 0x0D, 0x0E, 0x0F, 0x10 };
58*fb1b10abSAndroid Build Coastguard Worker
59*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
60*fb1b10abSAndroid Build Coastguard Worker 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
61*fb1b10abSAndroid Build Coastguard Worker 0x0E, 0x0F, 0x10, 0x11 };
62*fb1b10abSAndroid Build Coastguard Worker ctx[0] = vec_perm(left_ctx, v, l2_perm);
63*fb1b10abSAndroid Build Coastguard Worker ctx[1] = vec_perm(left_ctx, v, l1_perm);
64*fb1b10abSAndroid Build Coastguard Worker ctx[2] = vec_perm(v, right_ctx, r1_perm);
65*fb1b10abSAndroid Build Coastguard Worker ctx[3] = vec_perm(v, right_ctx, r2_perm);
66*fb1b10abSAndroid Build Coastguard Worker }
vpx_post_proc_down_and_across_mb_row_vsx(unsigned char * src_ptr,unsigned char * dst_ptr,int src_pixels_per_line,int dst_pixels_per_line,int cols,unsigned char * f,int size)67*fb1b10abSAndroid Build Coastguard Worker void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr,
68*fb1b10abSAndroid Build Coastguard Worker unsigned char *dst_ptr,
69*fb1b10abSAndroid Build Coastguard Worker int src_pixels_per_line,
70*fb1b10abSAndroid Build Coastguard Worker int dst_pixels_per_line, int cols,
71*fb1b10abSAndroid Build Coastguard Worker unsigned char *f, int size) {
72*fb1b10abSAndroid Build Coastguard Worker int row, col;
73*fb1b10abSAndroid Build Coastguard Worker uint8x16_t ctx[4], out, v, left_ctx;
74*fb1b10abSAndroid Build Coastguard Worker
75*fb1b10abSAndroid Build Coastguard Worker for (row = 0; row < size; row++) {
76*fb1b10abSAndroid Build Coastguard Worker for (col = 0; col < cols - 8; col += 16) {
77*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t filter = vec_vsx_ld(col, f);
78*fb1b10abSAndroid Build Coastguard Worker v = vec_vsx_ld(col, src_ptr);
79*fb1b10abSAndroid Build Coastguard Worker vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
80*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
81*fb1b10abSAndroid Build Coastguard Worker }
82*fb1b10abSAndroid Build Coastguard Worker
83*fb1b10abSAndroid Build Coastguard Worker if (col != cols) {
84*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t filter = vec_vsx_ld(col, f);
85*fb1b10abSAndroid Build Coastguard Worker v = vec_vsx_ld(col, src_ptr);
86*fb1b10abSAndroid Build Coastguard Worker vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
87*fb1b10abSAndroid Build Coastguard Worker out = apply_filter(ctx, v, filter);
88*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
89*fb1b10abSAndroid Build Coastguard Worker }
90*fb1b10abSAndroid Build Coastguard Worker
91*fb1b10abSAndroid Build Coastguard Worker /* now post_proc_across */
92*fb1b10abSAndroid Build Coastguard Worker left_ctx = vec_splats(dst_ptr[0]);
93*fb1b10abSAndroid Build Coastguard Worker v = vec_vsx_ld(0, dst_ptr);
94*fb1b10abSAndroid Build Coastguard Worker for (col = 0; col < cols - 8; col += 16) {
95*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t filter = vec_vsx_ld(col, f);
96*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t right_ctx = (col + 16 == cols)
97*fb1b10abSAndroid Build Coastguard Worker ? vec_splats(dst_ptr[cols - 1])
98*fb1b10abSAndroid Build Coastguard Worker : vec_vsx_ld(col, dst_ptr + 16);
99*fb1b10abSAndroid Build Coastguard Worker horz_ctx(ctx, left_ctx, v, right_ctx);
100*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
101*fb1b10abSAndroid Build Coastguard Worker left_ctx = v;
102*fb1b10abSAndroid Build Coastguard Worker v = right_ctx;
103*fb1b10abSAndroid Build Coastguard Worker }
104*fb1b10abSAndroid Build Coastguard Worker
105*fb1b10abSAndroid Build Coastguard Worker if (col != cols) {
106*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t filter = vec_vsx_ld(col, f);
107*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]);
108*fb1b10abSAndroid Build Coastguard Worker horz_ctx(ctx, left_ctx, v, right_ctx);
109*fb1b10abSAndroid Build Coastguard Worker out = apply_filter(ctx, v, filter);
110*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
111*fb1b10abSAndroid Build Coastguard Worker }
112*fb1b10abSAndroid Build Coastguard Worker
113*fb1b10abSAndroid Build Coastguard Worker src_ptr += src_pixels_per_line;
114*fb1b10abSAndroid Build Coastguard Worker dst_ptr += dst_pixels_per_line;
115*fb1b10abSAndroid Build Coastguard Worker }
116*fb1b10abSAndroid Build Coastguard Worker }
117*fb1b10abSAndroid Build Coastguard Worker
118*fb1b10abSAndroid Build Coastguard Worker // C: s[c + 7]
next7l_s16(uint8x16_t c)119*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t next7l_s16(uint8x16_t c) {
120*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t next7_perm = {
121*fb1b10abSAndroid Build Coastguard Worker 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13,
122*fb1b10abSAndroid Build Coastguard Worker 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17,
123*fb1b10abSAndroid Build Coastguard Worker };
124*fb1b10abSAndroid Build Coastguard Worker return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm);
125*fb1b10abSAndroid Build Coastguard Worker }
126*fb1b10abSAndroid Build Coastguard Worker
127*fb1b10abSAndroid Build Coastguard Worker // Slide across window and add.
slide_sum_s16(int16x8_t x)128*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t slide_sum_s16(int16x8_t x) {
129*fb1b10abSAndroid Build Coastguard Worker // x = A B C D E F G H
130*fb1b10abSAndroid Build Coastguard Worker //
131*fb1b10abSAndroid Build Coastguard Worker // 0 A B C D E F G
132*fb1b10abSAndroid Build Coastguard Worker const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3))));
133*fb1b10abSAndroid Build Coastguard Worker // 0 0 A B C D E F
134*fb1b10abSAndroid Build Coastguard Worker const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))),
135*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 A B C D E
136*fb1b10abSAndroid Build Coastguard Worker vec_slo(x, vec_splats((int8_t)(6 << 3))));
137*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 0 A B C D
138*fb1b10abSAndroid Build Coastguard Worker const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))),
139*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 0 0 A B C
140*fb1b10abSAndroid Build Coastguard Worker vec_slo(x, vec_splats((int8_t)(10 << 3))));
141*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 0 0 0 A B
142*fb1b10abSAndroid Build Coastguard Worker const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))),
143*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 0 0 0 0 A
144*fb1b10abSAndroid Build Coastguard Worker vec_slo(x, vec_splats((int8_t)(14 << 3))));
145*fb1b10abSAndroid Build Coastguard Worker return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4));
146*fb1b10abSAndroid Build Coastguard Worker }
147*fb1b10abSAndroid Build Coastguard Worker
148*fb1b10abSAndroid Build Coastguard Worker // Slide across window and add.
slide_sumsq_s32(int32x4_t xsq_even,int32x4_t xsq_odd)149*fb1b10abSAndroid Build Coastguard Worker static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) {
150*fb1b10abSAndroid Build Coastguard Worker // 0 A C E
151*fb1b10abSAndroid Build Coastguard Worker // + 0 B D F
152*fb1b10abSAndroid Build Coastguard Worker int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))),
153*fb1b10abSAndroid Build Coastguard Worker vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3))));
154*fb1b10abSAndroid Build Coastguard Worker // 0 0 A C
155*fb1b10abSAndroid Build Coastguard Worker // + 0 0 B D
156*fb1b10abSAndroid Build Coastguard Worker int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))),
157*fb1b10abSAndroid Build Coastguard Worker vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3))));
158*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 A
159*fb1b10abSAndroid Build Coastguard Worker // + 0 0 0 B
160*fb1b10abSAndroid Build Coastguard Worker int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))),
161*fb1b10abSAndroid Build Coastguard Worker vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3))));
162*fb1b10abSAndroid Build Coastguard Worker sumsq_1 = vec_add(sumsq_1, xsq_even);
163*fb1b10abSAndroid Build Coastguard Worker sumsq_2 = vec_add(sumsq_2, sumsq_3);
164*fb1b10abSAndroid Build Coastguard Worker return vec_add(sumsq_1, sumsq_2);
165*fb1b10abSAndroid Build Coastguard Worker }
166*fb1b10abSAndroid Build Coastguard Worker
167*fb1b10abSAndroid Build Coastguard Worker // C: (b + sum + val) >> 4
filter_s16(int16x8_t b,int16x8_t sum,int16x8_t val)168*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) {
169*fb1b10abSAndroid Build Coastguard Worker return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4));
170*fb1b10abSAndroid Build Coastguard Worker }
171*fb1b10abSAndroid Build Coastguard Worker
172*fb1b10abSAndroid Build Coastguard Worker // C: sumsq * 15 - sum * sum
mask_s16(int32x4_t sumsq_even,int32x4_t sumsq_odd,int16x8_t sum,int32x4_t lim)173*fb1b10abSAndroid Build Coastguard Worker static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd,
174*fb1b10abSAndroid Build Coastguard Worker int16x8_t sum, int32x4_t lim) {
175*fb1b10abSAndroid Build Coastguard Worker static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05,
176*fb1b10abSAndroid Build Coastguard Worker 0x14, 0x15, 0x08, 0x09, 0x18, 0x19,
177*fb1b10abSAndroid Build Coastguard Worker 0x0C, 0x0D, 0x1C, 0x1D };
178*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sumsq_odd_scaled =
179*fb1b10abSAndroid Build Coastguard Worker vec_mul(sumsq_odd, vec_splats((int32_t)15));
180*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sumsq_even_scaled =
181*fb1b10abSAndroid Build Coastguard Worker vec_mul(sumsq_even, vec_splats((int32_t)15));
182*fb1b10abSAndroid Build Coastguard Worker const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum));
183*fb1b10abSAndroid Build Coastguard Worker const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum));
184*fb1b10abSAndroid Build Coastguard Worker
185*fb1b10abSAndroid Build Coastguard Worker const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim);
186*fb1b10abSAndroid Build Coastguard Worker const bool32x4_t mask_even = vec_cmplt(thres_even, lim);
187*fb1b10abSAndroid Build Coastguard Worker return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge);
188*fb1b10abSAndroid Build Coastguard Worker }
189*fb1b10abSAndroid Build Coastguard Worker
vpx_mbpost_proc_across_ip_vsx(unsigned char * src,int pitch,int rows,int cols,int flimit)190*fb1b10abSAndroid Build Coastguard Worker void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows,
191*fb1b10abSAndroid Build Coastguard Worker int cols, int flimit) {
192*fb1b10abSAndroid Build Coastguard Worker int row, col;
193*fb1b10abSAndroid Build Coastguard Worker const int32x4_t lim = vec_splats(flimit);
194*fb1b10abSAndroid Build Coastguard Worker
195*fb1b10abSAndroid Build Coastguard Worker // 8 columns are processed at a time.
196*fb1b10abSAndroid Build Coastguard Worker assert(cols % 8 == 0);
197*fb1b10abSAndroid Build Coastguard Worker
198*fb1b10abSAndroid Build Coastguard Worker for (row = 0; row < rows; row++) {
199*fb1b10abSAndroid Build Coastguard Worker // The sum is signed and requires at most 13 bits.
200*fb1b10abSAndroid Build Coastguard Worker // (8 bits + sign) * 15 (4 bits)
201*fb1b10abSAndroid Build Coastguard Worker int16x8_t sum;
202*fb1b10abSAndroid Build Coastguard Worker // The sum of squares requires at most 20 bits.
203*fb1b10abSAndroid Build Coastguard Worker // (16 bits + sign) * 15 (4 bits)
204*fb1b10abSAndroid Build Coastguard Worker int32x4_t sumsq_even, sumsq_odd;
205*fb1b10abSAndroid Build Coastguard Worker
206*fb1b10abSAndroid Build Coastguard Worker // Fill left context with first col.
207*fb1b10abSAndroid Build Coastguard Worker int16x8_t left_ctx = vec_splats((int16_t)src[0]);
208*fb1b10abSAndroid Build Coastguard Worker int16_t s = src[0] * 9;
209*fb1b10abSAndroid Build Coastguard Worker int32_t ssq = src[0] * src[0] * 9 + 16;
210*fb1b10abSAndroid Build Coastguard Worker
211*fb1b10abSAndroid Build Coastguard Worker // Fill the next 6 columns of the sliding window with cols 2 to 7.
212*fb1b10abSAndroid Build Coastguard Worker for (col = 1; col <= 6; ++col) {
213*fb1b10abSAndroid Build Coastguard Worker s += src[col];
214*fb1b10abSAndroid Build Coastguard Worker ssq += src[col] * src[col];
215*fb1b10abSAndroid Build Coastguard Worker }
216*fb1b10abSAndroid Build Coastguard Worker // Set this sum to every element in the window.
217*fb1b10abSAndroid Build Coastguard Worker sum = vec_splats(s);
218*fb1b10abSAndroid Build Coastguard Worker sumsq_even = vec_splats(ssq);
219*fb1b10abSAndroid Build Coastguard Worker sumsq_odd = vec_splats(ssq);
220*fb1b10abSAndroid Build Coastguard Worker
221*fb1b10abSAndroid Build Coastguard Worker for (col = 0; col < cols; col += 8) {
222*fb1b10abSAndroid Build Coastguard Worker bool16x8_t mask;
223*fb1b10abSAndroid Build Coastguard Worker int16x8_t filtered, masked;
224*fb1b10abSAndroid Build Coastguard Worker uint8x16_t out;
225*fb1b10abSAndroid Build Coastguard Worker
226*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t val = vec_vsx_ld(0, src + col);
227*fb1b10abSAndroid Build Coastguard Worker const int16x8_t val_high = unpack_to_s16_h(val);
228*fb1b10abSAndroid Build Coastguard Worker
229*fb1b10abSAndroid Build Coastguard Worker // C: s[c + 7]
230*fb1b10abSAndroid Build Coastguard Worker const int16x8_t right_ctx = (col + 8 == cols)
231*fb1b10abSAndroid Build Coastguard Worker ? vec_splats((int16_t)src[col + 7])
232*fb1b10abSAndroid Build Coastguard Worker : next7l_s16(val);
233*fb1b10abSAndroid Build Coastguard Worker
234*fb1b10abSAndroid Build Coastguard Worker // C: x = s[c + 7] - s[c - 8];
235*fb1b10abSAndroid Build Coastguard Worker const int16x8_t x = vec_sub(right_ctx, left_ctx);
236*fb1b10abSAndroid Build Coastguard Worker const int32x4_t xsq_even =
237*fb1b10abSAndroid Build Coastguard Worker vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx));
238*fb1b10abSAndroid Build Coastguard Worker const int32x4_t xsq_odd =
239*fb1b10abSAndroid Build Coastguard Worker vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx));
240*fb1b10abSAndroid Build Coastguard Worker
241*fb1b10abSAndroid Build Coastguard Worker const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd);
242*fb1b10abSAndroid Build Coastguard Worker // A C E G
243*fb1b10abSAndroid Build Coastguard Worker // 0 B D F
244*fb1b10abSAndroid Build Coastguard Worker // 0 A C E
245*fb1b10abSAndroid Build Coastguard Worker // 0 0 B D
246*fb1b10abSAndroid Build Coastguard Worker // 0 0 A C
247*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 B
248*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 A
249*fb1b10abSAndroid Build Coastguard Worker sumsq_even = vec_add(sumsq_even, sumsq_tmp);
250*fb1b10abSAndroid Build Coastguard Worker // B D F G
251*fb1b10abSAndroid Build Coastguard Worker // A C E G
252*fb1b10abSAndroid Build Coastguard Worker // 0 B D F
253*fb1b10abSAndroid Build Coastguard Worker // 0 A C E
254*fb1b10abSAndroid Build Coastguard Worker // 0 0 B D
255*fb1b10abSAndroid Build Coastguard Worker // 0 0 A C
256*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 B
257*fb1b10abSAndroid Build Coastguard Worker // 0 0 0 A
258*fb1b10abSAndroid Build Coastguard Worker sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd));
259*fb1b10abSAndroid Build Coastguard Worker
260*fb1b10abSAndroid Build Coastguard Worker sum = vec_add(sum, slide_sum_s16(x));
261*fb1b10abSAndroid Build Coastguard Worker
262*fb1b10abSAndroid Build Coastguard Worker // C: (8 + sum + s[c]) >> 4
263*fb1b10abSAndroid Build Coastguard Worker filtered = filter_s16(vec_splats((int16_t)8), sum, val_high);
264*fb1b10abSAndroid Build Coastguard Worker // C: sumsq * 15 - sum * sum
265*fb1b10abSAndroid Build Coastguard Worker mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
266*fb1b10abSAndroid Build Coastguard Worker masked = vec_sel(val_high, filtered, mask);
267*fb1b10abSAndroid Build Coastguard Worker
268*fb1b10abSAndroid Build Coastguard Worker out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge);
269*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(out, 0, src + col);
270*fb1b10abSAndroid Build Coastguard Worker
271*fb1b10abSAndroid Build Coastguard Worker // Update window sum and square sum
272*fb1b10abSAndroid Build Coastguard Worker sum = vec_splat(sum, 7);
273*fb1b10abSAndroid Build Coastguard Worker sumsq_even = vec_splat(sumsq_odd, 3);
274*fb1b10abSAndroid Build Coastguard Worker sumsq_odd = vec_splat(sumsq_odd, 3);
275*fb1b10abSAndroid Build Coastguard Worker
276*fb1b10abSAndroid Build Coastguard Worker // C: s[c - 8] (for next iteration)
277*fb1b10abSAndroid Build Coastguard Worker left_ctx = val_high;
278*fb1b10abSAndroid Build Coastguard Worker }
279*fb1b10abSAndroid Build Coastguard Worker src += pitch;
280*fb1b10abSAndroid Build Coastguard Worker }
281*fb1b10abSAndroid Build Coastguard Worker }
282*fb1b10abSAndroid Build Coastguard Worker
vpx_mbpost_proc_down_vsx(uint8_t * dst,int pitch,int rows,int cols,int flimit)283*fb1b10abSAndroid Build Coastguard Worker void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols,
284*fb1b10abSAndroid Build Coastguard Worker int flimit) {
285*fb1b10abSAndroid Build Coastguard Worker int col, row, i;
286*fb1b10abSAndroid Build Coastguard Worker int16x8_t window[16];
287*fb1b10abSAndroid Build Coastguard Worker const int32x4_t lim = vec_splats(flimit);
288*fb1b10abSAndroid Build Coastguard Worker
289*fb1b10abSAndroid Build Coastguard Worker // 8 columns are processed at a time.
290*fb1b10abSAndroid Build Coastguard Worker assert(cols % 8 == 0);
291*fb1b10abSAndroid Build Coastguard Worker // If rows is less than 8 the bottom border extension fails.
292*fb1b10abSAndroid Build Coastguard Worker assert(rows >= 8);
293*fb1b10abSAndroid Build Coastguard Worker
294*fb1b10abSAndroid Build Coastguard Worker for (col = 0; col < cols; col += 8) {
295*fb1b10abSAndroid Build Coastguard Worker // The sum is signed and requires at most 13 bits.
296*fb1b10abSAndroid Build Coastguard Worker // (8 bits + sign) * 15 (4 bits)
297*fb1b10abSAndroid Build Coastguard Worker int16x8_t r1, sum;
298*fb1b10abSAndroid Build Coastguard Worker // The sum of squares requires at most 20 bits.
299*fb1b10abSAndroid Build Coastguard Worker // (16 bits + sign) * 15 (4 bits)
300*fb1b10abSAndroid Build Coastguard Worker int32x4_t sumsq_even, sumsq_odd;
301*fb1b10abSAndroid Build Coastguard Worker
302*fb1b10abSAndroid Build Coastguard Worker r1 = unpack_to_s16_h(vec_vsx_ld(0, dst));
303*fb1b10abSAndroid Build Coastguard Worker // Fill sliding window with first row.
304*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i <= 8; i++) {
305*fb1b10abSAndroid Build Coastguard Worker window[i] = r1;
306*fb1b10abSAndroid Build Coastguard Worker }
307*fb1b10abSAndroid Build Coastguard Worker // First 9 rows of the sliding window are the same.
308*fb1b10abSAndroid Build Coastguard Worker // sum = r1 * 9
309*fb1b10abSAndroid Build Coastguard Worker sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16);
310*fb1b10abSAndroid Build Coastguard Worker
311*fb1b10abSAndroid Build Coastguard Worker // sumsq = r1 * r1 * 9
312*fb1b10abSAndroid Build Coastguard Worker sumsq_even = vec_mule(sum, r1);
313*fb1b10abSAndroid Build Coastguard Worker sumsq_odd = vec_mulo(sum, r1);
314*fb1b10abSAndroid Build Coastguard Worker
315*fb1b10abSAndroid Build Coastguard Worker // Fill the next 6 rows of the sliding window with rows 2 to 7.
316*fb1b10abSAndroid Build Coastguard Worker for (i = 1; i <= 6; ++i) {
317*fb1b10abSAndroid Build Coastguard Worker const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst));
318*fb1b10abSAndroid Build Coastguard Worker window[i + 8] = next_row;
319*fb1b10abSAndroid Build Coastguard Worker sum = vec_add(sum, next_row);
320*fb1b10abSAndroid Build Coastguard Worker sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row));
321*fb1b10abSAndroid Build Coastguard Worker sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row));
322*fb1b10abSAndroid Build Coastguard Worker }
323*fb1b10abSAndroid Build Coastguard Worker
324*fb1b10abSAndroid Build Coastguard Worker for (row = 0; row < rows; row++) {
325*fb1b10abSAndroid Build Coastguard Worker int32x4_t d15_even, d15_odd, d0_even, d0_odd;
326*fb1b10abSAndroid Build Coastguard Worker bool16x8_t mask;
327*fb1b10abSAndroid Build Coastguard Worker int16x8_t filtered, masked;
328*fb1b10abSAndroid Build Coastguard Worker uint8x16_t out;
329*fb1b10abSAndroid Build Coastguard Worker
330*fb1b10abSAndroid Build Coastguard Worker const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127));
331*fb1b10abSAndroid Build Coastguard Worker
332*fb1b10abSAndroid Build Coastguard Worker // Move the sliding window
333*fb1b10abSAndroid Build Coastguard Worker if (row + 7 < rows) {
334*fb1b10abSAndroid Build Coastguard Worker window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst));
335*fb1b10abSAndroid Build Coastguard Worker } else {
336*fb1b10abSAndroid Build Coastguard Worker window[15] = window[14];
337*fb1b10abSAndroid Build Coastguard Worker }
338*fb1b10abSAndroid Build Coastguard Worker
339*fb1b10abSAndroid Build Coastguard Worker // C: sum += s[7 * pitch] - s[-8 * pitch];
340*fb1b10abSAndroid Build Coastguard Worker sum = vec_add(sum, vec_sub(window[15], window[0]));
341*fb1b10abSAndroid Build Coastguard Worker
342*fb1b10abSAndroid Build Coastguard Worker // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 *
343*fb1b10abSAndroid Build Coastguard Worker // pitch];
344*fb1b10abSAndroid Build Coastguard Worker // Optimization Note: Caching a squared-window for odd and even is
345*fb1b10abSAndroid Build Coastguard Worker // slower than just repeating the multiplies.
346*fb1b10abSAndroid Build Coastguard Worker d15_odd = vec_mulo(window[15], window[15]);
347*fb1b10abSAndroid Build Coastguard Worker d15_even = vec_mule(window[15], window[15]);
348*fb1b10abSAndroid Build Coastguard Worker d0_odd = vec_mulo(window[0], window[0]);
349*fb1b10abSAndroid Build Coastguard Worker d0_even = vec_mule(window[0], window[0]);
350*fb1b10abSAndroid Build Coastguard Worker sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd));
351*fb1b10abSAndroid Build Coastguard Worker sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even));
352*fb1b10abSAndroid Build Coastguard Worker
353*fb1b10abSAndroid Build Coastguard Worker // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4
354*fb1b10abSAndroid Build Coastguard Worker filtered = filter_s16(rv, sum, window[8]);
355*fb1b10abSAndroid Build Coastguard Worker
356*fb1b10abSAndroid Build Coastguard Worker // C: sumsq * 15 - sum * sum
357*fb1b10abSAndroid Build Coastguard Worker mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
358*fb1b10abSAndroid Build Coastguard Worker masked = vec_sel(window[8], filtered, mask);
359*fb1b10abSAndroid Build Coastguard Worker
360*fb1b10abSAndroid Build Coastguard Worker // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per
361*fb1b10abSAndroid Build Coastguard Worker // iteration
362*fb1b10abSAndroid Build Coastguard Worker out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch),
363*fb1b10abSAndroid Build Coastguard Worker load_merge);
364*fb1b10abSAndroid Build Coastguard Worker vec_vsx_st(out, 0, dst + row * pitch);
365*fb1b10abSAndroid Build Coastguard Worker
366*fb1b10abSAndroid Build Coastguard Worker // Optimization Note: Turns out that the following loop is faster than
367*fb1b10abSAndroid Build Coastguard Worker // using pointers to manage the sliding window.
368*fb1b10abSAndroid Build Coastguard Worker for (i = 1; i < 16; i++) {
369*fb1b10abSAndroid Build Coastguard Worker window[i - 1] = window[i];
370*fb1b10abSAndroid Build Coastguard Worker }
371*fb1b10abSAndroid Build Coastguard Worker }
372*fb1b10abSAndroid Build Coastguard Worker dst += 8;
373*fb1b10abSAndroid Build Coastguard Worker }
374*fb1b10abSAndroid Build Coastguard Worker }
375