/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #include <altivec.h>
13*77c1e3ccSAndroid Build Coastguard Worker
14*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
15*77c1e3ccSAndroid Build Coastguard Worker
16*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/cfl.h"
17*77c1e3ccSAndroid Build Coastguard Worker
18*77c1e3ccSAndroid Build Coastguard Worker #define OFF_0 0
19*77c1e3ccSAndroid Build Coastguard Worker #define OFF_1 16
20*77c1e3ccSAndroid Build Coastguard Worker #define OFF_2 32
21*77c1e3ccSAndroid Build Coastguard Worker #define OFF_3 48
22*77c1e3ccSAndroid Build Coastguard Worker #define CFL_BUF_LINE_BYTES 64
23*77c1e3ccSAndroid Build Coastguard Worker #define CFL_LINE_1 64
24*77c1e3ccSAndroid Build Coastguard Worker #define CFL_LINE_2 128
25*77c1e3ccSAndroid Build Coastguard Worker #define CFL_LINE_3 192
26*77c1e3ccSAndroid Build Coastguard Worker
27*77c1e3ccSAndroid Build Coastguard Worker typedef vector signed char int8x16_t; // NOLINT(runtime/int)
28*77c1e3ccSAndroid Build Coastguard Worker typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int)
29*77c1e3ccSAndroid Build Coastguard Worker typedef vector signed short int16x8_t; // NOLINT(runtime/int)
30*77c1e3ccSAndroid Build Coastguard Worker typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int)
31*77c1e3ccSAndroid Build Coastguard Worker typedef vector signed int int32x4_t; // NOLINT(runtime/int)
32*77c1e3ccSAndroid Build Coastguard Worker typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int)
33*77c1e3ccSAndroid Build Coastguard Worker typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int)
34*77c1e3ccSAndroid Build Coastguard Worker
subtract_average_vsx(const uint16_t * src_ptr,int16_t * dst,int width,int height,int round_offset,int num_pel_log2)35*77c1e3ccSAndroid Build Coastguard Worker static inline void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
36*77c1e3ccSAndroid Build Coastguard Worker int width, int height, int round_offset,
37*77c1e3ccSAndroid Build Coastguard Worker int num_pel_log2) {
38*77c1e3ccSAndroid Build Coastguard Worker // int16_t *dst = dst_ptr;
39*77c1e3ccSAndroid Build Coastguard Worker const int16_t *dst_end = dst + height * CFL_BUF_LINE;
40*77c1e3ccSAndroid Build Coastguard Worker const int16_t *sum_buf = (const int16_t *)src_ptr;
41*77c1e3ccSAndroid Build Coastguard Worker const int16_t *end = sum_buf + height * CFL_BUF_LINE;
42*77c1e3ccSAndroid Build Coastguard Worker const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
43*77c1e3ccSAndroid Build Coastguard Worker const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
44*77c1e3ccSAndroid Build Coastguard Worker 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
45*77c1e3ccSAndroid Build Coastguard Worker const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
46*77c1e3ccSAndroid Build Coastguard Worker 0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
47*77c1e3ccSAndroid Build Coastguard Worker
48*77c1e3ccSAndroid Build Coastguard Worker int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
49*77c1e3ccSAndroid Build Coastguard Worker int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
50*77c1e3ccSAndroid Build Coastguard Worker do {
51*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
52*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
53*77c1e3ccSAndroid Build Coastguard Worker if (width >= 16) {
54*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
55*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_1 =
56*77c1e3ccSAndroid Build Coastguard Worker vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
57*77c1e3ccSAndroid Build Coastguard Worker }
58*77c1e3ccSAndroid Build Coastguard Worker if (width == 32) {
59*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
60*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_1 =
61*77c1e3ccSAndroid Build Coastguard Worker vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
62*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
63*77c1e3ccSAndroid Build Coastguard Worker sum_32x4_1 =
64*77c1e3ccSAndroid Build Coastguard Worker vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
65*77c1e3ccSAndroid Build Coastguard Worker }
66*77c1e3ccSAndroid Build Coastguard Worker } while ((sum_buf += (CFL_BUF_LINE * 2)) < end);
67*77c1e3ccSAndroid Build Coastguard Worker int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);
68*77c1e3ccSAndroid Build Coastguard Worker
69*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
70*77c1e3ccSAndroid Build Coastguard Worker sum_32x4 = vec_add(sum_32x4, perm_64);
71*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
72*77c1e3ccSAndroid Build Coastguard Worker sum_32x4 = vec_add(sum_32x4, perm_32);
73*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t avg = vec_sr(sum_32x4, div_shift);
74*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t vec_avg = vec_pack(avg, avg);
75*77c1e3ccSAndroid Build Coastguard Worker do {
76*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst);
77*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg),
78*77c1e3ccSAndroid Build Coastguard Worker OFF_0 + CFL_BUF_LINE_BYTES, dst);
79*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg),
80*77c1e3ccSAndroid Build Coastguard Worker OFF_0 + CFL_LINE_2, dst);
81*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg),
82*77c1e3ccSAndroid Build Coastguard Worker OFF_0 + CFL_LINE_3, dst);
83*77c1e3ccSAndroid Build Coastguard Worker if (width >= 16) {
84*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst);
85*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg),
86*77c1e3ccSAndroid Build Coastguard Worker OFF_1 + CFL_LINE_1, dst);
87*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg),
88*77c1e3ccSAndroid Build Coastguard Worker OFF_1 + CFL_LINE_2, dst);
89*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg),
90*77c1e3ccSAndroid Build Coastguard Worker OFF_1 + CFL_LINE_3, dst);
91*77c1e3ccSAndroid Build Coastguard Worker }
92*77c1e3ccSAndroid Build Coastguard Worker if (width == 32) {
93*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst);
94*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg),
95*77c1e3ccSAndroid Build Coastguard Worker OFF_2 + CFL_LINE_1, dst);
96*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg),
97*77c1e3ccSAndroid Build Coastguard Worker OFF_2 + CFL_LINE_2, dst);
98*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg),
99*77c1e3ccSAndroid Build Coastguard Worker OFF_2 + CFL_LINE_3, dst);
100*77c1e3ccSAndroid Build Coastguard Worker
101*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst);
102*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg),
103*77c1e3ccSAndroid Build Coastguard Worker OFF_3 + CFL_LINE_1, dst);
104*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg),
105*77c1e3ccSAndroid Build Coastguard Worker OFF_3 + CFL_LINE_2, dst);
106*77c1e3ccSAndroid Build Coastguard Worker vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg),
107*77c1e3ccSAndroid Build Coastguard Worker OFF_3 + CFL_LINE_3, dst);
108*77c1e3ccSAndroid Build Coastguard Worker }
109*77c1e3ccSAndroid Build Coastguard Worker } while ((dst += CFL_BUF_LINE * 4) < dst_end);
110*77c1e3ccSAndroid Build Coastguard Worker }
111*77c1e3ccSAndroid Build Coastguard Worker
112*77c1e3ccSAndroid Build Coastguard Worker // Declare wrappers for VSX sizes
113*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
114*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
115*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
116*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
117*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
118*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
119*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
120*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
121*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
122*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
123*77c1e3ccSAndroid Build Coastguard Worker CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
124*77c1e3ccSAndroid Build Coastguard Worker
// Blocks that are 4 samples wide are handled by the C implementations:
// based on observation, VSX does not outperform C for such small blocks
// (there are no 64-bit load and store intrinsics).
void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
130*77c1e3ccSAndroid Build Coastguard Worker
cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size)131*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
132*77c1e3ccSAndroid Build Coastguard Worker static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
133*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_4x4_c, /* 4x4 */
134*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_8x8_vsx, /* 8x8 */
135*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_16x16_vsx, /* 16x16 */
136*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_32x32_vsx, /* 32x32 */
137*77c1e3ccSAndroid Build Coastguard Worker NULL, /* 64x64 (invalid CFL size) */
138*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_4x8_c, /* 4x8 */
139*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_8x4_vsx, /* 8x4 */
140*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_8x16_vsx, /* 8x16 */
141*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_16x8_vsx, /* 16x8 */
142*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_16x32_vsx, /* 16x32 */
143*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_32x16_vsx, /* 32x16 */
144*77c1e3ccSAndroid Build Coastguard Worker NULL, /* 32x64 (invalid CFL size) */
145*77c1e3ccSAndroid Build Coastguard Worker NULL, /* 64x32 (invalid CFL size) */
146*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_4x16_c, /* 4x16 */
147*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_16x4_vsx, /* 16x4 */
148*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_8x32_vsx, /* 8x32 */
149*77c1e3ccSAndroid Build Coastguard Worker cfl_subtract_average_32x8_vsx, /* 32x8 */
150*77c1e3ccSAndroid Build Coastguard Worker NULL, /* 16x64 (invalid CFL size) */
151*77c1e3ccSAndroid Build Coastguard Worker NULL, /* 64x16 (invalid CFL size) */
152*77c1e3ccSAndroid Build Coastguard Worker };
153*77c1e3ccSAndroid Build Coastguard Worker // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
154*77c1e3ccSAndroid Build Coastguard Worker // index the function pointer array out of bounds.
155*77c1e3ccSAndroid Build Coastguard Worker return sub_avg[tx_size % TX_SIZES_ALL];
156*77c1e3ccSAndroid Build Coastguard Worker }
157