// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intra_edge.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1

#include <xmmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"

namespace libgav1 {
namespace dsp {
namespace {

constexpr int kKernelTaps = 5;
constexpr int kKernels[3][kKernelTaps] = {
    {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
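// Each kernel's taps sum to 16, so a filtered sum is normalized with
// RightShiftWithRounding(sum, 4), i.e. add 8 and shift right by 4.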
constexpr int kMaxEdgeBufferSize = 129;

// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
// Assumes |source| has 16 packed byte values. Produces 12 valid filter
// outputs, written as one unaligned 16-byte store; the 4 extra bytes are
// either overwritten by a later store or safely discarded.
inline void ComputeKernel1Store12(uint8_t* LIBGAV1_RESTRICT dest,
                                  const uint8_t* LIBGAV1_RESTRICT source) {
  const __m128i edge_lo = LoadUnaligned16(source);
  const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
  // Samples matched with the '4' tap, expanded to 16-bit.
  const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
  const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
  // Samples matched with the '8' tap, expanded to 16-bit.
  const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
  const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);

  // Apply the taps by shifting.
  const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
  const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
  const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
  const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
  // Move latter 4x values down to add with first 4x values for each output.
  const __m128i partial_sums_lo =
      _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
  const __m128i partial_sums_hi =
      _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
  // Add the 8x center values and round for the final kernel sum of each
  // output.
  const __m128i sums_lo = RightShiftWithRounding_U16(
      _mm_add_epi16(partial_sums_lo, centers8_lo), 4);
  const __m128i sums_hi = RightShiftWithRounding_U16(
      _mm_add_epi16(partial_sums_hi, centers8_hi), 4);

  // Gather the low 6 outputs of each half into one 16-byte store.
  const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
  const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
  const __m128i result =
      _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
  StoreUnaligned16(dest, result);
}
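
// For reference, ComputeKernel1Store12 computes, for i in [0, 12):
//   dest[i] =
//       (4 * source[i] + 8 * source[i + 1] + 4 * source[i + 2] + 8) >> 4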

// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
// Assumes |source| has 16 packed byte values. Produces 12 valid filter
// outputs, written as one unaligned 16-byte store; the 4 extra bytes are
// either overwritten by a later store or safely discarded.
inline void ComputeKernel2Store12(uint8_t* LIBGAV1_RESTRICT dest,
                                  const uint8_t* LIBGAV1_RESTRICT source) {
  const __m128i edge_lo = LoadUnaligned16(source);
  const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
  const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
  const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
  const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
  const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
  // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x.
  const __m128i outers5_lo =
      _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2));
  const __m128i outers5_hi =
      _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2));
  // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x.
  const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1),
                                            _mm_slli_epi16(centers_lo, 2));
  const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1),
                                            _mm_slli_epi16(centers_hi, 2));
  // Move latter 5x values down to add with first 5x values for each output.
  const __m128i partial_sums_lo =
      _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4));
  // Add the 6x center values and round for the final kernel sum of each
  // output.
  const __m128i sums_lo = RightShiftWithRounding_U16(
      _mm_add_epi16(centers6_lo, partial_sums_lo), 4);
  // Move latter 5x values down to add with first 5x values for each output.
  const __m128i partial_sums_hi =
      _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4));
  // Add the 6x center values and round for the final kernel sum of each
  // output.
  const __m128i sums_hi = RightShiftWithRounding_U16(
      _mm_add_epi16(centers6_hi, partial_sums_hi), 4);
  // Gather the low 6 outputs of each half into one 16-byte store.
  const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
  const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
  const __m128i result =
      _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
  StoreUnaligned16(dest, result);
}
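
// For reference, ComputeKernel2Store12 computes, for i in [0, 12):
//   dest[i] =
//       (5 * source[i] + 6 * source[i + 1] + 5 * source[i + 2] + 8) >> 4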

// This function applies the kernel [2, 4, 4, 4, 2] to 8 values.
inline void ComputeKernel3Store8(uint8_t* LIBGAV1_RESTRICT dest,
                                 const uint8_t* LIBGAV1_RESTRICT source) {
  const __m128i edge_lo = LoadUnaligned16(source);
  const __m128i edge_hi = _mm_srli_si128(edge_lo, 4);
  // Finish the |edge_lo| life cycle quickly.
  // Multiply by 2 via shift.
  const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1);
  // Multiply the 2x values by 2 again and align for the '4' taps.
  const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2);
  // Finish the |source2_lo| life cycle quickly.
  // Move latter 2x values down to add with first 2x values for each output.
  __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8));
  // First 4x values already aligned to add with the running total.
  sum = _mm_add_epi16(sum, source4_lo);
  // Move second 4x values down to add with the running total.
  sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2));
  // Move third 4x values down to add with the running total.
  sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4));
  // Multiply by 2 via shift.
  const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1);
  // Multiply the 2x values by 2 again and align for the '4' taps.
  const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2);
  // Move latter 2x values down to add with first 2x values for each output.
  __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8));
  // First 4x values already aligned to add with the running total.
  sum_hi = _mm_add_epi16(sum_hi, source4_hi);
  // Move second 4x values down to add with the running total.
  sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2));
  // Move third 4x values down to add with the running total.
  sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4));

  // Because we have only 8 values here, it is safe to align before packing
  // down to 8-bit without losing data.
  sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8);
  sum = RightShiftWithRounding_U16(sum, 4);
  StoreLo8(dest, _mm_packus_epi16(sum, sum));
}
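
// For reference, ComputeKernel3Store8 computes, for i in [0, 8):
//   dest[i] = (2 * source[i] + 4 * source[i + 1] + 4 * source[i + 2] +
//              4 * source[i + 3] + 2 * source[i + 4] + 8) >> 4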

void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) {
  uint8_t edge[kMaxEdgeBufferSize + 4];
  // Copy the input so the filter can write |buffer| in place while reading
  // unfiltered values.
  memcpy(edge, buffer, size);
  auto* dst_buffer = static_cast<uint8_t*>(buffer);

  // The filter leaves dst_buffer[0] in place, so only |size| - 1 elements are
  // processed. With |size| == 1 there is nothing to do.
  if (size == 1) return;

  int i = 0;
  switch (strength) {
    case 1:
      // To avoid writing past the valid output range, stop short of the total
      // write size plus the initial offset. Each iteration writes 12 valid
      // values as part of one unaligned 16-byte store at offset |i| + 1.
      for (; i < size - 17; i += 12) {
        ComputeKernel1Store12(dst_buffer + i + 1, edge + i);
      }
      break;
    case 2:
      // See the comment for case 1.
      for (; i < size - 17; i += 12) {
        ComputeKernel2Store12(dst_buffer + i + 1, edge + i);
      }
      break;
    default:
      assert(strength == 3);
      // The first filter input is repeated for taps of value 2 and 4.
      dst_buffer[1] = RightShiftWithRounding(
          (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4);
      // In this case, one block of 8 bytes is written in each iteration, with
      // an offset of 2.
      for (; i < size - 10; i += 8) {
        ComputeKernel3Store8(dst_buffer + i + 2, edge + i);
      }
  }
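  // Scalar tail: compute any remaining outputs, including the final element,
  // with reads clamped to the valid range of |edge|.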
  const int kernel_index = strength - 1;
  for (int final_index = Clip3(i, 1, size - 2); final_index < size;
       ++final_index) {
    int sum = 0;
    for (int j = 0; j < kKernelTaps; ++j) {
      const int k = Clip3(final_index + j - 2, 0, size - 1);
      sum += kKernels[kernel_index][j] * edge[k];
    }
    dst_buffer[final_index] = RightShiftWithRounding(sum, 4);
  }
}

constexpr int kMaxUpsampleSize = 16;

// Applies the upsampling kernel [-1, 9, 9, -1] to alternating pixels, and
// interleaves the results with the original values. This implementation
// assumes that it is safe to write the maximum number of upsampled pixels (32)
// to the edge buffer, even when |size| is small.
void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) {
  assert(size % 4 == 0 && size <= kMaxUpsampleSize);
  auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
  uint8_t temp[kMaxUpsampleSize + 8];
  // |temp| holds the clamped source: the left neighbor is duplicated at the
  // front and the last pixel is replicated past the end.
  temp[0] = temp[1] = pixel_buffer[-1];
  memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
  temp[size + 2] = pixel_buffer[size - 1];

  // The leftmost upsampled output is a copy of the left neighbor.
  pixel_buffer[-2] = temp[0];
  const __m128i data = LoadUnaligned16(temp);
  const __m128i src_lo = _mm_cvtepu8_epi16(data);
  const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128());
  // 9x = x + 8x, computed with a shift and an add.
  const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3));
  const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3));
  __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo);
  sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4));
  sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6));
  sum_lo = RightShiftWithRounding_S16(sum_lo, 4);
  // Interleave the filtered values with the original pixels.
  const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo),
                                              _mm_srli_si128(data, 2));
  StoreUnaligned16(pixel_buffer - 1, result_lo);
  if (size > 8) {
    const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16));
    const __m128i src9_hi_extra =
        _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3));
    __m128i sum_hi =
        _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi);
    sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4));
    sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6));
    sum_hi = RightShiftWithRounding_S16(sum_hi, 4);
    const __m128i result_hi =
        _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10));
    StoreUnaligned16(pixel_buffer + 15, result_hi);
  }
}
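
// For reference, with |in| denoting the original values of |pixel_buffer| and
// out-of-range reads clamped to in[-1] and in[size - 1], the upsampler writes,
// starting at pixel_buffer[-2]:
//   out[2 * i] = in[i]
//   out[2 * i - 1] = Clip3(
//       (-in[i - 2] + 9 * in[i - 1] + 9 * in[i] - in[i + 1] + 8) >> 4, 0, 255)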

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter)
  dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler)
  dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1;
#endif
}

}  // namespace

void IntraEdgeInit_SSE4_1() { Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {

void IntraEdgeInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1