xref: /aosp_15_r20/external/libavc/common/x86/svc/isvc_mem_fns_sse42.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
/**
 *******************************************************************************
 * @file
 *  isvc_mem_fns_sse42.c
 *
 * @brief
 *  SSE4.2 variants of functions used for memory operations
 *
 *******************************************************************************
 */
35 #include <string.h>
36 #include <immintrin.h>
37 
38 #include "ih264_typedefs.h"
39 #include "isvc_mem_fns.h"
40 
/**
 *******************************************************************************
 *
 * @brief
 *  Fills a 2D block of bytes with a constant value
 *
 * @par Description:
 *  Writes 'u1_val' to every byte of a block that is 'i4_blk_wd' bytes wide and
 *  'i4_blk_ht' rows tall. Row 'r' of the block starts at
 *  'pu1_dst + r * i4_dst_stride'. Bytes outside the block are not written.
 *
 *  Dedicated paths exist for 4x4 blocks, 8x8 blocks and blocks whose width and
 *  height are both multiples of 16; all other sizes fall back to one memset
 *  call per row. No path requires 'pu1_dst' to be aligned.
 *
 * @param[out] pu1_dst
 *  Pointer to the first byte of the first row of the block
 *
 * @param[in] i4_dst_stride
 *  Distance in bytes between the starts of two consecutive rows
 *
 * @param[in] u1_val
 *  Byte value to be written
 *
 * @param[in] i4_blk_wd
 *  Block width in bytes; nothing is written if this is not positive
 *
 * @param[in] i4_blk_ht
 *  Block height in rows; nothing is written if this is not positive
 *
 * @returns
 *  None
 *
 *******************************************************************************
 */
void isvc_memset_2d_sse42(UWORD8 *pu1_dst, WORD32 i4_dst_stride, UWORD8 u1_val, WORD32 i4_blk_wd,
                          WORD32 i4_blk_ht)
{
    WORD32 i, j;
    __m128i val_16x8b;

    /* An empty block has nothing to fill. Rejecting negative sizes here also */
    /* keeps a negative width from being converted to a huge size_t below.    */
    if((i4_blk_wd <= 0) || (i4_blk_ht <= 0))
    {
        return;
    }

    /* The fill value replicated into all 16 byte lanes; built only once */
    val_16x8b = _mm_set1_epi8((char) u1_val);

    if((i4_blk_wd == 4) && (i4_blk_ht == 4))
    {
        /* Low 32 bits of the vector, i.e. four copies of the fill byte */
        const WORD32 i4_val = _mm_cvtsi128_si32(val_16x8b);

        for(i = 0; i < 4; i++)
        {
            /* memcpy instead of a store through a casted WORD32 pointer:    */
            /* the destination is a byte buffer with no alignment guarantee. */
            memcpy(pu1_dst + i * i4_dst_stride, &i4_val, sizeof(i4_val));
        }
    }
    else if((i4_blk_wd == 8) && (i4_blk_ht == 8))
    {
        for(i = 0; i < 8; i++)
        {
            /* Stores only the low 8 bytes of the vector */
            _mm_storel_epi64((__m128i *) (pu1_dst + i * i4_dst_stride), val_16x8b);
        }
    }
    else if((i4_blk_wd % 16 == 0) && (i4_blk_ht % 16 == 0))
    {
        for(i = 0; i < i4_blk_ht; i++)
        {
            UWORD8 *pu1_dst_row = pu1_dst + i * i4_dst_stride;

            /* The width is a multiple of 16, so whole unaligned 16 byte */
            /* stores cover each row exactly.                             */
            for(j = 0; j < i4_blk_wd; j += 16)
            {
                _mm_storeu_si128((__m128i *) (pu1_dst_row + j), val_16x8b);
            }
        }
    }
    else
    {
        for(i = 0; i < i4_blk_ht; i++)
        {
            memset(pu1_dst + i * i4_dst_stride, u1_val, (size_t) i4_blk_wd);
        }
    }
}
158