1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
/**
*******************************************************************************
* @file
*  isvc_mem_fns_sse42.c
*
* @brief
*  SSE4.2 variants of functions used for memory operations
*
*******************************************************************************
*/
35 #include <string.h>
36 #include <immintrin.h>
37
38 #include "ih264_typedefs.h"
39 #include "isvc_mem_fns.h"
40
/**
*******************************************************************************
*
* @brief
*  Fills a 2D block of bytes with a constant value
*
* @par Description:
*  Writes u1_val to the first i4_blk_wd bytes of each of i4_blk_ht rows, where
*  consecutive rows start i4_dst_stride bytes apart. 4x4 blocks, 8x8 blocks and
*  blocks whose width and height are both multiples of 16 are written with
*  SIMD-register stores; every other size uses one memset per row.
*
* @param[out] pu1_dst
*  Pointer to the top-left byte of the block. No alignment is required.
*
* @param[in] i4_dst_stride
*  Distance in bytes between the starts of consecutive rows
*
* @param[in] u1_val
*  Byte value to write
*
* @param[in] i4_blk_wd
*  Block width in bytes
*
* @param[in] i4_blk_ht
*  Block height in rows
*
* @returns
*  None. Nothing is written when the width or the height is not positive.
*
*******************************************************************************
*/
void isvc_memset_2d_sse42(UWORD8 *pu1_dst, WORD32 i4_dst_stride, UWORD8 u1_val, WORD32 i4_blk_wd,
                          WORD32 i4_blk_ht)
{
    WORD32 i, j;
    __m128i v_val;

    /* A negative width would otherwise reach memset() as a huge size_t */
    if((i4_blk_wd <= 0) || (i4_blk_ht <= 0))
    {
        return;
    }

    /* Broadcast the fill byte once; every store below reuses it */
    v_val = _mm_set1_epi8((char) u1_val);

    if((i4_blk_wd == 4) && (i4_blk_ht == 4))
    {
        const WORD32 i4_val_x4 = _mm_cvtsi128_si32(v_val);

        for(i = 0; i < 4; i++)
        {
            /* memcpy instead of a WORD32 pointer cast: pu1_dst is a byte */
            /* buffer that need not be 4-byte aligned */
            memcpy(pu1_dst + i * i4_dst_stride, &i4_val_x4, sizeof(i4_val_x4));
        }
    }
    else if((i4_blk_wd == 8) && (i4_blk_ht == 8))
    {
        for(i = 0; i < 8; i++)
        {
            /* Stores only the low 8 bytes of the register */
            _mm_storel_epi64((__m128i *) (pu1_dst + i * i4_dst_stride), v_val);
        }
    }
    else if((i4_blk_wd % 16 == 0) && (i4_blk_ht % 16 == 0))
    {
        for(i = 0; i < i4_blk_ht; i++)
        {
            UWORD8 *pu1_dst_row = pu1_dst + i * i4_dst_stride;

            /* Width is a multiple of 16, so whole 16-byte stores cover the row */
            for(j = 0; j < i4_blk_wd; j += 16)
            {
                _mm_storeu_si128((__m128i *) (pu1_dst_row + j), v_val);
            }
        }
    }
    else
    {
        for(i = 0; i < i4_blk_ht; i++)
        {
            memset(pu1_dst + i * i4_dst_stride, u1_val, (size_t) i4_blk_wd);
        }
    }
}
158