1*a97c2a1fSXin Li /******************************************************************************
2*a97c2a1fSXin Li *
3*a97c2a1fSXin Li * Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li *
5*a97c2a1fSXin Li * Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li * you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li * You may obtain a copy of the License at:
8*a97c2a1fSXin Li *
9*a97c2a1fSXin Li * http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li *
11*a97c2a1fSXin Li * Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li * distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li * See the License for the specific language governing permissions and
15*a97c2a1fSXin Li * limitations under the License.
16*a97c2a1fSXin Li *
17*a97c2a1fSXin Li *****************************************************************************
18*a97c2a1fSXin Li * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li */
20*a97c2a1fSXin Li /**
21*a97c2a1fSXin Li *******************************************************************************
22*a97c2a1fSXin Li * @file
23*a97c2a1fSXin Li * icv_variance_sse42.c
24*a97c2a1fSXin Li *
25*a97c2a1fSXin Li * @brief
26*a97c2a1fSXin Li * This file contains the functions to compute variance
27*a97c2a1fSXin Li *
28*a97c2a1fSXin Li * @author
29*a97c2a1fSXin Li * Ittiam
30*a97c2a1fSXin Li *
31*a97c2a1fSXin Li * @par List of Functions:
32*a97c2a1fSXin Li * icv_variance_8x4_ssse3()
33*a97c2a1fSXin Li *
34*a97c2a1fSXin Li * @remarks
35*a97c2a1fSXin Li * None
36*a97c2a1fSXin Li *
37*a97c2a1fSXin Li *******************************************************************************
38*a97c2a1fSXin Li */
39*a97c2a1fSXin Li /*****************************************************************************/
40*a97c2a1fSXin Li /* File Includes */
41*a97c2a1fSXin Li /*****************************************************************************/
42*a97c2a1fSXin Li /* System include files */
43*a97c2a1fSXin Li #include <stdio.h>
44*a97c2a1fSXin Li #include <stdint.h>
45*a97c2a1fSXin Li #include <string.h>
46*a97c2a1fSXin Li #include <stdlib.h>
47*a97c2a1fSXin Li #include <assert.h>
48*a97c2a1fSXin Li #include <immintrin.h>
49*a97c2a1fSXin Li
50*a97c2a1fSXin Li /* User include files */
51*a97c2a1fSXin Li #include "icv_datatypes.h"
52*a97c2a1fSXin Li #include "icv_macros.h"
53*a97c2a1fSXin Li #include "icv_platform_macros.h"
54*a97c2a1fSXin Li #include "icv.h"
55*a97c2a1fSXin Li
56*a97c2a1fSXin Li /**
57*a97c2a1fSXin Li *******************************************************************************
58*a97c2a1fSXin Li *
59*a97c2a1fSXin Li * @brief
60*a97c2a1fSXin Li * Computes variance of a given 8x4 block
61*a97c2a1fSXin Li *
62*a97c2a1fSXin Li * @par Description
63*a97c2a1fSXin Li * Compute variance of a given 8x4 block
64*a97c2a1fSXin Li *
65*a97c2a1fSXin Li * @param[in] pu1_src
66*a97c2a1fSXin Li * Source
67*a97c2a1fSXin Li *
68*a97c2a1fSXin Li * @param[in] src_strd
69*a97c2a1fSXin Li * Source stride
70*a97c2a1fSXin Li *
71*a97c2a1fSXin Li * @param[in] wd
72*a97c2a1fSXin Li * Assumed to be 8
73*a97c2a1fSXin Li *
74*a97c2a1fSXin Li * @param[in] ht
75*a97c2a1fSXin Li * Assumed to be 4
76*a97c2a1fSXin Li *
77*a97c2a1fSXin Li * @returns
78*a97c2a1fSXin Li * Variance
79*a97c2a1fSXin Li *
80*a97c2a1fSXin Li * @remarks
81*a97c2a1fSXin Li *
82*a97c2a1fSXin Li *******************************************************************************
83*a97c2a1fSXin Li */
icv_variance_8x4_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 wd,WORD32 ht)84*a97c2a1fSXin Li WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht)
85*a97c2a1fSXin Li {
86*a97c2a1fSXin Li WORD32 sum;
87*a97c2a1fSXin Li WORD32 sum_sqr;
88*a97c2a1fSXin Li WORD32 blk_sz;
89*a97c2a1fSXin Li WORD32 vrnc;
90*a97c2a1fSXin Li __m128 src_r0, src_r1;
91*a97c2a1fSXin Li __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3;
92*a97c2a1fSXin Li __m128i sum_r0, sum_r1;
93*a97c2a1fSXin Li __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3;
94*a97c2a1fSXin Li __m128i vsum, vsum_sqr;
95*a97c2a1fSXin Li __m128i zero;
96*a97c2a1fSXin Li UNUSED(wd);
97*a97c2a1fSXin Li UNUSED(ht);
98*a97c2a1fSXin Li
99*a97c2a1fSXin Li ASSERT(wd == 8);
100*a97c2a1fSXin Li ASSERT(ht == 4);
101*a97c2a1fSXin Li
102*a97c2a1fSXin Li sum = 0;
103*a97c2a1fSXin Li sum_sqr = 0;
104*a97c2a1fSXin Li
105*a97c2a1fSXin Li blk_sz = 8 * 4;
106*a97c2a1fSXin Li
107*a97c2a1fSXin Li zero = _mm_setzero_si128();
108*a97c2a1fSXin Li
109*a97c2a1fSXin Li /* Load source */
110*a97c2a1fSXin Li src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
111*a97c2a1fSXin Li pu1_src += src_strd;
112*a97c2a1fSXin Li
113*a97c2a1fSXin Li src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
114*a97c2a1fSXin Li pu1_src += src_strd;
115*a97c2a1fSXin Li
116*a97c2a1fSXin Li src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
117*a97c2a1fSXin Li pu1_src += src_strd;
118*a97c2a1fSXin Li
119*a97c2a1fSXin Li src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
120*a97c2a1fSXin Li pu1_src += src_strd;
121*a97c2a1fSXin Li
122*a97c2a1fSXin Li /* Compute sum of all elements */
123*a97c2a1fSXin Li /* Use SAD with 0, since there is no pairwise addition */
124*a97c2a1fSXin Li sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero);
125*a97c2a1fSXin Li sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero);
126*a97c2a1fSXin Li
127*a97c2a1fSXin Li /* Accumulate SAD */
128*a97c2a1fSXin Li vsum = _mm_add_epi64(sum_r0, sum_r1);
129*a97c2a1fSXin Li vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));
130*a97c2a1fSXin Li
131*a97c2a1fSXin Li sum = _mm_cvtsi128_si32(vsum);
132*a97c2a1fSXin Li
133*a97c2a1fSXin Li /* Unpack to 16 bits */
134*a97c2a1fSXin Li ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero);
135*a97c2a1fSXin Li ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero);
136*a97c2a1fSXin Li ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero);
137*a97c2a1fSXin Li ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero);
138*a97c2a1fSXin Li
139*a97c2a1fSXin Li /* Compute sum of squares */
140*a97c2a1fSXin Li sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0);
141*a97c2a1fSXin Li sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1);
142*a97c2a1fSXin Li sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2);
143*a97c2a1fSXin Li sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3);
144*a97c2a1fSXin Li
145*a97c2a1fSXin Li vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1);
146*a97c2a1fSXin Li vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2);
147*a97c2a1fSXin Li vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3);
148*a97c2a1fSXin Li
149*a97c2a1fSXin Li vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8));
150*a97c2a1fSXin Li vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4));
151*a97c2a1fSXin Li sum_sqr = _mm_cvtsi128_si32(vsum_sqr);
152*a97c2a1fSXin Li
153*a97c2a1fSXin Li /* Compute variance */
154*a97c2a1fSXin Li vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz);
155*a97c2a1fSXin Li
156*a97c2a1fSXin Li return vrnc;
157*a97c2a1fSXin Li }
158*a97c2a1fSXin Li
159