/* 16-bit signed integer dot product
 * Altivec-assisted version
 * Copyright 2004 Phil Karn
 * May be used under the terms of the GNU Lesser General Public License (LGPL)
 */
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
8*638691a0SAndroid Build Coastguard Worker
/* Descriptor built by initdp_av() and consumed by dotprod_av() */
struct dotprod {
  /* Number of coefficients */
  int len;

  /* On an Altivec machine, these hold 8 copies of the coefficients.
   * Copy i is preceded by i zero-valued 16-bit elements (i = 0,1,..7),
   * so that one copy lines up with every possible alignment of the
   * input data within a 16-byte block.
   */
  signed short *coeffs[8];
};
17*638691a0SAndroid Build Coastguard Worker
18*638691a0SAndroid Build Coastguard Worker /* Create and return a descriptor for use with the dot product function */
initdp_av(signed short coeffs[],int len)19*638691a0SAndroid Build Coastguard Worker void *initdp_av(signed short coeffs[],int len){
20*638691a0SAndroid Build Coastguard Worker struct dotprod *dp;
21*638691a0SAndroid Build Coastguard Worker int i,j;
22*638691a0SAndroid Build Coastguard Worker
23*638691a0SAndroid Build Coastguard Worker if(len == 0)
24*638691a0SAndroid Build Coastguard Worker return NULL;
25*638691a0SAndroid Build Coastguard Worker
26*638691a0SAndroid Build Coastguard Worker dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
27*638691a0SAndroid Build Coastguard Worker dp->len = len;
28*638691a0SAndroid Build Coastguard Worker
29*638691a0SAndroid Build Coastguard Worker /* Make 8 copies of coefficients, one for each data alignment,
30*638691a0SAndroid Build Coastguard Worker * each aligned to 16-byte boundary
31*638691a0SAndroid Build Coastguard Worker */
32*638691a0SAndroid Build Coastguard Worker for(i=0;i<8;i++){
33*638691a0SAndroid Build Coastguard Worker dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
34*638691a0SAndroid Build Coastguard Worker for(j=0;j<len;j++)
35*638691a0SAndroid Build Coastguard Worker dp->coeffs[i][j+i] = coeffs[j];
36*638691a0SAndroid Build Coastguard Worker }
37*638691a0SAndroid Build Coastguard Worker return (void *)dp;
38*638691a0SAndroid Build Coastguard Worker }
39*638691a0SAndroid Build Coastguard Worker
40*638691a0SAndroid Build Coastguard Worker
41*638691a0SAndroid Build Coastguard Worker /* Free a dot product descriptor created earlier */
freedp_av(void * p)42*638691a0SAndroid Build Coastguard Worker void freedp_av(void *p){
43*638691a0SAndroid Build Coastguard Worker struct dotprod *dp = (struct dotprod *)p;
44*638691a0SAndroid Build Coastguard Worker int i;
45*638691a0SAndroid Build Coastguard Worker
46*638691a0SAndroid Build Coastguard Worker for(i=0;i<8;i++)
47*638691a0SAndroid Build Coastguard Worker if(dp->coeffs[i] != NULL)
48*638691a0SAndroid Build Coastguard Worker free(dp->coeffs[i]);
49*638691a0SAndroid Build Coastguard Worker free(dp);
50*638691a0SAndroid Build Coastguard Worker }
51*638691a0SAndroid Build Coastguard Worker
52*638691a0SAndroid Build Coastguard Worker /* Compute a dot product given a descriptor and an input array
53*638691a0SAndroid Build Coastguard Worker * The length is taken from the descriptor
54*638691a0SAndroid Build Coastguard Worker */
dotprod_av(void * p,signed short a[])55*638691a0SAndroid Build Coastguard Worker long dotprod_av(void *p,signed short a[]){
56*638691a0SAndroid Build Coastguard Worker struct dotprod *dp = (struct dotprod *)p;
57*638691a0SAndroid Build Coastguard Worker int al;
58*638691a0SAndroid Build Coastguard Worker vector signed short *ar,*d;
59*638691a0SAndroid Build Coastguard Worker vector signed int sums0,sums1,sums2,sums3;
60*638691a0SAndroid Build Coastguard Worker union { vector signed int v; signed int w[4];} s;
61*638691a0SAndroid Build Coastguard Worker int nblocks;
62*638691a0SAndroid Build Coastguard Worker
63*638691a0SAndroid Build Coastguard Worker /* round ar down to beginning of 16-byte block containing 0th element of
64*638691a0SAndroid Build Coastguard Worker * input buffer. Then set d to one of 8 sets of shifted coefficients
65*638691a0SAndroid Build Coastguard Worker */
66*638691a0SAndroid Build Coastguard Worker ar = (vector signed short *)((int)a & ~15);
67*638691a0SAndroid Build Coastguard Worker al = ((int)a & 15)/sizeof(signed short);
68*638691a0SAndroid Build Coastguard Worker d = (vector signed short *)dp->coeffs[al];
69*638691a0SAndroid Build Coastguard Worker
70*638691a0SAndroid Build Coastguard Worker nblocks = (dp->len+al-1)/8+1;
71*638691a0SAndroid Build Coastguard Worker
72*638691a0SAndroid Build Coastguard Worker /* Sum into four vectors each holding four 32-bit partial sums */
73*638691a0SAndroid Build Coastguard Worker sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
74*638691a0SAndroid Build Coastguard Worker while(nblocks >= 4){
75*638691a0SAndroid Build Coastguard Worker sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
76*638691a0SAndroid Build Coastguard Worker sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
77*638691a0SAndroid Build Coastguard Worker sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
78*638691a0SAndroid Build Coastguard Worker sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
79*638691a0SAndroid Build Coastguard Worker nblocks -= 4;
80*638691a0SAndroid Build Coastguard Worker }
81*638691a0SAndroid Build Coastguard Worker sums0 = vec_adds(sums0,sums1);
82*638691a0SAndroid Build Coastguard Worker sums2 = vec_adds(sums2,sums3);
83*638691a0SAndroid Build Coastguard Worker sums0 = vec_adds(sums0,sums2);
84*638691a0SAndroid Build Coastguard Worker while(nblocks-- > 0){
85*638691a0SAndroid Build Coastguard Worker sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
86*638691a0SAndroid Build Coastguard Worker }
87*638691a0SAndroid Build Coastguard Worker /* Sum 4 partial sums into final result */
88*638691a0SAndroid Build Coastguard Worker s.v = vec_sums(sums0,(vector signed int)(0));
89*638691a0SAndroid Build Coastguard Worker
90*638691a0SAndroid Build Coastguard Worker return s.w[3];
91*638691a0SAndroid Build Coastguard Worker }
92*638691a0SAndroid Build Coastguard Worker
93*638691a0SAndroid Build Coastguard Worker
94