xref: /aosp_15_r20/external/clpeak/src/kernels/global_bandwidth_kernels.cl (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
1*1cd03ba3SJeremy KempMSTRINGIFY(
2*1cd03ba3SJeremy Kemp
3*1cd03ba3SJeremy Kemp\n#undef FETCH_2
4*1cd03ba3SJeremy Kemp\n#undef FETCH_8
5*1cd03ba3SJeremy Kemp\n
6*1cd03ba3SJeremy Kemp\n#define FETCH_2(sum, id, A, jumpBy)      sum += A[id];   id += jumpBy;   sum += A[id];   id += jumpBy;
7*1cd03ba3SJeremy Kemp\n#define FETCH_4(sum, id, A, jumpBy)      FETCH_2(sum, id, A, jumpBy);   FETCH_2(sum, id, A, jumpBy);
8*1cd03ba3SJeremy Kemp\n#define FETCH_8(sum, id, A, jumpBy)      FETCH_4(sum, id, A, jumpBy);   FETCH_4(sum, id, A, jumpBy);
9*1cd03ba3SJeremy Kemp\n
10*1cd03ba3SJeremy Kemp\n
11*1cd03ba3SJeremy Kemp\n#define FETCH_PER_WI  16
12*1cd03ba3SJeremy Kemp\n
13*1cd03ba3SJeremy Kemp
14*1cd03ba3SJeremy Kemp// Kernels fetching by local_size offset
15*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v1_local_offset(__global float *A, __global float *B)
16*1cd03ba3SJeremy Kemp{
17*1cd03ba3SJeremy Kemp    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
18*1cd03ba3SJeremy Kemp    float sum = 0;
19*1cd03ba3SJeremy Kemp
20*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
21*1cd03ba3SJeremy Kemp    {
22*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_local_size(0));
23*1cd03ba3SJeremy Kemp    }
24*1cd03ba3SJeremy Kemp
25*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = sum;
26*1cd03ba3SJeremy Kemp}
27*1cd03ba3SJeremy Kemp
28*1cd03ba3SJeremy Kemp
29*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v2_local_offset(__global float2 *A, __global float *B)
30*1cd03ba3SJeremy Kemp{
31*1cd03ba3SJeremy Kemp    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
32*1cd03ba3SJeremy Kemp    float2 sum = 0;
33*1cd03ba3SJeremy Kemp
34*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
35*1cd03ba3SJeremy Kemp    {
36*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_local_size(0));
37*1cd03ba3SJeremy Kemp    }
38*1cd03ba3SJeremy Kemp
39*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = (sum.S0) + (sum.S1);
40*1cd03ba3SJeremy Kemp}
41*1cd03ba3SJeremy Kemp
42*1cd03ba3SJeremy Kemp
43*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v4_local_offset(__global float4 *A, __global float *B)
44*1cd03ba3SJeremy Kemp{
45*1cd03ba3SJeremy Kemp    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
46*1cd03ba3SJeremy Kemp    float4 sum = 0;
47*1cd03ba3SJeremy Kemp
48*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
49*1cd03ba3SJeremy Kemp    {
50*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_local_size(0));
51*1cd03ba3SJeremy Kemp    }
52*1cd03ba3SJeremy Kemp
53*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3);
54*1cd03ba3SJeremy Kemp}
55*1cd03ba3SJeremy Kemp
56*1cd03ba3SJeremy Kemp
57*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v8_local_offset(__global float8 *A, __global float *B)
58*1cd03ba3SJeremy Kemp{
59*1cd03ba3SJeremy Kemp    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
60*1cd03ba3SJeremy Kemp    float8 sum = 0;
61*1cd03ba3SJeremy Kemp
62*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
63*1cd03ba3SJeremy Kemp    {
64*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_local_size(0));
65*1cd03ba3SJeremy Kemp    }
66*1cd03ba3SJeremy Kemp
67*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
68*1cd03ba3SJeremy Kemp}
69*1cd03ba3SJeremy Kemp
70*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v16_local_offset(__global float16 *A, __global float *B)
71*1cd03ba3SJeremy Kemp{
72*1cd03ba3SJeremy Kemp    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
73*1cd03ba3SJeremy Kemp    float16 sum = 0;
74*1cd03ba3SJeremy Kemp
75*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
76*1cd03ba3SJeremy Kemp    {
77*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_local_size(0));
78*1cd03ba3SJeremy Kemp    }
79*1cd03ba3SJeremy Kemp
80*1cd03ba3SJeremy Kemp    float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
81*1cd03ba3SJeremy Kemp    t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF);
82*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = t;
83*1cd03ba3SJeremy Kemp}
84*1cd03ba3SJeremy Kemp
85*1cd03ba3SJeremy Kemp
86*1cd03ba3SJeremy Kemp// Kernels fetching by global_size offset
87*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v1_global_offset(__global float *A, __global float *B)
88*1cd03ba3SJeremy Kemp{
89*1cd03ba3SJeremy Kemp    int id = get_global_id(0);
90*1cd03ba3SJeremy Kemp    float sum = 0;
91*1cd03ba3SJeremy Kemp
92*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
93*1cd03ba3SJeremy Kemp    {
94*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_global_size(0));
95*1cd03ba3SJeremy Kemp    }
96*1cd03ba3SJeremy Kemp
97*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = sum;
98*1cd03ba3SJeremy Kemp}
99*1cd03ba3SJeremy Kemp
100*1cd03ba3SJeremy Kemp
101*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v2_global_offset(__global float2 *A, __global float *B)
102*1cd03ba3SJeremy Kemp{
103*1cd03ba3SJeremy Kemp    int id = get_global_id(0);
104*1cd03ba3SJeremy Kemp    float2 sum = 0;
105*1cd03ba3SJeremy Kemp
106*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
107*1cd03ba3SJeremy Kemp    {
108*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_global_size(0));
109*1cd03ba3SJeremy Kemp    }
110*1cd03ba3SJeremy Kemp
111*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = (sum.S0) + (sum.S1);
112*1cd03ba3SJeremy Kemp}
113*1cd03ba3SJeremy Kemp
114*1cd03ba3SJeremy Kemp
115*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v4_global_offset(__global float4 *A, __global float *B)
116*1cd03ba3SJeremy Kemp{
117*1cd03ba3SJeremy Kemp    int id = get_global_id(0);
118*1cd03ba3SJeremy Kemp    float4 sum = 0;
119*1cd03ba3SJeremy Kemp
120*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
121*1cd03ba3SJeremy Kemp    {
122*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_global_size(0));
123*1cd03ba3SJeremy Kemp    }
124*1cd03ba3SJeremy Kemp
125*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3);
126*1cd03ba3SJeremy Kemp}
127*1cd03ba3SJeremy Kemp
128*1cd03ba3SJeremy Kemp
129*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v8_global_offset(__global float8 *A, __global float *B)
130*1cd03ba3SJeremy Kemp{
131*1cd03ba3SJeremy Kemp    int id = get_global_id(0);
132*1cd03ba3SJeremy Kemp    float8 sum = 0;
133*1cd03ba3SJeremy Kemp
134*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
135*1cd03ba3SJeremy Kemp    {
136*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_global_size(0));
137*1cd03ba3SJeremy Kemp    }
138*1cd03ba3SJeremy Kemp
139*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
140*1cd03ba3SJeremy Kemp}
141*1cd03ba3SJeremy Kemp
142*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v16_global_offset(__global float16 *A, __global float *B)
143*1cd03ba3SJeremy Kemp{
144*1cd03ba3SJeremy Kemp    int id = get_global_id(0);
145*1cd03ba3SJeremy Kemp    float16 sum = 0;
146*1cd03ba3SJeremy Kemp
147*1cd03ba3SJeremy Kemp    for(int i=0; i<4; i++)
148*1cd03ba3SJeremy Kemp    {
149*1cd03ba3SJeremy Kemp        FETCH_4(sum, id, A, get_global_size(0));
150*1cd03ba3SJeremy Kemp    }
151*1cd03ba3SJeremy Kemp
152*1cd03ba3SJeremy Kemp    float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
153*1cd03ba3SJeremy Kemp    t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF);
154*1cd03ba3SJeremy Kemp    B[get_global_id(0)] = t;
155*1cd03ba3SJeremy Kemp}
156*1cd03ba3SJeremy Kemp
157*1cd03ba3SJeremy Kemp
158*1cd03ba3SJeremy Kemp)
159