1*1cd03ba3SJeremy KempMSTRINGIFY( 2*1cd03ba3SJeremy Kemp 3*1cd03ba3SJeremy Kemp\n#undef FETCH_2 4*1cd03ba3SJeremy Kemp\n#undef FETCH_8 5*1cd03ba3SJeremy Kemp\n 6*1cd03ba3SJeremy Kemp\n#define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum += A[id]; id += jumpBy; 7*1cd03ba3SJeremy Kemp\n#define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy); FETCH_2(sum, id, A, jumpBy); 8*1cd03ba3SJeremy Kemp\n#define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy); FETCH_4(sum, id, A, jumpBy); 9*1cd03ba3SJeremy Kemp\n 10*1cd03ba3SJeremy Kemp\n 11*1cd03ba3SJeremy Kemp\n#define FETCH_PER_WI 16 12*1cd03ba3SJeremy Kemp\n 13*1cd03ba3SJeremy Kemp 14*1cd03ba3SJeremy Kemp// Kernels fetching by local_size offset 15*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v1_local_offset(__global float *A, __global float *B) 16*1cd03ba3SJeremy Kemp{ 17*1cd03ba3SJeremy Kemp int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 18*1cd03ba3SJeremy Kemp float sum = 0; 19*1cd03ba3SJeremy Kemp 20*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 21*1cd03ba3SJeremy Kemp { 22*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_local_size(0)); 23*1cd03ba3SJeremy Kemp } 24*1cd03ba3SJeremy Kemp 25*1cd03ba3SJeremy Kemp B[get_global_id(0)] = sum; 26*1cd03ba3SJeremy Kemp} 27*1cd03ba3SJeremy Kemp 28*1cd03ba3SJeremy Kemp 29*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v2_local_offset(__global float2 *A, __global float *B) 30*1cd03ba3SJeremy Kemp{ 31*1cd03ba3SJeremy Kemp int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 32*1cd03ba3SJeremy Kemp float2 sum = 0; 33*1cd03ba3SJeremy Kemp 34*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 35*1cd03ba3SJeremy Kemp { 36*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_local_size(0)); 37*1cd03ba3SJeremy Kemp } 38*1cd03ba3SJeremy Kemp 39*1cd03ba3SJeremy Kemp B[get_global_id(0)] = (sum.S0) + (sum.S1); 40*1cd03ba3SJeremy Kemp} 41*1cd03ba3SJeremy Kemp 42*1cd03ba3SJeremy Kemp 43*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v4_local_offset(__global float4 *A, __global float *B) 44*1cd03ba3SJeremy Kemp{ 45*1cd03ba3SJeremy Kemp int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 46*1cd03ba3SJeremy Kemp float4 sum = 0; 47*1cd03ba3SJeremy Kemp 48*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 49*1cd03ba3SJeremy Kemp { 50*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_local_size(0)); 51*1cd03ba3SJeremy Kemp } 52*1cd03ba3SJeremy Kemp 53*1cd03ba3SJeremy Kemp B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); 54*1cd03ba3SJeremy Kemp} 55*1cd03ba3SJeremy Kemp 56*1cd03ba3SJeremy Kemp 57*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v8_local_offset(__global float8 *A, __global float *B) 58*1cd03ba3SJeremy Kemp{ 59*1cd03ba3SJeremy Kemp int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 60*1cd03ba3SJeremy Kemp float8 sum = 0; 61*1cd03ba3SJeremy Kemp 62*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 63*1cd03ba3SJeremy Kemp { 64*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_local_size(0)); 65*1cd03ba3SJeremy Kemp } 66*1cd03ba3SJeremy Kemp 67*1cd03ba3SJeremy Kemp B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 68*1cd03ba3SJeremy Kemp} 69*1cd03ba3SJeremy Kemp 70*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v16_local_offset(__global float16 *A, __global float *B) 71*1cd03ba3SJeremy Kemp{ 72*1cd03ba3SJeremy Kemp int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 73*1cd03ba3SJeremy Kemp float16 sum = 0; 74*1cd03ba3SJeremy Kemp 75*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 76*1cd03ba3SJeremy Kemp { 77*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_local_size(0)); 78*1cd03ba3SJeremy Kemp } 79*1cd03ba3SJeremy Kemp 80*1cd03ba3SJeremy Kemp float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 81*1cd03ba3SJeremy Kemp t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); 82*1cd03ba3SJeremy Kemp B[get_global_id(0)] = t; 83*1cd03ba3SJeremy Kemp} 84*1cd03ba3SJeremy Kemp 85*1cd03ba3SJeremy Kemp 86*1cd03ba3SJeremy Kemp// Kernels fetching by global_size offset 87*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v1_global_offset(__global float *A, __global float *B) 88*1cd03ba3SJeremy Kemp{ 89*1cd03ba3SJeremy Kemp int id = get_global_id(0); 90*1cd03ba3SJeremy Kemp float sum = 0; 91*1cd03ba3SJeremy Kemp 92*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 93*1cd03ba3SJeremy Kemp { 94*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_global_size(0)); 95*1cd03ba3SJeremy Kemp } 96*1cd03ba3SJeremy Kemp 97*1cd03ba3SJeremy Kemp B[get_global_id(0)] = sum; 98*1cd03ba3SJeremy Kemp} 99*1cd03ba3SJeremy Kemp 100*1cd03ba3SJeremy Kemp 101*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v2_global_offset(__global float2 *A, __global float *B) 102*1cd03ba3SJeremy Kemp{ 103*1cd03ba3SJeremy Kemp int id = get_global_id(0); 104*1cd03ba3SJeremy Kemp float2 sum = 0; 105*1cd03ba3SJeremy Kemp 106*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 107*1cd03ba3SJeremy Kemp { 108*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_global_size(0)); 109*1cd03ba3SJeremy Kemp } 110*1cd03ba3SJeremy Kemp 111*1cd03ba3SJeremy Kemp B[get_global_id(0)] = (sum.S0) + (sum.S1); 112*1cd03ba3SJeremy Kemp} 113*1cd03ba3SJeremy Kemp 114*1cd03ba3SJeremy Kemp 115*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v4_global_offset(__global float4 *A, __global float *B) 116*1cd03ba3SJeremy Kemp{ 117*1cd03ba3SJeremy Kemp int id = get_global_id(0); 118*1cd03ba3SJeremy Kemp float4 sum = 0; 119*1cd03ba3SJeremy Kemp 120*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 121*1cd03ba3SJeremy Kemp { 122*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_global_size(0)); 123*1cd03ba3SJeremy Kemp } 124*1cd03ba3SJeremy Kemp 125*1cd03ba3SJeremy Kemp B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); 126*1cd03ba3SJeremy Kemp} 127*1cd03ba3SJeremy Kemp 128*1cd03ba3SJeremy Kemp 129*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v8_global_offset(__global float8 *A, __global float *B) 130*1cd03ba3SJeremy Kemp{ 131*1cd03ba3SJeremy Kemp int id = get_global_id(0); 132*1cd03ba3SJeremy Kemp float8 sum = 0; 133*1cd03ba3SJeremy Kemp 134*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 135*1cd03ba3SJeremy Kemp { 136*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_global_size(0)); 137*1cd03ba3SJeremy Kemp } 138*1cd03ba3SJeremy Kemp 139*1cd03ba3SJeremy Kemp B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 140*1cd03ba3SJeremy Kemp} 141*1cd03ba3SJeremy Kemp 142*1cd03ba3SJeremy Kemp__kernel void global_bandwidth_v16_global_offset(__global float16 *A, __global float *B) 143*1cd03ba3SJeremy Kemp{ 144*1cd03ba3SJeremy Kemp int id = get_global_id(0); 145*1cd03ba3SJeremy Kemp float16 sum = 0; 146*1cd03ba3SJeremy Kemp 147*1cd03ba3SJeremy Kemp for(int i=0; i<4; i++) 148*1cd03ba3SJeremy Kemp { 149*1cd03ba3SJeremy Kemp FETCH_4(sum, id, A, get_global_size(0)); 150*1cd03ba3SJeremy Kemp } 151*1cd03ba3SJeremy Kemp 152*1cd03ba3SJeremy Kemp float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 153*1cd03ba3SJeremy Kemp t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); 154*1cd03ba3SJeremy Kemp B[get_global_id(0)] = t; 155*1cd03ba3SJeremy Kemp} 156*1cd03ba3SJeremy Kemp 157*1cd03ba3SJeremy Kemp 158*1cd03ba3SJeremy Kemp) 159