xref: /aosp_15_r20/external/gemmlowp/standalone/cache_counters.cc (revision 5f39d1b313f0528e11bae88b3029b54b9e1033e7)
1*5f39d1b3SJooyung Han #include <asm/unistd.h>
2*5f39d1b3SJooyung Han #include <linux/perf_event.h>
3*5f39d1b3SJooyung Han #include <sys/ioctl.h>
4*5f39d1b3SJooyung Han #include <unistd.h>
5*5f39d1b3SJooyung Han #include <algorithm>
6*5f39d1b3SJooyung Han #include <cstdint>
7*5f39d1b3SJooyung Han #include <cstdio>
8*5f39d1b3SJooyung Han #include <cstdlib>
9*5f39d1b3SJooyung Han #include <cstring>
10*5f39d1b3SJooyung Han #include <random>
11*5f39d1b3SJooyung Han 
12*5f39d1b3SJooyung Han #ifndef __aarch64__
13*5f39d1b3SJooyung Han #error This program is for 64-bit ARM only.
14*5f39d1b3SJooyung Han #endif
15*5f39d1b3SJooyung Han 
// Owns one Linux perf_event counter, identified by its file descriptor.
//
// The counter is opened disabled, with kernel and hypervisor activity
// excluded, and the descriptor is closed when the object is destroyed.
// Failures while opening, controlling or reading the counter abort the
// program: a benchmark run with a broken counter would only print
// meaningless numbers.
struct PerfEvent {
  perf_event_attr pe;
  int fd = -1;

  // Opens a counter for the perf event (`type`, `config`).
  // Aborts if the kernel refuses to open the event.
  PerfEvent(std::uint32_t type, std::uint64_t config) {
    std::memset(&pe, 0, sizeof(pe));
    pe.size = sizeof(pe);
    pe.type = type;
    pe.config = config;
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;
    // Arguments after &pe are pid = 0, cpu = -1, group_fd = -1, flags = 0.
    // Per the perf_event_open man page, pid 0 with cpu -1 measures the
    // calling thread on whichever CPU it runs.
    fd = static_cast<int>(syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0));
    if (fd == -1) {
      // perror first: fprintf may overwrite errno.
      std::perror("perf_event_open");
      std::fprintf(stderr, "perf_event_open failed for config 0x%llx\n",
                   static_cast<unsigned long long>(config));
      std::abort();
    }
  }

  // The object owns `fd`; a copy would close the same descriptor twice.
  PerfEvent(const PerfEvent&) = delete;
  PerfEvent& operator=(const PerfEvent&) = delete;

  // Resets the counter to zero and starts counting.
  void Start() {
    CheckOk(ioctl(fd, PERF_EVENT_IOC_RESET, 0) != -1,
            "ioctl(PERF_EVENT_IOC_RESET)");
    CheckOk(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != -1,
            "ioctl(PERF_EVENT_IOC_ENABLE)");
  }

  // Stops counting and returns the value read from the counter.
  std::int64_t Stop() {
    CheckOk(ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) != -1,
            "ioctl(PERF_EVENT_IOC_DISABLE)");
    std::int64_t count = 0;
    const auto bytes_read = read(fd, &count, sizeof(count));
    if (bytes_read == -1) {
      std::perror("read(perf counter)");
      std::abort();
    }
    if (static_cast<std::size_t>(bytes_read) != sizeof(count)) {
      std::fprintf(stderr, "short read from perf counter\n");
      std::abort();
    }
    return count;
  }

  ~PerfEvent() {
    if (fd != -1) {
      close(fd);
    }
  }

  // Aborts with the current errno description when `ok` is false.
  static void CheckOk(bool ok, const char* what) {
    if (!ok) {
      std::perror(what);
      std::abort();
    }
  }
};
49*5f39d1b3SJooyung Han 
50*5f39d1b3SJooyung Han struct ArmPmuEvent : PerfEvent {
51*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1I_CACHE_REFILL = 0x01;
52*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1I_TLB_REFILL = 0x02;
53*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_CACHE_REFILL = 0x03;
54*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_CACHE = 0x04;
55*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_TLB_REFILL = 0x05;
56*5f39d1b3SJooyung Han   static constexpr std::uint16_t LD_RETIRED = 0x06;
57*5f39d1b3SJooyung Han   static constexpr std::uint16_t ST_RETIRED = 0x07;
58*5f39d1b3SJooyung Han   static constexpr std::uint16_t INST_RETIRED = 0x08;
59*5f39d1b3SJooyung Han   static constexpr std::uint16_t EXC_TAKEN = 0x09;
60*5f39d1b3SJooyung Han   static constexpr std::uint16_t EXC_RETURN = 0x0A;
61*5f39d1b3SJooyung Han   static constexpr std::uint16_t CID_WRITE_RETIRED = 0x0B;
62*5f39d1b3SJooyung Han   static constexpr std::uint16_t PC_WRITE_RETIRED = 0x0C;
63*5f39d1b3SJooyung Han   static constexpr std::uint16_t BR_IMMED_RETIRED = 0x0D;
64*5f39d1b3SJooyung Han   static constexpr std::uint16_t BR_RETURN_RETIRED = 0x0E;
65*5f39d1b3SJooyung Han   static constexpr std::uint16_t UNALIGNED_LDST_RETIRED = 0x0F;
66*5f39d1b3SJooyung Han   static constexpr std::uint16_t BR_MIS_PRED = 0x10;
67*5f39d1b3SJooyung Han   static constexpr std::uint16_t CPU_CYCLES = 0x11;
68*5f39d1b3SJooyung Han   static constexpr std::uint16_t BR_PRED = 0x12;
69*5f39d1b3SJooyung Han   static constexpr std::uint16_t MEM_ACCESS = 0x13;
70*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1I_CACHE = 0x14;
71*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_CACHE_WB = 0x15;
72*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_CACHE = 0x16;
73*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_CACHE_REFILL = 0x17;
74*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_CACHE_WB = 0x18;
75*5f39d1b3SJooyung Han   static constexpr std::uint16_t BUS_ACCESS = 0x19;
76*5f39d1b3SJooyung Han   static constexpr std::uint16_t MEMORY_ERROR = 0x1A;
77*5f39d1b3SJooyung Han   static constexpr std::uint16_t INST_SPEC = 0x1B;
78*5f39d1b3SJooyung Han   static constexpr std::uint16_t TTBR_WRITE_RETIRED = 0x1C;
79*5f39d1b3SJooyung Han   static constexpr std::uint16_t BUS_CYCLES = 0x1D;
80*5f39d1b3SJooyung Han   static constexpr std::uint16_t CHAIN = 0x1E;
81*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_CACHE_ALLOCATE = 0x1F;
82*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_CACHE_ALLOCATE = 0x20;
83*5f39d1b3SJooyung Han   static constexpr std::uint16_t BR_RETIRED = 0x21;
84*5f39d1b3SJooyung Han   static constexpr std::uint16_t BR_MIS_PRED_RETIRED = 0x22;
85*5f39d1b3SJooyung Han   static constexpr std::uint16_t STALL_FRONTEND = 0x23;
86*5f39d1b3SJooyung Han   static constexpr std::uint16_t STALL_BACKEND = 0x24;
87*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_TLB = 0x25;
88*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1I_TLB = 0x26;
89*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2I_CACHE = 0x27;
90*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2I_CACHE_REFILL = 0x28;
91*5f39d1b3SJooyung Han   static constexpr std::uint16_t L3D_CACHE_ALLOCATE = 0x29;
92*5f39d1b3SJooyung Han   static constexpr std::uint16_t L3D_CACHE_REFILL = 0x2A;
93*5f39d1b3SJooyung Han   static constexpr std::uint16_t L3D_CACHE = 0x2B;
94*5f39d1b3SJooyung Han   static constexpr std::uint16_t L3D_CACHE_WB = 0x2C;
95*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_TLB_REFILL = 0x2D;
96*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2I_TLB_REFILL = 0x2E;
97*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_TLB = 0x2F;
98*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2I_TLB = 0x30;
99*5f39d1b3SJooyung Han   static constexpr std::uint16_t LL_CACHE = 0x32;
100*5f39d1b3SJooyung Han   static constexpr std::uint16_t LL_CACHE_MISS = 0x33;
101*5f39d1b3SJooyung Han   static constexpr std::uint16_t DTLB_WALK = 0x34;
102*5f39d1b3SJooyung Han   static constexpr std::uint16_t LL_CACHE_RD = 0x36;
103*5f39d1b3SJooyung Han   static constexpr std::uint16_t LL_CACHE_MISS_RD = 0x37;
104*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_CACHE_RD = 0x40;
105*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_CACHE_REFILL_RD = 0x42;
106*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_TLB_REFILL_RD = 0x4C;
107*5f39d1b3SJooyung Han   static constexpr std::uint16_t L1D_TLB_RD = 0x4E;
108*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_CACHE_RD = 0x50;
109*5f39d1b3SJooyung Han   static constexpr std::uint16_t L2D_CACHE_REFILL_RD = 0x52;
110*5f39d1b3SJooyung Han   static constexpr std::uint16_t BUS_ACCESS_RD = 0x60;
111*5f39d1b3SJooyung Han   static constexpr std::uint16_t MEM_ACCESS_RD = 0x66;
112*5f39d1b3SJooyung Han   static constexpr std::uint16_t L3D_CACHE_RD = 0xA0;
113*5f39d1b3SJooyung Han   static constexpr std::uint16_t L3D_CACHE_REFILL_RD = 0xA2;
ArmPmuEventArmPmuEvent114*5f39d1b3SJooyung Han   ArmPmuEvent(std::uint16_t number) : PerfEvent(PERF_TYPE_RAW, number) {}
115*5f39d1b3SJooyung Han };
116*5f39d1b3SJooyung Han 
// Raw counter readings gathered for one workload run. Each field holds the
// value of the PMU event it is named after (or of that event's read-only
// variant, depending on how the measurement was configured). All fields start
// at zero.
struct CacheCounts {
  // Retired loads and memory accesses.
  int ld_retired = 0;
  int mem_access = 0;
  // Last-level cache accesses and misses.
  int ll_cache = 0;
  int ll_cache_miss = 0;
  // Per-level data cache accesses and refills.
  int l1d_cache = 0;
  int l1d_cache_refill = 0;
  int l2d_cache = 0;
  int l2d_cache_refill = 0;
  int l3d_cache = 0;
  int l3d_cache_refill = 0;
};
129*5f39d1b3SJooyung Han 
PrintCacheCounts(const CacheCounts & cache_counts)130*5f39d1b3SJooyung Han void PrintCacheCounts(const CacheCounts& cache_counts) {
131*5f39d1b3SJooyung Han   printf("ld_retired = %d\n", cache_counts.ld_retired);
132*5f39d1b3SJooyung Han   printf("mem_access = %d\n", cache_counts.mem_access);
133*5f39d1b3SJooyung Han   printf("ll_cache = %d\n", cache_counts.ll_cache);
134*5f39d1b3SJooyung Han   printf("ll_cache_miss = %d\n", cache_counts.ll_cache_miss);
135*5f39d1b3SJooyung Han   printf("l1d_cache = %d\n", cache_counts.l1d_cache);
136*5f39d1b3SJooyung Han   printf("l1d_cache_refill = %d\n", cache_counts.l1d_cache_refill);
137*5f39d1b3SJooyung Han   printf("l2d_cache = %d\n", cache_counts.l2d_cache);
138*5f39d1b3SJooyung Han   printf("l2d_cache_refill = %d\n", cache_counts.l2d_cache_refill);
139*5f39d1b3SJooyung Han   printf("l3d_cache = %d\n", cache_counts.l3d_cache);
140*5f39d1b3SJooyung Han   printf("l3d_cache_refill = %d\n", cache_counts.l3d_cache_refill);
141*5f39d1b3SJooyung Han }
142*5f39d1b3SJooyung Han 
Workload(int accesses,int size,std::uint8_t * buf)143*5f39d1b3SJooyung Han void Workload(int accesses, int size, std::uint8_t* buf) {
144*5f39d1b3SJooyung Han   // The main reason to do this in assembly is an attempt to make sense
145*5f39d1b3SJooyung Han   // of instruction count counters, such as LD_RETIRED.
146*5f39d1b3SJooyung Han   // Also, if we did this in C++, we would need to be watchful of the compiler
147*5f39d1b3SJooyung Han   // optimizing away operations whose result isn't consumed.
148*5f39d1b3SJooyung Han   //
149*5f39d1b3SJooyung Han   // Note that TWO separate tricks are needed here to prevent Cortex-A76
150*5f39d1b3SJooyung Han   // speculative execution om prefetching data from future loop iterations:
151*5f39d1b3SJooyung Han   //   1. A data-dependency whereby the pointers being dereferenced at the
152*5f39d1b3SJooyung Han   //      next loop iteration depend on values loaded at the current iteration.
153*5f39d1b3SJooyung Han   //      That is the role of 'dummy'.
154*5f39d1b3SJooyung Han   //   2. A pseudo-random sequence. This is the role of register w0,
155*5f39d1b3SJooyung Han   //      where we implement a simple xorshift pseudorandom generator.
156*5f39d1b3SJooyung Han   // BOTH of these tricks are needed: if we disable just one of them,
157*5f39d1b3SJooyung Han   // Cortex-A76 successfully speculates some addresses, resulting in different
158*5f39d1b3SJooyung Han   // L3 / DRAM hit percentages on large sizes.
159*5f39d1b3SJooyung Han   std::uint64_t dummy = 123456789;
160*5f39d1b3SJooyung Han   asm volatile(
161*5f39d1b3SJooyung Han       // w0 := xorshift RNG state. Must be nonzero.
162*5f39d1b3SJooyung Han       "mov w0, #1\n"
163*5f39d1b3SJooyung Han       "1:\n"
164*5f39d1b3SJooyung Han       // xorshift RNG iteration: update w0 with the next pseudorandom value
165*5f39d1b3SJooyung Han       // in [1 .. 2^32-1].
166*5f39d1b3SJooyung Han       // This pseudorandomness is crucial to preventing speculative execution
167*5f39d1b3SJooyung Han       // on Cortex-A76 from prefetching data from future loop iterations.
168*5f39d1b3SJooyung Han       "eor w0, w0, w0, lsl #13\n"
169*5f39d1b3SJooyung Han       "eor w0, w0, w0, lsr #17\n"
170*5f39d1b3SJooyung Han       "eor w0, w0, w0, lsl #5\n"
171*5f39d1b3SJooyung Han       // w1 := size - 1 = size mask (size is required to be power-of-two).
172*5f39d1b3SJooyung Han       "sub w1, %w[size], #1\n"
173*5f39d1b3SJooyung Han       // w2 := (pseudorandom value w0) xor (data-dependent sum).
174*5f39d1b3SJooyung Han       "eor w2, w0, %w[dummy]\n"
175*5f39d1b3SJooyung Han       // w1 := w2 modulo size
176*5f39d1b3SJooyung Han       "and w1, w2, w1\n"
177*5f39d1b3SJooyung Han       // align w1
178*5f39d1b3SJooyung Han       "and w1, w1, #-64\n"
179*5f39d1b3SJooyung Han       // load at offset w1, again using x1 as destination.
180*5f39d1b3SJooyung Han       "ldr x1, [%[buf], w1, uxtw]\n"
181*5f39d1b3SJooyung Han       // Update our dummy so it depends on the value we have just loaded.
182*5f39d1b3SJooyung Han       // This data-dependency is key to preventing speculative execution on
183*5f39d1b3SJooyung Han       // Cortex-A76 from prefetching data from future loop iterations.
184*5f39d1b3SJooyung Han       "add %[dummy], %[dummy], w1, uxtw\n"
185*5f39d1b3SJooyung Han       // loop back.
186*5f39d1b3SJooyung Han       "subs %w[accesses], %w[accesses], #1\n"
187*5f39d1b3SJooyung Han       "bne 1b\n"
188*5f39d1b3SJooyung Han       : [ accesses ] "+r"(accesses), [ dummy ] "+r"(dummy)
189*5f39d1b3SJooyung Han       : [ size ] "r"(size), [ buf ] "r"(buf)
190*5f39d1b3SJooyung Han       : "memory", "cc", "x0", "x1", "x2");
191*5f39d1b3SJooyung Han }
192*5f39d1b3SJooyung Han 
MeasureCacheCounts(int accesses,int size,std::uint8_t * buf,CacheCounts * cache_counts)193*5f39d1b3SJooyung Han void MeasureCacheCounts(int accesses, int size, std::uint8_t* buf,
194*5f39d1b3SJooyung Han                         CacheCounts* cache_counts) {
195*5f39d1b3SJooyung Han   const bool only_reads = getenv("ONLY_READS");
196*5f39d1b3SJooyung Han   ArmPmuEvent ld_retired(ArmPmuEvent::LD_RETIRED);
197*5f39d1b3SJooyung Han   ArmPmuEvent mem_access(only_reads ? ArmPmuEvent::MEM_ACCESS_RD
198*5f39d1b3SJooyung Han                                     : ArmPmuEvent::MEM_ACCESS);
199*5f39d1b3SJooyung Han   ArmPmuEvent ll_cache(only_reads ? ArmPmuEvent::LL_CACHE_RD
200*5f39d1b3SJooyung Han                                   : ArmPmuEvent::LL_CACHE);
201*5f39d1b3SJooyung Han   ArmPmuEvent ll_cache_miss(only_reads ? ArmPmuEvent::LL_CACHE_MISS_RD
202*5f39d1b3SJooyung Han                                        : ArmPmuEvent::LL_CACHE_MISS);
203*5f39d1b3SJooyung Han   ArmPmuEvent l1d_cache(only_reads ? ArmPmuEvent::L1D_CACHE_RD
204*5f39d1b3SJooyung Han                                    : ArmPmuEvent::L1D_CACHE);
205*5f39d1b3SJooyung Han   ArmPmuEvent l1d_cache_refill(only_reads ? ArmPmuEvent::L1D_CACHE_REFILL_RD
206*5f39d1b3SJooyung Han                                           : ArmPmuEvent::L1D_CACHE_REFILL);
207*5f39d1b3SJooyung Han   ArmPmuEvent l2d_cache(only_reads ? ArmPmuEvent::L2D_CACHE_RD
208*5f39d1b3SJooyung Han                                    : ArmPmuEvent::L2D_CACHE);
209*5f39d1b3SJooyung Han   ArmPmuEvent l2d_cache_refill(only_reads ? ArmPmuEvent::L2D_CACHE_REFILL_RD
210*5f39d1b3SJooyung Han                                           : ArmPmuEvent::L2D_CACHE_REFILL);
211*5f39d1b3SJooyung Han   ArmPmuEvent l3d_cache(only_reads ? ArmPmuEvent::L3D_CACHE_RD
212*5f39d1b3SJooyung Han                                    : ArmPmuEvent::L3D_CACHE);
213*5f39d1b3SJooyung Han   ArmPmuEvent l3d_cache_refill(only_reads ? ArmPmuEvent::L3D_CACHE_REFILL_RD
214*5f39d1b3SJooyung Han                                           : ArmPmuEvent::L3D_CACHE_REFILL);
215*5f39d1b3SJooyung Han 
216*5f39d1b3SJooyung Han   ld_retired.Start();
217*5f39d1b3SJooyung Han   mem_access.Start();
218*5f39d1b3SJooyung Han   ll_cache.Start();
219*5f39d1b3SJooyung Han   ll_cache_miss.Start();
220*5f39d1b3SJooyung Han   l1d_cache.Start();
221*5f39d1b3SJooyung Han   l1d_cache_refill.Start();
222*5f39d1b3SJooyung Han   l2d_cache.Start();
223*5f39d1b3SJooyung Han   l2d_cache_refill.Start();
224*5f39d1b3SJooyung Han   l3d_cache.Start();
225*5f39d1b3SJooyung Han   l3d_cache_refill.Start();
226*5f39d1b3SJooyung Han 
227*5f39d1b3SJooyung Han   Workload(accesses, size, buf);
228*5f39d1b3SJooyung Han 
229*5f39d1b3SJooyung Han   cache_counts->ld_retired = ld_retired.Stop();
230*5f39d1b3SJooyung Han   cache_counts->mem_access = mem_access.Stop();
231*5f39d1b3SJooyung Han   cache_counts->ll_cache = ll_cache.Stop();
232*5f39d1b3SJooyung Han   cache_counts->ll_cache_miss = ll_cache_miss.Stop();
233*5f39d1b3SJooyung Han   cache_counts->l1d_cache = l1d_cache.Stop();
234*5f39d1b3SJooyung Han   cache_counts->l1d_cache_refill = l1d_cache_refill.Stop();
235*5f39d1b3SJooyung Han   cache_counts->l2d_cache = l2d_cache.Stop();
236*5f39d1b3SJooyung Han   cache_counts->l2d_cache_refill = l2d_cache_refill.Stop();
237*5f39d1b3SJooyung Han   cache_counts->l3d_cache = l3d_cache.Stop();
238*5f39d1b3SJooyung Han   cache_counts->l3d_cache_refill = l3d_cache_refill.Stop();
239*5f39d1b3SJooyung Han }
240*5f39d1b3SJooyung Han 
// Breakdown of memory accesses by the level that serviced them.
//
// Invariant, enforced on destruction: `total` equals the sum of the four
// other fields; the program aborts otherwise. Fields start at zero so a chart
// that was never filled in is trivially consistent instead of reading
// indeterminate values in the destructor.
struct PieChart {
  // How many accesses were recorded, total? The other fields must sum to that.
  int total = 0;
  // How many accesses were serviced with the typical cost of a L1 cache hit?
  int l1_hits = 0;
  // How many accesses were serviced with the typical cost of a L2 cache hit?
  int l2_hits = 0;
  // How many accesses were serviced with the typical cost of a L3 cache hit?
  int l3_hits = 0;
  // How many accesses were serviced with the typical cost of a DRAM access?
  int dram_hits = 0;

  ~PieChart() {
    // Consistency check
    const int parts = l1_hits + l2_hits + l3_hits + dram_hits;
    if (total != parts) {
      std::fprintf(stderr, "inconsistent pie-chart\n");
      std::abort();
    }
  }
};
261*5f39d1b3SJooyung Han 
262*5f39d1b3SJooyung Han struct Hypothesis {
~HypothesisHypothesis263*5f39d1b3SJooyung Han   virtual ~Hypothesis() {}
264*5f39d1b3SJooyung Han   virtual const char* Name() const = 0;
265*5f39d1b3SJooyung Han   virtual void Analyze(const CacheCounts& cache_counts,
266*5f39d1b3SJooyung Han                        PieChart* pie) const = 0;
267*5f39d1b3SJooyung Han };
268*5f39d1b3SJooyung Han 
269*5f39d1b3SJooyung Han struct Hypothesis1 : Hypothesis {
NameHypothesis1270*5f39d1b3SJooyung Han   const char* Name() const override { return "Hypothesis1"; }
AnalyzeHypothesis1271*5f39d1b3SJooyung Han   void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
272*5f39d1b3SJooyung Han     pie->total = cache_counts.l1d_cache + cache_counts.l1d_cache_refill;
273*5f39d1b3SJooyung Han     pie->l1_hits = cache_counts.l1d_cache - cache_counts.l2d_cache_refill -
274*5f39d1b3SJooyung Han                    cache_counts.l3d_cache_refill;
275*5f39d1b3SJooyung Han     pie->l2_hits = cache_counts.l1d_cache_refill;
276*5f39d1b3SJooyung Han     pie->l3_hits = cache_counts.l2d_cache_refill;
277*5f39d1b3SJooyung Han     pie->dram_hits = cache_counts.l3d_cache_refill;
278*5f39d1b3SJooyung Han   }
279*5f39d1b3SJooyung Han };
280*5f39d1b3SJooyung Han 
281*5f39d1b3SJooyung Han struct Hypothesis2 : Hypothesis {
NameHypothesis2282*5f39d1b3SJooyung Han   const char* Name() const override { return "Hypothesis2"; }
AnalyzeHypothesis2283*5f39d1b3SJooyung Han   void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
284*5f39d1b3SJooyung Han     pie->total = cache_counts.l1d_cache;
285*5f39d1b3SJooyung Han     pie->l1_hits = cache_counts.l1d_cache - cache_counts.l2d_cache;
286*5f39d1b3SJooyung Han     pie->l2_hits = cache_counts.l2d_cache - cache_counts.l3d_cache;
287*5f39d1b3SJooyung Han     pie->l3_hits = cache_counts.l3d_cache - cache_counts.l3d_cache_refill;
288*5f39d1b3SJooyung Han     pie->dram_hits = cache_counts.l3d_cache_refill;
289*5f39d1b3SJooyung Han   }
290*5f39d1b3SJooyung Han };
291*5f39d1b3SJooyung Han 
292*5f39d1b3SJooyung Han struct Hypothesis3 : Hypothesis {
NameHypothesis3293*5f39d1b3SJooyung Han   const char* Name() const override { return "Hypothesis3"; }
AnalyzeHypothesis3294*5f39d1b3SJooyung Han   void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
295*5f39d1b3SJooyung Han     pie->total = cache_counts.l1d_cache;
296*5f39d1b3SJooyung Han     int corrected_l2 = std::min(cache_counts.l2d_cache, cache_counts.l1d_cache);
297*5f39d1b3SJooyung Han     int corrected_l3 = std::min(cache_counts.l3d_cache, corrected_l2);
298*5f39d1b3SJooyung Han     pie->l1_hits = cache_counts.l1d_cache - corrected_l2;
299*5f39d1b3SJooyung Han     pie->l2_hits = corrected_l2 - corrected_l3;
300*5f39d1b3SJooyung Han     pie->l3_hits = corrected_l3 - cache_counts.l3d_cache_refill;
301*5f39d1b3SJooyung Han     pie->dram_hits = cache_counts.l3d_cache_refill;
302*5f39d1b3SJooyung Han   }
303*5f39d1b3SJooyung Han };
304*5f39d1b3SJooyung Han 
305*5f39d1b3SJooyung Han struct Hypothesis4 : Hypothesis {
NameHypothesis4306*5f39d1b3SJooyung Han   const char* Name() const override { return "Hypothesis4"; }
AnalyzeHypothesis4307*5f39d1b3SJooyung Han   void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
308*5f39d1b3SJooyung Han     pie->total = cache_counts.l1d_cache;
309*5f39d1b3SJooyung Han     pie->l1_hits = cache_counts.l1d_cache - cache_counts.l1d_cache_refill;
310*5f39d1b3SJooyung Han     pie->l2_hits =
311*5f39d1b3SJooyung Han         cache_counts.l1d_cache_refill - cache_counts.l2d_cache_refill;
312*5f39d1b3SJooyung Han     pie->l3_hits =
313*5f39d1b3SJooyung Han         cache_counts.l2d_cache_refill - cache_counts.l3d_cache_refill;
314*5f39d1b3SJooyung Han     pie->dram_hits = cache_counts.l3d_cache_refill;
315*5f39d1b3SJooyung Han   }
316*5f39d1b3SJooyung Han };
317*5f39d1b3SJooyung Han 
318*5f39d1b3SJooyung Han struct Hypothesis5 : Hypothesis {
NameHypothesis5319*5f39d1b3SJooyung Han   const char* Name() const override { return "Hypothesis5"; }
AnalyzeHypothesis5320*5f39d1b3SJooyung Han   void Analyze(const CacheCounts& cache_counts, PieChart* pie) const override {
321*5f39d1b3SJooyung Han     pie->l1_hits =
322*5f39d1b3SJooyung Han         std::max(0, cache_counts.l1d_cache - cache_counts.l1d_cache_refill);
323*5f39d1b3SJooyung Han     pie->l2_hits = std::max(
324*5f39d1b3SJooyung Han         0, cache_counts.l1d_cache_refill - cache_counts.l2d_cache_refill);
325*5f39d1b3SJooyung Han     const int l3_misses =
326*5f39d1b3SJooyung Han         std::max(cache_counts.ll_cache_miss, cache_counts.l3d_cache_refill);
327*5f39d1b3SJooyung Han     pie->l3_hits = std::max(0, cache_counts.l2d_cache_refill - l3_misses);
328*5f39d1b3SJooyung Han     pie->dram_hits = l3_misses;
329*5f39d1b3SJooyung Han     pie->total = pie->l1_hits + pie->l2_hits + pie->l3_hits + pie->dram_hits;
330*5f39d1b3SJooyung Han   }
331*5f39d1b3SJooyung Han };
332*5f39d1b3SJooyung Han 
PrintPieChart(const PieChart & pie)333*5f39d1b3SJooyung Han void PrintPieChart(const PieChart& pie) {
334*5f39d1b3SJooyung Han   printf("total accesses: %d\n", pie.total);
335*5f39d1b3SJooyung Han   double l1_hits_pct = 100. * pie.l1_hits / pie.total;
336*5f39d1b3SJooyung Han   double l2_hits_pct = 100. * pie.l2_hits / pie.total;
337*5f39d1b3SJooyung Han   double l3_hits_pct = 100. * pie.l3_hits / pie.total;
338*5f39d1b3SJooyung Han   double dram_hits_pct = 100. * pie.dram_hits / pie.total;
339*5f39d1b3SJooyung Han   printf("L1 hits: %.2f%%\n", l1_hits_pct);
340*5f39d1b3SJooyung Han   printf("L2 hits: %.2f%%\n", l2_hits_pct);
341*5f39d1b3SJooyung Han   printf("L1/2 hits: %.2f%%\n", l1_hits_pct + l2_hits_pct);
342*5f39d1b3SJooyung Han   printf("L3 hits: %.2f%%\n", l3_hits_pct);
343*5f39d1b3SJooyung Han   printf("L1/2/3 hits: %.2f%%\n", l1_hits_pct + l2_hits_pct + l3_hits_pct);
344*5f39d1b3SJooyung Han   printf("DRAM hits: %.2f%%\n", dram_hits_pct);
345*5f39d1b3SJooyung Han }
346*5f39d1b3SJooyung Han 
PrintPieChartCsvNoNewline(const PieChart & pie)347*5f39d1b3SJooyung Han void PrintPieChartCsvNoNewline(const PieChart& pie) {
348*5f39d1b3SJooyung Han   double l1_hits_pct = 100. * pie.l1_hits / pie.total;
349*5f39d1b3SJooyung Han   double l2_hits_pct = 100. * pie.l2_hits / pie.total;
350*5f39d1b3SJooyung Han   double l3_hits_pct = 100. * pie.l3_hits / pie.total;
351*5f39d1b3SJooyung Han   double dram_hits_pct = 100. * pie.dram_hits / pie.total;
352*5f39d1b3SJooyung Han   printf("%.2f,%.2f,%.2f,%.2f", l1_hits_pct, l2_hits_pct, l3_hits_pct,
353*5f39d1b3SJooyung Han          dram_hits_pct);
354*5f39d1b3SJooyung Han }
355*5f39d1b3SJooyung Han 
Study(int accesses,int size,std::uint8_t * buf)356*5f39d1b3SJooyung Han void Study(int accesses, int size, std::uint8_t* buf) {
357*5f39d1b3SJooyung Han   CacheCounts cache_counts;
358*5f39d1b3SJooyung Han   MeasureCacheCounts(accesses, size, buf, &cache_counts);
359*5f39d1b3SJooyung Han   const Hypothesis* hypotheses[] = {
360*5f39d1b3SJooyung Han       new Hypothesis5, new Hypothesis4, new Hypothesis3,
361*5f39d1b3SJooyung Han       new Hypothesis2, new Hypothesis1,
362*5f39d1b3SJooyung Han   };
363*5f39d1b3SJooyung Han   if (getenv("DUMP_CSV")) {
364*5f39d1b3SJooyung Han     printf("%d", size);
365*5f39d1b3SJooyung Han     for (const Hypothesis* hypothesis : hypotheses) {
366*5f39d1b3SJooyung Han       printf(",");
367*5f39d1b3SJooyung Han       PieChart pie;
368*5f39d1b3SJooyung Han       hypothesis->Analyze(cache_counts, &pie);
369*5f39d1b3SJooyung Han       PrintPieChartCsvNoNewline(pie);
370*5f39d1b3SJooyung Han     }
371*5f39d1b3SJooyung Han     printf("\n");
372*5f39d1b3SJooyung Han   } else {
373*5f39d1b3SJooyung Han     printf("\n\n\naccesses=%d, size=%d:\n", accesses, size);
374*5f39d1b3SJooyung Han     printf("\nCache counts:\n");
375*5f39d1b3SJooyung Han     PrintCacheCounts(cache_counts);
376*5f39d1b3SJooyung Han     for (const Hypothesis* hypothesis : hypotheses) {
377*5f39d1b3SJooyung Han       printf("\n%s:\n", hypothesis->Name());
378*5f39d1b3SJooyung Han       PieChart pie;
379*5f39d1b3SJooyung Han       hypothesis->Analyze(cache_counts, &pie);
380*5f39d1b3SJooyung Han       PrintPieChart(pie);
381*5f39d1b3SJooyung Han     }
382*5f39d1b3SJooyung Han   }
383*5f39d1b3SJooyung Han   fflush(stdout);
384*5f39d1b3SJooyung Han   for (const Hypothesis* hypothesis : hypotheses) {
385*5f39d1b3SJooyung Han     delete hypothesis;
386*5f39d1b3SJooyung Han   }
387*5f39d1b3SJooyung Han }
388*5f39d1b3SJooyung Han 
main()389*5f39d1b3SJooyung Han int main() {
390*5f39d1b3SJooyung Han   const int kMinSize = 1 << 12;
391*5f39d1b3SJooyung Han   const int kMaxSize = 1 << 24;
392*5f39d1b3SJooyung Han   const int kAccesses = 1e8;
393*5f39d1b3SJooyung Han   void* buf_void = nullptr;
394*5f39d1b3SJooyung Han   posix_memalign(&buf_void, 64, kMaxSize);
395*5f39d1b3SJooyung Han   std::uint8_t* buf = static_cast<std::uint8_t*>(buf_void);
396*5f39d1b3SJooyung Han   std::default_random_engine random_engine;
397*5f39d1b3SJooyung Han   for (int i = 0; i < kMaxSize; i++) {
398*5f39d1b3SJooyung Han     buf[i] = random_engine();
399*5f39d1b3SJooyung Han   }
400*5f39d1b3SJooyung Han   for (int size = kMinSize; size <= kMaxSize; size *= 2) {
401*5f39d1b3SJooyung Han     Study(kAccesses, size, buf);
402*5f39d1b3SJooyung Han   }
403*5f39d1b3SJooyung Han   delete[] buf;
404*5f39d1b3SJooyung Han }
405