xref: /aosp_15_r20/external/coreboot/src/vendorcode/cavium/bdk/libbdk-dram/bdk-dram-test.c (revision b9411a12aaaa7e1e6a6fb7c5e057f44ee179a49c)
1 /***********************license start***********************************
2 * Copyright (c) 2003-2017  Cavium Inc. ([email protected]). All rights
3 * reserved.
4 *
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are
8 * met:
9 *
10 *   * Redistributions of source code must retain the above copyright
11 *     notice, this list of conditions and the following disclaimer.
12 *
13 *   * Redistributions in binary form must reproduce the above
14 *     copyright notice, this list of conditions and the following
15 *     disclaimer in the documentation and/or other materials provided
16 *     with the distribution.
17 *
18 *   * Neither the name of Cavium Inc. nor the names of
19 *     its contributors may be used to endorse or promote products
20 *     derived from this software without specific prior written
21 *     permission.
22 *
23 * This Software, including technical data, may be subject to U.S. export
24 * control laws, including the U.S. Export Administration Act and its
25 * associated regulations, and may be subject to export or import
26 * regulations in other countries.
27 *
28 * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
29 * AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR
30 * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT
31 * TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY
32 * REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT
33 * DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES
34 * OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR
35 * PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT,
36 * QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK
37 * ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
38 ***********************license end**************************************/
39 #include <bdk.h>
40 #include <string.h>
41 #include "libbdk-arch/bdk-csrs-gti.h"
42 #include "libbdk-arch/bdk-csrs-ocx.h"
43 
44 #include <bdk-minimal.h>        /* for printf --> printk */
45 #include <libbdk-dram/bdk-dram-test.h>
46 #include <libbdk-hal/bdk-atomic.h>
47 #include <libbdk-hal/bdk-clock.h>
48 #include <libbdk-hal/bdk-utils.h>
49 #include <libbdk-os/bdk-init.h>
50 #include <libbdk-os/bdk-thread.h>
51 #include <stdio.h>
52 
/* This code is an optional part of the BDK. It is only linked in
    if BDK_REQUIRE() needs it */
BDK_REQUIRE_DEFINE(DRAM_TEST);

/* Upper bound on the number of DRAM errors that are printed. The error
   counter keeps incrementing past this value, but the report functions
   stay silent once it has been reached */
#define MAX_ERRORS_TO_REPORT 50
/* Number of times a failing location is re-read by the retry helpers to
   tell transient failures from permanent ones */
#define RETRY_LIMIT 1000
59 
60 typedef struct
61 {
62     const char *        name;       /* Friendly name for the test */
63     __bdk_dram_test_t   test_func;  /* Function to call */
64     int                 bursts;     /* Bursts parameter to pass to the test */
65     int                 max_cores;  /* Maximum number of cores the test should be run on in parallel. Zero means all */
66 } dram_test_info_t;
67 
68 static const dram_test_info_t TEST_INFO[] = {
69     /* Name,                    Test function,                      Bursts, Max Cores */
70     { "Data Bus",               __bdk_dram_test_mem_data_bus,       8,      1},
71     { "Address Bus",            __bdk_dram_test_mem_address_bus,    0,      1},
72     { "Marching Rows",          __bdk_dram_test_mem_rows,           16,     0},
73     { "Random Data",            __bdk_dram_test_mem_random,         32,     0},
74     { "Random XOR (32 Burst)",  __bdk_dram_test_mem_xor,            32,     0},
75     { "Self Address",           __bdk_dram_test_mem_self_addr,      1,      0},
76     { "March C- Solid Bits",    __bdk_dram_test_mem_solid,          1,      0},
77     { "March C- Checkerboard",  __bdk_dram_test_mem_checkerboard,   1,      0},
78     { "Walking Ones Left",      __bdk_dram_test_mem_leftwalk1,      1,      0},
79     { "Walking Ones Right",     __bdk_dram_test_mem_rightwalk1,     1,      0},
80     { "Walking Zeros Left",     __bdk_dram_test_mem_leftwalk0,      1,      0},
81     { "Walking Zeros Right",    __bdk_dram_test_mem_rightwalk0,     1,      0},
82     { "Random XOR (224 Burst)", __bdk_dram_test_mem_xor,            224,    0},
83     { "Fast Scan",              __bdk_dram_test_fast_scan,          0,      0},
84     { NULL,                     NULL,                               0,      0}
85 };
86 
87 /* These variables count the number of ECC errors. They should only be accessed atomically */
88 int64_t __bdk_dram_ecc_single_bit_errors[BDK_MAX_MEM_CHANS];
89 int64_t __bdk_dram_ecc_double_bit_errors[BDK_MAX_MEM_CHANS];
90 
91 static int64_t dram_test_thread_done;
92 static int64_t dram_test_thread_errors;
93 static uint64_t dram_test_thread_start;
94 static uint64_t dram_test_thread_end;
95 static uint64_t dram_test_thread_size;
96 
97 /**
98  * Force the memory at the pointer location to be written to memory and evicted
99  * from L2. L1 will be unaffected.
100  *
101  * @param address Physical memory location
102  */
__bdk_dram_flush_to_mem(uint64_t address)103 void __bdk_dram_flush_to_mem(uint64_t address)
104 {
105     BDK_MB;
106     char *ptr = bdk_phys_to_ptr(address);
107     BDK_CACHE_WBI_L2(ptr);
108 }
109 
110 /**
111  * Force a memory region to be written to DRAM and evicted from L2
112  *
113  * @param area   Start of the region
114  * @param max_address
115  *               End of the region (exclusive)
116  */
__bdk_dram_flush_to_mem_range(uint64_t area,uint64_t max_address)117 void __bdk_dram_flush_to_mem_range(uint64_t area, uint64_t max_address)
118 {
119     char *ptr = bdk_phys_to_ptr(area);
120     char *end = bdk_phys_to_ptr(max_address);
121     BDK_MB;
122     while (ptr < end)
123     {
124         BDK_CACHE_WBI_L2(ptr);
125         ptr += 128;
126     }
127 }
128 
129 /**
130  * Convert a test enumeration into a string
131  *
132  * @param test   Test to convert
133  *
134  * @return String for display
135  */
bdk_dram_get_test_name(int test)136 const char *bdk_dram_get_test_name(int test)
137 {
138     if (test < (int) ARRAY_SIZE(TEST_INFO))
139         return TEST_INFO[test].name;
140     else
141         return NULL;
142 }
143 
144 static bdk_dram_test_flags_t dram_test_flags; // FIXME: Don't use global
145 /**
146  * This function is run as a thread to perform memory tests over multiple cores.
147  * Each thread gets a section of memory to work on, which is controlled by global
148  * variables at the beginning of this file.
149  *
150  * @param arg    Number of the region we should check
151  * @param arg1   Pointer to the test_info structure
152  */
dram_test_thread(int arg,void * arg1)153 static void dram_test_thread(int arg, void *arg1)
154 {
155     const dram_test_info_t *test_info = arg1;
156     const int bursts = test_info->bursts;
157     const int range_number = arg;
158 
159     /* Figure out our work memory range.
160      *
161      * Note start_address and end_address just provide the physical offset
162      * portion of the address and do not have the node bits set. This is
163      * to simplify address checks and calculations. Later, when about to run
164      * the memory test, the routines adds in the node bits to form the final
165      * addresses.
166      */
167     uint64_t start_address = dram_test_thread_start + dram_test_thread_size * range_number;
168     uint64_t end_address = start_address + dram_test_thread_size;
169     if (end_address > dram_test_thread_end)
170         end_address = dram_test_thread_end;
171 
172     bdk_node_t test_node = bdk_numa_local();
173     if (dram_test_flags & BDK_DRAM_TEST_USE_CCPI)
174         test_node ^= 1;
175     /* Insert the node part of the address */
176     start_address = bdk_numa_get_address(test_node, start_address);
177     end_address = bdk_numa_get_address(test_node, end_address);
178     /* Test the region */
179     BDK_TRACE(DRAM_TEST, "  Node %d, core %d, Testing [0x%011llx:0x%011llx]\n",
180         bdk_numa_local(), bdk_get_core_num() & 127, start_address, end_address - 1);
181     test_info->test_func(start_address, end_address, bursts);
182 
183     /* Report that we're done */
184     BDK_TRACE(DRAM_TEST, "Thread %d on node %d done with memory test\n", range_number, bdk_numa_local());
185     bdk_atomic_add64_nosync(&dram_test_thread_done, 1);
186 }
187 
188 /**
189  * Run the memory test.
190  *
191  * @param test_info
192  * @param start_address
193  *                  Physical address to start at
194  * @param length    Length of memory block
195  * @param flags     Flags to control memory test options. Zero defaults to testing all
196  *                  node with statistics and progress output.
197  *
198  * @return Number of errors found. Zero is success. Negative means the test
199  *         did not run due to some other failure.
200  */
__bdk_dram_run_test(const dram_test_info_t * test_info,uint64_t start_address,uint64_t length,bdk_dram_test_flags_t flags)201 static int __bdk_dram_run_test(const dram_test_info_t *test_info, uint64_t start_address,
202                                uint64_t length, bdk_dram_test_flags_t flags)
203 {
204     /* Figure out the addess of the byte one off the top of memory */
205     uint64_t max_address = bdk_dram_get_size_mbytes(bdk_numa_local());
206     BDK_TRACE(DRAM_TEST, "DRAM available per node: %llu MB\n", max_address);
207     max_address <<= 20;
208 
209     /* Make sure we have enough */
210     if (max_address < (16<<20))
211     {
212         bdk_error("DRAM size is too small\n");
213         return -1;
214     }
215 
216     /* Make sure the amount is sane */
217     if (CAVIUM_IS_MODEL(CAVIUM_CN8XXX))
218     {
219         if (max_address > (1ull << 40)) /* 40 bits in CN8XXX */
220             max_address = 1ull << 40;
221     }
222     else
223     {
224         if (max_address > (1ull << 43)) /* 43 bits in CN9XXX */
225             max_address = 1ull << 43;
226     }
227     BDK_TRACE(DRAM_TEST, "DRAM max address: 0x%011llx\n", max_address-1);
228 
229     /* Make sure the start address is lower than the top of memory */
230     if (start_address >= max_address)
231     {
232         bdk_error("Start address is larger than the amount of memory: 0x%011llx versus 0x%011llx\n",
233                   start_address, max_address);
234         return -1;
235     }
236     if (length == (uint64_t)-1)
237         length = max_address - start_address;
238 
239     /* Final range checks */
240     uint64_t end_address = start_address + length;
241     if (end_address > max_address)
242     {
243         end_address = max_address;
244         length = end_address - start_address;
245     }
246     if (length == 0)
247         return 0;
248 
249     /* Ready to run the test. Figure out how many cores we need */
250     int max_cores = test_info->max_cores;
251     int total_cores_all_nodes = max_cores;
252 
253     /* Figure out the number of cores available in the system */
254     if (max_cores == 0)
255     {
256         max_cores += bdk_get_num_running_cores(bdk_numa_local());
257         /* Calculate the total number of cores being used. The per node number
258            is confusing to people */
259         for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
260             if (flags & (1 << node))
261             {
262                 if (flags & BDK_DRAM_TEST_USE_CCPI)
263                     total_cores_all_nodes += bdk_get_num_running_cores(node ^ 1);
264                 else
265                     total_cores_all_nodes += bdk_get_num_running_cores(node);
266             }
267     }
268     if (!(flags & BDK_DRAM_TEST_NO_BANNERS))
269         printf("Starting Test \"%s\" for [0x%011llx:0x%011llx] using %d core(s)\n",
270            test_info->name, start_address, end_address - 1, total_cores_all_nodes);
271 
272     /* Remember the LMC perf counters for stats after the test */
273     uint64_t start_dram_dclk[BDK_NUMA_MAX_NODES][4];
274     uint64_t start_dram_ops[BDK_NUMA_MAX_NODES][4];
275     uint64_t stop_dram_dclk[BDK_NUMA_MAX_NODES][4];
276     uint64_t stop_dram_ops[BDK_NUMA_MAX_NODES][4];
277     for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
278     {
279         if (flags & (1 << node))
280         {
281             const int num_dram_controllers = __bdk_dram_get_num_lmc(node);
282             for (int i = 0; i < num_dram_controllers; i++)
283             {
284                 start_dram_dclk[node][i] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(i));
285                 start_dram_ops[node][i] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(i));
286             }
287         }
288     }
289     /* Remember the CCPI link counters for stats after the test */
290     uint64_t start_ccpi_data[BDK_NUMA_MAX_NODES][3];
291     uint64_t start_ccpi_idle[BDK_NUMA_MAX_NODES][3];
292     uint64_t start_ccpi_err[BDK_NUMA_MAX_NODES][3];
293     uint64_t stop_ccpi_data[BDK_NUMA_MAX_NODES][3];
294     uint64_t stop_ccpi_idle[BDK_NUMA_MAX_NODES][3];
295     uint64_t stop_ccpi_err[BDK_NUMA_MAX_NODES][3];
296     if (!bdk_numa_is_only_one())
297     {
298         for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
299         {
300             if (flags & (1 << node))
301             {
302                 for (int link = 0; link < 3; link++)
303                 {
304                     start_ccpi_data[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_DATA_CNT(link));
305                     start_ccpi_idle[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_IDLE_CNT(link));
306                     start_ccpi_err[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_ERR_CNT(link));
307                 }
308             }
309         }
310     }
311 
312     /* WARNING: This code assumes the same memory range is being tested on
313        all nodes. The same number of cores are used on each node to test
314        its local memory */
315     uint64_t work_address = start_address;
316     dram_test_flags = flags;
317     bdk_atomic_set64(&dram_test_thread_errors, 0);
318     while ((work_address < end_address) && ((dram_test_thread_errors == 0) || (flags & BDK_DRAM_TEST_NO_STOP_ERROR)))
319     {
320         /* Check at most MAX_CHUNK_SIZE across each iteration. We only report
321            progress between chunks, so keep them reasonably small */
322         const uint64_t MAX_CHUNK_SIZE = 1ull << 28; /* 256MB */
323         uint64_t size = end_address - work_address;
324         if (size > MAX_CHUNK_SIZE)
325             size = MAX_CHUNK_SIZE;
326 
327         /* Divide memory evenly between the cores. Round the size up so that
328            all memory is covered. The last core may have slightly less memory to
329            test */
330         uint64_t thread_size = (size + (max_cores - 1)) / max_cores;
331         thread_size += 127;
332         thread_size &= -128;
333         dram_test_thread_start = work_address;
334         dram_test_thread_end = work_address + size;
335         dram_test_thread_size = thread_size;
336         BDK_WMB;
337 
338         /* Poke the watchdog */
339         BDK_CSR_WRITE(bdk_numa_local(), BDK_GTI_CWD_POKEX(0), 0);
340 
341         /* disable progress output when batch mode is ON  */
342         if (!(flags & BDK_DRAM_TEST_NO_PROGRESS)) {
343 
344             /* Report progress percentage */
345             int percent_x10 = (work_address - start_address) * 1000 / (end_address - start_address);
346             printf("  %3d.%d%% complete, testing [0x%011llx:0x%011llx]\r",
347                    percent_x10 / 10, percent_x10 % 10,  work_address, work_address + size - 1);
348             fflush(stdout);
349         }
350 
351         work_address += size;
352 
353         /* Start threads for all the cores */
354         int total_count = 0;
355         bdk_atomic_set64(&dram_test_thread_done, 0);
356         for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
357         {
358             if (flags & (1 << node))
359             {
360                 const int num_cores = bdk_get_num_cores(node);
361                 int per_node = 0;
362                 for (int core = 0; core < num_cores; core++)
363                 {
364                     if (per_node >= max_cores)
365                         break;
366                     BDK_TRACE(DRAM_TEST, "Starting thread %d on node %d for memory test\n", per_node, node);
367                     dram_test_thread(per_node, (void *)test_info);
368                 }
369             }
370         }
371 
372 #if 0
373         /* Wait for threads to finish */
374         while (bdk_atomic_get64(&dram_test_thread_done) < total_count)
375             bdk_thread_yield();
376 #else
377 #define TIMEOUT_SECS 30  // FIXME: long enough so multicore RXOR 224 should not print out
378         /* Wait for threads to finish, with progress */
379         int cur_count;
380         uint64_t cur_time;
381         uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME?
382         uint64_t timeout = bdk_clock_get_count(BDK_CLOCK_TIME) + period;
383         do {
384             cur_count = bdk_atomic_get64(&dram_test_thread_done);
385             cur_time = bdk_clock_get_count(BDK_CLOCK_TIME);
386             if (cur_time >= timeout) {
387                 BDK_TRACE(DRAM_TEST, "N%d: Waiting for %d cores\n",
388                           bdk_numa_local(), total_count - cur_count);
389                 timeout = cur_time + period;
390             }
391         } while (cur_count < total_count);
392 #endif
393     }
394 
395     /* Get the DRAM perf counters */
396     for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
397     {
398         if (flags & (1 << node))
399         {
400             const int num_dram_controllers = __bdk_dram_get_num_lmc(node);
401             for (int i = 0; i < num_dram_controllers; i++)
402             {
403                 stop_dram_dclk[node][i] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(i));
404                 stop_dram_ops[node][i] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(i));
405             }
406         }
407     }
408     /* Get the CCPI link counters */
409     if (!bdk_numa_is_only_one())
410     {
411         for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
412         {
413             if (flags & (1 << node))
414             {
415                 for (int link = 0; link < 3; link++)
416                 {
417                     stop_ccpi_data[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_DATA_CNT(link));
418                     stop_ccpi_idle[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_IDLE_CNT(link));
419                     stop_ccpi_err[node][link] = BDK_CSR_READ(node, BDK_OCX_TLKX_STAT_ERR_CNT(link));
420                 }
421             }
422         }
423     }
424 
425     /* disable progress output when batch mode is ON  */
426     if (!(flags & BDK_DRAM_TEST_NO_PROGRESS)) {
427 
428         /* Report progress percentage as complete */
429         printf("  %3d.%d%% complete, testing [0x%011llx:0x%011llx]\n",
430                100, 0,  start_address, end_address - 1);
431         fflush(stdout);
432     }
433 
434     if (!(flags & BDK_DRAM_TEST_NO_STATS))
435     {
436         /* Display LMC load */
437         for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
438         {
439             if (flags & (1 << node))
440             {
441                 const int num_dram_controllers = __bdk_dram_get_num_lmc(node);
442                 for (int i = 0; i < num_dram_controllers; i++)
443                 {
444                     uint64_t ops = stop_dram_ops[node][i] - start_dram_ops[node][i];
445                     uint64_t dclk = stop_dram_dclk[node][i] - start_dram_dclk[node][i];
446                     if (dclk == 0)
447                         dclk = 1;
448                     uint64_t percent_x10 = ops * 1000 / dclk;
449                     printf("  Node %d, LMC%d: ops %llu, cycles %llu, used %llu.%llu%%\n",
450                         node, i, ops, dclk, percent_x10 / 10, percent_x10 % 10);
451                 }
452             }
453         }
454         if (flags & BDK_DRAM_TEST_USE_CCPI)
455         {
456             /* Display CCPI load */
457             for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
458             {
459                 if (flags & (1 << node))
460                 {
461                     for (int link = 0; link < 3; link++)
462                     {
463                         uint64_t busy = stop_ccpi_data[node][link] - start_ccpi_data[node][link];
464                         busy += stop_ccpi_err[node][link] - start_ccpi_err[node][link];
465                         uint64_t total = stop_ccpi_idle[node][link] - start_ccpi_idle[node][link];
466                         total += busy;
467                         if (total == 0)
468                             continue;
469                         uint64_t percent_x10 = busy * 1000 / total;
470                         printf("  Node %d, CCPI%d: busy %llu, total %llu, used %llu.%llu%%\n",
471                             node, link, busy, total, percent_x10 / 10, percent_x10 % 10);
472                     }
473                 }
474             }
475         }
476     }
477     return dram_test_thread_errors;
478 }
479 
480 /**
481  * Perform a memory test.
482  *
483  * @param test   Test type to run
484  * @param start_address
485  *               Physical address to start at
486  * @param length Length of memory block
487  * @param flags  Flags to control memory test options. Zero defaults to testing all
488  *               node with statistics and progress output.
489  *
490  * @return Number of errors found. Zero is success. Negative means the test
491  *         did not run due to some other failure.
492  */
bdk_dram_test(int test,uint64_t start_address,uint64_t length,bdk_dram_test_flags_t flags)493 int bdk_dram_test(int test, uint64_t start_address, uint64_t length, bdk_dram_test_flags_t flags)
494 {
495     /* These limits are arbitrary. They just make sure we aren't doing something
496        silly, like test a non cache line aligned memory region */
497     if (start_address & 0xffff)
498     {
499         bdk_error("DRAM test start address must be aligned on a 64KB boundary\n");
500         return -1;
501     }
502     if (length & 0xffff)
503     {
504         bdk_error("DRAM test length must be a multiple of 64KB\n");
505         return -1;
506     }
507 
508     const char *name = bdk_dram_get_test_name(test);
509     if (name == NULL)
510     {
511         bdk_error("Invalid DRAM test number %d\n", test);
512         return -1;
513     }
514 
515     /* If no nodes are selected assume the user meant all nodes */
516     if ((flags & (BDK_DRAM_TEST_NODE0 | BDK_DRAM_TEST_NODE1 | BDK_DRAM_TEST_NODE2 | BDK_DRAM_TEST_NODE3)) == 0)
517         flags |= BDK_DRAM_TEST_NODE0 | BDK_DRAM_TEST_NODE1 | BDK_DRAM_TEST_NODE2 | BDK_DRAM_TEST_NODE3;
518 
519     /* Remove nodes from the flags that don't exist */
520     for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
521     {
522         if (flags & BDK_DRAM_TEST_USE_CCPI)
523         {
524             if (!bdk_numa_exists(node ^ 1))
525                 flags &= ~(1 << node);
526         }
527         else
528         {
529             if (!bdk_numa_exists(node))
530                 flags &= ~(1 << node);
531         }
532     }
533 
534 
535     /* Make sure the start address is higher that the BDK's active range */
536     uint64_t top_of_bdk = bdk_dram_get_top_of_bdk();
537     if (start_address < top_of_bdk)
538         start_address = top_of_bdk;
539 
540     /* Clear ECC error counters before starting the test */
541     for (int chan = 0; chan < BDK_MAX_MEM_CHANS; chan++) {
542         bdk_atomic_set64(&__bdk_dram_ecc_single_bit_errors[chan], 0);
543         bdk_atomic_set64(&__bdk_dram_ecc_double_bit_errors[chan], 0);
544     }
545 
546     /* Make sure at least one core from each node is running */
547     /* FIXME(dhendrix): we only care about core0 on node0 for now */
548 #if 0
549     for (bdk_node_t node = BDK_NODE_0; node < BDK_NUMA_MAX_NODES; node++)
550     {
551         if (flags & (1<<node))
552         {
553             int use_node = (flags & BDK_DRAM_TEST_USE_CCPI) ? node ^ 1 : node;
554             if (bdk_get_running_coremask(use_node) == 0)
555                 bdk_init_cores(use_node, 1);
556         }
557     }
558 #endif
559 
560     /* This returns any data compare errors found */
561     int errors = __bdk_dram_run_test(&TEST_INFO[test], start_address, length, flags);
562 
563     /* Check ECC error counters after the test */
564     int64_t ecc_single = 0;
565     int64_t ecc_double = 0;
566     int64_t ecc_single_errs[BDK_MAX_MEM_CHANS];
567     int64_t ecc_double_errs[BDK_MAX_MEM_CHANS];
568 
569     for (int chan = 0; chan < BDK_MAX_MEM_CHANS; chan++) {
570         ecc_single += (ecc_single_errs[chan] = bdk_atomic_get64(&__bdk_dram_ecc_single_bit_errors[chan]));
571         ecc_double += (ecc_double_errs[chan] = bdk_atomic_get64(&__bdk_dram_ecc_double_bit_errors[chan]));
572     }
573 
574     /* Always print any ECC errors */
575     if (ecc_single || ecc_double)
576     {
577         printf("Test \"%s\": ECC errors, %lld/%lld/%lld/%lld corrected, %lld/%lld/%lld/%lld uncorrected\n",
578                name,
579                ecc_single_errs[0], ecc_single_errs[1], ecc_single_errs[2], ecc_single_errs[3],
580                ecc_double_errs[0], ecc_double_errs[1], ecc_double_errs[2], ecc_double_errs[3]);
581     }
582     if (errors || ecc_double || ecc_single) {
583         printf("Test \"%s\": FAIL: %lld single, %lld double, %d compare errors\n",
584                name, ecc_single, ecc_double, errors);
585     }
586     else
587         BDK_TRACE(DRAM_TEST, "Test \"%s\": PASS\n", name);
588 
589     return (errors + ecc_double + ecc_single);
590 }
591 
/**
 * Report a DRAM address in decoded format.
 *
 * @param address Physical address the error occurred at
 * @param buffer  Buffer receiving the formatted, NUL terminated text
 * @param len     Size of buffer in bytes; longer text is truncated
 */
static void __bdk_dram_report_address_decode(uint64_t address, char *buffer, int len)
{
    int node, lmc, dimm, prank, lrank, bank, row, col;

    bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);

    /* Print the 64 bit address with an explicit cast and %llx, so the format
       is correct whichever integer type uint64_t maps to, and matches the
       %llx used by the other messages in this file */
    snprintf(buffer, len, "[0x%011llx] (N%d,LMC%d,DIMM%d,Rank%d/%d,Bank%02d,Row 0x%05x,Col 0x%04x)",
             (unsigned long long)address, node, lmc, dimm, prank, lrank, bank, row, col);
}
607 
/**
 * Report a DRAM address in a new decoded format.
 *
 * The reported byte is the index (0-7) of the single byte lane in error,
 * 8 if no lane is in error, or 9 if more than one lane is in error. With a
 * single lane only that lane's 8 bits are printed; otherwise the whole XOR
 * value is printed.
 *
 * @param address  Physical address the error occurred at
 * @param orig_xor XOR of data read vs expected data
 * @param buffer   Buffer receiving the formatted, NUL terminated text
 * @param len      Size of buffer in bytes; longer text is truncated
 */
static void __bdk_dram_report_address_decode_new(uint64_t address, uint64_t orig_xor, char *buffer, int len)
{
    int node, lmc, dimm, prank, lrank, bank, row, col;

    int byte = 8; // means no byte-lanes in error, should not happen
    uint64_t bits, print_bits = 0;
    uint64_t xor = orig_xor;

    // find the byte-lane(s) with errors
    for (int i = 0; i < 8; i++) {
        bits = xor & 0xffULL;
        xor >>= 8;
        if (bits) {
            if (byte != 8) {
                byte = 9; // means more than 1 byte-lane was present
                print_bits = orig_xor; // print the full original
                break; // quit now
            } else {
                byte = i; // keep checking
                print_bits = bits;
            }
        }
    }

    bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);

    /* Print the 64 bit values with explicit casts and %llx, so the format is
       correct whichever integer type uint64_t maps to */
    snprintf(buffer, len, "N%d.LMC%d: CMP byte %d xor 0x%02llx (DIMM%d,Rank%d/%d,Bank%02d,Row 0x%05x,Col 0x%04x)[0x%011llx]",
             node, lmc, byte, (unsigned long long)print_bits, dimm, prank, lrank, bank, row, col,
             (unsigned long long)address);
}
644 
645 /**
646  * Report a DRAM error. Errors are not shown after MAX_ERRORS_TO_REPORT is
647  * exceeded. Used when a single address is involved in the failure.
648  *
649  * @param address Physical address the error occurred at
650  * @param data    Data read from memory
651  * @param correct Correct data
652  * @param burst   Which burst this is from, informational only
653  * @param fails   -1 for no retries done, >= 0 number of failures during retries
654  *
655  * @return Zero if a message was logged, non-zero if the error limit has been reached
656  */
__bdk_dram_report_error(uint64_t address,uint64_t data,uint64_t correct,int burst,int fails)657 void __bdk_dram_report_error(uint64_t address, uint64_t data, uint64_t correct, int burst, int fails)
658 {
659     char buffer[128];
660     char failbuf[32];
661     int64_t errors = bdk_atomic_fetch_and_add64(&dram_test_thread_errors, 1);
662     uint64_t xor = data ^ correct;
663 
664     if (errors < MAX_ERRORS_TO_REPORT)
665     {
666         if (fails < 0) {
667             snprintf(failbuf, sizeof(failbuf), " ");
668         } else {
669             int percent_x10 = fails * 1000 / RETRY_LIMIT;
670             snprintf(failbuf, sizeof(failbuf), ", retries failed %3d.%d%%",
671                      percent_x10 / 10, percent_x10 % 10);
672         }
673 
674         __bdk_dram_report_address_decode_new(address, xor, buffer, sizeof(buffer));
675         bdk_error("%s%s\n", buffer, failbuf);
676 
677         if (errors == MAX_ERRORS_TO_REPORT-1)
678             bdk_error("No further DRAM errors will be reported\n");
679     }
680     return;
681 }
682 
683 /**
684  * Report a DRAM error. Errors are not shown after MAX_ERRORS_TO_REPORT is
685  * exceeded. Used when two addresses might be involved in the failure.
686  *
687  * @param address1 First address involved in the failure
688  * @param data1    Data from the first address
689  * @param address2 Second address involved in the failure
690  * @param data2    Data from second address
691  * @param burst    Which burst this is from, informational only
692  * @param fails    -1 for no retries done, >= 0 number of failures during retries
693  *
694  * @return Zero if a message was logged, non-zero if the error limit has been reached
695  */
__bdk_dram_report_error2(uint64_t address1,uint64_t data1,uint64_t address2,uint64_t data2,int burst,int fails)696 void __bdk_dram_report_error2(uint64_t address1, uint64_t data1, uint64_t address2, uint64_t data2,
697                               int burst, int fails)
698 {
699     int64_t errors = bdk_atomic_fetch_and_add64(&dram_test_thread_errors, 1);
700     if (errors < MAX_ERRORS_TO_REPORT)
701     {
702         char buffer1[80], buffer2[80];
703         char failbuf[32];
704 
705         if (fails < 0) {
706             snprintf(failbuf, sizeof(failbuf), " ");
707         } else {
708             snprintf(failbuf, sizeof(failbuf), ", retried %d failed %d", RETRY_LIMIT, fails);
709         }
710         __bdk_dram_report_address_decode(address1, buffer1, sizeof(buffer1));
711         __bdk_dram_report_address_decode(address2, buffer2, sizeof(buffer2));
712 
713         bdk_error("compare: data1: 0x%016llx, xor: 0x%016llx%s\n"
714                   "       %s\n       %s\n",
715                   data1, data1 ^ data2, failbuf,
716                   buffer1, buffer2);
717 
718         if (errors == MAX_ERRORS_TO_REPORT-1)
719             bdk_error("No further DRAM errors will be reported\n");
720     }
721     return;
722 }
723 
724 /* Report the circumstances of a failure and try re-reading the memory
725  * location to see if the error is transient or permanent.
726  *
727  * Note: re-reading requires using evicting addresses
728  */
__bdk_dram_retry_failure(int burst,uint64_t address,uint64_t data,uint64_t expected)729 int __bdk_dram_retry_failure(int burst, uint64_t address, uint64_t data, uint64_t expected)
730 {
731     int refail = 0;
732 
733     // bypass the retries if we are already over the limit...
734     if (bdk_atomic_get64(&dram_test_thread_errors) < MAX_ERRORS_TO_REPORT) {
735 
736         /* Try re-reading the memory location. A transient error may fail
737          * on one read and work on another. Keep on retrying even when a
738          * read succeeds.
739          */
740         for (int i = 0; i < RETRY_LIMIT; i++) {
741 
742             __bdk_dram_flush_to_mem(address);
743             BDK_DCACHE_INVALIDATE;
744 
745             uint64_t new = __bdk_dram_read64(address);
746 
747             if (new != expected) {
748                 refail++;
749             }
750         }
751     } else
752         refail = -1;
753 
754     // this will increment the errors always, but maybe not print...
755     __bdk_dram_report_error(address, data, expected, burst, refail);
756 
757     return 1;
758 }
759 
760 /**
761  * retry_failure2
762  *
763  * @param burst
764  * @param address1
765  * @param address2
766  */
__bdk_dram_retry_failure2(int burst,uint64_t address1,uint64_t data1,uint64_t address2,uint64_t data2)767 int __bdk_dram_retry_failure2(int burst, uint64_t address1, uint64_t data1, uint64_t address2, uint64_t data2)
768 {
769     int refail = 0;
770 
771     // bypass the retries if we are already over the limit...
772     if (bdk_atomic_get64(&dram_test_thread_errors) < MAX_ERRORS_TO_REPORT) {
773 
774         for (int i = 0; i < RETRY_LIMIT; i++) {
775             __bdk_dram_flush_to_mem(address1);
776             __bdk_dram_flush_to_mem(address2);
777             BDK_DCACHE_INVALIDATE;
778 
779             uint64_t d1 = __bdk_dram_read64(address1);
780             uint64_t d2 = __bdk_dram_read64(address2);
781 
782             if (d1 != d2) {
783                 refail++;
784             }
785         }
786     } else
787         refail = -1;
788 
789     // this will increment the errors always, but maybe not print...
790     __bdk_dram_report_error2(address1, data1, address2, data2, burst, refail);
791 
792     return 1;
793 }
794 
795 /**
796  * Inject a DRAM error at a specific address in memory. The injection can either
797  * be a single bit inside the byte, or a double bit error in the ECC byte. Double
798  * bit errors may corrupt memory, causing software to crash. The corruption is
799  * written to memory and will continue to exist until the cache line is written
800  * again. After a call to this function, the BDK should report a ECC error. Double
801  * bit errors corrupt bits 0-1.
802  *
803  * @param address Physical address to corrupt. Any byte alignment is supported
804  * @param bit     Bit to corrupt in the byte (0-7), or -1 to create a double bit fault in the ECC
805  *                byte.
806  */
bdk_dram_test_inject_error(uint64_t address,int bit)807 void bdk_dram_test_inject_error(uint64_t address, int bit)
808 {
809     uint64_t aligned_address = address & -16;
810     int corrupt_bit = -1;
811     if (bit >= 0)
812         corrupt_bit = (address & 0xf) * 8 + bit;
813 
814     /* Extract the DRAM controller information */
815     int node, lmc, dimm, prank, lrank, bank, row, col;
816     bdk_dram_address_extract_info(address, &node, &lmc, &dimm, &prank, &lrank, &bank, &row, &col);
817 
818     /* Read the current data */
819     uint64_t data = __bdk_dram_read64(aligned_address);
820 
821     /* Program LMC to inject the error */
822     if ((corrupt_bit >= 0) && (corrupt_bit < 64))
823         BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 1ull << corrupt_bit);
824     else if (bit == -1)
825         BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 3);
826     else
827         BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 0);
828     if (corrupt_bit >= 64)
829         BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 1ull << (corrupt_bit - 64));
830     else
831         BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 0);
832     BDK_CSR_MODIFY(c, node, BDK_LMCX_ECC_PARITY_TEST(lmc),
833         c.s.ecc_corrupt_idx = (address & 0x7f) >> 4;
834         c.s.ecc_corrupt_ena = 1);
835     BDK_CSR_READ(node, BDK_LMCX_ECC_PARITY_TEST(lmc));
836 
837     /* Perform a write and push it to DRAM. This creates the error */
838     __bdk_dram_write64(aligned_address, data);
839     __bdk_dram_flush_to_mem(aligned_address);
840 
841     /* Disable error injection */
842     BDK_CSR_MODIFY(c, node, BDK_LMCX_ECC_PARITY_TEST(lmc),
843         c.s.ecc_corrupt_ena = 0);
844     BDK_CSR_READ(node, BDK_LMCX_ECC_PARITY_TEST(lmc));
845     BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK0(lmc), 0);
846     BDK_CSR_WRITE(node, BDK_LMCX_CHAR_MASK2(lmc), 0);
847 
848     /* Read back the data, which should now cause an error */
849     printf("Loading the injected error address 0x%llx, node=%d, lmc=%d, dimm=%d, rank=%d/%d, bank=%d, row=%d, col=%d\n",
850            address, node, lmc, dimm, prank, lrank, bank, row, col);
851     __bdk_dram_read64(aligned_address);
852 }
853