/*
 * Copyright © 2022 Google, Inc.
 * Copyright © 2022 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "util/macros.h"
#include "crashdec.h"
#include "cffdec.h"

#define MAX_PREFETCH_IBS 4

/* CP_INDIRECT_BUFFER contains an optimization to read ahead and start
 * fetching up to 3 subsequent CP_INDIRECT_BUFFER contents into the ROQ before
 * starting to execute the current IB. This effectively combines them into one
 * CP_INDIRECT_BUFFER. The result is that if the ROQ is fast enough and
 * prefetches some of the extra IBs before the first IB finishes, the ROQ may
 * be in a different IB than the CP is processing. That is, normally we'd have
 * a situation like this:
 *
 *    CP_INDIRECT_BUFFER
 *       ...
 *       CP_FOO   <- PFP/SQE is reading from here
 *       ...
 *       CP_BAR   <- ROQ has prefetched up to here
 *
 * where CP_IB*_BASE and CP_IB*_REM_SIZE point to CP_BAR and the difference
 * between CP_FOO and CP_BAR is given by CP_ROQ_AVAIL_IBn::REM, but instead we
 * may get a situation like this:
 *
 *    CP_INDIRECT_BUFFER
 *       ...
 *       CP_FOO   <- PFP/SQE is reading here
 *       ...
 *    CP_INDIRECT_BUFFER
 *       ...
 *       CP_BAR   <- ROQ has prefetched up to here
 *
 * in this case, the "rem" we get with CP_ROQ_AVAIL_IBn::REM added will be
 * larger than the size of the second IB, indicating that we need to back up
 * to the IB before it. This can theoretically even happen recursively with
 * IB2:
 *
 *    CP_INDIRECT_BUFFER:
 *       ...
 *       CP_INDIRECT_BUFFER:
 *          ...
 *          CP_FOO   <- PFP/SQE IB2 is reading here
 *          ...
 *    CP_INDIRECT_BUFFER:
 *       CP_INDIRECT_BUFFER:
 *          ...
 *          CP_BAR   <- ROQ IB2 has prefetched up to here
 *       ...
 *       CP_BAZ   <- ROQ IB1 has prefetched up to here
 *
 * Here the ROQ has prefetched the second IB1, then when processing the IB2 at
 * the end of the first IB1 it peeks ahead in ROQ and sees another IB2 right
 * afterward in the second IB1 and starts prefetching that too, so that the
 * ROQ is in a different IB1 *and* IB2 from the CP.
 *
 * To account for this when locating the position that the SQE was at in the
 * cmdstream at the time of the crash, we do a pre-pass scanning the
 * CP_INDIRECT_BUFFER packets, keeping a history of previous IB's so that we
 * can backtrack (because CP_IBn_BASE can be several IB's ahead of SQE). Once
 * we find the IB1 position that is being read into ROQ, we backtrack until
 * we find the IB1 position that SQE is at, and (roughly) repeat the process
 * in IB2. This has one complication, in that we need to start scanning for
 * the CP_INDIRECT_BUFFER to IB2 from before the detected IB1 position.
 */
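
/* For example (made-up sizes): if the ROQ has prefetched three consecutive
 * IBs of 100, 60 and 40 dwords, and the combined remaining size
 * (CP_IB*_REM_SIZE plus CP_ROQ_AVAIL_IBn::REM) is 130 dwords, then the
 * newest two IBs (40 + 60 = 100 dwords) are entirely unexecuted, so the SQE
 * must still be in the oldest of the three, with the leftover 30 dwords
 * still to execute.
 */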

struct ib {
   uint64_t ibaddr;
   uint32_t ibsize;
};

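/* Ring buffer of the most recently seen CP_INDIRECT_BUFFER targets, ie. the
 * candidates for where the SQE could actually be relative to the prefetch
 * position indicated by CP_IBn_BASE:
 */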
struct prefetch_state {
   struct ib history[MAX_PREFETCH_IBS];
   unsigned num, next;
};

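/* Append an IB to the history, overwriting the oldest entry once the ring
 * is full (only MAX_PREFETCH_IBS IBs can be prefetched ahead):
 */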
static void
push_ib(struct prefetch_state *s, struct ib *ib)
{
   s->history[s->next++ % ARRAY_SIZE(s->history)] = *ib;
   s->num = MIN2(s->num + 1, ARRAY_SIZE(s->history));
}

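/* Return the n'th remembered IB, where n==0 is the oldest entry and
 * n==s->num-1 the most recent, or NULL if n is out of range:
 */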
static struct ib *
get_ib(struct prefetch_state *s, int n)
{
   if ((n >= s->num) || (n < 0))
      return NULL;
   int idx = s->next - (s->num - n);
   return &s->history[idx % ARRAY_SIZE(s->history)];
}

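/* Forget the IB history.  Called whenever a packet other than
 * CP_INDIRECT_BUFFER is encountered, since prefetch only spans consecutive
 * CP_INDIRECT_BUFFER packets:
 */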
static void
reset_state(struct prefetch_state *s)
{
   s->num = s->next = 0;
}

/**
 * Once we find the ROQ prefetch position, work backwards to find the SQE
 * position.
 */
static struct ib *
reverse_prefetch(struct prefetch_state *s, int lvl)
{
   unsigned rem = options.ibs[lvl].rem;

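   /* Walk backwards from the most recently prefetched IB, subtracting each
    * IB's size from the remaining dword count, until the remainder fits
    * within a single IB.. that is the IB the SQE was actually executing:
    */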
   for (int n = s->num - 1; n >= 0; n--) {
      struct ib *ib = get_ib(s, n);
      if (ib->ibsize > rem) {
         options.ibs[lvl].crash_found = 1;
         options.ibs[lvl].base = ib->ibaddr;
         options.ibs[lvl].rem = rem;

         return ib;
      }
      rem -= ib->ibsize;
   }

   return NULL;
}

/**
 * Scan cmdstream looking for CP_INDIRECT_BUFFER packets, tracking history
 * of consecutive CP_INDIRECT_BUFFER packets, until we find the one that
 * matches CP_IBn_BASE.
 */
static struct ib *
scan_cmdstream(struct prefetch_state *s, int lvl, uint32_t *dwords,
               uint32_t sizedwords)
{
   int dwords_left = sizedwords;
   uint32_t count = 0; /* dword count including packet header */
   uint32_t val;

   while (dwords_left > 0) {
      if (pkt_is_opcode(dwords[0], &val, &count)) {
         if (!strcmp(pktname(val), "CP_INDIRECT_BUFFER")) {
            uint64_t ibaddr;
            uint32_t ibsize;

            parse_cp_indirect(&dwords[1], count - 1, &ibaddr, &ibsize);
            push_ib(s, &(struct ib){ ibaddr, ibsize });

            /* If we've found the IB indicated by CP_IBn_BASE, then we can
             * search backwards from here to find the SQE position:
             */
            if (ibaddr == options.ibs[lvl].base)
               return reverse_prefetch(s, lvl);

            goto next_pkt;
         }
      } else if (pkt_is_regwrite(dwords[0], &val, &count)) {
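         /* pkt_is_regwrite() just fills in the packet size for us; like any
          * other non-IB packet, fall through and reset the history below:
          */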
      } else {
         count = find_next_packet(dwords, dwords_left);
      }

      /* prefetch only happens across consecutive CP_INDIRECT_BUFFER, so
       * any other packet resets the state:
       */
      reset_state(s);

   next_pkt:
      dwords += count;
      dwords_left -= count;
   }

   return NULL;
}


void
handle_prefetch(uint32_t *dwords, uint32_t sizedwords)
{
   struct prefetch_state rb_state = {};
   struct ib *ib1 = scan_cmdstream(&rb_state, 1, dwords, sizedwords);

   if (!ib1)
      return;

   /* If the gpu crashed in IB1, we can skip the rest: */
   if (!options.ibs[2].rem)
      return;

   struct prefetch_state ib1_state = {};

   /* Once we find the actual IB1 position, we need to find the IB2 position.
    * This is complicated by the fact that IB2 prefetch can span IB1
    * CP_INDIRECT_BUFFER targets.  But there are a limited # of buffers that
    * can be prefetched, and we already have a history of enough RB->IB1 IB's,
    * so we can simply scan forward from our oldest history entry until we
    * find the IB2 match.
    */
   for (int n = 0; n < rb_state.num; n++) {
      struct ib *ib = get_ib(&rb_state, n);
      uint32_t *ibaddr = hostptr(ib->ibaddr);
      if (!ibaddr)
         break;
      struct ib *ib2 = scan_cmdstream(&ib1_state, 2, ibaddr, ib->ibsize);

      /* If the crash happens in IB2, but IB1 has a sequence of
       * CP_INDIRECT_BUFFER's, then IB1 could actually be further ahead than
       * IB2, ie:
       *
       *    IB1:CP_INDIRECT_BUFFER
       *       IB2: .. crash somewhere in here ..
       *    IB1:CP_INDIRECT_BUFFER
       *    IB1:CP_INDIRECT_BUFFER   <-- detected IB1 position
       *
       * Our logic for detecting the IB1 position is not wrong, it is just
       * that the SQE has already consumed some additional IB's.  So reset
       * the IB1 crash position back to the oldest RB->IB1 IB that we
       * remember.
       *
       * This isn't *quite* correct, but cffdec will only mark the crash at
       * the location it finds in IB2 (if we've determined that the crash is
       * in IB2), and it will only consider the address in IB2 once it has
       * seen the IB1 base.
       *
       * The main case we are trying to account for here is a GMEM mode crash
       * in IB2 which *isn't* in the first bin/tile, ie. the crash happens
       * later than the first time we encounter the IB2 crash address.
       *
       * This approach works in practice because there will be some other
       * pkts in IB1 to set up for the next tile, breaking up prefetch.
       */
      if (ib2) {
         assert(options.ibs[2].crash_found);
         struct ib *first_rb_ib = get_ib(&rb_state, 0);

         options.ibs[1].base = first_rb_ib->ibaddr;
         options.ibs[1].rem = first_rb_ib->ibsize;

         break;
      }

      if (ib == ib1)
         break;
   }
}