1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * IOMMU API for RISC-V IOMMU implementations.
4 *
5 * Copyright © 2022-2024 Rivos Inc.
6 * Copyright © 2023 FORTH-ICS/CARV
7 *
8 * Authors
9 * Tomasz Jeznach <[email protected]>
10 * Nick Kossifidis <[email protected]>
11 */
12
13 #define pr_fmt(fmt) "riscv-iommu: " fmt
14
15 #include <linux/compiler.h>
16 #include <linux/crash_dump.h>
17 #include <linux/init.h>
18 #include <linux/iommu.h>
19 #include <linux/iopoll.h>
20 #include <linux/kernel.h>
21 #include <linux/pci.h>
22
23 #include "../iommu-pages.h"
24 #include "iommu-bits.h"
25 #include "iommu.h"
26
27 /* Timeouts in [us] */
28 #define RISCV_IOMMU_QCSR_TIMEOUT 150000
29 #define RISCV_IOMMU_QUEUE_TIMEOUT 150000
30 #define RISCV_IOMMU_DDTP_TIMEOUT 10000000
31 #define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
32
33 /* Number of entries per CMD/FLT queue, should be <= INT_MAX */
34 #define RISCV_IOMMU_DEF_CQ_COUNT 8192
35 #define RISCV_IOMMU_DEF_FQ_COUNT 4096
36
37 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
38 #define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
39 #define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12))
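/*
 * Worked example: for physical address 0x80200000 (4K PFN 0x80200) the
 * register encoding is PFN << 10, i.e. 0x80200 << 10 == 0x80200000 >> 2,
 * which is what phys_to_ppn() computes; ppn_to_phys() reverses the shift
 * and masks the result to the 44-bit PPN field (address bits 55:12).
 */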
40
41 #define dev_to_iommu(dev) \
42 iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
43
44 /* IOMMU PSCID allocation namespace. */
45 static DEFINE_IDA(riscv_iommu_pscids);
46 #define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1)
47
48 /* Device resource-managed allocations */
49 struct riscv_iommu_devres {
50 void *addr;
51 int order;
52 };
53
54 static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
55 {
56 struct riscv_iommu_devres *devres = res;
57
58 iommu_free_pages(devres->addr, devres->order);
59 }
60
61 static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
62 {
63 struct riscv_iommu_devres *devres = res;
64 struct riscv_iommu_devres *target = p;
65
66 return devres->addr == target->addr;
67 }
68
69 static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
70 {
71 struct riscv_iommu_devres *devres;
72 void *addr;
73
74 addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
75 GFP_KERNEL_ACCOUNT, order);
76 if (unlikely(!addr))
77 return NULL;
78
79 devres = devres_alloc(riscv_iommu_devres_pages_release,
80 sizeof(struct riscv_iommu_devres), GFP_KERNEL);
81
82 if (unlikely(!devres)) {
83 iommu_free_pages(addr, order);
84 return NULL;
85 }
86
87 devres->addr = addr;
88 devres->order = order;
89
90 devres_add(iommu->dev, devres);
91
92 return addr;
93 }
94
95 static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
96 {
97 struct riscv_iommu_devres devres = { .addr = addr };
98
99 devres_release(iommu->dev, riscv_iommu_devres_pages_release,
100 riscv_iommu_devres_pages_match, &devres);
101 }
102
103 /*
104 * Hardware queue allocation and management.
105 */
106
107 /* Setup queue base, control registers and default queue length */
108 #define RISCV_IOMMU_QUEUE_INIT(q, name) do { \
109 struct riscv_iommu_queue *_q = q; \
110 _q->qid = RISCV_IOMMU_INTR_ ## name; \
111 _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
112 _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
113 _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
114 } while (0)
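/*
 * For example, RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ) expands to
 * qid = RISCV_IOMMU_INTR_CQ, qbr = RISCV_IOMMU_REG_CQB and
 * qcr = RISCV_IOMMU_REG_CQCSR, with the default 8192-entry mask unless
 * a mask was already set.
 */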
115
116 /* Note: offsets are the same for all queues */
117 #define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
118 #define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
119 #define Q_ITEM(q, index) ((q)->mask & (index))
120 #define Q_IPSR(q) BIT((q)->qid)
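/*
 * queue->mask is always 2^N - 1, so Q_ITEM() is a cheap modulo turning the
 * free-running index into a ring-buffer slot, and Q_IPSR() maps the queue's
 * interrupt source number to its pending bit in the IPSR register.
 */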
121
122 /*
123 * Discover queue ring buffer hardware configuration, allocate in-memory
124 * ring buffer or use fixed I/O memory location, configure queue base register.
125 * Must be called before hardware queue is enabled.
126 *
127 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
128 * @entry_size - queue single element size in bytes.
129 */
130 static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
131 struct riscv_iommu_queue *queue,
132 size_t entry_size)
133 {
134 unsigned int logsz;
135 u64 qb, rb;
136
137 /*
138 * Use WARL base register property to discover maximum allowed
139 * number of entries and optional fixed IO address for queue location.
140 */
141 riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
142 qb = riscv_iommu_readq(iommu, queue->qbr);
143
144 /*
145 * Calculate and verify hardware supported queue length, as reported
146 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
147 * Update queue size based on hardware supported value.
148 */
149 logsz = ilog2(queue->mask);
150 if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
151 logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
152
153 /*
154 * Use WARL base register property to discover an optional fixed IO
155 * address for queue ring buffer location. Otherwise allocate contiguous
156 * system memory.
157 */
158 if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
159 const size_t queue_size = entry_size << (logsz + 1);
160
161 queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
162 queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
163 } else {
164 do {
165 const size_t queue_size = entry_size << (logsz + 1);
166 const int order = get_order(queue_size);
167
168 queue->base = riscv_iommu_get_pages(iommu, order);
169 queue->phys = __pa(queue->base);
170 } while (!queue->base && logsz-- > 0);
171 }
172
173 if (!queue->base)
174 return -ENOMEM;
175
176 qb = phys_to_ppn(queue->phys) |
177 FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
178
179 /* Update base register and read back to verify hw accepted our write */
180 riscv_iommu_writeq(iommu, queue->qbr, qb);
181 rb = riscv_iommu_readq(iommu, queue->qbr);
182 if (rb != qb) {
183 dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
184 return -ENODEV;
185 }
186
187 /* Update actual queue mask */
188 queue->mask = (2U << logsz) - 1;
189
190 dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
191 queue->qid, logsz + 1);
192
193 return 0;
194 }
195
196 /* Check interrupt queue status, IPSR */
197 static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
198 {
199 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
200
201 if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
202 return IRQ_WAKE_THREAD;
203
204 return IRQ_NONE;
205 }
206
207 static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
208 {
209 /* Reuse the ICVEC.CIV mask for mapping all interrupt vectors; fields are 4 bits each. */
210 return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
211 }
212
213 /*
214 * Enable queue processing in the hardware, register interrupt handler.
215 *
216 * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
217 * @irq_handler - threaded interrupt handler.
218 */
219 static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
220 struct riscv_iommu_queue *queue,
221 irq_handler_t irq_handler)
222 {
223 const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
224 u32 csr;
225 int rc;
226
227 if (queue->iommu)
228 return -EBUSY;
229
230 /* Polling not implemented */
231 if (!irq)
232 return -ENODEV;
233
234 queue->iommu = iommu;
235 rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
236 IRQF_ONESHOT | IRQF_SHARED,
237 dev_name(iommu->dev), queue);
238 if (rc) {
239 queue->iommu = NULL;
240 return rc;
241 }
242
243 /* Empty queue before enabling it */
244 if (queue->qid == RISCV_IOMMU_INTR_CQ)
245 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
246 else
247 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);
248
249 /*
250 * Enable queue with interrupts and clear any pending memory fault.
251 * Wait for the hardware to acknowledge request and activate queue
252 * processing.
253 * Note: all CSR bitfields are at the same offsets for all queues.
254 */
255 riscv_iommu_writel(iommu, queue->qcr,
256 RISCV_IOMMU_QUEUE_ENABLE |
257 RISCV_IOMMU_QUEUE_INTR_ENABLE |
258 RISCV_IOMMU_QUEUE_MEM_FAULT);
259
260 riscv_iommu_readl_timeout(iommu, queue->qcr,
261 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
262 10, RISCV_IOMMU_QCSR_TIMEOUT);
263
264 if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
265 RISCV_IOMMU_QUEUE_BUSY |
266 RISCV_IOMMU_QUEUE_MEM_FAULT))) {
267 /* Best effort to stop and disable failing hardware queue. */
268 riscv_iommu_writel(iommu, queue->qcr, 0);
269 free_irq(irq, queue);
270 queue->iommu = NULL;
271 dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
272 return -EBUSY;
273 }
274
275 /* Clear any pending interrupt flag. */
276 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
277
278 return 0;
279 }
280
281 /*
282 * Disable queue. Wait for the hardware to acknowledge request and
283 * stop processing enqueued requests. Report errors but continue.
284 */
285 static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
286 {
287 struct riscv_iommu_device *iommu = queue->iommu;
288 u32 csr;
289
290 if (!iommu)
291 return;
292
293 free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
294 riscv_iommu_writel(iommu, queue->qcr, 0);
295 riscv_iommu_readl_timeout(iommu, queue->qcr,
296 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
297 10, RISCV_IOMMU_QCSR_TIMEOUT);
298
299 if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
300 dev_err(iommu->dev, "failed to disable hardware queue #%u, csr 0x%x\n",
301 queue->qid, csr);
302
303 queue->iommu = NULL;
304 }
305
306 /*
307 * Returns number of available valid queue entries and the first item index.
308 * Update shadow producer index if necessary.
309 */
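/*
 * Note: queue->head and queue->tail are free-running unsigned counters that
 * only ever grow; the (int)(tail - head) difference stays correct across
 * wrap-around (queue length is <= INT_MAX), and Q_ITEM() reduces either
 * counter to a ring-buffer slot.
 */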
310 static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
311 unsigned int *index)
312 {
313 unsigned int head = atomic_read(&queue->head);
314 unsigned int tail = atomic_read(&queue->tail);
315 unsigned int last = Q_ITEM(queue, tail);
316 int available = (int)(tail - head);
317
318 *index = head;
319
320 if (available > 0)
321 return available;
322
323 /* read hardware producer index, check reserved register bits are not set. */
324 if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
325 tail, (tail & ~queue->mask) == 0,
326 0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
327 dev_err_once(queue->iommu->dev,
328 "Hardware error: queue access timeout\n");
329 return 0;
330 }
331
332 if (tail == last)
333 return 0;
334
335 /* update shadow producer index */
336 return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
337 }
338
339 /*
340 * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
341 */
342 static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
343 {
344 const unsigned int head = atomic_add_return(count, &queue->head);
345
346 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
347 }
348
349 /* Return actual consumer index based on hardware reported queue head index. */
350 static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
351 {
352 const unsigned int cons = atomic_read(&queue->head);
353 const unsigned int last = Q_ITEM(queue, cons);
354 unsigned int head;
355
356 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
357 !(head & ~queue->mask),
358 0, RISCV_IOMMU_QUEUE_TIMEOUT))
359 return cons;
360
361 return cons + ((head - last) & queue->mask);
362 }
363
364 /* Wait for submitted item to be processed. */
365 static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
366 unsigned int index,
367 unsigned int timeout_us)
368 {
369 unsigned int cons = atomic_read(&queue->head);
370
371 /* Already processed by the consumer */
372 if ((int)(cons - index) > 0)
373 return 0;
374
375 /* Monitor consumer index */
376 return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
377 (int)(cons - index) > 0, 0, timeout_us);
378 }
379
380 /* Enqueue an entry and wait for it to be processed if timeout_us > 0.
381 *
382 * Error handling for IOMMU hardware not responding in a reasonable time
383 * will be added as a separate patch series along with other RAS features.
384 * For now, only report hardware failure and continue.
385 */
386 static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
387 void *entry, size_t entry_size)
388 {
389 unsigned int prod;
390 unsigned int head;
391 unsigned int tail;
392 unsigned long flags;
393
394 /* Do not preempt submission flow. */
395 local_irq_save(flags);
396
397 /* 1. Allocate some space in the queue */
398 prod = atomic_inc_return(&queue->prod) - 1;
399 head = atomic_read(&queue->head);
400
401 /* 2. Wait for space availability. */
402 if ((prod - head) > queue->mask) {
403 if (readx_poll_timeout(atomic_read, &queue->head,
404 head, (prod - head) < queue->mask,
405 0, RISCV_IOMMU_QUEUE_TIMEOUT))
406 goto err_busy;
407 } else if ((prod - head) == queue->mask) {
408 const unsigned int last = Q_ITEM(queue, head);
409
410 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
411 !(head & ~queue->mask) && head != last,
412 0, RISCV_IOMMU_QUEUE_TIMEOUT))
413 goto err_busy;
414 atomic_add((head - last) & queue->mask, &queue->head);
415 }
416
417 /* 3. Store entry in the ring buffer */
418 memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
419
420 /* 4. Wait for all previous entries to be ready */
421 if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
422 0, RISCV_IOMMU_QUEUE_TIMEOUT))
423 goto err_busy;
424
425 /*
426 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
427 * completed and visible before signaling the tail doorbell to fetch
428 * the next command. 'fence ow, ow'
429 */
430 dma_wmb();
431 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
432
433 /*
434 * 6. Make sure the doorbell write to the device has finished before updating
435 * the shadow tail index in normal memory. 'fence o, w'
436 */
437 mmiowb();
438 atomic_inc(&queue->tail);
439
440 /* 7. Complete submission and restore local interrupts */
441 local_irq_restore(flags);
442
443 return prod;
444
445 err_busy:
446 local_irq_restore(flags);
447 dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
448
449 return prod;
450 }
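/*
 * Submission summary: atomic_inc_return() on queue->prod reserves a unique
 * slot, step 4 ensures slots are published to the hardware strictly in
 * reservation order, and the shadow tail only advances after the doorbell
 * write, keeping software and hardware producer indexes consistent.
 */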
451
452 /*
453 * IOMMU Command queue chapter 3.1
454 */
455
456 /* Command queue interrupt handler thread function */
457 static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
458 {
459 const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
460 unsigned int ctrl;
461
462 /* Clear MF/CQ errors, complete error recovery to be implemented. */
463 ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
464 if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
465 RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
466 riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
467 dev_warn(queue->iommu->dev,
468 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
469 queue->qid,
470 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
471 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
472 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
473 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
474 }
475
476 /* Placeholder for command queue interrupt notifiers */
477
478 /* Clear command interrupt pending. */
479 riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
480
481 return IRQ_HANDLED;
482 }
483
484 /* Send command to the IOMMU command queue */
485 static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
486 struct riscv_iommu_command *cmd)
487 {
488 riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
489 }
490
491 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */
492 static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
493 unsigned int timeout_us)
494 {
495 struct riscv_iommu_command cmd;
496 unsigned int prod;
497
498 riscv_iommu_cmd_iofence(&cmd);
499 prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
500
501 if (!timeout_us)
502 return;
503
504 if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
505 dev_err_once(iommu->dev,
506 "Hardware error: command execution timeout\n");
507 }
508
509 /*
510 * IOMMU Fault/Event queue chapter 3.2
511 */
512
513 static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
514 struct riscv_iommu_fq_record *event)
515 {
516 unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
517 unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
518
519 /* Placeholder for future fault handling implementation, report only. */
520 if (err)
521 dev_warn_ratelimited(iommu->dev,
522 "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
523 err, devid, event->iotval, event->iotval2);
524 }
525
526 /* Fault queue interrupt handler thread function */
527 static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
528 {
529 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
530 struct riscv_iommu_device *iommu = queue->iommu;
531 struct riscv_iommu_fq_record *events;
532 unsigned int ctrl, idx;
533 int cnt, len;
534
535 events = (struct riscv_iommu_fq_record *)queue->base;
536
537 /* Clear fault interrupt pending and process all received fault events. */
538 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
539
540 do {
541 cnt = riscv_iommu_queue_consume(queue, &idx);
542 for (len = 0; len < cnt; idx++, len++)
543 riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
544 riscv_iommu_queue_release(queue, cnt);
545 } while (cnt > 0);
546
547 /* Clear MF/OF errors, complete error recovery to be implemented. */
548 ctrl = riscv_iommu_readl(iommu, queue->qcr);
549 if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
550 riscv_iommu_writel(iommu, queue->qcr, ctrl);
551 dev_warn(iommu->dev,
552 "Queue #%u error; memory fault:%d overflow:%d\n",
553 queue->qid,
554 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
555 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
556 }
557
558 return IRQ_HANDLED;
559 }
560
561 /* Lookup and initialize device context info structure. */
562 static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
563 unsigned int devid)
564 {
565 const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
566 unsigned int depth;
567 unsigned long ddt, old, new;
568 void *ptr;
569 u8 ddi_bits[3] = { 0 };
570 u64 *ddtp = NULL;
571
572 /* Make sure the mode is valid */
573 if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
574 iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
575 return NULL;
576
577 /*
578 * Device id partitioning for base format:
579 * DDI[0]: bits 0 - 6 (1st level) (7 bits)
580 * DDI[1]: bits 7 - 15 (2nd level) (9 bits)
581 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
582 *
583 * For extended format:
584 * DDI[0]: bits 0 - 5 (1st level) (6 bits)
585 * DDI[1]: bits 6 - 14 (2nd level) (9 bits)
586 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
587 */
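/*
 * Example (base format): devid 0x012345 splits into DDI[0] = 0x45
 * (bits 6:0), DDI[1] = 0x46 (bits 15:7) and DDI[2] = 0x01 (bits 23:16),
 * selecting the leaf slot, 2nd-level and root-level indexes respectively.
 */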
588 if (base_format) {
589 ddi_bits[0] = 7;
590 ddi_bits[1] = 7 + 9;
591 ddi_bits[2] = 7 + 9 + 8;
592 } else {
593 ddi_bits[0] = 6;
594 ddi_bits[1] = 6 + 9;
595 ddi_bits[2] = 6 + 9 + 9;
596 }
597
598 /* Make sure device id is within range */
599 depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
600 if (devid >= (1 << ddi_bits[depth]))
601 return NULL;
602
603 /* Get to the level of the non-leaf node that holds the device context */
604 for (ddtp = iommu->ddt_root; depth-- > 0;) {
605 const int split = ddi_bits[depth];
606 /*
607 * Each non-leaf node is 64bits wide and on each level
608 * nodes are indexed by DDI[depth].
609 */
610 ddtp += (devid >> split) & 0x1FF;
611
612 /*
613 * Check if this node has been populated and if not
614 * allocate a new level and populate it.
615 */
616 do {
617 ddt = READ_ONCE(*(unsigned long *)ddtp);
618 if (ddt & RISCV_IOMMU_DDTE_V) {
619 ddtp = __va(ppn_to_phys(ddt));
620 break;
621 }
622
623 ptr = riscv_iommu_get_pages(iommu, 0);
624 if (!ptr)
625 return NULL;
626
627 new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
628 old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
629
630 if (old == ddt) {
631 ddtp = (u64 *)ptr;
632 break;
633 }
634
635 /* Race setting DDT detected, re-read and retry. */
636 riscv_iommu_free_pages(iommu, ptr);
637 } while (1);
638 }
639
640 /*
641 * Grab the node that matches DDI[depth], note that when using base
642 * format the device context is 4 * 64bits, and the extended format
643 * is 8 * 64bits, hence the (3 - base_format) below.
644 */
645 ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
646
647 return (struct riscv_iommu_dc *)ddtp;
648 }
649
650 /*
651 * This is a best-effort IOMMU translation shutdown flow.
652 * Disable IOMMU without waiting for hardware response.
653 */
654 void riscv_iommu_disable(struct riscv_iommu_device *iommu)
655 {
656 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
657 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
658 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
659 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
660 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
661 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
662 }
663
664 #define riscv_iommu_read_ddtp(iommu) ({ \
665 u64 ddtp; \
666 riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
667 !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
668 RISCV_IOMMU_DDTP_TIMEOUT); \
669 ddtp; })
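/*
 * The macro above polls DDTP until the BUSY bit clears (or the 10s timeout
 * expires) and evaluates to the last value read; callers re-check
 * RISCV_IOMMU_DDTP_BUSY to distinguish a timeout from a completed update.
 */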
670
671 static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
672 {
673 u64 ddtp;
674 unsigned int mode;
675
676 ddtp = riscv_iommu_read_ddtp(iommu);
677 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
678 return -EBUSY;
679
680 /*
681 * It is optional for the hardware to report a fixed address for device
682 * directory root page when DDT.MODE is OFF or BARE.
683 */
684 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
685 if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
686 mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
687 /* Use WARL to discover hardware fixed DDT PPN */
688 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
689 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
690 ddtp = riscv_iommu_read_ddtp(iommu);
691 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
692 return -EBUSY;
693
694 iommu->ddt_phys = ppn_to_phys(ddtp);
695 if (iommu->ddt_phys)
696 iommu->ddt_root = devm_ioremap(iommu->dev,
697 iommu->ddt_phys, PAGE_SIZE);
698 if (iommu->ddt_root)
699 memset(iommu->ddt_root, 0, PAGE_SIZE);
700 }
701
702 if (!iommu->ddt_root) {
703 iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
704 iommu->ddt_phys = __pa(iommu->ddt_root);
705 }
706
707 if (!iommu->ddt_root)
708 return -ENOMEM;
709
710 return 0;
711 }
712
713 /*
714 * Discover supported DDT modes starting from requested value,
715 * configure DDTP register with accepted mode and root DDT address.
716 * Accepted iommu->ddt_mode is updated on success.
717 */
718 static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
719 unsigned int ddtp_mode)
720 {
721 struct device *dev = iommu->dev;
722 u64 ddtp, rq_ddtp;
723 unsigned int mode, rq_mode = ddtp_mode;
724 struct riscv_iommu_command cmd;
725
726 ddtp = riscv_iommu_read_ddtp(iommu);
727 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
728 return -EBUSY;
729
730 /* Disallow state transition from xLVL to xLVL. */
731 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
732 if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
733 mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
734 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
735 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
736 return -EINVAL;
737
738 do {
739 rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
740 if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
741 rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
742
743 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
744 ddtp = riscv_iommu_read_ddtp(iommu);
745 if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
746 dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
747 rq_mode, ddtp);
748 return -EBUSY;
749 }
750
751 /* Verify IOMMU hardware accepts new DDTP config. */
752 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
753
754 if (rq_mode == mode)
755 break;
756
757 /* Hardware mandatory DDTP mode has not been accepted. */
758 if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
759 dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
760 ddtp, rq_ddtp);
761 return -EINVAL;
762 }
763
764 /*
765 * Mode field is WARL, an IOMMU may support a subset of
766 * directory table levels in which case if we tried to set
767 * an unsupported number of levels we'll readback either
768 * a valid xLVL or off/bare. If we got off/bare, try again
769 * with a smaller xLVL.
770 */
771 if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
772 rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
773 dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
774 rq_mode--;
775 continue;
776 }
777
778 /*
779 * We tried all supported modes and IOMMU hardware failed to
780 * accept new settings, something went very wrong since off/bare
781 * and at least one xLVL must be supported.
782 */
783 dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
784 mode, ddtp_mode);
785 return -EINVAL;
786 } while (1);
787
788 iommu->ddt_mode = mode;
789 if (mode != ddtp_mode)
790 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
791
792 /* Invalidate device context cache */
793 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
794 riscv_iommu_cmd_send(iommu, &cmd);
795
796 /* Invalidate address translation cache */
797 riscv_iommu_cmd_inval_vma(&cmd);
798 riscv_iommu_cmd_send(iommu, &cmd);
799
800 /* IOFENCE.C */
801 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
802
803 return 0;
804 }
805
806 /* This struct contains protection domain specific IOMMU driver data. */
807 struct riscv_iommu_domain {
808 struct iommu_domain domain;
809 struct list_head bonds;
810 spinlock_t lock; /* protect bonds list updates. */
811 int pscid;
812 bool amo_enabled;
813 int numa_node;
814 unsigned int pgd_mode;
815 unsigned long *pgd_root;
816 };
817
818 #define iommu_domain_to_riscv(iommu_domain) \
819 container_of(iommu_domain, struct riscv_iommu_domain, domain)
820
821 /* Private IOMMU data for managed devices, dev_iommu_priv_* */
822 struct riscv_iommu_info {
823 struct riscv_iommu_domain *domain;
824 };
825
826 /*
827 * Linkage between an iommu_domain and attached devices.
828 *
829 * A protection domain requiring IOATC and DevATC translation cache invalidations
830 * should be linked to its attached devices using riscv_iommu_bond structures.
831 * Devices should be linked to the domain before first use and unlinked after
832 * the translations from the referenced protection domain can no longer be used.
833 * Blocking and identity domains are not tracked here, as the IOMMU hardware
834 * does not cache negative and/or identity (BARE mode) translations, and DevATC
835 * is disabled for those protection domains.
836 *
837 * The device pointer and IOMMU data remain stable in the bond struct after
838 * _probe_device() where it's attached to the managed IOMMU, up to the
839 * completion of the _release_device() call. The release of the bond structure
840 * is synchronized with the device release.
841 */
842 struct riscv_iommu_bond {
843 struct list_head list;
844 struct rcu_head rcu;
845 struct device *dev;
846 };
847
848 static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
849 struct device *dev)
850 {
851 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
852 struct riscv_iommu_bond *bond;
853 struct list_head *bonds;
854
855 bond = kzalloc(sizeof(*bond), GFP_KERNEL);
856 if (!bond)
857 return -ENOMEM;
858 bond->dev = dev;
859
860 /*
861 * List of devices attached to the domain is arranged based on
862 * managed IOMMU device.
863 */
864
865 spin_lock(&domain->lock);
866 list_for_each(bonds, &domain->bonds)
867 if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
868 break;
869 list_add_rcu(&bond->list, bonds);
870 spin_unlock(&domain->lock);
871
872 /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
873 smp_mb();
874
875 return 0;
876 }
877
878 static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
879 struct device *dev)
880 {
881 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
882 struct riscv_iommu_bond *bond, *found = NULL;
883 struct riscv_iommu_command cmd;
884 int count = 0;
885
886 if (!domain)
887 return;
888
889 spin_lock(&domain->lock);
890 list_for_each_entry(bond, &domain->bonds, list) {
891 if (found && count)
892 break;
893 else if (bond->dev == dev)
894 found = bond;
895 else if (dev_to_iommu(bond->dev) == iommu)
896 count++;
897 }
898 if (found)
899 list_del_rcu(&found->list);
900 spin_unlock(&domain->lock);
901 kfree_rcu(found, rcu);
902
903 /*
904 * If this was the last bond between this domain and the IOMMU
905 * invalidate all cached entries for domain's PSCID.
906 */
907 if (!count) {
908 riscv_iommu_cmd_inval_vma(&cmd);
909 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
910 riscv_iommu_cmd_send(iommu, &cmd);
911
912 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
913 }
914 }
915
916 /*
917 * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
918 * This limit will be replaced with range invalidations, if supported by
919 * the hardware, once the RISC-V IOMMU architecture specification update
920 * for range invalidations becomes available.
921 */
922 #define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20)
923
924 static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
925 unsigned long start, unsigned long end)
926 {
927 struct riscv_iommu_bond *bond;
928 struct riscv_iommu_device *iommu, *prev;
929 struct riscv_iommu_command cmd;
930 unsigned long len = end - start + 1;
931 unsigned long iova;
932
933 /*
934 * For each IOMMU linked with this protection domain (via bonds->dev),
935 * an IOTLB invalidation command will be submitted and executed.
936 *
937 * Possible race with the domain attach flow is handled by sequencing
938 * bond creation - riscv_iommu_bond_link(), and device directory
939 * update - riscv_iommu_iodir_update().
940 *
941 * PTE Update / IOTLB Inval Device attach & directory update
942 * -------------------------- --------------------------
943 * update page table entries add dev to the bond list
944 * FENCE RW,RW FENCE RW,RW
945 * For all IOMMUs: (can be empty) Update FSC/PSCID
946 * FENCE IOW,IOW FENCE IOW,IOW
947 * IOTLB.INVAL IODIR.INVAL
948 * IOFENCE.C
949 *
950 * If the bond list is not updated with the new device, the directory context will
951 * be configured with already valid page table content. If an IOMMU is
952 * linked to the protection domain it will receive invalidation
953 * requests for updated page table entries.
954 */
955 smp_mb();
956
957 rcu_read_lock();
958
959 prev = NULL;
960 list_for_each_entry_rcu(bond, &domain->bonds, list) {
961 iommu = dev_to_iommu(bond->dev);
962
963 /*
964 * IOTLB invalidation request can be safely omitted if already sent
965 * to the IOMMU for the same PSCID, and with domain->bonds list
966 * arranged based on the device's IOMMU, it's sufficient to check
967 * last device the invalidation was sent to.
968 */
969 if (iommu == prev)
970 continue;
971
972 riscv_iommu_cmd_inval_vma(&cmd);
973 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
974 if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
975 for (iova = start; iova < end; iova += PAGE_SIZE) {
976 riscv_iommu_cmd_inval_set_addr(&cmd, iova);
977 riscv_iommu_cmd_send(iommu, &cmd);
978 }
979 } else {
980 riscv_iommu_cmd_send(iommu, &cmd);
981 }
982 prev = iommu;
983 }
984
985 prev = NULL;
986 list_for_each_entry_rcu(bond, &domain->bonds, list) {
987 iommu = dev_to_iommu(bond->dev);
988 if (iommu == prev)
989 continue;
990
991 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
992 prev = iommu;
993 }
994 rcu_read_unlock();
995 }
996
997 #define RISCV_IOMMU_FSC_BARE 0
998
999 /*
1000 * Update IODIR for the device.
1001 *
1002 * During the execution of riscv_iommu_probe_device(), IODIR entries are
1003 * allocated for the device's identifiers. Device context invalidation
1004 * becomes necessary only if one of the updated entries was previously
1005 * marked as valid, given that invalid device context entries are not
1006 * cached by the IOMMU hardware.
1007 * In this implementation, updating a valid device context while the
1008 * device is not quiesced might be disruptive, potentially causing
1009 * interim translation faults.
1010 */
1011 static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
1012 struct device *dev, u64 fsc, u64 ta)
1013 {
1014 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1015 struct riscv_iommu_dc *dc;
1016 struct riscv_iommu_command cmd;
1017 bool sync_required = false;
1018 u64 tc;
1019 int i;
1020
1021 for (i = 0; i < fwspec->num_ids; i++) {
1022 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1023 tc = READ_ONCE(dc->tc);
1024 if (!(tc & RISCV_IOMMU_DC_TC_V))
1025 continue;
1026
1027 WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
1028
1029 /* Invalidate device context cached values */
1030 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1031 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1032 riscv_iommu_cmd_send(iommu, &cmd);
1033 sync_required = true;
1034 }
1035
1036 if (sync_required)
1037 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1038
1039 /*
1040 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
1041 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
1042 */
1043 for (i = 0; i < fwspec->num_ids; i++) {
1044 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1045 tc = READ_ONCE(dc->tc);
1046 tc |= ta & RISCV_IOMMU_DC_TC_V;
1047
1048 WRITE_ONCE(dc->fsc, fsc);
1049 WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
1050 /* Update device context, write TC.V as the last step. */
1051 dma_wmb();
1052 WRITE_ONCE(dc->tc, tc);
1053
1054 /* Invalidate device context after update */
1055 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1056 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1057 riscv_iommu_cmd_send(iommu, &cmd);
1058 }
1059
1060 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1061 }
1062
1063 /*
1064 * IOVA page translation tree management.
1065 */
1066
1067 static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
1068 {
1069 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1070
1071 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1072 }
1073
1074 static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
1075 struct iommu_iotlb_gather *gather)
1076 {
1077 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1078
1079 riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
1080 }
1081
1082 #define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
1083
1084 #define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
1085 #define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF)
1086 #define _io_pte_none(pte) ((pte) == 0)
1087 #define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
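/*
 * The I/O page table reuses the CPU's Sv39/Sv48/Sv57 PTE encoding (_PAGE_*
 * definitions), so a leaf is any entry with R/W/X permissions set, and
 * PT_SHIFT is 9 on rv64, giving 512 entries per table level.
 */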
1088
1089 static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
1090 unsigned long pte, struct list_head *freelist)
1091 {
1092 unsigned long *ptr;
1093 int i;
1094
1095 if (!_io_pte_present(pte) || _io_pte_leaf(pte))
1096 return;
1097
1098 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1099
1100 /* Recursively free all sub page table pages */
1101 for (i = 0; i < PTRS_PER_PTE; i++) {
1102 pte = READ_ONCE(ptr[i]);
1103 if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
1104 riscv_iommu_pte_free(domain, pte, freelist);
1105 }
1106
1107 if (freelist)
1108 list_add_tail(&virt_to_page(ptr)->lru, freelist);
1109 else
1110 iommu_free_page(ptr);
1111 }
1112
1113 static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
1114 unsigned long iova, size_t pgsize,
1115 gfp_t gfp)
1116 {
1117 unsigned long *ptr = domain->pgd_root;
1118 unsigned long pte, old;
1119 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1120 void *addr;
1121
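/*
 * pgd_mode Sv39/Sv48/Sv57 starts the walk at level 2/3/4, i.e. a
 * 3/4/5 level page table, matching the CPU MMU formats.
 */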
1122 do {
1123 const int shift = PAGE_SHIFT + PT_SHIFT * level;
1124
1125 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1126 /*
1127 * Note: returned entry might be a non-leaf if there was
1128 * existing mapping with smaller granularity. Up to the caller
1129 * to replace and invalidate.
1130 */
1131 if (((size_t)1 << shift) == pgsize)
1132 return ptr;
1133 pte_retry:
1134 pte = READ_ONCE(*ptr);
1135 /*
1136 * This is very likely incorrect as we should not be adding
1137 * a new mapping with smaller granularity on top
1138 * of an existing 2M/1G mapping. Fail.
1139 */
1140 if (_io_pte_present(pte) && _io_pte_leaf(pte))
1141 return NULL;
1142 /*
1143 * Non-leaf entry is missing, allocate and try to add to the
1144 * page table. This might race with other mappings, retry.
1145 */
1146 if (_io_pte_none(pte)) {
1147 addr = iommu_alloc_page_node(domain->numa_node, gfp);
1148 if (!addr)
1149 return NULL;
1150 old = pte;
1151 pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
1152 if (cmpxchg_relaxed(ptr, old, pte) != old) {
1153 iommu_free_page(addr);
1154 goto pte_retry;
1155 }
1156 }
1157 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1158 } while (level-- > 0);
1159
1160 return NULL;
1161 }
1162
1163 static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
1164 unsigned long iova, size_t *pte_pgsize)
1165 {
1166 unsigned long *ptr = domain->pgd_root;
1167 unsigned long pte;
1168 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1169
1170 do {
1171 const int shift = PAGE_SHIFT + PT_SHIFT * level;
1172
1173 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1174 pte = READ_ONCE(*ptr);
1175 if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
1176 *pte_pgsize = (size_t)1 << shift;
1177 return ptr;
1178 }
1179 if (_io_pte_none(pte))
1180 return NULL;
1181 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1182 } while (level-- > 0);
1183
1184 return NULL;
1185 }
1186
1187 static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
1188 unsigned long iova, phys_addr_t phys,
1189 size_t pgsize, size_t pgcount, int prot,
1190 gfp_t gfp, size_t *mapped)
1191 {
1192 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1193 size_t size = 0;
1194 unsigned long *ptr;
1195 unsigned long pte, old, pte_prot;
1196 int rc = 0;
1197 LIST_HEAD(freelist);
1198
1199 if (!(prot & IOMMU_WRITE))
1200 pte_prot = _PAGE_BASE | _PAGE_READ;
1201 else if (domain->amo_enabled)
1202 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
1203 else
1204 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
1205
1206 while (pgcount) {
1207 ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
1208 if (!ptr) {
1209 rc = -ENOMEM;
1210 break;
1211 }
1212
1213 old = READ_ONCE(*ptr);
1214 pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
1215 if (cmpxchg_relaxed(ptr, old, pte) != old)
1216 continue;
1217
1218 riscv_iommu_pte_free(domain, old, &freelist);
1219
1220 size += pgsize;
1221 iova += pgsize;
1222 phys += pgsize;
1223 --pgcount;
1224 }
1225
1226 *mapped = size;
1227
1228 if (!list_empty(&freelist)) {
1229 /*
1230 * In the 1.0 spec version, the smallest scope we can use to
1231 * invalidate all levels of page table (i.e. leaf and non-leaf)
1232 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
1233 * This will be updated with hardware support for
1234 * capability.NL (non-leaf) IOTINVAL command.
1235 */
1236 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1237 iommu_put_pages_list(&freelist);
1238 }
1239
1240 return rc;
1241 }
1242
1243 static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
1244 unsigned long iova, size_t pgsize,
1245 size_t pgcount,
1246 struct iommu_iotlb_gather *gather)
1247 {
1248 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1249 size_t size = pgcount << __ffs(pgsize);
1250 unsigned long *ptr, old;
1251 size_t unmapped = 0;
1252 size_t pte_size;
1253
1254 while (unmapped < size) {
1255 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1256 if (!ptr)
1257 return unmapped;
1258
1259 /* partial unmap is not allowed, fail. */
1260 if (iova & (pte_size - 1))
1261 return unmapped;
1262
1263 old = READ_ONCE(*ptr);
1264 if (cmpxchg_relaxed(ptr, old, 0) != old)
1265 continue;
1266
1267 iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
1268 pte_size);
1269
1270 iova += pte_size;
1271 unmapped += pte_size;
1272 }
1273
1274 return unmapped;
1275 }
1276
1277 static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
1278 dma_addr_t iova)
1279 {
1280 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1281 size_t pte_size;
1282 unsigned long *ptr;
1283
1284 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1285 if (!ptr)
1286 return 0;
1287
1288 return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
1289 }
1290
1291 static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
1292 {
1293 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1294 const unsigned long pfn = virt_to_pfn(domain->pgd_root);
1295
1296 WARN_ON(!list_empty(&domain->bonds));
1297
1298 if ((int)domain->pscid > 0)
1299 ida_free(&riscv_iommu_pscids, domain->pscid);
1300
1301 riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
1302 kfree(domain);
1303 }
1304
1305 static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
1306 {
1307 switch (pgd_mode) {
1308 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
1309 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
1310
1311 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
1312 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
1313
1314 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
1315 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
1316 }
1317 return false;
1318 }
1319
1320 static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
1321 struct device *dev)
1322 {
1323 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1324 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1325 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1326 u64 fsc, ta;
1327
1328 if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
1329 return -ENODEV;
1330
1331 fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
1332 FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
1333 ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
1334 RISCV_IOMMU_PC_TA_V;
1335
1336 if (riscv_iommu_bond_link(domain, dev))
1337 return -ENOMEM;
1338
1339 riscv_iommu_iodir_update(iommu, dev, fsc, ta);
1340 riscv_iommu_bond_unlink(info->domain, dev);
1341 info->domain = domain;
1342
1343 return 0;
1344 }
1345
1346 static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
1347 .attach_dev = riscv_iommu_attach_paging_domain,
1348 .free = riscv_iommu_free_paging_domain,
1349 .map_pages = riscv_iommu_map_pages,
1350 .unmap_pages = riscv_iommu_unmap_pages,
1351 .iova_to_phys = riscv_iommu_iova_to_phys,
1352 .iotlb_sync = riscv_iommu_iotlb_sync,
1353 .flush_iotlb_all = riscv_iommu_iotlb_flush_all,
1354 };
1355
1356 static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
1357 {
1358 struct riscv_iommu_domain *domain;
1359 struct riscv_iommu_device *iommu;
1360 unsigned int pgd_mode;
1361 dma_addr_t va_mask;
1362 int va_bits;
1363
1364 iommu = dev_to_iommu(dev);
1365 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
1366 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
1367 va_bits = 57;
1368 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
1369 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
1370 va_bits = 48;
1371 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
1372 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
1373 va_bits = 39;
1374 } else {
1375 dev_err(dev, "cannot find supported page table mode\n");
1376 return ERR_PTR(-ENODEV);
1377 }
1378
1379 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1380 if (!domain)
1381 return ERR_PTR(-ENOMEM);
1382
1383 INIT_LIST_HEAD_RCU(&domain->bonds);
1384 spin_lock_init(&domain->lock);
1385 domain->numa_node = dev_to_node(iommu->dev);
1386 domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
1387 domain->pgd_mode = pgd_mode;
1388 domain->pgd_root = iommu_alloc_page_node(domain->numa_node,
1389 GFP_KERNEL_ACCOUNT);
1390 if (!domain->pgd_root) {
1391 kfree(domain);
1392 return ERR_PTR(-ENOMEM);
1393 }
1394
1395 domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
1396 RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
1397 if (domain->pscid < 0) {
1398 iommu_free_page(domain->pgd_root);
1399 kfree(domain);
1400 return ERR_PTR(-ENOMEM);
1401 }
1402
1403 /*
1404 * Note: RISC-V Privilege spec mandates that virtual addresses
1405 * need to be sign-extended, so if (VA_BITS - 1) is set, all
1406 * bits >= VA_BITS need to also be set or else we'll get a
1407 * page fault. However the code that creates the mappings
1408 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
1409 * for now, so we'll end up with invalid virtual addresses
1410 * to map. As a workaround until we get this sorted out
1411 * limit the available virtual addresses to VA_BITS - 1.
1412 */
1413 va_mask = DMA_BIT_MASK(va_bits - 1);
1414
1415 domain->domain.geometry.aperture_start = 0;
1416 domain->domain.geometry.aperture_end = va_mask;
1417 domain->domain.geometry.force_aperture = true;
1418 domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
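/*
 * Masking with va_mask drops page sizes that do not fit the reduced
 * aperture, e.g. 512G pages are not advertised for Sv39 domains.
 */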
1419
1420 domain->domain.ops = &riscv_iommu_paging_domain_ops;
1421
1422 return &domain->domain;
1423 }
1424
1425 static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
1426 struct device *dev)
1427 {
1428 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1429 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1430
1431 /* Make device context invalid, translation requests will fault w/ #258 */
1432 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
1433 riscv_iommu_bond_unlink(info->domain, dev);
1434 info->domain = NULL;
1435
1436 return 0;
1437 }
1438
1439 static struct iommu_domain riscv_iommu_blocking_domain = {
1440 .type = IOMMU_DOMAIN_BLOCKED,
1441 .ops = &(const struct iommu_domain_ops) {
1442 .attach_dev = riscv_iommu_attach_blocking_domain,
1443 }
1444 };
1445
1446 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
1447 struct device *dev)
1448 {
1449 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1450 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1451
1452 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
1453 riscv_iommu_bond_unlink(info->domain, dev);
1454 info->domain = NULL;
1455
1456 return 0;
1457 }
1458
1459 static struct iommu_domain riscv_iommu_identity_domain = {
1460 .type = IOMMU_DOMAIN_IDENTITY,
1461 .ops = &(const struct iommu_domain_ops) {
1462 .attach_dev = riscv_iommu_attach_identity_domain,
1463 }
1464 };
1465
1466 static struct iommu_group *riscv_iommu_device_group(struct device *dev)
1467 {
1468 if (dev_is_pci(dev))
1469 return pci_device_group(dev);
1470 return generic_device_group(dev);
1471 }
1472
1473 static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
1474 {
1475 return iommu_fwspec_add_ids(dev, args->args, 1);
1476 }
1477
1478 static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
1479 {
1480 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1481 struct riscv_iommu_device *iommu;
1482 struct riscv_iommu_info *info;
1483 struct riscv_iommu_dc *dc;
1484 u64 tc;
1485 int i;
1486
1487 if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
1488 return ERR_PTR(-ENODEV);
1489
1490 iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
1491 if (!iommu)
1492 return ERR_PTR(-ENODEV);
1493
1494 /*
1495 * IOMMU hardware operating in fail-over BARE mode will provide
1496 * identity translation for all connected devices anyway...
1497 */
1498 if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
1499 return ERR_PTR(-ENODEV);
1500
1501 info = kzalloc(sizeof(*info), GFP_KERNEL);
1502 if (!info)
1503 return ERR_PTR(-ENOMEM);
1504 /*
1505 * Allocate and pre-configure device context entries in
1506 * the device directory. Do not mark the context valid yet.
1507 */
1508 tc = 0;
1509 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
1510 tc |= RISCV_IOMMU_DC_TC_SADE;
1511 for (i = 0; i < fwspec->num_ids; i++) {
1512 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1513 if (!dc) {
1514 kfree(info);
1515 return ERR_PTR(-ENODEV);
1516 }
1517 if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
1518 dev_warn(dev, "already attached to IOMMU device directory\n");
1519 WRITE_ONCE(dc->tc, tc);
1520 }
1521
1522 dev_iommu_priv_set(dev, info);
1523
1524 return &iommu->iommu;
1525 }
1526
1527 static void riscv_iommu_release_device(struct device *dev)
1528 {
1529 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1530
1531 kfree_rcu_mightsleep(info);
1532 }
1533
1534 static const struct iommu_ops riscv_iommu_ops = {
1535 .pgsize_bitmap = SZ_4K,
1536 .of_xlate = riscv_iommu_of_xlate,
1537 .identity_domain = &riscv_iommu_identity_domain,
1538 .blocked_domain = &riscv_iommu_blocking_domain,
1539 .release_domain = &riscv_iommu_blocking_domain,
1540 .domain_alloc_paging = riscv_iommu_alloc_paging_domain,
1541 .device_group = riscv_iommu_device_group,
1542 .probe_device = riscv_iommu_probe_device,
1543 .release_device = riscv_iommu_release_device,
1544 };
1545
1546 static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
1547 {
1548 u64 ddtp;
1549
1550 /*
1551 * Make sure the IOMMU is switched off or in pass-through mode during
1552 * regular boot flow and disable translation when we boot into a kexec
1553 * kernel and the previous kernel left it enabled.
1554 */
1555 ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
1556 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
1557 return -EBUSY;
1558
1559 if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
1560 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
1561 if (!is_kdump_kernel())
1562 return -EBUSY;
1563 riscv_iommu_disable(iommu);
1564 }
1565
1566 /* Configure accesses to in-memory data structures for CPU-native byte order. */
1567 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1568 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
1569 if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
1570 return -EINVAL;
1571 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
1572 iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
1573 iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
1574 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1575 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
1576 return -EINVAL;
1577 }
1578
1579 /*
1580 * Distribute interrupt vectors, always use first vector for CIV.
1581 * At least one interrupt is required. Read back and verify.
1582 */
1583 if (!iommu->irqs_count)
1584 return -EINVAL;
1585
1586 iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
1587 FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
1588 FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
1589 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
1590 iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
1591 if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
1592 FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
1593 max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
1594 FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
1595 return -EINVAL;
1596
1597 return 0;
1598 }
1599
1600 void riscv_iommu_remove(struct riscv_iommu_device *iommu)
1601 {
1602 iommu_device_unregister(&iommu->iommu);
1603 iommu_device_sysfs_remove(&iommu->iommu);
1604 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1605 riscv_iommu_queue_disable(&iommu->cmdq);
1606 riscv_iommu_queue_disable(&iommu->fltq);
1607 }
1608
1609 int riscv_iommu_init(struct riscv_iommu_device *iommu)
1610 {
1611 int rc;
1612
1613 RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
1614 RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
1615
1616 rc = riscv_iommu_init_check(iommu);
1617 if (rc)
1618 return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
1619
1620 rc = riscv_iommu_iodir_alloc(iommu);
1621 if (rc)
1622 return rc;
1623
1624 rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
1625 sizeof(struct riscv_iommu_command));
1626 if (rc)
1627 return rc;
1628
1629 rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
1630 sizeof(struct riscv_iommu_fq_record));
1631 if (rc)
1632 return rc;
1633
1634 rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
1635 if (rc)
1636 return rc;
1637
1638 rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
1639 if (rc)
1640 goto err_queue_disable;
1641
1642 rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
1643 if (rc)
1644 goto err_queue_disable;
1645
1646 rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
1647 dev_name(iommu->dev));
1648 if (rc) {
1649 dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
1650 goto err_iodir_off;
1651 }
1652
1653 rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
1654 if (rc) {
1655 dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
1656 goto err_remove_sysfs;
1657 }
1658
1659 return 0;
1660
1661 err_remove_sysfs:
1662 iommu_device_sysfs_remove(&iommu->iommu);
1663 err_iodir_off:
1664 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1665 err_queue_disable:
1666 riscv_iommu_queue_disable(&iommu->fltq);
1667 riscv_iommu_queue_disable(&iommu->cmdq);
1668 return rc;
1669 }
1670