1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * IOMMU API for RISC-V IOMMU implementations.
4 *
5 * Copyright © 2022-2024 Rivos Inc.
6 * Copyright © 2023 FORTH-ICS/CARV
7 *
8 * Authors
9 * Tomasz Jeznach <tjeznach@rivosinc.com>
10 * Nick Kossifidis <mick@ics.forth.gr>
11 */
12
13 #define pr_fmt(fmt) "riscv-iommu: " fmt
14
15 #include <linux/compiler.h>
16 #include <linux/crash_dump.h>
17 #include <linux/init.h>
18 #include <linux/iommu.h>
19 #include <linux/iopoll.h>
20 #include <linux/kernel.h>
21 #include <linux/pci.h>
22
23 #include "../iommu-pages.h"
24 #include "iommu-bits.h"
25 #include "iommu.h"
26
27 /* Timeouts in [us] */
28 #define RISCV_IOMMU_QCSR_TIMEOUT 150000
29 #define RISCV_IOMMU_QUEUE_TIMEOUT 150000
30 #define RISCV_IOMMU_DDTP_TIMEOUT 10000000
31 #define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
32
33 /* Number of entries per CMD/FLT queue, should be <= INT_MAX */
34 #define RISCV_IOMMU_DEF_CQ_COUNT 8192
35 #define RISCV_IOMMU_DEF_FQ_COUNT 4096
36
37 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
38 #define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
39 #define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12))
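/*
 * Worked example (for illustration): ddtp and the queue base registers keep
 * the 44-bit PPN in bits 53:10, i.e. PPN = PA >> 12 stored at bit offset 10,
 * so a shift by 2 converts between the two layouts. For a 4K-aligned
 * PA = 0x80200000 (PPN 0x80200):
 *   phys_to_ppn(0x80200000) = (0x80200000 >> 2) & <bits 53:10> = 0x20080000
 *   ppn_to_phys(0x20080000) = (0x20080000 << 2) & <bits 55:12> = 0x80200000
 */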
40
41 #define dev_to_iommu(dev) \
42 iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
43
44 /* IOMMU PSCID allocation namespace. */
45 static DEFINE_IDA(riscv_iommu_pscids);
46 #define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1)
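/*
 * PSCID is a 20-bit field, hence BIT(20) - 1 as the upper bound;
 * riscv_iommu_alloc_paging_domain() hands out values from this namespace
 * with ida_alloc_range(&riscv_iommu_pscids, 1, RISCV_IOMMU_MAX_PSCID, ...).
 */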
47
48 /* Device resource-managed allocations */
49 struct riscv_iommu_devres {
50 void *addr;
51 };
52
static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
54 {
55 struct riscv_iommu_devres *devres = res;
56
57 iommu_free_pages(devres->addr);
58 }
59
static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
61 {
62 struct riscv_iommu_devres *devres = res;
63 struct riscv_iommu_devres *target = p;
64
65 return devres->addr == target->addr;
66 }
67
static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu,
69 unsigned int size)
70 {
71 struct riscv_iommu_devres *devres;
72 void *addr;
73
74 addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev),
75 GFP_KERNEL_ACCOUNT, size);
76 if (unlikely(!addr))
77 return NULL;
78
79 devres = devres_alloc(riscv_iommu_devres_pages_release,
80 sizeof(struct riscv_iommu_devres), GFP_KERNEL);
81
82 if (unlikely(!devres)) {
83 iommu_free_pages(addr);
84 return NULL;
85 }
86
87 devres->addr = addr;
88
89 devres_add(iommu->dev, devres);
90
91 return addr;
92 }
93
static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
95 {
96 struct riscv_iommu_devres devres = { .addr = addr };
97
98 devres_release(iommu->dev, riscv_iommu_devres_pages_release,
99 riscv_iommu_devres_pages_match, &devres);
100 }
101
102 /*
103 * Hardware queue allocation and management.
104 */
105
106 /* Setup queue base, control registers and default queue length */
107 #define RISCV_IOMMU_QUEUE_INIT(q, name) do { \
108 struct riscv_iommu_queue *_q = q; \
109 _q->qid = RISCV_IOMMU_INTR_ ## name; \
110 _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
111 _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
112 _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
113 } while (0)
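/*
 * For reference, RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ) expands (via token
 * pasting) to roughly:
 *   _q->qid  = RISCV_IOMMU_INTR_CQ;
 *   _q->qbr  = RISCV_IOMMU_REG_CQB;
 *   _q->qcr  = RISCV_IOMMU_REG_CQCSR;
 *   _q->mask = _q->mask ?: RISCV_IOMMU_DEF_CQ_COUNT - 1;
 */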
114
115 /* Note: offsets are the same for all queues */
116 #define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
117 #define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
118 #define Q_ITEM(q, index) ((q)->mask & (index))
119 #define Q_IPSR(q) BIT((q)->qid)
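/*
 * Example: with an 8192-entry command queue, (q)->mask = 8191, so a
 * free-running index of 8195 maps to ring slot Q_ITEM(q, 8195) =
 * 8195 & 8191 = 3, and Q_IPSR(q) = BIT(RISCV_IOMMU_INTR_CQ) selects the
 * command-queue pending bit in the IPSR register.
 */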
120
121 /*
122 * Discover queue ring buffer hardware configuration, allocate in-memory
123 * ring buffer or use fixed I/O memory location, configure queue base register.
124 * Must be called before hardware queue is enabled.
125 *
126 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
127 * @entry_size - queue single element size in bytes.
128 */
static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
130 struct riscv_iommu_queue *queue,
131 size_t entry_size)
132 {
133 unsigned int logsz;
134 u64 qb, rb;
135
136 /*
137 * Use WARL base register property to discover maximum allowed
138 * number of entries and optional fixed IO address for queue location.
139 */
140 riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
141 qb = riscv_iommu_readq(iommu, queue->qbr);
142
143 /*
144 * Calculate and verify hardware supported queue length, as reported
145 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
146 * Update queue size based on hardware supported value.
147 */
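/*
 * Example: with RISCV_IOMMU_DEF_CQ_COUNT = 8192 the driver requests
 * logsz = ilog2(8191) = 12, i.e. 2^13 entries; if the WARL read-back
 * reports, say, LOG2SZ = 9, logsz is clamped to 9 and the queue ends up
 * with 2^10 = 1024 entries (mask = 1023).
 */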
148 logsz = ilog2(queue->mask);
149 if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
150 logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
151
152 /*
153 * Use WARL base register property to discover an optional fixed IO
154 * address for queue ring buffer location. Otherwise allocate contiguous
155 * system memory.
156 */
157 if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
158 const size_t queue_size = entry_size << (logsz + 1);
159
160 queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
161 queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
162 } else {
163 do {
164 const size_t queue_size = entry_size << (logsz + 1);
165
166 queue->base = riscv_iommu_get_pages(
167 iommu, max(queue_size, SZ_4K));
168 queue->phys = __pa(queue->base);
169 } while (!queue->base && logsz-- > 0);
170 }
171
172 if (!queue->base)
173 return -ENOMEM;
174
175 qb = phys_to_ppn(queue->phys) |
176 FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
177
178 /* Update base register and read back to verify hw accepted our write */
179 riscv_iommu_writeq(iommu, queue->qbr, qb);
180 rb = riscv_iommu_readq(iommu, queue->qbr);
181 if (rb != qb) {
182 dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
183 return -ENODEV;
184 }
185
186 /* Update actual queue mask */
187 queue->mask = (2U << logsz) - 1;
188
189 dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
190 queue->qid, logsz + 1);
191
192 return 0;
193 }
194
195 /* Check interrupt queue status, IPSR */
static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
197 {
198 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
199
200 if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
201 return IRQ_WAKE_THREAD;
202
203 return IRQ_NONE;
204 }
205
static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
207 {
/* Reuse the ICVEC.CIV field mask to extract the vector mapping for any cause. */
209 return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
210 }
211
212 /*
213 * Enable queue processing in the hardware, register interrupt handler.
214 *
215 * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
216 * @irq_handler - threaded interrupt handler.
217 */
static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
219 struct riscv_iommu_queue *queue,
220 irq_handler_t irq_handler)
221 {
222 const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
223 u32 csr;
224 int rc;
225
226 if (queue->iommu)
227 return -EBUSY;
228
229 /* Polling not implemented */
230 if (!irq)
231 return -ENODEV;
232
233 queue->iommu = iommu;
234 rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
235 IRQF_ONESHOT | IRQF_SHARED,
236 dev_name(iommu->dev), queue);
237 if (rc) {
238 queue->iommu = NULL;
239 return rc;
240 }
241
242 /* Empty queue before enabling it */
243 if (queue->qid == RISCV_IOMMU_INTR_CQ)
244 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
245 else
246 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);
247
248 /*
 * Enable the queue with interrupts and clear any pending memory fault.
 * Wait for the hardware to acknowledge the request and activate queue
 * processing.
 * Note: all CSR bitfields are at the same offsets for all queues.
253 */
254 riscv_iommu_writel(iommu, queue->qcr,
255 RISCV_IOMMU_QUEUE_ENABLE |
256 RISCV_IOMMU_QUEUE_INTR_ENABLE |
257 RISCV_IOMMU_QUEUE_MEM_FAULT);
258
259 riscv_iommu_readl_timeout(iommu, queue->qcr,
260 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
261 10, RISCV_IOMMU_QCSR_TIMEOUT);
262
263 if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
264 RISCV_IOMMU_QUEUE_BUSY |
265 RISCV_IOMMU_QUEUE_MEM_FAULT))) {
266 /* Best effort to stop and disable failing hardware queue. */
267 riscv_iommu_writel(iommu, queue->qcr, 0);
268 free_irq(irq, queue);
269 queue->iommu = NULL;
270 dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
271 return -EBUSY;
272 }
273
274 /* Clear any pending interrupt flag. */
275 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
276
277 return 0;
278 }
279
280 /*
281 * Disable queue. Wait for the hardware to acknowledge request and
282 * stop processing enqueued requests. Report errors but continue.
283 */
static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
285 {
286 struct riscv_iommu_device *iommu = queue->iommu;
287 u32 csr;
288
289 if (!iommu)
290 return;
291
292 free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
293 riscv_iommu_writel(iommu, queue->qcr, 0);
294 riscv_iommu_readl_timeout(iommu, queue->qcr,
295 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
296 10, RISCV_IOMMU_QCSR_TIMEOUT);
297
298 if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
299 dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
300 queue->qid, csr);
301
302 queue->iommu = NULL;
303 }
304
305 /*
 * Returns the number of available valid queue entries and the first item index.
 * Updates the shadow producer index if necessary.
308 */
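/*
 * Index arithmetic sketch (illustrative): head and tail are free-running
 * counters, masked with Q_ITEM() only when talking to the hardware. E.g.
 * with mask = 7, a shadow tail of 10 (ring slot Q_ITEM = 2) and a hardware
 * tail register reading 5, the producer has added (5 - 2) & 7 = 3 new
 * entries, so the shadow tail advances to 13 and up to 13 - head entries
 * are reported as available.
 */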
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
310 unsigned int *index)
311 {
312 unsigned int head = atomic_read(&queue->head);
313 unsigned int tail = atomic_read(&queue->tail);
314 unsigned int last = Q_ITEM(queue, tail);
315 int available = (int)(tail - head);
316
317 *index = head;
318
319 if (available > 0)
320 return available;
321
322 /* read hardware producer index, check reserved register bits are not set. */
323 if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
324 tail, (tail & ~queue->mask) == 0,
325 0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
326 dev_err_once(queue->iommu->dev,
327 "Hardware error: queue access timeout\n");
328 return 0;
329 }
330
331 if (tail == last)
332 return 0;
333
334 /* update shadow producer index */
335 return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
336 }
337
338 /*
339 * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
340 */
static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
342 {
343 const unsigned int head = atomic_add_return(count, &queue->head);
344
345 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
346 }
347
348 /* Return actual consumer index based on hardware reported queue head index. */
static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
350 {
351 const unsigned int cons = atomic_read(&queue->head);
352 const unsigned int last = Q_ITEM(queue, cons);
353 unsigned int head;
354
355 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
356 !(head & ~queue->mask),
357 0, RISCV_IOMMU_QUEUE_TIMEOUT))
358 return cons;
359
360 return cons + ((head - last) & queue->mask);
361 }
362
363 /* Wait for submitted item to be processed. */
static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
365 unsigned int index,
366 unsigned int timeout_us)
367 {
368 unsigned int cons = atomic_read(&queue->head);
369
370 /* Already processed by the consumer */
371 if ((int)(cons - index) > 0)
372 return 0;
373
374 /* Monitor consumer index */
375 return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
376 (int)(cons - index) > 0, 0, timeout_us);
377 }
378
379 /* Enqueue an entry and wait to be processed if timeout_us > 0
380 *
381 * Error handling for IOMMU hardware not responding in reasonable time
382 * will be added as separate patch series along with other RAS features.
383 * For now, only report hardware failure and continue.
384 */
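/*
 * Submission sketch (illustrative): prod, tail and head are free-running
 * counters. prod reserves a slot (step 1), head shadows the hardware
 * consumer index, and tail tracks entries whose doorbell has been rung
 * (steps 5-6). Indices are masked with Q_ITEM() only when the ring memory
 * or hardware registers are touched, and step 2 waits until prod - head
 * fits within queue->mask before a slot is reused.
 */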
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
386 void *entry, size_t entry_size)
387 {
388 unsigned int prod;
389 unsigned int head;
390 unsigned int tail;
391 unsigned long flags;
392
393 /* Do not preempt submission flow. */
394 local_irq_save(flags);
395
396 /* 1. Allocate some space in the queue */
397 prod = atomic_inc_return(&queue->prod) - 1;
398 head = atomic_read(&queue->head);
399
400 /* 2. Wait for space availability. */
401 if ((prod - head) > queue->mask) {
402 if (readx_poll_timeout(atomic_read, &queue->head,
403 head, (prod - head) < queue->mask,
404 0, RISCV_IOMMU_QUEUE_TIMEOUT))
405 goto err_busy;
406 } else if ((prod - head) == queue->mask) {
407 const unsigned int last = Q_ITEM(queue, head);
408
409 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
410 !(head & ~queue->mask) && head != last,
411 0, RISCV_IOMMU_QUEUE_TIMEOUT))
412 goto err_busy;
413 atomic_add((head - last) & queue->mask, &queue->head);
414 }
415
416 /* 3. Store entry in the ring buffer */
417 memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
418
419 /* 4. Wait for all previous entries to be ready */
420 if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
421 0, RISCV_IOMMU_QUEUE_TIMEOUT))
422 goto err_busy;
423
424 /*
425 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
426 * completed and visible before signaling the tail doorbell to fetch
427 * the next command. 'fence ow, ow'
428 */
429 dma_wmb();
430 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
431
432 /*
433 * 6. Make sure the doorbell write to the device has finished before updating
434 * the shadow tail index in normal memory. 'fence o, w'
435 */
436 mmiowb();
437 atomic_inc(&queue->tail);
438
439 /* 7. Complete submission and restore local interrupts */
440 local_irq_restore(flags);
441
442 return prod;
443
444 err_busy:
445 local_irq_restore(flags);
446 dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
447
448 return prod;
449 }
450
451 /*
452 * IOMMU Command queue chapter 3.1
453 */
454
455 /* Command queue interrupt handler thread function */
static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
457 {
458 const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
459 unsigned int ctrl;
460
461 /* Clear MF/CQ errors, complete error recovery to be implemented. */
462 ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
463 if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
464 RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
465 riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
466 dev_warn(queue->iommu->dev,
467 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
468 queue->qid,
469 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
470 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
471 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
472 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
473 }
474
475 /* Placeholder for command queue interrupt notifiers */
476
477 /* Clear command interrupt pending. */
478 riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
479
480 return IRQ_HANDLED;
481 }
482
483 /* Send command to the IOMMU command queue */
static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
485 struct riscv_iommu_command *cmd)
486 {
487 riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
488 }
489
490 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */
static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
492 unsigned int timeout_us)
493 {
494 struct riscv_iommu_command cmd;
495 unsigned int prod;
496
497 riscv_iommu_cmd_iofence(&cmd);
498 prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
499
500 if (!timeout_us)
501 return;
502
503 if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
504 dev_err_once(iommu->dev,
505 "Hardware error: command execution timeout\n");
506 }
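/*
 * Typical usage (see riscv_iommu_bond_unlink() and riscv_iommu_iodir_set_mode()
 * below): build the command with a riscv_iommu_cmd_* helper, post it with
 * riscv_iommu_cmd_send(), then wait for completion, e.g.:
 *   riscv_iommu_cmd_inval_vma(&cmd);
 *   riscv_iommu_cmd_send(iommu, &cmd);
 *   riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
 */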
507
508 /*
509 * IOMMU Fault/Event queue chapter 3.2
510 */
511
static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
513 struct riscv_iommu_fq_record *event)
514 {
515 unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
516 unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
517
518 /* Placeholder for future fault handling implementation, report only. */
519 if (err)
520 dev_warn_ratelimited(iommu->dev,
521 "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
522 err, devid, event->iotval, event->iotval2);
523 }
524
525 /* Fault queue interrupt handler thread function */
static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
527 {
528 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
529 struct riscv_iommu_device *iommu = queue->iommu;
530 struct riscv_iommu_fq_record *events;
531 unsigned int ctrl, idx;
532 int cnt, len;
533
534 events = (struct riscv_iommu_fq_record *)queue->base;
535
536 /* Clear fault interrupt pending and process all received fault events. */
537 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
538
539 do {
540 cnt = riscv_iommu_queue_consume(queue, &idx);
541 for (len = 0; len < cnt; idx++, len++)
542 riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
543 riscv_iommu_queue_release(queue, cnt);
544 } while (cnt > 0);
545
546 /* Clear MF/OF errors, complete error recovery to be implemented. */
547 ctrl = riscv_iommu_readl(iommu, queue->qcr);
548 if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
549 riscv_iommu_writel(iommu, queue->qcr, ctrl);
550 dev_warn(iommu->dev,
551 "Queue #%u error; memory fault:%d overflow:%d\n",
552 queue->qid,
553 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
554 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
555 }
556
557 return IRQ_HANDLED;
558 }
559
560 /* Lookup and initialize device context info structure. */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
562 unsigned int devid)
563 {
564 const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
565 unsigned int depth;
566 unsigned long ddt, old, new;
567 void *ptr;
568 u8 ddi_bits[3] = { 0 };
569 u64 *ddtp = NULL;
570
571 /* Make sure the mode is valid */
572 if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
573 iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
574 return NULL;
575
576 /*
577 * Device id partitioning for base format:
578 * DDI[0]: bits 0 - 6 (1st level) (7 bits)
579 * DDI[1]: bits 7 - 15 (2nd level) (9 bits)
580 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
581 *
582 * For extended format:
583 * DDI[0]: bits 0 - 5 (1st level) (6 bits)
584 * DDI[1]: bits 6 - 14 (2nd level) (9 bits)
585 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
586 */
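/*
 * Equivalently, for base format:
 *   DDI[0] = devid & 0x7f
 *   DDI[1] = (devid >> 7) & 0x1ff
 *   DDI[2] = (devid >> 16) & 0xff
 * and for extended format:
 *   DDI[0] = devid & 0x3f
 *   DDI[1] = (devid >> 6) & 0x1ff
 *   DDI[2] = (devid >> 15) & 0x1ff
 */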
587 if (base_format) {
588 ddi_bits[0] = 7;
589 ddi_bits[1] = 7 + 9;
590 ddi_bits[2] = 7 + 9 + 8;
591 } else {
592 ddi_bits[0] = 6;
593 ddi_bits[1] = 6 + 9;
594 ddi_bits[2] = 6 + 9 + 9;
595 }
596
597 /* Make sure device id is within range */
598 depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
599 if (devid >= (1 << ddi_bits[depth]))
600 return NULL;
601
602 /* Get to the level of the non-leaf node that holds the device context */
603 for (ddtp = iommu->ddt_root; depth-- > 0;) {
604 const int split = ddi_bits[depth];
605 /*
606 * Each non-leaf node is 64bits wide and on each level
607 * nodes are indexed by DDI[depth].
608 */
609 ddtp += (devid >> split) & 0x1FF;
610
611 /*
612 * Check if this node has been populated and if not
613 * allocate a new level and populate it.
614 */
615 do {
616 ddt = READ_ONCE(*(unsigned long *)ddtp);
617 if (ddt & RISCV_IOMMU_DDTE_V) {
618 ddtp = __va(ppn_to_phys(ddt));
619 break;
620 }
621
622 ptr = riscv_iommu_get_pages(iommu, SZ_4K);
623 if (!ptr)
624 return NULL;
625
626 new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
627 old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
628
629 if (old == ddt) {
630 ddtp = (u64 *)ptr;
631 break;
632 }
633
634 /* Race setting DDT detected, re-read and retry. */
635 riscv_iommu_free_pages(iommu, ptr);
636 } while (1);
637 }
638
639 /*
640 * Grab the node that matches DDI[depth], note that when using base
641 * format the device context is 4 * 64bits, and the extended format
642 * is 8 * 64bits, hence the (3 - base_format) below.
643 */
644 ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
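/*
 * In other words (in u64 units): base format adds (devid & 0x7f) * 4,
 * extended format adds (devid & 0x3f) * 8, since (64 << base_format) - 1
 * masks DDI[0] and 3 - base_format is log2 of the device context size.
 */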
645
646 return (struct riscv_iommu_dc *)ddtp;
647 }
648
649 /*
 * This is a best-effort IOMMU translation shutdown flow.
 * Disable the IOMMU without waiting for a hardware response.
652 */
void riscv_iommu_disable(struct riscv_iommu_device *iommu)
654 {
655 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
656 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
657 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
658 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
659 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
660 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
661 }
662
663 #define riscv_iommu_read_ddtp(iommu) ({ \
664 u64 ddtp; \
665 riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
666 !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
667 RISCV_IOMMU_DDTP_TIMEOUT); \
668 ddtp; })
669
static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
671 {
672 u64 ddtp;
673 unsigned int mode;
674
675 ddtp = riscv_iommu_read_ddtp(iommu);
676 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
677 return -EBUSY;
678
679 /*
680 * It is optional for the hardware to report a fixed address for device
681 * directory root page when DDT.MODE is OFF or BARE.
682 */
683 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
684 if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
685 mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
686 /* Use WARL to discover hardware fixed DDT PPN */
687 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
688 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
689 ddtp = riscv_iommu_read_ddtp(iommu);
690 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
691 return -EBUSY;
692
693 iommu->ddt_phys = ppn_to_phys(ddtp);
694 if (iommu->ddt_phys)
695 iommu->ddt_root = devm_ioremap(iommu->dev,
696 iommu->ddt_phys, PAGE_SIZE);
697 if (iommu->ddt_root)
698 memset(iommu->ddt_root, 0, PAGE_SIZE);
699 }
700
701 if (!iommu->ddt_root) {
702 iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K);
703 iommu->ddt_phys = __pa(iommu->ddt_root);
704 }
705
706 if (!iommu->ddt_root)
707 return -ENOMEM;
708
709 return 0;
710 }
711
712 /*
713 * Discover supported DDT modes starting from requested value,
714 * configure DDTP register with accepted mode and root DDT address.
715 * Accepted iommu->ddt_mode is updated on success.
716 */
static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
718 unsigned int ddtp_mode)
719 {
720 struct device *dev = iommu->dev;
721 u64 ddtp, rq_ddtp;
722 unsigned int mode, rq_mode = ddtp_mode;
723 struct riscv_iommu_command cmd;
724
725 ddtp = riscv_iommu_read_ddtp(iommu);
726 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
727 return -EBUSY;
728
729 /* Disallow state transition from xLVL to xLVL. */
730 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
731 if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
732 mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
733 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
734 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
735 return -EINVAL;
736
737 do {
738 rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
739 if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
740 rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
741
742 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
743 ddtp = riscv_iommu_read_ddtp(iommu);
744 if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
745 dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
746 rq_mode, ddtp);
747 return -EBUSY;
748 }
749
750 /* Verify IOMMU hardware accepts new DDTP config. */
751 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
752
753 if (rq_mode == mode)
754 break;
755
756 /* Hardware mandatory DDTP mode has not been accepted. */
757 if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
758 dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
759 ddtp, rq_ddtp);
760 return -EINVAL;
761 }
762
763 /*
764 * Mode field is WARL, an IOMMU may support a subset of
765 * directory table levels in which case if we tried to set
766 * an unsupported number of levels we'll readback either
767 * a valid xLVL or off/bare. If we got off/bare, try again
768 * with a smaller xLVL.
769 */
770 if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
771 rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
772 dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
773 rq_mode--;
774 continue;
775 }
776
777 /*
778 * We tried all supported modes and IOMMU hardware failed to
779 * accept new settings, something went very wrong since off/bare
780 * and at least one xLVL must be supported.
781 */
782 dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
783 mode, ddtp_mode);
784 return -EINVAL;
785 } while (1);
786
787 iommu->ddt_mode = mode;
788 if (mode != ddtp_mode)
789 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
790
791 /* Invalidate device context cache */
792 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
793 riscv_iommu_cmd_send(iommu, &cmd);
794
795 /* Invalidate address translation cache */
796 riscv_iommu_cmd_inval_vma(&cmd);
797 riscv_iommu_cmd_send(iommu, &cmd);
798
799 /* IOFENCE.C */
800 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
801
802 return 0;
803 }
804
805 /* This struct contains protection domain specific IOMMU driver data. */
806 struct riscv_iommu_domain {
807 struct iommu_domain domain;
808 struct list_head bonds;
809 spinlock_t lock; /* protect bonds list updates. */
810 int pscid;
811 bool amo_enabled;
812 int numa_node;
813 unsigned int pgd_mode;
814 unsigned long *pgd_root;
815 };
816
817 #define iommu_domain_to_riscv(iommu_domain) \
818 container_of(iommu_domain, struct riscv_iommu_domain, domain)
819
820 /* Private IOMMU data for managed devices, dev_iommu_priv_* */
821 struct riscv_iommu_info {
822 struct riscv_iommu_domain *domain;
823 };
824
825 /*
826 * Linkage between an iommu_domain and attached devices.
827 *
 * A protection domain requiring IOATC and DevATC translation cache invalidations
 * should be linked to attached devices using a riscv_iommu_bond structure.
830 * Devices should be linked to the domain before first use and unlinked after
831 * the translations from the referenced protection domain can no longer be used.
832 * Blocking and identity domains are not tracked here, as the IOMMU hardware
833 * does not cache negative and/or identity (BARE mode) translations, and DevATC
834 * is disabled for those protection domains.
835 *
836 * The device pointer and IOMMU data remain stable in the bond struct after
837 * _probe_device() where it's attached to the managed IOMMU, up to the
838 * completion of the _release_device() call. The release of the bond structure
839 * is synchronized with the device release.
840 */
841 struct riscv_iommu_bond {
842 struct list_head list;
843 struct rcu_head rcu;
844 struct device *dev;
845 };
846
static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
848 struct device *dev)
849 {
850 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
851 struct riscv_iommu_bond *bond;
852 struct list_head *bonds;
853
854 bond = kzalloc(sizeof(*bond), GFP_KERNEL);
855 if (!bond)
856 return -ENOMEM;
857 bond->dev = dev;
858
859 /*
860 * List of devices attached to the domain is arranged based on
861 * managed IOMMU device.
862 */
863
864 spin_lock(&domain->lock);
865 list_for_each(bonds, &domain->bonds)
866 if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
867 break;
868 list_add_rcu(&bond->list, bonds);
869 spin_unlock(&domain->lock);
870
871 /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
872 smp_mb();
873
874 return 0;
875 }
876
static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
878 struct device *dev)
879 {
880 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
881 struct riscv_iommu_bond *bond, *found = NULL;
882 struct riscv_iommu_command cmd;
883 int count = 0;
884
885 if (!domain)
886 return;
887
888 spin_lock(&domain->lock);
889 list_for_each_entry(bond, &domain->bonds, list) {
890 if (found && count)
891 break;
892 else if (bond->dev == dev)
893 found = bond;
894 else if (dev_to_iommu(bond->dev) == iommu)
895 count++;
896 }
897 if (found)
898 list_del_rcu(&found->list);
899 spin_unlock(&domain->lock);
900 kfree_rcu(found, rcu);
901
902 /*
903 * If this was the last bond between this domain and the IOMMU
904 * invalidate all cached entries for domain's PSCID.
905 */
906 if (!count) {
907 riscv_iommu_cmd_inval_vma(&cmd);
908 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
909 riscv_iommu_cmd_send(iommu, &cmd);
910
911 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
912 }
913 }
914
915 /*
 * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
920 */
921 #define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20)
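/*
 * 2 << 20 is 2 MiB: ranges below this limit are invalidated page by page
 * (roughly up to 512 IOTINVAL.VMA commands per IOMMU), anything larger falls
 * back to a single invalidate-all-by-PSCID command in riscv_iommu_iotlb_inval().
 */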
922
static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
924 unsigned long start, unsigned long end)
925 {
926 struct riscv_iommu_bond *bond;
927 struct riscv_iommu_device *iommu, *prev;
928 struct riscv_iommu_command cmd;
929 unsigned long len = end - start + 1;
930 unsigned long iova;
931
932 /*
933 * For each IOMMU linked with this protection domain (via bonds->dev),
 * an IOTLB invalidation command will be submitted and executed.
935 *
 * A possible race with the domain attach flow is handled by sequencing
937 * bond creation - riscv_iommu_bond_link(), and device directory
938 * update - riscv_iommu_iodir_update().
939 *
940 * PTE Update / IOTLB Inval Device attach & directory update
941 * -------------------------- --------------------------
942 * update page table entries add dev to the bond list
943 * FENCE RW,RW FENCE RW,RW
944 * For all IOMMUs: (can be empty) Update FSC/PSCID
945 * FENCE IOW,IOW FENCE IOW,IOW
946 * IOTLB.INVAL IODIR.INVAL
947 * IOFENCE.C
948 *
949 * If bond list is not updated with new device, directory context will
950 * be configured with already valid page table content. If an IOMMU is
951 * linked to the protection domain it will receive invalidation
952 * requests for updated page table entries.
953 */
954 smp_mb();
955
956 rcu_read_lock();
957
958 prev = NULL;
959 list_for_each_entry_rcu(bond, &domain->bonds, list) {
960 iommu = dev_to_iommu(bond->dev);
961
962 /*
963 * IOTLB invalidation request can be safely omitted if already sent
964 * to the IOMMU for the same PSCID, and with domain->bonds list
965 * arranged based on the device's IOMMU, it's sufficient to check
966 * last device the invalidation was sent to.
967 */
968 if (iommu == prev)
969 continue;
970
971 riscv_iommu_cmd_inval_vma(&cmd);
972 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
973 if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
974 for (iova = start; iova < end; iova += PAGE_SIZE) {
975 riscv_iommu_cmd_inval_set_addr(&cmd, iova);
976 riscv_iommu_cmd_send(iommu, &cmd);
977 }
978 } else {
979 riscv_iommu_cmd_send(iommu, &cmd);
980 }
981 prev = iommu;
982 }
983
984 prev = NULL;
985 list_for_each_entry_rcu(bond, &domain->bonds, list) {
986 iommu = dev_to_iommu(bond->dev);
987 if (iommu == prev)
988 continue;
989
990 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
991 prev = iommu;
992 }
993 rcu_read_unlock();
994 }
995
996 #define RISCV_IOMMU_FSC_BARE 0
997
998 /*
999 * Update IODIR for the device.
1000 *
1001 * During the execution of riscv_iommu_probe_device(), IODIR entries are
1002 * allocated for the device's identifiers. Device context invalidation
1003 * becomes necessary only if one of the updated entries was previously
1004 * marked as valid, given that invalid device context entries are not
1005 * cached by the IOMMU hardware.
1006 * In this implementation, updating a valid device context while the
1007 * device is not quiesced might be disruptive, potentially causing
1008 * interim translation faults.
1009 */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
1011 struct device *dev, u64 fsc, u64 ta)
1012 {
1013 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1014 struct riscv_iommu_dc *dc;
1015 struct riscv_iommu_command cmd;
1016 bool sync_required = false;
1017 u64 tc;
1018 int i;
1019
1020 for (i = 0; i < fwspec->num_ids; i++) {
1021 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1022 tc = READ_ONCE(dc->tc);
1023 if (!(tc & RISCV_IOMMU_DC_TC_V))
1024 continue;
1025
1026 WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
1027
1028 /* Invalidate device context cached values */
1029 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1030 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1031 riscv_iommu_cmd_send(iommu, &cmd);
1032 sync_required = true;
1033 }
1034
1035 if (sync_required)
1036 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1037
1038 /*
1039 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
1040 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
1041 */
1042 for (i = 0; i < fwspec->num_ids; i++) {
1043 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1044 tc = READ_ONCE(dc->tc);
1045 tc |= ta & RISCV_IOMMU_DC_TC_V;
1046
1047 WRITE_ONCE(dc->fsc, fsc);
1048 WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
1049 /* Update device context, write TC.V as the last step. */
1050 dma_wmb();
1051 WRITE_ONCE(dc->tc, tc);
1052
1053 /* Invalidate device context after update */
1054 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1055 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1056 riscv_iommu_cmd_send(iommu, &cmd);
1057 }
1058
1059 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1060 }
1061
1062 /*
1063 * IOVA page translation tree management.
1064 */
1065
static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
1067 {
1068 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1069
1070 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1071 }
1072
static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
1074 struct iommu_iotlb_gather *gather)
1075 {
1076 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1077
1078 riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
1079 }
1080
1081 #define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
1082
1083 #define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
1084 #define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF)
1085 #define _io_pte_none(pte) ((pte) == 0)
1086 #define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
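/*
 * Worked example (for illustration): with 4K pages and 8-byte PTEs,
 * PT_SHIFT = 12 - 3 = 9, so every table level indexes PTRS_PER_PTE = 512
 * entries. The walks in riscv_iommu_pte_alloc()/_fetch() start at level 2
 * for Sv39, 3 for Sv48 and 4 for Sv57, giving per-level shifts of
 * 12 + 9 * level: 4K leaves at level 0, 2M at level 1, 1G at level 2,
 * 512G at level 3.
 */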
1087
static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
1089 unsigned long pte,
1090 struct iommu_pages_list *freelist)
1091 {
1092 unsigned long *ptr;
1093 int i;
1094
1095 if (!_io_pte_present(pte) || _io_pte_leaf(pte))
1096 return;
1097
1098 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1099
1100 /* Recursively free all sub page table pages */
1101 for (i = 0; i < PTRS_PER_PTE; i++) {
1102 pte = READ_ONCE(ptr[i]);
1103 if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
1104 riscv_iommu_pte_free(domain, pte, freelist);
1105 }
1106
1107 if (freelist)
1108 iommu_pages_list_add(freelist, ptr);
1109 else
1110 iommu_free_pages(ptr);
1111 }
1112
static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
1114 unsigned long iova, size_t pgsize,
1115 gfp_t gfp)
1116 {
1117 unsigned long *ptr = domain->pgd_root;
1118 unsigned long pte, old;
1119 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1120 void *addr;
1121
1122 do {
1123 const int shift = PAGE_SHIFT + PT_SHIFT * level;
1124
1125 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1126 /*
1127 * Note: returned entry might be a non-leaf if there was
1128 * existing mapping with smaller granularity. Up to the caller
1129 * to replace and invalidate.
1130 */
1131 if (((size_t)1 << shift) == pgsize)
1132 return ptr;
1133 pte_retry:
1134 pte = READ_ONCE(*ptr);
1135 /*
1136 * This is very likely incorrect as we should not be adding
1137 * new mapping with smaller granularity on top
1138 * of existing 2M/1G mapping. Fail.
1139 */
1140 if (_io_pte_present(pte) && _io_pte_leaf(pte))
1141 return NULL;
1142 /*
1143 * Non-leaf entry is missing, allocate and try to add to the
1144 * page table. This might race with other mappings, retry.
1145 */
1146 if (_io_pte_none(pte)) {
1147 addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp,
1148 SZ_4K);
1149 if (!addr)
1150 return NULL;
1151 old = pte;
1152 pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
1153 if (cmpxchg_relaxed(ptr, old, pte) != old) {
1154 iommu_free_pages(addr);
1155 goto pte_retry;
1156 }
1157 }
1158 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1159 } while (level-- > 0);
1160
1161 return NULL;
1162 }
1163
static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
1165 unsigned long iova, size_t *pte_pgsize)
1166 {
1167 unsigned long *ptr = domain->pgd_root;
1168 unsigned long pte;
1169 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1170
1171 do {
1172 const int shift = PAGE_SHIFT + PT_SHIFT * level;
1173
1174 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1175 pte = READ_ONCE(*ptr);
1176 if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
1177 *pte_pgsize = (size_t)1 << shift;
1178 return ptr;
1179 }
1180 if (_io_pte_none(pte))
1181 return NULL;
1182 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1183 } while (level-- > 0);
1184
1185 return NULL;
1186 }
1187
static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
1189 unsigned long iova, phys_addr_t phys,
1190 size_t pgsize, size_t pgcount, int prot,
1191 gfp_t gfp, size_t *mapped)
1192 {
1193 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1194 size_t size = 0;
1195 unsigned long *ptr;
1196 unsigned long pte, old, pte_prot;
1197 int rc = 0;
1198 struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
1199
1200 if (!(prot & IOMMU_WRITE))
1201 pte_prot = _PAGE_BASE | _PAGE_READ;
1202 else if (domain->amo_enabled)
1203 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
1204 else
1205 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
1206
1207 while (pgcount) {
1208 ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
1209 if (!ptr) {
1210 rc = -ENOMEM;
1211 break;
1212 }
1213
1214 old = READ_ONCE(*ptr);
1215 pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
1216 if (cmpxchg_relaxed(ptr, old, pte) != old)
1217 continue;
1218
1219 riscv_iommu_pte_free(domain, old, &freelist);
1220
1221 size += pgsize;
1222 iova += pgsize;
1223 phys += pgsize;
1224 --pgcount;
1225 }
1226
1227 *mapped = size;
1228
1229 if (!iommu_pages_list_empty(&freelist)) {
1230 /*
1231 * In 1.0 spec version, the smallest scope we can use to
1232 * invalidate all levels of page table (i.e. leaf and non-leaf)
1233 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
1234 * This will be updated with hardware support for
1235 * capability.NL (non-leaf) IOTINVAL command.
1236 */
1237 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1238 iommu_put_pages_list(&freelist);
1239 }
1240
1241 return rc;
1242 }
1243
static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
1245 unsigned long iova, size_t pgsize,
1246 size_t pgcount,
1247 struct iommu_iotlb_gather *gather)
1248 {
1249 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1250 size_t size = pgcount << __ffs(pgsize);
1251 unsigned long *ptr, old;
1252 size_t unmapped = 0;
1253 size_t pte_size;
1254
1255 while (unmapped < size) {
1256 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1257 if (!ptr)
1258 return unmapped;
1259
1260 /* partial unmap is not allowed, fail. */
1261 if (iova & (pte_size - 1))
1262 return unmapped;
1263
1264 old = READ_ONCE(*ptr);
1265 if (cmpxchg_relaxed(ptr, old, 0) != old)
1266 continue;
1267
1268 iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
1269 pte_size);
1270
1271 iova += pte_size;
1272 unmapped += pte_size;
1273 }
1274
1275 return unmapped;
1276 }
1277
static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
1279 dma_addr_t iova)
1280 {
1281 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1282 size_t pte_size;
1283 unsigned long *ptr;
1284
1285 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
/* riscv_iommu_pte_fetch() returns NULL if there is no valid leaf mapping. */
if (!ptr)
return 0;
1288
1289 return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
1290 }
1291
static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
1293 {
1294 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1295 const unsigned long pfn = virt_to_pfn(domain->pgd_root);
1296
1297 WARN_ON(!list_empty(&domain->bonds));
1298
1299 if ((int)domain->pscid > 0)
1300 ida_free(&riscv_iommu_pscids, domain->pscid);
1301
1302 riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
1303 kfree(domain);
1304 }
1305
static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
1307 {
1308 switch (pgd_mode) {
1309 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
1310 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
1311
1312 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
1313 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
1314
1315 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
1316 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
1317 }
1318 return false;
1319 }
1320
static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
1322 struct device *dev)
1323 {
1324 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1325 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1326 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1327 u64 fsc, ta;
1328
1329 if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
1330 return -ENODEV;
1331
1332 fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
1333 FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
1334 ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
1335 RISCV_IOMMU_PC_TA_V;
1336
1337 if (riscv_iommu_bond_link(domain, dev))
1338 return -ENOMEM;
1339
1340 riscv_iommu_iodir_update(iommu, dev, fsc, ta);
1341 riscv_iommu_bond_unlink(info->domain, dev);
1342 info->domain = domain;
1343
1344 return 0;
1345 }
1346
1347 static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
1348 .attach_dev = riscv_iommu_attach_paging_domain,
1349 .free = riscv_iommu_free_paging_domain,
1350 .map_pages = riscv_iommu_map_pages,
1351 .unmap_pages = riscv_iommu_unmap_pages,
1352 .iova_to_phys = riscv_iommu_iova_to_phys,
1353 .iotlb_sync = riscv_iommu_iotlb_sync,
1354 .flush_iotlb_all = riscv_iommu_iotlb_flush_all,
1355 };
1356
static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
1358 {
1359 struct riscv_iommu_domain *domain;
1360 struct riscv_iommu_device *iommu;
1361 unsigned int pgd_mode;
1362 dma_addr_t va_mask;
1363 int va_bits;
1364
1365 iommu = dev_to_iommu(dev);
1366 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
1367 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
1368 va_bits = 57;
1369 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
1370 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
1371 va_bits = 48;
1372 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
1373 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
1374 va_bits = 39;
1375 } else {
1376 dev_err(dev, "cannot find supported page table mode\n");
1377 return ERR_PTR(-ENODEV);
1378 }
1379
1380 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1381 if (!domain)
1382 return ERR_PTR(-ENOMEM);
1383
1384 INIT_LIST_HEAD_RCU(&domain->bonds);
1385 spin_lock_init(&domain->lock);
1386 domain->numa_node = dev_to_node(iommu->dev);
1387 domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
1388 domain->pgd_mode = pgd_mode;
1389 domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node,
1390 GFP_KERNEL_ACCOUNT, SZ_4K);
1391 if (!domain->pgd_root) {
1392 kfree(domain);
1393 return ERR_PTR(-ENOMEM);
1394 }
1395
1396 domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
1397 RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
1398 if (domain->pscid < 0) {
1399 iommu_free_pages(domain->pgd_root);
1400 kfree(domain);
1401 return ERR_PTR(-ENOMEM);
1402 }
1403
1404 /*
 * Note: the RISC-V Privileged spec mandates that virtual addresses
1406 * need to be sign-extended, so if (VA_BITS - 1) is set, all
1407 * bits >= VA_BITS need to also be set or else we'll get a
1408 * page fault. However the code that creates the mappings
1409 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
1410 * for now, so we'll end up with invalid virtual addresses
1411 * to map. As a workaround until we get this sorted out
1412 * limit the available virtual addresses to VA_BITS - 1.
1413 */
1414 va_mask = DMA_BIT_MASK(va_bits - 1);
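/*
 * E.g. for Sv39: va_bits = 39, va_mask = DMA_BIT_MASK(38), so the aperture
 * spans IOVAs 0 .. 2^38 - 1 and the pgsize_bitmap assignment below keeps
 * 4K/2M/1G while 512G pages are masked off.
 */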
1415
1416 domain->domain.geometry.aperture_start = 0;
1417 domain->domain.geometry.aperture_end = va_mask;
1418 domain->domain.geometry.force_aperture = true;
1419 domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
1420
1421 domain->domain.ops = &riscv_iommu_paging_domain_ops;
1422
1423 return &domain->domain;
1424 }
1425
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
1427 struct device *dev)
1428 {
1429 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1430 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1431
1432 /* Make device context invalid, translation requests will fault w/ #258 */
1433 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
1434 riscv_iommu_bond_unlink(info->domain, dev);
1435 info->domain = NULL;
1436
1437 return 0;
1438 }
1439
1440 static struct iommu_domain riscv_iommu_blocking_domain = {
1441 .type = IOMMU_DOMAIN_BLOCKED,
1442 .ops = &(const struct iommu_domain_ops) {
1443 .attach_dev = riscv_iommu_attach_blocking_domain,
1444 }
1445 };
1446
static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
1448 struct device *dev)
1449 {
1450 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1451 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1452
1453 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
1454 riscv_iommu_bond_unlink(info->domain, dev);
1455 info->domain = NULL;
1456
1457 return 0;
1458 }
1459
1460 static struct iommu_domain riscv_iommu_identity_domain = {
1461 .type = IOMMU_DOMAIN_IDENTITY,
1462 .ops = &(const struct iommu_domain_ops) {
1463 .attach_dev = riscv_iommu_attach_identity_domain,
1464 }
1465 };
1466
static struct iommu_group *riscv_iommu_device_group(struct device *dev)
1468 {
1469 if (dev_is_pci(dev))
1470 return pci_device_group(dev);
1471 return generic_device_group(dev);
1472 }
1473
static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
1475 {
1476 return iommu_fwspec_add_ids(dev, args->args, 1);
1477 }
1478
static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
1480 {
1481 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1482 struct riscv_iommu_device *iommu;
1483 struct riscv_iommu_info *info;
1484 struct riscv_iommu_dc *dc;
1485 u64 tc;
1486 int i;
1487
1488 if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
1489 return ERR_PTR(-ENODEV);
1490
1491 iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
1492 if (!iommu)
1493 return ERR_PTR(-ENODEV);
1494
1495 /*
1496 * IOMMU hardware operating in fail-over BARE mode will provide
1497 * identity translation for all connected devices anyway...
1498 */
1499 if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
1500 return ERR_PTR(-ENODEV);
1501
1502 info = kzalloc(sizeof(*info), GFP_KERNEL);
1503 if (!info)
1504 return ERR_PTR(-ENOMEM);
1505 /*
1506 * Allocate and pre-configure device context entries in
1507 * the device directory. Do not mark the context valid yet.
1508 */
1509 tc = 0;
1510 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
1511 tc |= RISCV_IOMMU_DC_TC_SADE;
1512 for (i = 0; i < fwspec->num_ids; i++) {
1513 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1514 if (!dc) {
1515 kfree(info);
1516 return ERR_PTR(-ENODEV);
1517 }
1518 if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
1519 dev_warn(dev, "already attached to IOMMU device directory\n");
1520 WRITE_ONCE(dc->tc, tc);
1521 }
1522
1523 dev_iommu_priv_set(dev, info);
1524
1525 return &iommu->iommu;
1526 }
1527
static void riscv_iommu_release_device(struct device *dev)
1529 {
1530 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1531
1532 kfree_rcu_mightsleep(info);
1533 }
1534
1535 static const struct iommu_ops riscv_iommu_ops = {
1536 .pgsize_bitmap = SZ_4K,
1537 .of_xlate = riscv_iommu_of_xlate,
1538 .identity_domain = &riscv_iommu_identity_domain,
1539 .blocked_domain = &riscv_iommu_blocking_domain,
1540 .release_domain = &riscv_iommu_blocking_domain,
1541 .domain_alloc_paging = riscv_iommu_alloc_paging_domain,
1542 .device_group = riscv_iommu_device_group,
1543 .probe_device = riscv_iommu_probe_device,
1544 .release_device = riscv_iommu_release_device,
1545 };
1546
static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
1548 {
1549 u64 ddtp;
1550
1551 /*
1552 * Make sure the IOMMU is switched off or in pass-through mode during
1553 * regular boot flow and disable translation when we boot into a kexec
 * kernel and the previous kernel left it enabled.
1555 */
1556 ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
1557 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
1558 return -EBUSY;
1559
1560 if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
1561 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
1562 if (!is_kdump_kernel())
1563 return -EBUSY;
1564 riscv_iommu_disable(iommu);
1565 }
1566
1567 /* Configure accesses to in-memory data structures for CPU-native byte order. */
1568 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1569 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
1570 if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
1571 return -EINVAL;
1572 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
1573 iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
1574 iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
1575 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1576 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
1577 return -EINVAL;
1578 }
1579
1580 /*
1581 * Distribute interrupt vectors, always use first vector for CIV.
1582 * At least one interrupt is required. Read back and verify.
1583 */
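/*
 * E.g. with two vectors available this assigns CIV = 0 (implicit),
 * FIV = 1 % 2 = 1, PIV = 2 % 2 = 0 and PMIV = 3 % 2 = 1, spreading the
 * four causes across the two IRQ lines.
 */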
1584 if (!iommu->irqs_count)
1585 return -EINVAL;
1586
1587 iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
1588 FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
1589 FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
1590 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
1591 iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
1592 if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
1593 FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
1594 max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
1595 FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
1596 return -EINVAL;
1597
1598 return 0;
1599 }
1600
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
1602 {
1603 iommu_device_unregister(&iommu->iommu);
1604 iommu_device_sysfs_remove(&iommu->iommu);
1605 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1606 riscv_iommu_queue_disable(&iommu->cmdq);
1607 riscv_iommu_queue_disable(&iommu->fltq);
1608 }
1609
int riscv_iommu_init(struct riscv_iommu_device *iommu)
1611 {
1612 int rc;
1613
1614 RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
1615 RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
1616
1617 rc = riscv_iommu_init_check(iommu);
1618 if (rc)
1619 return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
1620
1621 rc = riscv_iommu_iodir_alloc(iommu);
1622 if (rc)
1623 return rc;
1624
1625 rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
1626 sizeof(struct riscv_iommu_command));
1627 if (rc)
1628 return rc;
1629
1630 rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
1631 sizeof(struct riscv_iommu_fq_record));
1632 if (rc)
1633 return rc;
1634
1635 rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
1636 if (rc)
1637 return rc;
1638
1639 rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
1640 if (rc)
1641 goto err_queue_disable;
1642
1643 rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
1644 if (rc)
1645 goto err_queue_disable;
1646
1647 rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
1648 dev_name(iommu->dev));
1649 if (rc) {
1650 dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
1651 goto err_iodir_off;
1652 }
1653
1654 rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
1655 if (rc) {
1656 dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
1657 goto err_remove_sysfs;
1658 }
1659
1660 return 0;
1661
1662 err_remove_sysfs:
1663 iommu_device_sysfs_remove(&iommu->iommu);
1664 err_iodir_off:
1665 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1666 err_queue_disable:
1667 riscv_iommu_queue_disable(&iommu->fltq);
1668 riscv_iommu_queue_disable(&iommu->cmdq);
1669 return rc;
1670 }
1671