1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * IOMMU API for RISC-V IOMMU implementations.
4 *
5 * Copyright © 2022-2024 Rivos Inc.
6 * Copyright © 2023 FORTH-ICS/CARV
7 *
8 * Authors
9 * Tomasz Jeznach <tjeznach@rivosinc.com>
10 * Nick Kossifidis <mick@ics.forth.gr>
11 */
12
13 #define pr_fmt(fmt) "riscv-iommu: " fmt
14
15 #include <linux/acpi.h>
16 #include <linux/acpi_rimt.h>
17 #include <linux/compiler.h>
18 #include <linux/crash_dump.h>
19 #include <linux/init.h>
20 #include <linux/iommu.h>
21 #include <linux/iopoll.h>
22 #include <linux/kernel.h>
23 #include <linux/pci.h>
24
25 #include "../iommu-pages.h"
26 #include "iommu-bits.h"
27 #include "iommu.h"
28
29 /* Timeouts in [us] */
30 #define RISCV_IOMMU_QCSR_TIMEOUT 150000
31 #define RISCV_IOMMU_QUEUE_TIMEOUT 150000
32 #define RISCV_IOMMU_DDTP_TIMEOUT 10000000
33 #define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
34
35 /* Number of entries per CMD/FLT queue, should be <= INT_MAX */
36 #define RISCV_IOMMU_DEF_CQ_COUNT 8192
37 #define RISCV_IOMMU_DEF_FQ_COUNT 4096
38
39 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
40 #define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
41 #define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12))
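/* Example: ppn_to_phys(phys_to_ppn(pa)) == pa & GENMASK_ULL(55, 12), i.e. the 4K-aligned PA */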
42
43 #define dev_to_iommu(dev) \
44 iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
45
46 /* IOMMU PSCID allocation namespace. */
47 static DEFINE_IDA(riscv_iommu_pscids);
48 #define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1)
49
50 /* Device resource-managed allocations */
51 struct riscv_iommu_devres {
52 void *addr;
53 };
54
static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
56 {
57 struct riscv_iommu_devres *devres = res;
58
59 iommu_free_pages(devres->addr);
60 }
61
static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
63 {
64 struct riscv_iommu_devres *devres = res;
65 struct riscv_iommu_devres *target = p;
66
67 return devres->addr == target->addr;
68 }
69
static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu,
71 unsigned int size)
72 {
73 struct riscv_iommu_devres *devres;
74 void *addr;
75
76 addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev),
77 GFP_KERNEL_ACCOUNT, size);
78 if (unlikely(!addr))
79 return NULL;
80
81 devres = devres_alloc(riscv_iommu_devres_pages_release,
82 sizeof(struct riscv_iommu_devres), GFP_KERNEL);
83
84 if (unlikely(!devres)) {
85 iommu_free_pages(addr);
86 return NULL;
87 }
88
89 devres->addr = addr;
90
91 devres_add(iommu->dev, devres);
92
93 return addr;
94 }
95
static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
97 {
98 struct riscv_iommu_devres devres = { .addr = addr };
99
100 devres_release(iommu->dev, riscv_iommu_devres_pages_release,
101 riscv_iommu_devres_pages_match, &devres);
102 }
103
104 /*
105 * Hardware queue allocation and management.
106 */
107
108 /* Setup queue base, control registers and default queue length */
109 #define RISCV_IOMMU_QUEUE_INIT(q, name) do { \
110 struct riscv_iommu_queue *_q = q; \
111 _q->qid = RISCV_IOMMU_INTR_ ## name; \
112 _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
113 _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
114 _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
115 } while (0)
116
117 /* Note: offsets are the same for all queues */
118 #define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
119 #define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
120 #define Q_ITEM(q, index) ((q)->mask & (index))
121 #define Q_IPSR(q) BIT((q)->qid)
122
123 /*
124 * Discover queue ring buffer hardware configuration, allocate in-memory
125 * ring buffer or use fixed I/O memory location, configure queue base register.
126 * Must be called before hardware queue is enabled.
127 *
128 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
129 * @entry_size - queue single element size in bytes.
130 */
static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
132 struct riscv_iommu_queue *queue,
133 size_t entry_size)
134 {
135 unsigned int logsz;
136 u64 qb, rb;
137
138 /*
139 * Use WARL base register property to discover maximum allowed
140 * number of entries and optional fixed IO address for queue location.
141 */
142 riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
143 qb = riscv_iommu_readq(iommu, queue->qbr);
144
145 /*
146 * Calculate and verify hardware supported queue length, as reported
147 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
148 * Update queue size based on hardware supported value.
149 */
150 logsz = ilog2(queue->mask);
151 if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
152 logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
153
154 /*
155 * Use WARL base register property to discover an optional fixed IO
156 * address for queue ring buffer location. Otherwise allocate contiguous
157 * system memory.
158 */
159 if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
160 const size_t queue_size = entry_size << (logsz + 1);
161
162 queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
163 queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
164 } else {
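		/* Shrink the requested queue size if a contiguous allocation of the preferred size fails. */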
165 do {
166 const size_t queue_size = entry_size << (logsz + 1);
167
168 queue->base = riscv_iommu_get_pages(
169 iommu, max(queue_size, SZ_4K));
170 queue->phys = __pa(queue->base);
171 } while (!queue->base && logsz-- > 0);
172 }
173
174 if (!queue->base)
175 return -ENOMEM;
176
177 qb = phys_to_ppn(queue->phys) |
178 FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
179
180 /* Update base register and read back to verify hw accepted our write */
181 riscv_iommu_writeq(iommu, queue->qbr, qb);
182 rb = riscv_iommu_readq(iommu, queue->qbr);
183 if (rb != qb) {
184 dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
185 return -ENODEV;
186 }
187
188 /* Update actual queue mask */
189 queue->mask = (2U << logsz) - 1;
190
191 dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
192 queue->qid, logsz + 1);
193
194 return 0;
195 }
196
197 /* Check interrupt queue status, IPSR */
static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
199 {
200 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
201
202 if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
203 return IRQ_WAKE_THREAD;
204
205 return IRQ_NONE;
206 }
207
static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
209 {
/* Reuse the ICVEC.CIV mask for all interrupt vector mappings. */
211 return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
212 }
213
214 /*
215 * Enable queue processing in the hardware, register interrupt handler.
216 *
217 * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
218 * @irq_handler - threaded interrupt handler.
219 */
static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
221 struct riscv_iommu_queue *queue,
222 irq_handler_t irq_handler)
223 {
224 const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
225 u32 csr;
226 int rc;
227
228 if (queue->iommu)
229 return -EBUSY;
230
231 /* Polling not implemented */
232 if (!irq)
233 return -ENODEV;
234
235 queue->iommu = iommu;
236 rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
237 IRQF_ONESHOT | IRQF_SHARED,
238 dev_name(iommu->dev), queue);
239 if (rc) {
240 queue->iommu = NULL;
241 return rc;
242 }
243
244 /* Empty queue before enabling it */
245 if (queue->qid == RISCV_IOMMU_INTR_CQ)
246 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
247 else
248 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);
249
/*
 * Enable the queue with interrupts and clear any pending memory fault.
 * Wait for the hardware to acknowledge the request and activate queue
 * processing.
 * Note: All CSR bitfields are at the same offsets for all queues.
 */
256 riscv_iommu_writel(iommu, queue->qcr,
257 RISCV_IOMMU_QUEUE_ENABLE |
258 RISCV_IOMMU_QUEUE_INTR_ENABLE |
259 RISCV_IOMMU_QUEUE_MEM_FAULT);
260
261 riscv_iommu_readl_timeout(iommu, queue->qcr,
262 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
263 10, RISCV_IOMMU_QCSR_TIMEOUT);
264
265 if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
266 RISCV_IOMMU_QUEUE_BUSY |
267 RISCV_IOMMU_QUEUE_MEM_FAULT))) {
268 /* Best effort to stop and disable failing hardware queue. */
269 riscv_iommu_writel(iommu, queue->qcr, 0);
270 free_irq(irq, queue);
271 queue->iommu = NULL;
272 dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
273 return -EBUSY;
274 }
275
276 /* Clear any pending interrupt flag. */
277 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
278
279 return 0;
280 }
281
282 /*
283 * Disable queue. Wait for the hardware to acknowledge request and
284 * stop processing enqueued requests. Report errors but continue.
285 */
static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
287 {
288 struct riscv_iommu_device *iommu = queue->iommu;
289 u32 csr;
290
291 if (!iommu)
292 return;
293
294 free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
295 riscv_iommu_writel(iommu, queue->qcr, 0);
296 riscv_iommu_readl_timeout(iommu, queue->qcr,
297 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
298 10, RISCV_IOMMU_QCSR_TIMEOUT);
299
300 if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
dev_err(iommu->dev, "failed to disable hardware queue #%u, csr 0x%x\n",
	queue->qid, csr);
303
304 queue->iommu = NULL;
305 }
306
307 /*
308 * Returns number of available valid queue entries and the first item index.
309 * Update shadow producer index if necessary.
310 */
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
312 unsigned int *index)
313 {
314 unsigned int head = atomic_read(&queue->head);
315 unsigned int tail = atomic_read(&queue->tail);
316 unsigned int last = Q_ITEM(queue, tail);
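	/* Head and tail are free-running counters; the signed difference handles wrap-around. */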
317 int available = (int)(tail - head);
318
319 *index = head;
320
321 if (available > 0)
322 return available;
323
324 /* read hardware producer index, check reserved register bits are not set. */
325 if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
326 tail, (tail & ~queue->mask) == 0,
327 0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
328 dev_err_once(queue->iommu->dev,
329 "Hardware error: queue access timeout\n");
330 return 0;
331 }
332
333 if (tail == last)
334 return 0;
335
336 /* update shadow producer index */
337 return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
338 }
339
340 /*
341 * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
342 */
static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
344 {
345 const unsigned int head = atomic_add_return(count, &queue->head);
346
347 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
348 }
349
350 /* Return actual consumer index based on hardware reported queue head index. */
static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
352 {
353 const unsigned int cons = atomic_read(&queue->head);
354 const unsigned int last = Q_ITEM(queue, cons);
355 unsigned int head;
356
357 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
358 !(head & ~queue->mask),
359 0, RISCV_IOMMU_QUEUE_TIMEOUT))
360 return cons;
361
362 return cons + ((head - last) & queue->mask);
363 }
364
365 /* Wait for submitted item to be processed. */
static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
367 unsigned int index,
368 unsigned int timeout_us)
369 {
370 unsigned int cons = atomic_read(&queue->head);
371
372 /* Already processed by the consumer */
373 if ((int)(cons - index) > 0)
374 return 0;
375
376 /* Monitor consumer index */
377 return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
378 (int)(cons - index) > 0, 0, timeout_us);
379 }
380
/*
 * Enqueue an entry and wait for it to be processed if timeout_us > 0.
 *
 * Error handling for IOMMU hardware not responding in a reasonable time
 * will be added as a separate patch series along with other RAS features.
 * For now, only report the hardware failure and continue.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
388 void *entry, size_t entry_size)
389 {
390 unsigned int prod;
391 unsigned int head;
392 unsigned int tail;
393 unsigned long flags;
394
395 /* Do not preempt submission flow. */
396 local_irq_save(flags);
397
398 /* 1. Allocate some space in the queue */
399 prod = atomic_inc_return(&queue->prod) - 1;
400 head = atomic_read(&queue->head);
401
402 /* 2. Wait for space availability. */
403 if ((prod - head) > queue->mask) {
404 if (readx_poll_timeout(atomic_read, &queue->head,
405 head, (prod - head) < queue->mask,
406 0, RISCV_IOMMU_QUEUE_TIMEOUT))
407 goto err_busy;
408 } else if ((prod - head) == queue->mask) {
409 const unsigned int last = Q_ITEM(queue, head);
410
411 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
412 !(head & ~queue->mask) && head != last,
413 0, RISCV_IOMMU_QUEUE_TIMEOUT))
414 goto err_busy;
415 atomic_add((head - last) & queue->mask, &queue->head);
416 }
417
418 /* 3. Store entry in the ring buffer */
419 memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
420
421 /* 4. Wait for all previous entries to be ready */
422 if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
423 0, RISCV_IOMMU_QUEUE_TIMEOUT))
424 goto err_busy;
425
426 /*
427 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
428 * completed and visible before signaling the tail doorbell to fetch
429 * the next command. 'fence ow, ow'
430 */
431 dma_wmb();
432 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
433
434 /*
435 * 6. Make sure the doorbell write to the device has finished before updating
436 * the shadow tail index in normal memory. 'fence o, w'
437 */
438 mmiowb();
439 atomic_inc(&queue->tail);
440
441 /* 7. Complete submission and restore local interrupts */
442 local_irq_restore(flags);
443
444 return prod;
445
446 err_busy:
447 local_irq_restore(flags);
448 dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
449
450 return prod;
451 }
452
453 /*
454 * IOMMU Command queue chapter 3.1
455 */
456
457 /* Command queue interrupt handler thread function */
static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
459 {
460 const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
461 unsigned int ctrl;
462
463 /* Clear MF/CQ errors, complete error recovery to be implemented. */
464 ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
465 if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
466 RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
467 riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
468 dev_warn(queue->iommu->dev,
469 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
470 queue->qid,
471 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
472 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
473 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
474 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
475 }
476
477 /* Placeholder for command queue interrupt notifiers */
478
479 /* Clear command interrupt pending. */
480 riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
481
482 return IRQ_HANDLED;
483 }
484
485 /* Send command to the IOMMU command queue */
static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
487 struct riscv_iommu_command *cmd)
488 {
489 riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
490 }
491
492 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */
static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
494 unsigned int timeout_us)
495 {
496 struct riscv_iommu_command cmd;
497 unsigned int prod;
498
499 riscv_iommu_cmd_iofence(&cmd);
500 prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
501
502 if (!timeout_us)
503 return;
504
505 if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
506 dev_err_once(iommu->dev,
507 "Hardware error: command execution timeout\n");
508 }
509
510 /*
511 * IOMMU Fault/Event queue chapter 3.2
512 */
513
static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
515 struct riscv_iommu_fq_record *event)
516 {
517 unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
518 unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
519
520 /* Placeholder for future fault handling implementation, report only. */
521 if (err)
522 dev_warn_ratelimited(iommu->dev,
523 "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
524 err, devid, event->iotval, event->iotval2);
525 }
526
527 /* Fault queue interrupt handler thread function */
static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
529 {
530 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
531 struct riscv_iommu_device *iommu = queue->iommu;
532 struct riscv_iommu_fq_record *events;
533 unsigned int ctrl, idx;
534 int cnt, len;
535
536 events = (struct riscv_iommu_fq_record *)queue->base;
537
538 /* Clear fault interrupt pending and process all received fault events. */
539 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
540
541 do {
542 cnt = riscv_iommu_queue_consume(queue, &idx);
543 for (len = 0; len < cnt; idx++, len++)
544 riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
545 riscv_iommu_queue_release(queue, cnt);
546 } while (cnt > 0);
547
548 /* Clear MF/OF errors, complete error recovery to be implemented. */
549 ctrl = riscv_iommu_readl(iommu, queue->qcr);
550 if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
551 riscv_iommu_writel(iommu, queue->qcr, ctrl);
552 dev_warn(iommu->dev,
553 "Queue #%u error; memory fault:%d overflow:%d\n",
554 queue->qid,
555 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
556 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
557 }
558
559 return IRQ_HANDLED;
560 }
561
562 /* Lookup and initialize device context info structure. */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
564 unsigned int devid)
565 {
566 const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
567 unsigned int depth;
568 unsigned long ddt, old, new;
569 void *ptr;
570 u8 ddi_bits[3] = { 0 };
571 u64 *ddtp = NULL;
572
573 /* Make sure the mode is valid */
574 if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
575 iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
576 return NULL;
577
578 /*
579 * Device id partitioning for base format:
580 * DDI[0]: bits 0 - 6 (1st level) (7 bits)
581 * DDI[1]: bits 7 - 15 (2nd level) (9 bits)
582 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
583 *
584 * For extended format:
585 * DDI[0]: bits 0 - 5 (1st level) (6 bits)
586 * DDI[1]: bits 6 - 14 (2nd level) (9 bits)
587 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
588 */
589 if (base_format) {
590 ddi_bits[0] = 7;
591 ddi_bits[1] = 7 + 9;
592 ddi_bits[2] = 7 + 9 + 8;
593 } else {
594 ddi_bits[0] = 6;
595 ddi_bits[1] = 6 + 9;
596 ddi_bits[2] = 6 + 9 + 9;
597 }
598
599 /* Make sure device id is within range */
600 depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
601 if (devid >= (1 << ddi_bits[depth]))
602 return NULL;
603
604 /* Get to the level of the non-leaf node that holds the device context */
605 for (ddtp = iommu->ddt_root; depth-- > 0;) {
606 const int split = ddi_bits[depth];
607 /*
608 * Each non-leaf node is 64bits wide and on each level
609 * nodes are indexed by DDI[depth].
610 */
611 ddtp += (devid >> split) & 0x1FF;
612
613 /*
614 * Check if this node has been populated and if not
615 * allocate a new level and populate it.
616 */
617 do {
618 ddt = READ_ONCE(*(unsigned long *)ddtp);
619 if (ddt & RISCV_IOMMU_DDTE_V) {
620 ddtp = __va(ppn_to_phys(ddt));
621 break;
622 }
623
624 ptr = riscv_iommu_get_pages(iommu, SZ_4K);
625 if (!ptr)
626 return NULL;
627
628 new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
629 old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
630
631 if (old == ddt) {
632 ddtp = (u64 *)ptr;
633 break;
634 }
635
636 /* Race setting DDT detected, re-read and retry. */
637 riscv_iommu_free_pages(iommu, ptr);
638 } while (1);
639 }
640
641 /*
642 * Grab the node that matches DDI[depth], note that when using base
643 * format the device context is 4 * 64bits, and the extended format
644 * is 8 * 64bits, hence the (3 - base_format) below.
645 */
646 ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
647
648 return (struct riscv_iommu_dc *)ddtp;
649 }
650
/*
 * This is a best-effort IOMMU translation shutdown flow.
 * Disable the IOMMU without waiting for a hardware response.
 */
void riscv_iommu_disable(struct riscv_iommu_device *iommu)
656 {
657 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
658 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
659 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
660 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
661 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
662 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
663 }
664
665 #define riscv_iommu_read_ddtp(iommu) ({ \
666 u64 ddtp; \
667 riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
668 !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
669 RISCV_IOMMU_DDTP_TIMEOUT); \
670 ddtp; })
671
static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
673 {
674 u64 ddtp;
675 unsigned int mode;
676
677 ddtp = riscv_iommu_read_ddtp(iommu);
678 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
679 return -EBUSY;
680
681 /*
682 * It is optional for the hardware to report a fixed address for device
683 * directory root page when DDT.MODE is OFF or BARE.
684 */
685 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
686 if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
687 mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
688 /* Use WARL to discover hardware fixed DDT PPN */
689 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
690 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
691 ddtp = riscv_iommu_read_ddtp(iommu);
692 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
693 return -EBUSY;
694
695 iommu->ddt_phys = ppn_to_phys(ddtp);
696 if (iommu->ddt_phys)
697 iommu->ddt_root = devm_ioremap(iommu->dev,
698 iommu->ddt_phys, PAGE_SIZE);
699 if (iommu->ddt_root)
700 memset(iommu->ddt_root, 0, PAGE_SIZE);
701 }
702
703 if (!iommu->ddt_root) {
704 iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K);
705 iommu->ddt_phys = __pa(iommu->ddt_root);
706 }
707
708 if (!iommu->ddt_root)
709 return -ENOMEM;
710
711 return 0;
712 }
713
714 /*
715 * Discover supported DDT modes starting from requested value,
716 * configure DDTP register with accepted mode and root DDT address.
717 * Accepted iommu->ddt_mode is updated on success.
718 */
static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
720 unsigned int ddtp_mode)
721 {
722 struct device *dev = iommu->dev;
723 u64 ddtp, rq_ddtp;
724 unsigned int mode, rq_mode = ddtp_mode;
725 struct riscv_iommu_command cmd;
726
727 ddtp = riscv_iommu_read_ddtp(iommu);
728 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
729 return -EBUSY;
730
731 /* Disallow state transition from xLVL to xLVL. */
732 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
733 if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
734 mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
735 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
736 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
737 return -EINVAL;
738
739 do {
740 rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
741 if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
742 rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
743
744 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
745 ddtp = riscv_iommu_read_ddtp(iommu);
746 if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
747 dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
748 rq_mode, ddtp);
749 return -EBUSY;
750 }
751
752 /* Verify IOMMU hardware accepts new DDTP config. */
753 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
754
755 if (rq_mode == mode)
756 break;
757
758 /* Hardware mandatory DDTP mode has not been accepted. */
759 if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
760 dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
761 ddtp, rq_ddtp);
762 return -EINVAL;
763 }
764
765 /*
766 * Mode field is WARL, an IOMMU may support a subset of
767 * directory table levels in which case if we tried to set
768 * an unsupported number of levels we'll readback either
769 * a valid xLVL or off/bare. If we got off/bare, try again
770 * with a smaller xLVL.
771 */
772 if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
773 rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
774 dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
775 rq_mode--;
776 continue;
777 }
778
779 /*
780 * We tried all supported modes and IOMMU hardware failed to
781 * accept new settings, something went very wrong since off/bare
782 * and at least one xLVL must be supported.
783 */
784 dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
785 mode, ddtp_mode);
786 return -EINVAL;
787 } while (1);
788
789 iommu->ddt_mode = mode;
790 if (mode != ddtp_mode)
791 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
792
793 /* Invalidate device context cache */
794 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
795 riscv_iommu_cmd_send(iommu, &cmd);
796
797 /* Invalidate address translation cache */
798 riscv_iommu_cmd_inval_vma(&cmd);
799 riscv_iommu_cmd_send(iommu, &cmd);
800
801 /* IOFENCE.C */
802 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
803
804 return 0;
805 }
806
807 /* This struct contains protection domain specific IOMMU driver data. */
808 struct riscv_iommu_domain {
809 struct iommu_domain domain;
810 struct list_head bonds;
811 spinlock_t lock; /* protect bonds list updates. */
812 int pscid;
813 bool amo_enabled;
814 int numa_node;
815 unsigned int pgd_mode;
816 unsigned long *pgd_root;
817 };
818
819 #define iommu_domain_to_riscv(iommu_domain) \
820 container_of(iommu_domain, struct riscv_iommu_domain, domain)
821
822 /* Private IOMMU data for managed devices, dev_iommu_priv_* */
823 struct riscv_iommu_info {
824 struct riscv_iommu_domain *domain;
825 };
826
827 /*
828 * Linkage between an iommu_domain and attached devices.
829 *
 * A protection domain requiring IOATC and DevATC translation cache invalidations
 * should be linked to attached devices using a riscv_iommu_bond structure.
832 * Devices should be linked to the domain before first use and unlinked after
833 * the translations from the referenced protection domain can no longer be used.
834 * Blocking and identity domains are not tracked here, as the IOMMU hardware
835 * does not cache negative and/or identity (BARE mode) translations, and DevATC
836 * is disabled for those protection domains.
837 *
838 * The device pointer and IOMMU data remain stable in the bond struct after
839 * _probe_device() where it's attached to the managed IOMMU, up to the
840 * completion of the _release_device() call. The release of the bond structure
841 * is synchronized with the device release.
842 */
843 struct riscv_iommu_bond {
844 struct list_head list;
845 struct rcu_head rcu;
846 struct device *dev;
847 };
848
static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
850 struct device *dev)
851 {
852 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
853 struct riscv_iommu_bond *bond;
854 struct list_head *bonds;
855
856 bond = kzalloc(sizeof(*bond), GFP_KERNEL);
857 if (!bond)
858 return -ENOMEM;
859 bond->dev = dev;
860
861 /*
862 * List of devices attached to the domain is arranged based on
863 * managed IOMMU device.
864 */
865
866 spin_lock(&domain->lock);
867 list_for_each(bonds, &domain->bonds)
868 if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
869 break;
870 list_add_rcu(&bond->list, bonds);
871 spin_unlock(&domain->lock);
872
873 /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
874 smp_mb();
875
876 return 0;
877 }
878
static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
880 struct device *dev)
881 {
882 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
883 struct riscv_iommu_bond *bond, *found = NULL;
884 struct riscv_iommu_command cmd;
885 int count = 0;
886
887 if (!domain)
888 return;
889
890 spin_lock(&domain->lock);
891 list_for_each_entry(bond, &domain->bonds, list) {
892 if (found && count)
893 break;
894 else if (bond->dev == dev)
895 found = bond;
896 else if (dev_to_iommu(bond->dev) == iommu)
897 count++;
898 }
899 if (found)
900 list_del_rcu(&found->list);
901 spin_unlock(&domain->lock);
902 kfree_rcu(found, rcu);
903
904 /*
905 * If this was the last bond between this domain and the IOMMU
906 * invalidate all cached entries for domain's PSCID.
907 */
908 if (!count) {
909 riscv_iommu_cmd_inval_vma(&cmd);
910 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
911 riscv_iommu_cmd_send(iommu, &cmd);
912
913 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
914 }
915 }
916
/*
 * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
 */
923 #define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20)
924
static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
926 unsigned long start, unsigned long end)
927 {
928 struct riscv_iommu_bond *bond;
929 struct riscv_iommu_device *iommu, *prev;
930 struct riscv_iommu_command cmd;
931 unsigned long len = end - start + 1;
932 unsigned long iova;
933
/*
 * For each IOMMU linked with this protection domain (via bonds->dev),
 * an IOTLB invalidation command will be submitted and executed.
 *
 * Possible race with the domain attach flow is handled by sequencing
 * bond creation - riscv_iommu_bond_link(), and device directory
 * update - riscv_iommu_iodir_update().
 *
 * PTE Update / IOTLB Inval           Device attach & directory update
 * --------------------------         ---------------------------------
 * update page table entries          add dev to the bond list
 * FENCE RW,RW                        FENCE RW,RW
 * For all IOMMUs: (can be empty)     Update FSC/PSCID
 *   FENCE IOW,IOW                      FENCE IOW,IOW
 *   IOTLB.INVAL                        IODIR.INVAL
 *   IOFENCE.C
 *
 * If the bond list is not updated with the new device, the directory
 * context will be configured with already valid page table content.
 * If an IOMMU is linked to the protection domain it will receive
 * invalidation requests for updated page table entries.
 */
956 smp_mb();
957
958 rcu_read_lock();
959
960 prev = NULL;
961 list_for_each_entry_rcu(bond, &domain->bonds, list) {
962 iommu = dev_to_iommu(bond->dev);
963
964 /*
965 * IOTLB invalidation request can be safely omitted if already sent
966 * to the IOMMU for the same PSCID, and with domain->bonds list
967 * arranged based on the device's IOMMU, it's sufficient to check
968 * last device the invalidation was sent to.
969 */
970 if (iommu == prev)
971 continue;
972
973 riscv_iommu_cmd_inval_vma(&cmd);
974 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
975 if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
976 for (iova = start; iova < end; iova += PAGE_SIZE) {
977 riscv_iommu_cmd_inval_set_addr(&cmd, iova);
978 riscv_iommu_cmd_send(iommu, &cmd);
979 }
980 } else {
981 riscv_iommu_cmd_send(iommu, &cmd);
982 }
983 prev = iommu;
984 }
985
986 prev = NULL;
987 list_for_each_entry_rcu(bond, &domain->bonds, list) {
988 iommu = dev_to_iommu(bond->dev);
989 if (iommu == prev)
990 continue;
991
992 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
993 prev = iommu;
994 }
995 rcu_read_unlock();
996 }
997
998 #define RISCV_IOMMU_FSC_BARE 0
999
1000 /*
1001 * Update IODIR for the device.
1002 *
1003 * During the execution of riscv_iommu_probe_device(), IODIR entries are
1004 * allocated for the device's identifiers. Device context invalidation
1005 * becomes necessary only if one of the updated entries was previously
1006 * marked as valid, given that invalid device context entries are not
1007 * cached by the IOMMU hardware.
1008 * In this implementation, updating a valid device context while the
1009 * device is not quiesced might be disruptive, potentially causing
1010 * interim translation faults.
1011 */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
1013 struct device *dev, u64 fsc, u64 ta)
1014 {
1015 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1016 struct riscv_iommu_dc *dc;
1017 struct riscv_iommu_command cmd;
1018 bool sync_required = false;
1019 u64 tc;
1020 int i;
1021
1022 for (i = 0; i < fwspec->num_ids; i++) {
1023 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1024 tc = READ_ONCE(dc->tc);
1025 if (!(tc & RISCV_IOMMU_DC_TC_V))
1026 continue;
1027
1028 WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
1029
1030 /* Invalidate device context cached values */
1031 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1032 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1033 riscv_iommu_cmd_send(iommu, &cmd);
1034 sync_required = true;
1035 }
1036
1037 if (sync_required)
1038 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1039
1040 /*
1041 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
1042 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
1043 */
1044 for (i = 0; i < fwspec->num_ids; i++) {
1045 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1046 tc = READ_ONCE(dc->tc);
1047 tc |= ta & RISCV_IOMMU_DC_TC_V;
1048
1049 WRITE_ONCE(dc->fsc, fsc);
1050 WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
1051 /* Update device context, write TC.V as the last step. */
1052 dma_wmb();
1053 WRITE_ONCE(dc->tc, tc);
1054
1055 /* Invalidate device context after update */
1056 riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1057 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1058 riscv_iommu_cmd_send(iommu, &cmd);
1059 }
1060
1061 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1062 }
1063
1064 /*
1065 * IOVA page translation tree management.
1066 */
1067
static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
1069 {
1070 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1071
1072 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1073 }
1074
static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
1076 struct iommu_iotlb_gather *gather)
1077 {
1078 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1079
1080 riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
1081 }
1082
1083 #define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
1084
1085 #define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
1086 #define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF)
1087 #define _io_pte_none(pte) ((pte) == 0)
1088 #define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
1089
static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
1091 unsigned long pte,
1092 struct iommu_pages_list *freelist)
1093 {
1094 unsigned long *ptr;
1095 int i;
1096
1097 if (!_io_pte_present(pte) || _io_pte_leaf(pte))
1098 return;
1099
1100 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1101
1102 /* Recursively free all sub page table pages */
1103 for (i = 0; i < PTRS_PER_PTE; i++) {
1104 pte = READ_ONCE(ptr[i]);
1105 if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
1106 riscv_iommu_pte_free(domain, pte, freelist);
1107 }
1108
1109 if (freelist)
1110 iommu_pages_list_add(freelist, ptr);
1111 else
1112 iommu_free_pages(ptr);
1113 }
1114
static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
1116 unsigned long iova, size_t pgsize,
1117 gfp_t gfp)
1118 {
1119 unsigned long *ptr = domain->pgd_root;
1120 unsigned long pte, old;
1121 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1122 void *addr;
1123
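	/* Walk from the top level: 2 for Sv39, 3 for Sv48, 4 for Sv57; level 0 maps 4K pages. */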
1124 do {
1125 const int shift = PAGE_SHIFT + PT_SHIFT * level;
1126
1127 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1128 /*
1129 * Note: returned entry might be a non-leaf if there was
1130 * existing mapping with smaller granularity. Up to the caller
1131 * to replace and invalidate.
1132 */
1133 if (((size_t)1 << shift) == pgsize)
1134 return ptr;
1135 pte_retry:
1136 pte = READ_ONCE(*ptr);
1137 /*
1138 * This is very likely incorrect as we should not be adding
1139 * new mapping with smaller granularity on top
1140 * of existing 2M/1G mapping. Fail.
1141 */
1142 if (_io_pte_present(pte) && _io_pte_leaf(pte))
1143 return NULL;
1144 /*
1145 * Non-leaf entry is missing, allocate and try to add to the
1146 * page table. This might race with other mappings, retry.
1147 */
1148 if (_io_pte_none(pte)) {
1149 addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp,
1150 SZ_4K);
1151 if (!addr)
1152 return NULL;
1153 old = pte;
1154 pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
1155 if (cmpxchg_relaxed(ptr, old, pte) != old) {
1156 iommu_free_pages(addr);
1157 goto pte_retry;
1158 }
1159 }
1160 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1161 } while (level-- > 0);
1162
1163 return NULL;
1164 }
1165
static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
1167 unsigned long iova, size_t *pte_pgsize)
1168 {
1169 unsigned long *ptr = domain->pgd_root;
1170 unsigned long pte;
1171 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1172
1173 do {
1174 const int shift = PAGE_SHIFT + PT_SHIFT * level;
1175
1176 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1177 pte = READ_ONCE(*ptr);
1178 if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
1179 *pte_pgsize = (size_t)1 << shift;
1180 return ptr;
1181 }
1182 if (_io_pte_none(pte))
1183 return NULL;
1184 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1185 } while (level-- > 0);
1186
1187 return NULL;
1188 }
1189
static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
1191 unsigned long iova, phys_addr_t phys,
1192 size_t pgsize, size_t pgcount, int prot,
1193 gfp_t gfp, size_t *mapped)
1194 {
1195 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1196 size_t size = 0;
1197 unsigned long *ptr;
1198 unsigned long pte, old, pte_prot;
1199 int rc = 0;
1200 struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
1201
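	/* Preset the DIRTY bit for writable mappings unless the IOMMU updates A/D bits in hardware (AMO). */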
1202 if (!(prot & IOMMU_WRITE))
1203 pte_prot = _PAGE_BASE | _PAGE_READ;
1204 else if (domain->amo_enabled)
1205 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
1206 else
1207 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
1208
1209 while (pgcount) {
1210 ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
1211 if (!ptr) {
1212 rc = -ENOMEM;
1213 break;
1214 }
1215
1216 old = READ_ONCE(*ptr);
1217 pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
1218 if (cmpxchg_relaxed(ptr, old, pte) != old)
1219 continue;
1220
1221 riscv_iommu_pte_free(domain, old, &freelist);
1222
1223 size += pgsize;
1224 iova += pgsize;
1225 phys += pgsize;
1226 --pgcount;
1227 }
1228
1229 *mapped = size;
1230
1231 if (!iommu_pages_list_empty(&freelist)) {
1232 /*
1233 * In 1.0 spec version, the smallest scope we can use to
1234 * invalidate all levels of page table (i.e. leaf and non-leaf)
1235 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
1236 * This will be updated with hardware support for
1237 * capability.NL (non-leaf) IOTINVAL command.
1238 */
1239 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1240 iommu_put_pages_list(&freelist);
1241 }
1242
1243 return rc;
1244 }
1245
static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
1247 unsigned long iova, size_t pgsize,
1248 size_t pgcount,
1249 struct iommu_iotlb_gather *gather)
1250 {
1251 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1252 size_t size = pgcount << __ffs(pgsize);
1253 unsigned long *ptr, old;
1254 size_t unmapped = 0;
1255 size_t pte_size;
1256
1257 while (unmapped < size) {
1258 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1259 if (!ptr)
1260 return unmapped;
1261
1262 /* partial unmap is not allowed, fail. */
1263 if (iova & (pte_size - 1))
1264 return unmapped;
1265
1266 old = READ_ONCE(*ptr);
1267 if (cmpxchg_relaxed(ptr, old, 0) != old)
1268 continue;
1269
1270 iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
1271 pte_size);
1272
1273 iova += pte_size;
1274 unmapped += pte_size;
1275 }
1276
1277 return unmapped;
1278 }
1279
static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
1281 dma_addr_t iova)
1282 {
1283 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1284 size_t pte_size;
1285 unsigned long *ptr;
1286
1287 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1288 if (!ptr)
1289 return 0;
1290
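	/* Combine the leaf frame address with the offset within the (possibly large) page. */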
1291 return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
1292 }
1293
static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
1295 {
1296 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1297 const unsigned long pfn = virt_to_pfn(domain->pgd_root);
1298
1299 WARN_ON(!list_empty(&domain->bonds));
1300
1301 if ((int)domain->pscid > 0)
1302 ida_free(&riscv_iommu_pscids, domain->pscid);
1303
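	/* Present the root as a non-leaf PTE so the recursive helper releases the whole table. */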
1304 riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
1305 kfree(domain);
1306 }
1307
static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
1309 {
1310 switch (pgd_mode) {
1311 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
1312 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
1313
1314 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
1315 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
1316
1317 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
1318 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
1319 }
1320 return false;
1321 }
1322
static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
1324 struct device *dev,
1325 struct iommu_domain *old)
1326 {
1327 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1328 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1329 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1330 u64 fsc, ta;
1331
1332 if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
1333 return -ENODEV;
1334
1335 fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
1336 FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
1337 ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
1338 RISCV_IOMMU_PC_TA_V;
1339
1340 if (riscv_iommu_bond_link(domain, dev))
1341 return -ENOMEM;
1342
1343 riscv_iommu_iodir_update(iommu, dev, fsc, ta);
1344 riscv_iommu_bond_unlink(info->domain, dev);
1345 info->domain = domain;
1346
1347 return 0;
1348 }
1349
1350 static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
1351 .attach_dev = riscv_iommu_attach_paging_domain,
1352 .free = riscv_iommu_free_paging_domain,
1353 .map_pages = riscv_iommu_map_pages,
1354 .unmap_pages = riscv_iommu_unmap_pages,
1355 .iova_to_phys = riscv_iommu_iova_to_phys,
1356 .iotlb_sync = riscv_iommu_iotlb_sync,
1357 .flush_iotlb_all = riscv_iommu_iotlb_flush_all,
1358 };
1359
static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
1361 {
1362 struct riscv_iommu_domain *domain;
1363 struct riscv_iommu_device *iommu;
1364 unsigned int pgd_mode;
1365 dma_addr_t va_mask;
1366 int va_bits;
1367
1368 iommu = dev_to_iommu(dev);
1369 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
1370 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
1371 va_bits = 57;
1372 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
1373 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
1374 va_bits = 48;
1375 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
1376 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
1377 va_bits = 39;
1378 } else {
1379 dev_err(dev, "cannot find supported page table mode\n");
1380 return ERR_PTR(-ENODEV);
1381 }
1382
1383 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1384 if (!domain)
1385 return ERR_PTR(-ENOMEM);
1386
1387 INIT_LIST_HEAD_RCU(&domain->bonds);
1388 spin_lock_init(&domain->lock);
1389 domain->numa_node = dev_to_node(iommu->dev);
1390 domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
1391 domain->pgd_mode = pgd_mode;
1392 domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node,
1393 GFP_KERNEL_ACCOUNT, SZ_4K);
1394 if (!domain->pgd_root) {
1395 kfree(domain);
1396 return ERR_PTR(-ENOMEM);
1397 }
1398
1399 domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
1400 RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
1401 if (domain->pscid < 0) {
1402 iommu_free_pages(domain->pgd_root);
1403 kfree(domain);
1404 return ERR_PTR(-ENOMEM);
1405 }
1406
1407 /*
 * Note: RISC-V Privileged spec mandates that virtual addresses
1409 * need to be sign-extended, so if (VA_BITS - 1) is set, all
1410 * bits >= VA_BITS need to also be set or else we'll get a
1411 * page fault. However the code that creates the mappings
1412 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
1413 * for now, so we'll end up with invalid virtual addresses
1414 * to map. As a workaround until we get this sorted out
1415 * limit the available virtual addresses to VA_BITS - 1.
1416 */
1417 va_mask = DMA_BIT_MASK(va_bits - 1);
1418
1419 domain->domain.geometry.aperture_start = 0;
1420 domain->domain.geometry.aperture_end = va_mask;
1421 domain->domain.geometry.force_aperture = true;
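	/* Advertise only the page sizes that fit within the restricted virtual address range. */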
1422 domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
1423
1424 domain->domain.ops = &riscv_iommu_paging_domain_ops;
1425
1426 return &domain->domain;
1427 }
1428
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
1430 struct device *dev,
1431 struct iommu_domain *old)
1432 {
1433 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1434 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1435
1436 /* Make device context invalid, translation requests will fault w/ #258 */
1437 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
1438 riscv_iommu_bond_unlink(info->domain, dev);
1439 info->domain = NULL;
1440
1441 return 0;
1442 }
1443
1444 static struct iommu_domain riscv_iommu_blocking_domain = {
1445 .type = IOMMU_DOMAIN_BLOCKED,
1446 .ops = &(const struct iommu_domain_ops) {
1447 .attach_dev = riscv_iommu_attach_blocking_domain,
1448 }
1449 };
1450
static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
1452 struct device *dev,
1453 struct iommu_domain *old)
1454 {
1455 struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1456 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1457
1458 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
1459 riscv_iommu_bond_unlink(info->domain, dev);
1460 info->domain = NULL;
1461
1462 return 0;
1463 }
1464
1465 static struct iommu_domain riscv_iommu_identity_domain = {
1466 .type = IOMMU_DOMAIN_IDENTITY,
1467 .ops = &(const struct iommu_domain_ops) {
1468 .attach_dev = riscv_iommu_attach_identity_domain,
1469 }
1470 };
1471
static struct iommu_group *riscv_iommu_device_group(struct device *dev)
1473 {
1474 if (dev_is_pci(dev))
1475 return pci_device_group(dev);
1476 return generic_device_group(dev);
1477 }
1478
static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
1480 {
1481 return iommu_fwspec_add_ids(dev, args->args, 1);
1482 }
1483
static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
1485 {
1486 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1487 struct riscv_iommu_device *iommu;
1488 struct riscv_iommu_info *info;
1489 struct riscv_iommu_dc *dc;
1490 u64 tc;
1491 int i;
1492
1493 if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
1494 return ERR_PTR(-ENODEV);
1495
1496 iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
1497 if (!iommu)
1498 return ERR_PTR(-ENODEV);
1499
1500 /*
1501 * IOMMU hardware operating in fail-over BARE mode will provide
1502 * identity translation for all connected devices anyway...
1503 */
1504 if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
1505 return ERR_PTR(-ENODEV);
1506
1507 info = kzalloc(sizeof(*info), GFP_KERNEL);
1508 if (!info)
1509 return ERR_PTR(-ENOMEM);
1510 /*
1511 * Allocate and pre-configure device context entries in
1512 * the device directory. Do not mark the context valid yet.
1513 */
1514 tc = 0;
1515 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
1516 tc |= RISCV_IOMMU_DC_TC_SADE;
1517 for (i = 0; i < fwspec->num_ids; i++) {
1518 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1519 if (!dc) {
1520 kfree(info);
1521 return ERR_PTR(-ENODEV);
1522 }
1523 if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
1524 dev_warn(dev, "already attached to IOMMU device directory\n");
1525 WRITE_ONCE(dc->tc, tc);
1526 }
1527
1528 dev_iommu_priv_set(dev, info);
1529
1530 return &iommu->iommu;
1531 }
1532
static void riscv_iommu_release_device(struct device *dev)
1534 {
1535 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1536
1537 kfree_rcu_mightsleep(info);
1538 }
1539
1540 static const struct iommu_ops riscv_iommu_ops = {
1541 .of_xlate = riscv_iommu_of_xlate,
1542 .identity_domain = &riscv_iommu_identity_domain,
1543 .blocked_domain = &riscv_iommu_blocking_domain,
1544 .release_domain = &riscv_iommu_blocking_domain,
1545 .domain_alloc_paging = riscv_iommu_alloc_paging_domain,
1546 .device_group = riscv_iommu_device_group,
1547 .probe_device = riscv_iommu_probe_device,
1548 .release_device = riscv_iommu_release_device,
1549 };
1550
static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
1552 {
1553 u64 ddtp;
1554
1555 /*
1556 * Make sure the IOMMU is switched off or in pass-through mode during
1557 * regular boot flow and disable translation when we boot into a kexec
 * kernel and the previous kernel left translation enabled.
1559 */
1560 ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
1561 if (ddtp & RISCV_IOMMU_DDTP_BUSY)
1562 return -EBUSY;
1563
1564 if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
1565 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
1566 if (!is_kdump_kernel())
1567 return -EBUSY;
1568 riscv_iommu_disable(iommu);
1569 }
1570
1571 /* Configure accesses to in-memory data structures for CPU-native byte order. */
1572 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1573 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
1574 if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
1575 return -EINVAL;
1576 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
1577 iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
1578 iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
1579 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1580 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
1581 return -EINVAL;
1582 }
1583
1584 /*
1585 * Distribute interrupt vectors, always use first vector for CIV.
1586 * At least one interrupt is required. Read back and verify.
1587 */
1588 if (!iommu->irqs_count)
1589 return -EINVAL;
1590
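	/* CIV stays 0; FIV/PIV/PMIV are spread across the remaining vectors, wrapping when fewer IRQs exist. */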
1591 iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
1592 FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
1593 FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
1594 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
1595 iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
1596 if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
1597 FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
1598 max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
1599 FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
1600 return -EINVAL;
1601
1602 return 0;
1603 }
1604
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
1606 {
1607 iommu_device_unregister(&iommu->iommu);
1608 iommu_device_sysfs_remove(&iommu->iommu);
1609 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1610 riscv_iommu_queue_disable(&iommu->cmdq);
1611 riscv_iommu_queue_disable(&iommu->fltq);
1612 }
1613
int riscv_iommu_init(struct riscv_iommu_device *iommu)
1615 {
1616 int rc;
1617
1618 RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
1619 RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
1620
1621 rc = riscv_iommu_init_check(iommu);
1622 if (rc)
1623 return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
1624
1625 rc = riscv_iommu_iodir_alloc(iommu);
1626 if (rc)
1627 return rc;
1628
1629 rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
1630 sizeof(struct riscv_iommu_command));
1631 if (rc)
1632 return rc;
1633
1634 rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
1635 sizeof(struct riscv_iommu_fq_record));
1636 if (rc)
1637 return rc;
1638
1639 rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
1640 if (rc)
1641 return rc;
1642
1643 rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
1644 if (rc)
1645 goto err_queue_disable;
1646
1647 rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
1648 if (rc)
1649 goto err_queue_disable;
1650
1651 rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
1652 dev_name(iommu->dev));
1653 if (rc) {
1654 dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
1655 goto err_iodir_off;
1656 }
1657
1658 if (!acpi_disabled) {
1659 rc = rimt_iommu_register(iommu->dev);
1660 if (rc) {
1661 dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n");
1662 goto err_remove_sysfs;
1663 }
1664 }
1665
1666 rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
1667 if (rc) {
1668 dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
1669 goto err_remove_sysfs;
1670 }
1671
1672 return 0;
1673
1674 err_remove_sysfs:
1675 iommu_device_sysfs_remove(&iommu->iommu);
1676 err_iodir_off:
1677 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1678 err_queue_disable:
1679 riscv_iommu_queue_disable(&iommu->fltq);
1680 riscv_iommu_queue_disable(&iommu->cmdq);
1681 return rc;
1682 }
1683