xref: /linux/drivers/iommu/riscv/iommu.c (revision 68a052239fc4b351e961f698b824f7654a346091)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * IOMMU API for RISC-V IOMMU implementations.
4  *
5  * Copyright © 2022-2024 Rivos Inc.
6  * Copyright © 2023 FORTH-ICS/CARV
7  *
8  * Authors
9  *	Tomasz Jeznach <tjeznach@rivosinc.com>
10  *	Nick Kossifidis <mick@ics.forth.gr>
11  */
12 
13 #define pr_fmt(fmt) "riscv-iommu: " fmt
14 
15 #include <linux/acpi.h>
16 #include <linux/acpi_rimt.h>
17 #include <linux/compiler.h>
18 #include <linux/crash_dump.h>
19 #include <linux/init.h>
20 #include <linux/iommu.h>
21 #include <linux/iopoll.h>
22 #include <linux/kernel.h>
23 #include <linux/pci.h>
24 
25 #include "../iommu-pages.h"
26 #include "iommu-bits.h"
27 #include "iommu.h"
28 
29 /* Timeouts in [us] */
30 #define RISCV_IOMMU_QCSR_TIMEOUT	150000
31 #define RISCV_IOMMU_QUEUE_TIMEOUT	150000
32 #define RISCV_IOMMU_DDTP_TIMEOUT	10000000
33 #define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000
34 
35 /* Number of entries per CMD/FLT queue; should be <= INT_MAX */
36 #define RISCV_IOMMU_DEF_CQ_COUNT	8192
37 #define RISCV_IOMMU_DEF_FQ_COUNT	4096
38 
39 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
40 #define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
41 #define ppn_to_phys(pn)	 (((pn) << 2) & (((1ULL << 44) - 1) << 12))
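/*
 * Worked example (illustrative address, added for clarity): for a 4 KiB
 * aligned physical address pa = 0x80200000, the page number pa >> 12 ==
 * 0x80200 lands in register bits [53:10], so
 *   phys_to_ppn(0x80200000) == 0x80200 << 10 == 0x20080000
 * and ppn_to_phys() reverses the conversion:
 *   ppn_to_phys(0x20080000) == 0x80200000
 */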
42 
43 #define dev_to_iommu(dev) \
44 	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
45 
46 /* IOMMU PSCID allocation namespace. */
47 static DEFINE_IDA(riscv_iommu_pscids);
48 #define RISCV_IOMMU_MAX_PSCID		(BIT(20) - 1)
49 
50 /* Device resource-managed allocations */
51 struct riscv_iommu_devres {
52 	void *addr;
53 };
54 
55 static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
56 {
57 	struct riscv_iommu_devres *devres = res;
58 
59 	iommu_free_pages(devres->addr);
60 }
61 
62 static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
63 {
64 	struct riscv_iommu_devres *devres = res;
65 	struct riscv_iommu_devres *target = p;
66 
67 	return devres->addr == target->addr;
68 }
69 
70 static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu,
71 				   unsigned int size)
72 {
73 	struct riscv_iommu_devres *devres;
74 	void *addr;
75 
76 	addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev),
77 					 GFP_KERNEL_ACCOUNT, size);
78 	if (unlikely(!addr))
79 		return NULL;
80 
81 	devres = devres_alloc(riscv_iommu_devres_pages_release,
82 			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);
83 
84 	if (unlikely(!devres)) {
85 		iommu_free_pages(addr);
86 		return NULL;
87 	}
88 
89 	devres->addr = addr;
90 
91 	devres_add(iommu->dev, devres);
92 
93 	return addr;
94 }
95 
96 static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
97 {
98 	struct riscv_iommu_devres devres = { .addr = addr };
99 
100 	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
101 		       riscv_iommu_devres_pages_match, &devres);
102 }
103 
104 /*
105  * Hardware queue allocation and management.
106  */
107 
108 /* Set up queue base, control registers and default queue length */
109 #define RISCV_IOMMU_QUEUE_INIT(q, name) do {				\
110 	struct riscv_iommu_queue *_q = q;				\
111 	_q->qid = RISCV_IOMMU_INTR_ ## name;				\
112 	_q->qbr = RISCV_IOMMU_REG_ ## name ## B;			\
113 	_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;			\
114 	_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
115 } while (0)
116 
117 /* Note: offsets are the same for all queues */
118 #define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
119 #define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
120 #define Q_ITEM(q, index) ((q)->mask & (index))
121 #define Q_IPSR(q) BIT((q)->qid)
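/*
 * Clarifying note (added): queue->head and queue->tail are free-running
 * shadow counters and only Q_ITEM() masks them down to a ring buffer slot,
 * e.g. with mask == 4095, Q_ITEM(q, 4097) == 1. This lets producer/consumer
 * distance be computed as a plain (tail - head) difference without explicit
 * wrap-around handling.
 */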
122 
123 /*
124  * Discover the queue ring buffer hardware configuration, allocate the in-memory
125  * ring buffer or use a fixed I/O memory location, and configure the queue base register.
126  * Must be called before the hardware queue is enabled.
127  *
128  * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
129  * @entry_size - size of a single queue element in bytes.
130  */
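/*
 * Sizing sketch (illustrative, assuming the hardware does not clamp LOG2SZ
 * further): requesting RISCV_IOMMU_DEF_CQ_COUNT (8192) entries gives
 * queue->mask == 8191, so logsz == ilog2(8191) == 12 is programmed into the
 * LOG2SZ field and the resulting queue length is 2^(12 + 1) == 8192 entries.
 */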
131 static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
132 				   struct riscv_iommu_queue *queue,
133 				   size_t entry_size)
134 {
135 	unsigned int logsz;
136 	u64 qb, rb;
137 
138 	/*
139 	 * Use WARL base register property to discover maximum allowed
140 	 * number of entries and optional fixed IO address for queue location.
141 	 */
142 	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
143 	qb = riscv_iommu_readq(iommu, queue->qbr);
144 
145 	/*
146 	 * Calculate and verify hardware supported queue length, as reported
147 	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
148 	 * Update queue size based on hardware supported value.
149 	 */
150 	logsz = ilog2(queue->mask);
151 	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
152 		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
153 
154 	/*
155 	 * Use WARL base register property to discover an optional fixed IO
156 	 * address for queue ring buffer location. Otherwise allocate contiguous
157 	 * system memory.
158 	 */
159 	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
160 		const size_t queue_size = entry_size << (logsz + 1);
161 
162 		queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
163 		queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
164 	} else {
165 		do {
166 			const size_t queue_size = entry_size << (logsz + 1);
167 
168 			queue->base = riscv_iommu_get_pages(
169 				iommu, max(queue_size, SZ_4K));
170 			queue->phys = __pa(queue->base);
171 		} while (!queue->base && logsz-- > 0);
172 	}
173 
174 	if (!queue->base)
175 		return -ENOMEM;
176 
177 	qb = phys_to_ppn(queue->phys) |
178 	     FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
179 
180 	/* Update base register and read back to verify hw accepted our write */
181 	riscv_iommu_writeq(iommu, queue->qbr, qb);
182 	rb = riscv_iommu_readq(iommu, queue->qbr);
183 	if (rb != qb) {
184 		dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
185 		return -ENODEV;
186 	}
187 
188 	/* Update actual queue mask */
189 	queue->mask = (2U << logsz) - 1;
190 
191 	dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
192 		queue->qid, logsz + 1);
193 
194 	return 0;
195 }
196 
197 /* Check interrupt queue status, IPSR */
198 static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
199 {
200 	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
201 
202 	if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
203 		return IRQ_WAKE_THREAD;
204 
205 	return IRQ_NONE;
206 }
207 
208 static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
209 {
210 	/* Reuse the ICVEC.CIV mask for all interrupt vector mappings. */
211 	return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
212 }
213 
214 /*
215  * Enable queue processing in the hardware and register the interrupt handler.
216  *
217  * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
218  * @irq_handler - threaded interrupt handler.
219  */
220 static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
221 				    struct riscv_iommu_queue *queue,
222 				    irq_handler_t irq_handler)
223 {
224 	const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
225 	u32 csr;
226 	int rc;
227 
228 	if (queue->iommu)
229 		return -EBUSY;
230 
231 	/* Polling not implemented */
232 	if (!irq)
233 		return -ENODEV;
234 
235 	queue->iommu = iommu;
236 	rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
237 				  IRQF_ONESHOT | IRQF_SHARED,
238 				  dev_name(iommu->dev), queue);
239 	if (rc) {
240 		queue->iommu = NULL;
241 		return rc;
242 	}
243 
244 	/* Empty queue before enabling it */
245 	if (queue->qid == RISCV_IOMMU_INTR_CQ)
246 		riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
247 	else
248 		riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);
249 
250 	/*
251 	 * Enable the queue with interrupts and clear the memory fault flag, if any.
252 	 * Wait for the hardware to acknowledge the request and activate queue
253 	 * processing.
254 	 * Note: all CSR bitfields are at the same offsets for all queues.
255 	 */
256 	riscv_iommu_writel(iommu, queue->qcr,
257 			   RISCV_IOMMU_QUEUE_ENABLE |
258 			   RISCV_IOMMU_QUEUE_INTR_ENABLE |
259 			   RISCV_IOMMU_QUEUE_MEM_FAULT);
260 
261 	riscv_iommu_readl_timeout(iommu, queue->qcr,
262 				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
263 				  10, RISCV_IOMMU_QCSR_TIMEOUT);
264 
265 	if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
266 						RISCV_IOMMU_QUEUE_BUSY |
267 						RISCV_IOMMU_QUEUE_MEM_FAULT))) {
268 		/* Best effort to stop and disable failing hardware queue. */
269 		riscv_iommu_writel(iommu, queue->qcr, 0);
270 		free_irq(irq, queue);
271 		queue->iommu = NULL;
272 		dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
273 		return -EBUSY;
274 	}
275 
276 	/* Clear any pending interrupt flag. */
277 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
278 
279 	return 0;
280 }
281 
282 /*
283  * Disable queue. Wait for the hardware to acknowledge request and
284  * stop processing enqueued requests. Report errors but continue.
285  */
286 static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
287 {
288 	struct riscv_iommu_device *iommu = queue->iommu;
289 	u32 csr;
290 
291 	if (!iommu)
292 		return;
293 
294 	free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
295 	riscv_iommu_writel(iommu, queue->qcr, 0);
296 	riscv_iommu_readl_timeout(iommu, queue->qcr,
297 				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
298 				  10, RISCV_IOMMU_QCSR_TIMEOUT);
299 
300 	if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
301 		dev_err(iommu->dev, "failed to disable hardware queue #%u, csr 0x%x\n",
302 			queue->qid, csr);
303 
304 	queue->iommu = NULL;
305 }
306 
307 /*
308  * Returns the number of available valid queue entries and the first item index.
309  * Updates the shadow producer index if necessary.
310  */
311 static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
312 				     unsigned int *index)
313 {
314 	unsigned int head = atomic_read(&queue->head);
315 	unsigned int tail = atomic_read(&queue->tail);
316 	unsigned int last = Q_ITEM(queue, tail);
317 	int available = (int)(tail - head);
318 
319 	*index = head;
320 
321 	if (available > 0)
322 		return available;
323 
324 	/* read hardware producer index, check reserved register bits are not set. */
325 	if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
326 				      tail, (tail & ~queue->mask) == 0,
327 				      0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
328 		dev_err_once(queue->iommu->dev,
329 			     "Hardware error: queue access timeout\n");
330 		return 0;
331 	}
332 
333 	if (tail == last)
334 		return 0;
335 
336 	/* update shadow producer index */
337 	return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
338 }
339 
340 /*
341  * Release processed queue entries; the count should match riscv_iommu_queue_consume() calls.
342  */
343 static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
344 {
345 	const unsigned int head = atomic_add_return(count, &queue->head);
346 
347 	riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
348 }
349 
350 /* Return actual consumer index based on hardware reported queue head index. */
351 static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
352 {
353 	const unsigned int cons = atomic_read(&queue->head);
354 	const unsigned int last = Q_ITEM(queue, cons);
355 	unsigned int head;
356 
357 	if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
358 				      !(head & ~queue->mask),
359 				      0, RISCV_IOMMU_QUEUE_TIMEOUT))
360 		return cons;
361 
362 	return cons + ((head - last) & queue->mask);
363 }
364 
365 /* Wait for submitted item to be processed. */
366 static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
367 				  unsigned int index,
368 				  unsigned int timeout_us)
369 {
370 	unsigned int cons = atomic_read(&queue->head);
371 
372 	/* Already processed by the consumer */
373 	if ((int)(cons - index) > 0)
374 		return 0;
375 
376 	/* Monitor consumer index */
377 	return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
378 				 (int)(cons - index) > 0, 0, timeout_us);
379 }
380 
381 /* Enqueue an entry and wait for it to be processed if timeout_us > 0.
382  *
383  * Error handling for IOMMU hardware not responding in a reasonable time
384  * will be added as a separate patch series along with other RAS features.
385  * For now, only report hardware failure and continue.
386  */
387 static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
388 					   void *entry, size_t entry_size)
389 {
390 	unsigned int prod;
391 	unsigned int head;
392 	unsigned int tail;
393 	unsigned long flags;
394 
395 	/* Do not preempt submission flow. */
396 	local_irq_save(flags);
397 
398 	/* 1. Allocate some space in the queue */
399 	prod = atomic_inc_return(&queue->prod) - 1;
400 	head = atomic_read(&queue->head);
401 
402 	/* 2. Wait for space availability. */
403 	if ((prod - head) > queue->mask) {
404 		if (readx_poll_timeout(atomic_read, &queue->head,
405 				       head, (prod - head) < queue->mask,
406 				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
407 			goto err_busy;
408 	} else if ((prod - head) == queue->mask) {
409 		const unsigned int last = Q_ITEM(queue, head);
410 
411 		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
412 					      !(head & ~queue->mask) && head != last,
413 					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
414 			goto err_busy;
415 		atomic_add((head - last) & queue->mask, &queue->head);
416 	}
417 
418 	/* 3. Store entry in the ring buffer */
419 	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
420 
421 	/* 4. Wait for all previous entries to be ready */
422 	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
423 			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
424 		goto err_busy;
425 
426 	/*
427 	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
428 	 *    completed and visible before signaling the tail doorbell to fetch
429 	 *    the next command. 'fence ow, ow'
430 	 */
431 	dma_wmb();
432 	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
433 
434 	/*
435 	 * 6. Make sure the doorbell write to the device has finished before updating
436 	 *    the shadow tail index in normal memory. 'fence o, w'
437 	 */
438 	mmiowb();
439 	atomic_inc(&queue->tail);
440 
441 	/* 7. Complete submission and restore local interrupts */
442 	local_irq_restore(flags);
443 
444 	return prod;
445 
446 err_busy:
447 	local_irq_restore(flags);
448 	dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
449 
450 	return prod;
451 }
452 
453 /*
454  * IOMMU Command queue chapter 3.1
455  */
456 
457 /* Command queue interrupt handler thread function */
458 static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
459 {
460 	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
461 	unsigned int ctrl;
462 
463 	/* Clear MF/CQ errors; complete error recovery is yet to be implemented. */
464 	ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
465 	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
466 		    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
467 		riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
468 		dev_warn(queue->iommu->dev,
469 			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
470 			 queue->qid,
471 			 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
472 			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
473 			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
474 			 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
475 	}
476 
477 	/* Placeholder for command queue interrupt notifiers */
478 
479 	/* Clear command interrupt pending. */
480 	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
481 
482 	return IRQ_HANDLED;
483 }
484 
485 /* Send command to the IOMMU command queue */
486 static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
487 				 struct riscv_iommu_command *cmd)
488 {
489 	riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
490 }
491 
492 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */
493 static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
494 				 unsigned int timeout_us)
495 {
496 	struct riscv_iommu_command cmd;
497 	unsigned int prod;
498 
499 	riscv_iommu_cmd_iofence(&cmd);
500 	prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
501 
502 	if (!timeout_us)
503 		return;
504 
505 	if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
506 		dev_err_once(iommu->dev,
507 			     "Hardware error: command execution timeout\n");
508 }
509 
510 /*
511  * IOMMU Fault/Event queue chapter 3.2
512  */
513 
514 static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
515 			      struct riscv_iommu_fq_record *event)
516 {
517 	unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
518 	unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
519 
520 	/* Placeholder for future fault handling implementation, report only. */
521 	if (err)
522 		dev_warn_ratelimited(iommu->dev,
523 				     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
524 				     err, devid, event->iotval, event->iotval2);
525 }
526 
527 /* Fault queue interrupt handler thread function */
528 static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
529 {
530 	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
531 	struct riscv_iommu_device *iommu = queue->iommu;
532 	struct riscv_iommu_fq_record *events;
533 	unsigned int ctrl, idx;
534 	int cnt, len;
535 
536 	events = (struct riscv_iommu_fq_record *)queue->base;
537 
538 	/* Clear fault interrupt pending and process all received fault events. */
539 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
540 
541 	do {
542 		cnt = riscv_iommu_queue_consume(queue, &idx);
543 		for (len = 0; len < cnt; idx++, len++)
544 			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
545 		riscv_iommu_queue_release(queue, cnt);
546 	} while (cnt > 0);
547 
548 	/* Clear MF/OF errors; complete error recovery is yet to be implemented. */
549 	ctrl = riscv_iommu_readl(iommu, queue->qcr);
550 	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
551 		riscv_iommu_writel(iommu, queue->qcr, ctrl);
552 		dev_warn(iommu->dev,
553 			 "Queue #%u error; memory fault:%d overflow:%d\n",
554 			 queue->qid,
555 			 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
556 			 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
557 	}
558 
559 	return IRQ_HANDLED;
560 }
561 
562 /* Look up and initialize the device context info structure. */
563 static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
564 						 unsigned int devid)
565 {
566 	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
567 	unsigned int depth;
568 	unsigned long ddt, old, new;
569 	void *ptr;
570 	u8 ddi_bits[3] = { 0 };
571 	u64 *ddtp = NULL;
572 
573 	/* Make sure the mode is valid */
574 	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
575 	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
576 		return NULL;
577 
578 	/*
579 	 * Device id partitioning for base format:
580 	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
581 	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
582 	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
583 	 *
584 	 * For extended format:
585 	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
586 	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
587 	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
588 	 */
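	/*
	 * Worked example (illustrative device id): for devid 0x0100, e.g. a
	 * PCI device at 01:00.0, the base format yields DDI[0] = 0x00,
	 * DDI[1] = 0x02, DDI[2] = 0x00, while the extended format yields
	 * DDI[0] = 0x00, DDI[1] = 0x04, DDI[2] = 0x00.
	 */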
589 	if (base_format) {
590 		ddi_bits[0] = 7;
591 		ddi_bits[1] = 7 + 9;
592 		ddi_bits[2] = 7 + 9 + 8;
593 	} else {
594 		ddi_bits[0] = 6;
595 		ddi_bits[1] = 6 + 9;
596 		ddi_bits[2] = 6 + 9 + 9;
597 	}
598 
599 	/* Make sure device id is within range */
600 	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
601 	if (devid >= (1 << ddi_bits[depth]))
602 		return NULL;
603 
604 	/* Get to the level of the non-leaf node that holds the device context */
605 	for (ddtp = iommu->ddt_root; depth-- > 0;) {
606 		const int split = ddi_bits[depth];
607 		/*
608 		 * Each non-leaf node is 64 bits wide, and on each level
609 		 * nodes are indexed by DDI[depth].
610 		 */
611 		ddtp += (devid >> split) & 0x1FF;
612 
613 		/*
614 		 * Check if this node has been populated and if not
615 		 * allocate a new level and populate it.
616 		 */
617 		do {
618 			ddt = READ_ONCE(*(unsigned long *)ddtp);
619 			if (ddt & RISCV_IOMMU_DDTE_V) {
620 				ddtp = __va(ppn_to_phys(ddt));
621 				break;
622 			}
623 
624 			ptr = riscv_iommu_get_pages(iommu, SZ_4K);
625 			if (!ptr)
626 				return NULL;
627 
628 			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
629 			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
630 
631 			if (old == ddt) {
632 				ddtp = (u64 *)ptr;
633 				break;
634 			}
635 
636 			/* Race setting DDT detected, re-read and retry. */
637 			riscv_iommu_free_pages(iommu, ptr);
638 		} while (1);
639 	}
640 
641 	/*
642 	 * Grab the node that matches DDI[depth]. Note that when using the base
643 	 * format the device context is 4 * 64 bits, while the extended format
644 	 * is 8 * 64 bits, hence the (3 - base_format) shift below.
645 	 */
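	/*
	 * For instance (illustrative): in base format (base_format == 1) the
	 * device context starts at u64 offset (devid & 127) * 4, while in
	 * extended format it starts at (devid & 63) * 8.
	 */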
646 	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
647 
648 	return (struct riscv_iommu_dc *)ddtp;
649 }
650 
651 /*
652  * This is the best-effort IOMMU translation shutdown flow.
653  * Disable the IOMMU without waiting for a hardware response.
654  */
655 void riscv_iommu_disable(struct riscv_iommu_device *iommu)
656 {
657 	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
658 			   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
659 				      RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
660 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
661 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
662 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
663 }
664 
665 #define riscv_iommu_read_ddtp(iommu) ({ \
666 	u64 ddtp; \
667 	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
668 				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
669 				  RISCV_IOMMU_DDTP_TIMEOUT); \
670 	ddtp; })
671 
672 static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
673 {
674 	u64 ddtp;
675 	unsigned int mode;
676 
677 	ddtp = riscv_iommu_read_ddtp(iommu);
678 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
679 		return -EBUSY;
680 
681 	/*
682 	 * It is optional for the hardware to report a fixed address for device
683 	 * directory root page when DDT.MODE is OFF or BARE.
684 	 */
685 	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
686 	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
687 	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
688 		/* Use WARL to discover hardware fixed DDT PPN */
689 		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
690 				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
691 		ddtp = riscv_iommu_read_ddtp(iommu);
692 		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
693 			return -EBUSY;
694 
695 		iommu->ddt_phys = ppn_to_phys(ddtp);
696 		if (iommu->ddt_phys)
697 			iommu->ddt_root = devm_ioremap(iommu->dev,
698 						       iommu->ddt_phys, PAGE_SIZE);
699 		if (iommu->ddt_root)
700 			memset(iommu->ddt_root, 0, PAGE_SIZE);
701 	}
702 
703 	if (!iommu->ddt_root) {
704 		iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K);
705 		iommu->ddt_phys = __pa(iommu->ddt_root);
706 	}
707 
708 	if (!iommu->ddt_root)
709 		return -ENOMEM;
710 
711 	return 0;
712 }
713 
714 /*
715  * Discover supported DDT modes starting from the requested value,
716  * configure the DDTP register with the accepted mode and root DDT address.
717  * The accepted mode is stored in iommu->ddt_mode on success.
718  */
719 static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
720 				      unsigned int ddtp_mode)
721 {
722 	struct device *dev = iommu->dev;
723 	u64 ddtp, rq_ddtp;
724 	unsigned int mode, rq_mode = ddtp_mode;
725 	struct riscv_iommu_command cmd;
726 
727 	ddtp = riscv_iommu_read_ddtp(iommu);
728 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
729 		return -EBUSY;
730 
731 	/* Disallow state transition from xLVL to xLVL. */
732 	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
733 	if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
734 	    mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
735 	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
736 	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
737 		return -EINVAL;
738 
739 	do {
740 		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
741 		if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
742 			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
743 
744 		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
745 		ddtp = riscv_iommu_read_ddtp(iommu);
746 		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
747 			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
748 				rq_mode, ddtp);
749 			return -EBUSY;
750 		}
751 
752 		/* Verify IOMMU hardware accepts new DDTP config. */
753 		mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
754 
755 		if (rq_mode == mode)
756 			break;
757 
758 		/* Hardware mandatory DDTP mode has not been accepted. */
759 		if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
760 			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
761 				ddtp, rq_ddtp);
762 			return -EINVAL;
763 		}
764 
765 		/*
766 		 * The mode field is WARL; an IOMMU may support only a subset of
767 		 * directory table levels, in which case, if we tried to set an
768 		 * unsupported number of levels, we'll read back either
769 		 * a valid xLVL or off/bare. If we got off/bare, try again
770 		 * with a smaller xLVL.
771 		 */
772 		if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
773 		    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
774 			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
775 			rq_mode--;
776 			continue;
777 		}
778 
779 		/*
780 		 * We tried all supported modes and the IOMMU hardware failed to
781 		 * accept the new settings; something went very wrong, since off/bare
782 		 * and at least one xLVL must be supported.
783 		 */
784 		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
785 			mode, ddtp_mode);
786 		return -EINVAL;
787 	} while (1);
788 
789 	iommu->ddt_mode = mode;
790 	if (mode != ddtp_mode)
791 		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
792 
793 	/* Invalidate device context cache */
794 	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
795 	riscv_iommu_cmd_send(iommu, &cmd);
796 
797 	/* Invalidate address translation cache */
798 	riscv_iommu_cmd_inval_vma(&cmd);
799 	riscv_iommu_cmd_send(iommu, &cmd);
800 
801 	/* IOFENCE.C */
802 	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
803 
804 	return 0;
805 }
806 
807 /* This struct contains protection domain specific IOMMU driver data. */
808 struct riscv_iommu_domain {
809 	struct iommu_domain domain;
810 	struct list_head bonds;
811 	spinlock_t lock;		/* protect bonds list updates. */
812 	int pscid;
813 	bool amo_enabled;
814 	int numa_node;
815 	unsigned int pgd_mode;
816 	unsigned long *pgd_root;
817 };
818 
819 #define iommu_domain_to_riscv(iommu_domain) \
820 	container_of(iommu_domain, struct riscv_iommu_domain, domain)
821 
822 /* Private IOMMU data for managed devices, dev_iommu_priv_* */
823 struct riscv_iommu_info {
824 	struct riscv_iommu_domain *domain;
825 };
826 
827 /*
828  * Linkage between an iommu_domain and attached devices.
829  *
830  * A protection domain requiring IOATC and DevATC translation cache invalidations
831  * should be linked to attached devices using a riscv_iommu_bond structure.
832  * Devices should be linked to the domain before first use and unlinked after
833  * the translations from the referenced protection domain can no longer be used.
834  * Blocking and identity domains are not tracked here, as the IOMMU hardware
835  * does not cache negative and/or identity (BARE mode) translations, and DevATC
836  * is disabled for those protection domains.
837  *
838  * The device pointer and IOMMU data remain stable in the bond struct after
839  * _probe_device() where it's attached to the managed IOMMU, up to the
840  * completion of the _release_device() call. The release of the bond structure
841  * is synchronized with the device release.
842  */
843 struct riscv_iommu_bond {
844 	struct list_head list;
845 	struct rcu_head rcu;
846 	struct device *dev;
847 };
848 
849 static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
850 				 struct device *dev)
851 {
852 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
853 	struct riscv_iommu_bond *bond;
854 	struct list_head *bonds;
855 
856 	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
857 	if (!bond)
858 		return -ENOMEM;
859 	bond->dev = dev;
860 
861 	/*
862 	 * The list of devices attached to the domain is grouped by the device's
863 	 * managing IOMMU.
864 	 */
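	/*
	 * Illustrative layout (hypothetical devices): with devA and devB
	 * behind iommu0 and devC behind iommu1, the list is kept as
	 * [devA, devB, devC], so riscv_iommu_iotlb_inval() can skip duplicate
	 * invalidations by comparing only against the previously visited
	 * bond's IOMMU.
	 */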
865 
866 	spin_lock(&domain->lock);
867 	list_for_each(bonds, &domain->bonds)
868 		if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
869 			break;
870 	list_add_rcu(&bond->list, bonds);
871 	spin_unlock(&domain->lock);
872 
873 	/* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
874 	smp_mb();
875 
876 	return 0;
877 }
878 
879 static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
880 				    struct device *dev)
881 {
882 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
883 	struct riscv_iommu_bond *bond, *found = NULL;
884 	struct riscv_iommu_command cmd;
885 	int count = 0;
886 
887 	if (!domain)
888 		return;
889 
890 	spin_lock(&domain->lock);
891 	list_for_each_entry(bond, &domain->bonds, list) {
892 		if (found && count)
893 			break;
894 		else if (bond->dev == dev)
895 			found = bond;
896 		else if (dev_to_iommu(bond->dev) == iommu)
897 			count++;
898 	}
899 	if (found)
900 		list_del_rcu(&found->list);
901 	spin_unlock(&domain->lock);
902 	kfree_rcu(found, rcu);
903 
904 	/*
905 	 * If this was the last bond between this domain and the IOMMU,
906 	 * invalidate all cached entries for the domain's PSCID.
907 	 */
908 	if (!count) {
909 		riscv_iommu_cmd_inval_vma(&cmd);
910 		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
911 		riscv_iommu_cmd_send(iommu, &cmd);
912 
913 		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
914 	}
915 }
916 
917 /*
918  * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB.
919  * This limit will be replaced with range invalidations, if supported by
920  * the hardware, once the RISC-V IOMMU architecture specification update
921  * for range invalidations becomes available.
922  */
923 #define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)
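/*
 * Illustrative effect of the limit above (assuming 4 KiB pages): invalidating
 * a 1 MiB range sends one address-specific IOTINVAL.VMA per page (256
 * commands), while any range of 2 MiB or more collapses into a single
 * whole-PSCID IOTINVAL.VMA.
 */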
924 
925 static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
926 				    unsigned long start, unsigned long end)
927 {
928 	struct riscv_iommu_bond *bond;
929 	struct riscv_iommu_device *iommu, *prev;
930 	struct riscv_iommu_command cmd;
931 	unsigned long len = end - start + 1;
932 	unsigned long iova;
933 
934 	/*
935 	 * For each IOMMU linked with this protection domain (via bonds->dev),
936 	 * an IOTLB invalidation command will be submitted and executed.
937 	 *
938 	 * A possible race with the domain attach flow is handled by sequencing
939 	 * bond creation - riscv_iommu_bond_link(), and device directory
940 	 * update - riscv_iommu_iodir_update().
941 	 *
942 	 * PTE Update / IOTLB Inval           Device attach & directory update
943 	 * --------------------------         --------------------------
944 	 * update page table entries          add dev to the bond list
945 	 * FENCE RW,RW                        FENCE RW,RW
946 	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
947 	 *   FENCE IOW,IOW                      FENCE IOW,IOW
948 	 *   IOTLB.INVAL                        IODIR.INVAL
949 	 *   IOFENCE.C
950 	 *
951 	 * If the bond list is not yet updated with the new device, the directory
952 	 * context will be configured with already valid page table content. If an
953 	 * IOMMU is linked to the protection domain, it will receive invalidation
954 	 * requests for updated page table entries.
955 	 */
956 	smp_mb();
957 
958 	rcu_read_lock();
959 
960 	prev = NULL;
961 	list_for_each_entry_rcu(bond, &domain->bonds, list) {
962 		iommu = dev_to_iommu(bond->dev);
963 
964 		/*
965 		 * An IOTLB invalidation request can be safely omitted if already sent
966 		 * to the IOMMU for the same PSCID, and with domain->bonds list
967 		 * arranged based on the device's IOMMU, it's sufficient to check
968 		 * the last device the invalidation was sent to.
969 		 */
970 		if (iommu == prev)
971 			continue;
972 
973 		riscv_iommu_cmd_inval_vma(&cmd);
974 		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
975 		if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
976 			for (iova = start; iova < end; iova += PAGE_SIZE) {
977 				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
978 				riscv_iommu_cmd_send(iommu, &cmd);
979 			}
980 		} else {
981 			riscv_iommu_cmd_send(iommu, &cmd);
982 		}
983 		prev = iommu;
984 	}
985 
986 	prev = NULL;
987 	list_for_each_entry_rcu(bond, &domain->bonds, list) {
988 		iommu = dev_to_iommu(bond->dev);
989 		if (iommu == prev)
990 			continue;
991 
992 		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
993 		prev = iommu;
994 	}
995 	rcu_read_unlock();
996 }
997 
998 #define RISCV_IOMMU_FSC_BARE 0
999 
1000 /*
1001  * Update IODIR for the device.
1002  *
1003  * During the execution of riscv_iommu_probe_device(), IODIR entries are
1004  * allocated for the device's identifiers.  Device context invalidation
1005  * becomes necessary only if one of the updated entries was previously
1006  * marked as valid, given that invalid device context entries are not
1007  * cached by the IOMMU hardware.
1008  * In this implementation, updating a valid device context while the
1009  * device is not quiesced might be disruptive, potentially causing
1010  * interim translation faults.
1011  */
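/*
 * Rough ordering sketch of the update below (assuming a single device ID):
 *   1. clear DC.TC.V                  (stop using the stale FSC/TA fields)
 *   2. IODIR.INVAL.DDT + IOFENCE.C    (flush the cached device context)
 *   3. write DC.FSC and DC.TA
 *   4. dma_wmb()                      (order the fields before the valid bit)
 *   5. set DC.TC.V, if requested by @ta
 *   6. IODIR.INVAL.DDT + IOFENCE.C    (make the new context observable)
 */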
1012 static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
1013 				     struct device *dev, u64 fsc, u64 ta)
1014 {
1015 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1016 	struct riscv_iommu_dc *dc;
1017 	struct riscv_iommu_command cmd;
1018 	bool sync_required = false;
1019 	u64 tc;
1020 	int i;
1021 
1022 	for (i = 0; i < fwspec->num_ids; i++) {
1023 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1024 		tc = READ_ONCE(dc->tc);
1025 		if (!(tc & RISCV_IOMMU_DC_TC_V))
1026 			continue;
1027 
1028 		WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
1029 
1030 		/* Invalidate device context cached values */
1031 		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1032 		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1033 		riscv_iommu_cmd_send(iommu, &cmd);
1034 		sync_required = true;
1035 	}
1036 
1037 	if (sync_required)
1038 		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1039 
1040 	/*
1041 	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
1042 	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
1043 	 */
1044 	for (i = 0; i < fwspec->num_ids; i++) {
1045 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1046 		tc = READ_ONCE(dc->tc);
1047 		tc |= ta & RISCV_IOMMU_DC_TC_V;
1048 
1049 		WRITE_ONCE(dc->fsc, fsc);
1050 		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
1051 		/* Update device context, write TC.V as the last step. */
1052 		dma_wmb();
1053 		WRITE_ONCE(dc->tc, tc);
1054 
1055 		/* Invalidate device context after update */
1056 		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1057 		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1058 		riscv_iommu_cmd_send(iommu, &cmd);
1059 	}
1060 
1061 	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1062 }
1063 
1064 /*
1065  * IOVA page translation tree management.
1066  */
1067 
1068 static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
1069 {
1070 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1071 
1072 	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1073 }
1074 
1075 static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
1076 				   struct iommu_iotlb_gather *gather)
1077 {
1078 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1079 
1080 	riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
1081 }
1082 
1083 #define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
1084 
1085 #define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
1086 #define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
1087 #define _io_pte_none(pte)	((pte) == 0)
1088 #define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
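/*
 * Shift arithmetic sketch (illustrative, assuming 4 KiB base pages and 64-bit
 * PTEs): PT_SHIFT == 9 and the starting level below is pgd_mode - SV39 + 2,
 * so Sv39 walks levels 2..0 with shifts 30/21/12 (1 GiB, 2 MiB, 4 KiB pages),
 * Sv48 starts at level 3 (shift 39) and Sv57 at level 4 (shift 48).
 */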
1089 
1090 static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
1091 				 unsigned long pte,
1092 				 struct iommu_pages_list *freelist)
1093 {
1094 	unsigned long *ptr;
1095 	int i;
1096 
1097 	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
1098 		return;
1099 
1100 	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1101 
1102 	/* Recursively free all sub page table pages */
1103 	for (i = 0; i < PTRS_PER_PTE; i++) {
1104 		pte = READ_ONCE(ptr[i]);
1105 		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
1106 			riscv_iommu_pte_free(domain, pte, freelist);
1107 	}
1108 
1109 	if (freelist)
1110 		iommu_pages_list_add(freelist, ptr);
1111 	else
1112 		iommu_free_pages(ptr);
1113 }
1114 
1115 static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
1116 					    unsigned long iova, size_t pgsize,
1117 					    gfp_t gfp)
1118 {
1119 	unsigned long *ptr = domain->pgd_root;
1120 	unsigned long pte, old;
1121 	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1122 	void *addr;
1123 
1124 	do {
1125 		const int shift = PAGE_SHIFT + PT_SHIFT * level;
1126 
1127 		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1128 		/*
1129 		 * Note: the returned entry might be a non-leaf entry if there was an
1130 		 * existing mapping with smaller granularity. It is up to the caller
1131 		 * to replace and invalidate it.
1132 		 */
1133 		if (((size_t)1 << shift) == pgsize)
1134 			return ptr;
1135 pte_retry:
1136 		pte = READ_ONCE(*ptr);
1137 		/*
1138 		 * This is very likely incorrect, as we should not be adding
1139 		 * a new mapping with smaller granularity on top
1140 		 * of an existing 2M/1G mapping. Fail.
1141 		 */
1142 		if (_io_pte_present(pte) && _io_pte_leaf(pte))
1143 			return NULL;
1144 		/*
1145 		 * The non-leaf entry is missing: allocate it and try to add it to the
1146 		 * page table. This might race with other mappings; retry.
1147 		 */
1148 		if (_io_pte_none(pte)) {
1149 			addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp,
1150 							 SZ_4K);
1151 			if (!addr)
1152 				return NULL;
1153 			old = pte;
1154 			pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
1155 			if (cmpxchg_relaxed(ptr, old, pte) != old) {
1156 				iommu_free_pages(addr);
1157 				goto pte_retry;
1158 			}
1159 		}
1160 		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1161 	} while (level-- > 0);
1162 
1163 	return NULL;
1164 }
1165 
1166 static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
1167 					    unsigned long iova, size_t *pte_pgsize)
1168 {
1169 	unsigned long *ptr = domain->pgd_root;
1170 	unsigned long pte;
1171 	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1172 
1173 	do {
1174 		const int shift = PAGE_SHIFT + PT_SHIFT * level;
1175 
1176 		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1177 		pte = READ_ONCE(*ptr);
1178 		if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
1179 			*pte_pgsize = (size_t)1 << shift;
1180 			return ptr;
1181 		}
1182 		if (_io_pte_none(pte))
1183 			return NULL;
1184 		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1185 	} while (level-- > 0);
1186 
1187 	return NULL;
1188 }
1189 
1190 static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
1191 				 unsigned long iova, phys_addr_t phys,
1192 				 size_t pgsize, size_t pgcount, int prot,
1193 				 gfp_t gfp, size_t *mapped)
1194 {
1195 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1196 	size_t size = 0;
1197 	unsigned long *ptr;
1198 	unsigned long pte, old, pte_prot;
1199 	int rc = 0;
1200 	struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
1201 
1202 	if (!(prot & IOMMU_WRITE))
1203 		pte_prot = _PAGE_BASE | _PAGE_READ;
1204 	else if (domain->amo_enabled)
1205 		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
1206 	else
1207 		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
1208 
1209 	while (pgcount) {
1210 		ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
1211 		if (!ptr) {
1212 			rc = -ENOMEM;
1213 			break;
1214 		}
1215 
1216 		old = READ_ONCE(*ptr);
1217 		pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
1218 		if (cmpxchg_relaxed(ptr, old, pte) != old)
1219 			continue;
1220 
1221 		riscv_iommu_pte_free(domain, old, &freelist);
1222 
1223 		size += pgsize;
1224 		iova += pgsize;
1225 		phys += pgsize;
1226 		--pgcount;
1227 	}
1228 
1229 	*mapped = size;
1230 
1231 	if (!iommu_pages_list_empty(&freelist)) {
1232 		/*
1233 		 * In the 1.0 spec version, the smallest scope we can use to
1234 		 * invalidate all levels of the page table (i.e. leaf and non-leaf)
1235 		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
1236 		 * This will be updated once hardware support for the
1237 		 * capability.NL (non-leaf) IOTINVAL command is available.
1238 		 */
1239 		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1240 		iommu_put_pages_list(&freelist);
1241 	}
1242 
1243 	return rc;
1244 }
1245 
1246 static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
1247 				      unsigned long iova, size_t pgsize,
1248 				      size_t pgcount,
1249 				      struct iommu_iotlb_gather *gather)
1250 {
1251 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1252 	size_t size = pgcount << __ffs(pgsize);
1253 	unsigned long *ptr, old;
1254 	size_t unmapped = 0;
1255 	size_t pte_size;
1256 
1257 	while (unmapped < size) {
1258 		ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1259 		if (!ptr)
1260 			return unmapped;
1261 
1262 		/* partial unmap is not allowed, fail. */
1263 		if (iova & (pte_size - 1))
1264 			return unmapped;
1265 
1266 		old = READ_ONCE(*ptr);
1267 		if (cmpxchg_relaxed(ptr, old, 0) != old)
1268 			continue;
1269 
1270 		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
1271 					    pte_size);
1272 
1273 		iova += pte_size;
1274 		unmapped += pte_size;
1275 	}
1276 
1277 	return unmapped;
1278 }
1279 
1280 static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
1281 					    dma_addr_t iova)
1282 {
1283 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1284 	size_t pte_size;
1285 	unsigned long *ptr;
1286 
1287 	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1288 	if (!ptr)
1289 		return 0;
1290 
1291 	return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
1292 }
1293 
1294 static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
1295 {
1296 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1297 	const unsigned long pfn = virt_to_pfn(domain->pgd_root);
1298 
1299 	WARN_ON(!list_empty(&domain->bonds));
1300 
1301 	if ((int)domain->pscid > 0)
1302 		ida_free(&riscv_iommu_pscids, domain->pscid);
1303 
1304 	riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
1305 	kfree(domain);
1306 }
1307 
1308 static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
1309 {
1310 	switch (pgd_mode) {
1311 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
1312 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
1313 
1314 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
1315 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
1316 
1317 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
1318 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
1319 	}
1320 	return false;
1321 }
1322 
1323 static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
1324 					    struct device *dev)
1325 {
1326 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1327 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1328 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1329 	u64 fsc, ta;
1330 
1331 	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
1332 		return -ENODEV;
1333 
1334 	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
1335 	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
1336 	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
1337 	     RISCV_IOMMU_PC_TA_V;
1338 
1339 	if (riscv_iommu_bond_link(domain, dev))
1340 		return -ENOMEM;
1341 
1342 	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
1343 	riscv_iommu_bond_unlink(info->domain, dev);
1344 	info->domain = domain;
1345 
1346 	return 0;
1347 }
1348 
1349 static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
1350 	.attach_dev = riscv_iommu_attach_paging_domain,
1351 	.free = riscv_iommu_free_paging_domain,
1352 	.map_pages = riscv_iommu_map_pages,
1353 	.unmap_pages = riscv_iommu_unmap_pages,
1354 	.iova_to_phys = riscv_iommu_iova_to_phys,
1355 	.iotlb_sync = riscv_iommu_iotlb_sync,
1356 	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
1357 };
1358 
1359 static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
1360 {
1361 	struct riscv_iommu_domain *domain;
1362 	struct riscv_iommu_device *iommu;
1363 	unsigned int pgd_mode;
1364 	dma_addr_t va_mask;
1365 	int va_bits;
1366 
1367 	iommu = dev_to_iommu(dev);
1368 	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
1369 		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
1370 		va_bits = 57;
1371 	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
1372 		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
1373 		va_bits = 48;
1374 	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
1375 		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
1376 		va_bits = 39;
1377 	} else {
1378 		dev_err(dev, "cannot find supported page table mode\n");
1379 		return ERR_PTR(-ENODEV);
1380 	}
1381 
1382 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1383 	if (!domain)
1384 		return ERR_PTR(-ENOMEM);
1385 
1386 	INIT_LIST_HEAD_RCU(&domain->bonds);
1387 	spin_lock_init(&domain->lock);
1388 	domain->numa_node = dev_to_node(iommu->dev);
1389 	domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
1390 	domain->pgd_mode = pgd_mode;
1391 	domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node,
1392 						     GFP_KERNEL_ACCOUNT, SZ_4K);
1393 	if (!domain->pgd_root) {
1394 		kfree(domain);
1395 		return ERR_PTR(-ENOMEM);
1396 	}
1397 
1398 	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
1399 					RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
1400 	if (domain->pscid < 0) {
1401 		iommu_free_pages(domain->pgd_root);
1402 		kfree(domain);
1403 		return ERR_PTR(-ENOMEM);
1404 	}
1405 
1406 	/*
1407 	 * Note: the RISC-V Privileged spec mandates that virtual addresses
1408 	 * be sign-extended, so if bit (VA_BITS - 1) is set, all
1409 	 * bits >= VA_BITS also need to be set or else we'll get a
1410 	 * page fault. However, the code that creates the mappings
1411 	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
1412 	 * for now, so we'd end up with invalid virtual addresses
1413 	 * to map. As a workaround, until we get this sorted out,
1414 	 * limit the available virtual address width to VA_BITS - 1.
1415 	 */
1416 	va_mask = DMA_BIT_MASK(va_bits - 1);
1417 
1418 	domain->domain.geometry.aperture_start = 0;
1419 	domain->domain.geometry.aperture_end = va_mask;
1420 	domain->domain.geometry.force_aperture = true;
1421 	domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
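	/*
	 * Example outcome (illustrative): on Sv39-only hardware va_bits == 39,
	 * so the aperture ends at DMA_BIT_MASK(38) == 0x3fffffffff and the
	 * pgsize_bitmap keeps SZ_4K | SZ_2M | SZ_1G while masking out SZ_512G,
	 * which Sv39 page tables cannot represent anyway.
	 */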
1422 
1423 	domain->domain.ops = &riscv_iommu_paging_domain_ops;
1424 
1425 	return &domain->domain;
1426 }
1427 
1428 static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
1429 					      struct device *dev)
1430 {
1431 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1432 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1433 
1434 	/* Make device context invalid, translation requests will fault w/ #258 */
1435 	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
1436 	riscv_iommu_bond_unlink(info->domain, dev);
1437 	info->domain = NULL;
1438 
1439 	return 0;
1440 }
1441 
1442 static struct iommu_domain riscv_iommu_blocking_domain = {
1443 	.type = IOMMU_DOMAIN_BLOCKED,
1444 	.ops = &(const struct iommu_domain_ops) {
1445 		.attach_dev = riscv_iommu_attach_blocking_domain,
1446 	}
1447 };
1448 
1449 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
1450 					      struct device *dev)
1451 {
1452 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1453 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1454 
1455 	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
1456 	riscv_iommu_bond_unlink(info->domain, dev);
1457 	info->domain = NULL;
1458 
1459 	return 0;
1460 }
1461 
1462 static struct iommu_domain riscv_iommu_identity_domain = {
1463 	.type = IOMMU_DOMAIN_IDENTITY,
1464 	.ops = &(const struct iommu_domain_ops) {
1465 		.attach_dev = riscv_iommu_attach_identity_domain,
1466 	}
1467 };
1468 
1469 static struct iommu_group *riscv_iommu_device_group(struct device *dev)
1470 {
1471 	if (dev_is_pci(dev))
1472 		return pci_device_group(dev);
1473 	return generic_device_group(dev);
1474 }
1475 
1476 static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
1477 {
1478 	return iommu_fwspec_add_ids(dev, args->args, 1);
1479 }
1480 
1481 static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
1482 {
1483 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1484 	struct riscv_iommu_device *iommu;
1485 	struct riscv_iommu_info *info;
1486 	struct riscv_iommu_dc *dc;
1487 	u64 tc;
1488 	int i;
1489 
1490 	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
1491 		return ERR_PTR(-ENODEV);
1492 
1493 	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
1494 	if (!iommu)
1495 		return ERR_PTR(-ENODEV);
1496 
1497 	/*
1498 	 * IOMMU hardware operating in fail-over BARE mode will provide
1499 	 * identity translation for all connected devices anyway...
1500 	 */
1501 	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
1502 		return ERR_PTR(-ENODEV);
1503 
1504 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1505 	if (!info)
1506 		return ERR_PTR(-ENOMEM);
1507 	/*
1508 	 * Allocate and pre-configure device context entries in
1509 	 * the device directory. Do not mark the context valid yet.
1510 	 */
1511 	tc = 0;
1512 	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
1513 		tc |= RISCV_IOMMU_DC_TC_SADE;
1514 	for (i = 0; i < fwspec->num_ids; i++) {
1515 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1516 		if (!dc) {
1517 			kfree(info);
1518 			return ERR_PTR(-ENODEV);
1519 		}
1520 		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
1521 			dev_warn(dev, "already attached to IOMMU device directory\n");
1522 		WRITE_ONCE(dc->tc, tc);
1523 	}
1524 
1525 	dev_iommu_priv_set(dev, info);
1526 
1527 	return &iommu->iommu;
1528 }
1529 
1530 static void riscv_iommu_release_device(struct device *dev)
1531 {
1532 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1533 
1534 	kfree_rcu_mightsleep(info);
1535 }
1536 
1537 static const struct iommu_ops riscv_iommu_ops = {
1538 	.of_xlate = riscv_iommu_of_xlate,
1539 	.identity_domain = &riscv_iommu_identity_domain,
1540 	.blocked_domain = &riscv_iommu_blocking_domain,
1541 	.release_domain = &riscv_iommu_blocking_domain,
1542 	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
1543 	.device_group = riscv_iommu_device_group,
1544 	.probe_device = riscv_iommu_probe_device,
1545 	.release_device	= riscv_iommu_release_device,
1546 };
1547 
1548 static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
1549 {
1550 	u64 ddtp;
1551 
1552 	/*
1553 	 * Make sure the IOMMU is switched off or in pass-through mode during the
1554 	 * regular boot flow, and disable translation when we boot into a kexec
1555 	 * kernel and the previous kernel left it enabled.
1556 	 */
1557 	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
1558 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
1559 		return -EBUSY;
1560 
1561 	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
1562 	     RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
1563 		if (!is_kdump_kernel())
1564 			return -EBUSY;
1565 		riscv_iommu_disable(iommu);
1566 	}
1567 
1568 	/* Configure accesses to in-memory data structures for CPU-native byte order. */
1569 	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1570 	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
1571 		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
1572 			return -EINVAL;
1573 		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
1574 				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
1575 		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
1576 		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1577 		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
1578 			return -EINVAL;
1579 	}
1580 
1581 	/*
1582 	 * Distribute interrupt vectors; always use the first vector for CIV.
1583 	 * At least one interrupt is required. Read back and verify.
1584 	 */
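	/*
	 * Illustrative mapping (assuming the hardware accepts the requested
	 * assignment): with CIV fixed at vector 0, an IOMMU wired with two
	 * vectors (irqs_count == 2) ends up with CIV=0, FIV=1, PIV=0, PMIV=1,
	 * while four vectors give the natural 0/1/2/3 assignment.
	 */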
1585 	if (!iommu->irqs_count)
1586 		return -EINVAL;
1587 
1588 	iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
1589 		       FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
1590 		       FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
1591 	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
1592 	iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
1593 	if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
1594 		    FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
1595 		max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
1596 		    FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
1597 		return -EINVAL;
1598 
1599 	return 0;
1600 }
1601 
1602 void riscv_iommu_remove(struct riscv_iommu_device *iommu)
1603 {
1604 	iommu_device_unregister(&iommu->iommu);
1605 	iommu_device_sysfs_remove(&iommu->iommu);
1606 	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1607 	riscv_iommu_queue_disable(&iommu->cmdq);
1608 	riscv_iommu_queue_disable(&iommu->fltq);
1609 }
1610 
1611 int riscv_iommu_init(struct riscv_iommu_device *iommu)
1612 {
1613 	int rc;
1614 
1615 	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
1616 	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
1617 
1618 	rc = riscv_iommu_init_check(iommu);
1619 	if (rc)
1620 		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
1621 
1622 	rc = riscv_iommu_iodir_alloc(iommu);
1623 	if (rc)
1624 		return rc;
1625 
1626 	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
1627 				     sizeof(struct riscv_iommu_command));
1628 	if (rc)
1629 		return rc;
1630 
1631 	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
1632 				     sizeof(struct riscv_iommu_fq_record));
1633 	if (rc)
1634 		return rc;
1635 
1636 	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
1637 	if (rc)
1638 		return rc;
1639 
1640 	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
1641 	if (rc)
1642 		goto err_queue_disable;
1643 
1644 	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
1645 	if (rc)
1646 		goto err_queue_disable;
1647 
1648 	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
1649 				    dev_name(iommu->dev));
1650 	if (rc) {
1651 		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
1652 		goto err_iodir_off;
1653 	}
1654 
1655 	if (!acpi_disabled) {
1656 		rc = rimt_iommu_register(iommu->dev);
1657 		if (rc) {
1658 			dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n");
1659 			goto err_remove_sysfs;
1660 		}
1661 	}
1662 
1663 	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
1664 	if (rc) {
1665 		dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
1666 		goto err_remove_sysfs;
1667 	}
1668 
1669 	return 0;
1670 
1671 err_remove_sysfs:
1672 	iommu_device_sysfs_remove(&iommu->iommu);
1673 err_iodir_off:
1674 	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
1675 err_queue_disable:
1676 	riscv_iommu_queue_disable(&iommu->fltq);
1677 	riscv_iommu_queue_disable(&iommu->cmdq);
1678 	return rc;
1679 }
1680