xref: /linux/drivers/iommu/riscv/iommu.c (revision 0a670e151a71434765de69590944e18c08ee08cf)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * IOMMU API for RISC-V IOMMU implementations.
4  *
5  * Copyright © 2022-2024 Rivos Inc.
6  * Copyright © 2023 FORTH-ICS/CARV
7  *
8  * Authors
9  *	Tomasz Jeznach <tjeznach@rivosinc.com>
10  *	Nick Kossifidis <mick@ics.forth.gr>
11  */
12 
13 #define pr_fmt(fmt) "riscv-iommu: " fmt
14 
15 #include <linux/compiler.h>
16 #include <linux/crash_dump.h>
17 #include <linux/init.h>
18 #include <linux/iommu.h>
19 #include <linux/iopoll.h>
20 #include <linux/kernel.h>
21 #include <linux/pci.h>
22 
23 #include "../iommu-pages.h"
24 #include "iommu-bits.h"
25 #include "iommu.h"
26 
/*
 * Timeouts in [us]
 *
 * QCSR:     polling a queue control/status register until BUSY clears.
 * QUEUE:    polling queue head/tail indexes (hardware registers or shadows).
 * DDTP:     polling the DDTP register until BUSY clears.
 * IOTINVAL: waiting for invalidation commands followed by IOFENCE.C.
 */
#define RISCV_IOMMU_QCSR_TIMEOUT	150000
#define RISCV_IOMMU_QUEUE_TIMEOUT	150000
#define RISCV_IOMMU_DDTP_TIMEOUT	10000000
#define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000

/*
 * Number of entries per CMD/FLT queue, should be <= INT_MAX.
 * RISCV_IOMMU_QUEUE_INIT() stores (count - 1) as the default index mask.
 */
#define RISCV_IOMMU_DEF_CQ_COUNT	8192
#define RISCV_IOMMU_DEF_FQ_COUNT	4096

/*
 * RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10].
 * phys_to_ppn() moves physical address bits [55:12] down to bits [53:10];
 * ppn_to_phys() is the inverse. All other bits are masked off.
 */
#define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
#define ppn_to_phys(pn)	 (((pn) << 2) & (((1ULL << 44) - 1) << 12))

/* Resolve the struct riscv_iommu_device that manages @dev. */
#define dev_to_iommu(dev) \
	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)

/*
 * IOMMU PSCID allocation namespace.
 * NOTE(review): RISCV_IOMMU_MAX_PSCID is presumably the upper bound used when
 * allocating from this IDA; the allocation site is outside this view.
 */
static DEFINE_IDA(riscv_iommu_pscids);
#define RISCV_IOMMU_MAX_PSCID		(BIT(20) - 1)
47 
/*
 * Device resource-managed allocations.
 *
 * One record is registered with devres for every page allocation made by
 * riscv_iommu_get_pages(), so the pages can be released either explicitly
 * via riscv_iommu_free_pages() or by the devres release callback.
 */
struct riscv_iommu_devres {
	void *addr;	/* address returned by the page allocator */
	int order;	/* allocation order, passed back to iommu_free_pages() */
};
53 
54 static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
55 {
56 	struct riscv_iommu_devres *devres = res;
57 
58 	iommu_free_pages(devres->addr, devres->order);
59 }
60 
61 static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
62 {
63 	struct riscv_iommu_devres *devres = res;
64 	struct riscv_iommu_devres *target = p;
65 
66 	return devres->addr == target->addr;
67 }
68 
69 static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
70 {
71 	struct riscv_iommu_devres *devres;
72 	void *addr;
73 
74 	addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
75 				      GFP_KERNEL_ACCOUNT, order);
76 	if (unlikely(!addr))
77 		return NULL;
78 
79 	devres = devres_alloc(riscv_iommu_devres_pages_release,
80 			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);
81 
82 	if (unlikely(!devres)) {
83 		iommu_free_pages(addr, order);
84 		return NULL;
85 	}
86 
87 	devres->addr = addr;
88 	devres->order = order;
89 
90 	devres_add(iommu->dev, devres);
91 
92 	return addr;
93 }
94 
95 static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
96 {
97 	struct riscv_iommu_devres devres = { .addr = addr };
98 
99 	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
100 		       riscv_iommu_devres_pages_match, &devres);
101 }
102 
103 /*
104  * Hardware queue allocation and management.
105  */
106 
/*
 * Setup queue base, control registers and default queue length.
 *
 * @q    - pointer to the struct riscv_iommu_queue to initialize.
 * @name - queue name token, pasted into RISCV_IOMMU_INTR_<name>,
 *         RISCV_IOMMU_REG_<name>B, RISCV_IOMMU_REG_<name>CSR and
 *         RISCV_IOMMU_DEF_<name>_COUNT.
 *
 * A non-zero mask already stored in the queue is preserved; otherwise the
 * mask defaults to the default entry count minus one.
 */
#define RISCV_IOMMU_QUEUE_INIT(q, name) do {				\
	struct riscv_iommu_queue *_q = q;				\
	_q->qid = RISCV_IOMMU_INTR_ ## name;				\
	_q->qbr = RISCV_IOMMU_REG_ ## name ## B;			\
	_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;			\
	_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
} while (0)

/*
 * Note: offsets are the same for all queues.
 *
 * Q_HEAD/Q_TAIL - head/tail index register offsets, derived from the queue
 *                 base register offset using the command queue layout.
 * Q_ITEM        - wrap a free-running index into a ring buffer slot number.
 * Q_IPSR        - interrupt pending bit of the queue in the IPSR register.
 */
#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
#define Q_ITEM(q, index) ((q)->mask & (index))
#define Q_IPSR(q) BIT((q)->qid)
121 
122 /*
123  * Discover queue ring buffer hardware configuration, allocate in-memory
124  * ring buffer or use fixed I/O memory location, configure queue base register.
125  * Must be called before hardware queue is enabled.
126  *
127  * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
128  * @entry_size - queue single element size in bytes.
129  */
130 static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
131 				   struct riscv_iommu_queue *queue,
132 				   size_t entry_size)
133 {
134 	unsigned int logsz;
135 	u64 qb, rb;
136 
137 	/*
138 	 * Use WARL base register property to discover maximum allowed
139 	 * number of entries and optional fixed IO address for queue location.
140 	 */
141 	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
142 	qb = riscv_iommu_readq(iommu, queue->qbr);
143 
144 	/*
145 	 * Calculate and verify hardware supported queue length, as reported
146 	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
147 	 * Update queue size based on hardware supported value.
148 	 */
149 	logsz = ilog2(queue->mask);
150 	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
151 		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
152 
153 	/*
154 	 * Use WARL base register property to discover an optional fixed IO
155 	 * address for queue ring buffer location. Otherwise allocate contiguous
156 	 * system memory.
157 	 */
158 	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
159 		const size_t queue_size = entry_size << (logsz + 1);
160 
161 		queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
162 		queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
163 	} else {
164 		do {
165 			const size_t queue_size = entry_size << (logsz + 1);
166 			const int order = get_order(queue_size);
167 
168 			queue->base = riscv_iommu_get_pages(iommu, order);
169 			queue->phys = __pa(queue->base);
170 		} while (!queue->base && logsz-- > 0);
171 	}
172 
173 	if (!queue->base)
174 		return -ENOMEM;
175 
176 	qb = phys_to_ppn(queue->phys) |
177 	     FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
178 
179 	/* Update base register and read back to verify hw accepted our write */
180 	riscv_iommu_writeq(iommu, queue->qbr, qb);
181 	rb = riscv_iommu_readq(iommu, queue->qbr);
182 	if (rb != qb) {
183 		dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
184 		return -ENODEV;
185 	}
186 
187 	/* Update actual queue mask */
188 	queue->mask = (2U << logsz) - 1;
189 
190 	dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
191 		queue->qid, logsz + 1);
192 
193 	return 0;
194 }
195 
196 /* Check interrupt queue status, IPSR */
197 static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
198 {
199 	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
200 
201 	if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
202 		return IRQ_WAKE_THREAD;
203 
204 	return IRQ_NONE;
205 }
206 
207 static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
208 {
209 	/* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */
210 	return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
211 }
212 
213 /*
214  * Enable queue processing in the hardware, register interrupt handler.
215  *
216  * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
217  * @irq_handler - threaded interrupt handler.
218  */
219 static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
220 				    struct riscv_iommu_queue *queue,
221 				    irq_handler_t irq_handler)
222 {
223 	const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
224 	u32 csr;
225 	int rc;
226 
227 	if (queue->iommu)
228 		return -EBUSY;
229 
230 	/* Polling not implemented */
231 	if (!irq)
232 		return -ENODEV;
233 
234 	queue->iommu = iommu;
235 	rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
236 				  IRQF_ONESHOT | IRQF_SHARED,
237 				  dev_name(iommu->dev), queue);
238 	if (rc) {
239 		queue->iommu = NULL;
240 		return rc;
241 	}
242 
243 	/*
244 	 * Enable queue with interrupts, clear any memory fault if any.
245 	 * Wait for the hardware to acknowledge request and activate queue
246 	 * processing.
247 	 * Note: All CSR bitfields are in the same offsets for all queues.
248 	 */
249 	riscv_iommu_writel(iommu, queue->qcr,
250 			   RISCV_IOMMU_QUEUE_ENABLE |
251 			   RISCV_IOMMU_QUEUE_INTR_ENABLE |
252 			   RISCV_IOMMU_QUEUE_MEM_FAULT);
253 
254 	riscv_iommu_readl_timeout(iommu, queue->qcr,
255 				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
256 				  10, RISCV_IOMMU_QCSR_TIMEOUT);
257 
258 	if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
259 						RISCV_IOMMU_QUEUE_BUSY |
260 						RISCV_IOMMU_QUEUE_MEM_FAULT))) {
261 		/* Best effort to stop and disable failing hardware queue. */
262 		riscv_iommu_writel(iommu, queue->qcr, 0);
263 		free_irq(irq, queue);
264 		queue->iommu = NULL;
265 		dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
266 		return -EBUSY;
267 	}
268 
269 	/* Clear any pending interrupt flag. */
270 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
271 
272 	return 0;
273 }
274 
275 /*
276  * Disable queue. Wait for the hardware to acknowledge request and
277  * stop processing enqueued requests. Report errors but continue.
278  */
279 static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
280 {
281 	struct riscv_iommu_device *iommu = queue->iommu;
282 	u32 csr;
283 
284 	if (!iommu)
285 		return;
286 
287 	free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
288 	riscv_iommu_writel(iommu, queue->qcr, 0);
289 	riscv_iommu_readl_timeout(iommu, queue->qcr,
290 				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
291 				  10, RISCV_IOMMU_QCSR_TIMEOUT);
292 
293 	if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
294 		dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
295 			queue->qid, csr);
296 
297 	queue->iommu = NULL;
298 }
299 
/*
 * Returns number of available valid queue entries and the first item index.
 * Update shadow producer index if necessary.
 *
 * @queue - queue consumed by software (hardware is the producer).
 * @index - output: free-running index of the first entry to process; use
 *          Q_ITEM() to convert it to a ring buffer slot.
 *
 * The shadow head/tail counters are consulted first; the hardware tail
 * register is read only when the shadow counters report no pending entries.
 * Returns 0 when the queue is empty or the hardware tail register could not
 * be read with its reserved bits clear within RISCV_IOMMU_QUEUE_TIMEOUT.
 */
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
				     unsigned int *index)
{
	unsigned int head = atomic_read(&queue->head);
	unsigned int tail = atomic_read(&queue->tail);
	/* Ring slot matching the shadow tail, compared with the hardware tail. */
	unsigned int last = Q_ITEM(queue, tail);
	/* Signed difference so that free-running counters may wrap. */
	int available = (int)(tail - head);

	*index = head;

	if (available > 0)
		return available;

	/* read hardware producer index, check reserved register bits are not set. */
	if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
				      tail, (tail & ~queue->mask) == 0,
				      0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
		dev_err_once(queue->iommu->dev,
			     "Hardware error: queue access timeout\n");
		return 0;
	}

	/* Hardware tail did not move: nothing new was produced. */
	if (tail == last)
		return 0;

	/*
	 * update shadow producer index: advance it by the number of slots the
	 * hardware tail moved (modulo queue size) and report the new distance
	 * from the shadow head.
	 */
	return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
}
332 
333 /*
334  * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
335  */
336 static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
337 {
338 	const unsigned int head = atomic_add_return(count, &queue->head);
339 
340 	riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
341 }
342 
343 /* Return actual consumer index based on hardware reported queue head index. */
344 static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
345 {
346 	const unsigned int cons = atomic_read(&queue->head);
347 	const unsigned int last = Q_ITEM(queue, cons);
348 	unsigned int head;
349 
350 	if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
351 				      !(head & ~queue->mask),
352 				      0, RISCV_IOMMU_QUEUE_TIMEOUT))
353 		return cons;
354 
355 	return cons + ((head - last) & queue->mask);
356 }
357 
/*
 * Wait for submitted item to be processed.
 *
 * @queue      - queue the item was submitted to.
 * @index      - free-running index of the item, as returned by
 *               riscv_iommu_queue_send().
 * @timeout_us - maximum time to poll, in microseconds.
 *
 * The item counts as processed once the consumer index has moved past
 * @index; the comparison uses a signed difference so the free-running
 * counters may wrap.
 *
 * Returns 0 once processed, otherwise the readx_poll_timeout() error.
 */
static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
				  unsigned int index,
				  unsigned int timeout_us)
{
	unsigned int cons = atomic_read(&queue->head);

	/* Already processed by the consumer */
	if ((int)(cons - index) > 0)
		return 0;

	/* Monitor consumer index, re-reading the hardware head on every poll. */
	return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
				 (int)(cons - index) > 0, 0, timeout_us);
}
373 
/*
 * Enqueue an entry into a software-produced queue. This function does not
 * wait for the entry to be processed; see riscv_iommu_queue_wait().
 *
 * @queue      - destination queue.
 * @entry      - entry to copy into the ring buffer.
 * @entry_size - size of a single queue entry in bytes.
 *
 * Returns the free-running producer index allocated for the entry. The same
 * index is returned on the error path, where the entry may not have been
 * handed to the hardware.
 *
 * Error handling for IOMMU hardware not responding in reasonable time
 * will be added as separate patch series along with other RAS features.
 * For now, only report hardware failure and continue.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
					   void *entry, size_t entry_size)
{
	unsigned int prod;
	unsigned int head;
	unsigned int tail;
	unsigned long flags;

	/* Do not preempt submission flow. */
	local_irq_save(flags);

	/* 1. Allocate some space in the queue */
	prod = atomic_inc_return(&queue->prod) - 1;
	head = atomic_read(&queue->head);

	/*
	 * 2. Wait for space availability.
	 *    If the allocated index is beyond the ring, wait for the shadow head
	 *    to advance. If it lands on the last free slot, refresh the shadow
	 *    head from the hardware head register.
	 */
	if ((prod - head) > queue->mask) {
		if (readx_poll_timeout(atomic_read, &queue->head,
				       head, (prod - head) < queue->mask,
				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
	} else if ((prod - head) == queue->mask) {
		const unsigned int last = Q_ITEM(queue, head);

		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
					      !(head & ~queue->mask) && head != last,
					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
		atomic_add((head - last) & queue->mask, &queue->head);
	}

	/* 3. Store entry in the ring buffer */
	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);

	/*
	 * 4. Wait for all previous entries to be ready, i.e. until the shadow
	 *    tail reaches the index allocated in step 1.
	 */
	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
		goto err_busy;

	/*
	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
	 *    completed and visible before signaling the tail doorbell to fetch
	 *    the next command. 'fence ow, ow'
	 */
	dma_wmb();
	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));

	/*
	 * 6. Make sure the doorbell write to the device has finished before updating
	 *    the shadow tail index in normal memory. 'fence o, w'
	 */
	mmiowb();
	atomic_inc(&queue->tail);

	/* 7. Complete submission and restore local interrupts */
	local_irq_restore(flags);

	return prod;

err_busy:
	/* Timed out waiting for space or for earlier submitters; report once. */
	local_irq_restore(flags);
	dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");

	return prod;
}
445 
446 /*
447  * IOMMU Command queue chapter 3.1
448  */
449 
450 /* Command queue interrupt handler thread function */
451 static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
452 {
453 	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
454 	unsigned int ctrl;
455 
456 	/* Clear MF/CQ errors, complete error recovery to be implemented. */
457 	ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
458 	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
459 		    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
460 		riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
461 		dev_warn(queue->iommu->dev,
462 			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
463 			 queue->qid,
464 			 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
465 			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
466 			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
467 			 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
468 	}
469 
470 	/* Placeholder for command queue interrupt notifiers */
471 
472 	/* Clear command interrupt pending. */
473 	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
474 
475 	return IRQ_HANDLED;
476 }
477 
478 /* Send command to the IOMMU command queue */
479 static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
480 				 struct riscv_iommu_command *cmd)
481 {
482 	riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
483 }
484 
485 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */
486 static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
487 				 unsigned int timeout_us)
488 {
489 	struct riscv_iommu_command cmd;
490 	unsigned int prod;
491 
492 	riscv_iommu_cmd_iofence(&cmd);
493 	prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
494 
495 	if (!timeout_us)
496 		return;
497 
498 	if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
499 		dev_err_once(iommu->dev,
500 			     "Hardware error: command execution timeout\n");
501 }
502 
503 /*
504  * IOMMU Fault/Event queue chapter 3.2
505  */
506 
507 static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
508 			      struct riscv_iommu_fq_record *event)
509 {
510 	unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
511 	unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
512 
513 	/* Placeholder for future fault handling implementation, report only. */
514 	if (err)
515 		dev_warn_ratelimited(iommu->dev,
516 				     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
517 				     err, devid, event->iotval, event->iotval2);
518 }
519 
/*
 * Fault queue interrupt handler thread function.
 *
 * Clears the queue's interrupt pending bit, drains and reports every fault
 * record currently in the queue, then acknowledges and reports queue error
 * conditions. Always returns IRQ_HANDLED.
 */
static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
{
	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
	struct riscv_iommu_device *iommu = queue->iommu;
	struct riscv_iommu_fq_record *events;
	unsigned int ctrl, idx;
	int cnt, len;

	events = (struct riscv_iommu_fq_record *)queue->base;

	/* Clear fault interrupt pending and process all received fault events. */
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	/*
	 * Consume in batches until the queue reports empty. The release call
	 * also runs for the final, empty batch (cnt == 0).
	 */
	do {
		cnt = riscv_iommu_queue_consume(queue, &idx);
		for (len = 0; len < cnt; idx++, len++)
			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
		riscv_iommu_queue_release(queue, cnt);
	} while (cnt > 0);

	/* Clear MF/OF errors, complete error recovery to be implemented. */
	ctrl = riscv_iommu_readl(iommu, queue->qcr);
	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
		/* Write the value read back to the register to acknowledge. */
		riscv_iommu_writel(iommu, queue->qcr, ctrl);
		dev_warn(iommu->dev,
			 "Queue #%u error; memory fault:%d overflow:%d\n",
			 queue->qid,
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
	}

	return IRQ_HANDLED;
}
554 
/*
 * Lookup and initialize device context info structure.
 *
 * @iommu - IOMMU device whose device directory table (DDT) is walked.
 * @devid - device identifier to look up.
 *
 * Walks the DDT from iommu->ddt_root, allocating and installing missing
 * intermediate levels on the way.
 *
 * Returns a pointer to the device context slot for @devid, or NULL if the
 * DDT mode is not 1LVL..3LVL, @devid is out of range for the active mode,
 * or an intermediate level could not be allocated.
 */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
						 unsigned int devid)
{
	/* Base format device contexts are used unless MSI_FLAT is supported. */
	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
	unsigned int depth;
	unsigned long ddt, old, new;
	void *ptr;
	u8 ddi_bits[3] = { 0 };
	u64 *ddtp = NULL;

	/* Make sure the mode is valid */
	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
		return NULL;

	/*
	 * Device id partitioning for base format:
	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
	 *
	 * For extended format:
	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
	 *
	 * ddi_bits[n] holds the cumulative number of device id bits covered
	 * by levels 0..n, i.e. the shift that isolates DDI[n + 1].
	 */
	if (base_format) {
		ddi_bits[0] = 7;
		ddi_bits[1] = 7 + 9;
		ddi_bits[2] = 7 + 9 + 8;
	} else {
		ddi_bits[0] = 6;
		ddi_bits[1] = 6 + 9;
		ddi_bits[2] = 6 + 9 + 9;
	}

	/* Make sure device id is within range */
	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
	if (devid >= (1 << ddi_bits[depth]))
		return NULL;

	/* Walk the non-leaf levels down to the leaf table holding the device context */
	for (ddtp = iommu->ddt_root; depth-- > 0;) {
		const int split = ddi_bits[depth];
		/*
		 * Each non-leaf node is 64bits wide and on each level
		 * nodes are indexed by DDI[depth].
		 */
		ddtp += (devid >> split) & 0x1FF;

		/*
		 * Check if this node has been populated and if not
		 * allocate a new level and populate it.
		 */
		do {
			ddt = READ_ONCE(*(unsigned long *)ddtp);
			if (ddt & RISCV_IOMMU_DDTE_V) {
				/* Valid entry: follow it to the next level table. */
				ddtp = __va(ppn_to_phys(ddt));
				break;
			}

			ptr = riscv_iommu_get_pages(iommu, 0);
			if (!ptr)
				return NULL;

			/* Install the new table only if the entry is still unchanged. */
			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);

			if (old == ddt) {
				ddtp = (u64 *)ptr;
				break;
			}

			/* Race setting DDT detected, re-read and retry. */
			riscv_iommu_free_pages(iommu, ptr);
		} while (1);
	}

	/*
	 * Grab the node that matches DDI[depth], note that when using base
	 * format the device context is 4 * 64bits, and the extended format
	 * is 8 * 64bits, hence the (3 - base_format) below.
	 * The index mask selects 7 bits for base and 6 bits for extended format.
	 */
	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);

	return (struct riscv_iommu_dc *)ddtp;
}
643 
/*
 * This is best effort IOMMU translation shutdown flow.
 * Disable IOMMU without waiting for hardware response.
 *
 * Writes zero to the DDTP register first, then to the command, fault and
 * page-request queue control registers. No register is read back.
 */
static void riscv_iommu_disable(struct riscv_iommu_device *iommu)
{
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
}
655 
/*
 * Read the DDTP register, polling every 10us for up to
 * RISCV_IOMMU_DDTP_TIMEOUT until the BUSY bit clears.
 *
 * Evaluates to the last value read. The poll result is discarded, so callers
 * must test RISCV_IOMMU_DDTP_BUSY in the returned value to detect a timeout.
 */
#define riscv_iommu_read_ddtp(iommu) ({ \
	u64 ddtp; \
	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
				  RISCV_IOMMU_DDTP_TIMEOUT); \
	ddtp; })
662 
663 static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
664 {
665 	u64 ddtp;
666 	unsigned int mode;
667 
668 	ddtp = riscv_iommu_read_ddtp(iommu);
669 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
670 		return -EBUSY;
671 
672 	/*
673 	 * It is optional for the hardware to report a fixed address for device
674 	 * directory root page when DDT.MODE is OFF or BARE.
675 	 */
676 	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
677 	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
678 	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
679 		/* Use WARL to discover hardware fixed DDT PPN */
680 		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
681 				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
682 		ddtp = riscv_iommu_read_ddtp(iommu);
683 		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
684 			return -EBUSY;
685 
686 		iommu->ddt_phys = ppn_to_phys(ddtp);
687 		if (iommu->ddt_phys)
688 			iommu->ddt_root = devm_ioremap(iommu->dev,
689 						       iommu->ddt_phys, PAGE_SIZE);
690 		if (iommu->ddt_root)
691 			memset(iommu->ddt_root, 0, PAGE_SIZE);
692 	}
693 
694 	if (!iommu->ddt_root) {
695 		iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
696 		iommu->ddt_phys = __pa(iommu->ddt_root);
697 	}
698 
699 	if (!iommu->ddt_root)
700 		return -ENOMEM;
701 
702 	return 0;
703 }
704 
/*
 * Discover supported DDT modes starting from requested value,
 * configure DDTP register with accepted mode and root DDT address.
 * Accepted iommu->ddt_mode is updated on success.
 *
 * @iommu     - IOMMU device to configure.
 * @ddtp_mode - requested DDTP mode; for multi-level modes the number of
 *              levels is reduced step by step if the hardware rejects it.
 *
 * After a successful mode change the device context cache and the address
 * translation cache are invalidated and the command queue is synchronized.
 *
 * Returns 0 on success, -EBUSY if the DDTP register stays busy, -EINVAL for
 * a disallowed transition or when the hardware accepts none of the modes.
 */
static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
				      unsigned int ddtp_mode)
{
	struct device *dev = iommu->dev;
	u64 ddtp, rq_ddtp;
	unsigned int mode, rq_mode = ddtp_mode;
	struct riscv_iommu_command cmd;

	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/* Disallow state transition from xLVL to xLVL. */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
		return -EINVAL;

	do {
		/* The root table address is only programmed for xLVL modes. */
		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
		if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);

		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
				rq_mode, ddtp);
			return -EBUSY;
		}

		/* Verify IOMMU hardware accepts new DDTP config. */
		mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);

		if (rq_mode == mode)
			break;

		/* Hardware mandatory DDTP mode has not been accepted. */
		if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
				ddtp, rq_ddtp);
			return -EINVAL;
		}

		/*
		 * Mode field is WARL, an IOMMU may support a subset of
		 * directory table levels in which case if we tried to set
		 * an unsupported number of levels we'll readback either
		 * a valid xLVL or off/bare. If we got off/bare, try again
		 * with a smaller xLVL.
		 */
		if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
		    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
			rq_mode--;
			continue;
		}

		/*
		 * We tried all supported modes and IOMMU hardware failed to
		 * accept new settings, something went very wrong since off/bare
		 * and at least one xLVL must be supported.
		 */
		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
			mode, ddtp_mode);
		return -EINVAL;
	} while (1);

	iommu->ddt_mode = mode;
	if (mode != ddtp_mode)
		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);

	/* Invalidate device context cache */
	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* Invalidate address translation cache */
	riscv_iommu_cmd_inval_vma(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* IOFENCE.C */
	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	return 0;
}
797 
/*
 * This struct contains protection domain specific IOMMU driver data.
 *
 * The embedded struct iommu_domain is what the IOMMU core sees;
 * iommu_domain_to_riscv() converts it back to this structure.
 */
struct riscv_iommu_domain {
	struct iommu_domain domain;
	struct list_head bonds;		/* list of struct riscv_iommu_bond */
	spinlock_t lock;		/* protect bonds list updates. */
	int pscid;			/* PSCID used in invalidation commands */
	bool amo_enabled;
	int numa_node;
	unsigned int pgd_mode;
	unsigned long *pgd_root;
};

/* Convert a struct iommu_domain pointer to its enclosing riscv_iommu_domain. */
#define iommu_domain_to_riscv(iommu_domain) \
	container_of(iommu_domain, struct riscv_iommu_domain, domain)
812 
/*
 * Private IOMMU data for managed devices, dev_iommu_priv_*
 *
 * NOTE(review): @domain presumably tracks the protection domain the device
 * is currently attached to; the code updating it is outside this view.
 */
struct riscv_iommu_info {
	struct riscv_iommu_domain *domain;
};
817 
/*
 * Linkage between an iommu_domain and attached devices.
 *
 * Protection domain requiring IOATC and DevATC translation cache invalidations,
 * should be linked to attached devices using a riscv_iommu_bond structure.
 * Devices should be linked to the domain before first use and unlinked after
 * the translations from the referenced protection domain can no longer be used.
 * Blocking and identity domains are not tracked here, as the IOMMU hardware
 * does not cache negative and/or identity (BARE mode) translations, and DevATC
 * is disabled for those protection domains.
 *
 * The device pointer and IOMMU data remain stable in the bond struct after
 * _probe_device() where it's attached to the managed IOMMU, up to the
 * completion of the _release_device() call. The release of the bond structure
 * is synchronized with the device release.
 */
struct riscv_iommu_bond {
	struct list_head list;	/* entry in riscv_iommu_domain::bonds (RCU list) */
	struct rcu_head rcu;	/* used to free the bond with kfree_rcu() */
	struct device *dev;	/* device attached to the domain */
};
839 
840 static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
841 				 struct device *dev)
842 {
843 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
844 	struct riscv_iommu_bond *bond;
845 	struct list_head *bonds;
846 
847 	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
848 	if (!bond)
849 		return -ENOMEM;
850 	bond->dev = dev;
851 
852 	/*
853 	 * List of devices attached to the domain is arranged based on
854 	 * managed IOMMU device.
855 	 */
856 
857 	spin_lock(&domain->lock);
858 	list_for_each(bonds, &domain->bonds)
859 		if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
860 			break;
861 	list_add_rcu(&bond->list, bonds);
862 	spin_unlock(&domain->lock);
863 
864 	/* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
865 	smp_mb();
866 
867 	return 0;
868 }
869 
/*
 * Unlink @dev from @domain: remove and free its bond and, if no other device
 * behind the same IOMMU remains linked to the domain, invalidate the IOMMU's
 * cached translations for the domain's PSCID. A NULL @domain is a no-op.
 */
static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
				    struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_bond *bond, *found = NULL;
	struct riscv_iommu_command cmd;
	int count = 0;

	if (!domain)
		return;

	/*
	 * Locate the bond for @dev and count other bonds sharing its IOMMU.
	 * The scan stops as soon as both the bond and at least one such
	 * sibling have been seen.
	 */
	spin_lock(&domain->lock);
	list_for_each_entry(bond, &domain->bonds, list) {
		if (found && count)
			break;
		else if (bond->dev == dev)
			found = bond;
		else if (dev_to_iommu(bond->dev) == iommu)
			count++;
	}
	if (found)
		list_del_rcu(&found->list);
	spin_unlock(&domain->lock);
	/*
	 * NOTE(review): found is NULL here when @dev was never linked; this
	 * relies on kfree_rcu() tolerating a NULL pointer - confirm.
	 */
	kfree_rcu(found, rcu);

	/*
	 * If this was the last bond between this domain and the IOMMU
	 * invalidate all cached entries for domain's PSCID.
	 */
	if (!count) {
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		riscv_iommu_cmd_send(iommu, &cmd);

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
	}
}
907 
/*
 * Send IOTLB.INVAL for the whole address space for ranges of 2MB or larger.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations is available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)

/*
 * Invalidate cached translations of @domain for the IOVA range
 * [@start, @end] on every IOMMU that has a device linked to the domain.
 *
 * Ranges shorter than RISCV_IOMMU_IOTLB_INVAL_LIMIT are invalidated one
 * PAGE_SIZE step at a time; anything else (including a range whose length
 * computation wraps to zero) invalidates the whole PSCID. All commands are
 * queued first, then each IOMMU is synchronized with an IOFENCE.C.
 */
static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
				    unsigned long start, unsigned long end)
{
	struct riscv_iommu_bond *bond;
	struct riscv_iommu_device *iommu, *prev;
	struct riscv_iommu_command cmd;
	unsigned long len = end - start + 1;
	unsigned long iova;

	/*
	 * For each IOMMU linked with this protection domain (via bonds->dev),
	 * an IOTLB invalidation command will be submitted and executed.
	 *
	 * Possible race with domain attach flow is handled by sequencing
	 * bond creation - riscv_iommu_bond_link(), and device directory
	 * update - riscv_iommu_iodir_update().
	 *
	 * PTE Update / IOTLB Inval           Device attach & directory update
	 * --------------------------         --------------------------
	 * update page table entries          add dev to the bond list
	 * FENCE RW,RW                        FENCE RW,RW
	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
	 *   FENCE IOW,IOW                      FENCE IOW,IOW
	 *   IOTLB.INVAL                        IODIR.INVAL
	 *   IOFENCE.C
	 *
	 * If bond list is not updated with new device, directory context will
	 * be configured with already valid page table content. If an IOMMU is
	 * linked to the protection domain it will receive invalidation
	 * requests for updated page table entries.
	 */
	smp_mb();

	rcu_read_lock();

	/* First pass: queue invalidation commands, once per IOMMU. */
	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);

		/*
		 * IOTLB invalidation request can be safely omitted if already sent
		 * to the IOMMU for the same PSCID, and with domain->bonds list
		 * arranged based on the device's IOMMU, it's sufficient to check
		 * last device the invalidation was sent to.
		 */
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
			for (iova = start; iova < end; iova += PAGE_SIZE) {
				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
				riscv_iommu_cmd_send(iommu, &cmd);
			}
		} else {
			riscv_iommu_cmd_send(iommu, &cmd);
		}
		prev = iommu;
	}

	/* Second pass: wait for the queued commands, once per IOMMU. */
	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
		prev = iommu;
	}
	rcu_read_unlock();
}
988 
/* FSC value the blocking and identity domains pass to riscv_iommu_iodir_update(). */
#define RISCV_IOMMU_FSC_BARE 0
990 
991 /*
992  * Update IODIR for the device.
993  *
994  * During the execution of riscv_iommu_probe_device(), IODIR entries are
995  * allocated for the device's identifiers.  Device context invalidation
996  * becomes necessary only if one of the updated entries was previously
997  * marked as valid, given that invalid device context entries are not
998  * cached by the IOMMU hardware.
999  * In this implementation, updating a valid device context while the
1000  * device is not quiesced might be disruptive, potentially causing
1001  * interim translation faults.
1002  */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
				     struct device *dev, u64 fsc, u64 ta)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_dc *dc;
	struct riscv_iommu_command cmd;
	bool sync_required = false;
	u64 tc;
	int i;

	/*
	 * Pass 1: clear the valid bit of every device context that is
	 * currently marked valid and queue a directory invalidation for it.
	 * Contexts that are not valid are left untouched here.
	 */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		if (!(tc & RISCV_IOMMU_DC_TC_V))
			continue;

		WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);

		/* Invalidate device context cached values */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
		sync_required = true;
	}

	/* Sync only if at least one invalidation was queued in pass 1. */
	if (sync_required)
		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	/*
	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
	 */
	/*
	 * Pass 2: program the new FSC and the PSCID field of @ta, then write
	 * TC with the valid bit taken from @ta. Only BIT(0) and the PSCID
	 * field of @ta are consumed.
	 */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		tc |= ta & RISCV_IOMMU_DC_TC_V;

		WRITE_ONCE(dc->fsc, fsc);
		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
		/* Update device context, write TC.V as the last step. */
		dma_wmb();
		WRITE_ONCE(dc->tc, tc);

		/* Invalidate device context after update */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
	}

	/* Unconditional: pass 2 always queues one command per device ID. */
	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
}
1054 
1055 /*
1056  * IOVA page translation tree management.
1057  */
1058 
1059 static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
1060 {
1061 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1062 
1063 	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1064 }
1065 
1066 static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
1067 				   struct iommu_iotlb_gather *gather)
1068 {
1069 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1070 
1071 	riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
1072 }
1073 
/* IOVA bits resolved per page table level: log2 of the pte_t entries in one page. */
#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))

/* Non-zero when the entry has _PAGE_PRESENT or _PAGE_PROT_NONE set. */
#define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
/* Non-zero when the entry has any of the _PAGE_LEAF bits set. */
#define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
/* True when the entry is completely empty (all bits zero). */
#define _io_pte_none(pte)	((pte) == 0)
/* Build an entry from page frame number @pn and attribute bits @prot. */
#define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
1080 
1081 static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
1082 				 unsigned long pte, struct list_head *freelist)
1083 {
1084 	unsigned long *ptr;
1085 	int i;
1086 
1087 	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
1088 		return;
1089 
1090 	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1091 
1092 	/* Recursively free all sub page table pages */
1093 	for (i = 0; i < PTRS_PER_PTE; i++) {
1094 		pte = READ_ONCE(ptr[i]);
1095 		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
1096 			riscv_iommu_pte_free(domain, pte, freelist);
1097 	}
1098 
1099 	if (freelist)
1100 		list_add_tail(&virt_to_page(ptr)->lru, freelist);
1101 	else
1102 		iommu_free_page(ptr);
1103 }
1104 
/*
 * Walk the page table from the root down to the level whose entries map
 * @pgsize bytes, installing missing intermediate tables on the way.
 *
 * Return: pointer to the entry slot for @iova at that level, or NULL when an
 * existing leaf mapping is found at a higher level, a page table page cannot
 * be allocated, or no level matches @pgsize.
 */
static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
					    unsigned long iova, size_t pgsize,
					    gfp_t gfp)
{
	unsigned long *ptr = domain->pgd_root;
	unsigned long pte, old;
	/* Index of the top level: 2 for SV39, one more per larger mode. */
	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
	void *addr;

	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		/*
		 * Note: returned entry might be a non-leaf if there was
		 * existing mapping with smaller granularity. Up to the caller
		 * to replace and invalidate.
		 */
		if (((size_t)1 << shift) == pgsize)
			return ptr;
pte_retry:
		pte = READ_ONCE(*ptr);
		/*
		 * This is very likely incorrect as we should not be adding
		 * new mapping with smaller granularity on top
		 * of existing 2M/1G mapping. Fail.
		 */
		if (_io_pte_present(pte) && _io_pte_leaf(pte))
			return NULL;
		/*
		 * Non-leaf entry is missing, allocate and try to add to the
		 * page table. This might race with other mappings, retry.
		 */
		if (_io_pte_none(pte)) {
			addr = iommu_alloc_page_node(domain->numa_node, gfp);
			if (!addr)
				return NULL;
			old = pte;
			pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
			/* Lost the race: drop our page and re-read the slot. */
			if (cmpxchg_relaxed(ptr, old, pte) != old) {
				iommu_free_page(addr);
				goto pte_retry;
			}
		}
		/* Descend into the next-level table referenced by the entry. */
		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
	} while (level-- > 0);

	return NULL;
}
1154 
1155 static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
1156 					    unsigned long iova, size_t *pte_pgsize)
1157 {
1158 	unsigned long *ptr = domain->pgd_root;
1159 	unsigned long pte;
1160 	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
1161 
1162 	do {
1163 		const int shift = PAGE_SHIFT + PT_SHIFT * level;
1164 
1165 		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
1166 		pte = READ_ONCE(*ptr);
1167 		if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
1168 			*pte_pgsize = (size_t)1 << shift;
1169 			return ptr;
1170 		}
1171 		if (_io_pte_none(pte))
1172 			return NULL;
1173 		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
1174 	} while (level-- > 0);
1175 
1176 	return NULL;
1177 }
1178 
1179 static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
1180 				 unsigned long iova, phys_addr_t phys,
1181 				 size_t pgsize, size_t pgcount, int prot,
1182 				 gfp_t gfp, size_t *mapped)
1183 {
1184 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1185 	size_t size = 0;
1186 	unsigned long *ptr;
1187 	unsigned long pte, old, pte_prot;
1188 	int rc = 0;
1189 	LIST_HEAD(freelist);
1190 
1191 	if (!(prot & IOMMU_WRITE))
1192 		pte_prot = _PAGE_BASE | _PAGE_READ;
1193 	else if (domain->amo_enabled)
1194 		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
1195 	else
1196 		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
1197 
1198 	while (pgcount) {
1199 		ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
1200 		if (!ptr) {
1201 			rc = -ENOMEM;
1202 			break;
1203 		}
1204 
1205 		old = READ_ONCE(*ptr);
1206 		pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
1207 		if (cmpxchg_relaxed(ptr, old, pte) != old)
1208 			continue;
1209 
1210 		riscv_iommu_pte_free(domain, old, &freelist);
1211 
1212 		size += pgsize;
1213 		iova += pgsize;
1214 		phys += pgsize;
1215 		--pgcount;
1216 	}
1217 
1218 	*mapped = size;
1219 
1220 	if (!list_empty(&freelist)) {
1221 		/*
1222 		 * In 1.0 spec version, the smallest scope we can use to
1223 		 * invalidate all levels of page table (i.e. leaf and non-leaf)
1224 		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
1225 		 * This will be updated with hardware support for
1226 		 * capability.NL (non-leaf) IOTINVAL command.
1227 		 */
1228 		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1229 		iommu_put_pages_list(&freelist);
1230 	}
1231 
1232 	return rc;
1233 }
1234 
1235 static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
1236 				      unsigned long iova, size_t pgsize,
1237 				      size_t pgcount,
1238 				      struct iommu_iotlb_gather *gather)
1239 {
1240 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1241 	size_t size = pgcount << __ffs(pgsize);
1242 	unsigned long *ptr, old;
1243 	size_t unmapped = 0;
1244 	size_t pte_size;
1245 
1246 	while (unmapped < size) {
1247 		ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1248 		if (!ptr)
1249 			return unmapped;
1250 
1251 		/* partial unmap is not allowed, fail. */
1252 		if (iova & (pte_size - 1))
1253 			return unmapped;
1254 
1255 		old = READ_ONCE(*ptr);
1256 		if (cmpxchg_relaxed(ptr, old, 0) != old)
1257 			continue;
1258 
1259 		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
1260 					    pte_size);
1261 
1262 		iova += pte_size;
1263 		unmapped += pte_size;
1264 	}
1265 
1266 	return unmapped;
1267 }
1268 
1269 static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
1270 					    dma_addr_t iova)
1271 {
1272 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1273 	unsigned long pte_size;
1274 	unsigned long *ptr;
1275 
1276 	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
1277 	if (_io_pte_none(*ptr) || !_io_pte_present(*ptr))
1278 		return 0;
1279 
1280 	return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
1281 }
1282 
1283 static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
1284 {
1285 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1286 	const unsigned long pfn = virt_to_pfn(domain->pgd_root);
1287 
1288 	WARN_ON(!list_empty(&domain->bonds));
1289 
1290 	if ((int)domain->pscid > 0)
1291 		ida_free(&riscv_iommu_pscids, domain->pscid);
1292 
1293 	riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
1294 	kfree(domain);
1295 }
1296 
1297 static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
1298 {
1299 	switch (pgd_mode) {
1300 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
1301 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
1302 
1303 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
1304 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
1305 
1306 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
1307 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
1308 	}
1309 	return false;
1310 }
1311 
1312 static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
1313 					    struct device *dev)
1314 {
1315 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1316 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1317 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1318 	u64 fsc, ta;
1319 
1320 	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
1321 		return -ENODEV;
1322 
1323 	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
1324 	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
1325 	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
1326 	     RISCV_IOMMU_PC_TA_V;
1327 
1328 	if (riscv_iommu_bond_link(domain, dev))
1329 		return -ENOMEM;
1330 
1331 	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
1332 	riscv_iommu_bond_unlink(info->domain, dev);
1333 	info->domain = domain;
1334 
1335 	return 0;
1336 }
1337 
/* Callbacks installed on every domain created by riscv_iommu_alloc_paging_domain(). */
static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
	.attach_dev = riscv_iommu_attach_paging_domain,
	.free = riscv_iommu_free_paging_domain,
	.map_pages = riscv_iommu_map_pages,
	.unmap_pages = riscv_iommu_unmap_pages,
	.iova_to_phys = riscv_iommu_iova_to_phys,
	.iotlb_sync = riscv_iommu_iotlb_sync,
	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
};
1347 
1348 static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
1349 {
1350 	struct riscv_iommu_domain *domain;
1351 	struct riscv_iommu_device *iommu;
1352 	unsigned int pgd_mode;
1353 	dma_addr_t va_mask;
1354 	int va_bits;
1355 
1356 	iommu = dev_to_iommu(dev);
1357 	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
1358 		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
1359 		va_bits = 57;
1360 	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
1361 		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
1362 		va_bits = 48;
1363 	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
1364 		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
1365 		va_bits = 39;
1366 	} else {
1367 		dev_err(dev, "cannot find supported page table mode\n");
1368 		return ERR_PTR(-ENODEV);
1369 	}
1370 
1371 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1372 	if (!domain)
1373 		return ERR_PTR(-ENOMEM);
1374 
1375 	INIT_LIST_HEAD_RCU(&domain->bonds);
1376 	spin_lock_init(&domain->lock);
1377 	domain->numa_node = dev_to_node(iommu->dev);
1378 	domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
1379 	domain->pgd_mode = pgd_mode;
1380 	domain->pgd_root = iommu_alloc_page_node(domain->numa_node,
1381 						 GFP_KERNEL_ACCOUNT);
1382 	if (!domain->pgd_root) {
1383 		kfree(domain);
1384 		return ERR_PTR(-ENOMEM);
1385 	}
1386 
1387 	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
1388 					RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
1389 	if (domain->pscid < 0) {
1390 		iommu_free_page(domain->pgd_root);
1391 		kfree(domain);
1392 		return ERR_PTR(-ENOMEM);
1393 	}
1394 
1395 	/*
1396 	 * Note: RISC-V Privilege spec mandates that virtual addresses
1397 	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
1398 	 * bits >= VA_BITS need to also be set or else we'll get a
1399 	 * page fault. However the code that creates the mappings
1400 	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
1401 	 * for now, so we'll end up with invalid virtual addresses
1402 	 * to map. As a workaround until we get this sorted out
1403 	 * limit the available virtual addresses to VA_BITS - 1.
1404 	 */
1405 	va_mask = DMA_BIT_MASK(va_bits - 1);
1406 
1407 	domain->domain.geometry.aperture_start = 0;
1408 	domain->domain.geometry.aperture_end = va_mask;
1409 	domain->domain.geometry.force_aperture = true;
1410 	domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
1411 
1412 	domain->domain.ops = &riscv_iommu_paging_domain_ops;
1413 
1414 	return &domain->domain;
1415 }
1416 
1417 static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
1418 					      struct device *dev)
1419 {
1420 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1421 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1422 
1423 	/* Make device context invalid, translation requests will fault w/ #258 */
1424 	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
1425 	riscv_iommu_bond_unlink(info->domain, dev);
1426 	info->domain = NULL;
1427 
1428 	return 0;
1429 }
1430 
/* Statically allocated blocking domain; attach_dev is its only callback. */
static struct iommu_domain riscv_iommu_blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_blocking_domain,
	}
};
1437 
1438 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
1439 					      struct device *dev)
1440 {
1441 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1442 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1443 
1444 	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
1445 	riscv_iommu_bond_unlink(info->domain, dev);
1446 	info->domain = NULL;
1447 
1448 	return 0;
1449 }
1450 
/* Statically allocated identity domain; attach_dev is its only callback. */
static struct iommu_domain riscv_iommu_identity_domain = {
	.type = IOMMU_DOMAIN_IDENTITY,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_identity_domain,
	}
};
1457 
/* Pick the IOMMU group: PCI devices use the PCI helper, all others the generic one. */
static struct iommu_group *riscv_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) : generic_device_group(dev);
}
1464 
1465 static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
1466 {
1467 	return iommu_fwspec_add_ids(dev, args->args, 1);
1468 }
1469 
1470 static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
1471 {
1472 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1473 	struct riscv_iommu_device *iommu;
1474 	struct riscv_iommu_info *info;
1475 	struct riscv_iommu_dc *dc;
1476 	u64 tc;
1477 	int i;
1478 
1479 	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
1480 		return ERR_PTR(-ENODEV);
1481 
1482 	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
1483 	if (!iommu)
1484 		return ERR_PTR(-ENODEV);
1485 
1486 	/*
1487 	 * IOMMU hardware operating in fail-over BARE mode will provide
1488 	 * identity translation for all connected devices anyway...
1489 	 */
1490 	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
1491 		return ERR_PTR(-ENODEV);
1492 
1493 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1494 	if (!info)
1495 		return ERR_PTR(-ENOMEM);
1496 	/*
1497 	 * Allocate and pre-configure device context entries in
1498 	 * the device directory. Do not mark the context valid yet.
1499 	 */
1500 	tc = 0;
1501 	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
1502 		tc |= RISCV_IOMMU_DC_TC_SADE;
1503 	for (i = 0; i < fwspec->num_ids; i++) {
1504 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1505 		if (!dc) {
1506 			kfree(info);
1507 			return ERR_PTR(-ENODEV);
1508 		}
1509 		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
1510 			dev_warn(dev, "already attached to IOMMU device directory\n");
1511 		WRITE_ONCE(dc->tc, tc);
1512 	}
1513 
1514 	dev_iommu_priv_set(dev, info);
1515 
1516 	return &iommu->iommu;
1517 }
1518 
/* Free the per-device info set up by riscv_iommu_probe_device(), RCU-deferred. */
static void riscv_iommu_release_device(struct device *dev)
{
	struct riscv_iommu_info *priv;

	priv = dev_iommu_priv_get(dev);
	kfree_rcu_mightsleep(priv);
}
1525 
/*
 * Driver-level IOMMU operations, registered by riscv_iommu_init() through
 * iommu_device_register(). The blocking domain is also used as the release
 * domain.
 */
static const struct iommu_ops riscv_iommu_ops = {
	.pgsize_bitmap = SZ_4K,
	.of_xlate = riscv_iommu_of_xlate,
	.identity_domain = &riscv_iommu_identity_domain,
	.blocked_domain = &riscv_iommu_blocking_domain,
	.release_domain = &riscv_iommu_blocking_domain,
	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
	.device_group = riscv_iommu_device_group,
	.probe_device = riscv_iommu_probe_device,
	.release_device	= riscv_iommu_release_device,
};
1537 
/*
 * Validate and normalize the hardware state before the driver takes over.
 *
 * Return: 0 on success; -EBUSY when DDTP reports busy or translation is
 * already enabled outside of a kdump kernel; -EINVAL when the required byte
 * order cannot be configured, no interrupts are available, or the interrupt
 * vector assignment read back from the hardware is out of range.
 */
static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
{
	u64 ddtp;

	/*
	 * Make sure the IOMMU is switched off or in pass-through mode during
	 * regular boot flow and disable translation when we boot into a kexec
	 * kernel and the previous kernel left them enabled.
	 */
	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
	     RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
		if (!is_kdump_kernel())
			return -EBUSY;
		riscv_iommu_disable(iommu);
	}

	/* Configure accesses to in-memory data structures for CPU-native byte order. */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
		/* Endianness can only be switched when the END capability is set. */
		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
			return -EINVAL;
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
		/* Read back to verify that the hardware accepted the new setting. */
		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
			return -EINVAL;
	}

	/*
	 * Distribute interrupt vectors, always use first vector for CIV.
	 * At least one interrupt is required. Read back and verify.
	 */
	if (!iommu->irqs_count)
		return -EINVAL;

	/* The modulo folds FIV/PIV/PMIV onto the vectors that actually exist. */
	iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
	iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
	/* Reject the setup if any vector index read back is not below irqs_count. */
	if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
		max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
		return -EINVAL;

	return 0;
}
1591 
/*
 * Tear down an IOMMU instance: unregister it from the IOMMU core and from
 * sysfs, switch the device directory mode to OFF, then disable the command
 * and fault queues.
 */
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
{
	iommu_device_unregister(&iommu->iommu);
	iommu_device_sysfs_remove(&iommu->iommu);
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
	riscv_iommu_queue_disable(&iommu->cmdq);
	riscv_iommu_queue_disable(&iommu->fltq);
}
1600 
/*
 * Bring up an IOMMU instance.
 *
 * Steps, in order: initialize the queue descriptors, check the hardware
 * state, allocate the device directory and both queues, enable the command
 * and fault queues, set the directory mode, then register with sysfs and the
 * IOMMU core.
 *
 * Return: 0 on success or the negative error code of the step that failed.
 * Failures after the command queue has been enabled unwind through the
 * labels at the end of the function.
 */
int riscv_iommu_init(struct riscv_iommu_device *iommu)
{
	int rc;

	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);

	rc = riscv_iommu_init_check(iommu);
	if (rc)
		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");

	/* NOTE(review): the early returns below skip explicit cleanup; presumably these allocations are device-managed - confirm. */
	rc = riscv_iommu_iodir_alloc(iommu);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
				     sizeof(struct riscv_iommu_command));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
				     sizeof(struct riscv_iommu_fq_record));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
	if (rc)
		return rc;

	/* From here on the command queue is enabled and must be disabled on error. */
	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
	if (rc)
		goto err_queue_disable;

	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
	if (rc)
		goto err_queue_disable;

	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
				    dev_name(iommu->dev));
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
		goto err_iodir_off;
	}

	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
		goto err_remove_sysfs;
	}

	return 0;

	/* Unwind in reverse order of the setup steps above. */
err_remove_sysfs:
	iommu_device_sysfs_remove(&iommu->iommu);
err_iodir_off:
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
err_queue_disable:
	riscv_iommu_queue_disable(&iommu->fltq);
	riscv_iommu_queue_disable(&iommu->cmdq);
	return rc;
}
1662