xref: /linux/drivers/iommu/riscv/iommu.c (revision fbf5df34a4dbcd09d433dd4f0916bf9b2ddb16de)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * IOMMU API for RISC-V IOMMU implementations.
4  *
5  * Copyright © 2022-2024 Rivos Inc.
6  * Copyright © 2023 FORTH-ICS/CARV
7  *
8  * Authors
9  *	Tomasz Jeznach <tjeznach@rivosinc.com>
10  *	Nick Kossifidis <mick@ics.forth.gr>
11  */
12 
13 #define pr_fmt(fmt) "riscv-iommu: " fmt
14 
15 #include <linux/acpi.h>
16 #include <linux/acpi_rimt.h>
17 #include <linux/compiler.h>
18 #include <linux/crash_dump.h>
19 #include <linux/init.h>
20 #include <linux/iommu.h>
21 #include <linux/iopoll.h>
22 #include <linux/kernel.h>
23 #include <linux/pci.h>
24 #include <linux/generic_pt/iommu.h>
25 
26 #include "../iommu-pages.h"
27 #include "iommu-bits.h"
28 #include "iommu.h"
29 
/*
 * Polling timeouts, all in microseconds [us]:
 *  QCSR     - queue control register dropping its BUSY bit
 *  QUEUE    - queue head/tail index polling and ring space availability
 *  DDTP     - DDTP register dropping its BUSY bit
 *  IOTINVAL - completion wait passed to riscv_iommu_cmd_sync() after
 *             invalidation commands
 */
#define RISCV_IOMMU_QCSR_TIMEOUT	150000
#define RISCV_IOMMU_QUEUE_TIMEOUT	150000
#define RISCV_IOMMU_DDTP_TIMEOUT	10000000
#define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000

/*
 * Default number of entries per CMD/FLT queue, should be <= INT_MAX.
 * RISCV_IOMMU_QUEUE_INIT() stores (count - 1) as the initial queue mask;
 * riscv_iommu_queue_alloc() may shrink it to what the hardware supports.
 */
#define RISCV_IOMMU_DEF_CQ_COUNT	8192
#define RISCV_IOMMU_DEF_FQ_COUNT	4096
39 
/*
 * RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10].
 *
 * The 44-bit physical page number (physical address bits [55:12]) is kept
 * in bits [53:10] of the encoded value. Each macro first extracts the page
 * number and then places it at the target bit position; all other bits of
 * the input are dropped.
 */
#define phys_to_ppn(pa)  ((((pa) >> 12) & ((1ULL << 44) - 1)) << 10)
#define ppn_to_phys(pn)	 ((((pn) >> 10) & ((1ULL << 44) - 1)) << 12)
43 
/*
 * Look up the struct riscv_iommu_device (through its embedded 'iommu'
 * member) associated with @dev, using iommu_get_iommu_dev().
 */
#define dev_to_iommu(dev) \
	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)

/* IOMMU PSCID allocation namespace. */
static DEFINE_IDA(riscv_iommu_pscids);
/* Largest PSCID value, i.e. a 20-bit identifier space. */
#define RISCV_IOMMU_MAX_PSCID		(BIT(20) - 1)
50 
/*
 * Device resource-managed allocations.
 *
 * One instance is registered with devres for every allocation made by
 * riscv_iommu_get_pages(), so the pages are released with the device.
 */
struct riscv_iommu_devres {
	void *addr;	/* pages returned by iommu_alloc_pages_node_sz() */
};
55 
56 static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
57 {
58 	struct riscv_iommu_devres *devres = res;
59 
60 	iommu_free_pages(devres->addr);
61 }
62 
63 static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
64 {
65 	struct riscv_iommu_devres *devres = res;
66 	struct riscv_iommu_devres *target = p;
67 
68 	return devres->addr == target->addr;
69 }
70 
71 static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu,
72 				   unsigned int size)
73 {
74 	struct riscv_iommu_devres *devres;
75 	void *addr;
76 
77 	addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev),
78 					 GFP_KERNEL_ACCOUNT, size);
79 	if (unlikely(!addr))
80 		return NULL;
81 
82 	devres = devres_alloc(riscv_iommu_devres_pages_release,
83 			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);
84 
85 	if (unlikely(!devres)) {
86 		iommu_free_pages(addr);
87 		return NULL;
88 	}
89 
90 	devres->addr = addr;
91 
92 	devres_add(iommu->dev, devres);
93 
94 	return addr;
95 }
96 
97 static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
98 {
99 	struct riscv_iommu_devres devres = { .addr = addr };
100 
101 	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
102 		       riscv_iommu_devres_pages_match, &devres);
103 }
104 
105 /*
106  * Hardware queue allocation and management.
107  */
108 
/*
 * Setup queue base, control registers and default queue length.
 *
 * @q    - pointer to the struct riscv_iommu_queue to initialize
 * @name - queue name token (e.g. CQ, FQ); it is pasted into
 *         RISCV_IOMMU_INTR_<name>, RISCV_IOMMU_REG_<name>B,
 *         RISCV_IOMMU_REG_<name>CSR and RISCV_IOMMU_DEF_<name>_COUNT
 *
 * A non-zero mask already stored in the queue is preserved, otherwise the
 * default entry count minus one is used.
 */
#define RISCV_IOMMU_QUEUE_INIT(q, name) do {				\
	struct riscv_iommu_queue *_q = q;				\
	_q->qid = RISCV_IOMMU_INTR_ ## name;				\
	_q->qbr = RISCV_IOMMU_REG_ ## name ## B;			\
	_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;			\
	_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
} while (0)
117 
/*
 * Queue register and index helpers.
 * Note: offsets are the same for all queues, so the head/tail register
 * offsets are derived from the command queue layout relative to the base
 * register of the given queue.
 */
#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
/* Ring buffer slot selected by an index, i.e. the index masked by the queue mask. */
#define Q_ITEM(q, index) ((q)->mask & (index))
/* Bit of the queue (by its qid) in the interrupt pending register, IPSR. */
#define Q_IPSR(q) BIT((q)->qid)
123 
124 /*
125  * Discover queue ring buffer hardware configuration, allocate in-memory
126  * ring buffer or use fixed I/O memory location, configure queue base register.
127  * Must be called before hardware queue is enabled.
128  *
129  * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
130  * @entry_size - queue single element size in bytes.
131  */
132 static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
133 				   struct riscv_iommu_queue *queue,
134 				   size_t entry_size)
135 {
136 	unsigned int logsz;
137 	u64 qb, rb;
138 
139 	/*
140 	 * Use WARL base register property to discover maximum allowed
141 	 * number of entries and optional fixed IO address for queue location.
142 	 */
143 	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
144 	qb = riscv_iommu_readq(iommu, queue->qbr);
145 
146 	/*
147 	 * Calculate and verify hardware supported queue length, as reported
148 	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
149 	 * Update queue size based on hardware supported value.
150 	 */
151 	logsz = ilog2(queue->mask);
152 	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
153 		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
154 
155 	/*
156 	 * Use WARL base register property to discover an optional fixed IO
157 	 * address for queue ring buffer location. Otherwise allocate contiguous
158 	 * system memory.
159 	 */
160 	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
161 		const size_t queue_size = entry_size << (logsz + 1);
162 
163 		queue->phys = PFN_PHYS(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
164 		queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
165 	} else {
166 		do {
167 			const size_t queue_size = entry_size << (logsz + 1);
168 
169 			queue->base = riscv_iommu_get_pages(
170 				iommu, max(queue_size, SZ_4K));
171 			queue->phys = __pa(queue->base);
172 		} while (!queue->base && logsz-- > 0);
173 	}
174 
175 	if (!queue->base)
176 		return -ENOMEM;
177 
178 	qb = phys_to_ppn(queue->phys) |
179 	     FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
180 
181 	/* Update base register and read back to verify hw accepted our write */
182 	riscv_iommu_writeq(iommu, queue->qbr, qb);
183 	rb = riscv_iommu_readq(iommu, queue->qbr);
184 	if (rb != qb) {
185 		dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
186 		return -ENODEV;
187 	}
188 
189 	/* Update actual queue mask */
190 	queue->mask = (2U << logsz) - 1;
191 
192 	dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
193 		queue->qid, logsz + 1);
194 
195 	return 0;
196 }
197 
198 /* Check interrupt queue status, IPSR */
199 static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
200 {
201 	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
202 
203 	if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
204 		return IRQ_WAKE_THREAD;
205 
206 	return IRQ_NONE;
207 }
208 
209 static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
210 {
211 	/* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */
212 	return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
213 }
214 
215 /*
216  * Enable queue processing in the hardware, register interrupt handler.
217  *
218  * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
219  * @irq_handler - threaded interrupt handler.
220  */
221 static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
222 				    struct riscv_iommu_queue *queue,
223 				    irq_handler_t irq_handler)
224 {
225 	const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
226 	u32 csr;
227 	int rc;
228 
229 	if (queue->iommu)
230 		return -EBUSY;
231 
232 	/* Polling not implemented */
233 	if (!irq)
234 		return -ENODEV;
235 
236 	queue->iommu = iommu;
237 	rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
238 				  IRQF_ONESHOT | IRQF_SHARED,
239 				  dev_name(iommu->dev), queue);
240 	if (rc) {
241 		queue->iommu = NULL;
242 		return rc;
243 	}
244 
245 	/* Empty queue before enabling it */
246 	if (queue->qid == RISCV_IOMMU_INTR_CQ)
247 		riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
248 	else
249 		riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);
250 
251 	/*
252 	 * Enable queue with interrupts, clear any memory fault if any.
253 	 * Wait for the hardware to acknowledge request and activate queue
254 	 * processing.
255 	 * Note: All CSR bitfields are in the same offsets for all queues.
256 	 */
257 	riscv_iommu_writel(iommu, queue->qcr,
258 			   RISCV_IOMMU_QUEUE_ENABLE |
259 			   RISCV_IOMMU_QUEUE_INTR_ENABLE |
260 			   RISCV_IOMMU_QUEUE_MEM_FAULT);
261 
262 	riscv_iommu_readl_timeout(iommu, queue->qcr,
263 				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
264 				  10, RISCV_IOMMU_QCSR_TIMEOUT);
265 
266 	if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
267 						RISCV_IOMMU_QUEUE_BUSY |
268 						RISCV_IOMMU_QUEUE_MEM_FAULT))) {
269 		/* Best effort to stop and disable failing hardware queue. */
270 		riscv_iommu_writel(iommu, queue->qcr, 0);
271 		free_irq(irq, queue);
272 		queue->iommu = NULL;
273 		dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
274 		return -EBUSY;
275 	}
276 
277 	/* Clear any pending interrupt flag. */
278 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
279 
280 	return 0;
281 }
282 
283 /*
284  * Disable queue. Wait for the hardware to acknowledge request and
285  * stop processing enqueued requests. Report errors but continue.
286  */
287 static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
288 {
289 	struct riscv_iommu_device *iommu = queue->iommu;
290 	u32 csr;
291 
292 	if (!iommu)
293 		return;
294 
295 	free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
296 	riscv_iommu_writel(iommu, queue->qcr, 0);
297 	riscv_iommu_readl_timeout(iommu, queue->qcr,
298 				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
299 				  10, RISCV_IOMMU_QCSR_TIMEOUT);
300 
301 	if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
302 		dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
303 			queue->qid, csr);
304 
305 	queue->iommu = NULL;
306 }
307 
308 /*
309  * Returns number of available valid queue entries and the first item index.
310  * Update shadow producer index if necessary.
311  */
312 static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
313 				     unsigned int *index)
314 {
315 	unsigned int head = atomic_read(&queue->head);
316 	unsigned int tail = atomic_read(&queue->tail);
317 	unsigned int last = Q_ITEM(queue, tail);
318 	int available = (int)(tail - head);
319 
320 	*index = head;
321 
322 	if (available > 0)
323 		return available;
324 
325 	/* read hardware producer index, check reserved register bits are not set. */
326 	if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
327 				      tail, (tail & ~queue->mask) == 0,
328 				      0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
329 		dev_err_once(queue->iommu->dev,
330 			     "Hardware error: queue access timeout\n");
331 		return 0;
332 	}
333 
334 	if (tail == last)
335 		return 0;
336 
337 	/* update shadow producer index */
338 	return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
339 }
340 
341 /*
342  * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
343  */
344 static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
345 {
346 	const unsigned int head = atomic_add_return(count, &queue->head);
347 
348 	riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
349 }
350 
351 /* Return actual consumer index based on hardware reported queue head index. */
352 static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
353 {
354 	const unsigned int cons = atomic_read(&queue->head);
355 	const unsigned int last = Q_ITEM(queue, cons);
356 	unsigned int head;
357 
358 	if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
359 				      !(head & ~queue->mask),
360 				      0, RISCV_IOMMU_QUEUE_TIMEOUT))
361 		return cons;
362 
363 	return cons + ((head - last) & queue->mask);
364 }
365 
366 /* Wait for submitted item to be processed. */
367 static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
368 				  unsigned int index,
369 				  unsigned int timeout_us)
370 {
371 	unsigned int cons = atomic_read(&queue->head);
372 	unsigned int flags = RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
373 			     RISCV_IOMMU_CQCSR_CMD_ILL;
374 
375 	/* Already processed by the consumer */
376 	if ((int)(cons - index) > 0)
377 		return 0;
378 
379 	/* Monitor consumer index */
380 	return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
381 				 (riscv_iommu_readl(queue->iommu, queue->qcr) & flags) ||
382 				 (int)(cons - index) > 0, 0, timeout_us);
383 }
384 
/*
 * Enqueue an entry into the queue ring buffer and ring the tail doorbell.
 *
 * The function does not wait for the entry to be processed by the hardware;
 * callers that need completion pass the returned index to
 * riscv_iommu_queue_wait(), see riscv_iommu_cmd_sync().
 *
 * @queue - queue to submit to
 * @entry - entry to copy into the ring buffer
 * @entry_size - size of a single queue entry in bytes
 *
 * Returns the producer index allocated for the entry. The same index is
 * returned when one of the internal polls times out; in that case the
 * doorbell is not written and the failure is only logged (once).
 *
 * Error handling for IOMMU hardware not responding in reasonable time
 * will be added as separate patch series along with other RAS features.
 * For now, only report hardware failure and continue.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
					   void *entry, size_t entry_size)
{
	unsigned int prod;
	unsigned int head;
	unsigned int tail;
	unsigned long flags;

	/* Do not preempt submission flow. */
	local_irq_save(flags);

	/* 1. Allocate some space in the queue */
	prod = atomic_inc_return(&queue->prod) - 1;
	head = atomic_read(&queue->head);

	/*
	 * 2. Wait for space availability.
	 *    If the slot is beyond the ring capacity relative to the shadow
	 *    head, wait for the shadow head to advance. If the slot is exactly
	 *    at the capacity limit, poll the hardware head register and advance
	 *    the shadow head from here.
	 */
	if ((prod - head) > queue->mask) {
		if (readx_poll_timeout(atomic_read, &queue->head,
				       head, (prod - head) < queue->mask,
				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
	} else if ((prod - head) == queue->mask) {
		const unsigned int last = Q_ITEM(queue, head);

		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
					      !(head & ~queue->mask) && head != last,
					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
		atomic_add((head - last) & queue->mask, &queue->head);
	}

	/* 3. Store entry in the ring buffer */
	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);

	/*
	 * 4. Wait for all previous entries to be ready, i.e. until the shadow
	 *    tail reaches the index allocated in step 1.
	 */
	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
		goto err_busy;

	/*
	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
	 *    completed and visible before signaling the tail doorbell to fetch
	 *    the next command. 'fence ow, ow'
	 */
	dma_wmb();
	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));

	/*
	 * 6. Make sure the doorbell write to the device has finished before updating
	 *    the shadow tail index in normal memory. 'fence o, w'
	 */
#ifdef CONFIG_MMIOWB
	mmiowb();
#endif
	atomic_inc(&queue->tail);

	/* 7. Complete submission and restore local interrupts */
	local_irq_restore(flags);

	return prod;

err_busy:
	/* Timed out: the shadow tail is not advanced and no doorbell is written. */
	local_irq_restore(flags);
	dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");

	return prod;
}
458 
459 /*
460  * IOMMU Command queue chapter 3.1
461  */
462 
463 /* Command queue interrupt handler thread function */
464 static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
465 {
466 	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
467 	unsigned int ctrl;
468 
469 	/* Clear MF/CQ errors, complete error recovery to be implemented. */
470 	ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
471 	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
472 		    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
473 		riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
474 		dev_warn(queue->iommu->dev,
475 			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
476 			 queue->qid,
477 			 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
478 			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
479 			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
480 			 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
481 	}
482 
483 	/* Placeholder for command queue interrupt notifiers */
484 
485 	/* Clear command interrupt pending. */
486 	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
487 
488 	return IRQ_HANDLED;
489 }
490 
491 /* Send command to the IOMMU command queue */
492 static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
493 				 struct riscv_iommu_command *cmd)
494 {
495 	riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
496 }
497 
498 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */
499 static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
500 				 unsigned int timeout_us)
501 {
502 	struct riscv_iommu_command cmd;
503 	unsigned int prod;
504 
505 	riscv_iommu_cmd_iofence(&cmd);
506 	prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
507 
508 	if (!timeout_us)
509 		return;
510 
511 	if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
512 		dev_err_once(iommu->dev,
513 			     "Hardware error: command execution timeout\n");
514 }
515 
516 /*
517  * IOMMU Fault/Event queue chapter 3.2
518  */
519 
520 static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
521 			      struct riscv_iommu_fq_record *event)
522 {
523 	unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
524 	unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
525 
526 	/* Placeholder for future fault handling implementation, report only. */
527 	if (err)
528 		dev_warn_ratelimited(iommu->dev,
529 				     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
530 				     err, devid, event->iotval, event->iotval2);
531 }
532 
533 /* Fault queue interrupt handler thread function */
534 static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
535 {
536 	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
537 	struct riscv_iommu_device *iommu = queue->iommu;
538 	struct riscv_iommu_fq_record *events;
539 	unsigned int ctrl, idx;
540 	int cnt, len;
541 
542 	events = (struct riscv_iommu_fq_record *)queue->base;
543 
544 	/* Clear fault interrupt pending and process all received fault events. */
545 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
546 
547 	do {
548 		cnt = riscv_iommu_queue_consume(queue, &idx);
549 		for (len = 0; len < cnt; idx++, len++)
550 			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
551 		riscv_iommu_queue_release(queue, cnt);
552 	} while (cnt > 0);
553 
554 	/* Clear MF/OF errors, complete error recovery to be implemented. */
555 	ctrl = riscv_iommu_readl(iommu, queue->qcr);
556 	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
557 		riscv_iommu_writel(iommu, queue->qcr, ctrl);
558 		dev_warn(iommu->dev,
559 			 "Queue #%u error; memory fault:%d overflow:%d\n",
560 			 queue->qid,
561 			 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
562 			 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
563 	}
564 
565 	return IRQ_HANDLED;
566 }
567 
568 /* Lookup and initialize device context info structure. */
569 static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
570 						 unsigned int devid)
571 {
572 	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
573 	unsigned int depth;
574 	unsigned long ddt, old, new;
575 	void *ptr;
576 	u8 ddi_bits[3] = { 0 };
577 	u64 *ddtp = NULL;
578 
579 	/* Make sure the mode is valid */
580 	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
581 	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
582 		return NULL;
583 
584 	/*
585 	 * Device id partitioning for base format:
586 	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
587 	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
588 	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
589 	 *
590 	 * For extended format:
591 	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
592 	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
593 	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
594 	 */
595 	if (base_format) {
596 		ddi_bits[0] = 7;
597 		ddi_bits[1] = 7 + 9;
598 		ddi_bits[2] = 7 + 9 + 8;
599 	} else {
600 		ddi_bits[0] = 6;
601 		ddi_bits[1] = 6 + 9;
602 		ddi_bits[2] = 6 + 9 + 9;
603 	}
604 
605 	/* Make sure device id is within range */
606 	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
607 	if (devid >= (1 << ddi_bits[depth]))
608 		return NULL;
609 
610 	/* Get to the level of the non-leaf node that holds the device context */
611 	for (ddtp = iommu->ddt_root; depth-- > 0;) {
612 		const int split = ddi_bits[depth];
613 		/*
614 		 * Each non-leaf node is 64bits wide and on each level
615 		 * nodes are indexed by DDI[depth].
616 		 */
617 		ddtp += (devid >> split) & 0x1FF;
618 
619 		/*
620 		 * Check if this node has been populated and if not
621 		 * allocate a new level and populate it.
622 		 */
623 		do {
624 			ddt = READ_ONCE(*(unsigned long *)ddtp);
625 			if (ddt & RISCV_IOMMU_DDTE_V) {
626 				ddtp = __va(ppn_to_phys(ddt));
627 				break;
628 			}
629 
630 			ptr = riscv_iommu_get_pages(iommu, SZ_4K);
631 			if (!ptr)
632 				return NULL;
633 
634 			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
635 			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
636 
637 			if (old == ddt) {
638 				ddtp = (u64 *)ptr;
639 				break;
640 			}
641 
642 			/* Race setting DDT detected, re-read and retry. */
643 			riscv_iommu_free_pages(iommu, ptr);
644 		} while (1);
645 	}
646 
647 	/*
648 	 * Grab the node that matches DDI[depth], note that when using base
649 	 * format the device context is 4 * 64bits, and the extended format
650 	 * is 8 * 64bits, hence the (3 - base_format) below.
651 	 */
652 	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
653 
654 	return (struct riscv_iommu_dc *)ddtp;
655 }
656 
657 /*
658  * This is best effort IOMMU translation shutdown flow.
659  * Disable IOMMU without waiting for hardware response.
660  */
661 void riscv_iommu_disable(struct riscv_iommu_device *iommu)
662 {
663 	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
664 			   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
665 				      RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
666 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
667 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
668 	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
669 }
670 
/*
 * Read the DDTP register, polling every 10us for up to
 * RISCV_IOMMU_DDTP_TIMEOUT until the BUSY bit is clear.
 *
 * Evaluates to the last value read. The poll result itself is discarded, so
 * callers must check RISCV_IOMMU_DDTP_BUSY in the returned value to detect
 * a timeout.
 */
#define riscv_iommu_read_ddtp(iommu) ({ \
	u64 ddtp; \
	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
				  RISCV_IOMMU_DDTP_TIMEOUT); \
	ddtp; })
677 
678 static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
679 {
680 	u64 ddtp;
681 	unsigned int mode;
682 
683 	ddtp = riscv_iommu_read_ddtp(iommu);
684 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
685 		return -EBUSY;
686 
687 	/*
688 	 * It is optional for the hardware to report a fixed address for device
689 	 * directory root page when DDT.MODE is OFF or BARE.
690 	 */
691 	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
692 	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
693 	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
694 		/* Use WARL to discover hardware fixed DDT PPN */
695 		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
696 				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
697 		ddtp = riscv_iommu_read_ddtp(iommu);
698 		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
699 			return -EBUSY;
700 
701 		iommu->ddt_phys = ppn_to_phys(ddtp);
702 		if (iommu->ddt_phys)
703 			iommu->ddt_root = devm_ioremap(iommu->dev,
704 						       iommu->ddt_phys, PAGE_SIZE);
705 		if (iommu->ddt_root)
706 			memset(iommu->ddt_root, 0, PAGE_SIZE);
707 	}
708 
709 	if (!iommu->ddt_root) {
710 		iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K);
711 		iommu->ddt_phys = __pa(iommu->ddt_root);
712 	}
713 
714 	if (!iommu->ddt_root)
715 		return -ENOMEM;
716 
717 	return 0;
718 }
719 
720 /*
721  * Discover supported DDT modes starting from requested value,
722  * configure DDTP register with accepted mode and root DDT address.
723  * Accepted iommu->ddt_mode is updated on success.
724  */
725 static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
726 				      unsigned int ddtp_mode)
727 {
728 	struct device *dev = iommu->dev;
729 	u64 ddtp, rq_ddtp;
730 	unsigned int mode, rq_mode = ddtp_mode;
731 	struct riscv_iommu_command cmd;
732 
733 	ddtp = riscv_iommu_read_ddtp(iommu);
734 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
735 		return -EBUSY;
736 
737 	/* Disallow state transition from xLVL to xLVL. */
738 	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
739 	if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
740 	    mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
741 	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
742 	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
743 		return -EINVAL;
744 
745 	do {
746 		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
747 		if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
748 			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
749 
750 		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
751 		ddtp = riscv_iommu_read_ddtp(iommu);
752 		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
753 			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
754 				rq_mode, ddtp);
755 			return -EBUSY;
756 		}
757 
758 		/* Verify IOMMU hardware accepts new DDTP config. */
759 		mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
760 
761 		if (rq_mode == mode)
762 			break;
763 
764 		/* Hardware mandatory DDTP mode has not been accepted. */
765 		if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
766 			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
767 				ddtp, rq_ddtp);
768 			return -EINVAL;
769 		}
770 
771 		/*
772 		 * Mode field is WARL, an IOMMU may support a subset of
773 		 * directory table levels in which case if we tried to set
774 		 * an unsupported number of levels we'll readback either
775 		 * a valid xLVL or off/bare. If we got off/bare, try again
776 		 * with a smaller xLVL.
777 		 */
778 		if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
779 		    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
780 			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
781 			rq_mode--;
782 			continue;
783 		}
784 
785 		/*
786 		 * We tried all supported modes and IOMMU hardware failed to
787 		 * accept new settings, something went very wrong since off/bare
788 		 * and at least one xLVL must be supported.
789 		 */
790 		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
791 			mode, ddtp_mode);
792 		return -EINVAL;
793 	} while (1);
794 
795 	iommu->ddt_mode = mode;
796 	if (mode != ddtp_mode)
797 		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
798 
799 	/* Invalidate device context cache */
800 	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
801 	riscv_iommu_cmd_send(iommu, &cmd);
802 
803 	/* Invalidate address translation cache */
804 	riscv_iommu_cmd_inval_vma(&cmd);
805 	riscv_iommu_cmd_send(iommu, &cmd);
806 
807 	/* IOFENCE.C */
808 	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
809 
810 	return 0;
811 }
812 
/*
 * This struct contains protection domain specific IOMMU driver data.
 *
 * The generic iommu_domain and the generic page table state share storage
 * through the leading union. NOTE(review): PT_IOMMU_CHECK_DOMAIN() below
 * presumably verifies at build time that 'domain' and 'riscvpt.iommu'
 * overlay consistently - confirm in <linux/generic_pt/iommu.h>.
 */
struct riscv_iommu_domain {
	union {
		struct iommu_domain domain;
		struct pt_iommu_riscv_64 riscvpt;
	};
	struct list_head bonds;		/* riscv_iommu_bond entries, one per attached device */
	spinlock_t lock;		/* protect bonds list updates. */
	int pscid;			/* PSCID set in IOTLB invalidation commands for this domain */
};
PT_IOMMU_CHECK_DOMAIN(struct riscv_iommu_domain, riscvpt.iommu, domain);
824 
/* Convert a pointer to the embedded iommu_domain into its riscv_iommu_domain. */
#define iommu_domain_to_riscv(iommu_domain) \
	container_of(iommu_domain, struct riscv_iommu_domain, domain)
827 
/* Private IOMMU data for managed devices, dev_iommu_priv_* */
struct riscv_iommu_info {
	/*
	 * NOTE(review): presumably the protection domain the device is
	 * currently attached to - confirm against the attach paths.
	 */
	struct riscv_iommu_domain *domain;
};
832 
/*
 * Linkage between an iommu_domain and attached devices.
 *
 * Protection domain requiring IOATC and DevATC translation cache invalidations,
 * should be linked to attached devices using a riscv_iommu_bond structure.
 * Devices should be linked to the domain before first use and unlinked after
 * the translations from the referenced protection domain can no longer be used.
 * Blocking and identity domains are not tracked here, as the IOMMU hardware
 * does not cache negative and/or identity (BARE mode) translations, and DevATC
 * is disabled for those protection domains.
 *
 * The device pointer and IOMMU data remain stable in the bond struct after
 * _probe_device() where it's attached to the managed IOMMU, up to the
 * completion of the _release_device() call. The release of the bond structure
 * is synchronized with the device release.
 */
struct riscv_iommu_bond {
	struct list_head list;	/* entry in riscv_iommu_domain.bonds, walked under RCU */
	struct rcu_head rcu;	/* used by kfree_rcu() to free the bond after unlink */
	struct device *dev;	/* device attached to the domain */
};
854 
855 static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
856 				 struct device *dev)
857 {
858 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
859 	struct riscv_iommu_bond *bond;
860 	struct list_head *bonds;
861 
862 	bond = kzalloc_obj(*bond);
863 	if (!bond)
864 		return -ENOMEM;
865 	bond->dev = dev;
866 
867 	/*
868 	 * List of devices attached to the domain is arranged based on
869 	 * managed IOMMU device.
870 	 */
871 
872 	spin_lock(&domain->lock);
873 	list_for_each(bonds, &domain->bonds)
874 		if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
875 			break;
876 	list_add_rcu(&bond->list, bonds);
877 	spin_unlock(&domain->lock);
878 
879 	/* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
880 	smp_mb();
881 
882 	return 0;
883 }
884 
885 static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
886 				    struct device *dev)
887 {
888 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
889 	struct riscv_iommu_bond *bond, *found = NULL;
890 	struct riscv_iommu_command cmd;
891 	int count = 0;
892 
893 	if (!domain)
894 		return;
895 
896 	spin_lock(&domain->lock);
897 	list_for_each_entry(bond, &domain->bonds, list) {
898 		if (found && count)
899 			break;
900 		else if (bond->dev == dev)
901 			found = bond;
902 		else if (dev_to_iommu(bond->dev) == iommu)
903 			count++;
904 	}
905 	if (found)
906 		list_del_rcu(&found->list);
907 	spin_unlock(&domain->lock);
908 	kfree_rcu(found, rcu);
909 
910 	/*
911 	 * If this was the last bond between this domain and the IOMMU
912 	 * invalidate all cached entries for domain's PSCID.
913 	 */
914 	if (!count) {
915 		riscv_iommu_cmd_inval_vma(&cmd);
916 		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
917 		riscv_iommu_cmd_send(iommu, &cmd);
918 
919 		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
920 	}
921 }
922 
/*
 * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations is available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)
930 
931 static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
932 				    unsigned long start, unsigned long end)
933 {
934 	struct riscv_iommu_bond *bond;
935 	struct riscv_iommu_device *iommu, *prev;
936 	struct riscv_iommu_command cmd;
937 
938 	/*
939 	 * For each IOMMU linked with this protection domain (via bonds->dev),
940 	 * an IOTLB invaliation command will be submitted and executed.
941 	 *
942 	 * Possbile race with domain attach flow is handled by sequencing
943 	 * bond creation - riscv_iommu_bond_link(), and device directory
944 	 * update - riscv_iommu_iodir_update().
945 	 *
946 	 * PTE Update / IOTLB Inval           Device attach & directory update
947 	 * --------------------------         --------------------------
948 	 * update page table entries          add dev to the bond list
949 	 * FENCE RW,RW                        FENCE RW,RW
950 	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
951 	 *   FENCE IOW,IOW                      FENCE IOW,IOW
952 	 *   IOTLB.INVAL                        IODIR.INVAL
953 	 *   IOFENCE.C
954 	 *
955 	 * If bond list is not updated with new device, directory context will
956 	 * be configured with already valid page table content. If an IOMMU is
957 	 * linked to the protection domain it will receive invalidation
958 	 * requests for updated page table entries.
959 	 */
960 	smp_mb();
961 
962 	rcu_read_lock();
963 
964 	prev = NULL;
965 	list_for_each_entry_rcu(bond, &domain->bonds, list) {
966 		iommu = dev_to_iommu(bond->dev);
967 
968 		/*
969 		 * IOTLB invalidation request can be safely omitted if already sent
970 		 * to the IOMMU for the same PSCID, and with domain->bonds list
971 		 * arranged based on the device's IOMMU, it's sufficient to check
972 		 * last device the invalidation was sent to.
973 		 */
974 		if (iommu == prev)
975 			continue;
976 
977 		riscv_iommu_cmd_inval_vma(&cmd);
978 		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
979 		if (end - start < RISCV_IOMMU_IOTLB_INVAL_LIMIT - 1) {
980 			unsigned long iova = start;
981 
982 			do {
983 				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
984 				riscv_iommu_cmd_send(iommu, &cmd);
985 			} while (!check_add_overflow(iova, PAGE_SIZE, &iova) &&
986 				 iova < end);
987 		} else {
988 			riscv_iommu_cmd_send(iommu, &cmd);
989 		}
990 		prev = iommu;
991 	}
992 
993 	prev = NULL;
994 	list_for_each_entry_rcu(bond, &domain->bonds, list) {
995 		iommu = dev_to_iommu(bond->dev);
996 		if (iommu == prev)
997 			continue;
998 
999 		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1000 		prev = iommu;
1001 	}
1002 	rcu_read_unlock();
1003 }
1004 
/*
 * FSC value passed to riscv_iommu_iodir_update() by the blocking and
 * identity attach paths. NOTE(review): presumably encodes the Bare mode,
 * i.e. no first-stage page table - confirm against the FSC mode field.
 */
#define RISCV_IOMMU_FSC_BARE 0
1006 /*
1007  * This function sends IOTINVAL commands as required by the RISC-V
1008  * IOMMU specification (Section 6.3.1 and 6.3.2 in 1.0 spec version)
1009  * after modifying DDT or PDT entries
1010  */
1011 static void riscv_iommu_iodir_iotinval(struct riscv_iommu_device *iommu,
1012 				       bool inval_pdt, unsigned long iohgatp,
1013 				       struct riscv_iommu_dc *dc,
1014 				       struct riscv_iommu_pc *pc)
1015 {
1016 	struct riscv_iommu_command cmd;
1017 
1018 	riscv_iommu_cmd_inval_vma(&cmd);
1019 
1020 	if (FIELD_GET(RISCV_IOMMU_DC_IOHGATP_MODE, iohgatp) ==
1021 	    RISCV_IOMMU_DC_IOHGATP_MODE_BARE) {
1022 		if (inval_pdt) {
1023 			/*
1024 			 * IOTINVAL.VMA with GV=AV=0, and PSCV=1, and
1025 			 * PSCID=PC.PSCID
1026 			 */
1027 			riscv_iommu_cmd_inval_set_pscid(&cmd,
1028 				FIELD_GET(RISCV_IOMMU_PC_TA_PSCID, pc->ta));
1029 		} else {
1030 			if (!FIELD_GET(RISCV_IOMMU_DC_TC_PDTV, dc->tc) &&
1031 			    FIELD_GET(RISCV_IOMMU_DC_FSC_MODE, dc->fsc) !=
1032 			    RISCV_IOMMU_DC_FSC_MODE_BARE) {
1033 				/*
1034 				 * DC.tc.PDTV == 0 && DC.fsc.MODE != Bare
1035 				 * IOTINVAL.VMA with GV=AV=0, and PSCV=1, and
1036 				 * PSCID=DC.ta.PSCID
1037 				 */
1038 				riscv_iommu_cmd_inval_set_pscid(&cmd,
1039 					FIELD_GET(RISCV_IOMMU_DC_TA_PSCID, dc->ta));
1040 			}
1041 			/* else: IOTINVAL.VMA with GV=AV=PSCV=0 */
1042 		}
1043 	} else {
1044 		riscv_iommu_cmd_inval_set_gscid(&cmd,
1045 			FIELD_GET(RISCV_IOMMU_DC_IOHGATP_GSCID, iohgatp));
1046 
1047 		if (inval_pdt) {
1048 			/*
1049 			 * IOTINVAL.VMA with GV=1, AV=0, and PSCV=1, and
1050 			 * GSCID=DC.iohgatp.GSCID, PSCID=PC.PSCID
1051 			 */
1052 			riscv_iommu_cmd_inval_set_pscid(&cmd,
1053 				FIELD_GET(RISCV_IOMMU_PC_TA_PSCID, pc->ta));
1054 		}
1055 		/*
1056 		 * else: IOTINVAL.VMA with GV=1,AV=PSCV=0,and
1057 		 * GSCID=DC.iohgatp.GSCID
1058 		 *
1059 		 * IOTINVAL.GVMA with GV=1,AV=0,and
1060 		 * GSCID=DC.iohgatp.GSCID
1061 		 * TODO: For now, the Second-Stage feature have not yet been merged,
1062 		 * also issue IOTINVAL.GVMA once second-stage support is merged.
1063 		 */
1064 	}
1065 	riscv_iommu_cmd_send(iommu, &cmd);
1066 }
1067 /*
1068  * Update IODIR for the device.
1069  *
1070  * During the execution of riscv_iommu_probe_device(), IODIR entries are
1071  * allocated for the device's identifiers.  Device context invalidation
1072  * becomes necessary only if one of the updated entries was previously
1073  * marked as valid, given that invalid device context entries are not
1074  * cached by the IOMMU hardware.
1075  * In this implementation, updating a valid device context while the
1076  * device is not quiesced might be disruptive, potentially causing
1077  * interim translation faults.
1078  */
1079 static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
1080 				     struct device *dev, u64 fsc, u64 ta)
1081 {
1082 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1083 	struct riscv_iommu_dc *dc;
1084 	struct riscv_iommu_command cmd;
1085 	bool sync_required = false;
1086 	u64 tc;
1087 	int i;
1088 
1089 	for (i = 0; i < fwspec->num_ids; i++) {
1090 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1091 		tc = READ_ONCE(dc->tc);
1092 		if (!(tc & RISCV_IOMMU_DC_TC_V))
1093 			continue;
1094 
1095 		WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
1096 
1097 		/* Invalidate device context cached values */
1098 		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1099 		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1100 		riscv_iommu_cmd_send(iommu, &cmd);
1101 		/*
1102 		 * For now, the SVA and PASID features have not yet been merged, the
1103 		 * default configuration is inval_pdt=false and pc=NULL.
1104 		 */
1105 		riscv_iommu_iodir_iotinval(iommu, false, dc->iohgatp, dc, NULL);
1106 		sync_required = true;
1107 	}
1108 
1109 	if (sync_required)
1110 		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1111 
1112 	/*
1113 	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
1114 	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
1115 	 */
1116 	for (i = 0; i < fwspec->num_ids; i++) {
1117 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1118 		tc = READ_ONCE(dc->tc);
1119 		tc |= ta & RISCV_IOMMU_DC_TC_V;
1120 
1121 		WRITE_ONCE(dc->fsc, fsc);
1122 		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
1123 		/* Update device context, write TC.V as the last step. */
1124 		dma_wmb();
1125 		WRITE_ONCE(dc->tc, tc);
1126 
1127 		/* Invalidate device context after update */
1128 		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
1129 		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
1130 		riscv_iommu_cmd_send(iommu, &cmd);
1131 		/*
1132 		 * For now, the SVA and PASID features have not yet been merged, the
1133 		 * default configuration is inval_pdt=false and pc=NULL.
1134 		 */
1135 		riscv_iommu_iodir_iotinval(iommu, false, dc->iohgatp, dc, NULL);
1136 	}
1137 
1138 	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
1139 }
1140 
1141 /*
1142  * IOVA page translation tree management.
1143  */
1144 
1145 static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
1146 {
1147 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1148 
1149 	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1150 }
1151 
1152 static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
1153 				   struct iommu_iotlb_gather *gather)
1154 {
1155 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1156 
1157 	if (iommu_pages_list_empty(&gather->freelist)) {
1158 		riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
1159 	} else {
1160 		/*
1161 		 * In 1.0 spec version, the smallest scope we can use to
1162 		 * invalidate all levels of page table (i.e. leaf and non-leaf)
1163 		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
1164 		 * This will be updated with hardware support for
1165 		 * capability.NL (non-leaf) IOTINVAL command.
1166 		 */
1167 		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
1168 		iommu_put_pages_list(&gather->freelist);
1169 	}
1170 }
1171 
1172 static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
1173 {
1174 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1175 
1176 	WARN_ON(!list_empty(&domain->bonds));
1177 
1178 	if ((int)domain->pscid > 0)
1179 		ida_free(&riscv_iommu_pscids, domain->pscid);
1180 
1181 	pt_iommu_deinit(&domain->riscvpt.iommu);
1182 	kfree(domain);
1183 }
1184 
1185 static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
1186 {
1187 	switch (pgd_mode) {
1188 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
1189 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
1190 
1191 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
1192 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
1193 
1194 	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
1195 		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
1196 	}
1197 	return false;
1198 }
1199 
1200 static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
1201 					    struct device *dev,
1202 					    struct iommu_domain *old)
1203 {
1204 	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
1205 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1206 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1207 	struct pt_iommu_riscv_64_hw_info pt_info;
1208 	u64 fsc, ta;
1209 
1210 	pt_iommu_riscv_64_hw_info(&domain->riscvpt, &pt_info);
1211 
1212 	if (!riscv_iommu_pt_supported(iommu, pt_info.fsc_iosatp_mode))
1213 		return -ENODEV;
1214 
1215 	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, pt_info.fsc_iosatp_mode) |
1216 	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, pt_info.ppn);
1217 	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
1218 	     RISCV_IOMMU_PC_TA_V;
1219 
1220 	if (riscv_iommu_bond_link(domain, dev))
1221 		return -ENOMEM;
1222 
1223 	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
1224 	riscv_iommu_bond_unlink(info->domain, dev);
1225 	info->domain = domain;
1226 
1227 	return 0;
1228 }
1229 
1230 static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
1231 	IOMMU_PT_DOMAIN_OPS(riscv_64),
1232 	.attach_dev = riscv_iommu_attach_paging_domain,
1233 	.free = riscv_iommu_free_paging_domain,
1234 	.iotlb_sync = riscv_iommu_iotlb_sync,
1235 	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
1236 };
1237 
1238 static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
1239 {
1240 	struct pt_iommu_riscv_64_cfg cfg = {};
1241 	struct riscv_iommu_domain *domain;
1242 	struct riscv_iommu_device *iommu;
1243 	int ret;
1244 
1245 	iommu = dev_to_iommu(dev);
1246 	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
1247 		cfg.common.hw_max_vasz_lg2 = 57;
1248 	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
1249 		cfg.common.hw_max_vasz_lg2 = 48;
1250 	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
1251 		cfg.common.hw_max_vasz_lg2 = 39;
1252 	} else {
1253 		dev_err(dev, "cannot find supported page table mode\n");
1254 		return ERR_PTR(-ENODEV);
1255 	}
1256 	cfg.common.hw_max_oasz_lg2 = 56;
1257 
1258 	domain = kzalloc_obj(*domain);
1259 	if (!domain)
1260 		return ERR_PTR(-ENOMEM);
1261 
1262 	INIT_LIST_HEAD_RCU(&domain->bonds);
1263 	spin_lock_init(&domain->lock);
1264 	/*
1265 	 * 6.4 IOMMU capabilities [..] IOMMU implementations must support the
1266 	 * Svnapot standard extension for NAPOT Translation Contiguity.
1267 	 */
1268 	cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
1269 			      BIT(PT_FEAT_FLUSH_RANGE) |
1270 			      BIT(PT_FEAT_RISCV_SVNAPOT_64K);
1271 	domain->riscvpt.iommu.nid = dev_to_node(iommu->dev);
1272 	domain->domain.ops = &riscv_iommu_paging_domain_ops;
1273 
1274 	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
1275 					RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
1276 	if (domain->pscid < 0) {
1277 		riscv_iommu_free_paging_domain(&domain->domain);
1278 		return ERR_PTR(-ENOMEM);
1279 	}
1280 
1281 	ret = pt_iommu_riscv_64_init(&domain->riscvpt, &cfg, GFP_KERNEL);
1282 	if (ret) {
1283 		riscv_iommu_free_paging_domain(&domain->domain);
1284 		return ERR_PTR(ret);
1285 	}
1286 	return &domain->domain;
1287 }
1288 
1289 static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
1290 					      struct device *dev,
1291 					      struct iommu_domain *old)
1292 {
1293 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1294 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1295 
1296 	/* Make device context invalid, translation requests will fault w/ #258 */
1297 	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
1298 	riscv_iommu_bond_unlink(info->domain, dev);
1299 	info->domain = NULL;
1300 
1301 	return 0;
1302 }
1303 
1304 static struct iommu_domain riscv_iommu_blocking_domain = {
1305 	.type = IOMMU_DOMAIN_BLOCKED,
1306 	.ops = &(const struct iommu_domain_ops) {
1307 		.attach_dev = riscv_iommu_attach_blocking_domain,
1308 	}
1309 };
1310 
1311 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
1312 					      struct device *dev,
1313 					      struct iommu_domain *old)
1314 {
1315 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
1316 	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
1317 
1318 	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
1319 	riscv_iommu_bond_unlink(info->domain, dev);
1320 	info->domain = NULL;
1321 
1322 	return 0;
1323 }
1324 
1325 static struct iommu_domain riscv_iommu_identity_domain = {
1326 	.type = IOMMU_DOMAIN_IDENTITY,
1327 	.ops = &(const struct iommu_domain_ops) {
1328 		.attach_dev = riscv_iommu_attach_identity_domain,
1329 	}
1330 };
1331 
/* Pick the IOMMU group for @dev: PCI grouping for PCI devices, generic otherwise. */
static struct iommu_group *riscv_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
1338 
1339 static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
1340 {
1341 	return iommu_fwspec_add_ids(dev, args->args, 1);
1342 }
1343 
1344 static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
1345 {
1346 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
1347 	struct riscv_iommu_device *iommu;
1348 	struct riscv_iommu_info *info;
1349 	struct riscv_iommu_dc *dc;
1350 	u64 tc;
1351 	int i;
1352 
1353 	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
1354 		return ERR_PTR(-ENODEV);
1355 
1356 	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
1357 	if (!iommu)
1358 		return ERR_PTR(-ENODEV);
1359 
1360 	/*
1361 	 * IOMMU hardware operating in fail-over BARE mode will provide
1362 	 * identity translation for all connected devices anyway...
1363 	 */
1364 	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
1365 		return ERR_PTR(-ENODEV);
1366 
1367 	info = kzalloc_obj(*info);
1368 	if (!info)
1369 		return ERR_PTR(-ENOMEM);
1370 	/*
1371 	 * Allocate and pre-configure device context entries in
1372 	 * the device directory. Do not mark the context valid yet.
1373 	 */
1374 	tc = 0;
1375 	for (i = 0; i < fwspec->num_ids; i++) {
1376 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
1377 		if (!dc) {
1378 			kfree(info);
1379 			return ERR_PTR(-ENODEV);
1380 		}
1381 		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
1382 			dev_warn(dev, "already attached to IOMMU device directory\n");
1383 		WRITE_ONCE(dc->tc, tc);
1384 	}
1385 
1386 	dev_iommu_priv_set(dev, info);
1387 
1388 	return &iommu->iommu;
1389 }
1390 
/* Free the per-device driver data of @dev through the RCU-deferred kfree. */
static void riscv_iommu_release_device(struct device *dev)
{
	struct riscv_iommu_info *priv = dev_iommu_priv_get(dev);

	kfree_rcu_mightsleep(priv);
}
1397 
1398 static const struct iommu_ops riscv_iommu_ops = {
1399 	.of_xlate = riscv_iommu_of_xlate,
1400 	.identity_domain = &riscv_iommu_identity_domain,
1401 	.blocked_domain = &riscv_iommu_blocking_domain,
1402 	.release_domain = &riscv_iommu_blocking_domain,
1403 	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
1404 	.device_group = riscv_iommu_device_group,
1405 	.probe_device = riscv_iommu_probe_device,
1406 	.release_device	= riscv_iommu_release_device,
1407 };
1408 
1409 static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
1410 {
1411 	u64 ddtp;
1412 
1413 	/*
1414 	 * Make sure the IOMMU is switched off or in pass-through mode during
1415 	 * regular boot flow and disable translation when we boot into a kexec
1416 	 * kernel and the previous kernel left them enabled.
1417 	 */
1418 	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
1419 	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
1420 		return -EBUSY;
1421 
1422 	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
1423 	     RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
1424 		if (!is_kdump_kernel())
1425 			return -EBUSY;
1426 		riscv_iommu_disable(iommu);
1427 	}
1428 
1429 	/* Configure accesses to in-memory data structures for CPU-native byte order. */
1430 	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1431 	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
1432 		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
1433 			return -EINVAL;
1434 		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
1435 				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
1436 		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
1437 		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
1438 		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
1439 			return -EINVAL;
1440 	}
1441 
1442 	/*
1443 	 * Distribute interrupt vectors, always use first vector for CIV.
1444 	 * At least one interrupt is required. Read back and verify.
1445 	 */
1446 	if (!iommu->irqs_count)
1447 		return -EINVAL;
1448 
1449 	iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
1450 		       FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
1451 		       FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
1452 	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
1453 	iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
1454 	if (max3(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
1455 		 FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec),
1456 		 max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
1457 		     FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
1458 		return -EINVAL;
1459 
1460 	return 0;
1461 }
1462 
/*
 * Tear down an IOMMU set up by riscv_iommu_init(): unregister it from the
 * IOMMU core and sysfs, switch the device directory mode to OFF, then
 * disable the command and fault queues.
 */
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
{
	iommu_device_unregister(&iommu->iommu);
	iommu_device_sysfs_remove(&iommu->iommu);
	/* Translation is switched off before the queues are stopped. */
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
	riscv_iommu_queue_disable(&iommu->cmdq);
	riscv_iommu_queue_disable(&iommu->fltq);
}
1471 
/*
 * Bring up an IOMMU instance and register it with the IOMMU core.
 *
 * Steps, in order: hardware state check, device directory and queue
 * allocation, command and fault queue enable, device directory mode
 * selection, sysfs registration, RIMT registration (only when ACPI is not
 * disabled) and finally registration with the IOMMU core.
 *
 * Returns 0 on success or a negative error code. On failure, the steps that
 * the error labels at the end of the function know how to undo are rolled
 * back in reverse order.
 */
int riscv_iommu_init(struct riscv_iommu_device *iommu)
{
	int rc;

	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);

	rc = riscv_iommu_init_check(iommu);
	if (rc)
		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");

	/*
	 * Failures up to and including the command queue enable return
	 * directly: none of the steps undone by the error labels below has
	 * been completed at that point.
	 * NOTE(review): presumably the directory and queue allocations are
	 * device-managed and released automatically - confirm.
	 */
	rc = riscv_iommu_iodir_alloc(iommu);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
				     sizeof(struct riscv_iommu_command));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
				     sizeof(struct riscv_iommu_fq_record));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
	if (rc)
		return rc;

	/* From here on the command queue is running and must be disabled on error. */
	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
	if (rc)
		goto err_queue_disable;

	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
	if (rc)
		goto err_queue_disable;

	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
				    dev_name(iommu->dev));
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
		goto err_iodir_off;
	}

	/* RIMT registration is only attempted when ACPI is in use. */
	if (!acpi_disabled) {
		rc = rimt_iommu_register(iommu->dev);
		if (rc) {
			dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n");
			goto err_remove_sysfs;
		}
	}

	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
		goto err_remove_sysfs;
	}

	return 0;

	/* Error unwinding: each label undoes one step and falls through. */
err_remove_sysfs:
	iommu_device_sysfs_remove(&iommu->iommu);
err_iodir_off:
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
err_queue_disable:
	riscv_iommu_queue_disable(&iommu->fltq);
	riscv_iommu_queue_disable(&iommu->cmdq);
	return rc;
}
1541 
/*
 * Import the GENERIC_PT_IOMMU symbol namespace.
 * NOTE(review): presumably required for the pt_iommu_* page table helpers
 * used by the paging domain code - confirm against linux/generic_pt.
 */
MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
1543