1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * IOMMU API for RISC-V IOMMU implementations. 4 * 5 * Copyright © 2022-2024 Rivos Inc. 6 * Copyright © 2023 FORTH-ICS/CARV 7 * 8 * Authors 9 * Tomasz Jeznach <tjeznach@rivosinc.com> 10 * Nick Kossifidis <mick@ics.forth.gr> 11 */ 12 13 #define pr_fmt(fmt) "riscv-iommu: " fmt 14 15 #include <linux/acpi.h> 16 #include <linux/acpi_rimt.h> 17 #include <linux/compiler.h> 18 #include <linux/crash_dump.h> 19 #include <linux/init.h> 20 #include <linux/iommu.h> 21 #include <linux/iopoll.h> 22 #include <linux/kernel.h> 23 #include <linux/pci.h> 24 #include <linux/generic_pt/iommu.h> 25 26 #include "../iommu-pages.h" 27 #include "iommu-bits.h" 28 #include "iommu.h" 29 30 /* Timeouts in [us] */ 31 #define RISCV_IOMMU_QCSR_TIMEOUT 150000 32 #define RISCV_IOMMU_QUEUE_TIMEOUT 150000 33 #define RISCV_IOMMU_DDTP_TIMEOUT 10000000 34 #define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 35 36 /* Number of entries per CMD/FLT queue, should be <= INT_MAX */ 37 #define RISCV_IOMMU_DEF_CQ_COUNT 8192 38 #define RISCV_IOMMU_DEF_FQ_COUNT 4096 39 40 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ 41 #define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) 42 #define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) 43 44 #define dev_to_iommu(dev) \ 45 iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu) 46 47 /* IOMMU PSCID allocation namespace. 
*/ 48 static DEFINE_IDA(riscv_iommu_pscids); 49 #define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) 50 51 /* Device resource-managed allocations */ 52 struct riscv_iommu_devres { 53 void *addr; 54 }; 55 56 static void riscv_iommu_devres_pages_release(struct device *dev, void *res) 57 { 58 struct riscv_iommu_devres *devres = res; 59 60 iommu_free_pages(devres->addr); 61 } 62 63 static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p) 64 { 65 struct riscv_iommu_devres *devres = res; 66 struct riscv_iommu_devres *target = p; 67 68 return devres->addr == target->addr; 69 } 70 71 static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, 72 unsigned int size) 73 { 74 struct riscv_iommu_devres *devres; 75 void *addr; 76 77 addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev), 78 GFP_KERNEL_ACCOUNT, size); 79 if (unlikely(!addr)) 80 return NULL; 81 82 devres = devres_alloc(riscv_iommu_devres_pages_release, 83 sizeof(struct riscv_iommu_devres), GFP_KERNEL); 84 85 if (unlikely(!devres)) { 86 iommu_free_pages(addr); 87 return NULL; 88 } 89 90 devres->addr = addr; 91 92 devres_add(iommu->dev, devres); 93 94 return addr; 95 } 96 97 static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) 98 { 99 struct riscv_iommu_devres devres = { .addr = addr }; 100 101 devres_release(iommu->dev, riscv_iommu_devres_pages_release, 102 riscv_iommu_devres_pages_match, &devres); 103 } 104 105 /* 106 * Hardware queue allocation and management. 
107 */ 108 109 /* Setup queue base, control registers and default queue length */ 110 #define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ 111 struct riscv_iommu_queue *_q = q; \ 112 _q->qid = RISCV_IOMMU_INTR_ ## name; \ 113 _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \ 114 _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \ 115 _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\ 116 } while (0) 117 118 /* Note: offsets are the same for all queues */ 119 #define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB)) 120 #define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB)) 121 #define Q_ITEM(q, index) ((q)->mask & (index)) 122 #define Q_IPSR(q) BIT((q)->qid) 123 124 /* 125 * Discover queue ring buffer hardware configuration, allocate in-memory 126 * ring buffer or use fixed I/O memory location, configure queue base register. 127 * Must be called before hardware queue is enabled. 128 * 129 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT() 130 * @entry_size - queue single element size in bytes. 131 */ 132 static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, 133 struct riscv_iommu_queue *queue, 134 size_t entry_size) 135 { 136 unsigned int logsz; 137 u64 qb, rb; 138 139 /* 140 * Use WARL base register property to discover maximum allowed 141 * number of entries and optional fixed IO address for queue location. 142 */ 143 riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD); 144 qb = riscv_iommu_readq(iommu, queue->qbr); 145 146 /* 147 * Calculate and verify hardware supported queue length, as reported 148 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1). 149 * Update queue size based on hardware supported value. 
150 */ 151 logsz = ilog2(queue->mask); 152 if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb)) 153 logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb); 154 155 /* 156 * Use WARL base register property to discover an optional fixed IO 157 * address for queue ring buffer location. Otherwise allocate contiguous 158 * system memory. 159 */ 160 if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { 161 const size_t queue_size = entry_size << (logsz + 1); 162 163 queue->phys = PFN_PHYS(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); 164 queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); 165 } else { 166 do { 167 const size_t queue_size = entry_size << (logsz + 1); 168 169 queue->base = riscv_iommu_get_pages( 170 iommu, max(queue_size, SZ_4K)); 171 queue->phys = __pa(queue->base); 172 } while (!queue->base && logsz-- > 0); 173 } 174 175 if (!queue->base) 176 return -ENOMEM; 177 178 qb = phys_to_ppn(queue->phys) | 179 FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); 180 181 /* Update base register and read back to verify hw accepted our write */ 182 riscv_iommu_writeq(iommu, queue->qbr, qb); 183 rb = riscv_iommu_readq(iommu, queue->qbr); 184 if (rb != qb) { 185 dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); 186 return -ENODEV; 187 } 188 189 /* Update actual queue mask */ 190 queue->mask = (2U << logsz) - 1; 191 192 dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", 193 queue->qid, logsz + 1); 194 195 return 0; 196 } 197 198 /* Check interrupt queue status, IPSR */ 199 static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) 200 { 201 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; 202 203 if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) 204 return IRQ_WAKE_THREAD; 205 206 return IRQ_NONE; 207 } 208 209 static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) 210 { 211 /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. 
*/ 212 return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; 213 } 214 215 /* 216 * Enable queue processing in the hardware, register interrupt handler. 217 * 218 * @queue - data structure, already allocated with riscv_iommu_queue_alloc() 219 * @irq_handler - threaded interrupt handler. 220 */ 221 static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, 222 struct riscv_iommu_queue *queue, 223 irq_handler_t irq_handler) 224 { 225 const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; 226 u32 csr; 227 int rc; 228 229 if (queue->iommu) 230 return -EBUSY; 231 232 /* Polling not implemented */ 233 if (!irq) 234 return -ENODEV; 235 236 queue->iommu = iommu; 237 rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, 238 IRQF_ONESHOT | IRQF_SHARED, 239 dev_name(iommu->dev), queue); 240 if (rc) { 241 queue->iommu = NULL; 242 return rc; 243 } 244 245 /* Empty queue before enabling it */ 246 if (queue->qid == RISCV_IOMMU_INTR_CQ) 247 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0); 248 else 249 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0); 250 251 /* 252 * Enable queue with interrupts, clear any memory fault if any. 253 * Wait for the hardware to acknowledge request and activate queue 254 * processing. 255 * Note: All CSR bitfields are in the same offsets for all queues. 256 */ 257 riscv_iommu_writel(iommu, queue->qcr, 258 RISCV_IOMMU_QUEUE_ENABLE | 259 RISCV_IOMMU_QUEUE_INTR_ENABLE | 260 RISCV_IOMMU_QUEUE_MEM_FAULT); 261 262 riscv_iommu_readl_timeout(iommu, queue->qcr, 263 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), 264 10, RISCV_IOMMU_QCSR_TIMEOUT); 265 266 if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | 267 RISCV_IOMMU_QUEUE_BUSY | 268 RISCV_IOMMU_QUEUE_MEM_FAULT))) { 269 /* Best effort to stop and disable failing hardware queue. 
*/ 270 riscv_iommu_writel(iommu, queue->qcr, 0); 271 free_irq(irq, queue); 272 queue->iommu = NULL; 273 dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid); 274 return -EBUSY; 275 } 276 277 /* Clear any pending interrupt flag. */ 278 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); 279 280 return 0; 281 } 282 283 /* 284 * Disable queue. Wait for the hardware to acknowledge request and 285 * stop processing enqueued requests. Report errors but continue. 286 */ 287 static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue) 288 { 289 struct riscv_iommu_device *iommu = queue->iommu; 290 u32 csr; 291 292 if (!iommu) 293 return; 294 295 free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue); 296 riscv_iommu_writel(iommu, queue->qcr, 0); 297 riscv_iommu_readl_timeout(iommu, queue->qcr, 298 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), 299 10, RISCV_IOMMU_QCSR_TIMEOUT); 300 301 if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY)) 302 dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n", 303 queue->qid, csr); 304 305 queue->iommu = NULL; 306 } 307 308 /* 309 * Returns number of available valid queue entries and the first item index. 310 * Update shadow producer index if necessary. 311 */ 312 static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue, 313 unsigned int *index) 314 { 315 unsigned int head = atomic_read(&queue->head); 316 unsigned int tail = atomic_read(&queue->tail); 317 unsigned int last = Q_ITEM(queue, tail); 318 int available = (int)(tail - head); 319 320 *index = head; 321 322 if (available > 0) 323 return available; 324 325 /* read hardware producer index, check reserved register bits are not set. 
*/ 326 if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue), 327 tail, (tail & ~queue->mask) == 0, 328 0, RISCV_IOMMU_QUEUE_TIMEOUT)) { 329 dev_err_once(queue->iommu->dev, 330 "Hardware error: queue access timeout\n"); 331 return 0; 332 } 333 334 if (tail == last) 335 return 0; 336 337 /* update shadow producer index */ 338 return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head); 339 } 340 341 /* 342 * Release processed queue entries, should match riscv_iommu_queue_consume() calls. 343 */ 344 static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count) 345 { 346 const unsigned int head = atomic_add_return(count, &queue->head); 347 348 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head)); 349 } 350 351 /* Return actual consumer index based on hardware reported queue head index. */ 352 static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue) 353 { 354 const unsigned int cons = atomic_read(&queue->head); 355 const unsigned int last = Q_ITEM(queue, cons); 356 unsigned int head; 357 358 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, 359 !(head & ~queue->mask), 360 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 361 return cons; 362 363 return cons + ((head - last) & queue->mask); 364 } 365 366 /* Wait for submitted item to be processed. 
*/ 367 static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue, 368 unsigned int index, 369 unsigned int timeout_us) 370 { 371 unsigned int cons = atomic_read(&queue->head); 372 unsigned int flags = RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | 373 RISCV_IOMMU_CQCSR_CMD_ILL; 374 375 /* Already processed by the consumer */ 376 if ((int)(cons - index) > 0) 377 return 0; 378 379 /* Monitor consumer index */ 380 return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, 381 (riscv_iommu_readl(queue->iommu, queue->qcr) & flags) || 382 (int)(cons - index) > 0, 0, timeout_us); 383 } 384 385 /* Enqueue an entry and wait to be processed if timeout_us > 0 386 * 387 * Error handling for IOMMU hardware not responding in reasonable time 388 * will be added as separate patch series along with other RAS features. 389 * For now, only report hardware failure and continue. 390 */ 391 static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue, 392 void *entry, size_t entry_size) 393 { 394 unsigned int prod; 395 unsigned int head; 396 unsigned int tail; 397 unsigned long flags; 398 399 /* Do not preempt submission flow. */ 400 local_irq_save(flags); 401 402 /* 1. Allocate some space in the queue */ 403 prod = atomic_inc_return(&queue->prod) - 1; 404 head = atomic_read(&queue->head); 405 406 /* 2. Wait for space availability. */ 407 if ((prod - head) > queue->mask) { 408 if (readx_poll_timeout(atomic_read, &queue->head, 409 head, (prod - head) < queue->mask, 410 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 411 goto err_busy; 412 } else if ((prod - head) == queue->mask) { 413 const unsigned int last = Q_ITEM(queue, head); 414 415 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, 416 !(head & ~queue->mask) && head != last, 417 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 418 goto err_busy; 419 atomic_add((head - last) & queue->mask, &queue->head); 420 } 421 422 /* 3. 
Store entry in the ring buffer */ 423 memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size); 424 425 /* 4. Wait for all previous entries to be ready */ 426 if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail, 427 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 428 goto err_busy; 429 430 /* 431 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is 432 * completed and visible before signaling the tail doorbell to fetch 433 * the next command. 'fence ow, ow' 434 */ 435 dma_wmb(); 436 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1)); 437 438 /* 439 * 6. Make sure the doorbell write to the device has finished before updating 440 * the shadow tail index in normal memory. 'fence o, w' 441 */ 442 #ifdef CONFIG_MMIOWB 443 mmiowb(); 444 #endif 445 atomic_inc(&queue->tail); 446 447 /* 7. Complete submission and restore local interrupts */ 448 local_irq_restore(flags); 449 450 return prod; 451 452 err_busy: 453 local_irq_restore(flags); 454 dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n"); 455 456 return prod; 457 } 458 459 /* 460 * IOMMU Command queue chapter 3.1 461 */ 462 463 /* Command queue interrupt handler thread function */ 464 static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) 465 { 466 const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; 467 unsigned int ctrl; 468 469 /* Clear MF/CQ errors, complete error recovery to be implemented. 
*/ 470 ctrl = riscv_iommu_readl(queue->iommu, queue->qcr); 471 if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | 472 RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) { 473 riscv_iommu_writel(queue->iommu, queue->qcr, ctrl); 474 dev_warn(queue->iommu->dev, 475 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n", 476 queue->qid, 477 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF), 478 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO), 479 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL), 480 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP)); 481 } 482 483 /* Placeholder for command queue interrupt notifiers */ 484 485 /* Clear command interrupt pending. */ 486 riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); 487 488 return IRQ_HANDLED; 489 } 490 491 /* Send command to the IOMMU command queue */ 492 static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, 493 struct riscv_iommu_command *cmd) 494 { 495 riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); 496 } 497 498 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */ 499 static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, 500 unsigned int timeout_us) 501 { 502 struct riscv_iommu_command cmd; 503 unsigned int prod; 504 505 riscv_iommu_cmd_iofence(&cmd); 506 prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd)); 507 508 if (!timeout_us) 509 return; 510 511 if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us)) 512 dev_err_once(iommu->dev, 513 "Hardware error: command execution timeout\n"); 514 } 515 516 /* 517 * IOMMU Fault/Event queue chapter 3.2 518 */ 519 520 static void riscv_iommu_fault(struct riscv_iommu_device *iommu, 521 struct riscv_iommu_fq_record *event) 522 { 523 unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr); 524 unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr); 525 526 /* Placeholder for future fault handling implementation, report only. 
*/ 527 if (err) 528 dev_warn_ratelimited(iommu->dev, 529 "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n", 530 err, devid, event->iotval, event->iotval2); 531 } 532 533 /* Fault queue interrupt handler thread function */ 534 static irqreturn_t riscv_iommu_fltq_process(int irq, void *data) 535 { 536 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; 537 struct riscv_iommu_device *iommu = queue->iommu; 538 struct riscv_iommu_fq_record *events; 539 unsigned int ctrl, idx; 540 int cnt, len; 541 542 events = (struct riscv_iommu_fq_record *)queue->base; 543 544 /* Clear fault interrupt pending and process all received fault events. */ 545 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); 546 547 do { 548 cnt = riscv_iommu_queue_consume(queue, &idx); 549 for (len = 0; len < cnt; idx++, len++) 550 riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]); 551 riscv_iommu_queue_release(queue, cnt); 552 } while (cnt > 0); 553 554 /* Clear MF/OF errors, complete error recovery to be implemented. */ 555 ctrl = riscv_iommu_readl(iommu, queue->qcr); 556 if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) { 557 riscv_iommu_writel(iommu, queue->qcr, ctrl); 558 dev_warn(iommu->dev, 559 "Queue #%u error; memory fault:%d overflow:%d\n", 560 queue->qid, 561 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF), 562 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF)); 563 } 564 565 return IRQ_HANDLED; 566 } 567 568 /* Lookup and initialize device context info structure. 
*/ 569 static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, 570 unsigned int devid) 571 { 572 const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT); 573 unsigned int depth; 574 unsigned long ddt, old, new; 575 void *ptr; 576 u8 ddi_bits[3] = { 0 }; 577 u64 *ddtp = NULL; 578 579 /* Make sure the mode is valid */ 580 if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL || 581 iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL) 582 return NULL; 583 584 /* 585 * Device id partitioning for base format: 586 * DDI[0]: bits 0 - 6 (1st level) (7 bits) 587 * DDI[1]: bits 7 - 15 (2nd level) (9 bits) 588 * DDI[2]: bits 16 - 23 (3rd level) (8 bits) 589 * 590 * For extended format: 591 * DDI[0]: bits 0 - 5 (1st level) (6 bits) 592 * DDI[1]: bits 6 - 14 (2nd level) (9 bits) 593 * DDI[2]: bits 15 - 23 (3rd level) (9 bits) 594 */ 595 if (base_format) { 596 ddi_bits[0] = 7; 597 ddi_bits[1] = 7 + 9; 598 ddi_bits[2] = 7 + 9 + 8; 599 } else { 600 ddi_bits[0] = 6; 601 ddi_bits[1] = 6 + 9; 602 ddi_bits[2] = 6 + 9 + 9; 603 } 604 605 /* Make sure device id is within range */ 606 depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL; 607 if (devid >= (1 << ddi_bits[depth])) 608 return NULL; 609 610 /* Get to the level of the non-leaf node that holds the device context */ 611 for (ddtp = iommu->ddt_root; depth-- > 0;) { 612 const int split = ddi_bits[depth]; 613 /* 614 * Each non-leaf node is 64bits wide and on each level 615 * nodes are indexed by DDI[depth]. 616 */ 617 ddtp += (devid >> split) & 0x1FF; 618 619 /* 620 * Check if this node has been populated and if not 621 * allocate a new level and populate it. 
622 */ 623 do { 624 ddt = READ_ONCE(*(unsigned long *)ddtp); 625 if (ddt & RISCV_IOMMU_DDTE_V) { 626 ddtp = __va(ppn_to_phys(ddt)); 627 break; 628 } 629 630 ptr = riscv_iommu_get_pages(iommu, SZ_4K); 631 if (!ptr) 632 return NULL; 633 634 new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; 635 old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); 636 637 if (old == ddt) { 638 ddtp = (u64 *)ptr; 639 break; 640 } 641 642 /* Race setting DDT detected, re-read and retry. */ 643 riscv_iommu_free_pages(iommu, ptr); 644 } while (1); 645 } 646 647 /* 648 * Grab the node that matches DDI[depth], note that when using base 649 * format the device context is 4 * 64bits, and the extended format 650 * is 8 * 64bits, hence the (3 - base_format) below. 651 */ 652 ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format); 653 654 return (struct riscv_iommu_dc *)ddtp; 655 } 656 657 /* 658 * This is best effort IOMMU translation shutdown flow. 659 * Disable IOMMU without waiting for hardware response. 660 */ 661 void riscv_iommu_disable(struct riscv_iommu_device *iommu) 662 { 663 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 664 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, 665 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)); 666 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0); 667 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0); 668 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0); 669 } 670 671 #define riscv_iommu_read_ddtp(iommu) ({ \ 672 u64 ddtp; \ 673 riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \ 674 !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \ 675 RISCV_IOMMU_DDTP_TIMEOUT); \ 676 ddtp; }) 677 678 static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) 679 { 680 u64 ddtp; 681 unsigned int mode; 682 683 ddtp = riscv_iommu_read_ddtp(iommu); 684 if (ddtp & RISCV_IOMMU_DDTP_BUSY) 685 return -EBUSY; 686 687 /* 688 * It is optional for the hardware to report a fixed address for device 689 * directory root page when DDT.MODE is OFF or BARE. 
690 */ 691 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); 692 if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE || 693 mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) { 694 /* Use WARL to discover hardware fixed DDT PPN */ 695 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 696 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode)); 697 ddtp = riscv_iommu_read_ddtp(iommu); 698 if (ddtp & RISCV_IOMMU_DDTP_BUSY) 699 return -EBUSY; 700 701 iommu->ddt_phys = ppn_to_phys(ddtp); 702 if (iommu->ddt_phys) 703 iommu->ddt_root = devm_ioremap(iommu->dev, 704 iommu->ddt_phys, PAGE_SIZE); 705 if (iommu->ddt_root) 706 memset(iommu->ddt_root, 0, PAGE_SIZE); 707 } 708 709 if (!iommu->ddt_root) { 710 iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K); 711 iommu->ddt_phys = __pa(iommu->ddt_root); 712 } 713 714 if (!iommu->ddt_root) 715 return -ENOMEM; 716 717 return 0; 718 } 719 720 /* 721 * Discover supported DDT modes starting from requested value, 722 * configure DDTP register with accepted mode and root DDT address. 723 * Accepted iommu->ddt_mode is updated on success. 724 */ 725 static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, 726 unsigned int ddtp_mode) 727 { 728 struct device *dev = iommu->dev; 729 u64 ddtp, rq_ddtp; 730 unsigned int mode, rq_mode = ddtp_mode; 731 struct riscv_iommu_command cmd; 732 733 ddtp = riscv_iommu_read_ddtp(iommu); 734 if (ddtp & RISCV_IOMMU_DDTP_BUSY) 735 return -EBUSY; 736 737 /* Disallow state transition from xLVL to xLVL. 
*/ 738 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); 739 if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && 740 mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF && 741 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && 742 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) 743 return -EINVAL; 744 745 do { 746 rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); 747 if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) 748 rq_ddtp |= phys_to_ppn(iommu->ddt_phys); 749 750 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); 751 ddtp = riscv_iommu_read_ddtp(iommu); 752 if (ddtp & RISCV_IOMMU_DDTP_BUSY) { 753 dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n", 754 rq_mode, ddtp); 755 return -EBUSY; 756 } 757 758 /* Verify IOMMU hardware accepts new DDTP config. */ 759 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); 760 761 if (rq_mode == mode) 762 break; 763 764 /* Hardware mandatory DDTP mode has not been accepted. */ 765 if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) { 766 dev_err(dev, "DDTP update failed hw: %llx vs %llx\n", 767 ddtp, rq_ddtp); 768 return -EINVAL; 769 } 770 771 /* 772 * Mode field is WARL, an IOMMU may support a subset of 773 * directory table levels in which case if we tried to set 774 * an unsupported number of levels we'll readback either 775 * a valid xLVL or off/bare. If we got off/bare, try again 776 * with a smaller xLVL. 777 */ 778 if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && 779 rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) { 780 dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode); 781 rq_mode--; 782 continue; 783 } 784 785 /* 786 * We tried all supported modes and IOMMU hardware failed to 787 * accept new settings, something went very wrong since off/bare 788 * and at least one xLVL must be supported. 
789 */ 790 dev_err(dev, "DDTP hw mode %u, failed to set %u\n", 791 mode, ddtp_mode); 792 return -EINVAL; 793 } while (1); 794 795 iommu->ddt_mode = mode; 796 if (mode != ddtp_mode) 797 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); 798 799 /* Invalidate device context cache */ 800 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 801 riscv_iommu_cmd_send(iommu, &cmd); 802 803 /* Invalidate address translation cache */ 804 riscv_iommu_cmd_inval_vma(&cmd); 805 riscv_iommu_cmd_send(iommu, &cmd); 806 807 /* IOFENCE.C */ 808 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 809 810 return 0; 811 } 812 813 /* This struct contains protection domain specific IOMMU driver data. */ 814 struct riscv_iommu_domain { 815 union { 816 struct iommu_domain domain; 817 struct pt_iommu_riscv_64 riscvpt; 818 }; 819 struct list_head bonds; 820 spinlock_t lock; /* protect bonds list updates. */ 821 int pscid; 822 }; 823 PT_IOMMU_CHECK_DOMAIN(struct riscv_iommu_domain, riscvpt.iommu, domain); 824 825 #define iommu_domain_to_riscv(iommu_domain) \ 826 container_of(iommu_domain, struct riscv_iommu_domain, domain) 827 828 /* Private IOMMU data for managed devices, dev_iommu_priv_* */ 829 struct riscv_iommu_info { 830 struct riscv_iommu_domain *domain; 831 }; 832 833 /* 834 * Linkage between an iommu_domain and attached devices. 835 * 836 * Protection domain requiring IOATC and DevATC translation cache invalidations, 837 * should be linked to attached devices using a riscv_iommu_bond structure. 838 * Devices should be linked to the domain before first use and unlinked after 839 * the translations from the referenced protection domain can no longer be used. 840 * Blocking and identity domains are not tracked here, as the IOMMU hardware 841 * does not cache negative and/or identity (BARE mode) translations, and DevATC 842 * is disabled for those protection domains. 
843 * 844 * The device pointer and IOMMU data remain stable in the bond struct after 845 * _probe_device() where it's attached to the managed IOMMU, up to the 846 * completion of the _release_device() call. The release of the bond structure 847 * is synchronized with the device release. 848 */ 849 struct riscv_iommu_bond { 850 struct list_head list; 851 struct rcu_head rcu; 852 struct device *dev; 853 }; 854 855 static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, 856 struct device *dev) 857 { 858 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 859 struct riscv_iommu_bond *bond; 860 struct list_head *bonds; 861 862 bond = kzalloc_obj(*bond); 863 if (!bond) 864 return -ENOMEM; 865 bond->dev = dev; 866 867 /* 868 * List of devices attached to the domain is arranged based on 869 * managed IOMMU device. 870 */ 871 872 spin_lock(&domain->lock); 873 list_for_each(bonds, &domain->bonds) 874 if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu) 875 break; 876 list_add_rcu(&bond->list, bonds); 877 spin_unlock(&domain->lock); 878 879 /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */ 880 smp_mb(); 881 882 return 0; 883 } 884 885 static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, 886 struct device *dev) 887 { 888 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 889 struct riscv_iommu_bond *bond, *found = NULL; 890 struct riscv_iommu_command cmd; 891 int count = 0; 892 893 if (!domain) 894 return; 895 896 spin_lock(&domain->lock); 897 list_for_each_entry(bond, &domain->bonds, list) { 898 if (found && count) 899 break; 900 else if (bond->dev == dev) 901 found = bond; 902 else if (dev_to_iommu(bond->dev) == iommu) 903 count++; 904 } 905 if (found) 906 list_del_rcu(&found->list); 907 spin_unlock(&domain->lock); 908 kfree_rcu(found, rcu); 909 910 /* 911 * If this was the last bond between this domain and the IOMMU 912 * invalidate all cached entries for domain's PSCID. 
*/
	if (!count) {
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		riscv_iommu_cmd_send(iommu, &cmd);

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
	}
}

/*
 * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)

/*
 * Invalidate IOTLB entries tagged with the domain's PSCID on every IOMMU
 * that has a device bonded to @domain.
 *
 * domain: protection domain whose bond list and PSCID are used.
 * start:  first IOVA of the range to invalidate.
 * end:    upper bound of the range; callers in this file pass ULONG_MAX
 *         together with start == 0 to request a whole-address-space flush.
 *
 * When (end - start) is below RISCV_IOMMU_IOTLB_INVAL_LIMIT - 1, one
 * address-specific command is sent per PAGE_SIZE step; otherwise a single
 * command without an address is sent for the PSCID. After all commands
 * have been queued, a second pass over the bond list waits for completion
 * on each IOMMU with riscv_iommu_cmd_sync().
 */
static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
				    unsigned long start, unsigned long end)
{
	struct riscv_iommu_bond *bond;
	struct riscv_iommu_device *iommu, *prev;
	struct riscv_iommu_command cmd;

	/*
	 * For each IOMMU linked with this protection domain (via bonds->dev),
	 * an IOTLB invalidation command will be submitted and executed.
	 *
	 * Possible race with domain attach flow is handled by sequencing
	 * bond creation - riscv_iommu_bond_link(), and device directory
	 * update - riscv_iommu_iodir_update().
	 *
	 * PTE Update / IOTLB Inval           Device attach & directory update
	 * --------------------------         --------------------------------
	 * update page table entries          add dev to the bond list
	 * FENCE RW,RW                        FENCE RW,RW
	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
	 *   FENCE IOW,IOW                      FENCE IOW,IOW
	 *   IOTLB.INVAL                        IODIR.INVAL
	 *   IOFENCE.C
	 *
	 * If bond list is not updated with new device, directory context will
	 * be configured with already valid page table content. If an IOMMU is
	 * linked to the protection domain it will receive invalidation
	 * requests for updated page table entries.
	 */
	smp_mb();

	rcu_read_lock();

	/* First pass: queue the invalidation commands on each distinct IOMMU. */
	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);

		/*
		 * IOTLB invalidation request can be safely omitted if already sent
		 * to the IOMMU for the same PSCID, and with domain->bonds list
		 * arranged based on the device's IOMMU, it's sufficient to check
		 * last device the invalidation was sent to.
		 */
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		if (end - start < RISCV_IOMMU_IOTLB_INVAL_LIMIT - 1) {
			unsigned long iova = start;

			/*
			 * Small range: one command per page. The loop also
			 * stops if advancing iova would wrap around.
			 */
			do {
				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
				riscv_iommu_cmd_send(iommu, &cmd);
			} while (!check_add_overflow(iova, PAGE_SIZE, &iova) &&
				 iova < end);
		} else {
			/* Large range: invalidate everything for this PSCID. */
			riscv_iommu_cmd_send(iommu, &cmd);
		}
		prev = iommu;
	}

	/* Second pass: wait for the queued commands on each distinct IOMMU. */
	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
		prev = iommu;
	}
	rcu_read_unlock();
}

/* First-stage context value passed as 'fsc' by the blocking/identity attach paths. */
#define RISCV_IOMMU_FSC_BARE 0
/*
 * This function sends IOTINVAL commands as required by the RISC-V
 * IOMMU specification (Section 6.3.1 and 6.3.2 in 1.0 spec version)
 * after modifying DDT or PDT entries
 *
 * iommu:     IOMMU the command is sent to.
 * inval_pdt: true when a process context (PDT entry) was modified; in that
 *            case the PSCID is taken from pc->ta.
 * iohgatp:   DC.iohgatp value; selects the Bare vs. non-Bare branch and
 *            supplies the GSCID in the non-Bare case.
 * dc:        device context; only dereferenced when iohgatp mode is Bare
 *            and inval_pdt is false.
 * pc:        process context; only dereferenced when inval_pdt is true,
 *            so callers passing inval_pdt == false may pass NULL.
 *
 * The command is only queued here; the caller is responsible for the
 * following riscv_iommu_cmd_sync().
 */
static void riscv_iommu_iodir_iotinval(struct riscv_iommu_device *iommu,
				       bool inval_pdt, unsigned long iohgatp,
				       struct riscv_iommu_dc *dc,
				       struct riscv_iommu_pc *pc)
{
	struct riscv_iommu_command cmd;

	riscv_iommu_cmd_inval_vma(&cmd);

	if (FIELD_GET(RISCV_IOMMU_DC_IOHGATP_MODE, iohgatp) ==
	    RISCV_IOMMU_DC_IOHGATP_MODE_BARE) {
		if (inval_pdt) {
			/*
			 * IOTINVAL.VMA with GV=AV=0, and PSCV=1, and
			 * PSCID=PC.PSCID
			 */
			riscv_iommu_cmd_inval_set_pscid(&cmd,
				FIELD_GET(RISCV_IOMMU_PC_TA_PSCID, pc->ta));
		} else {
			if (!FIELD_GET(RISCV_IOMMU_DC_TC_PDTV, dc->tc) &&
			    FIELD_GET(RISCV_IOMMU_DC_FSC_MODE, dc->fsc) !=
			    RISCV_IOMMU_DC_FSC_MODE_BARE) {
				/*
				 * DC.tc.PDTV == 0 && DC.fsc.MODE != Bare
				 * IOTINVAL.VMA with GV=AV=0, and PSCV=1, and
				 * PSCID=DC.ta.PSCID
				 */
				riscv_iommu_cmd_inval_set_pscid(&cmd,
					FIELD_GET(RISCV_IOMMU_DC_TA_PSCID, dc->ta));
			}
			/* else: IOTINVAL.VMA with GV=AV=PSCV=0 */
		}
	} else {
		riscv_iommu_cmd_inval_set_gscid(&cmd,
			FIELD_GET(RISCV_IOMMU_DC_IOHGATP_GSCID, iohgatp));

		if (inval_pdt) {
			/*
			 * IOTINVAL.VMA with GV=1, AV=0, and PSCV=1, and
			 * GSCID=DC.iohgatp.GSCID, PSCID=PC.PSCID
			 */
			riscv_iommu_cmd_inval_set_pscid(&cmd,
				FIELD_GET(RISCV_IOMMU_PC_TA_PSCID, pc->ta));
		}
		/*
		 * else: IOTINVAL.VMA with GV=1,AV=PSCV=0,and
		 * GSCID=DC.iohgatp.GSCID
		 *
		 * IOTINVAL.GVMA with GV=1,AV=0,and
		 * GSCID=DC.iohgatp.GSCID
		 * TODO: For now, the Second-Stage feature has not yet been merged,
		 * also issue IOTINVAL.GVMA once second-stage support is merged.
		 */
	}
	riscv_iommu_cmd_send(iommu, &cmd);
}

/*
 * Update IODIR for the device.
 *
 * During the execution of riscv_iommu_probe_device(), IODIR entries are
 * allocated for the device's identifiers. Device context invalidation
 * becomes necessary only if one of the updated entries was previously
 * marked as valid, given that invalid device context entries are not
 * cached by the IOMMU hardware.
 * In this implementation, updating a valid device context while the
 * device is not quiesced might be disruptive, potentially causing
 * interim translation faults.
 *
 * iommu: IOMMU owning the device directory.
 * dev:   device whose fwspec IDs select the device contexts to update.
 * fsc:   value written to DC.fsc of every context.
 * ta:    translation attributes; the PSCID field is written to DC.ta and
 *        the valid bit is merged into DC.tc.
 *
 * The update runs in two phases: every currently valid context is first
 * marked invalid and flushed, then all contexts are rewritten and flushed
 * again, with a final command-queue sync.
 */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
				     struct device *dev, u64 fsc, u64 ta)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_dc *dc;
	struct riscv_iommu_command cmd;
	bool sync_required = false;
	u64 tc;
	int i;

	/* Phase 1: clear TC.V on valid contexts and invalidate cached copies. */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		if (!(tc & RISCV_IOMMU_DC_TC_V))
			continue;

		WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);

		/* Invalidate device context cached values */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
		/*
		 * For now, the SVA and PASID features have not yet been merged, the
		 * default configuration is inval_pdt=false and pc=NULL.
		 */
		riscv_iommu_iodir_iotinval(iommu, false, dc->iohgatp, dc, NULL);
		sync_required = true;
	}

	/* Only wait if at least one invalidation was queued above. */
	if (sync_required)
		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	/*
	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
	 */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		tc |= ta & RISCV_IOMMU_DC_TC_V;

		WRITE_ONCE(dc->fsc, fsc);
		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
		/* Update device context, write TC.V as the last step. */
		dma_wmb();
		WRITE_ONCE(dc->tc, tc);

		/* Invalidate device context after update */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
		/*
		 * For now, the SVA and PASID features have not yet been merged, the
		 * default configuration is inval_pdt=false and pc=NULL.
		 */
		riscv_iommu_iodir_iotinval(iommu, false, dc->iohgatp, dc, NULL);
	}

	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
}

/*
 * IOVA page translation tree management.
 */

/* .flush_iotlb_all callback: invalidate the whole address space of the domain. */
static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
}

/*
 * .iotlb_sync callback: invalidate the gathered range and release any page
 * table pages queued on the gather freelist.
 *
 * With an empty freelist only [gather->start, gather->end] is invalidated.
 * Otherwise the whole address space is invalidated before the freelist
 * pages are handed back with iommu_put_pages_list().
 */
static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
				   struct iommu_iotlb_gather *gather)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	if (iommu_pages_list_empty(&gather->freelist)) {
		riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
	} else {
		/*
		 * In 1.0 spec version, the smallest scope we can use to
		 * invalidate all levels of page table (i.e. leaf and non-leaf)
		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
		 * This will be updated with hardware support for
		 * capability.NL (non-leaf) IOTINVAL command.
		 */
		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
		iommu_put_pages_list(&gather->freelist);
	}
}

/*
 * .free callback for paging domains. Also used by
 * riscv_iommu_alloc_paging_domain() to unwind a partially set up domain.
 *
 * Warns if devices are still bonded, returns the PSCID to the allocator
 * when one was assigned (value > 0), tears down the page table and frees
 * the domain structure.
 */
static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	WARN_ON(!list_empty(&domain->bonds));

	/* PSCIDs are allocated starting at 1; <= 0 means none was assigned. */
	if ((int)domain->pscid > 0)
		ida_free(&riscv_iommu_pscids, domain->pscid);

	pt_iommu_deinit(&domain->riscvpt.iommu);
	kfree(domain);
}

/*
 * Report whether the IOMMU capabilities include the given first-stage
 * page table mode (Sv39, Sv48 or Sv57). Any other mode is unsupported.
 */
static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
{
	switch (pgd_mode) {
	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;

	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;

	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
	}
	return false;
}

/*
 * .attach_dev callback for paging domains.
 *
 * iommu_domain: paging domain to attach.
 * dev:          device being attached.
 * old:          previously attached domain; not used here, the previous
 *               driver domain is taken from the per-device info instead.
 *
 * Returns 0 on success, -ENODEV if the IOMMU serving @dev does not support
 * the domain's page table mode, or -ENOMEM if the bond cannot be created.
 */
static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
					    struct device *dev,
					    struct iommu_domain *old)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
	struct pt_iommu_riscv_64_hw_info pt_info;
	u64 fsc, ta;

	pt_iommu_riscv_64_hw_info(&domain->riscvpt, &pt_info);

	if (!riscv_iommu_pt_supported(iommu, pt_info.fsc_iosatp_mode))
		return -ENODEV;

	/* Build the first-stage context (mode + root PPN) and attributes. */
	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, pt_info.fsc_iosatp_mode) |
	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, pt_info.ppn);
	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
	     RISCV_IOMMU_PC_TA_V;

	/*
	 * Link the bond before updating the device directory, then drop the
	 * bond to the previously attached domain.
	 */
	if (riscv_iommu_bond_link(domain, dev))
		return -ENOMEM;

	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = domain;

	return 0;
}

/* Domain operations for paging domains backed by the generic page table. */
static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
	IOMMU_PT_DOMAIN_OPS(riscv_64),
	.attach_dev = riscv_iommu_attach_paging_domain,
	.free = riscv_iommu_free_paging_domain,
	.iotlb_sync = riscv_iommu_iotlb_sync,
	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
};

/*
 * .domain_alloc_paging callback: allocate a paging domain for @dev.
 *
 * The virtual address size is the largest of Sv57/Sv48/Sv39 advertised in
 * the IOMMU capabilities. A PSCID in [1, RISCV_IOMMU_MAX_PSCID] is
 * allocated for the domain and the page table is initialized.
 *
 * Returns the new domain, or an ERR_PTR: -ENODEV when no page table mode
 * is supported, -ENOMEM on allocation or PSCID failure, or the error
 * returned by pt_iommu_riscv_64_init().
 */
static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
{
	struct pt_iommu_riscv_64_cfg cfg = {};
	struct riscv_iommu_domain *domain;
	struct riscv_iommu_device *iommu;
	int ret;

	iommu = dev_to_iommu(dev);
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
		cfg.common.hw_max_vasz_lg2 = 57;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
		cfg.common.hw_max_vasz_lg2 = 48;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
		cfg.common.hw_max_vasz_lg2 = 39;
	} else {
		dev_err(dev, "cannot find supported page table mode\n");
		return ERR_PTR(-ENODEV);
	}
	cfg.common.hw_max_oasz_lg2 = 56;

	domain = kzalloc_obj(*domain);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD_RCU(&domain->bonds);
	spin_lock_init(&domain->lock);
	/*
	 * 6.4 IOMMU capabilities [..] IOMMU implementations must support the
	 * Svnapot standard extension for NAPOT Translation Contiguity.
	 */
	cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
			      BIT(PT_FEAT_FLUSH_RANGE) |
			      BIT(PT_FEAT_RISCV_SVNAPOT_64K);
	domain->riscvpt.iommu.nid = dev_to_node(iommu->dev);
	domain->domain.ops = &riscv_iommu_paging_domain_ops;

	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
					RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
	if (domain->pscid < 0) {
		/* The free path skips ida_free() for a non-positive PSCID. */
		riscv_iommu_free_paging_domain(&domain->domain);
		return ERR_PTR(-ENOMEM);
	}

	ret = pt_iommu_riscv_64_init(&domain->riscvpt, &cfg, GFP_KERNEL);
	if (ret) {
		riscv_iommu_free_paging_domain(&domain->domain);
		return ERR_PTR(ret);
	}
	return &domain->domain;
}

/*
 * .attach_dev callback for the blocking domain: invalidate the device
 * contexts and drop the bond to the previously attached domain.
 * The iommu_domain and old arguments are not used. Always returns 0.
 */
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
					      struct device *dev,
					      struct iommu_domain *old)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	/* Make device context invalid, translation requests will fault w/ #258 */
	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

/* Static blocking domain; also installed as the release domain in riscv_iommu_ops. */
static struct iommu_domain riscv_iommu_blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_blocking_domain,
	}
};

/*
 * .attach_dev callback for the identity domain: program the device
 * contexts with a Bare first-stage context and the valid bit set, then
 * drop the bond to the previously attached domain.
 * The iommu_domain and old arguments are not used. Always returns 0.
 */
static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
					      struct device *dev,
					      struct iommu_domain *old)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

/* Static identity (pass-through) domain. */
static struct iommu_domain riscv_iommu_identity_domain = {
	.type = IOMMU_DOMAIN_IDENTITY,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_identity_domain,
	}
};

/* .device_group callback: PCI devices use PCI grouping, others the generic one. */
static struct iommu_group *riscv_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

/* .of_xlate callback: record a single ID taken from the phandle arguments. */
static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
{
	return iommu_fwspec_add_ids(dev, args->args, 1);
}

/*
 * .probe_device callback.
 *
 * Looks up the IOMMU instance from the device's fwspec, allocates the
 * per-device info and pre-configures (as not valid) a device context for
 * every fwspec ID.
 *
 * Returns the iommu_device on success, ERR_PTR(-ENODEV) when the device has
 * no usable fwspec, the IOMMU has no driver data, the IOMMU operates in
 * Bare (or lower) mode, or a device context cannot be obtained, and
 * ERR_PTR(-ENOMEM) when the info allocation fails.
 */
static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_device *iommu;
	struct riscv_iommu_info *info;
	struct riscv_iommu_dc *dc;
	u64 tc;
	int i;

	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
		return ERR_PTR(-ENODEV);

	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	/*
	 * IOMMU hardware operating in fail-over BARE mode will provide
	 * identity translation for all connected devices anyway...
	 */
	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
		return ERR_PTR(-ENODEV);

	info = kzalloc_obj(*info);
	if (!info)
		return ERR_PTR(-ENOMEM);
	/*
	 * Allocate and pre-configure device context entries in
	 * the device directory. Do not mark the context valid yet.
	 */
	tc = 0;
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		if (!dc) {
			kfree(info);
			return ERR_PTR(-ENODEV);
		}
		/* A valid entry is unexpected here; warn, then overwrite it. */
		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
			dev_warn(dev, "already attached to IOMMU device directory\n");
		WRITE_ONCE(dc->tc, tc);
	}

	dev_iommu_priv_set(dev, info);

	return &iommu->iommu;
}

/*
 * .release_device callback: free the per-device info set up by
 * riscv_iommu_probe_device().
 * NOTE(review): the RCU-deferred free presumably protects readers that
 * reach the info under rcu_read_lock() - confirm against the bond code.
 */
static void riscv_iommu_release_device(struct device *dev)
{
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	kfree_rcu_mightsleep(info);
}

/* IOMMU core operations registered by riscv_iommu_init(). */
static const struct iommu_ops riscv_iommu_ops = {
	.of_xlate = riscv_iommu_of_xlate,
	.identity_domain = &riscv_iommu_identity_domain,
	.blocked_domain = &riscv_iommu_blocking_domain,
	.release_domain = &riscv_iommu_blocking_domain,
	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
	.device_group = riscv_iommu_device_group,
	.probe_device = riscv_iommu_probe_device,
	.release_device = riscv_iommu_release_device,
};

/*
 * Validate and normalize the hardware state before initialization.
 *
 * Checks that the device directory is idle and translation is off (or
 * disables it in a kdump kernel), switches the in-memory data structure
 * endianness to match the CPU when needed, and programs the interrupt
 * vector assignment.
 *
 * Returns 0 on success, -EBUSY if DDTP is busy or translation is enabled
 * outside of a kdump kernel, and -EINVAL if the required endianness cannot
 * be configured, no interrupt is available, or the vector assignment read
 * back from hardware is out of range.
 */
static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
{
	u64 ddtp;

	/*
	 * Make sure the IOMMU is switched off or in pass-through mode during
	 * regular boot flow and disable translation when we boot into a kexec
	 * kernel and the previous kernel left them enabled.
	 */
	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
	     RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
		if (!is_kdump_kernel())
			return -EBUSY;
		riscv_iommu_disable(iommu);
	}

	/* Configure accesses to in-memory data structures for CPU-native byte order. */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
			return -EINVAL;
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
		/* Read back to verify that the hardware accepted the change. */
		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
			return -EINVAL;
	}

	/*
	 * Distribute interrupt vectors, always use first vector for CIV.
	 * At least one interrupt is required. Read back and verify.
	 */
	if (!iommu->irqs_count)
		return -EINVAL;

	iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
	iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
	if (max3(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
		 FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec),
		 max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
		     FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
		return -EINVAL;

	return 0;
}

/*
 * Tear down an IOMMU instance set up by riscv_iommu_init(): unregister it
 * from the IOMMU core and sysfs, switch the device directory off and
 * disable the command and fault queues.
 */
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
{
	iommu_device_unregister(&iommu->iommu);
	iommu_device_sysfs_remove(&iommu->iommu);
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
	riscv_iommu_queue_disable(&iommu->cmdq);
	riscv_iommu_queue_disable(&iommu->fltq);
}

/*
 * Initialize an IOMMU instance and register it with the IOMMU core.
 *
 * Steps, in order: hardware state check, device directory allocation,
 * command/fault queue allocation and enabling, device directory mode
 * selection, sysfs registration, RIMT registration (only when ACPI is
 * enabled) and IOMMU core registration.
 *
 * Returns 0 on success or a negative errno. Failures after the command
 * queue has been enabled unwind the steps already taken through the error
 * labels at the end of the function.
 */
int riscv_iommu_init(struct riscv_iommu_device *iommu)
{
	int rc;

	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);

	rc = riscv_iommu_init_check(iommu);
	if (rc)
		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");

	rc = riscv_iommu_iodir_alloc(iommu);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
				     sizeof(struct riscv_iommu_command));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
				     sizeof(struct riscv_iommu_fq_record));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
	if (rc)
		goto err_queue_disable;

	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
	if (rc)
		goto err_queue_disable;

	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
				    dev_name(iommu->dev));
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
		goto err_iodir_off;
	}

	if (!acpi_disabled) {
		rc = rimt_iommu_register(iommu->dev);
		if (rc) {
			dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n");
			goto err_remove_sysfs;
		}
	}

	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
		goto err_remove_sysfs;
	}

	return 0;

	/* Error unwinding, in reverse order of the setup steps above. */
err_remove_sysfs:
	iommu_device_sysfs_remove(&iommu->iommu);
err_iodir_off:
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
err_queue_disable:
	riscv_iommu_queue_disable(&iommu->fltq);
	riscv_iommu_queue_disable(&iommu->cmdq);
	return rc;
}

MODULE_IMPORT_NS("GENERIC_PT_IOMMU");