1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * IOMMU API for RISC-V IOMMU implementations. 4 * 5 * Copyright © 2022-2024 Rivos Inc. 6 * Copyright © 2023 FORTH-ICS/CARV 7 * 8 * Authors 9 * Tomasz Jeznach <tjeznach@rivosinc.com> 10 * Nick Kossifidis <mick@ics.forth.gr> 11 */ 12 13 #define pr_fmt(fmt) "riscv-iommu: " fmt 14 15 #include <linux/compiler.h> 16 #include <linux/crash_dump.h> 17 #include <linux/init.h> 18 #include <linux/iommu.h> 19 #include <linux/iopoll.h> 20 #include <linux/kernel.h> 21 #include <linux/pci.h> 22 23 #include "../iommu-pages.h" 24 #include "iommu-bits.h" 25 #include "iommu.h" 26 27 /* Timeouts in [us] */ 28 #define RISCV_IOMMU_QCSR_TIMEOUT 150000 29 #define RISCV_IOMMU_QUEUE_TIMEOUT 150000 30 #define RISCV_IOMMU_DDTP_TIMEOUT 10000000 31 #define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 32 33 /* Number of entries per CMD/FLT queue, should be <= INT_MAX */ 34 #define RISCV_IOMMU_DEF_CQ_COUNT 8192 35 #define RISCV_IOMMU_DEF_FQ_COUNT 4096 36 37 /* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ 38 #define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) 39 #define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) 40 41 #define dev_to_iommu(dev) \ 42 iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu) 43 44 /* IOMMU PSCID allocation namespace. */ 45 static DEFINE_IDA(riscv_iommu_pscids); 46 #define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) 47 48 /* Device resource-managed allocations */ 49 struct riscv_iommu_devres { 50 void *addr; 51 }; 52 53 static void riscv_iommu_devres_pages_release(struct device *dev, void *res) 54 { 55 struct riscv_iommu_devres *devres = res; 56 57 iommu_free_pages(devres->addr); 58 } 59 60 static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p) 61 { 62 struct riscv_iommu_devres *devres = res; 63 struct riscv_iommu_devres *target = p; 64 65 return devres->addr == target->addr; 66 } 67 68 static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, 69 unsigned int size) 70 { 71 struct riscv_iommu_devres *devres; 72 void *addr; 73 74 addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev), 75 GFP_KERNEL_ACCOUNT, size); 76 if (unlikely(!addr)) 77 return NULL; 78 79 devres = devres_alloc(riscv_iommu_devres_pages_release, 80 sizeof(struct riscv_iommu_devres), GFP_KERNEL); 81 82 if (unlikely(!devres)) { 83 iommu_free_pages(addr); 84 return NULL; 85 } 86 87 devres->addr = addr; 88 89 devres_add(iommu->dev, devres); 90 91 return addr; 92 } 93 94 static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) 95 { 96 struct riscv_iommu_devres devres = { .addr = addr }; 97 98 devres_release(iommu->dev, riscv_iommu_devres_pages_release, 99 riscv_iommu_devres_pages_match, &devres); 100 } 101 102 /* 103 * Hardware queue allocation and management. 104 */ 105 106 /* Setup queue base, control registers and default queue length */ 107 #define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ 108 struct riscv_iommu_queue *_q = q; \ 109 _q->qid = RISCV_IOMMU_INTR_ ## name; \ 110 _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \ 111 _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \ 112 _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\ 113 } while (0) 114 115 /* Note: offsets are the same for all queues */ 116 #define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB)) 117 #define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB)) 118 #define Q_ITEM(q, index) ((q)->mask & (index)) 119 #define Q_IPSR(q) BIT((q)->qid) 120 121 /* 122 * Discover queue ring buffer hardware configuration, allocate in-memory 123 * ring buffer or use fixed I/O memory location, configure queue base register. 124 * Must be called before hardware queue is enabled. 125 * 126 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT() 127 * @entry_size - queue single element size in bytes. 128 */ 129 static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, 130 struct riscv_iommu_queue *queue, 131 size_t entry_size) 132 { 133 unsigned int logsz; 134 u64 qb, rb; 135 136 /* 137 * Use WARL base register property to discover maximum allowed 138 * number of entries and optional fixed IO address for queue location. 139 */ 140 riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD); 141 qb = riscv_iommu_readq(iommu, queue->qbr); 142 143 /* 144 * Calculate and verify hardware supported queue length, as reported 145 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1). 146 * Update queue size based on hardware supported value. 147 */ 148 logsz = ilog2(queue->mask); 149 if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb)) 150 logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb); 151 152 /* 153 * Use WARL base register property to discover an optional fixed IO 154 * address for queue ring buffer location. Otherwise allocate contiguous 155 * system memory. 156 */ 157 if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { 158 const size_t queue_size = entry_size << (logsz + 1); 159 160 queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); 161 queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); 162 } else { 163 do { 164 const size_t queue_size = entry_size << (logsz + 1); 165 166 queue->base = riscv_iommu_get_pages( 167 iommu, max(queue_size, SZ_4K)); 168 queue->phys = __pa(queue->base); 169 } while (!queue->base && logsz-- > 0); 170 } 171 172 if (!queue->base) 173 return -ENOMEM; 174 175 qb = phys_to_ppn(queue->phys) | 176 FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); 177 178 /* Update base register and read back to verify hw accepted our write */ 179 riscv_iommu_writeq(iommu, queue->qbr, qb); 180 rb = riscv_iommu_readq(iommu, queue->qbr); 181 if (rb != qb) { 182 dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); 183 return -ENODEV; 184 } 185 186 /* Update actual queue mask */ 187 queue->mask = (2U << logsz) - 1; 188 189 dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", 190 queue->qid, logsz + 1); 191 192 return 0; 193 } 194 195 /* Check interrupt queue status, IPSR */ 196 static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) 197 { 198 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; 199 200 if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) 201 return IRQ_WAKE_THREAD; 202 203 return IRQ_NONE; 204 } 205 206 static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) 207 { 208 /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */ 209 return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; 210 } 211 212 /* 213 * Enable queue processing in the hardware, register interrupt handler. 214 * 215 * @queue - data structure, already allocated with riscv_iommu_queue_alloc() 216 * @irq_handler - threaded interrupt handler. 217 */ 218 static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, 219 struct riscv_iommu_queue *queue, 220 irq_handler_t irq_handler) 221 { 222 const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; 223 u32 csr; 224 int rc; 225 226 if (queue->iommu) 227 return -EBUSY; 228 229 /* Polling not implemented */ 230 if (!irq) 231 return -ENODEV; 232 233 queue->iommu = iommu; 234 rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, 235 IRQF_ONESHOT | IRQF_SHARED, 236 dev_name(iommu->dev), queue); 237 if (rc) { 238 queue->iommu = NULL; 239 return rc; 240 } 241 242 /* Empty queue before enabling it */ 243 if (queue->qid == RISCV_IOMMU_INTR_CQ) 244 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0); 245 else 246 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0); 247 248 /* 249 * Enable queue with interrupts, clear any memory fault if any. 250 * Wait for the hardware to acknowledge request and activate queue 251 * processing. 252 * Note: All CSR bitfields are in the same offsets for all queues. 253 */ 254 riscv_iommu_writel(iommu, queue->qcr, 255 RISCV_IOMMU_QUEUE_ENABLE | 256 RISCV_IOMMU_QUEUE_INTR_ENABLE | 257 RISCV_IOMMU_QUEUE_MEM_FAULT); 258 259 riscv_iommu_readl_timeout(iommu, queue->qcr, 260 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), 261 10, RISCV_IOMMU_QCSR_TIMEOUT); 262 263 if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | 264 RISCV_IOMMU_QUEUE_BUSY | 265 RISCV_IOMMU_QUEUE_MEM_FAULT))) { 266 /* Best effort to stop and disable failing hardware queue. */ 267 riscv_iommu_writel(iommu, queue->qcr, 0); 268 free_irq(irq, queue); 269 queue->iommu = NULL; 270 dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid); 271 return -EBUSY; 272 } 273 274 /* Clear any pending interrupt flag. */ 275 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); 276 277 return 0; 278 } 279 280 /* 281 * Disable queue. Wait for the hardware to acknowledge request and 282 * stop processing enqueued requests. Report errors but continue. 283 */ 284 static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue) 285 { 286 struct riscv_iommu_device *iommu = queue->iommu; 287 u32 csr; 288 289 if (!iommu) 290 return; 291 292 free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue); 293 riscv_iommu_writel(iommu, queue->qcr, 0); 294 riscv_iommu_readl_timeout(iommu, queue->qcr, 295 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), 296 10, RISCV_IOMMU_QCSR_TIMEOUT); 297 298 if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY)) 299 dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n", 300 queue->qid, csr); 301 302 queue->iommu = NULL; 303 } 304 305 /* 306 * Returns number of available valid queue entries and the first item index. 307 * Update shadow producer index if necessary. 308 */ 309 static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue, 310 unsigned int *index) 311 { 312 unsigned int head = atomic_read(&queue->head); 313 unsigned int tail = atomic_read(&queue->tail); 314 unsigned int last = Q_ITEM(queue, tail); 315 int available = (int)(tail - head); 316 317 *index = head; 318 319 if (available > 0) 320 return available; 321 322 /* read hardware producer index, check reserved register bits are not set. */ 323 if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue), 324 tail, (tail & ~queue->mask) == 0, 325 0, RISCV_IOMMU_QUEUE_TIMEOUT)) { 326 dev_err_once(queue->iommu->dev, 327 "Hardware error: queue access timeout\n"); 328 return 0; 329 } 330 331 if (tail == last) 332 return 0; 333 334 /* update shadow producer index */ 335 return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head); 336 } 337 338 /* 339 * Release processed queue entries, should match riscv_iommu_queue_consume() calls. 340 */ 341 static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count) 342 { 343 const unsigned int head = atomic_add_return(count, &queue->head); 344 345 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head)); 346 } 347 348 /* Return actual consumer index based on hardware reported queue head index. */ 349 static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue) 350 { 351 const unsigned int cons = atomic_read(&queue->head); 352 const unsigned int last = Q_ITEM(queue, cons); 353 unsigned int head; 354 355 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, 356 !(head & ~queue->mask), 357 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 358 return cons; 359 360 return cons + ((head - last) & queue->mask); 361 } 362 363 /* Wait for submitted item to be processed. */ 364 static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue, 365 unsigned int index, 366 unsigned int timeout_us) 367 { 368 unsigned int cons = atomic_read(&queue->head); 369 370 /* Already processed by the consumer */ 371 if ((int)(cons - index) > 0) 372 return 0; 373 374 /* Monitor consumer index */ 375 return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, 376 (int)(cons - index) > 0, 0, timeout_us); 377 } 378 379 /* Enqueue an entry and wait to be processed if timeout_us > 0 380 * 381 * Error handling for IOMMU hardware not responding in reasonable time 382 * will be added as separate patch series along with other RAS features. 383 * For now, only report hardware failure and continue. 384 */ 385 static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue, 386 void *entry, size_t entry_size) 387 { 388 unsigned int prod; 389 unsigned int head; 390 unsigned int tail; 391 unsigned long flags; 392 393 /* Do not preempt submission flow. */ 394 local_irq_save(flags); 395 396 /* 1. Allocate some space in the queue */ 397 prod = atomic_inc_return(&queue->prod) - 1; 398 head = atomic_read(&queue->head); 399 400 /* 2. Wait for space availability. */ 401 if ((prod - head) > queue->mask) { 402 if (readx_poll_timeout(atomic_read, &queue->head, 403 head, (prod - head) < queue->mask, 404 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 405 goto err_busy; 406 } else if ((prod - head) == queue->mask) { 407 const unsigned int last = Q_ITEM(queue, head); 408 409 if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, 410 !(head & ~queue->mask) && head != last, 411 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 412 goto err_busy; 413 atomic_add((head - last) & queue->mask, &queue->head); 414 } 415 416 /* 3. Store entry in the ring buffer */ 417 memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size); 418 419 /* 4. Wait for all previous entries to be ready */ 420 if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail, 421 0, RISCV_IOMMU_QUEUE_TIMEOUT)) 422 goto err_busy; 423 424 /* 425 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is 426 * completed and visible before signaling the tail doorbell to fetch 427 * the next command. 'fence ow, ow' 428 */ 429 dma_wmb(); 430 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1)); 431 432 /* 433 * 6. Make sure the doorbell write to the device has finished before updating 434 * the shadow tail index in normal memory. 'fence o, w' 435 */ 436 mmiowb(); 437 atomic_inc(&queue->tail); 438 439 /* 7. Complete submission and restore local interrupts */ 440 local_irq_restore(flags); 441 442 return prod; 443 444 err_busy: 445 local_irq_restore(flags); 446 dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n"); 447 448 return prod; 449 } 450 451 /* 452 * IOMMU Command queue chapter 3.1 453 */ 454 455 /* Command queue interrupt handler thread function */ 456 static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) 457 { 458 const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; 459 unsigned int ctrl; 460 461 /* Clear MF/CQ errors, complete error recovery to be implemented. */ 462 ctrl = riscv_iommu_readl(queue->iommu, queue->qcr); 463 if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | 464 RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) { 465 riscv_iommu_writel(queue->iommu, queue->qcr, ctrl); 466 dev_warn(queue->iommu->dev, 467 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n", 468 queue->qid, 469 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF), 470 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO), 471 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL), 472 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP)); 473 } 474 475 /* Placeholder for command queue interrupt notifiers */ 476 477 /* Clear command interrupt pending. */ 478 riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); 479 480 return IRQ_HANDLED; 481 } 482 483 /* Send command to the IOMMU command queue */ 484 static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, 485 struct riscv_iommu_command *cmd) 486 { 487 riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); 488 } 489 490 /* Send IOFENCE.C command and wait for all scheduled commands to complete. */ 491 static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, 492 unsigned int timeout_us) 493 { 494 struct riscv_iommu_command cmd; 495 unsigned int prod; 496 497 riscv_iommu_cmd_iofence(&cmd); 498 prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd)); 499 500 if (!timeout_us) 501 return; 502 503 if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us)) 504 dev_err_once(iommu->dev, 505 "Hardware error: command execution timeout\n"); 506 } 507 508 /* 509 * IOMMU Fault/Event queue chapter 3.2 510 */ 511 512 static void riscv_iommu_fault(struct riscv_iommu_device *iommu, 513 struct riscv_iommu_fq_record *event) 514 { 515 unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr); 516 unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr); 517 518 /* Placeholder for future fault handling implementation, report only. */ 519 if (err) 520 dev_warn_ratelimited(iommu->dev, 521 "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n", 522 err, devid, event->iotval, event->iotval2); 523 } 524 525 /* Fault queue interrupt handler thread function */ 526 static irqreturn_t riscv_iommu_fltq_process(int irq, void *data) 527 { 528 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; 529 struct riscv_iommu_device *iommu = queue->iommu; 530 struct riscv_iommu_fq_record *events; 531 unsigned int ctrl, idx; 532 int cnt, len; 533 534 events = (struct riscv_iommu_fq_record *)queue->base; 535 536 /* Clear fault interrupt pending and process all received fault events. */ 537 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); 538 539 do { 540 cnt = riscv_iommu_queue_consume(queue, &idx); 541 for (len = 0; len < cnt; idx++, len++) 542 riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]); 543 riscv_iommu_queue_release(queue, cnt); 544 } while (cnt > 0); 545 546 /* Clear MF/OF errors, complete error recovery to be implemented. */ 547 ctrl = riscv_iommu_readl(iommu, queue->qcr); 548 if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) { 549 riscv_iommu_writel(iommu, queue->qcr, ctrl); 550 dev_warn(iommu->dev, 551 "Queue #%u error; memory fault:%d overflow:%d\n", 552 queue->qid, 553 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF), 554 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF)); 555 } 556 557 return IRQ_HANDLED; 558 } 559 560 /* Lookup and initialize device context info structure. */ 561 static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, 562 unsigned int devid) 563 { 564 const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT); 565 unsigned int depth; 566 unsigned long ddt, old, new; 567 void *ptr; 568 u8 ddi_bits[3] = { 0 }; 569 u64 *ddtp = NULL; 570 571 /* Make sure the mode is valid */ 572 if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL || 573 iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL) 574 return NULL; 575 576 /* 577 * Device id partitioning for base format: 578 * DDI[0]: bits 0 - 6 (1st level) (7 bits) 579 * DDI[1]: bits 7 - 15 (2nd level) (9 bits) 580 * DDI[2]: bits 16 - 23 (3rd level) (8 bits) 581 * 582 * For extended format: 583 * DDI[0]: bits 0 - 5 (1st level) (6 bits) 584 * DDI[1]: bits 6 - 14 (2nd level) (9 bits) 585 * DDI[2]: bits 15 - 23 (3rd level) (9 bits) 586 */ 587 if (base_format) { 588 ddi_bits[0] = 7; 589 ddi_bits[1] = 7 + 9; 590 ddi_bits[2] = 7 + 9 + 8; 591 } else { 592 ddi_bits[0] = 6; 593 ddi_bits[1] = 6 + 9; 594 ddi_bits[2] = 6 + 9 + 9; 595 } 596 597 /* Make sure device id is within range */ 598 depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL; 599 if (devid >= (1 << ddi_bits[depth])) 600 return NULL; 601 602 /* Get to the level of the non-leaf node that holds the device context */ 603 for (ddtp = iommu->ddt_root; depth-- > 0;) { 604 const int split = ddi_bits[depth]; 605 /* 606 * Each non-leaf node is 64bits wide and on each level 607 * nodes are indexed by DDI[depth]. 608 */ 609 ddtp += (devid >> split) & 0x1FF; 610 611 /* 612 * Check if this node has been populated and if not 613 * allocate a new level and populate it. 614 */ 615 do { 616 ddt = READ_ONCE(*(unsigned long *)ddtp); 617 if (ddt & RISCV_IOMMU_DDTE_V) { 618 ddtp = __va(ppn_to_phys(ddt)); 619 break; 620 } 621 622 ptr = riscv_iommu_get_pages(iommu, SZ_4K); 623 if (!ptr) 624 return NULL; 625 626 new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; 627 old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); 628 629 if (old == ddt) { 630 ddtp = (u64 *)ptr; 631 break; 632 } 633 634 /* Race setting DDT detected, re-read and retry. */ 635 riscv_iommu_free_pages(iommu, ptr); 636 } while (1); 637 } 638 639 /* 640 * Grab the node that matches DDI[depth], note that when using base 641 * format the device context is 4 * 64bits, and the extended format 642 * is 8 * 64bits, hence the (3 - base_format) below. 643 */ 644 ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format); 645 646 return (struct riscv_iommu_dc *)ddtp; 647 } 648 649 /* 650 * This is best effort IOMMU translation shutdown flow. 651 * Disable IOMMU without waiting for hardware response. 652 */ 653 void riscv_iommu_disable(struct riscv_iommu_device *iommu) 654 { 655 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 656 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, 657 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)); 658 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0); 659 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0); 660 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0); 661 } 662 663 #define riscv_iommu_read_ddtp(iommu) ({ \ 664 u64 ddtp; \ 665 riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \ 666 !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \ 667 RISCV_IOMMU_DDTP_TIMEOUT); \ 668 ddtp; }) 669 670 static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) 671 { 672 u64 ddtp; 673 unsigned int mode; 674 675 ddtp = riscv_iommu_read_ddtp(iommu); 676 if (ddtp & RISCV_IOMMU_DDTP_BUSY) 677 return -EBUSY; 678 679 /* 680 * It is optional for the hardware to report a fixed address for device 681 * directory root page when DDT.MODE is OFF or BARE. 682 */ 683 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); 684 if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE || 685 mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) { 686 /* Use WARL to discover hardware fixed DDT PPN */ 687 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 688 FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode)); 689 ddtp = riscv_iommu_read_ddtp(iommu); 690 if (ddtp & RISCV_IOMMU_DDTP_BUSY) 691 return -EBUSY; 692 693 iommu->ddt_phys = ppn_to_phys(ddtp); 694 if (iommu->ddt_phys) 695 iommu->ddt_root = devm_ioremap(iommu->dev, 696 iommu->ddt_phys, PAGE_SIZE); 697 if (iommu->ddt_root) 698 memset(iommu->ddt_root, 0, PAGE_SIZE); 699 } 700 701 if (!iommu->ddt_root) { 702 iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K); 703 iommu->ddt_phys = __pa(iommu->ddt_root); 704 } 705 706 if (!iommu->ddt_root) 707 return -ENOMEM; 708 709 return 0; 710 } 711 712 /* 713 * Discover supported DDT modes starting from requested value, 714 * configure DDTP register with accepted mode and root DDT address. 715 * Accepted iommu->ddt_mode is updated on success. 716 */ 717 static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, 718 unsigned int ddtp_mode) 719 { 720 struct device *dev = iommu->dev; 721 u64 ddtp, rq_ddtp; 722 unsigned int mode, rq_mode = ddtp_mode; 723 struct riscv_iommu_command cmd; 724 725 ddtp = riscv_iommu_read_ddtp(iommu); 726 if (ddtp & RISCV_IOMMU_DDTP_BUSY) 727 return -EBUSY; 728 729 /* Disallow state transition from xLVL to xLVL. */ 730 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); 731 if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && 732 mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF && 733 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && 734 rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) 735 return -EINVAL; 736 737 do { 738 rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); 739 if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) 740 rq_ddtp |= phys_to_ppn(iommu->ddt_phys); 741 742 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); 743 ddtp = riscv_iommu_read_ddtp(iommu); 744 if (ddtp & RISCV_IOMMU_DDTP_BUSY) { 745 dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n", 746 rq_mode, ddtp); 747 return -EBUSY; 748 } 749 750 /* Verify IOMMU hardware accepts new DDTP config. */ 751 mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); 752 753 if (rq_mode == mode) 754 break; 755 756 /* Hardware mandatory DDTP mode has not been accepted. */ 757 if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) { 758 dev_err(dev, "DDTP update failed hw: %llx vs %llx\n", 759 ddtp, rq_ddtp); 760 return -EINVAL; 761 } 762 763 /* 764 * Mode field is WARL, an IOMMU may support a subset of 765 * directory table levels in which case if we tried to set 766 * an unsupported number of levels we'll readback either 767 * a valid xLVL or off/bare. If we got off/bare, try again 768 * with a smaller xLVL. 769 */ 770 if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && 771 rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) { 772 dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode); 773 rq_mode--; 774 continue; 775 } 776 777 /* 778 * We tried all supported modes and IOMMU hardware failed to 779 * accept new settings, something went very wrong since off/bare 780 * and at least one xLVL must be supported. 781 */ 782 dev_err(dev, "DDTP hw mode %u, failed to set %u\n", 783 mode, ddtp_mode); 784 return -EINVAL; 785 } while (1); 786 787 iommu->ddt_mode = mode; 788 if (mode != ddtp_mode) 789 dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); 790 791 /* Invalidate device context cache */ 792 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 793 riscv_iommu_cmd_send(iommu, &cmd); 794 795 /* Invalidate address translation cache */ 796 riscv_iommu_cmd_inval_vma(&cmd); 797 riscv_iommu_cmd_send(iommu, &cmd); 798 799 /* IOFENCE.C */ 800 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 801 802 return 0; 803 } 804 805 /* This struct contains protection domain specific IOMMU driver data. */ 806 struct riscv_iommu_domain { 807 struct iommu_domain domain; 808 struct list_head bonds; 809 spinlock_t lock; /* protect bonds list updates. */ 810 int pscid; 811 bool amo_enabled; 812 int numa_node; 813 unsigned int pgd_mode; 814 unsigned long *pgd_root; 815 }; 816 817 #define iommu_domain_to_riscv(iommu_domain) \ 818 container_of(iommu_domain, struct riscv_iommu_domain, domain) 819 820 /* Private IOMMU data for managed devices, dev_iommu_priv_* */ 821 struct riscv_iommu_info { 822 struct riscv_iommu_domain *domain; 823 }; 824 825 /* 826 * Linkage between an iommu_domain and attached devices. 827 * 828 * Protection domain requiring IOATC and DevATC translation cache invalidations, 829 * should be linked to attached devices using a riscv_iommu_bond structure. 830 * Devices should be linked to the domain before first use and unlinked after 831 * the translations from the referenced protection domain can no longer be used. 832 * Blocking and identity domains are not tracked here, as the IOMMU hardware 833 * does not cache negative and/or identity (BARE mode) translations, and DevATC 834 * is disabled for those protection domains. 835 * 836 * The device pointer and IOMMU data remain stable in the bond struct after 837 * _probe_device() where it's attached to the managed IOMMU, up to the 838 * completion of the _release_device() call. The release of the bond structure 839 * is synchronized with the device release. 840 */ 841 struct riscv_iommu_bond { 842 struct list_head list; 843 struct rcu_head rcu; 844 struct device *dev; 845 }; 846 847 static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, 848 struct device *dev) 849 { 850 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 851 struct riscv_iommu_bond *bond; 852 struct list_head *bonds; 853 854 bond = kzalloc(sizeof(*bond), GFP_KERNEL); 855 if (!bond) 856 return -ENOMEM; 857 bond->dev = dev; 858 859 /* 860 * List of devices attached to the domain is arranged based on 861 * managed IOMMU device. 862 */ 863 864 spin_lock(&domain->lock); 865 list_for_each(bonds, &domain->bonds) 866 if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu) 867 break; 868 list_add_rcu(&bond->list, bonds); 869 spin_unlock(&domain->lock); 870 871 /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */ 872 smp_mb(); 873 874 return 0; 875 } 876 877 static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, 878 struct device *dev) 879 { 880 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 881 struct riscv_iommu_bond *bond, *found = NULL; 882 struct riscv_iommu_command cmd; 883 int count = 0; 884 885 if (!domain) 886 return; 887 888 spin_lock(&domain->lock); 889 list_for_each_entry(bond, &domain->bonds, list) { 890 if (found && count) 891 break; 892 else if (bond->dev == dev) 893 found = bond; 894 else if (dev_to_iommu(bond->dev) == iommu) 895 count++; 896 } 897 if (found) 898 list_del_rcu(&found->list); 899 spin_unlock(&domain->lock); 900 kfree_rcu(found, rcu); 901 902 /* 903 * If this was the last bond between this domain and the IOMMU 904 * invalidate all cached entries for domain's PSCID. 905 */ 906 if (!count) { 907 riscv_iommu_cmd_inval_vma(&cmd); 908 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); 909 riscv_iommu_cmd_send(iommu, &cmd); 910 911 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 912 } 913 } 914 915 /* 916 * Send IOTLB.INVAL for whole address space for ranges larger than 2MB. 917 * This limit will be replaced with range invalidations, if supported by 918 * the hardware, when RISC-V IOMMU architecture specification update for 919 * range invalidations update will be available. 920 */ 921 #define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20) 922 923 static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain, 924 unsigned long start, unsigned long end) 925 { 926 struct riscv_iommu_bond *bond; 927 struct riscv_iommu_device *iommu, *prev; 928 struct riscv_iommu_command cmd; 929 unsigned long len = end - start + 1; 930 unsigned long iova; 931 932 /* 933 * For each IOMMU linked with this protection domain (via bonds->dev), 934 * an IOTLB invaliation command will be submitted and executed. 935 * 936 * Possbile race with domain attach flow is handled by sequencing 937 * bond creation - riscv_iommu_bond_link(), and device directory 938 * update - riscv_iommu_iodir_update(). 939 * 940 * PTE Update / IOTLB Inval Device attach & directory update 941 * -------------------------- -------------------------- 942 * update page table entries add dev to the bond list 943 * FENCE RW,RW FENCE RW,RW 944 * For all IOMMUs: (can be empty) Update FSC/PSCID 945 * FENCE IOW,IOW FENCE IOW,IOW 946 * IOTLB.INVAL IODIR.INVAL 947 * IOFENCE.C 948 * 949 * If bond list is not updated with new device, directory context will 950 * be configured with already valid page table content. If an IOMMU is 951 * linked to the protection domain it will receive invalidation 952 * requests for updated page table entries. 953 */ 954 smp_mb(); 955 956 rcu_read_lock(); 957 958 prev = NULL; 959 list_for_each_entry_rcu(bond, &domain->bonds, list) { 960 iommu = dev_to_iommu(bond->dev); 961 962 /* 963 * IOTLB invalidation request can be safely omitted if already sent 964 * to the IOMMU for the same PSCID, and with domain->bonds list 965 * arranged based on the device's IOMMU, it's sufficient to check 966 * last device the invalidation was sent to. 967 */ 968 if (iommu == prev) 969 continue; 970 971 riscv_iommu_cmd_inval_vma(&cmd); 972 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); 973 if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { 974 for (iova = start; iova < end; iova += PAGE_SIZE) { 975 riscv_iommu_cmd_inval_set_addr(&cmd, iova); 976 riscv_iommu_cmd_send(iommu, &cmd); 977 } 978 } else { 979 riscv_iommu_cmd_send(iommu, &cmd); 980 } 981 prev = iommu; 982 } 983 984 prev = NULL; 985 list_for_each_entry_rcu(bond, &domain->bonds, list) { 986 iommu = dev_to_iommu(bond->dev); 987 if (iommu == prev) 988 continue; 989 990 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 991 prev = iommu; 992 } 993 rcu_read_unlock(); 994 } 995 996 #define RISCV_IOMMU_FSC_BARE 0 997 998 /* 999 * Update IODIR for the device. 1000 * 1001 * During the execution of riscv_iommu_probe_device(), IODIR entries are 1002 * allocated for the device's identifiers. Device context invalidation 1003 * becomes necessary only if one of the updated entries was previously 1004 * marked as valid, given that invalid device context entries are not 1005 * cached by the IOMMU hardware. 1006 * In this implementation, updating a valid device context while the 1007 * device is not quiesced might be disruptive, potentially causing 1008 * interim translation faults. 1009 */ 1010 static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, 1011 struct device *dev, u64 fsc, u64 ta) 1012 { 1013 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 1014 struct riscv_iommu_dc *dc; 1015 struct riscv_iommu_command cmd; 1016 bool sync_required = false; 1017 u64 tc; 1018 int i; 1019 1020 for (i = 0; i < fwspec->num_ids; i++) { 1021 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); 1022 tc = READ_ONCE(dc->tc); 1023 if (!(tc & RISCV_IOMMU_DC_TC_V)) 1024 continue; 1025 1026 WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V); 1027 1028 /* Invalidate device context cached values */ 1029 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 1030 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); 1031 riscv_iommu_cmd_send(iommu, &cmd); 1032 sync_required = true; 1033 } 1034 1035 if (sync_required) 1036 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 1037 1038 /* 1039 * For device context with DC_TC_PDTV = 0, translation attributes valid bit 1040 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)). 1041 */ 1042 for (i = 0; i < fwspec->num_ids; i++) { 1043 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); 1044 tc = READ_ONCE(dc->tc); 1045 tc |= ta & RISCV_IOMMU_DC_TC_V; 1046 1047 WRITE_ONCE(dc->fsc, fsc); 1048 WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID); 1049 /* Update device context, write TC.V as the last step. */ 1050 dma_wmb(); 1051 WRITE_ONCE(dc->tc, tc); 1052 1053 /* Invalidate device context after update */ 1054 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 1055 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); 1056 riscv_iommu_cmd_send(iommu, &cmd); 1057 } 1058 1059 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 1060 } 1061 1062 /* 1063 * IOVA page translation tree management. 1064 */ 1065 1066 static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain) 1067 { 1068 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1069 1070 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); 1071 } 1072 1073 static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain, 1074 struct iommu_iotlb_gather *gather) 1075 { 1076 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1077 1078 riscv_iommu_iotlb_inval(domain, gather->start, gather->end); 1079 } 1080 1081 #define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t))) 1082 1083 #define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) 1084 #define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF) 1085 #define _io_pte_none(pte) ((pte) == 0) 1086 #define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)) 1087 1088 static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain, 1089 unsigned long pte, 1090 struct iommu_pages_list *freelist) 1091 { 1092 unsigned long *ptr; 1093 int i; 1094 1095 if (!_io_pte_present(pte) || _io_pte_leaf(pte)) 1096 return; 1097 1098 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1099 1100 /* Recursively free all sub page table pages */ 1101 for (i = 0; i < PTRS_PER_PTE; i++) { 1102 pte = READ_ONCE(ptr[i]); 1103 if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte) 1104 riscv_iommu_pte_free(domain, pte, freelist); 1105 } 1106 1107 if (freelist) 1108 iommu_pages_list_add(freelist, ptr); 1109 else 1110 iommu_free_pages(ptr); 1111 } 1112 1113 static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain, 1114 unsigned long iova, size_t pgsize, 1115 gfp_t gfp) 1116 { 1117 unsigned long *ptr = domain->pgd_root; 1118 unsigned long pte, old; 1119 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; 1120 void *addr; 1121 1122 do { 1123 const int shift = PAGE_SHIFT + PT_SHIFT * level; 1124 1125 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); 1126 /* 1127 * Note: returned entry might be a non-leaf if there was 1128 * existing mapping with smaller granularity. Up to the caller 1129 * to replace and invalidate. 1130 */ 1131 if (((size_t)1 << shift) == pgsize) 1132 return ptr; 1133 pte_retry: 1134 pte = READ_ONCE(*ptr); 1135 /* 1136 * This is very likely incorrect as we should not be adding 1137 * new mapping with smaller granularity on top 1138 * of existing 2M/1G mapping. Fail. 1139 */ 1140 if (_io_pte_present(pte) && _io_pte_leaf(pte)) 1141 return NULL; 1142 /* 1143 * Non-leaf entry is missing, allocate and try to add to the 1144 * page table. This might race with other mappings, retry. 1145 */ 1146 if (_io_pte_none(pte)) { 1147 addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp, 1148 SZ_4K); 1149 if (!addr) 1150 return NULL; 1151 old = pte; 1152 pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); 1153 if (cmpxchg_relaxed(ptr, old, pte) != old) { 1154 iommu_free_pages(addr); 1155 goto pte_retry; 1156 } 1157 } 1158 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1159 } while (level-- > 0); 1160 1161 return NULL; 1162 } 1163 1164 static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, 1165 unsigned long iova, size_t *pte_pgsize) 1166 { 1167 unsigned long *ptr = domain->pgd_root; 1168 unsigned long pte; 1169 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; 1170 1171 do { 1172 const int shift = PAGE_SHIFT + PT_SHIFT * level; 1173 1174 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); 1175 pte = READ_ONCE(*ptr); 1176 if (_io_pte_present(pte) && _io_pte_leaf(pte)) { 1177 *pte_pgsize = (size_t)1 << shift; 1178 return ptr; 1179 } 1180 if (_io_pte_none(pte)) 1181 return NULL; 1182 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1183 } while (level-- > 0); 1184 1185 return NULL; 1186 } 1187 1188 static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, 1189 unsigned long iova, phys_addr_t phys, 1190 size_t pgsize, size_t pgcount, int prot, 1191 gfp_t gfp, size_t *mapped) 1192 { 1193 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1194 size_t size = 0; 1195 unsigned long *ptr; 1196 unsigned long pte, old, pte_prot; 1197 int rc = 0; 1198 struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); 1199 1200 if (!(prot & IOMMU_WRITE)) 1201 pte_prot = _PAGE_BASE | _PAGE_READ; 1202 else if (domain->amo_enabled) 1203 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; 1204 else 1205 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; 1206 1207 while (pgcount) { 1208 ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); 1209 if (!ptr) { 1210 rc = -ENOMEM; 1211 break; 1212 } 1213 1214 old = READ_ONCE(*ptr); 1215 pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); 1216 if (cmpxchg_relaxed(ptr, old, pte) != old) 1217 continue; 1218 1219 riscv_iommu_pte_free(domain, old, &freelist); 1220 1221 size += pgsize; 1222 iova += pgsize; 1223 phys += pgsize; 1224 --pgcount; 1225 } 1226 1227 *mapped = size; 1228 1229 if (!iommu_pages_list_empty(&freelist)) { 1230 /* 1231 * In 1.0 spec version, the smallest scope we can use to 1232 * invalidate all levels of page table (i.e. leaf and non-leaf) 1233 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0. 1234 * This will be updated with hardware support for 1235 * capability.NL (non-leaf) IOTINVAL command. 1236 */ 1237 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); 1238 iommu_put_pages_list(&freelist); 1239 } 1240 1241 return rc; 1242 } 1243 1244 static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, 1245 unsigned long iova, size_t pgsize, 1246 size_t pgcount, 1247 struct iommu_iotlb_gather *gather) 1248 { 1249 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1250 size_t size = pgcount << __ffs(pgsize); 1251 unsigned long *ptr, old; 1252 size_t unmapped = 0; 1253 size_t pte_size; 1254 1255 while (unmapped < size) { 1256 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); 1257 if (!ptr) 1258 return unmapped; 1259 1260 /* partial unmap is not allowed, fail. */ 1261 if (iova & (pte_size - 1)) 1262 return unmapped; 1263 1264 old = READ_ONCE(*ptr); 1265 if (cmpxchg_relaxed(ptr, old, 0) != old) 1266 continue; 1267 1268 iommu_iotlb_gather_add_page(&domain->domain, gather, iova, 1269 pte_size); 1270 1271 iova += pte_size; 1272 unmapped += pte_size; 1273 } 1274 1275 return unmapped; 1276 } 1277 1278 static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, 1279 dma_addr_t iova) 1280 { 1281 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1282 size_t pte_size; 1283 unsigned long *ptr; 1284 1285 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); 1286 if (_io_pte_none(*ptr) || !_io_pte_present(*ptr)) 1287 return 0; 1288 1289 return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1)); 1290 } 1291 1292 static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) 1293 { 1294 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1295 const unsigned long pfn = virt_to_pfn(domain->pgd_root); 1296 1297 WARN_ON(!list_empty(&domain->bonds)); 1298 1299 if ((int)domain->pscid > 0) 1300 ida_free(&riscv_iommu_pscids, domain->pscid); 1301 1302 riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL); 1303 kfree(domain); 1304 } 1305 1306 static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode) 1307 { 1308 switch (pgd_mode) { 1309 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: 1310 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39; 1311 1312 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: 1313 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48; 1314 1315 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: 1316 return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57; 1317 } 1318 return false; 1319 } 1320 1321 static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, 1322 struct device *dev) 1323 { 1324 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1325 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 1326 struct riscv_iommu_info *info = dev_iommu_priv_get(dev); 1327 u64 fsc, ta; 1328 1329 if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) 1330 return -ENODEV; 1331 1332 fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | 1333 FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); 1334 ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | 1335 RISCV_IOMMU_PC_TA_V; 1336 1337 if (riscv_iommu_bond_link(domain, dev)) 1338 return -ENOMEM; 1339 1340 riscv_iommu_iodir_update(iommu, dev, fsc, ta); 1341 riscv_iommu_bond_unlink(info->domain, dev); 1342 info->domain = domain; 1343 1344 return 0; 1345 } 1346 1347 static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = { 1348 .attach_dev = riscv_iommu_attach_paging_domain, 1349 .free = riscv_iommu_free_paging_domain, 1350 .map_pages = riscv_iommu_map_pages, 1351 .unmap_pages = riscv_iommu_unmap_pages, 1352 .iova_to_phys = riscv_iommu_iova_to_phys, 1353 .iotlb_sync = riscv_iommu_iotlb_sync, 1354 .flush_iotlb_all = riscv_iommu_iotlb_flush_all, 1355 }; 1356 1357 static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) 1358 { 1359 struct riscv_iommu_domain *domain; 1360 struct riscv_iommu_device *iommu; 1361 unsigned int pgd_mode; 1362 dma_addr_t va_mask; 1363 int va_bits; 1364 1365 iommu = dev_to_iommu(dev); 1366 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) { 1367 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57; 1368 va_bits = 57; 1369 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) { 1370 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48; 1371 va_bits = 48; 1372 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) { 1373 pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39; 1374 va_bits = 39; 1375 } else { 1376 dev_err(dev, "cannot find supported page table mode\n"); 1377 return ERR_PTR(-ENODEV); 1378 } 1379 1380 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 1381 if (!domain) 1382 return ERR_PTR(-ENOMEM); 1383 1384 INIT_LIST_HEAD_RCU(&domain->bonds); 1385 spin_lock_init(&domain->lock); 1386 domain->numa_node = dev_to_node(iommu->dev); 1387 domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD); 1388 domain->pgd_mode = pgd_mode; 1389 domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node, 1390 GFP_KERNEL_ACCOUNT, SZ_4K); 1391 if (!domain->pgd_root) { 1392 kfree(domain); 1393 return ERR_PTR(-ENOMEM); 1394 } 1395 1396 domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, 1397 RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); 1398 if (domain->pscid < 0) { 1399 iommu_free_pages(domain->pgd_root); 1400 kfree(domain); 1401 return ERR_PTR(-ENOMEM); 1402 } 1403 1404 /* 1405 * Note: RISC-V Privilege spec mandates that virtual addresses 1406 * need to be sign-extended, so if (VA_BITS - 1) is set, all 1407 * bits >= VA_BITS need to also be set or else we'll get a 1408 * page fault. However the code that creates the mappings 1409 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us 1410 * for now, so we'll end up with invalid virtual addresses 1411 * to map. As a workaround until we get this sorted out 1412 * limit the available virtual addresses to VA_BITS - 1. 1413 */ 1414 va_mask = DMA_BIT_MASK(va_bits - 1); 1415 1416 domain->domain.geometry.aperture_start = 0; 1417 domain->domain.geometry.aperture_end = va_mask; 1418 domain->domain.geometry.force_aperture = true; 1419 domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G); 1420 1421 domain->domain.ops = &riscv_iommu_paging_domain_ops; 1422 1423 return &domain->domain; 1424 } 1425 1426 static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, 1427 struct device *dev) 1428 { 1429 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 1430 struct riscv_iommu_info *info = dev_iommu_priv_get(dev); 1431 1432 /* Make device context invalid, translation requests will fault w/ #258 */ 1433 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); 1434 riscv_iommu_bond_unlink(info->domain, dev); 1435 info->domain = NULL; 1436 1437 return 0; 1438 } 1439 1440 static struct iommu_domain riscv_iommu_blocking_domain = { 1441 .type = IOMMU_DOMAIN_BLOCKED, 1442 .ops = &(const struct iommu_domain_ops) { 1443 .attach_dev = riscv_iommu_attach_blocking_domain, 1444 } 1445 }; 1446 1447 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, 1448 struct device *dev) 1449 { 1450 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 1451 struct riscv_iommu_info *info = dev_iommu_priv_get(dev); 1452 1453 riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); 1454 riscv_iommu_bond_unlink(info->domain, dev); 1455 info->domain = NULL; 1456 1457 return 0; 1458 } 1459 1460 static struct iommu_domain riscv_iommu_identity_domain = { 1461 .type = IOMMU_DOMAIN_IDENTITY, 1462 .ops = &(const struct iommu_domain_ops) { 1463 .attach_dev = riscv_iommu_attach_identity_domain, 1464 } 1465 }; 1466 1467 static struct iommu_group *riscv_iommu_device_group(struct device *dev) 1468 { 1469 if (dev_is_pci(dev)) 1470 return pci_device_group(dev); 1471 return generic_device_group(dev); 1472 } 1473 1474 static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) 1475 { 1476 return iommu_fwspec_add_ids(dev, args->args, 1); 1477 } 1478 1479 static struct iommu_device *riscv_iommu_probe_device(struct device *dev) 1480 { 1481 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 1482 struct riscv_iommu_device *iommu; 1483 struct riscv_iommu_info *info; 1484 struct riscv_iommu_dc *dc; 1485 u64 tc; 1486 int i; 1487 1488 if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids) 1489 return ERR_PTR(-ENODEV); 1490 1491 iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev); 1492 if (!iommu) 1493 return ERR_PTR(-ENODEV); 1494 1495 /* 1496 * IOMMU hardware operating in fail-over BARE mode will provide 1497 * identity translation for all connected devices anyway... 1498 */ 1499 if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) 1500 return ERR_PTR(-ENODEV); 1501 1502 info = kzalloc(sizeof(*info), GFP_KERNEL); 1503 if (!info) 1504 return ERR_PTR(-ENOMEM); 1505 /* 1506 * Allocate and pre-configure device context entries in 1507 * the device directory. Do not mark the context valid yet. 1508 */ 1509 tc = 0; 1510 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD) 1511 tc |= RISCV_IOMMU_DC_TC_SADE; 1512 for (i = 0; i < fwspec->num_ids; i++) { 1513 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); 1514 if (!dc) { 1515 kfree(info); 1516 return ERR_PTR(-ENODEV); 1517 } 1518 if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V) 1519 dev_warn(dev, "already attached to IOMMU device directory\n"); 1520 WRITE_ONCE(dc->tc, tc); 1521 } 1522 1523 dev_iommu_priv_set(dev, info); 1524 1525 return &iommu->iommu; 1526 } 1527 1528 static void riscv_iommu_release_device(struct device *dev) 1529 { 1530 struct riscv_iommu_info *info = dev_iommu_priv_get(dev); 1531 1532 kfree_rcu_mightsleep(info); 1533 } 1534 1535 static const struct iommu_ops riscv_iommu_ops = { 1536 .pgsize_bitmap = SZ_4K, 1537 .of_xlate = riscv_iommu_of_xlate, 1538 .identity_domain = &riscv_iommu_identity_domain, 1539 .blocked_domain = &riscv_iommu_blocking_domain, 1540 .release_domain = &riscv_iommu_blocking_domain, 1541 .domain_alloc_paging = riscv_iommu_alloc_paging_domain, 1542 .device_group = riscv_iommu_device_group, 1543 .probe_device = riscv_iommu_probe_device, 1544 .release_device = riscv_iommu_release_device, 1545 }; 1546 1547 static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) 1548 { 1549 u64 ddtp; 1550 1551 /* 1552 * Make sure the IOMMU is switched off or in pass-through mode during 1553 * regular boot flow and disable translation when we boot into a kexec 1554 * kernel and the previous kernel left them enabled. 1555 */ 1556 ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP); 1557 if (ddtp & RISCV_IOMMU_DDTP_BUSY) 1558 return -EBUSY; 1559 1560 if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) > 1561 RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) { 1562 if (!is_kdump_kernel()) 1563 return -EBUSY; 1564 riscv_iommu_disable(iommu); 1565 } 1566 1567 /* Configure accesses to in-memory data structures for CPU-native byte order. */ 1568 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != 1569 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) { 1570 if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END)) 1571 return -EINVAL; 1572 riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, 1573 iommu->fctl ^ RISCV_IOMMU_FCTL_BE); 1574 iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); 1575 if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != 1576 !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) 1577 return -EINVAL; 1578 } 1579 1580 /* 1581 * Distribute interrupt vectors, always use first vector for CIV. 1582 * At least one interrupt is required. Read back and verify. 1583 */ 1584 if (!iommu->irqs_count) 1585 return -EINVAL; 1586 1587 iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) | 1588 FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) | 1589 FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count); 1590 riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec); 1591 iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC); 1592 if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec), 1593 FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)), 1594 max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec), 1595 FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count) 1596 return -EINVAL; 1597 1598 return 0; 1599 } 1600 1601 void riscv_iommu_remove(struct riscv_iommu_device *iommu) 1602 { 1603 iommu_device_unregister(&iommu->iommu); 1604 iommu_device_sysfs_remove(&iommu->iommu); 1605 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); 1606 riscv_iommu_queue_disable(&iommu->cmdq); 1607 riscv_iommu_queue_disable(&iommu->fltq); 1608 } 1609 1610 int riscv_iommu_init(struct riscv_iommu_device *iommu) 1611 { 1612 int rc; 1613 1614 RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ); 1615 RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ); 1616 1617 rc = riscv_iommu_init_check(iommu); 1618 if (rc) 1619 return dev_err_probe(iommu->dev, rc, "unexpected device state\n"); 1620 1621 rc = riscv_iommu_iodir_alloc(iommu); 1622 if (rc) 1623 return rc; 1624 1625 rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq, 1626 sizeof(struct riscv_iommu_command)); 1627 if (rc) 1628 return rc; 1629 1630 rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq, 1631 sizeof(struct riscv_iommu_fq_record)); 1632 if (rc) 1633 return rc; 1634 1635 rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process); 1636 if (rc) 1637 return rc; 1638 1639 rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process); 1640 if (rc) 1641 goto err_queue_disable; 1642 1643 rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); 1644 if (rc) 1645 goto err_queue_disable; 1646 1647 rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s", 1648 dev_name(iommu->dev)); 1649 if (rc) { 1650 dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n"); 1651 goto err_iodir_off; 1652 } 1653 1654 rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); 1655 if (rc) { 1656 dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n"); 1657 goto err_remove_sysfs; 1658 } 1659 1660 return 0; 1661 1662 err_remove_sysfs: 1663 iommu_device_sysfs_remove(&iommu->iommu); 1664 err_iodir_off: 1665 riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); 1666 err_queue_disable: 1667 riscv_iommu_queue_disable(&iommu->fltq); 1668 riscv_iommu_queue_disable(&iommu->cmdq); 1669 return rc; 1670 } 1671