// SPDX-License-Identifier: GPL-2.0-only
/*
 * IOMMU API for RISC-V IOMMU implementations.
 *
 * Copyright © 2022-2024 Rivos Inc.
 * Copyright © 2023 FORTH-ICS/CARV
 *
 * Authors
 *	Tomasz Jeznach <tjeznach@rivosinc.com>
 *	Nick Kossifidis <mick@ics.forth.gr>
 */

#define pr_fmt(fmt) "riscv-iommu: " fmt

#include <linux/compiler.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/iopoll.h>
#include <linux/kernel.h>
#include <linux/pci.h>

#include "../iommu-pages.h"
#include "iommu-bits.h"
#include "iommu.h"

/* Timeouts in [us] */
#define RISCV_IOMMU_QCSR_TIMEOUT	150000
#define RISCV_IOMMU_QUEUE_TIMEOUT	150000
#define RISCV_IOMMU_DDTP_TIMEOUT	10000000
#define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000

/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
#define RISCV_IOMMU_DEF_CQ_COUNT	8192
#define RISCV_IOMMU_DEF_FQ_COUNT	4096

/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
#define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
#define ppn_to_phys(pn)	 (((pn) << 2) & (((1ULL << 44) - 1) << 12))

#define dev_to_iommu(dev) \
	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)

/* IOMMU PSCID allocation namespace. */
static DEFINE_IDA(riscv_iommu_pscids);
#define RISCV_IOMMU_MAX_PSCID	(BIT(20) - 1)

/* Device resource-managed allocations */
struct riscv_iommu_devres {
	void *addr;
	int order;
};

static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
{
	struct riscv_iommu_devres *devres = res;

	iommu_free_pages(devres->addr, devres->order);
}

static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
{
	struct riscv_iommu_devres *devres = res;
	struct riscv_iommu_devres *target = p;

	return devres->addr == target->addr;
}

static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
{
	struct riscv_iommu_devres *devres;
	void *addr;

	addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
				      GFP_KERNEL_ACCOUNT, order);
	if (unlikely(!addr))
		return NULL;

	devres = devres_alloc(riscv_iommu_devres_pages_release,
			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);

	if (unlikely(!devres)) {
		iommu_free_pages(addr, order);
		return NULL;
	}

	devres->addr = addr;
	devres->order = order;

	devres_add(iommu->dev, devres);

	return addr;
}

static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
{
	struct riscv_iommu_devres devres = { .addr = addr };

	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
		       riscv_iommu_devres_pages_match, &devres);
}

/*
 * Hardware queue allocation and management.
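 *
 * Implementation note (illustrative, derived from the helpers below): the
 * shadow indexes queue->head and queue->tail are kept as free-running
 * counters and reduced to ring slots with Q_ITEM(q, index), i.e.
 * index & q->mask. For example, with mask = 7, head = 14 and tail = 17,
 * three entries are pending and the first one occupies slot 14 & 7 = 6.
 * Counter distances are compared with wrap-around-safe expressions such
 * as (int)(tail - head).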
 */

/* Setup queue base, control registers and default queue length */
#define RISCV_IOMMU_QUEUE_INIT(q, name) do {				\
	struct riscv_iommu_queue *_q = q;				\
	_q->qid = RISCV_IOMMU_INTR_ ## name;				\
	_q->qbr = RISCV_IOMMU_REG_ ## name ## B;			\
	_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;			\
	_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
} while (0)

/* Note: offsets are the same for all queues */
#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
#define Q_ITEM(q, index) ((q)->mask & (index))
#define Q_IPSR(q) BIT((q)->qid)

/*
 * Discover queue ring buffer hardware configuration, allocate in-memory
 * ring buffer or use fixed I/O memory location, configure queue base register.
 * Must be called before hardware queue is enabled.
 *
 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
 * @entry_size - queue single element size in bytes.
 */
static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
				   struct riscv_iommu_queue *queue,
				   size_t entry_size)
{
	unsigned int logsz;
	u64 qb, rb;

	/*
	 * Use WARL base register property to discover maximum allowed
	 * number of entries and optional fixed IO address for queue location.
	 */
	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
	qb = riscv_iommu_readq(iommu, queue->qbr);

	/*
	 * Calculate and verify hardware supported queue length, as reported
	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
	 * Update queue size based on hardware supported value.
	 */
	logsz = ilog2(queue->mask);
	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);

	/*
	 * Use WARL base register property to discover an optional fixed IO
	 * address for queue ring buffer location. Otherwise allocate contiguous
	 * system memory.
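	 * For illustration (not a hardware requirement): with the default
	 * command queue mask of 8191 the initial logsz is 12, i.e. a request
	 * for 2^13 = 8192 entries; the WARL read-back above may shrink it,
	 * and the allocation loop below retries with a smaller logsz when
	 * contiguous memory is not available.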
157 */ 158 if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { 159 const size_t queue_size = entry_size << (logsz + 1); 160 161 queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); 162 queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); 163 } else { 164 do { 165 const size_t queue_size = entry_size << (logsz + 1); 166 const int order = get_order(queue_size); 167 168 queue->base = riscv_iommu_get_pages(iommu, order); 169 queue->phys = __pa(queue->base); 170 } while (!queue->base && logsz-- > 0); 171 } 172 173 if (!queue->base) 174 return -ENOMEM; 175 176 qb = phys_to_ppn(queue->phys) | 177 FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); 178 179 /* Update base register and read back to verify hw accepted our write */ 180 riscv_iommu_writeq(iommu, queue->qbr, qb); 181 rb = riscv_iommu_readq(iommu, queue->qbr); 182 if (rb != qb) { 183 dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); 184 return -ENODEV; 185 } 186 187 /* Update actual queue mask */ 188 queue->mask = (2U << logsz) - 1; 189 190 dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", 191 queue->qid, logsz + 1); 192 193 return 0; 194 } 195 196 /* Check interrupt queue status, IPSR */ 197 static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) 198 { 199 struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; 200 201 if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) 202 return IRQ_WAKE_THREAD; 203 204 return IRQ_NONE; 205 } 206 207 static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) 208 { 209 /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */ 210 return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; 211 } 212 213 /* 214 * Enable queue processing in the hardware, register interrupt handler. 215 * 216 * @queue - data structure, already allocated with riscv_iommu_queue_alloc() 217 * @irq_handler - threaded interrupt handler. 218 */ 219 static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, 220 struct riscv_iommu_queue *queue, 221 irq_handler_t irq_handler) 222 { 223 const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; 224 u32 csr; 225 int rc; 226 227 if (queue->iommu) 228 return -EBUSY; 229 230 /* Polling not implemented */ 231 if (!irq) 232 return -ENODEV; 233 234 queue->iommu = iommu; 235 rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, 236 IRQF_ONESHOT | IRQF_SHARED, 237 dev_name(iommu->dev), queue); 238 if (rc) { 239 queue->iommu = NULL; 240 return rc; 241 } 242 243 /* Empty queue before enabling it */ 244 if (queue->qid == RISCV_IOMMU_INTR_CQ) 245 riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0); 246 else 247 riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0); 248 249 /* 250 * Enable queue with interrupts, clear any memory fault if any. 251 * Wait for the hardware to acknowledge request and activate queue 252 * processing. 253 * Note: All CSR bitfields are in the same offsets for all queues. 254 */ 255 riscv_iommu_writel(iommu, queue->qcr, 256 RISCV_IOMMU_QUEUE_ENABLE | 257 RISCV_IOMMU_QUEUE_INTR_ENABLE | 258 RISCV_IOMMU_QUEUE_MEM_FAULT); 259 260 riscv_iommu_readl_timeout(iommu, queue->qcr, 261 csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), 262 10, RISCV_IOMMU_QCSR_TIMEOUT); 263 264 if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | 265 RISCV_IOMMU_QUEUE_BUSY | 266 RISCV_IOMMU_QUEUE_MEM_FAULT))) { 267 /* Best effort to stop and disable failing hardware queue. 
		riscv_iommu_writel(iommu, queue->qcr, 0);
		free_irq(irq, queue);
		queue->iommu = NULL;
		dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
		return -EBUSY;
	}

	/* Clear any pending interrupt flag. */
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	return 0;
}

/*
 * Disable queue. Wait for the hardware to acknowledge request and
 * stop processing enqueued requests. Report errors but continue.
 */
static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
{
	struct riscv_iommu_device *iommu = queue->iommu;
	u32 csr;

	if (!iommu)
		return;

	free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
	riscv_iommu_writel(iommu, queue->qcr, 0);
	riscv_iommu_readl_timeout(iommu, queue->qcr,
				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
				  10, RISCV_IOMMU_QCSR_TIMEOUT);

	if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
		dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
			queue->qid, csr);

	queue->iommu = NULL;
}

/*
 * Returns number of available valid queue entries and the first item index.
 * Update shadow producer index if necessary.
 */
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
				     unsigned int *index)
{
	unsigned int head = atomic_read(&queue->head);
	unsigned int tail = atomic_read(&queue->tail);
	unsigned int last = Q_ITEM(queue, tail);
	int available = (int)(tail - head);

	*index = head;

	if (available > 0)
		return available;

	/* read hardware producer index, check reserved register bits are not set. */
	if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
				      tail, (tail & ~queue->mask) == 0,
				      0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
		dev_err_once(queue->iommu->dev,
			     "Hardware error: queue access timeout\n");
		return 0;
	}

	if (tail == last)
		return 0;

	/* update shadow producer index */
	return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
}

/*
 * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
 */
static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
{
	const unsigned int head = atomic_add_return(count, &queue->head);

	riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
}

/* Return actual consumer index based on hardware reported queue head index. */
static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
{
	const unsigned int cons = atomic_read(&queue->head);
	const unsigned int last = Q_ITEM(queue, cons);
	unsigned int head;

	if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
				      !(head & ~queue->mask),
				      0, RISCV_IOMMU_QUEUE_TIMEOUT))
		return cons;

	return cons + ((head - last) & queue->mask);
}

/* Wait for submitted item to be processed.
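 *
 * Illustrative usage: riscv_iommu_cmd_sync() passes the producer index
 * returned by riscv_iommu_queue_send() for an IOFENCE.C command, and this
 * helper then polls riscv_iommu_queue_cons() until the consumer index
 * moves past it or timeout_us expires.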
 */
static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
				  unsigned int index,
				  unsigned int timeout_us)
{
	unsigned int cons = atomic_read(&queue->head);

	/* Already processed by the consumer */
	if ((int)(cons - index) > 0)
		return 0;

	/* Monitor consumer index */
	return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
				  (int)(cons - index) > 0, 0, timeout_us);
}

/* Enqueue an entry and wait to be processed if timeout_us > 0
 *
 * Error handling for IOMMU hardware not responding in reasonable time
 * will be added as separate patch series along with other RAS features.
 * For now, only report hardware failure and continue.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
					   void *entry, size_t entry_size)
{
	unsigned int prod;
	unsigned int head;
	unsigned int tail;
	unsigned long flags;

	/* Do not preempt submission flow. */
	local_irq_save(flags);

	/* 1. Allocate some space in the queue */
	prod = atomic_inc_return(&queue->prod) - 1;
	head = atomic_read(&queue->head);

	/* 2. Wait for space availability. */
	if ((prod - head) > queue->mask) {
		if (readx_poll_timeout(atomic_read, &queue->head,
				       head, (prod - head) < queue->mask,
				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
	} else if ((prod - head) == queue->mask) {
		const unsigned int last = Q_ITEM(queue, head);

		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
					      !(head & ~queue->mask) && head != last,
					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
		atomic_add((head - last) & queue->mask, &queue->head);
	}

	/* 3. Store entry in the ring buffer */
	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);

	/* 4. Wait for all previous entries to be ready */
	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
		goto err_busy;

	/*
	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
	 *    completed and visible before signaling the tail doorbell to fetch
	 *    the next command. 'fence ow, ow'
	 */
	dma_wmb();
	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));

	/*
	 * 6. Make sure the doorbell write to the device has finished before updating
	 *    the shadow tail index in normal memory. 'fence o, w'
	 */
	mmiowb();
	atomic_inc(&queue->tail);

	/* 7. Complete submission and restore local interrupts */
	local_irq_restore(flags);

	return prod;

err_busy:
	local_irq_restore(flags);
	dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");

	return prod;
}

/*
 * IOMMU Command queue chapter 3.1
 */

/* Command queue interrupt handler thread function */
static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
{
	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
	unsigned int ctrl;

	/* Clear MF/CQ errors, complete error recovery to be implemented.
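	 * The error/interrupt bits read from the CSR are written back below,
	 * which acknowledges (clears) them; the fault queue handler follows
	 * the same pattern for its FQCSR bits.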
	 */
	ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
		    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
		riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
		dev_warn(queue->iommu->dev,
			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
			 queue->qid,
			 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
			 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
	}

	/* Placeholder for command queue interrupt notifiers */

	/* Clear command interrupt pending. */
	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	return IRQ_HANDLED;
}

/* Send command to the IOMMU command queue */
static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
				 struct riscv_iommu_command *cmd)
{
	riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
}

/* Send IOFENCE.C command and wait for all scheduled commands to complete. */
static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
				 unsigned int timeout_us)
{
	struct riscv_iommu_command cmd;
	unsigned int prod;

	riscv_iommu_cmd_iofence(&cmd);
	prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));

	if (!timeout_us)
		return;

	if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
		dev_err_once(iommu->dev,
			     "Hardware error: command execution timeout\n");
}

/*
 * IOMMU Fault/Event queue chapter 3.2
 */

static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
			      struct riscv_iommu_fq_record *event)
{
	unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
	unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);

	/* Placeholder for future fault handling implementation, report only. */
	if (err)
		dev_warn_ratelimited(iommu->dev,
				     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
				     err, devid, event->iotval, event->iotval2);
}

/* Fault queue interrupt handler thread function */
static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
{
	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
	struct riscv_iommu_device *iommu = queue->iommu;
	struct riscv_iommu_fq_record *events;
	unsigned int ctrl, idx;
	int cnt, len;

	events = (struct riscv_iommu_fq_record *)queue->base;

	/* Clear fault interrupt pending and process all received fault events. */
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	do {
		cnt = riscv_iommu_queue_consume(queue, &idx);
		for (len = 0; len < cnt; idx++, len++)
			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
		riscv_iommu_queue_release(queue, cnt);
	} while (cnt > 0);

	/* Clear MF/OF errors, complete error recovery to be implemented. */
	ctrl = riscv_iommu_readl(iommu, queue->qcr);
	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
		riscv_iommu_writel(iommu, queue->qcr, ctrl);
		dev_warn(iommu->dev,
			 "Queue #%u error; memory fault:%d overflow:%d\n",
			 queue->qid,
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
	}

	return IRQ_HANDLED;
}

/* Lookup and initialize device context info structure.
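 *
 * Illustrative example (base format): devid 0x1234 splits into
 * DDI[0] = 0x1234 & 0x7f = 0x34, DDI[1] = (0x1234 >> 7) & 0x1ff = 0x24 and
 * DDI[2] = (0x1234 >> 16) & 0xff = 0, so a 3LVL walk descends through
 * non-leaf indexes 0 and 0x24 before selecting device context 0x34 in the
 * leaf table.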
 */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
						 unsigned int devid)
{
	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
	unsigned int depth;
	unsigned long ddt, old, new;
	void *ptr;
	u8 ddi_bits[3] = { 0 };
	u64 *ddtp = NULL;

	/* Make sure the mode is valid */
	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
		return NULL;

	/*
	 * Device id partitioning for base format:
	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
	 *
	 * For extended format:
	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
	 */
	if (base_format) {
		ddi_bits[0] = 7;
		ddi_bits[1] = 7 + 9;
		ddi_bits[2] = 7 + 9 + 8;
	} else {
		ddi_bits[0] = 6;
		ddi_bits[1] = 6 + 9;
		ddi_bits[2] = 6 + 9 + 9;
	}

	/* Make sure device id is within range */
	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
	if (devid >= (1 << ddi_bits[depth]))
		return NULL;

	/* Get to the level of the non-leaf node that holds the device context */
	for (ddtp = iommu->ddt_root; depth-- > 0;) {
		const int split = ddi_bits[depth];
		/*
		 * Each non-leaf node is 64bits wide and on each level
		 * nodes are indexed by DDI[depth].
		 */
		ddtp += (devid >> split) & 0x1FF;

		/*
		 * Check if this node has been populated and if not
		 * allocate a new level and populate it.
		 */
		do {
			ddt = READ_ONCE(*(unsigned long *)ddtp);
			if (ddt & RISCV_IOMMU_DDTE_V) {
				ddtp = __va(ppn_to_phys(ddt));
				break;
			}

			ptr = riscv_iommu_get_pages(iommu, 0);
			if (!ptr)
				return NULL;

			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);

			if (old == ddt) {
				ddtp = (u64 *)ptr;
				break;
			}

			/* Race setting DDT detected, re-read and retry. */
			riscv_iommu_free_pages(iommu, ptr);
		} while (1);
	}

	/*
	 * Grab the node that matches DDI[depth], note that when using base
	 * format the device context is 4 * 64bits, and the extended format
	 * is 8 * 64bits, hence the (3 - base_format) below.
	 */
	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);

	return (struct riscv_iommu_dc *)ddtp;
}

/*
 * This is best effort IOMMU translation shutdown flow.
 * Disable IOMMU without waiting for hardware response.
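 * Used by riscv_iommu_init_check() below when a kdump kernel finds
 * translation left enabled by the previous kernel.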
 */
void riscv_iommu_disable(struct riscv_iommu_device *iommu)
{
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
			   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
				      RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
}

#define riscv_iommu_read_ddtp(iommu) ({ \
	u64 ddtp; \
	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
				  RISCV_IOMMU_DDTP_TIMEOUT); \
	ddtp; })

static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
{
	u64 ddtp;
	unsigned int mode;

	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/*
	 * It is optional for the hardware to report a fixed address for device
	 * directory root page when DDT.MODE is OFF or BARE.
	 */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
		/* Use WARL to discover hardware fixed DDT PPN */
		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
			return -EBUSY;

		iommu->ddt_phys = ppn_to_phys(ddtp);
		if (iommu->ddt_phys)
			iommu->ddt_root = devm_ioremap(iommu->dev,
						       iommu->ddt_phys, PAGE_SIZE);
		if (iommu->ddt_root)
			memset(iommu->ddt_root, 0, PAGE_SIZE);
	}

	if (!iommu->ddt_root) {
		iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
		iommu->ddt_phys = __pa(iommu->ddt_root);
	}

	if (!iommu->ddt_root)
		return -ENOMEM;

	return 0;
}

/*
 * Discover supported DDT modes starting from requested value,
 * configure DDTP register with accepted mode and root DDT address.
 * Accepted iommu->ddt_mode is updated on success.
 */
static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
				      unsigned int ddtp_mode)
{
	struct device *dev = iommu->dev;
	u64 ddtp, rq_ddtp;
	unsigned int mode, rq_mode = ddtp_mode;
	struct riscv_iommu_command cmd;

	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/* Disallow state transition from xLVL to xLVL. */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
		return -EINVAL;

	do {
		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
		if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);

		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
				rq_mode, ddtp);
			return -EBUSY;
		}

		/* Verify IOMMU hardware accepts new DDTP config. */
		mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);

		if (rq_mode == mode)
			break;

		/* Hardware mandatory DDTP mode has not been accepted. */
		if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
				ddtp, rq_ddtp);
			return -EINVAL;
		}

		/*
		 * Mode field is WARL, an IOMMU may support a subset of
		 * directory table levels in which case if we tried to set
		 * an unsupported number of levels we'll readback either
		 * a valid xLVL or off/bare. If we got off/bare, try again
		 * with a smaller xLVL.
		 */
		if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
		    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
			rq_mode--;
			continue;
		}

		/*
		 * We tried all supported modes and IOMMU hardware failed to
		 * accept new settings, something went very wrong since off/bare
		 * and at least one xLVL must be supported.
		 */
		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
			mode, ddtp_mode);
		return -EINVAL;
	} while (1);

	iommu->ddt_mode = mode;
	if (mode != ddtp_mode)
		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);

	/* Invalidate device context cache */
	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* Invalidate address translation cache */
	riscv_iommu_cmd_inval_vma(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* IOFENCE.C */
	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	return 0;
}

/* This struct contains protection domain specific IOMMU driver data. */
struct riscv_iommu_domain {
	struct iommu_domain domain;
	struct list_head bonds;
	spinlock_t lock;		/* protect bonds list updates. */
	int pscid;
	bool amo_enabled;
	int numa_node;
	unsigned int pgd_mode;
	unsigned long *pgd_root;
};

#define iommu_domain_to_riscv(iommu_domain) \
	container_of(iommu_domain, struct riscv_iommu_domain, domain)

/* Private IOMMU data for managed devices, dev_iommu_priv_* */
struct riscv_iommu_info {
	struct riscv_iommu_domain *domain;
};

/*
 * Linkage between an iommu_domain and attached devices.
 *
 * A protection domain requiring IOATC and DevATC translation cache
 * invalidations should be linked to its attached devices using
 * riscv_iommu_bond structures.
 * Devices should be linked to the domain before first use and unlinked after
 * the translations from the referenced protection domain can no longer be used.
 * Blocking and identity domains are not tracked here, as the IOMMU hardware
 * does not cache negative and/or identity (BARE mode) translations, and DevATC
 * is disabled for those protection domains.
 *
 * The device pointer and IOMMU data remain stable in the bond struct after
 * _probe_device() where it's attached to the managed IOMMU, up to the
 * completion of the _release_device() call. The release of the bond structure
 * is synchronized with the device release.
 */
struct riscv_iommu_bond {
	struct list_head list;
	struct rcu_head rcu;
	struct device *dev;
};

static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
				 struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_bond *bond;
	struct list_head *bonds;

	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
	if (!bond)
		return -ENOMEM;
	bond->dev = dev;

	/*
	 * List of devices attached to the domain is arranged based on
	 * managed IOMMU device.
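	 * Keeping bonds that share an IOMMU adjacent lets
	 * riscv_iommu_iotlb_inval() skip duplicate invalidation requests by
	 * comparing each entry's IOMMU with the previously visited one
	 * (see the comments in that function).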
	 */

	spin_lock(&domain->lock);
	list_for_each(bonds, &domain->bonds)
		if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
			break;
	list_add_rcu(&bond->list, bonds);
	spin_unlock(&domain->lock);

	/* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
	smp_mb();

	return 0;
}

static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
				    struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_bond *bond, *found = NULL;
	struct riscv_iommu_command cmd;
	int count = 0;

	if (!domain)
		return;

	spin_lock(&domain->lock);
	list_for_each_entry(bond, &domain->bonds, list) {
		if (found && count)
			break;
		else if (bond->dev == dev)
			found = bond;
		else if (dev_to_iommu(bond->dev) == iommu)
			count++;
	}
	if (found)
		list_del_rcu(&found->list);
	spin_unlock(&domain->lock);
	kfree_rcu(found, rcu);

	/*
	 * If this was the last bond between this domain and the IOMMU
	 * invalidate all cached entries for domain's PSCID.
	 */
	if (!count) {
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		riscv_iommu_cmd_send(iommu, &cmd);

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
	}
}

/*
 * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, when the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)

static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
				    unsigned long start, unsigned long end)
{
	struct riscv_iommu_bond *bond;
	struct riscv_iommu_device *iommu, *prev;
	struct riscv_iommu_command cmd;
	unsigned long len = end - start + 1;
	unsigned long iova;

	/*
	 * For each IOMMU linked with this protection domain (via bonds->dev),
	 * an IOTLB invalidation command will be submitted and executed.
	 *
	 * Possible race with domain attach flow is handled by sequencing
	 * bond creation - riscv_iommu_bond_link(), and device directory
	 * update - riscv_iommu_iodir_update().
	 *
	 *   PTE Update / IOTLB Inval            Device attach & directory update
	 *   --------------------------          --------------------------
	 *   update page table entries           add dev to the bond list
	 *   FENCE RW,RW                         FENCE RW,RW
	 *   For all IOMMUs: (can be empty)      Update FSC/PSCID
	 *     FENCE IOW,IOW                       FENCE IOW,IOW
	 *     IOTLB.INVAL                         IODIR.INVAL
	 *     IOFENCE.C
	 *
	 * If bond list is not updated with new device, directory context will
	 * be configured with already valid page table content. If an IOMMU is
	 * linked to the protection domain it will receive invalidation
	 * requests for updated page table entries.
	 */
	smp_mb();

	rcu_read_lock();

	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);

		/*
		 * IOTLB invalidation request can be safely omitted if already sent
		 * to the IOMMU for the same PSCID, and with domain->bonds list
		 * arranged based on the device's IOMMU, it's sufficient to check
		 * the last device the invalidation was sent to.
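		 * Scope of each request (illustrative numbers): a range below
		 * the 2 MiB limit, e.g. 16 KiB, is sent as four address-scoped
		 * IOTINVAL.VMA commands, while larger ranges fall back to a
		 * single PSCID-wide invalidation.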
968 */ 969 if (iommu == prev) 970 continue; 971 972 riscv_iommu_cmd_inval_vma(&cmd); 973 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); 974 if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { 975 for (iova = start; iova < end; iova += PAGE_SIZE) { 976 riscv_iommu_cmd_inval_set_addr(&cmd, iova); 977 riscv_iommu_cmd_send(iommu, &cmd); 978 } 979 } else { 980 riscv_iommu_cmd_send(iommu, &cmd); 981 } 982 prev = iommu; 983 } 984 985 prev = NULL; 986 list_for_each_entry_rcu(bond, &domain->bonds, list) { 987 iommu = dev_to_iommu(bond->dev); 988 if (iommu == prev) 989 continue; 990 991 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 992 prev = iommu; 993 } 994 rcu_read_unlock(); 995 } 996 997 #define RISCV_IOMMU_FSC_BARE 0 998 999 /* 1000 * Update IODIR for the device. 1001 * 1002 * During the execution of riscv_iommu_probe_device(), IODIR entries are 1003 * allocated for the device's identifiers. Device context invalidation 1004 * becomes necessary only if one of the updated entries was previously 1005 * marked as valid, given that invalid device context entries are not 1006 * cached by the IOMMU hardware. 1007 * In this implementation, updating a valid device context while the 1008 * device is not quiesced might be disruptive, potentially causing 1009 * interim translation faults. 1010 */ 1011 static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, 1012 struct device *dev, u64 fsc, u64 ta) 1013 { 1014 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 1015 struct riscv_iommu_dc *dc; 1016 struct riscv_iommu_command cmd; 1017 bool sync_required = false; 1018 u64 tc; 1019 int i; 1020 1021 for (i = 0; i < fwspec->num_ids; i++) { 1022 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); 1023 tc = READ_ONCE(dc->tc); 1024 if (!(tc & RISCV_IOMMU_DC_TC_V)) 1025 continue; 1026 1027 WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V); 1028 1029 /* Invalidate device context cached values */ 1030 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 1031 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); 1032 riscv_iommu_cmd_send(iommu, &cmd); 1033 sync_required = true; 1034 } 1035 1036 if (sync_required) 1037 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 1038 1039 /* 1040 * For device context with DC_TC_PDTV = 0, translation attributes valid bit 1041 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)). 1042 */ 1043 for (i = 0; i < fwspec->num_ids; i++) { 1044 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); 1045 tc = READ_ONCE(dc->tc); 1046 tc |= ta & RISCV_IOMMU_DC_TC_V; 1047 1048 WRITE_ONCE(dc->fsc, fsc); 1049 WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID); 1050 /* Update device context, write TC.V as the last step. */ 1051 dma_wmb(); 1052 WRITE_ONCE(dc->tc, tc); 1053 1054 /* Invalidate device context after update */ 1055 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 1056 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); 1057 riscv_iommu_cmd_send(iommu, &cmd); 1058 } 1059 1060 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); 1061 } 1062 1063 /* 1064 * IOVA page translation tree management. 
 */

static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
}

static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
				   struct iommu_iotlb_gather *gather)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
}

#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))

#define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
#define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
#define _io_pte_none(pte)	((pte) == 0)
#define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))

static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
				 unsigned long pte, struct list_head *freelist)
{
	unsigned long *ptr;
	int i;

	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
		return;

	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));

	/* Recursively free all sub page table pages */
	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = READ_ONCE(ptr[i]);
		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
			riscv_iommu_pte_free(domain, pte, freelist);
	}

	if (freelist)
		list_add_tail(&virt_to_page(ptr)->lru, freelist);
	else
		iommu_free_page(ptr);
}

static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
					    unsigned long iova, size_t pgsize,
					    gfp_t gfp)
{
	unsigned long *ptr = domain->pgd_root;
	unsigned long pte, old;
	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
	void *addr;

	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		/*
		 * Note: returned entry might be a non-leaf if there was
		 * existing mapping with smaller granularity. Up to the caller
		 * to replace and invalidate.
		 */
		if (((size_t)1 << shift) == pgsize)
			return ptr;
pte_retry:
		pte = READ_ONCE(*ptr);
		/*
		 * This is very likely incorrect as we should not be adding
		 * new mapping with smaller granularity on top
		 * of existing 2M/1G mapping. Fail.
		 */
		if (_io_pte_present(pte) && _io_pte_leaf(pte))
			return NULL;
		/*
		 * Non-leaf entry is missing, allocate and try to add to the
		 * page table. This might race with other mappings, retry.
		 */
1145 */ 1146 if (_io_pte_none(pte)) { 1147 addr = iommu_alloc_page_node(domain->numa_node, gfp); 1148 if (!addr) 1149 return NULL; 1150 old = pte; 1151 pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); 1152 if (cmpxchg_relaxed(ptr, old, pte) != old) { 1153 iommu_free_page(addr); 1154 goto pte_retry; 1155 } 1156 } 1157 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1158 } while (level-- > 0); 1159 1160 return NULL; 1161 } 1162 1163 static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, 1164 unsigned long iova, size_t *pte_pgsize) 1165 { 1166 unsigned long *ptr = domain->pgd_root; 1167 unsigned long pte; 1168 int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; 1169 1170 do { 1171 const int shift = PAGE_SHIFT + PT_SHIFT * level; 1172 1173 ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); 1174 pte = READ_ONCE(*ptr); 1175 if (_io_pte_present(pte) && _io_pte_leaf(pte)) { 1176 *pte_pgsize = (size_t)1 << shift; 1177 return ptr; 1178 } 1179 if (_io_pte_none(pte)) 1180 return NULL; 1181 ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1182 } while (level-- > 0); 1183 1184 return NULL; 1185 } 1186 1187 static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, 1188 unsigned long iova, phys_addr_t phys, 1189 size_t pgsize, size_t pgcount, int prot, 1190 gfp_t gfp, size_t *mapped) 1191 { 1192 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1193 size_t size = 0; 1194 unsigned long *ptr; 1195 unsigned long pte, old, pte_prot; 1196 int rc = 0; 1197 LIST_HEAD(freelist); 1198 1199 if (!(prot & IOMMU_WRITE)) 1200 pte_prot = _PAGE_BASE | _PAGE_READ; 1201 else if (domain->amo_enabled) 1202 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; 1203 else 1204 pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; 1205 1206 while (pgcount) { 1207 ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); 1208 if (!ptr) { 1209 rc = -ENOMEM; 1210 break; 1211 } 1212 1213 old = READ_ONCE(*ptr); 1214 pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); 1215 if (cmpxchg_relaxed(ptr, old, pte) != old) 1216 continue; 1217 1218 riscv_iommu_pte_free(domain, old, &freelist); 1219 1220 size += pgsize; 1221 iova += pgsize; 1222 phys += pgsize; 1223 --pgcount; 1224 } 1225 1226 *mapped = size; 1227 1228 if (!list_empty(&freelist)) { 1229 /* 1230 * In 1.0 spec version, the smallest scope we can use to 1231 * invalidate all levels of page table (i.e. leaf and non-leaf) 1232 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0. 1233 * This will be updated with hardware support for 1234 * capability.NL (non-leaf) IOTINVAL command. 1235 */ 1236 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); 1237 iommu_put_pages_list(&freelist); 1238 } 1239 1240 return rc; 1241 } 1242 1243 static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, 1244 unsigned long iova, size_t pgsize, 1245 size_t pgcount, 1246 struct iommu_iotlb_gather *gather) 1247 { 1248 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1249 size_t size = pgcount << __ffs(pgsize); 1250 unsigned long *ptr, old; 1251 size_t unmapped = 0; 1252 size_t pte_size; 1253 1254 while (unmapped < size) { 1255 ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); 1256 if (!ptr) 1257 return unmapped; 1258 1259 /* partial unmap is not allowed, fail. 
		 */
		if (iova & (pte_size - 1))
			return unmapped;

		old = READ_ONCE(*ptr);
		if (cmpxchg_relaxed(ptr, old, 0) != old)
			continue;

		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
					    pte_size);

		iova += pte_size;
		unmapped += pte_size;
	}

	return unmapped;
}

static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
					    dma_addr_t iova)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	size_t pte_size;
	unsigned long *ptr;

	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
	if (!ptr || !_io_pte_present(*ptr))
		return 0;

	return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
}

static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	const unsigned long pfn = virt_to_pfn(domain->pgd_root);

	WARN_ON(!list_empty(&domain->bonds));

	if ((int)domain->pscid > 0)
		ida_free(&riscv_iommu_pscids, domain->pscid);

	riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
	kfree(domain);
}

static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
{
	switch (pgd_mode) {
	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;

	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;

	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
	}
	return false;
}

static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
					    struct device *dev)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
	u64 fsc, ta;

	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
		return -ENODEV;

	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
	     RISCV_IOMMU_PC_TA_V;

	if (riscv_iommu_bond_link(domain, dev))
		return -ENOMEM;

	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = domain;

	return 0;
}

static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
	.attach_dev = riscv_iommu_attach_paging_domain,
	.free = riscv_iommu_free_paging_domain,
	.map_pages = riscv_iommu_map_pages,
	.unmap_pages = riscv_iommu_unmap_pages,
	.iova_to_phys = riscv_iommu_iova_to_phys,
	.iotlb_sync = riscv_iommu_iotlb_sync,
	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
};

static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
{
	struct riscv_iommu_domain *domain;
	struct riscv_iommu_device *iommu;
	unsigned int pgd_mode;
	dma_addr_t va_mask;
	int va_bits;

	iommu = dev_to_iommu(dev);
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
		va_bits = 57;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
		va_bits = 48;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
		va_bits = 39;
	} else {
		dev_err(dev, "cannot find supported page table mode\n");
		return ERR_PTR(-ENODEV);
	}

	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD_RCU(&domain->bonds);
	spin_lock_init(&domain->lock);
	domain->numa_node = dev_to_node(iommu->dev);
	domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
	domain->pgd_mode = pgd_mode;
	domain->pgd_root = iommu_alloc_page_node(domain->numa_node,
						 GFP_KERNEL_ACCOUNT);
	if (!domain->pgd_root) {
		kfree(domain);
		return ERR_PTR(-ENOMEM);
	}

	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
					RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
	if (domain->pscid < 0) {
		iommu_free_page(domain->pgd_root);
		kfree(domain);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * Note: RISC-V Privileged spec mandates that virtual addresses
	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
	 * bits >= VA_BITS need to also be set or else we'll get a
	 * page fault. However the code that creates the mappings
	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
	 * for now, so we'll end up with invalid virtual addresses
	 * to map. As a workaround until we get this sorted out
	 * limit the available virtual addresses to VA_BITS - 1.
	 */
	va_mask = DMA_BIT_MASK(va_bits - 1);

	domain->domain.geometry.aperture_start = 0;
	domain->domain.geometry.aperture_end = va_mask;
	domain->domain.geometry.force_aperture = true;
	domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);

	domain->domain.ops = &riscv_iommu_paging_domain_ops;

	return &domain->domain;
}

static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
					      struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	/* Make device context invalid, translation requests will fault w/ #258 */
	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

static struct iommu_domain riscv_iommu_blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_blocking_domain,
	}
};

static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
					      struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

static struct iommu_domain riscv_iommu_identity_domain = {
	.type = IOMMU_DOMAIN_IDENTITY,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_identity_domain,
	}
};

static struct iommu_group *riscv_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
{
	return iommu_fwspec_add_ids(dev, args->args, 1);
}

static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_device *iommu;
	struct riscv_iommu_info *info;
	struct riscv_iommu_dc *dc;
	u64 tc;
	int i;

	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
		return ERR_PTR(-ENODEV);

	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	/*
	 * IOMMU hardware operating in fail-over BARE mode will provide
	 * identity translation for all connected devices anyway...
	 */
	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);
	/*
	 * Allocate and pre-configure device context entries in
	 * the device directory. Do not mark the context valid yet.
	 */
	tc = 0;
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
		tc |= RISCV_IOMMU_DC_TC_SADE;
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		if (!dc) {
			kfree(info);
			return ERR_PTR(-ENODEV);
		}
		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
			dev_warn(dev, "already attached to IOMMU device directory\n");
		WRITE_ONCE(dc->tc, tc);
	}

	dev_iommu_priv_set(dev, info);

	return &iommu->iommu;
}

static void riscv_iommu_release_device(struct device *dev)
{
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	kfree_rcu_mightsleep(info);
}

static const struct iommu_ops riscv_iommu_ops = {
	.pgsize_bitmap = SZ_4K,
	.of_xlate = riscv_iommu_of_xlate,
	.identity_domain = &riscv_iommu_identity_domain,
	.blocked_domain = &riscv_iommu_blocking_domain,
	.release_domain = &riscv_iommu_blocking_domain,
	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
	.device_group = riscv_iommu_device_group,
	.probe_device = riscv_iommu_probe_device,
	.release_device = riscv_iommu_release_device,
};

static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
{
	u64 ddtp;

	/*
	 * Make sure the IOMMU is switched off or in pass-through mode during
	 * regular boot flow and disable translation when we boot into a kexec
	 * kernel and the previous kernel left them enabled.
	 */
	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
	    RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
		if (!is_kdump_kernel())
			return -EBUSY;
		riscv_iommu_disable(iommu);
	}

	/* Configure accesses to in-memory data structures for CPU-native byte order. */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
			return -EINVAL;
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
			return -EINVAL;
	}

	/*
	 * Distribute interrupt vectors, always use first vector for CIV.
	 * At least one interrupt is required. Read back and verify.
	 */
	if (!iommu->irqs_count)
		return -EINVAL;

	iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
	iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
	if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
		max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
		return -EINVAL;

	return 0;
}

void riscv_iommu_remove(struct riscv_iommu_device *iommu)
{
	iommu_device_unregister(&iommu->iommu);
	iommu_device_sysfs_remove(&iommu->iommu);
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
	riscv_iommu_queue_disable(&iommu->cmdq);
	riscv_iommu_queue_disable(&iommu->fltq);
}

int riscv_iommu_init(struct riscv_iommu_device *iommu)
{
	int rc;

	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);

	rc = riscv_iommu_init_check(iommu);
	if (rc)
		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");

	rc = riscv_iommu_iodir_alloc(iommu);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
				     sizeof(struct riscv_iommu_command));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
				     sizeof(struct riscv_iommu_fq_record));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
	if (rc)
		goto err_queue_disable;

	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
	if (rc)
		goto err_queue_disable;

	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
				    dev_name(iommu->dev));
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
		goto err_iodir_off;
	}

	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
		goto err_remove_sysfs;
	}

	return 0;

err_remove_sysfs:
	iommu_device_sysfs_remove(&iommu->iommu);
err_iodir_off:
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
err_queue_disable:
	riscv_iommu_queue_disable(&iommu->fltq);
	riscv_iommu_queue_disable(&iommu->cmdq);
	return rc;
}
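
/*
 * Illustrative note for the bus-specific front-ends (outside this file):
 * the glue code is expected to fill in struct riscv_iommu_device (device
 * pointer, MMIO accessors, interrupt vectors) before calling
 * riscv_iommu_init(), and to call riscv_iommu_remove() on teardown.
 * On an init error the unwind labels above undo the steps already
 * completed.
 */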