// SPDX-License-Identifier: GPL-2.0-only
/*
 * IOMMU API for RISC-V IOMMU implementations.
 *
 * Copyright © 2022-2024 Rivos Inc.
 * Copyright © 2023 FORTH-ICS/CARV
 *
 * Authors
 *	Tomasz Jeznach <tjeznach@rivosinc.com>
 *	Nick Kossifidis <mick@ics.forth.gr>
 */

#define pr_fmt(fmt) "riscv-iommu: " fmt

#include <linux/acpi.h>
#include <linux/acpi_rimt.h>
#include <linux/compiler.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/iopoll.h>
#include <linux/kernel.h>
#include <linux/pci.h>

#include "../iommu-pages.h"
#include "iommu-bits.h"
#include "iommu.h"

/* Timeouts in [us] */
#define RISCV_IOMMU_QCSR_TIMEOUT	150000
#define RISCV_IOMMU_QUEUE_TIMEOUT	150000
#define RISCV_IOMMU_DDTP_TIMEOUT	10000000
#define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000

/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
#define RISCV_IOMMU_DEF_CQ_COUNT	8192
#define RISCV_IOMMU_DEF_FQ_COUNT	4096

/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
#define phys_to_ppn(pa)	(((pa) >> 2) & (((1ULL << 44) - 1) << 10))
#define ppn_to_phys(pn)	(((pn) << 2) & (((1ULL << 44) - 1) << 12))

#define dev_to_iommu(dev) \
	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)

/* IOMMU PSCID allocation namespace. */
static DEFINE_IDA(riscv_iommu_pscids);
#define RISCV_IOMMU_MAX_PSCID		(BIT(20) - 1)

/* Device resource-managed allocations */
struct riscv_iommu_devres {
	void *addr;
};

static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
{
	struct riscv_iommu_devres *devres = res;

	iommu_free_pages(devres->addr);
}

static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
{
	struct riscv_iommu_devres *devres = res;
	struct riscv_iommu_devres *target = p;

	return devres->addr == target->addr;
}

static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu,
				   unsigned int size)
{
	struct riscv_iommu_devres *devres;
	void *addr;

	addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev),
					 GFP_KERNEL_ACCOUNT, size);
	if (unlikely(!addr))
		return NULL;

	devres = devres_alloc(riscv_iommu_devres_pages_release,
			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);

	if (unlikely(!devres)) {
		iommu_free_pages(addr);
		return NULL;
	}

	devres->addr = addr;

	devres_add(iommu->dev, devres);

	return addr;
}

static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
{
	struct riscv_iommu_devres devres = { .addr = addr };

	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
		       riscv_iommu_devres_pages_match, &devres);
}

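/*
 * Illustrative usage note (not part of the original sources): pages obtained
 * from riscv_iommu_get_pages() are tied to the IOMMU device lifetime through
 * devres, so a typical caller only needs:
 *
 *	ptr = riscv_iommu_get_pages(iommu, SZ_4K);
 *	if (!ptr)
 *		return -ENOMEM;
 *
 * and may either rely on devres to release the pages on driver detach or call
 * riscv_iommu_free_pages(iommu, ptr) for an early, explicit release.
 */
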
/*
 * Hardware queue allocation and management.
 */

/* Setup queue base, control registers and default queue length */
#define RISCV_IOMMU_QUEUE_INIT(q, name) do {				\
	struct riscv_iommu_queue *_q = q;				\
	_q->qid = RISCV_IOMMU_INTR_ ## name;				\
	_q->qbr = RISCV_IOMMU_REG_ ## name ## B;			\
	_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;			\
	_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
} while (0)

/* Note: offsets are the same for all queues */
#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
#define Q_ITEM(q, index) ((q)->mask & (index))
#define Q_IPSR(q) BIT((q)->qid)

/*
 * Discover queue ring buffer hardware configuration, allocate in-memory
 * ring buffer or use fixed I/O memory location, configure queue base register.
 * Must be called before hardware queue is enabled.
 *
 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
 * @entry_size - queue single element size in bytes.
 */
static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
				   struct riscv_iommu_queue *queue,
				   size_t entry_size)
{
	unsigned int logsz;
	u64 qb, rb;

	/*
	 * Use WARL base register property to discover maximum allowed
	 * number of entries and optional fixed IO address for queue location.
	 */
	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
	qb = riscv_iommu_readq(iommu, queue->qbr);

	/*
	 * Calculate and verify hardware supported queue length, as reported
	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
	 * Update queue size based on hardware supported value.
	 */
	logsz = ilog2(queue->mask);
	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);

	/*
	 * Use WARL base register property to discover an optional fixed IO
	 * address for queue ring buffer location. Otherwise allocate contiguous
	 * system memory.
	 */
	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
		const size_t queue_size = entry_size << (logsz + 1);

		queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
		queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
	} else {
		do {
			const size_t queue_size = entry_size << (logsz + 1);

			queue->base = riscv_iommu_get_pages(
				iommu, max(queue_size, SZ_4K));
			queue->phys = __pa(queue->base);
		} while (!queue->base && logsz-- > 0);
	}

	if (!queue->base)
		return -ENOMEM;

	qb = phys_to_ppn(queue->phys) |
	     FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);

	/* Update base register and read back to verify hw accepted our write */
	riscv_iommu_writeq(iommu, queue->qbr, qb);
	rb = riscv_iommu_readq(iommu, queue->qbr);
	if (rb != qb) {
		dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
		return -ENODEV;
	}

	/* Update actual queue mask */
	queue->mask = (2U << logsz) - 1;

	dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
		queue->qid, logsz + 1);

	return 0;
}

/* Check interrupt queue status, IPSR */
static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
{
	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;

	if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
		return IRQ_WAKE_THREAD;

	return IRQ_NONE;
}

static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
{
	/* Reuse ICVEC.CIV mask for all interrupt vector mappings. */
	return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
}

/*
 * Enable queue processing in the hardware, register interrupt handler.
 *
 * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
 * @irq_handler - threaded interrupt handler.
 */
static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
				    struct riscv_iommu_queue *queue,
				    irq_handler_t irq_handler)
{
	const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
	u32 csr;
	int rc;

	if (queue->iommu)
		return -EBUSY;

	/* Polling not implemented */
	if (!irq)
		return -ENODEV;

	queue->iommu = iommu;
	rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
				  IRQF_ONESHOT | IRQF_SHARED,
				  dev_name(iommu->dev), queue);
	if (rc) {
		queue->iommu = NULL;
		return rc;
	}

	/* Empty queue before enabling it */
	if (queue->qid == RISCV_IOMMU_INTR_CQ)
		riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
	else
		riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);

	/*
	 * Enable queue with interrupts, clearing any memory fault if present.
	 * Wait for the hardware to acknowledge request and activate queue
	 * processing.
	 * Note: All CSR bitfields are in the same offsets for all queues.
	 */
	riscv_iommu_writel(iommu, queue->qcr,
			   RISCV_IOMMU_QUEUE_ENABLE |
			   RISCV_IOMMU_QUEUE_INTR_ENABLE |
			   RISCV_IOMMU_QUEUE_MEM_FAULT);

	riscv_iommu_readl_timeout(iommu, queue->qcr,
				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
				  10, RISCV_IOMMU_QCSR_TIMEOUT);

	if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
						RISCV_IOMMU_QUEUE_BUSY |
						RISCV_IOMMU_QUEUE_MEM_FAULT))) {
		/* Best effort to stop and disable failing hardware queue. */
		riscv_iommu_writel(iommu, queue->qcr, 0);
		free_irq(irq, queue);
		queue->iommu = NULL;
		dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
		return -EBUSY;
	}

	/* Clear any pending interrupt flag. */
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	return 0;
}

/*
 * Disable queue. Wait for the hardware to acknowledge request and
 * stop processing enqueued requests. Report errors but continue.
 */
static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
{
	struct riscv_iommu_device *iommu = queue->iommu;
	u32 csr;

	if (!iommu)
		return;

	free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
	riscv_iommu_writel(iommu, queue->qcr, 0);
	riscv_iommu_readl_timeout(iommu, queue->qcr,
				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
				  10, RISCV_IOMMU_QCSR_TIMEOUT);

	if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
		dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
			queue->qid, csr);

	queue->iommu = NULL;
}

/*
 * Returns number of available valid queue entries and the first item index.
 * Update shadow producer index if necessary.
 */
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
				     unsigned int *index)
{
	unsigned int head = atomic_read(&queue->head);
	unsigned int tail = atomic_read(&queue->tail);
	unsigned int last = Q_ITEM(queue, tail);
	int available = (int)(tail - head);

	*index = head;

	if (available > 0)
		return available;

	/* read hardware producer index, check reserved register bits are not set. */
	if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
				      tail, (tail & ~queue->mask) == 0,
				      0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
		dev_err_once(queue->iommu->dev,
			     "Hardware error: queue access timeout\n");
		return 0;
	}

	if (tail == last)
		return 0;

	/* update shadow producer index */
	return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
}

/*
 * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
 */
static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
{
	const unsigned int head = atomic_add_return(count, &queue->head);

	riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
}

/* Return actual consumer index based on hardware reported queue head index. */
static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
{
	const unsigned int cons = atomic_read(&queue->head);
	const unsigned int last = Q_ITEM(queue, cons);
	unsigned int head;

	if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
				      !(head & ~queue->mask),
				      0, RISCV_IOMMU_QUEUE_TIMEOUT))
		return cons;

	return cons + ((head - last) & queue->mask);
}

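/*
 * Editorial note on the index arithmetic above: queue->head and queue->tail
 * are free-running 32-bit counters and only Q_ITEM() maps them to ring slots,
 * so wrap-around is handled by unsigned subtraction. For example, with
 * mask == 4095, head == 4094 and tail == 4097 there are three valid entries,
 * occupying slots 4094, 4095 and 0.
 */
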
/* Wait for submitted item to be processed. */
static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
				  unsigned int index,
				  unsigned int timeout_us)
{
	unsigned int cons = atomic_read(&queue->head);

	/* Already processed by the consumer */
	if ((int)(cons - index) > 0)
		return 0;

	/* Monitor consumer index */
	return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
				  (int)(cons - index) > 0, 0, timeout_us);
}

/* Enqueue an entry and wait to be processed if timeout_us > 0
 *
 * Error handling for IOMMU hardware not responding in reasonable time
 * will be added as separate patch series along with other RAS features.
 * For now, only report hardware failure and continue.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
					   void *entry, size_t entry_size)
{
	unsigned int prod;
	unsigned int head;
	unsigned int tail;
	unsigned long flags;

	/* Do not preempt submission flow. */
	local_irq_save(flags);

	/* 1. Allocate some space in the queue */
	prod = atomic_inc_return(&queue->prod) - 1;
	head = atomic_read(&queue->head);

	/* 2. Wait for space availability. */
	if ((prod - head) > queue->mask) {
		if (readx_poll_timeout(atomic_read, &queue->head,
				       head, (prod - head) < queue->mask,
				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
	} else if ((prod - head) == queue->mask) {
		const unsigned int last = Q_ITEM(queue, head);

		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
					      !(head & ~queue->mask) && head != last,
					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
		atomic_add((head - last) & queue->mask, &queue->head);
	}

	/* 3. Store entry in the ring buffer */
	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);

	/* 4. Wait for all previous entries to be ready */
	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
		goto err_busy;

	/*
	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
	 *    completed and visible before signaling the tail doorbell to fetch
	 *    the next command. 'fence ow, ow'
	 */
	dma_wmb();
	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));

	/*
	 * 6. Make sure the doorbell write to the device has finished before updating
	 *    the shadow tail index in normal memory. 'fence o, w'
	 */
	mmiowb();
	atomic_inc(&queue->tail);

	/* 7. Complete submission and restore local interrupts */
	local_irq_restore(flags);

	return prod;

err_busy:
	local_irq_restore(flags);
	dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");

	return prod;
}

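/*
 * Typical submission pattern (illustrative; mirrors riscv_iommu_cmd_sync()
 * below): the producer index returned by riscv_iommu_queue_send() is passed to
 * riscv_iommu_queue_wait() to wait for the hardware consumer to move past it:
 *
 *	prod = riscv_iommu_queue_send(queue, &cmd, sizeof(cmd));
 *	if (riscv_iommu_queue_wait(queue, prod, timeout_us))
 *		...report that the command was not consumed in time...
 *
 * The wait only polls the shadow and hardware consumer indexes, so it runs
 * outside the interrupts-disabled submission path.
 */
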
/*
 * IOMMU Command queue chapter 3.1
 */

/* Command queue interrupt handler thread function */
static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
{
	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
	unsigned int ctrl;

	/* Clear MF/CQ errors, complete error recovery to be implemented. */
	ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
		    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
		riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
		dev_warn(queue->iommu->dev,
			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
			 queue->qid,
			 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
			 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
	}

	/* Placeholder for command queue interrupt notifiers */

	/* Clear command interrupt pending. */
	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	return IRQ_HANDLED;
}

/* Send command to the IOMMU command queue */
static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
				 struct riscv_iommu_command *cmd)
{
	riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
}

/* Send IOFENCE.C command and wait for all scheduled commands to complete. */
static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
				 unsigned int timeout_us)
{
	struct riscv_iommu_command cmd;
	unsigned int prod;

	riscv_iommu_cmd_iofence(&cmd);
	prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));

	if (!timeout_us)
		return;

	if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
		dev_err_once(iommu->dev,
			     "Hardware error: command execution timeout\n");
}

/*
 * IOMMU Fault/Event queue chapter 3.2
 */

static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
			      struct riscv_iommu_fq_record *event)
{
	unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
	unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);

	/* Placeholder for future fault handling implementation, report only. */
	if (err)
		dev_warn_ratelimited(iommu->dev,
				     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
				     err, devid, event->iotval, event->iotval2);
}

/* Fault queue interrupt handler thread function */
static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
{
	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
	struct riscv_iommu_device *iommu = queue->iommu;
	struct riscv_iommu_fq_record *events;
	unsigned int ctrl, idx;
	int cnt, len;

	events = (struct riscv_iommu_fq_record *)queue->base;

	/* Clear fault interrupt pending and process all received fault events. */
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	do {
		cnt = riscv_iommu_queue_consume(queue, &idx);
		for (len = 0; len < cnt; idx++, len++)
			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
		riscv_iommu_queue_release(queue, cnt);
	} while (cnt > 0);

	/* Clear MF/OF errors, complete error recovery to be implemented. */
	ctrl = riscv_iommu_readl(iommu, queue->qcr);
	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
		riscv_iommu_writel(iommu, queue->qcr, ctrl);
		dev_warn(iommu->dev,
			 "Queue #%u error; memory fault:%d overflow:%d\n",
			 queue->qid,
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
	}

	return IRQ_HANDLED;
}

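/*
 * Note: the fault reporting above is currently informational only. For
 * example, a device left attached to the blocking domain has an invalid
 * device context, so its DMA accesses are reported here with cause 258 (see
 * the comment in riscv_iommu_attach_blocking_domain() below).
 */
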
/* Lookup and initialize device context info structure. */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
						 unsigned int devid)
{
	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
	unsigned int depth;
	unsigned long ddt, old, new;
	void *ptr;
	u8 ddi_bits[3] = { 0 };
	u64 *ddtp = NULL;

	/* Make sure the mode is valid */
	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
		return NULL;

	/*
	 * Device id partitioning for base format:
	 * DDI[0]: bits  0 - 6   (1st level) (7 bits)
	 * DDI[1]: bits  7 - 15  (2nd level) (9 bits)
	 * DDI[2]: bits 16 - 23  (3rd level) (8 bits)
	 *
	 * For extended format:
	 * DDI[0]: bits  0 - 5   (1st level) (6 bits)
	 * DDI[1]: bits  6 - 14  (2nd level) (9 bits)
	 * DDI[2]: bits 15 - 23  (3rd level) (9 bits)
	 */
	if (base_format) {
		ddi_bits[0] = 7;
		ddi_bits[1] = 7 + 9;
		ddi_bits[2] = 7 + 9 + 8;
	} else {
		ddi_bits[0] = 6;
		ddi_bits[1] = 6 + 9;
		ddi_bits[2] = 6 + 9 + 9;
	}

	/* Make sure device id is within range */
	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
	if (devid >= (1 << ddi_bits[depth]))
		return NULL;

	/* Get to the level of the non-leaf node that holds the device context */
	for (ddtp = iommu->ddt_root; depth-- > 0;) {
		const int split = ddi_bits[depth];
		/*
		 * Each non-leaf node is 64bits wide and on each level
		 * nodes are indexed by DDI[depth].
		 */
		ddtp += (devid >> split) & 0x1FF;

		/*
		 * Check if this node has been populated and if not
		 * allocate a new level and populate it.
		 */
		do {
			ddt = READ_ONCE(*(unsigned long *)ddtp);
			if (ddt & RISCV_IOMMU_DDTE_V) {
				ddtp = __va(ppn_to_phys(ddt));
				break;
			}

			ptr = riscv_iommu_get_pages(iommu, SZ_4K);
			if (!ptr)
				return NULL;

			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);

			if (old == ddt) {
				ddtp = (u64 *)ptr;
				break;
			}

			/* Race setting DDT detected, re-read and retry. */
			riscv_iommu_free_pages(iommu, ptr);
		} while (1);
	}

	/*
	 * Grab the node that matches DDI[depth], note that when using base
	 * format the device context is 4 * 64bits, and the extended format
	 * is 8 * 64bits, hence the (3 - base_format) below.
	 */
	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);

	return (struct riscv_iommu_dc *)ddtp;
}

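/*
 * Worked example for the partitioning above (base format, 3LVL mode): devid
 * 0x012345 decomposes into DDI[2] = 0x01 (bits 16-23), DDI[1] = 0x46
 * (bits 7-15) and DDI[0] = 0x45 (bits 0-6), so the walk visits one non-leaf
 * node per level before reaching the leaf device context.
 */
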
/*
 * This is best effort IOMMU translation shutdown flow.
 * Disable IOMMU without waiting for hardware response.
 */
void riscv_iommu_disable(struct riscv_iommu_device *iommu)
{
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
			   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
				      RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
}

#define riscv_iommu_read_ddtp(iommu) ({ \
	u64 ddtp; \
	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
				  RISCV_IOMMU_DDTP_TIMEOUT); \
	ddtp; })

static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
{
	u64 ddtp;
	unsigned int mode;

	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/*
	 * It is optional for the hardware to report a fixed address for device
	 * directory root page when DDT.MODE is OFF or BARE.
	 */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
		/* Use WARL to discover hardware fixed DDT PPN */
		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
			return -EBUSY;

		iommu->ddt_phys = ppn_to_phys(ddtp);
		if (iommu->ddt_phys)
			iommu->ddt_root = devm_ioremap(iommu->dev,
						       iommu->ddt_phys, PAGE_SIZE);
		if (iommu->ddt_root)
			memset(iommu->ddt_root, 0, PAGE_SIZE);
	}

	if (!iommu->ddt_root) {
		iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K);
		iommu->ddt_phys = __pa(iommu->ddt_root);
	}

	if (!iommu->ddt_root)
		return -ENOMEM;

	return 0;
}

/*
 * Discover supported DDT modes starting from requested value,
 * configure DDTP register with accepted mode and root DDT address.
 * Accepted iommu->ddt_mode is updated on success.
 */
static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
				      unsigned int ddtp_mode)
{
	struct device *dev = iommu->dev;
	u64 ddtp, rq_ddtp;
	unsigned int mode, rq_mode = ddtp_mode;
	struct riscv_iommu_command cmd;

	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/* Disallow state transition from xLVL to xLVL. */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
		return -EINVAL;

	do {
		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
		if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);

		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
				rq_mode, ddtp);
			return -EBUSY;
		}

		/* Verify IOMMU hardware accepts new DDTP config. */
		mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);

		if (rq_mode == mode)
			break;

		/* Hardware mandatory DDTP mode has not been accepted. */
		if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
				ddtp, rq_ddtp);
			return -EINVAL;
		}

		/*
		 * Mode field is WARL, an IOMMU may support a subset of
		 * directory table levels in which case if we tried to set
		 * an unsupported number of levels we'll readback either
		 * a valid xLVL or off/bare. If we got off/bare, try again
		 * with a smaller xLVL.
		 */
		if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
		    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
			rq_mode--;
			continue;
		}

		/*
		 * We tried all supported modes and IOMMU hardware failed to
		 * accept new settings, something went very wrong since off/bare
		 * and at least one xLVL must be supported.
		 */
		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
			mode, ddtp_mode);
		return -EINVAL;
	} while (1);

	iommu->ddt_mode = mode;
	if (mode != ddtp_mode)
		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);

	/* Invalidate device context cache */
	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* Invalidate address translation cache */
	riscv_iommu_cmd_inval_vma(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* IOFENCE.C */
	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	return 0;
}

/* This struct contains protection domain specific IOMMU driver data. */
struct riscv_iommu_domain {
	struct iommu_domain domain;
	struct list_head bonds;
	spinlock_t lock;		/* protect bonds list updates. */
	int pscid;
	bool amo_enabled;
	int numa_node;
	unsigned int pgd_mode;
	unsigned long *pgd_root;
};

#define iommu_domain_to_riscv(iommu_domain) \
	container_of(iommu_domain, struct riscv_iommu_domain, domain)

/* Private IOMMU data for managed devices, dev_iommu_priv_* */
struct riscv_iommu_info {
	struct riscv_iommu_domain *domain;
};

/*
 * Linkage between an iommu_domain and attached devices.
 *
 * A protection domain requiring IOATC and DevATC translation cache
 * invalidations should be linked to its attached devices using a
 * riscv_iommu_bond structure.
 * Devices should be linked to the domain before first use and unlinked after
 * the translations from the referenced protection domain can no longer be used.
 * Blocking and identity domains are not tracked here, as the IOMMU hardware
 * does not cache negative and/or identity (BARE mode) translations, and DevATC
 * is disabled for those protection domains.
 *
 * The device pointer and IOMMU data remain stable in the bond struct after
 * _probe_device() where it's attached to the managed IOMMU, up to the
 * completion of the _release_device() call. The release of the bond structure
 * is synchronized with the device release.
 */
struct riscv_iommu_bond {
	struct list_head list;
	struct rcu_head rcu;
	struct device *dev;
};

static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
				 struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_bond *bond;
	struct list_head *bonds;

	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
	if (!bond)
		return -ENOMEM;
	bond->dev = dev;

	/*
	 * List of devices attached to the domain is arranged based on
	 * managed IOMMU device.
	 */

	spin_lock(&domain->lock);
	list_for_each(bonds, &domain->bonds)
		if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
			break;
	list_add_rcu(&bond->list, bonds);
	spin_unlock(&domain->lock);

	/* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
	smp_mb();

	return 0;
}

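/*
 * Note: the per-IOMMU grouping established above lets riscv_iommu_iotlb_inval()
 * skip duplicate invalidation requests by comparing each bond's IOMMU with the
 * previous one. Bonds are added and removed under domain->lock, while
 * invalidation walks the list under rcu_read_lock().
 */
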
static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
				    struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_bond *bond, *found = NULL;
	struct riscv_iommu_command cmd;
	int count = 0;

	if (!domain)
		return;

	spin_lock(&domain->lock);
	list_for_each_entry(bond, &domain->bonds, list) {
		if (found && count)
			break;
		else if (bond->dev == dev)
			found = bond;
		else if (dev_to_iommu(bond->dev) == iommu)
			count++;
	}
	if (found)
		list_del_rcu(&found->list);
	spin_unlock(&domain->lock);
	kfree_rcu(found, rcu);

	/*
	 * If this was the last bond between this domain and the IOMMU
	 * invalidate all cached entries for domain's PSCID.
	 */
	if (!count) {
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		riscv_iommu_cmd_send(iommu, &cmd);

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
	}
}

/*
 * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, when the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)

static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
				    unsigned long start, unsigned long end)
{
	struct riscv_iommu_bond *bond;
	struct riscv_iommu_device *iommu, *prev;
	struct riscv_iommu_command cmd;
	unsigned long len = end - start + 1;
	unsigned long iova;

	/*
	 * For each IOMMU linked with this protection domain (via bonds->dev),
	 * an IOTLB invalidation command will be submitted and executed.
	 *
	 * Possible race with domain attach flow is handled by sequencing
	 * bond creation - riscv_iommu_bond_link(), and device directory
	 * update - riscv_iommu_iodir_update().
	 *
	 * PTE Update / IOTLB Inval           Device attach & directory update
	 * --------------------------         --------------------------
	 * update page table entries          add dev to the bond list
	 * FENCE RW,RW                        FENCE RW,RW
	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
	 *   FENCE IOW,IOW                      FENCE IOW,IOW
	 *   IOTLB.INVAL                        IODIR.INVAL
	 *   IOFENCE.C
	 *
	 * If bond list is not updated with new device, directory context will
	 * be configured with already valid page table content. If an IOMMU is
	 * linked to the protection domain it will receive invalidation
	 * requests for updated page table entries.
	 */
	smp_mb();

	rcu_read_lock();

	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);

		/*
		 * IOTLB invalidation request can be safely omitted if already sent
		 * to the IOMMU for the same PSCID, and with domain->bonds list
		 * arranged based on the device's IOMMU, it's sufficient to check
		 * the last device the invalidation was sent to.
		 */
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
			for (iova = start; iova < end; iova += PAGE_SIZE) {
				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
				riscv_iommu_cmd_send(iommu, &cmd);
			}
		} else {
			riscv_iommu_cmd_send(iommu, &cmd);
		}
		prev = iommu;
	}

	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
		prev = iommu;
	}
	rcu_read_unlock();
}

#define RISCV_IOMMU_FSC_BARE 0

/*
 * Update IODIR for the device.
 *
 * During the execution of riscv_iommu_probe_device(), IODIR entries are
 * allocated for the device's identifiers. Device context invalidation
 * becomes necessary only if one of the updated entries was previously
 * marked as valid, given that invalid device context entries are not
 * cached by the IOMMU hardware.
 * In this implementation, updating a valid device context while the
 * device is not quiesced might be disruptive, potentially causing
 * interim translation faults.
 */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
				     struct device *dev, u64 fsc, u64 ta)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_dc *dc;
	struct riscv_iommu_command cmd;
	bool sync_required = false;
	u64 tc;
	int i;

	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		if (!(tc & RISCV_IOMMU_DC_TC_V))
			continue;

		WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);

		/* Invalidate device context cached values */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
		sync_required = true;
	}

	if (sync_required)
		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	/*
	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
	 */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		tc |= ta & RISCV_IOMMU_DC_TC_V;

		WRITE_ONCE(dc->fsc, fsc);
		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
		/* Update device context, write TC.V as the last step. */
		dma_wmb();
		WRITE_ONCE(dc->tc, tc);

		/* Invalidate device context after update */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
	}

	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
}

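/*
 * Summary of the update sequence above, as implemented: a previously valid
 * device context is first marked invalid and invalidated from the IOMMU
 * caches, then FSC/TA are rewritten, and TC.V is set last behind dma_wmb()
 * before the final IODIR.INVAL/IOFENCE.C, so the hardware never observes a
 * half-updated device context as valid.
 */
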
/*
 * IOVA page translation tree management.
 */

static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
}

static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
				   struct iommu_iotlb_gather *gather)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
}

#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))

#define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
#define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
#define _io_pte_none(pte)	((pte) == 0)
#define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))

static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
				 unsigned long pte,
				 struct iommu_pages_list *freelist)
{
	unsigned long *ptr;
	int i;

	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
		return;

	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));

	/* Recursively free all sub page table pages */
	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = READ_ONCE(ptr[i]);
		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
			riscv_iommu_pte_free(domain, pte, freelist);
	}

	if (freelist)
		iommu_pages_list_add(freelist, ptr);
	else
		iommu_free_pages(ptr);
}

static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
					    unsigned long iova, size_t pgsize,
					    gfp_t gfp)
{
	unsigned long *ptr = domain->pgd_root;
	unsigned long pte, old;
	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
	void *addr;

	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		/*
		 * Note: returned entry might be a non-leaf if there was
		 * existing mapping with smaller granularity. Up to the caller
		 * to replace and invalidate.
		 */
		if (((size_t)1 << shift) == pgsize)
			return ptr;
pte_retry:
		pte = READ_ONCE(*ptr);
		/*
		 * This is very likely incorrect as we should not be adding
		 * new mapping with smaller granularity on top
		 * of existing 2M/1G mapping. Fail.
		 */
		if (_io_pte_present(pte) && _io_pte_leaf(pte))
			return NULL;
		/*
		 * Non-leaf entry is missing, allocate and try to add to the
		 * page table. This might race with other mappings, retry.
		 */
		if (_io_pte_none(pte)) {
			addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp,
							 SZ_4K);
			if (!addr)
				return NULL;
			old = pte;
			pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
			if (cmpxchg_relaxed(ptr, old, pte) != old) {
				iommu_free_pages(addr);
				goto pte_retry;
			}
		}
		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
	} while (level-- > 0);

	return NULL;
}

static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
					    unsigned long iova, size_t *pte_pgsize)
{
	unsigned long *ptr = domain->pgd_root;
	unsigned long pte;
	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;

	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		pte = READ_ONCE(*ptr);
		if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
			*pte_pgsize = (size_t)1 << shift;
			return ptr;
		}
		if (_io_pte_none(pte))
			return NULL;
		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
	} while (level-- > 0);

	return NULL;
}

static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
				 unsigned long iova, phys_addr_t phys,
				 size_t pgsize, size_t pgcount, int prot,
				 gfp_t gfp, size_t *mapped)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	size_t size = 0;
	unsigned long *ptr;
	unsigned long pte, old, pte_prot;
	int rc = 0;
	struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);

	if (!(prot & IOMMU_WRITE))
		pte_prot = _PAGE_BASE | _PAGE_READ;
	else if (domain->amo_enabled)
		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
	else
		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;

	while (pgcount) {
		ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
		if (!ptr) {
			rc = -ENOMEM;
			break;
		}

		old = READ_ONCE(*ptr);
		pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
		if (cmpxchg_relaxed(ptr, old, pte) != old)
			continue;

		riscv_iommu_pte_free(domain, old, &freelist);

		size += pgsize;
		iova += pgsize;
		phys += pgsize;
		--pgcount;
	}

	*mapped = size;

	if (!iommu_pages_list_empty(&freelist)) {
		/*
		 * In 1.0 spec version, the smallest scope we can use to
		 * invalidate all levels of page table (i.e. leaf and non-leaf)
		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
		 * This will be updated with hardware support for
		 * capability.NL (non-leaf) IOTINVAL command.
		 */
		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
		iommu_put_pages_list(&freelist);
	}

	return rc;
}

static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
				      unsigned long iova, size_t pgsize,
				      size_t pgcount,
				      struct iommu_iotlb_gather *gather)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	size_t size = pgcount << __ffs(pgsize);
	unsigned long *ptr, old;
	size_t unmapped = 0;
	size_t pte_size;

	while (unmapped < size) {
		ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
		if (!ptr)
			return unmapped;

		/* partial unmap is not allowed, fail. */
		if (iova & (pte_size - 1))
			return unmapped;

		old = READ_ONCE(*ptr);
		if (cmpxchg_relaxed(ptr, old, 0) != old)
			continue;

		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
					    pte_size);

		iova += pte_size;
		unmapped += pte_size;
	}

	return unmapped;
}

static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
					    dma_addr_t iova)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	size_t pte_size;
	unsigned long *ptr;

	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
	if (!ptr)
		return 0;

	return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
}

static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	const unsigned long pfn = virt_to_pfn(domain->pgd_root);

	WARN_ON(!list_empty(&domain->bonds));

	if ((int)domain->pscid > 0)
		ida_free(&riscv_iommu_pscids, domain->pscid);

	riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
	kfree(domain);
}

static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
{
	switch (pgd_mode) {
	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;

	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;

	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
		return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
	}
	return false;
}

static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
					    struct device *dev)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
	u64 fsc, ta;

	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
		return -ENODEV;

	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
	     RISCV_IOMMU_PC_TA_V;

	if (riscv_iommu_bond_link(domain, dev))
		return -ENOMEM;

	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = domain;

	return 0;
}

static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
	.attach_dev = riscv_iommu_attach_paging_domain,
	.free = riscv_iommu_free_paging_domain,
	.map_pages = riscv_iommu_map_pages,
	.unmap_pages = riscv_iommu_unmap_pages,
	.iova_to_phys = riscv_iommu_iova_to_phys,
	.iotlb_sync = riscv_iommu_iotlb_sync,
	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
};

static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
{
	struct riscv_iommu_domain *domain;
	struct riscv_iommu_device *iommu;
	unsigned int pgd_mode;
	dma_addr_t va_mask;
	int va_bits;

	iommu = dev_to_iommu(dev);
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
		va_bits = 57;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
		va_bits = 48;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
		va_bits = 39;
	} else {
		dev_err(dev, "cannot find supported page table mode\n");
		return ERR_PTR(-ENODEV);
	}

	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD_RCU(&domain->bonds);
	spin_lock_init(&domain->lock);
	domain->numa_node = dev_to_node(iommu->dev);
	domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
	domain->pgd_mode = pgd_mode;
	domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node,
						     GFP_KERNEL_ACCOUNT, SZ_4K);
	if (!domain->pgd_root) {
		kfree(domain);
		return ERR_PTR(-ENOMEM);
	}

	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
					RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
	if (domain->pscid < 0) {
		iommu_free_pages(domain->pgd_root);
		kfree(domain);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * Note: RISC-V Privileged spec mandates that virtual addresses
	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
	 * bits >= VA_BITS need to also be set or else we'll get a
	 * page fault. However the code that creates the mappings
	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
	 * for now, so we'll end up with invalid virtual addresses
	 * to map. As a workaround until we get this sorted out
	 * limit the available virtual addresses to VA_BITS - 1.
	 */
	va_mask = DMA_BIT_MASK(va_bits - 1);

	domain->domain.geometry.aperture_start = 0;
	domain->domain.geometry.aperture_end = va_mask;
	domain->domain.geometry.force_aperture = true;
	domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);

	domain->domain.ops = &riscv_iommu_paging_domain_ops;

	return &domain->domain;
}

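/*
 * Illustration of the geometry set up above: with an Sv39 root the aperture is
 * limited to DMA_BIT_MASK(38), so the advertised page sizes reduce to
 * 4K/2M/1G, while SZ_512G remains available only for Sv48/Sv57 roots where it
 * fits below the aperture mask.
 */
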
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
					      struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	/* Make device context invalid, translation requests will fault w/ #258 */
	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

static struct iommu_domain riscv_iommu_blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_blocking_domain,
	}
};

static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
					      struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

static struct iommu_domain riscv_iommu_identity_domain = {
	.type = IOMMU_DOMAIN_IDENTITY,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_identity_domain,
	}
};

static struct iommu_group *riscv_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}

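/*
 * The firmware-provided IDs collected below via iommu_fwspec_add_ids() are the
 * same identifiers later used by riscv_iommu_probe_device() and
 * riscv_iommu_iodir_update() to index the device directory table.
 */
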
static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
{
	return iommu_fwspec_add_ids(dev, args->args, 1);
}

static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_device *iommu;
	struct riscv_iommu_info *info;
	struct riscv_iommu_dc *dc;
	u64 tc;
	int i;

	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
		return ERR_PTR(-ENODEV);

	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	/*
	 * IOMMU hardware operating in fail-over BARE mode will provide
	 * identity translation for all connected devices anyway...
	 */
	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);
	/*
	 * Allocate and pre-configure device context entries in
	 * the device directory. Do not mark the context valid yet.
	 */
	tc = 0;
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
		tc |= RISCV_IOMMU_DC_TC_SADE;
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		if (!dc) {
			kfree(info);
			return ERR_PTR(-ENODEV);
		}
		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
			dev_warn(dev, "already attached to IOMMU device directory\n");
		WRITE_ONCE(dc->tc, tc);
	}

	dev_iommu_priv_set(dev, info);

	return &iommu->iommu;
}

static void riscv_iommu_release_device(struct device *dev)
{
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	kfree_rcu_mightsleep(info);
}

static const struct iommu_ops riscv_iommu_ops = {
	.of_xlate = riscv_iommu_of_xlate,
	.identity_domain = &riscv_iommu_identity_domain,
	.blocked_domain = &riscv_iommu_blocking_domain,
	.release_domain = &riscv_iommu_blocking_domain,
	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
	.device_group = riscv_iommu_device_group,
	.probe_device = riscv_iommu_probe_device,
	.release_device = riscv_iommu_release_device,
};

static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
{
	u64 ddtp;

	/*
	 * Make sure the IOMMU is switched off or in pass-through mode during
	 * regular boot flow and disable translation when we boot into a kexec
	 * kernel and the previous kernel left them enabled.
	 */
	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
	    RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
		if (!is_kdump_kernel())
			return -EBUSY;
		riscv_iommu_disable(iommu);
	}

	/* Configure accesses to in-memory data structures for CPU-native byte order. */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
			return -EINVAL;
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
			return -EINVAL;
	}

	/*
	 * Distribute interrupt vectors, always use first vector for CIV.
	 * At least one interrupt is required. Read back and verify.
	 */
	if (!iommu->irqs_count)
		return -EINVAL;

	iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
	iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
	if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
		max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
		return -EINVAL;

	return 0;
}

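/*
 * Example of the vector distribution above (illustrative): with two interrupt
 * vectors, CIV stays on vector 0 and FIV/PIV/PMIV map to 1 % 2, 2 % 2 and
 * 3 % 2, i.e. vectors 1, 0 and 1, so command and page-request notifications
 * share vector 0 while fault and performance-monitoring notifications share
 * vector 1.
 */
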
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
{
	iommu_device_unregister(&iommu->iommu);
	iommu_device_sysfs_remove(&iommu->iommu);
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
	riscv_iommu_queue_disable(&iommu->cmdq);
	riscv_iommu_queue_disable(&iommu->fltq);
}

int riscv_iommu_init(struct riscv_iommu_device *iommu)
{
	int rc;

	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);

	rc = riscv_iommu_init_check(iommu);
	if (rc)
		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");

	rc = riscv_iommu_iodir_alloc(iommu);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
				     sizeof(struct riscv_iommu_command));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
				     sizeof(struct riscv_iommu_fq_record));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
	if (rc)
		goto err_queue_disable;

	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
	if (rc)
		goto err_queue_disable;

	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
				    dev_name(iommu->dev));
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
		goto err_iodir_off;
	}

	if (!acpi_disabled) {
		rc = rimt_iommu_register(iommu->dev);
		if (rc) {
			dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n");
			goto err_remove_sysfs;
		}
	}

	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
		goto err_remove_sysfs;
	}

	return 0;

err_remove_sysfs:
	iommu_device_sysfs_remove(&iommu->iommu);
err_iodir_off:
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
err_queue_disable:
	riscv_iommu_queue_disable(&iommu->fltq);
	riscv_iommu_queue_disable(&iommu->cmdq);
	return rc;
}