/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Portions Copyright (c) 2010, Oracle and/or its affiliates.
 * All rights reserved.
 */
/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */

/*
 * DVMA code
 * This file contains Intel IOMMU code that deals with DVMA
 * i.e. DMA remapping.
38 */ 39 40 #include <sys/sysmacros.h> 41 #include <sys/pcie.h> 42 #include <sys/pci_cfgspace.h> 43 #include <vm/hat_i86.h> 44 #include <sys/memlist.h> 45 #include <sys/acpi/acpi.h> 46 #include <sys/acpica.h> 47 #include <sys/modhash.h> 48 #include <sys/immu.h> 49 #include <sys/x86_archext.h> 50 #include <sys/archsystm.h> 51 52 #undef TEST 53 54 /* 55 * Macros based on PCI spec 56 */ 57 #define IMMU_PCI_REV2CLASS(r) ((r) >> 8) /* classcode from revid */ 58 #define IMMU_PCI_CLASS2BASE(c) ((c) >> 16) /* baseclass from classcode */ 59 #define IMMU_PCI_CLASS2SUB(c) (((c) >> 8) & 0xff); /* classcode */ 60 61 #define IMMU_CONTIG_PADDR(d, p) \ 62 ((d).dck_paddr && ((d).dck_paddr + (d).dck_npages * IMMU_PAGESIZE) \ 63 == (p)) 64 65 typedef struct dvma_arg { 66 immu_t *dva_immu; 67 dev_info_t *dva_rdip; 68 dev_info_t *dva_ddip; 69 domain_t *dva_domain; 70 int dva_level; 71 immu_flags_t dva_flags; 72 list_t *dva_list; 73 int dva_error; 74 } dvma_arg_t; 75 76 static domain_t *domain_create(immu_t *immu, dev_info_t *ddip, 77 dev_info_t *rdip, immu_flags_t immu_flags); 78 static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus, 79 int dev, int func, immu_flags_t immu_flags); 80 static void destroy_immu_devi(immu_devi_t *immu_devi); 81 static boolean_t dvma_map(domain_t *domain, uint64_t sdvma, 82 uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, 83 immu_flags_t immu_flags); 84 85 /* Extern globals */ 86 extern struct memlist *phys_install; 87 88 /* 89 * iommulib interface functions. 
90 */ 91 static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip); 92 static int immu_allochdl(iommulib_handle_t handle, 93 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, 94 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep); 95 static int immu_freehdl(iommulib_handle_t handle, 96 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); 97 static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, 98 dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req, 99 ddi_dma_cookie_t *cookiep, uint_t *ccountp); 100 static int immu_unbindhdl(iommulib_handle_t handle, 101 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); 102 static int immu_sync(iommulib_handle_t handle, dev_info_t *dip, 103 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len, 104 uint_t cachefl); 105 static int immu_win(iommulib_handle_t handle, dev_info_t *dip, 106 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, 107 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp); 108 static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, 109 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 110 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao); 111 static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, 112 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao); 113 114 /* static Globals */ 115 116 /* 117 * Used to setup DMA objects (memory regions) 118 * for DMA reads by IOMMU units 119 */ 120 static ddi_dma_attr_t immu_dma_attr = { 121 DMA_ATTR_V0, 122 0U, 123 0xffffffffffffffffULL, 124 0xffffffffU, 125 MMU_PAGESIZE, /* MMU page aligned */ 126 0x1, 127 0x1, 128 0xffffffffU, 129 0xffffffffffffffffULL, 130 1, 131 4, 132 0 133 }; 134 135 static ddi_device_acc_attr_t immu_acc_attr = { 136 DDI_DEVICE_ATTR_V0, 137 DDI_NEVERSWAP_ACC, 138 DDI_STRICTORDER_ACC 139 }; 140 141 struct iommulib_ops immulib_ops = { 142 IOMMU_OPS_VERSION, 143 INTEL_IOMMU, 144 "Intel 
IOMMU", 145 NULL, 146 immu_probe, 147 immu_allochdl, 148 immu_freehdl, 149 immu_bindhdl, 150 immu_unbindhdl, 151 immu_sync, 152 immu_win, 153 immu_mapobject, 154 immu_unmapobject, 155 }; 156 157 /* 158 * Fake physical address range used to set up initial prealloc mappings. 159 * This memory is never actually accessed. It is mapped read-only, 160 * and is overwritten as soon as the first DMA bind operation is 161 * performed. Since 0 is a special case, just start at the 2nd 162 * physical page. 163 */ 164 165 static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES }; 166 167 /* globals private to this file */ 168 static kmutex_t immu_domain_lock; 169 static list_t immu_unity_domain_list; 170 static list_t immu_xlate_domain_list; 171 172 /* structure used to store idx into each level of the page tables */ 173 typedef struct xlate { 174 int xlt_level; 175 uint_t xlt_idx; 176 pgtable_t *xlt_pgtable; 177 } xlate_t; 178 179 /* 0 is reserved by Vt-d spec. Solaris reserves 1 */ 180 #define IMMU_UNITY_DID 1 181 182 static mod_hash_t *bdf_domain_hash; 183 184 int immu_use_alh; 185 int immu_use_tm; 186 187 static domain_t * 188 bdf_domain_lookup(immu_devi_t *immu_devi) 189 { 190 domain_t *domain; 191 int16_t seg = immu_devi->imd_seg; 192 int16_t bus = immu_devi->imd_bus; 193 int16_t devfunc = immu_devi->imd_devfunc; 194 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc); 195 196 if (seg < 0 || bus < 0 || devfunc < 0) { 197 return (NULL); 198 } 199 200 domain = NULL; 201 if (mod_hash_find(bdf_domain_hash, 202 (void *)bdf, (void *)&domain) == 0) { 203 ASSERT(domain); 204 ASSERT(domain->dom_did > 0); 205 return (domain); 206 } else { 207 return (NULL); 208 } 209 } 210 211 static void 212 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain) 213 { 214 int16_t seg = immu_devi->imd_seg; 215 int16_t bus = immu_devi->imd_bus; 216 int16_t devfunc = immu_devi->imd_devfunc; 217 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc); 218 219 if (seg < 0 || bus < 0 || devfunc < 
0) { 220 return; 221 } 222 223 (void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain); 224 } 225 226 static int 227 match_lpc(dev_info_t *pdip, void *arg) 228 { 229 immu_devi_t *immu_devi; 230 dvma_arg_t *dvap = (dvma_arg_t *)arg; 231 232 if (list_is_empty(dvap->dva_list)) { 233 return (DDI_WALK_TERMINATE); 234 } 235 236 immu_devi = list_head(dvap->dva_list); 237 for (; immu_devi; immu_devi = list_next(dvap->dva_list, 238 immu_devi)) { 239 if (immu_devi->imd_dip == pdip) { 240 dvap->dva_ddip = pdip; 241 dvap->dva_error = DDI_SUCCESS; 242 return (DDI_WALK_TERMINATE); 243 } 244 } 245 246 return (DDI_WALK_CONTINUE); 247 } 248 249 static void 250 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu) 251 { 252 list_t *spclist = NULL; 253 immu_devi_t *immu_devi; 254 255 immu_devi = IMMU_DEVI(dip); 256 if (immu_devi->imd_display == B_TRUE) { 257 spclist = &(immu->immu_dvma_gfx_list); 258 } else if (immu_devi->imd_lpc == B_TRUE) { 259 spclist = &(immu->immu_dvma_lpc_list); 260 } 261 262 if (spclist) { 263 mutex_enter(&(immu->immu_lock)); 264 list_insert_head(spclist, immu_devi); 265 mutex_exit(&(immu->immu_lock)); 266 } 267 } 268 269 /* 270 * Set the immu_devi struct in the immu_devi field of a devinfo node 271 */ 272 int 273 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags) 274 { 275 int bus, dev, func; 276 immu_devi_t *new_imd; 277 immu_devi_t *immu_devi; 278 279 immu_devi = immu_devi_get(dip); 280 if (immu_devi != NULL) { 281 return (DDI_SUCCESS); 282 } 283 284 bus = dev = func = -1; 285 286 /* 287 * Assume a new immu_devi struct is needed 288 */ 289 if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) { 290 /* 291 * No BDF. Set bus = -1 to indicate this. 
292 * We still need to create a immu_devi struct 293 * though 294 */ 295 bus = -1; 296 dev = 0; 297 func = 0; 298 } 299 300 new_imd = create_immu_devi(dip, bus, dev, func, immu_flags); 301 if (new_imd == NULL) { 302 ddi_err(DER_WARN, dip, "Failed to create immu_devi " 303 "structure"); 304 return (DDI_FAILURE); 305 } 306 307 /* 308 * Check if some other thread allocated a immu_devi while we 309 * didn't own the lock. 310 */ 311 mutex_enter(&(DEVI(dip)->devi_lock)); 312 if (IMMU_DEVI(dip) == NULL) { 313 IMMU_DEVI_SET(dip, new_imd); 314 } else { 315 destroy_immu_devi(new_imd); 316 } 317 mutex_exit(&(DEVI(dip)->devi_lock)); 318 319 return (DDI_SUCCESS); 320 } 321 322 static dev_info_t * 323 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags) 324 { 325 dvma_arg_t dvarg = {0}; 326 dvarg.dva_list = &(immu->immu_dvma_lpc_list); 327 dvarg.dva_rdip = rdip; 328 dvarg.dva_error = DDI_FAILURE; 329 330 if (immu_walk_ancestor(rdip, NULL, match_lpc, 331 &dvarg, NULL, immu_flags) != DDI_SUCCESS) { 332 ddi_err(DER_MODE, rdip, "Could not walk ancestors to " 333 "find lpc_devinfo for ISA device"); 334 return (NULL); 335 } 336 337 if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) { 338 ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for " 339 "ISA device"); 340 return (NULL); 341 } 342 343 return (dvarg.dva_ddip); 344 } 345 346 static dev_info_t * 347 get_gfx_devinfo(dev_info_t *rdip) 348 { 349 immu_t *immu; 350 immu_devi_t *immu_devi; 351 list_t *list_gfx; 352 353 /* 354 * The GFX device may not be on the same iommu unit as "agpgart" 355 * so search globally 356 */ 357 immu_devi = NULL; 358 immu = list_head(&immu_list); 359 for (; immu; immu = list_next(&immu_list, immu)) { 360 list_gfx = &(immu->immu_dvma_gfx_list); 361 if (!list_is_empty(list_gfx)) { 362 immu_devi = list_head(list_gfx); 363 break; 364 } 365 } 366 367 if (immu_devi == NULL) { 368 ddi_err(DER_WARN, rdip, "iommu: No GFX device. 
" 369 "Cannot redirect agpgart"); 370 return (NULL); 371 } 372 373 ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s", 374 ddi_node_name(immu_devi->imd_dip)); 375 376 return (immu_devi->imd_dip); 377 } 378 379 static immu_flags_t 380 dma_to_immu_flags(struct ddi_dma_req *dmareq) 381 { 382 immu_flags_t flags = 0; 383 384 if (dmareq->dmar_fp == DDI_DMA_SLEEP) { 385 flags |= IMMU_FLAGS_SLEEP; 386 } else { 387 flags |= IMMU_FLAGS_NOSLEEP; 388 } 389 390 #ifdef BUGGY_DRIVERS 391 392 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 393 394 #else 395 /* 396 * Read and write flags need to be reversed. 397 * DMA_READ means read from device and write 398 * to memory. So DMA read means DVMA write. 399 */ 400 if (dmareq->dmar_flags & DDI_DMA_READ) 401 flags |= IMMU_FLAGS_WRITE; 402 403 if (dmareq->dmar_flags & DDI_DMA_WRITE) 404 flags |= IMMU_FLAGS_READ; 405 406 /* 407 * Some buggy drivers specify neither READ or WRITE 408 * For such drivers set both read and write permissions 409 */ 410 if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) { 411 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 412 } 413 #endif 414 415 return (flags); 416 } 417 418 /*ARGSUSED*/ 419 int 420 pgtable_ctor(void *buf, void *arg, int kmflag) 421 { 422 size_t actual_size = 0; 423 pgtable_t *pgtable; 424 int (*dmafp)(caddr_t); 425 caddr_t vaddr; 426 void *next; 427 uint_t flags; 428 immu_t *immu = arg; 429 430 pgtable = (pgtable_t *)buf; 431 432 dmafp = (kmflag & KM_NOSLEEP) ? 
DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; 433 434 next = kmem_zalloc(IMMU_PAGESIZE, kmflag); 435 if (next == NULL) { 436 return (-1); 437 } 438 439 if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr, 440 dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) { 441 kmem_free(next, IMMU_PAGESIZE); 442 return (-1); 443 } 444 445 flags = DDI_DMA_CONSISTENT; 446 if (!immu->immu_dvma_coherent) 447 flags |= IOMEM_DATA_UC_WR_COMBINE; 448 449 if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE, 450 &immu_acc_attr, flags, 451 dmafp, NULL, &vaddr, &actual_size, 452 &pgtable->hwpg_memhdl) != DDI_SUCCESS) { 453 ddi_dma_free_handle(&pgtable->hwpg_dmahdl); 454 kmem_free(next, IMMU_PAGESIZE); 455 return (-1); 456 } 457 458 /* 459 * Memory allocation failure. Maybe a temporary condition 460 * so return error rather than panic, so we can try again 461 */ 462 if (actual_size < IMMU_PAGESIZE) { 463 ddi_dma_mem_free(&pgtable->hwpg_memhdl); 464 ddi_dma_free_handle(&pgtable->hwpg_dmahdl); 465 kmem_free(next, IMMU_PAGESIZE); 466 return (-1); 467 } 468 469 pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr)); 470 pgtable->hwpg_vaddr = vaddr; 471 pgtable->swpg_next_array = next; 472 473 rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL); 474 475 return (0); 476 } 477 478 /*ARGSUSED*/ 479 void 480 pgtable_dtor(void *buf, void *arg) 481 { 482 pgtable_t *pgtable; 483 484 pgtable = (pgtable_t *)buf; 485 486 /* destroy will panic if lock is held. */ 487 rw_destroy(&(pgtable->swpg_rwlock)); 488 489 ddi_dma_mem_free(&pgtable->hwpg_memhdl); 490 ddi_dma_free_handle(&pgtable->hwpg_dmahdl); 491 kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE); 492 } 493 494 /* 495 * pgtable_alloc() 496 * alloc a IOMMU pgtable structure. 497 * This same struct is used for root and context tables as well. 
498 * This routine allocs the f/ollowing: 499 * - a pgtable_t struct 500 * - a HW page which holds PTEs/entries which is accesssed by HW 501 * so we set up DMA for this page 502 * - a SW page which is only for our bookeeping 503 * (for example to hold pointers to the next level pgtable). 504 * So a simple kmem_alloc suffices 505 */ 506 static pgtable_t * 507 pgtable_alloc(immu_t *immu, immu_flags_t immu_flags) 508 { 509 pgtable_t *pgtable; 510 int kmflags; 511 512 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 513 514 pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags); 515 if (pgtable == NULL) { 516 return (NULL); 517 } 518 return (pgtable); 519 } 520 521 static void 522 pgtable_zero(pgtable_t *pgtable) 523 { 524 bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE); 525 bzero(pgtable->swpg_next_array, IMMU_PAGESIZE); 526 } 527 528 static void 529 pgtable_free(immu_t *immu, pgtable_t *pgtable) 530 { 531 kmem_cache_free(immu->immu_pgtable_cache, pgtable); 532 } 533 534 /* 535 * Function to identify a display device from the PCI class code 536 */ 537 static boolean_t 538 device_is_display(uint_t classcode) 539 { 540 static uint_t disp_classes[] = { 541 0x000100, 542 0x030000, 543 0x030001 544 }; 545 int i, nclasses = sizeof (disp_classes) / sizeof (uint_t); 546 547 for (i = 0; i < nclasses; i++) { 548 if (classcode == disp_classes[i]) 549 return (B_TRUE); 550 } 551 return (B_FALSE); 552 } 553 554 /* 555 * Function that determines if device is PCIEX and/or PCIEX bridge 556 */ 557 static boolean_t 558 device_is_pciex( 559 uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib) 560 { 561 ushort_t cap; 562 ushort_t capsp; 563 ushort_t cap_count = PCI_CAP_MAX_PTR; 564 ushort_t status; 565 boolean_t is_pciex = B_FALSE; 566 567 *is_pcib = B_FALSE; 568 569 status = pci_getw_func(bus, dev, func, PCI_CONF_STAT); 570 if (!(status & PCI_STAT_CAP)) 571 return (B_FALSE); 572 573 capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR); 574 while 
(cap_count-- && capsp >= PCI_CAP_PTR_OFF) { 575 capsp &= PCI_CAP_PTR_MASK; 576 cap = pci_getb_func(bus, dev, func, capsp); 577 578 if (cap == PCI_CAP_ID_PCI_E) { 579 status = pci_getw_func(bus, dev, func, capsp + 2); 580 /* 581 * See section 7.8.2 of PCI-Express Base Spec v1.0a 582 * for Device/Port Type. 583 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the 584 * device is a PCIE2PCI bridge 585 */ 586 *is_pcib = 587 ((status & PCIE_PCIECAP_DEV_TYPE_MASK) == 588 PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE; 589 is_pciex = B_TRUE; 590 } 591 592 capsp = (*pci_getb_func)(bus, dev, func, 593 capsp + PCI_CAP_NEXT_PTR); 594 } 595 596 return (is_pciex); 597 } 598 599 static boolean_t 600 device_use_premap(uint_t classcode) 601 { 602 if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET) 603 return (B_TRUE); 604 return (B_FALSE); 605 } 606 607 608 /* 609 * immu_dvma_get_immu() 610 * get the immu unit structure for a dev_info node 611 */ 612 immu_t * 613 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags) 614 { 615 immu_devi_t *immu_devi; 616 immu_t *immu; 617 618 /* 619 * check if immu unit was already found earlier. 620 * If yes, then it will be stashed in immu_devi struct. 621 */ 622 immu_devi = immu_devi_get(dip); 623 if (immu_devi == NULL) { 624 if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) { 625 /* 626 * May fail because of low memory. 
Return error rather 627 * than panic as we want driver to rey again later 628 */ 629 ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: " 630 "No immu_devi structure"); 631 /*NOTREACHED*/ 632 } 633 immu_devi = immu_devi_get(dip); 634 } 635 636 mutex_enter(&(DEVI(dip)->devi_lock)); 637 if (immu_devi->imd_immu) { 638 immu = immu_devi->imd_immu; 639 mutex_exit(&(DEVI(dip)->devi_lock)); 640 return (immu); 641 } 642 mutex_exit(&(DEVI(dip)->devi_lock)); 643 644 immu = immu_dmar_get_immu(dip); 645 if (immu == NULL) { 646 ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: " 647 "Cannot find immu_t for device"); 648 /*NOTREACHED*/ 649 } 650 651 /* 652 * Check if some other thread found immu 653 * while lock was not held 654 */ 655 immu_devi = immu_devi_get(dip); 656 /* immu_devi should be present as we found it earlier */ 657 if (immu_devi == NULL) { 658 ddi_err(DER_PANIC, dip, 659 "immu_dvma_get_immu: No immu_devi structure"); 660 /*NOTREACHED*/ 661 } 662 663 mutex_enter(&(DEVI(dip)->devi_lock)); 664 if (immu_devi->imd_immu == NULL) { 665 /* nobody else set it, so we should do it */ 666 immu_devi->imd_immu = immu; 667 immu_devi_set_spclist(dip, immu); 668 } else { 669 /* 670 * if some other thread got immu before 671 * us, it should get the same results 672 */ 673 if (immu_devi->imd_immu != immu) { 674 ddi_err(DER_PANIC, dip, "Multiple " 675 "immu units found for device. 
Expected (%p), " 676 "actual (%p)", (void *)immu, 677 (void *)immu_devi->imd_immu); 678 mutex_exit(&(DEVI(dip)->devi_lock)); 679 /*NOTREACHED*/ 680 } 681 } 682 mutex_exit(&(DEVI(dip)->devi_lock)); 683 684 return (immu); 685 } 686 687 688 /* ############################# IMMU_DEVI code ############################ */ 689 690 /* 691 * Allocate a immu_devi structure and initialize it 692 */ 693 static immu_devi_t * 694 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func, 695 immu_flags_t immu_flags) 696 { 697 uchar_t baseclass, subclass; 698 uint_t classcode, revclass; 699 immu_devi_t *immu_devi; 700 boolean_t pciex = B_FALSE; 701 int kmflags; 702 boolean_t is_pcib = B_FALSE; 703 704 /* bus == -1 indicate non-PCI device (no BDF) */ 705 ASSERT(bus == -1 || bus >= 0); 706 ASSERT(dev >= 0); 707 ASSERT(func >= 0); 708 709 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 710 immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags); 711 if (immu_devi == NULL) { 712 ddi_err(DER_WARN, rdip, "Failed to allocate memory for " 713 "Intel IOMMU immu_devi structure"); 714 return (NULL); 715 } 716 immu_devi->imd_dip = rdip; 717 immu_devi->imd_seg = 0; /* Currently seg can only be 0 */ 718 immu_devi->imd_bus = bus; 719 immu_devi->imd_pcib_type = IMMU_PCIB_BAD; 720 721 if (bus == -1) { 722 immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF; 723 return (immu_devi); 724 } 725 726 immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func); 727 immu_devi->imd_sec = 0; 728 immu_devi->imd_sub = 0; 729 730 revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID); 731 732 classcode = IMMU_PCI_REV2CLASS(revclass); 733 baseclass = IMMU_PCI_CLASS2BASE(classcode); 734 subclass = IMMU_PCI_CLASS2SUB(classcode); 735 736 if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) { 737 738 immu_devi->imd_sec = pci_getb_func(bus, dev, func, 739 PCI_BCNF_SECBUS); 740 immu_devi->imd_sub = pci_getb_func(bus, dev, func, 741 PCI_BCNF_SUBBUS); 742 743 pciex = device_is_pciex(bus, dev, 
func, &is_pcib); 744 if (pciex == B_TRUE && is_pcib == B_TRUE) { 745 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI; 746 } else if (pciex == B_TRUE) { 747 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE; 748 } else { 749 immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI; 750 } 751 } else { 752 immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT; 753 } 754 755 /* check for certain special devices */ 756 immu_devi->imd_display = device_is_display(classcode); 757 immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) && 758 (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE; 759 immu_devi->imd_use_premap = device_use_premap(classcode); 760 761 immu_devi->imd_domain = NULL; 762 763 immu_devi->imd_dvma_flags = immu_global_dvma_flags; 764 765 return (immu_devi); 766 } 767 768 static void 769 destroy_immu_devi(immu_devi_t *immu_devi) 770 { 771 kmem_free(immu_devi, sizeof (immu_devi_t)); 772 } 773 774 static domain_t * 775 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp) 776 { 777 immu_devi_t *immu_devi; 778 domain_t *domain; 779 dev_info_t *ddip; 780 781 *ddipp = NULL; 782 783 immu_devi = immu_devi_get(rdip); 784 if (immu_devi == NULL) { 785 return (NULL); 786 } 787 788 mutex_enter(&(DEVI(rdip)->devi_lock)); 789 domain = immu_devi->imd_domain; 790 ddip = immu_devi->imd_ddip; 791 mutex_exit(&(DEVI(rdip)->devi_lock)); 792 793 if (domain) 794 *ddipp = ddip; 795 796 return (domain); 797 798 } 799 800 /* ############################# END IMMU_DEVI code ######################## */ 801 /* ############################# DOMAIN code ############################### */ 802 803 /* 804 * This routine always succeeds 805 */ 806 static int 807 did_alloc(immu_t *immu, dev_info_t *rdip, 808 dev_info_t *ddip, immu_flags_t immu_flags) 809 { 810 int did; 811 812 did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1, 813 (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP); 814 815 if (did == 0) { 816 ddi_err(DER_WARN, rdip, "device domain-id alloc error" 817 " domain-device: %s%d. immu unit is %s. 
Using " 818 "unity domain with domain-id (%d)", 819 ddi_driver_name(ddip), ddi_get_instance(ddip), 820 immu->immu_name, immu->immu_unity_domain->dom_did); 821 did = immu->immu_unity_domain->dom_did; 822 } 823 824 return (did); 825 } 826 827 static int 828 get_branch_domain(dev_info_t *pdip, void *arg) 829 { 830 immu_devi_t *immu_devi; 831 domain_t *domain; 832 dev_info_t *ddip; 833 immu_t *immu; 834 dvma_arg_t *dvp = (dvma_arg_t *)arg; 835 836 /* 837 * The field dvp->dva_rdip is a work-in-progress 838 * and gets updated as we walk up the ancestor 839 * tree. The final ddip is set only when we reach 840 * the top of the tree. So the dvp->dva_ddip field cannot 841 * be relied on until we reach the top of the field. 842 */ 843 844 /* immu_devi may not be set. */ 845 immu_devi = immu_devi_get(pdip); 846 if (immu_devi == NULL) { 847 if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) { 848 dvp->dva_error = DDI_FAILURE; 849 return (DDI_WALK_TERMINATE); 850 } 851 } 852 853 immu_devi = immu_devi_get(pdip); 854 immu = immu_devi->imd_immu; 855 if (immu == NULL) 856 immu = immu_dvma_get_immu(pdip, dvp->dva_flags); 857 858 /* 859 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to 860 * terminate the walk (since the device under the PCIE bridge 861 * is a PCIE device and has an independent entry in the 862 * root/context table) 863 */ 864 if (dvp->dva_rdip != pdip && 865 immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) { 866 return (DDI_WALK_TERMINATE); 867 } 868 869 /* 870 * In order to be a domain-dim, it must be a PCI device i.e. 871 * must have valid BDF. This also eliminates the root complex. 
872 */ 873 if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD && 874 immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) { 875 ASSERT(immu_devi->imd_bus >= 0); 876 ASSERT(immu_devi->imd_devfunc >= 0); 877 dvp->dva_ddip = pdip; 878 } 879 880 if (immu_devi->imd_display == B_TRUE || 881 (dvp->dva_flags & IMMU_FLAGS_UNITY)) { 882 dvp->dva_domain = immu->immu_unity_domain; 883 /* continue walking to find ddip */ 884 return (DDI_WALK_CONTINUE); 885 } 886 887 mutex_enter(&(DEVI(pdip)->devi_lock)); 888 domain = immu_devi->imd_domain; 889 ddip = immu_devi->imd_ddip; 890 mutex_exit(&(DEVI(pdip)->devi_lock)); 891 892 if (domain && ddip) { 893 /* if domain is set, it must be the same */ 894 if (dvp->dva_domain) { 895 ASSERT(domain == dvp->dva_domain); 896 } 897 dvp->dva_domain = domain; 898 dvp->dva_ddip = ddip; 899 return (DDI_WALK_TERMINATE); 900 } 901 902 /* Domain may already be set, continue walking so that ddip gets set */ 903 if (dvp->dva_domain) { 904 return (DDI_WALK_CONTINUE); 905 } 906 907 /* domain is not set in either immu_devi or dvp */ 908 domain = bdf_domain_lookup(immu_devi); 909 if (domain == NULL) { 910 return (DDI_WALK_CONTINUE); 911 } 912 913 /* ok, the BDF hash had a domain for this BDF. */ 914 915 /* Grab lock again to check if something else set immu_devi fields */ 916 mutex_enter(&(DEVI(pdip)->devi_lock)); 917 if (immu_devi->imd_domain != NULL) { 918 dvp->dva_domain = domain; 919 } else { 920 dvp->dva_domain = domain; 921 } 922 mutex_exit(&(DEVI(pdip)->devi_lock)); 923 924 /* 925 * walk upwards until the topmost PCI bridge is found 926 */ 927 return (DDI_WALK_CONTINUE); 928 929 } 930 931 static void 932 map_unity_domain(domain_t *domain) 933 { 934 struct memlist *mp; 935 uint64_t start; 936 uint64_t npages; 937 immu_dcookie_t dcookies[1] = {0}; 938 int dcount = 0; 939 940 /* 941 * UNITY arenas are a mirror of the physical memory 942 * installed on the system. 943 */ 944 945 #ifdef BUGGY_DRIVERS 946 /* 947 * Dont skip page0. Some broken HW/FW access it. 
948 */ 949 dcookies[0].dck_paddr = 0; 950 dcookies[0].dck_npages = 1; 951 dcount = 1; 952 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, 953 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); 954 #endif 955 956 memlist_read_lock(); 957 958 mp = phys_install; 959 960 if (mp->ml_address == 0) { 961 /* since we already mapped page1 above */ 962 start = IMMU_PAGESIZE; 963 } else { 964 start = mp->ml_address; 965 } 966 npages = mp->ml_size/IMMU_PAGESIZE + 1; 967 968 dcookies[0].dck_paddr = start; 969 dcookies[0].dck_npages = npages; 970 dcount = 1; 971 (void) dvma_map(domain, start, npages, dcookies, 972 dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 973 974 ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64 975 " - 0x%" PRIx64 "]", start, start + mp->ml_size); 976 977 mp = mp->ml_next; 978 while (mp) { 979 ddi_err(DER_LOG, domain->dom_dip, 980 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", 981 mp->ml_address, mp->ml_address + mp->ml_size); 982 983 start = mp->ml_address; 984 npages = mp->ml_size/IMMU_PAGESIZE + 1; 985 986 dcookies[0].dck_paddr = start; 987 dcookies[0].dck_npages = npages; 988 dcount = 1; 989 (void) dvma_map(domain, start, npages, 990 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 991 mp = mp->ml_next; 992 } 993 994 mp = bios_rsvd; 995 while (mp) { 996 ddi_err(DER_LOG, domain->dom_dip, 997 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", 998 mp->ml_address, mp->ml_address + mp->ml_size); 999 1000 start = mp->ml_address; 1001 npages = mp->ml_size/IMMU_PAGESIZE + 1; 1002 1003 dcookies[0].dck_paddr = start; 1004 dcookies[0].dck_npages = npages; 1005 dcount = 1; 1006 (void) dvma_map(domain, start, npages, 1007 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 1008 1009 mp = mp->ml_next; 1010 } 1011 1012 memlist_read_unlock(); 1013 } 1014 1015 /* 1016 * create_xlate_arena() 1017 * Create the dvma arena for a domain with translation 1018 * mapping 1019 */ 1020 static void 
1021 create_xlate_arena(immu_t *immu, domain_t *domain, 1022 dev_info_t *rdip, immu_flags_t immu_flags) 1023 { 1024 char *arena_name; 1025 struct memlist *mp; 1026 int vmem_flags; 1027 uint64_t start; 1028 uint_t mgaw; 1029 uint64_t size; 1030 uint64_t maxaddr; 1031 void *vmem_ret; 1032 1033 arena_name = domain->dom_dvma_arena_name; 1034 1035 /* Note, don't do sizeof (arena_name) - it is just a pointer */ 1036 (void) snprintf(arena_name, 1037 sizeof (domain->dom_dvma_arena_name), 1038 "%s-domain-%d-xlate-DVMA-arena", immu->immu_name, 1039 domain->dom_did); 1040 1041 vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP; 1042 1043 /* Restrict mgaddr (max guest addr) to MGAW */ 1044 mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap); 1045 1046 /* 1047 * To ensure we avoid ioapic and PCI MMIO ranges we just 1048 * use the physical memory address range of the system as the 1049 * range 1050 */ 1051 maxaddr = ((uint64_t)1 << mgaw); 1052 1053 memlist_read_lock(); 1054 1055 mp = phys_install; 1056 1057 if (mp->ml_address == 0) 1058 start = MMU_PAGESIZE; 1059 else 1060 start = mp->ml_address; 1061 1062 if (start + mp->ml_size > maxaddr) 1063 size = maxaddr - start; 1064 else 1065 size = mp->ml_size; 1066 1067 ddi_err(DER_VERB, rdip, 1068 "iommu: %s: Creating dvma vmem arena [0x%" PRIx64 1069 " - 0x%" PRIx64 "]", arena_name, start, start + size); 1070 1071 /* 1072 * We always allocate in quanta of IMMU_PAGESIZE 1073 */ 1074 domain->dom_dvma_arena = vmem_create(arena_name, 1075 (void *)(uintptr_t)start, /* start addr */ 1076 size, /* size */ 1077 IMMU_PAGESIZE, /* quantum */ 1078 NULL, /* afunc */ 1079 NULL, /* ffunc */ 1080 NULL, /* source */ 1081 0, /* qcache_max */ 1082 vmem_flags); 1083 1084 if (domain->dom_dvma_arena == NULL) { 1085 ddi_err(DER_PANIC, rdip, 1086 "Failed to allocate DVMA arena(%s) " 1087 "for domain ID (%d)", arena_name, domain->dom_did); 1088 /*NOTREACHED*/ 1089 } 1090 1091 mp = mp->ml_next; 1092 while (mp) { 1093 1094 if (mp->ml_address == 0) 
1095 start = MMU_PAGESIZE; 1096 else 1097 start = mp->ml_address; 1098 1099 if (start + mp->ml_size > maxaddr) 1100 size = maxaddr - start; 1101 else 1102 size = mp->ml_size; 1103 1104 ddi_err(DER_VERB, rdip, 1105 "iommu: %s: Adding dvma vmem span [0x%" PRIx64 1106 " - 0x%" PRIx64 "]", arena_name, start, 1107 start + size); 1108 1109 vmem_ret = vmem_add(domain->dom_dvma_arena, 1110 (void *)(uintptr_t)start, size, vmem_flags); 1111 1112 if (vmem_ret == NULL) { 1113 ddi_err(DER_PANIC, rdip, 1114 "Failed to allocate DVMA arena(%s) " 1115 "for domain ID (%d)", 1116 arena_name, domain->dom_did); 1117 /*NOTREACHED*/ 1118 } 1119 mp = mp->ml_next; 1120 } 1121 memlist_read_unlock(); 1122 } 1123 1124 /* ################################### DOMAIN CODE ######################### */ 1125 1126 /* 1127 * Set the domain and domain-dip for a dip 1128 */ 1129 static void 1130 set_domain( 1131 dev_info_t *dip, 1132 dev_info_t *ddip, 1133 domain_t *domain) 1134 { 1135 immu_devi_t *immu_devi; 1136 domain_t *fdomain; 1137 dev_info_t *fddip; 1138 1139 immu_devi = immu_devi_get(dip); 1140 1141 mutex_enter(&(DEVI(dip)->devi_lock)); 1142 fddip = immu_devi->imd_ddip; 1143 fdomain = immu_devi->imd_domain; 1144 1145 if (fddip) { 1146 ASSERT(fddip == ddip); 1147 } else { 1148 immu_devi->imd_ddip = ddip; 1149 } 1150 1151 if (fdomain) { 1152 ASSERT(fdomain == domain); 1153 } else { 1154 immu_devi->imd_domain = domain; 1155 } 1156 mutex_exit(&(DEVI(dip)->devi_lock)); 1157 } 1158 1159 /* 1160 * device_domain() 1161 * Get domain for a device. The domain may be global in which case it 1162 * is shared between all IOMMU units. Due to potential AGAW differences 1163 * between IOMMU units, such global domains *have to be* UNITY mapping 1164 * domains. Alternatively, the domain may be local to a IOMMU unit. 1165 * Local domains may be shared or immu_devi, although the 1166 * scope of sharing 1167 * is restricted to devices controlled by the IOMMU unit to 1168 * which the domain 1169 * belongs. 
If shared, they (currently) have to be UNITY domains. If 1170 * immu_devi a domain may be either UNITY or translation (XLATE) domain. 1171 */ 1172 static domain_t * 1173 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags) 1174 { 1175 dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */ 1176 immu_t *immu; 1177 domain_t *domain; 1178 dvma_arg_t dvarg = {0}; 1179 int level; 1180 1181 *ddipp = NULL; 1182 1183 /* 1184 * Check if the domain is already set. This is usually true 1185 * if this is not the first DVMA transaction. 1186 */ 1187 ddip = NULL; 1188 domain = immu_devi_domain(rdip, &ddip); 1189 if (domain) { 1190 *ddipp = ddip; 1191 return (domain); 1192 } 1193 1194 immu = immu_dvma_get_immu(rdip, immu_flags); 1195 if (immu == NULL) { 1196 /* 1197 * possible that there is no IOMMU unit for this device 1198 * - BIOS bugs are one example. 1199 */ 1200 ddi_err(DER_WARN, rdip, "No iommu unit found for device"); 1201 return (NULL); 1202 } 1203 1204 immu_flags |= immu_devi_get(rdip)->imd_dvma_flags; 1205 1206 dvarg.dva_rdip = rdip; 1207 dvarg.dva_ddip = NULL; 1208 dvarg.dva_domain = NULL; 1209 dvarg.dva_flags = immu_flags; 1210 level = 0; 1211 if (immu_walk_ancestor(rdip, NULL, get_branch_domain, 1212 &dvarg, &level, immu_flags) != DDI_SUCCESS) { 1213 /* 1214 * maybe low memory. return error, 1215 * so driver tries again later 1216 */ 1217 return (NULL); 1218 } 1219 1220 /* should have walked at least 1 dip (i.e. edip) */ 1221 ASSERT(level > 0); 1222 1223 ddip = dvarg.dva_ddip; /* must be present */ 1224 domain = dvarg.dva_domain; /* may be NULL */ 1225 1226 /* 1227 * We may find the domain during our ancestor walk on any one of our 1228 * ancestor dips, If the domain is found then the domain-dip 1229 * (i.e. ddip) will also be found in the same immu_devi struct. 1230 * The domain-dip is the highest ancestor dip which shares the 1231 * same domain with edip. 
1232 * The domain may or may not be found, but the domain dip must 1233 * be found. 1234 */ 1235 if (ddip == NULL) { 1236 ddi_err(DER_MODE, rdip, "Cannot find domain dip for device."); 1237 return (NULL); 1238 } 1239 1240 /* 1241 * Did we find a domain ? 1242 */ 1243 if (domain) { 1244 goto found; 1245 } 1246 1247 /* nope, so allocate */ 1248 domain = domain_create(immu, ddip, rdip, immu_flags); 1249 if (domain == NULL) { 1250 return (NULL); 1251 } 1252 1253 /*FALLTHROUGH*/ 1254 found: 1255 /* 1256 * We know *domain *is* the right domain, so panic if 1257 * another domain is set for either the request-dip or 1258 * effective dip. 1259 */ 1260 set_domain(ddip, ddip, domain); 1261 set_domain(rdip, ddip, domain); 1262 1263 *ddipp = ddip; 1264 return (domain); 1265 } 1266 1267 static void 1268 create_unity_domain(immu_t *immu) 1269 { 1270 domain_t *domain; 1271 1272 /* domain created during boot and always use sleep flag */ 1273 domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP); 1274 1275 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL); 1276 1277 domain->dom_did = IMMU_UNITY_DID; 1278 domain->dom_maptype = IMMU_MAPTYPE_UNITY; 1279 1280 domain->dom_immu = immu; 1281 immu->immu_unity_domain = domain; 1282 1283 /* 1284 * Setup the domain's initial page table 1285 * should never fail. 1286 */ 1287 domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1288 pgtable_zero(domain->dom_pgtable_root); 1289 1290 /* 1291 * Only map all physical memory in to the unity domain 1292 * if passthrough is not supported. If it is supported, 1293 * passthrough is set in the context entry instead. 
1294 */ 1295 if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1296 map_unity_domain(domain); 1297 1298 1299 /* 1300 * put it on the system-wide UNITY domain list 1301 */ 1302 mutex_enter(&(immu_domain_lock)); 1303 list_insert_tail(&immu_unity_domain_list, domain); 1304 mutex_exit(&(immu_domain_lock)); 1305 } 1306 1307 /* 1308 * ddip is the domain-dip - the topmost dip in a domain 1309 * rdip is the requesting-dip - the device which is 1310 * requesting DVMA setup 1311 * if domain is a non-shared domain rdip == ddip 1312 */ 1313 static domain_t * 1314 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip, 1315 immu_flags_t immu_flags) 1316 { 1317 int kmflags; 1318 domain_t *domain; 1319 char mod_hash_name[128]; 1320 immu_devi_t *immu_devi; 1321 int did; 1322 immu_dcookie_t dcookies[1] = {0}; 1323 int dcount = 0; 1324 1325 immu_devi = immu_devi_get(rdip); 1326 1327 /* 1328 * First allocate a domainid. 1329 * This routine will never fail, since if we run out 1330 * of domains the unity domain will be allocated. 1331 */ 1332 did = did_alloc(immu, rdip, ddip, immu_flags); 1333 if (did == IMMU_UNITY_DID) { 1334 /* domain overflow */ 1335 ASSERT(immu->immu_unity_domain); 1336 return (immu->immu_unity_domain); 1337 } 1338 1339 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 1340 domain = kmem_zalloc(sizeof (domain_t), kmflags); 1341 if (domain == NULL) { 1342 ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain " 1343 "structure for device. IOMMU unit: %s", immu->immu_name); 1344 /*NOTREACHED*/ 1345 } 1346 1347 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL); 1348 1349 (void) snprintf(mod_hash_name, sizeof (mod_hash_name), 1350 "immu%s-domain%d-pava-hash", immu->immu_name, did); 1351 1352 domain->dom_did = did; 1353 domain->dom_immu = immu; 1354 domain->dom_maptype = IMMU_MAPTYPE_XLATE; 1355 domain->dom_dip = ddip; 1356 1357 /* 1358 * Create xlate DVMA arena for this domain. 
1359 */ 1360 create_xlate_arena(immu, domain, rdip, immu_flags); 1361 1362 /* 1363 * Setup the domain's initial page table 1364 */ 1365 domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags); 1366 if (domain->dom_pgtable_root == NULL) { 1367 ddi_err(DER_PANIC, rdip, "Failed to alloc root " 1368 "pgtable for domain (%d). IOMMU unit: %s", 1369 domain->dom_did, immu->immu_name); 1370 /*NOTREACHED*/ 1371 } 1372 pgtable_zero(domain->dom_pgtable_root); 1373 1374 /* 1375 * Since this is a immu unit-specific domain, put it on 1376 * the per-immu domain list. 1377 */ 1378 mutex_enter(&(immu->immu_lock)); 1379 list_insert_head(&immu->immu_domain_list, domain); 1380 mutex_exit(&(immu->immu_lock)); 1381 1382 /* 1383 * Also put it on the system-wide xlate domain list 1384 */ 1385 mutex_enter(&(immu_domain_lock)); 1386 list_insert_head(&immu_xlate_domain_list, domain); 1387 mutex_exit(&(immu_domain_lock)); 1388 1389 bdf_domain_insert(immu_devi, domain); 1390 1391 #ifdef BUGGY_DRIVERS 1392 /* 1393 * Map page0. Some broken HW/FW access it. 1394 */ 1395 dcookies[0].dck_paddr = 0; 1396 dcookies[0].dck_npages = 1; 1397 dcount = 1; 1398 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, 1399 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); 1400 #endif 1401 return (domain); 1402 } 1403 1404 /* 1405 * Create domainid arena. 1406 * Domainid 0 is reserved by Vt-d spec and cannot be used by 1407 * system software. 
1408 * Domainid 1 is reserved by solaris and used for *all* of the following: 1409 * as the "uninitialized" domain - For devices not yet controlled 1410 * by Solaris 1411 * as the "unity" domain - For devices that will always belong 1412 * to the unity domain 1413 * as the "overflow" domain - Used for any new device after we 1414 * run out of domains 1415 * All of the above domains map into a single domain with 1416 * domainid 1 and UNITY DVMA mapping 1417 * Each IMMU unity has its own unity/uninit/overflow domain 1418 */ 1419 static void 1420 did_init(immu_t *immu) 1421 { 1422 (void) snprintf(immu->immu_did_arena_name, 1423 sizeof (immu->immu_did_arena_name), 1424 "%s_domainid_arena", immu->immu_name); 1425 1426 ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s", 1427 immu->immu_did_arena_name); 1428 1429 immu->immu_did_arena = vmem_create( 1430 immu->immu_did_arena_name, 1431 (void *)(uintptr_t)(IMMU_UNITY_DID + 1), /* start addr */ 1432 immu->immu_max_domains - IMMU_UNITY_DID, 1433 1, /* quantum */ 1434 NULL, /* afunc */ 1435 NULL, /* ffunc */ 1436 NULL, /* source */ 1437 0, /* qcache_max */ 1438 VM_SLEEP); 1439 1440 /* Even with SLEEP flag, vmem_create() can fail */ 1441 if (immu->immu_did_arena == NULL) { 1442 ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel " 1443 "IOMMU domainid allocator: %s", immu->immu_name, 1444 immu->immu_did_arena_name); 1445 } 1446 } 1447 1448 /* ######################### CONTEXT CODE ################################# */ 1449 1450 static void 1451 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table, 1452 int bus, int devfunc) 1453 { 1454 pgtable_t *context; 1455 pgtable_t *pgtable_root; 1456 hw_rce_t *hw_rent; 1457 hw_rce_t *hw_cent; 1458 hw_rce_t *ctxp; 1459 int sid; 1460 krw_t rwtype; 1461 boolean_t fill_root; 1462 boolean_t fill_ctx; 1463 1464 pgtable_root = domain->dom_pgtable_root; 1465 1466 ctxp = (hw_rce_t *)(root_table->swpg_next_array); 1467 context = *(pgtable_t **)(ctxp + bus); 1468 hw_rent = 
(hw_rce_t *)(root_table->hwpg_vaddr) + bus; 1469 1470 fill_root = B_FALSE; 1471 fill_ctx = B_FALSE; 1472 1473 /* Check the most common case first with reader lock */ 1474 rw_enter(&(immu->immu_ctx_rwlock), RW_READER); 1475 rwtype = RW_READER; 1476 again: 1477 if (ROOT_GET_P(hw_rent)) { 1478 hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc; 1479 if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) { 1480 rw_exit(&(immu->immu_ctx_rwlock)); 1481 return; 1482 } else { 1483 fill_ctx = B_TRUE; 1484 } 1485 } else { 1486 fill_root = B_TRUE; 1487 fill_ctx = B_TRUE; 1488 } 1489 1490 if (rwtype == RW_READER && 1491 rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) { 1492 rw_exit(&(immu->immu_ctx_rwlock)); 1493 rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); 1494 rwtype = RW_WRITER; 1495 goto again; 1496 } 1497 rwtype = RW_WRITER; 1498 1499 if (fill_root == B_TRUE) { 1500 ROOT_SET_CONT(hw_rent, context->hwpg_paddr); 1501 ROOT_SET_P(hw_rent); 1502 immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t)); 1503 } 1504 1505 if (fill_ctx == B_TRUE) { 1506 hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc; 1507 /* need to disable context entry before reprogramming it */ 1508 bzero(hw_cent, sizeof (hw_rce_t)); 1509 1510 /* flush caches */ 1511 immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t)); 1512 1513 sid = ((bus << 8) | devfunc); 1514 immu_flush_context_fsi(immu, 0, sid, domain->dom_did, 1515 &immu->immu_ctx_inv_wait); 1516 1517 CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED); 1518 CONT_SET_DID(hw_cent, domain->dom_did); 1519 CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); 1520 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); 1521 if (domain->dom_did == IMMU_UNITY_DID && 1522 IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1523 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); 1524 else 1525 /*LINTED*/ 1526 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); 1527 CONT_SET_P(hw_cent); 1528 if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) { 1529 CONT_SET_EH(hw_cent); 1530 if (immu_use_alh) 1531 
CONT_SET_ALH(hw_cent); 1532 } 1533 immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t)); 1534 } 1535 rw_exit(&(immu->immu_ctx_rwlock)); 1536 } 1537 1538 static pgtable_t * 1539 context_create(immu_t *immu) 1540 { 1541 int bus; 1542 int devfunc; 1543 pgtable_t *root_table; 1544 pgtable_t *context; 1545 pgtable_t *pgtable_root; 1546 hw_rce_t *ctxp; 1547 hw_rce_t *hw_rent; 1548 hw_rce_t *hw_cent; 1549 1550 /* Allocate a zeroed root table (4K 256b entries) */ 1551 root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1552 pgtable_zero(root_table); 1553 1554 /* 1555 * Setup context tables for all possible root table entries. 1556 * Start out with unity domains for all entries. 1557 */ 1558 ctxp = (hw_rce_t *)(root_table->swpg_next_array); 1559 hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr); 1560 for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) { 1561 context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1562 pgtable_zero(context); 1563 ROOT_SET_P(hw_rent); 1564 ROOT_SET_CONT(hw_rent, context->hwpg_paddr); 1565 hw_cent = (hw_rce_t *)(context->hwpg_vaddr); 1566 for (devfunc = 0; devfunc < IMMU_CONT_NUM; 1567 devfunc++, hw_cent++) { 1568 pgtable_root = 1569 immu->immu_unity_domain->dom_pgtable_root; 1570 CONT_SET_DID(hw_cent, 1571 immu->immu_unity_domain->dom_did); 1572 CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); 1573 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); 1574 if (IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1575 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); 1576 else 1577 /*LINTED*/ 1578 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); 1579 CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED); 1580 CONT_SET_P(hw_cent); 1581 } 1582 immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE); 1583 *((pgtable_t **)ctxp) = context; 1584 } 1585 1586 return (root_table); 1587 } 1588 1589 /* 1590 * Called during rootnex attach, so no locks needed 1591 */ 1592 static void 1593 context_init(immu_t *immu) 1594 { 1595 rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL); 
1596 1597 immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE); 1598 1599 immu_regs_wbf_flush(immu); 1600 1601 immu->immu_ctx_root = context_create(immu); 1602 1603 immu_regs_set_root_table(immu); 1604 1605 rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); 1606 immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait); 1607 immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait); 1608 rw_exit(&(immu->immu_ctx_rwlock)); 1609 } 1610 1611 1612 /* 1613 * Find top pcib 1614 */ 1615 static int 1616 find_top_pcib(dev_info_t *dip, void *arg) 1617 { 1618 immu_devi_t *immu_devi; 1619 dev_info_t **pcibdipp = (dev_info_t **)arg; 1620 1621 immu_devi = immu_devi_get(dip); 1622 1623 if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) { 1624 *pcibdipp = dip; 1625 } 1626 1627 return (DDI_WALK_CONTINUE); 1628 } 1629 1630 static int 1631 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, 1632 dev_info_t *rdip, immu_flags_t immu_flags) 1633 { 1634 immu_devi_t *r_immu_devi; 1635 immu_devi_t *d_immu_devi; 1636 int r_bus; 1637 int d_bus; 1638 int r_devfunc; 1639 int d_devfunc; 1640 immu_pcib_t d_pcib_type; 1641 dev_info_t *pcibdip; 1642 1643 if (ddip == NULL || rdip == NULL || 1644 ddip == root_devinfo || rdip == root_devinfo) { 1645 ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or " 1646 "request-dip are NULL or are root devinfo"); 1647 return (DDI_FAILURE); 1648 } 1649 1650 /* 1651 * We need to set the context fields 1652 * based on what type of device rdip and ddip are. 1653 * To do that we need the immu_devi field. 
1654 * Set the immu_devi field (if not already set) 1655 */ 1656 if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) { 1657 ddi_err(DER_MODE, rdip, 1658 "immu_context_update: failed to set immu_devi for ddip"); 1659 return (DDI_FAILURE); 1660 } 1661 1662 if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) { 1663 ddi_err(DER_MODE, rdip, 1664 "immu_context_update: failed to set immu_devi for rdip"); 1665 return (DDI_FAILURE); 1666 } 1667 1668 d_immu_devi = immu_devi_get(ddip); 1669 r_immu_devi = immu_devi_get(rdip); 1670 1671 d_bus = d_immu_devi->imd_bus; 1672 d_devfunc = d_immu_devi->imd_devfunc; 1673 d_pcib_type = d_immu_devi->imd_pcib_type; 1674 r_bus = r_immu_devi->imd_bus; 1675 r_devfunc = r_immu_devi->imd_devfunc; 1676 1677 if (rdip == ddip) { 1678 /* rdip is a PCIE device. set context for it only */ 1679 context_set(immu, domain, immu->immu_ctx_root, r_bus, 1680 r_devfunc); 1681 #ifdef BUGGY_DRIVERS 1682 } else if (r_immu_devi == d_immu_devi) { 1683 #ifdef TEST 1684 ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and " 1685 "0x%lx are identical", rdip, ddip); 1686 #endif 1687 /* rdip is a PCIE device. set context for it only */ 1688 context_set(immu, domain, immu->immu_ctx_root, r_bus, 1689 r_devfunc); 1690 #endif 1691 } else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) { 1692 /* 1693 * ddip is a PCIE_PCI bridge. Set context for ddip's 1694 * secondary bus. If rdip is on ddip's secondary 1695 * bus, set context for rdip. Else, set context 1696 * for rdip's PCI bridge on ddip's secondary bus. 
1697 */ 1698 context_set(immu, domain, immu->immu_ctx_root, 1699 d_immu_devi->imd_sec, 0); 1700 if (d_immu_devi->imd_sec == r_bus) { 1701 context_set(immu, domain, immu->immu_ctx_root, 1702 r_bus, r_devfunc); 1703 } else { 1704 pcibdip = NULL; 1705 if (immu_walk_ancestor(rdip, ddip, find_top_pcib, 1706 &pcibdip, NULL, immu_flags) == DDI_SUCCESS && 1707 pcibdip != NULL) { 1708 r_immu_devi = immu_devi_get(pcibdip); 1709 r_bus = r_immu_devi->imd_bus; 1710 r_devfunc = r_immu_devi->imd_devfunc; 1711 context_set(immu, domain, immu->immu_ctx_root, 1712 r_bus, r_devfunc); 1713 } else { 1714 ddi_err(DER_PANIC, rdip, "Failed to find PCI " 1715 " bridge for PCI device"); 1716 /*NOTREACHED*/ 1717 } 1718 } 1719 } else if (d_pcib_type == IMMU_PCIB_PCI_PCI) { 1720 context_set(immu, domain, immu->immu_ctx_root, d_bus, 1721 d_devfunc); 1722 } else if (d_pcib_type == IMMU_PCIB_ENDPOINT) { 1723 /* 1724 * ddip is a PCIE device which has a non-PCI device under it 1725 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata 1726 */ 1727 context_set(immu, domain, immu->immu_ctx_root, d_bus, 1728 d_devfunc); 1729 } else { 1730 ddi_err(DER_PANIC, rdip, "unknown device type. Cannot " 1731 "set iommu context."); 1732 /*NOTREACHED*/ 1733 } 1734 1735 /* XXX do we need a membar_producer() here */ 1736 return (DDI_SUCCESS); 1737 } 1738 1739 /* ##################### END CONTEXT CODE ################################## */ 1740 /* ##################### MAPPING CODE ################################## */ 1741 1742 1743 #ifdef DEBUG 1744 static boolean_t 1745 PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr, 1746 dev_info_t *rdip, immu_flags_t immu_flags) 1747 { 1748 /* The PDTE must be set i.e. 
present bit is set */ 1749 if (!PDTE_P(pdte)) { 1750 ddi_err(DER_MODE, rdip, "No present flag"); 1751 return (B_FALSE); 1752 } 1753 1754 /* 1755 * Just assert to check most significant system software field 1756 * (PDTE_SW4) as it is same as present bit and we 1757 * checked that above 1758 */ 1759 ASSERT(PDTE_SW4(pdte)); 1760 1761 /* 1762 * TM field should be clear if not reserved. 1763 * non-leaf is always reserved 1764 */ 1765 if (next == NULL && immu->immu_TM_reserved == B_FALSE) { 1766 if (PDTE_TM(pdte)) { 1767 ddi_err(DER_MODE, rdip, "TM flag set"); 1768 return (B_FALSE); 1769 } 1770 } 1771 1772 /* 1773 * The SW3 field is not used and must be clear 1774 */ 1775 if (PDTE_SW3(pdte)) { 1776 ddi_err(DER_MODE, rdip, "SW3 set"); 1777 return (B_FALSE); 1778 } 1779 1780 /* 1781 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set 1782 */ 1783 if (next == NULL) { 1784 ASSERT(paddr % IMMU_PAGESIZE == 0); 1785 if (PDTE_PADDR(pdte) != paddr) { 1786 ddi_err(DER_MODE, rdip, 1787 "PTE paddr mismatch: %lx != %lx", 1788 PDTE_PADDR(pdte), paddr); 1789 return (B_FALSE); 1790 } 1791 } else { 1792 if (PDTE_PADDR(pdte) != next->hwpg_paddr) { 1793 ddi_err(DER_MODE, rdip, 1794 "PDE paddr mismatch: %lx != %lx", 1795 PDTE_PADDR(pdte), next->hwpg_paddr); 1796 return (B_FALSE); 1797 } 1798 } 1799 1800 /* 1801 * SNP field should be clear if not reserved. 
1802 * non-leaf is always reserved 1803 */ 1804 if (next == NULL && immu->immu_SNP_reserved == B_FALSE) { 1805 if (PDTE_SNP(pdte)) { 1806 ddi_err(DER_MODE, rdip, "SNP set"); 1807 return (B_FALSE); 1808 } 1809 } 1810 1811 /* second field available for system software should be clear */ 1812 if (PDTE_SW2(pdte)) { 1813 ddi_err(DER_MODE, rdip, "SW2 set"); 1814 return (B_FALSE); 1815 } 1816 1817 /* Super pages field should be clear */ 1818 if (PDTE_SP(pdte)) { 1819 ddi_err(DER_MODE, rdip, "SP set"); 1820 return (B_FALSE); 1821 } 1822 1823 /* 1824 * least significant field available for 1825 * system software should be clear 1826 */ 1827 if (PDTE_SW1(pdte)) { 1828 ddi_err(DER_MODE, rdip, "SW1 set"); 1829 return (B_FALSE); 1830 } 1831 1832 if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) { 1833 ddi_err(DER_MODE, rdip, "READ not set"); 1834 return (B_FALSE); 1835 } 1836 1837 if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) { 1838 ddi_err(DER_MODE, rdip, "WRITE not set"); 1839 return (B_FALSE); 1840 } 1841 1842 return (B_TRUE); 1843 } 1844 #endif 1845 1846 /*ARGSUSED*/ 1847 static void 1848 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate, 1849 uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip) 1850 { 1851 uint64_t npages; 1852 uint64_t dvma; 1853 pgtable_t *pgtable; 1854 hw_pdte_t *hwp; 1855 hw_pdte_t *shwp; 1856 int idx; 1857 1858 pgtable = xlate->xlt_pgtable; 1859 idx = xlate->xlt_idx; 1860 1861 dvma = *dvma_ptr; 1862 npages = *npages_ptr; 1863 1864 /* 1865 * since a caller gets a unique dvma for a physical address, 1866 * no other concurrent thread will be writing to the same 1867 * PTE even if it has the same paddr. So no locks needed. 
1868 */ 1869 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 1870 1871 hwp = shwp; 1872 for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { 1873 PDTE_CLEAR_P(*hwp); 1874 dvma += IMMU_PAGESIZE; 1875 npages--; 1876 } 1877 1878 *dvma_ptr = dvma; 1879 *npages_ptr = npages; 1880 1881 xlate->xlt_idx = idx; 1882 } 1883 1884 static void 1885 xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels) 1886 { 1887 int level; 1888 uint64_t offbits; 1889 1890 /* 1891 * Skip the first 12 bits which is the offset into 1892 * 4K PFN (phys page frame based on IMMU_PAGESIZE) 1893 */ 1894 offbits = dvma >> IMMU_PAGESHIFT; 1895 1896 /* skip to level 1 i.e. leaf PTE */ 1897 for (level = 1, xlate++; level <= nlevels; level++, xlate++) { 1898 xlate->xlt_level = level; 1899 xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK); 1900 ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX); 1901 xlate->xlt_pgtable = NULL; 1902 offbits >>= IMMU_PGTABLE_LEVEL_STRIDE; 1903 } 1904 } 1905 1906 /* 1907 * Read the pgtables 1908 */ 1909 static boolean_t 1910 PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels) 1911 { 1912 pgtable_t *pgtable; 1913 pgtable_t *next; 1914 uint_t idx; 1915 1916 /* start with highest level pgtable i.e. root */ 1917 xlate += nlevels; 1918 1919 if (xlate->xlt_pgtable == NULL) { 1920 xlate->xlt_pgtable = domain->dom_pgtable_root; 1921 } 1922 1923 for (; xlate->xlt_level > 1; xlate--) { 1924 idx = xlate->xlt_idx; 1925 pgtable = xlate->xlt_pgtable; 1926 1927 if ((xlate - 1)->xlt_pgtable) { 1928 continue; 1929 } 1930 1931 /* Lock the pgtable in read mode */ 1932 rw_enter(&(pgtable->swpg_rwlock), RW_READER); 1933 1934 /* 1935 * since we are unmapping, the pgtable should 1936 * already point to a leafier pgtable. 
1937 */ 1938 next = *(pgtable->swpg_next_array + idx); 1939 (xlate - 1)->xlt_pgtable = next; 1940 rw_exit(&(pgtable->swpg_rwlock)); 1941 if (next == NULL) 1942 return (B_FALSE); 1943 } 1944 1945 return (B_TRUE); 1946 } 1947 1948 static void 1949 immu_fault_walk(void *arg, void *base, size_t len) 1950 { 1951 uint64_t dvma, start; 1952 1953 dvma = *(uint64_t *)arg; 1954 start = (uint64_t)(uintptr_t)base; 1955 1956 if (dvma >= start && dvma < (start + len)) { 1957 ddi_err(DER_WARN, NULL, 1958 "faulting DVMA address is in vmem arena " 1959 "(%" PRIx64 "-%" PRIx64 ")", 1960 start, start + len); 1961 *(uint64_t *)arg = ~0ULL; 1962 } 1963 } 1964 1965 void 1966 immu_print_fault_info(uint_t sid, uint64_t dvma) 1967 { 1968 int nlevels; 1969 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 1970 xlate_t *xlatep; 1971 hw_pdte_t pte; 1972 domain_t *domain; 1973 immu_t *immu; 1974 uint64_t dvma_arg; 1975 1976 if (mod_hash_find(bdf_domain_hash, 1977 (void *)(uintptr_t)sid, (void *)&domain) != 0) { 1978 ddi_err(DER_WARN, NULL, 1979 "no domain for faulting SID %08x", sid); 1980 return; 1981 } 1982 1983 immu = domain->dom_immu; 1984 1985 dvma_arg = dvma; 1986 vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk, 1987 (void *)&dvma_arg); 1988 if (dvma_arg != ~0ULL) 1989 ddi_err(DER_WARN, domain->dom_dip, 1990 "faulting DVMA address is not in vmem arena"); 1991 1992 nlevels = immu->immu_dvma_nlevels; 1993 xlate_setup(dvma, xlate, nlevels); 1994 1995 if (!PDE_lookup(domain, xlate, nlevels)) { 1996 ddi_err(DER_WARN, domain->dom_dip, 1997 "pte not found in domid %d for faulting addr %" PRIx64, 1998 domain->dom_did, dvma); 1999 return; 2000 } 2001 2002 xlatep = &xlate[1]; 2003 pte = *((hw_pdte_t *) 2004 (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx); 2005 2006 ddi_err(DER_WARN, domain->dom_dip, 2007 "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did, 2008 (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte)); 2009 } 2010 2011 /*ARGSUSED*/ 2012 static 
void 2013 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr, 2014 dev_info_t *rdip, immu_flags_t immu_flags) 2015 { 2016 hw_pdte_t pte; 2017 2018 #ifndef DEBUG 2019 pte = immu->immu_ptemask; 2020 PDTE_SET_PADDR(pte, paddr); 2021 #else 2022 pte = *hwp; 2023 2024 if (PDTE_P(pte)) { 2025 if (PDTE_PADDR(pte) != paddr) { 2026 ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx", 2027 PDTE_PADDR(pte), paddr); 2028 } 2029 #ifdef BUGGY_DRIVERS 2030 return; 2031 #else 2032 goto out; 2033 #endif 2034 } 2035 2036 /* clear TM field if not reserved */ 2037 if (immu->immu_TM_reserved == B_FALSE) { 2038 PDTE_CLEAR_TM(pte); 2039 } 2040 2041 /* Clear 3rd field for system software - not used */ 2042 PDTE_CLEAR_SW3(pte); 2043 2044 /* Set paddr */ 2045 ASSERT(paddr % IMMU_PAGESIZE == 0); 2046 PDTE_CLEAR_PADDR(pte); 2047 PDTE_SET_PADDR(pte, paddr); 2048 2049 /* clear SNP field if not reserved. */ 2050 if (immu->immu_SNP_reserved == B_FALSE) { 2051 PDTE_CLEAR_SNP(pte); 2052 } 2053 2054 /* Clear SW2 field available for software */ 2055 PDTE_CLEAR_SW2(pte); 2056 2057 2058 /* SP is don't care for PTEs. Clear it for cleanliness */ 2059 PDTE_CLEAR_SP(pte); 2060 2061 /* Clear SW1 field available for software */ 2062 PDTE_CLEAR_SW1(pte); 2063 2064 /* 2065 * Now that we are done writing the PTE 2066 * set the "present" flag. Note this present 2067 * flag is a bit in the PDE/PTE that the 2068 * spec says is available for system software. 2069 * This is an implementation detail of Solaris 2070 * bare-metal Intel IOMMU. 
2071 * The present field in a PDE/PTE is not defined 2072 * by the Vt-d spec 2073 */ 2074 2075 PDTE_SET_P(pte); 2076 2077 pte |= immu->immu_ptemask; 2078 2079 out: 2080 #endif /* DEBUG */ 2081 #ifdef BUGGY_DRIVERS 2082 PDTE_SET_READ(pte); 2083 PDTE_SET_WRITE(pte); 2084 #else 2085 if (immu_flags & IMMU_FLAGS_READ) 2086 PDTE_SET_READ(pte); 2087 if (immu_flags & IMMU_FLAGS_WRITE) 2088 PDTE_SET_WRITE(pte); 2089 #endif /* BUGGY_DRIVERS */ 2090 2091 *hwp = pte; 2092 } 2093 2094 /*ARGSUSED*/ 2095 static void 2096 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, 2097 uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies, 2098 int dcount, dev_info_t *rdip, immu_flags_t immu_flags) 2099 { 2100 paddr_t paddr; 2101 uint64_t nvpages; 2102 uint64_t nppages; 2103 uint64_t dvma; 2104 pgtable_t *pgtable; 2105 hw_pdte_t *hwp; 2106 hw_pdte_t *shwp; 2107 int idx, nset; 2108 int j; 2109 2110 pgtable = xlate->xlt_pgtable; 2111 idx = xlate->xlt_idx; 2112 2113 dvma = *dvma_ptr; 2114 nvpages = *nvpages_ptr; 2115 2116 /* 2117 * since a caller gets a unique dvma for a physical address, 2118 * no other concurrent thread will be writing to the same 2119 * PTE even if it has the same paddr. So no locks needed. 
2120 */ 2121 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 2122 2123 hwp = shwp; 2124 for (j = dcount - 1; j >= 0; j--) { 2125 if (nvpages <= dcookies[j].dck_npages) 2126 break; 2127 nvpages -= dcookies[j].dck_npages; 2128 } 2129 2130 VERIFY(j >= 0); 2131 nppages = nvpages; 2132 paddr = dcookies[j].dck_paddr + 2133 (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE; 2134 2135 nvpages = *nvpages_ptr; 2136 nset = 0; 2137 for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { 2138 PTE_set_one(immu, hwp, paddr, rdip, immu_flags); 2139 nset++; 2140 2141 ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags) 2142 == B_TRUE); 2143 nppages--; 2144 nvpages--; 2145 paddr += IMMU_PAGESIZE; 2146 dvma += IMMU_PAGESIZE; 2147 2148 if (nppages == 0) { 2149 j++; 2150 } 2151 2152 if (j == dcount) 2153 break; 2154 2155 if (nppages == 0) { 2156 nppages = dcookies[j].dck_npages; 2157 paddr = dcookies[j].dck_paddr; 2158 } 2159 } 2160 2161 if (nvpages) { 2162 *dvma_ptr = dvma; 2163 *nvpages_ptr = nvpages; 2164 } else { 2165 *dvma_ptr = 0; 2166 *nvpages_ptr = 0; 2167 } 2168 2169 xlate->xlt_idx = idx; 2170 } 2171 2172 /*ARGSUSED*/ 2173 static void 2174 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next, 2175 dev_info_t *rdip, immu_flags_t immu_flags) 2176 { 2177 hw_pdte_t pde; 2178 2179 pde = *hwp; 2180 2181 /* if PDE is already set, make sure it is correct */ 2182 if (PDTE_P(pde)) { 2183 ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr); 2184 #ifdef BUGGY_DRIVERS 2185 return; 2186 #else 2187 goto out; 2188 #endif 2189 } 2190 2191 /* Dont touch SW4, it is the present bit */ 2192 2193 /* don't touch TM field it is reserved for PDEs */ 2194 2195 /* 3rd field available for system software is not used */ 2196 PDTE_CLEAR_SW3(pde); 2197 2198 /* Set next level pgtable-paddr for PDE */ 2199 PDTE_CLEAR_PADDR(pde); 2200 PDTE_SET_PADDR(pde, next->hwpg_paddr); 2201 2202 /* don't touch SNP field it is reserved for PDEs */ 2203 2204 /* Clear second field available for system 
software */ 2205 PDTE_CLEAR_SW2(pde); 2206 2207 /* No super pages for PDEs */ 2208 PDTE_CLEAR_SP(pde); 2209 2210 /* Clear SW1 for software */ 2211 PDTE_CLEAR_SW1(pde); 2212 2213 /* 2214 * Now that we are done writing the PDE 2215 * set the "present" flag. Note this present 2216 * flag is a bit in the PDE/PTE that the 2217 * spec says is available for system software. 2218 * This is an implementation detail of Solaris 2219 * base-metal Intel IOMMU. 2220 * The present field in a PDE/PTE is not defined 2221 * by the Vt-d spec 2222 */ 2223 2224 out: 2225 #ifdef BUGGY_DRIVERS 2226 PDTE_SET_READ(pde); 2227 PDTE_SET_WRITE(pde); 2228 #else 2229 if (immu_flags & IMMU_FLAGS_READ) 2230 PDTE_SET_READ(pde); 2231 if (immu_flags & IMMU_FLAGS_WRITE) 2232 PDTE_SET_WRITE(pde); 2233 #endif 2234 2235 PDTE_SET_P(pde); 2236 2237 *hwp = pde; 2238 } 2239 2240 /* 2241 * Used to set PDEs 2242 */ 2243 static boolean_t 2244 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels, 2245 dev_info_t *rdip, immu_flags_t immu_flags) 2246 { 2247 pgtable_t *pgtable; 2248 pgtable_t *new; 2249 pgtable_t *next; 2250 hw_pdte_t *hwp; 2251 int level; 2252 uint_t idx; 2253 krw_t rwtype; 2254 boolean_t set = B_FALSE; 2255 2256 /* start with highest level pgtable i.e. 
root */ 2257 xlate += nlevels; 2258 2259 new = NULL; 2260 xlate->xlt_pgtable = domain->dom_pgtable_root; 2261 for (level = nlevels; level > 1; level--, xlate--) { 2262 idx = xlate->xlt_idx; 2263 pgtable = xlate->xlt_pgtable; 2264 2265 /* Lock the pgtable in READ mode first */ 2266 rw_enter(&(pgtable->swpg_rwlock), RW_READER); 2267 rwtype = RW_READER; 2268 again: 2269 hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 2270 next = (pgtable->swpg_next_array)[idx]; 2271 2272 /* 2273 * check if leafier level already has a pgtable 2274 * if yes, verify 2275 */ 2276 if (next == NULL) { 2277 if (new == NULL) { 2278 2279 IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *, 2280 rdip, int, level); 2281 2282 new = pgtable_alloc(immu, immu_flags); 2283 if (new == NULL) { 2284 ddi_err(DER_PANIC, rdip, 2285 "pgtable alloc err"); 2286 } 2287 pgtable_zero(new); 2288 } 2289 2290 /* Change to a write lock */ 2291 if (rwtype == RW_READER && 2292 rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) { 2293 rw_exit(&(pgtable->swpg_rwlock)); 2294 rw_enter(&(pgtable->swpg_rwlock), RW_WRITER); 2295 rwtype = RW_WRITER; 2296 goto again; 2297 } 2298 rwtype = RW_WRITER; 2299 next = new; 2300 (pgtable->swpg_next_array)[idx] = next; 2301 new = NULL; 2302 PDE_set_one(immu, hwp, next, rdip, immu_flags); 2303 set = B_TRUE; 2304 rw_downgrade(&(pgtable->swpg_rwlock)); 2305 rwtype = RW_READER; 2306 } 2307 #ifndef BUGGY_DRIVERS 2308 else { 2309 hw_pdte_t pde = *hwp; 2310 2311 /* 2312 * If buggy driver we already set permission 2313 * READ+WRITE so nothing to do for that case 2314 * XXX Check that read writer perms change before 2315 * actually setting perms. 
Also need to hold lock 2316 */ 2317 if (immu_flags & IMMU_FLAGS_READ) 2318 PDTE_SET_READ(pde); 2319 if (immu_flags & IMMU_FLAGS_WRITE) 2320 PDTE_SET_WRITE(pde); 2321 2322 *hwp = pde; 2323 } 2324 #endif 2325 2326 ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags) 2327 == B_TRUE); 2328 2329 (xlate - 1)->xlt_pgtable = next; 2330 rw_exit(&(pgtable->swpg_rwlock)); 2331 } 2332 2333 if (new) { 2334 pgtable_free(immu, new); 2335 } 2336 2337 return (set); 2338 } 2339 2340 /* 2341 * dvma_map() 2342 * map a contiguous range of DVMA pages 2343 * 2344 * immu: IOMMU unit for which we are generating DVMA cookies 2345 * domain: domain 2346 * sdvma: Starting dvma 2347 * spaddr: Starting paddr 2348 * npages: Number of pages 2349 * rdip: requesting device 2350 * immu_flags: flags 2351 */ 2352 static boolean_t 2353 dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages, 2354 immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, 2355 immu_flags_t immu_flags) 2356 { 2357 uint64_t dvma; 2358 uint64_t n; 2359 immu_t *immu = domain->dom_immu; 2360 int nlevels = immu->immu_dvma_nlevels; 2361 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 2362 boolean_t pde_set = B_FALSE; 2363 2364 n = snvpages; 2365 dvma = sdvma; 2366 2367 while (n > 0) { 2368 xlate_setup(dvma, xlate, nlevels); 2369 2370 /* Lookup or allocate PGDIRs and PGTABLEs if necessary */ 2371 if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags) 2372 == B_TRUE) { 2373 pde_set = B_TRUE; 2374 } 2375 2376 /* set all matching ptes that fit into this leaf pgtable */ 2377 PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies, 2378 dcount, rdip, immu_flags); 2379 } 2380 2381 return (pde_set); 2382 } 2383 2384 /* 2385 * dvma_unmap() 2386 * unmap a range of DVMAs 2387 * 2388 * immu: IOMMU unit state 2389 * domain: domain for requesting device 2390 * ddip: domain-dip 2391 * dvma: starting DVMA 2392 * npages: Number of IMMU pages to be unmapped 2393 * rdip: requesting device 2394 */ 2395 static void 2396 
dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages, 2397 dev_info_t *rdip) 2398 { 2399 immu_t *immu = domain->dom_immu; 2400 int nlevels = immu->immu_dvma_nlevels; 2401 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 2402 uint64_t n; 2403 uint64_t dvma; 2404 2405 dvma = sdvma; 2406 n = snpages; 2407 2408 while (n > 0) { 2409 /* setup the xlate array */ 2410 xlate_setup(dvma, xlate, nlevels); 2411 2412 /* just lookup existing pgtables. Should never fail */ 2413 if (!PDE_lookup(domain, xlate, nlevels)) 2414 ddi_err(DER_PANIC, rdip, 2415 "PTE not found for addr %" PRIx64, 2416 (unsigned long long)dvma); 2417 2418 /* clear all matching ptes that fit into this leaf pgtable */ 2419 PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip); 2420 } 2421 2422 /* No need to flush IOTLB after unmap */ 2423 } 2424 2425 static uint64_t 2426 dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf) 2427 { 2428 uint64_t dvma; 2429 size_t xsize, align; 2430 uint64_t minaddr, maxaddr; 2431 2432 /* parameters */ 2433 xsize = npages * IMMU_PAGESIZE; 2434 align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE); 2435 minaddr = dma_attr->dma_attr_addr_lo; 2436 maxaddr = dma_attr->dma_attr_addr_hi + 1; 2437 2438 /* handle the rollover cases */ 2439 if (maxaddr < dma_attr->dma_attr_addr_hi) { 2440 maxaddr = dma_attr->dma_attr_addr_hi; 2441 } 2442 2443 /* 2444 * allocate from vmem arena. 
*/
	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
	    xsize, align, 0, 0, (void *)(uintptr_t)minaddr,
	    (void *)(uintptr_t)maxaddr, kmf);

	return (dvma);
}

/*
 * dvma_prealloc()
 *	Reserve IMMU_NPREPTES pages of DVMA space for a DMA handle and
 *	build the page tables for that range ahead of time.  The start
 *	of the range is recorded in ihp->ihp_predvma and a pointer to
 *	every leaf PTE of the range is saved in ihp->ihp_preptes[], so
 *	that a later bind can fill in the PTEs directly.
 *
 *	If the DVMA allocation fails, ihp->ihp_predvma is left at 0 and
 *	nothing is premapped.
 *
 * rdip: requesting device
 * ihp: per-handle private data to fill in
 * dma_attr: DMA attributes of the handle
 */
static void
dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr)
{
	int nlevels;
	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp;
	uint64_t dvma, n;
	size_t xsize, align;
	uint64_t minaddr, maxaddr, dmamax;
	int on, npte, pindex;
	hw_pdte_t *shwp;
	immu_t *immu;
	domain_t *domain;

	/* parameters */
	domain = IMMU_DEVI(rdip)->imd_domain;
	immu = domain->dom_immu;
	nlevels = immu->immu_dvma_nlevels;
	xsize = IMMU_NPREPTES * IMMU_PAGESIZE;
	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
	minaddr = dma_attr->dma_attr_addr_lo;
	/* the upper limit is the segment boundary when bouncing on it */
	if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG)
		dmamax = dma_attr->dma_attr_seg;
	else
		dmamax = dma_attr->dma_attr_addr_hi;
	maxaddr = dmamax + 1;

	/* dmamax + 1 wrapped around; fall back to dmamax itself */
	if (maxaddr < dmamax)
		maxaddr = dmamax;

	/* never sleeps: on failure the handle simply gets no premapping */
	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
	    xsize, align, 0, dma_attr->dma_attr_seg + 1,
	    (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP);

	ihp->ihp_predvma = dvma;
	ihp->ihp_npremapped = 0;
	if (dvma == 0)
		return;

	n = IMMU_NPREPTES;
	pindex = 0;

	/*
	 * Set up a mapping at address 0, just so that all PDPs get allocated
	 * now. Although this initial mapping should never be used,
	 * explicitly set it to read-only, just to be safe.
2498 */ 2499 while (n > 0) { 2500 xlate_setup(dvma, xlate, nlevels); 2501 2502 (void) PDE_set_all(immu, domain, xlate, nlevels, rdip, 2503 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 2504 2505 xlp = &xlate[1]; 2506 shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr) 2507 + xlp->xlt_idx; 2508 on = n; 2509 2510 PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie, 2511 1, rdip, IMMU_FLAGS_READ); 2512 2513 npte = on - n; 2514 2515 while (npte > 0) { 2516 ihp->ihp_preptes[pindex++] = shwp; 2517 #ifdef BUGGY_DRIVERS 2518 PDTE_CLEAR_WRITE(*shwp); 2519 #endif 2520 shwp++; 2521 npte--; 2522 } 2523 } 2524 } 2525 2526 static void 2527 dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp) 2528 { 2529 domain_t *domain; 2530 2531 domain = IMMU_DEVI(rdip)->imd_domain; 2532 2533 if (ihp->ihp_predvma != 0) { 2534 dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip); 2535 vmem_free(domain->dom_dvma_arena, 2536 (void *)(uintptr_t)ihp->ihp_predvma, 2537 IMMU_NPREPTES * IMMU_PAGESIZE); 2538 } 2539 } 2540 2541 static void 2542 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages) 2543 { 2544 uint64_t size = npages * IMMU_PAGESIZE; 2545 2546 if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) 2547 return; 2548 2549 vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size); 2550 } 2551 2552 static int 2553 immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle, 2554 immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq, 2555 ddi_dma_obj_t *dma_out) 2556 { 2557 domain_t *domain; 2558 immu_t *immu; 2559 immu_flags_t immu_flags; 2560 ddi_dma_atyp_t buftype; 2561 ddi_dma_obj_t *dmar_object; 2562 ddi_dma_attr_t *attrp; 2563 uint64_t offset, paddr, dvma, sdvma, rwmask; 2564 size_t npages, npgalloc; 2565 uint_t psize, size, pcnt, dmax; 2566 page_t **pparray; 2567 caddr_t vaddr; 2568 page_t *page; 2569 struct as *vas; 2570 immu_dcookie_t *dcookies; 2571 int pde_set; 2572 2573 domain = IMMU_DEVI(rdip)->imd_domain; 2574 immu = domain->dom_immu; 2575 immu_flags = dma_to_immu_flags(dmareq); 
2576 2577 attrp = &((ddi_dma_impl_t *)handle)->dmai_attr; 2578 2579 dmar_object = &dmareq->dmar_object; 2580 pparray = dmar_object->dmao_obj.virt_obj.v_priv; 2581 vaddr = dmar_object->dmao_obj.virt_obj.v_addr; 2582 buftype = dmar_object->dmao_type; 2583 size = dmar_object->dmao_size; 2584 2585 IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t, 2586 buftype, uint_t, size); 2587 2588 dcookies = &ihp->ihp_dcookies[0]; 2589 2590 pcnt = dmax = 0; 2591 2592 /* retrieve paddr, psize, offset from dmareq */ 2593 if (buftype == DMA_OTYP_PAGES) { 2594 page = dmar_object->dmao_obj.pp_obj.pp_pp; 2595 offset = dmar_object->dmao_obj.pp_obj.pp_offset & 2596 MMU_PAGEOFFSET; 2597 paddr = pfn_to_pa(page->p_pagenum) + offset; 2598 psize = MIN((MMU_PAGESIZE - offset), size); 2599 page = page->p_next; 2600 vas = dmar_object->dmao_obj.virt_obj.v_as; 2601 } else { 2602 if (vas == NULL) { 2603 vas = &kas; 2604 } 2605 offset = (uintptr_t)vaddr & MMU_PAGEOFFSET; 2606 if (pparray != NULL) { 2607 paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset; 2608 psize = MIN((MMU_PAGESIZE - offset), size); 2609 pcnt++; 2610 } else { 2611 paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, 2612 vaddr)) + offset; 2613 psize = MIN(size, (MMU_PAGESIZE - offset)); 2614 vaddr += psize; 2615 } 2616 } 2617 2618 npgalloc = IMMU_BTOPR(size + offset); 2619 2620 if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) { 2621 #ifdef BUGGY_DRIVERS 2622 rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask; 2623 #else 2624 rwmask = immu->immu_ptemask; 2625 if (immu_flags & IMMU_FLAGS_READ) 2626 rwmask |= PDTE_MASK_R; 2627 if (immu_flags & IMMU_FLAGS_WRITE) 2628 rwmask |= PDTE_MASK_W; 2629 #endif 2630 #ifdef DEBUG 2631 rwmask |= PDTE_MASK_P; 2632 #endif 2633 sdvma = ihp->ihp_predvma; 2634 ihp->ihp_npremapped = npgalloc; 2635 *ihp->ihp_preptes[0] = 2636 PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask; 2637 } else { 2638 ihp->ihp_npremapped = 0; 2639 sdvma = dvma_alloc(domain, attrp, npgalloc, 2640 dmareq->dmar_fp 
== DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP); 2641 if (sdvma == 0) 2642 return (DDI_DMA_NORESOURCES); 2643 2644 dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET); 2645 dcookies[0].dck_npages = 1; 2646 } 2647 2648 IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc, 2649 uint64_t, sdvma); 2650 2651 dvma = sdvma; 2652 pde_set = 0; 2653 npages = 1; 2654 size -= psize; 2655 while (size > 0) { 2656 /* get the size for this page (i.e. partial or full page) */ 2657 psize = MIN(size, MMU_PAGESIZE); 2658 if (buftype == DMA_OTYP_PAGES) { 2659 /* get the paddr from the page_t */ 2660 paddr = pfn_to_pa(page->p_pagenum); 2661 page = page->p_next; 2662 } else if (pparray != NULL) { 2663 /* index into the array of page_t's to get the paddr */ 2664 paddr = pfn_to_pa(pparray[pcnt]->p_pagenum); 2665 pcnt++; 2666 } else { 2667 /* call into the VM to get the paddr */ 2668 paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr)); 2669 vaddr += psize; 2670 } 2671 2672 if (ihp->ihp_npremapped > 0) { 2673 *ihp->ihp_preptes[npages] = 2674 PDTE_PADDR(paddr) | rwmask; 2675 } else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) { 2676 dcookies[dmax].dck_npages++; 2677 } else { 2678 /* No, we need a new dcookie */ 2679 if (dmax == (IMMU_NDCK - 1)) { 2680 /* 2681 * Ran out of dcookies. Map them now. 2682 */ 2683 if (dvma_map(domain, dvma, 2684 npages, dcookies, dmax + 1, rdip, 2685 immu_flags)) 2686 pde_set++; 2687 2688 IMMU_DPROBE4(immu__dvmamap__early, 2689 dev_info_t *, rdip, uint64_t, dvma, 2690 uint_t, npages, uint_t, dmax+1); 2691 2692 dvma += (npages << IMMU_PAGESHIFT); 2693 npages = 0; 2694 dmax = 0; 2695 } else { 2696 dmax++; 2697 } 2698 dcookies[dmax].dck_paddr = paddr; 2699 dcookies[dmax].dck_npages = 1; 2700 } 2701 size -= psize; 2702 if (npages != 0) 2703 npages++; 2704 } 2705 2706 /* 2707 * Finish up, mapping all, or all of the remaining, 2708 * physical memory ranges. 
*/
	if (ihp->ihp_npremapped == 0 && npages > 0) {
		IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \
		    uint64_t, dvma, uint_t, npages, uint_t, dmax+1);

		if (dvma_map(domain, dvma, npages, dcookies,
		    dmax + 1, rdip, immu_flags))
			pde_set++;
	}

	/*
	 * Invalidate the IOTLB.  The invalidation covers the whole range
	 * starting at sdvma; if any PDE was newly set, the non-leaf
	 * variant (TLB_IVA_WHOLE) is requested.
	 */
	immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc,
	    pde_set > 0 ? TLB_IVA_WHOLE : TLB_IVA_LEAF,
	    &ihp->ihp_inv_wait);

	/* describe the mapping as one DVMA segment kept in the handle */
	ihp->ihp_ndvseg = 1;
	ihp->ihp_dvseg[0].dvs_start = sdvma;
	ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size;

	dma_out->dmao_size = dmar_object->dmao_size;
	dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET;
	dma_out->dmao_obj.dvma_obj.dv_nseg = 1;
	dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0];
	dma_out->dmao_type = DMA_OTYP_DVADDR;

	return (DDI_DMA_MAPPED);
}

/*
 * immu_unmap_dvmaseg()
 *	release the DVMA range described by the first segment of dmao.
 *	The page table entries are only cleared in DEBUG kernels; in all
 *	kernels the range is handed back through dvma_free().
 *
 *	Always returns DDI_SUCCESS.
 */
static int
immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao)
{
	uint64_t dvma, npages;
	domain_t *domain;
	struct dvmaseg *dvs;

	domain = IMMU_DEVI(rdip)->imd_domain;
	dvs = dmao->dmao_obj.dvma_obj.dv_seg;

	dvma = dvs[0].dvs_start;
	/* pages covered by the segment, including its leading offset */
	npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off);

#ifdef DEBUG
	/* Unmap only in DEBUG mode */
	dvma_unmap(domain, dvma, npages, rdip);
#endif
	dvma_free(domain, dvma, npages);

	IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages,
	    uint64_t, dvma);

#ifdef DEBUG
	/*
	 * In the DEBUG case, the unmap was actually done,
	 * but an IOTLB flush was not done. So, an explicit
	 * write back flush is needed.
2764 */ 2765 immu_regs_wbf_flush(domain->dom_immu); 2766 #endif 2767 2768 return (DDI_SUCCESS); 2769 } 2770 2771 /* ############################# Functions exported ######################## */ 2772 2773 /* 2774 * setup the DVMA subsystem 2775 * this code runs only for the first IOMMU unit 2776 */ 2777 void 2778 immu_dvma_setup(list_t *listp) 2779 { 2780 immu_t *immu; 2781 uint_t kval; 2782 size_t nchains; 2783 2784 /* locks */ 2785 mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL); 2786 2787 /* Create lists */ 2788 list_create(&immu_unity_domain_list, sizeof (domain_t), 2789 offsetof(domain_t, dom_maptype_node)); 2790 list_create(&immu_xlate_domain_list, sizeof (domain_t), 2791 offsetof(domain_t, dom_maptype_node)); 2792 2793 /* Setup BDF domain hash */ 2794 nchains = 0xff; 2795 kval = mod_hash_iddata_gen(nchains); 2796 2797 bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH", 2798 nchains, mod_hash_null_keydtor, mod_hash_null_valdtor, 2799 mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp, 2800 KM_NOSLEEP); 2801 2802 immu = list_head(listp); 2803 for (; immu; immu = list_next(listp, immu)) { 2804 create_unity_domain(immu); 2805 did_init(immu); 2806 context_init(immu); 2807 immu->immu_dvma_setup = B_TRUE; 2808 } 2809 } 2810 2811 /* 2812 * Startup up one DVMA unit 2813 */ 2814 void 2815 immu_dvma_startup(immu_t *immu) 2816 { 2817 if (immu_gfxdvma_enable == B_FALSE && 2818 immu->immu_dvma_gfx_only == B_TRUE) { 2819 return; 2820 } 2821 2822 /* 2823 * DVMA will start once IOMMU is "running" 2824 */ 2825 immu->immu_dvma_running = B_TRUE; 2826 } 2827 2828 /* 2829 * immu_dvma_physmem_update() 2830 * called when the installed memory on a 2831 * system increases, to expand domain DVMA 2832 * for domains with UNITY mapping 2833 */ 2834 void 2835 immu_dvma_physmem_update(uint64_t addr, uint64_t size) 2836 { 2837 uint64_t start; 2838 uint64_t npages; 2839 int dcount; 2840 immu_dcookie_t dcookies[1] = {0}; 2841 domain_t *domain; 2842 2843 /* 2844 * Just 
walk the system-wide list of domains with
	 * UNITY mapping. Both the list of *all* domains
	 * and *UNITY* domains is protected by the same
	 * single lock
	 */
	mutex_enter(&immu_domain_lock);
	domain = list_head(&immu_unity_domain_list);
	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
		/*
		 * Nothing to do if the IOMMU supports passthrough.
		 */
		if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap))
			continue;

		/* There is no vmem_arena for unity domains. Just map it */
		ddi_err(DER_LOG, domain->dom_dip,
		    "iommu: unity-domain: Adding map "
		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);

		/*
		 * The range actually mapped is page granular: it starts
		 * at addr rounded down and covers one page more than
		 * size rounded up, so it can be larger than what the
		 * message above reports.
		 */
		start = IMMU_ROUNDOWN(addr);
		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;

		/* unity mapping: DVMA == physical address */
		dcookies[0].dck_paddr = start;
		dcookies[0].dck_npages = npages;
		dcount = 1;
		(void) dvma_map(domain, start, npages,
		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);

	}
	mutex_exit(&immu_domain_lock);
}

/*
 * immu_dvma_device_setup()
 *	prepare a device for DVMA: find its IOMMU unit, redirect
 *	"isa" children and "agpgart" to the devinfo node that actually
 *	does the DMA, find or create the device's domain and update the
 *	root and context entries for it.
 *
 *	Returns DDI_SUCCESS, or DDI_DMA_NORESOURCES if no IOMMU unit or
 *	domain could be set up for the device.
 */
int
immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags)
{
	dev_info_t *ddip, *odip;
	immu_t *immu;
	domain_t *domain;

	/* remember the original dip; rdip may be redirected below */
	odip = rdip;

	immu = immu_dvma_get_immu(rdip, immu_flags);
	if (immu == NULL) {
		/*
		 * possible that there is no IOMMU unit for this device
		 * - BIOS bugs are one example.
*/
		ddi_err(DER_WARN, rdip, "No iommu unit found for device");
		return (DDI_DMA_NORESOURCES);
	}

	/*
	 * redirect isa devices attached under lpc to lpc dip
	 */
	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
		if (rdip == NULL) {
			/* NOTE(review): rdip is NULL when passed here */
			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
			/*NOTREACHED*/
		}
	}

	/* Reset immu, as redirection can change IMMU */
	immu = NULL;

	/*
	 * for gart, redirect to the real graphic devinfo
	 */
	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
		rdip = get_gfx_devinfo(rdip);
		if (rdip == NULL) {
			/* NOTE(review): rdip is NULL when passed here */
			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
			/*NOTREACHED*/
		}
	}

	/*
	 * Setup DVMA domain for the device. This does
	 * work only the first time we do DVMA for a
	 * device.
	 */
	ddip = NULL;
	domain = device_domain(rdip, &ddip, immu_flags);
	if (domain == NULL) {
		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
		return (DDI_DMA_NORESOURCES);
	}

	/* from here on use the IOMMU unit the domain belongs to */
	immu = domain->dom_immu;

	/*
	 * If a domain is found, we must also have a domain dip
	 * which is the topmost ancestor dip of rdip that shares
	 * the same domain with rdip.
2938 */ 2939 if (domain->dom_did == 0 || ddip == NULL) { 2940 ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)", 2941 domain->dom_did, ddip); 2942 return (DDI_DMA_NORESOURCES); 2943 } 2944 2945 if (odip != rdip) 2946 set_domain(odip, ddip, domain); 2947 2948 /* 2949 * Update the root and context entries 2950 */ 2951 if (immu_context_update(immu, domain, ddip, rdip, immu_flags) 2952 != DDI_SUCCESS) { 2953 ddi_err(DER_MODE, rdip, "DVMA map: context update failed"); 2954 return (DDI_DMA_NORESOURCES); 2955 } 2956 2957 return (DDI_SUCCESS); 2958 } 2959 2960 int 2961 immu_map_memrange(dev_info_t *rdip, memrng_t *mrng) 2962 { 2963 immu_dcookie_t dcookies[1] = {0}; 2964 boolean_t pde_set; 2965 immu_t *immu; 2966 domain_t *domain; 2967 immu_inv_wait_t iw; 2968 2969 dcookies[0].dck_paddr = mrng->mrng_start; 2970 dcookies[0].dck_npages = mrng->mrng_npages; 2971 2972 domain = IMMU_DEVI(rdip)->imd_domain; 2973 immu = domain->dom_immu; 2974 2975 pde_set = dvma_map(domain, mrng->mrng_start, 2976 mrng->mrng_npages, dcookies, 1, rdip, 2977 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 2978 2979 immu_init_inv_wait(&iw, "memrange", B_TRUE); 2980 2981 immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start, 2982 mrng->mrng_npages, pde_set == B_TRUE ? 2983 TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw); 2984 2985 return (DDI_SUCCESS); 2986 } 2987 2988 immu_devi_t * 2989 immu_devi_get(dev_info_t *rdip) 2990 { 2991 immu_devi_t *immu_devi; 2992 volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu); 2993 2994 /* Just want atomic reads. 
No need for lock */ 2995 immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr, 2996 0); 2997 return (immu_devi); 2998 } 2999 3000 /*ARGSUSED*/ 3001 int 3002 immu_hdl_priv_ctor(void *buf, void *arg, int kmf) 3003 { 3004 immu_hdl_priv_t *ihp; 3005 3006 ihp = buf; 3007 immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE); 3008 3009 return (0); 3010 } 3011 3012 /* 3013 * iommulib interface functions 3014 */ 3015 static int 3016 immu_probe(iommulib_handle_t handle, dev_info_t *dip) 3017 { 3018 immu_devi_t *immu_devi; 3019 int ret; 3020 3021 if (!immu_enable) 3022 return (DDI_FAILURE); 3023 3024 /* 3025 * Make sure the device has all the IOMMU structures 3026 * initialized. If this device goes through an IOMMU 3027 * unit (e.g. this probe function returns success), 3028 * this will be called at most N times, with N being 3029 * the number of IOMMUs in the system. 3030 * 3031 * After that, when iommulib_nex_open succeeds, 3032 * we can always assume that this device has all 3033 * the structures initialized. IOMMU_USED(dip) will 3034 * be true. There is no need to find the controlling 3035 * IOMMU/domain again. 3036 */ 3037 ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP); 3038 if (ret != DDI_SUCCESS) 3039 return (ret); 3040 3041 immu_devi = IMMU_DEVI(dip); 3042 3043 /* 3044 * For unity domains, there is no need to call in to 3045 * the IOMMU code. 
3046 */ 3047 if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID) 3048 return (DDI_FAILURE); 3049 3050 if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle)) 3051 return (DDI_SUCCESS); 3052 3053 return (DDI_FAILURE); 3054 } 3055 3056 /*ARGSUSED*/ 3057 static int 3058 immu_allochdl(iommulib_handle_t handle, 3059 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, 3060 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep) 3061 { 3062 int ret; 3063 immu_hdl_priv_t *ihp; 3064 immu_t *immu; 3065 3066 ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp, 3067 arg, dma_handlep); 3068 if (ret == DDI_SUCCESS) { 3069 immu = IMMU_DEVI(rdip)->imd_immu; 3070 3071 ihp = kmem_cache_alloc(immu->immu_hdl_cache, 3072 waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP); 3073 if (ihp == NULL) { 3074 (void) iommulib_iommu_dma_freehdl(dip, rdip, 3075 *dma_handlep); 3076 return (DDI_DMA_NORESOURCES); 3077 } 3078 3079 if (IMMU_DEVI(rdip)->imd_use_premap) 3080 dvma_prealloc(rdip, ihp, attr); 3081 else { 3082 ihp->ihp_npremapped = 0; 3083 ihp->ihp_predvma = 0; 3084 } 3085 ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep, 3086 ihp); 3087 } 3088 return (ret); 3089 } 3090 3091 /*ARGSUSED*/ 3092 static int 3093 immu_freehdl(iommulib_handle_t handle, 3094 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) 3095 { 3096 immu_hdl_priv_t *ihp; 3097 3098 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3099 if (ihp != NULL) { 3100 if (IMMU_DEVI(rdip)->imd_use_premap) 3101 dvma_prefree(rdip, ihp); 3102 kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp); 3103 } 3104 3105 return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle)); 3106 } 3107 3108 3109 /*ARGSUSED*/ 3110 static int 3111 immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, 3112 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 3113 struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep, 3114 uint_t *ccountp) 3115 { 3116 int ret; 3117 immu_hdl_priv_t 
*ihp; 3118 3119 ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle, 3120 dma_req, cookiep, ccountp); 3121 3122 if (ret == DDI_DMA_MAPPED) { 3123 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3124 immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait); 3125 } 3126 3127 return (ret); 3128 } 3129 3130 /*ARGSUSED*/ 3131 static int 3132 immu_unbindhdl(iommulib_handle_t handle, 3133 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) 3134 { 3135 return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle)); 3136 } 3137 3138 /*ARGSUSED*/ 3139 static int 3140 immu_sync(iommulib_handle_t handle, dev_info_t *dip, 3141 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, 3142 size_t len, uint_t cachefl) 3143 { 3144 return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len, 3145 cachefl)); 3146 } 3147 3148 /*ARGSUSED*/ 3149 static int 3150 immu_win(iommulib_handle_t handle, dev_info_t *dip, 3151 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, 3152 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, 3153 uint_t *ccountp) 3154 { 3155 return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp, 3156 lenp, cookiep, ccountp)); 3157 } 3158 3159 /*ARGSUSED*/ 3160 static int 3161 immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, 3162 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 3163 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao) 3164 { 3165 immu_hdl_priv_t *ihp; 3166 3167 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3168 3169 return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao)); 3170 } 3171 3172 /*ARGSUSED*/ 3173 static int 3174 immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, 3175 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao) 3176 { 3177 immu_hdl_priv_t *ihp; 3178 3179 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3180 if (ihp->ihp_npremapped > 0) 3181 return (DDI_SUCCESS); 3182 return (immu_unmap_dvmaseg(rdip, dmao)); 3183 } 
3184