1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Portions Copyright (c) 2010, Oracle and/or its affiliates. 23 * All rights reserved. 24 */ 25 /* 26 * Copyright (c) 2009, Intel Corporation. 27 * All rights reserved. 28 */ 29 30 /* 31 * DVMA code 32 * This file contains Intel IOMMU code that deals with DVMA 33 * i.e. DMA remapping. 34 */ 35 36 #include <sys/sysmacros.h> 37 #include <sys/pcie.h> 38 #include <sys/pci_cfgspace.h> 39 #include <vm/hat_i86.h> 40 #include <sys/memlist.h> 41 #include <sys/acpi/acpi.h> 42 #include <sys/acpica.h> 43 #include <sys/modhash.h> 44 #include <sys/immu.h> 45 #include <sys/x86_archext.h> 46 #include <sys/archsystm.h> 47 48 #undef TEST 49 50 /* 51 * Macros based on PCI spec 52 */ 53 #define IMMU_PCI_REV2CLASS(r) ((r) >> 8) /* classcode from revid */ 54 #define IMMU_PCI_CLASS2BASE(c) ((c) >> 16) /* baseclass from classcode */ 55 #define IMMU_PCI_CLASS2SUB(c) (((c) >> 8) & 0xff); /* classcode */ 56 57 #define IMMU_CONTIG_PADDR(d, p) \ 58 ((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p)) 59 60 typedef struct dvma_arg { 61 immu_t *dva_immu; 62 dev_info_t *dva_rdip; 63 dev_info_t *dva_ddip; 64 domain_t *dva_domain; 65 int dva_level; 66 immu_flags_t dva_flags; 67 list_t *dva_list; 68 int dva_error; 69 } dvma_arg_t; 70 71 static domain_t *domain_create(immu_t *immu, dev_info_t *ddip, 72 dev_info_t *rdip, immu_flags_t immu_flags); 73 static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus, 74 int dev, int func, immu_flags_t immu_flags); 75 static void destroy_immu_devi(immu_devi_t *immu_devi); 76 static boolean_t dvma_map(domain_t *domain, uint64_t sdvma, 77 uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, 78 immu_flags_t immu_flags); 79 80 /* Extern globals */ 81 extern struct memlist *phys_install; 82 83 /* 84 * iommulib interface functions. 
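 * These are the DDI DMA operations exported to the common iommulib
 * framework through the immulib_ops vector defined further below.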
85 */ 86 static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip); 87 static int immu_allochdl(iommulib_handle_t handle, 88 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, 89 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep); 90 static int immu_freehdl(iommulib_handle_t handle, 91 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); 92 static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, 93 dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req, 94 ddi_dma_cookie_t *cookiep, uint_t *ccountp); 95 static int immu_unbindhdl(iommulib_handle_t handle, 96 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); 97 static int immu_sync(iommulib_handle_t handle, dev_info_t *dip, 98 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len, 99 uint_t cachefl); 100 static int immu_win(iommulib_handle_t handle, dev_info_t *dip, 101 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, 102 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp); 103 static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, 104 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 105 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao); 106 static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, 107 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao); 108 static int immu_map(iommulib_handle_t handle, dev_info_t *dip, 109 dev_info_t *rdip, struct ddi_dma_req *dmareq, 110 ddi_dma_handle_t *dma_handle); 111 static int immu_mctl(iommulib_handle_t handle, dev_info_t *dip, 112 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 113 enum ddi_dma_ctlops request, off_t *offp, size_t *lenp, 114 caddr_t *objpp, uint_t cachefl); 115 116 /* static Globals */ 117 118 /* 119 * Used to setup DMA objects (memory regions) 120 * for DMA reads by IOMMU units 121 */ 122 static ddi_dma_attr_t immu_dma_attr = { 123 DMA_ATTR_V0, 124 0U, 125 0xffffffffffffffffULL, 126 0xffffffffU, 127 MMU_PAGESIZE, /* MMU page aligned */ 128 0x1, 129 0x1, 130 0xffffffffU, 131 0xffffffffffffffffULL, 132 1, 133 4, 134 0 135 }; 136 137 static ddi_device_acc_attr_t immu_acc_attr = { 138 DDI_DEVICE_ATTR_V0, 139 DDI_NEVERSWAP_ACC, 140 DDI_STRICTORDER_ACC 141 }; 142 143 struct iommulib_ops immulib_ops = { 144 IOMMU_OPS_VERSION, 145 INTEL_IOMMU, 146 "Intel IOMMU", 147 NULL, 148 immu_probe, 149 immu_allochdl, 150 immu_freehdl, 151 immu_bindhdl, 152 immu_unbindhdl, 153 immu_sync, 154 immu_win, 155 immu_mapobject, 156 immu_unmapobject, 157 immu_map, 158 immu_mctl 159 }; 160 161 /* 162 * Fake physical address range used to set up initial prealloc mappings. 163 * This memory is never actually accessed. It is mapped read-only, 164 * and is overwritten as soon as the first DMA bind operation is 165 * performed. Since 0 is a special case, just start at the 2nd 166 * physical page. 167 */ 168 169 static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES }; 170 171 /* globals private to this file */ 172 static kmutex_t immu_domain_lock; 173 static list_t immu_unity_domain_list; 174 static list_t immu_xlate_domain_list; 175 176 /* structure used to store idx into each level of the page tables */ 177 typedef struct xlate { 178 int xlt_level; 179 uint_t xlt_idx; 180 pgtable_t *xlt_pgtable; 181 } xlate_t; 182 183 /* 0 is reserved by Vt-d spec. 
Solaris reserves 1 */ 184 #define IMMU_UNITY_DID 1 185 186 static mod_hash_t *bdf_domain_hash; 187 188 int immu_use_alh; 189 int immu_use_tm; 190 191 static domain_t * 192 bdf_domain_lookup(immu_devi_t *immu_devi) 193 { 194 domain_t *domain; 195 int16_t seg = immu_devi->imd_seg; 196 int16_t bus = immu_devi->imd_bus; 197 int16_t devfunc = immu_devi->imd_devfunc; 198 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc); 199 200 if (seg < 0 || bus < 0 || devfunc < 0) { 201 return (NULL); 202 } 203 204 domain = NULL; 205 if (mod_hash_find(bdf_domain_hash, 206 (void *)bdf, (void *)&domain) == 0) { 207 ASSERT(domain); 208 ASSERT(domain->dom_did > 0); 209 return (domain); 210 } else { 211 return (NULL); 212 } 213 } 214 215 static void 216 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain) 217 { 218 int16_t seg = immu_devi->imd_seg; 219 int16_t bus = immu_devi->imd_bus; 220 int16_t devfunc = immu_devi->imd_devfunc; 221 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc); 222 223 if (seg < 0 || bus < 0 || devfunc < 0) { 224 return; 225 } 226 227 (void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain); 228 } 229 230 static int 231 match_lpc(dev_info_t *pdip, void *arg) 232 { 233 immu_devi_t *immu_devi; 234 dvma_arg_t *dvap = (dvma_arg_t *)arg; 235 236 if (list_is_empty(dvap->dva_list)) { 237 return (DDI_WALK_TERMINATE); 238 } 239 240 immu_devi = list_head(dvap->dva_list); 241 for (; immu_devi; immu_devi = list_next(dvap->dva_list, 242 immu_devi)) { 243 if (immu_devi->imd_dip == pdip) { 244 dvap->dva_ddip = pdip; 245 dvap->dva_error = DDI_SUCCESS; 246 return (DDI_WALK_TERMINATE); 247 } 248 } 249 250 return (DDI_WALK_CONTINUE); 251 } 252 253 static void 254 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu) 255 { 256 list_t *spclist = NULL; 257 immu_devi_t *immu_devi; 258 259 immu_devi = IMMU_DEVI(dip); 260 if (immu_devi->imd_display == B_TRUE) { 261 spclist = &(immu->immu_dvma_gfx_list); 262 } else if (immu_devi->imd_lpc == B_TRUE) { 263 spclist = &(immu->immu_dvma_lpc_list); 264 } 265 266 if (spclist) { 267 mutex_enter(&(immu->immu_lock)); 268 list_insert_head(spclist, immu_devi); 269 mutex_exit(&(immu->immu_lock)); 270 } 271 } 272 273 /* 274 * Set the immu_devi struct in the immu_devi field of a devinfo node 275 */ 276 int 277 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags) 278 { 279 int bus, dev, func; 280 immu_devi_t *new_imd; 281 immu_devi_t *immu_devi; 282 283 immu_devi = immu_devi_get(dip); 284 if (immu_devi != NULL) { 285 return (DDI_SUCCESS); 286 } 287 288 bus = dev = func = -1; 289 290 /* 291 * Assume a new immu_devi struct is needed 292 */ 293 if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) { 294 /* 295 * No BDF. Set bus = -1 to indicate this. 296 * We still need to create a immu_devi struct 297 * though 298 */ 299 bus = -1; 300 dev = 0; 301 func = 0; 302 } 303 304 new_imd = create_immu_devi(dip, bus, dev, func, immu_flags); 305 if (new_imd == NULL) { 306 ddi_err(DER_WARN, dip, "Failed to create immu_devi " 307 "structure"); 308 return (DDI_FAILURE); 309 } 310 311 /* 312 * Check if some other thread allocated a immu_devi while we 313 * didn't own the lock. 
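 * If so, the copy allocated above is destroyed and the existing
 * immu_devi is kept.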
314 */ 315 mutex_enter(&(DEVI(dip)->devi_lock)); 316 if (IMMU_DEVI(dip) == NULL) { 317 IMMU_DEVI_SET(dip, new_imd); 318 } else { 319 destroy_immu_devi(new_imd); 320 } 321 mutex_exit(&(DEVI(dip)->devi_lock)); 322 323 return (DDI_SUCCESS); 324 } 325 326 static dev_info_t * 327 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags) 328 { 329 dvma_arg_t dvarg = {0}; 330 dvarg.dva_list = &(immu->immu_dvma_lpc_list); 331 dvarg.dva_rdip = rdip; 332 dvarg.dva_error = DDI_FAILURE; 333 334 if (immu_walk_ancestor(rdip, NULL, match_lpc, 335 &dvarg, NULL, immu_flags) != DDI_SUCCESS) { 336 ddi_err(DER_MODE, rdip, "Could not walk ancestors to " 337 "find lpc_devinfo for ISA device"); 338 return (NULL); 339 } 340 341 if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) { 342 ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for " 343 "ISA device"); 344 return (NULL); 345 } 346 347 return (dvarg.dva_ddip); 348 } 349 350 static dev_info_t * 351 get_gfx_devinfo(dev_info_t *rdip) 352 { 353 immu_t *immu; 354 immu_devi_t *immu_devi; 355 list_t *list_gfx; 356 357 /* 358 * The GFX device may not be on the same iommu unit as "agpgart" 359 * so search globally 360 */ 361 immu_devi = NULL; 362 immu = list_head(&immu_list); 363 for (; immu; immu = list_next(&immu_list, immu)) { 364 list_gfx = &(immu->immu_dvma_gfx_list); 365 if (!list_is_empty(list_gfx)) { 366 immu_devi = list_head(list_gfx); 367 break; 368 } 369 } 370 371 if (immu_devi == NULL) { 372 ddi_err(DER_WARN, rdip, "iommu: No GFX device. " 373 "Cannot redirect agpgart"); 374 return (NULL); 375 } 376 377 ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s", 378 ddi_node_name(immu_devi->imd_dip)); 379 380 return (immu_devi->imd_dip); 381 } 382 383 static immu_flags_t 384 dma_to_immu_flags(struct ddi_dma_req *dmareq) 385 { 386 immu_flags_t flags = 0; 387 388 if (dmareq->dmar_fp == DDI_DMA_SLEEP) { 389 flags |= IMMU_FLAGS_SLEEP; 390 } else { 391 flags |= IMMU_FLAGS_NOSLEEP; 392 } 393 394 #ifdef BUGGY_DRIVERS 395 396 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 397 398 #else 399 /* 400 * Read and write flags need to be reversed. 401 * DMA_READ means read from device and write 402 * to memory. So DMA read means DVMA write. 403 */ 404 if (dmareq->dmar_flags & DDI_DMA_READ) 405 flags |= IMMU_FLAGS_WRITE; 406 407 if (dmareq->dmar_flags & DDI_DMA_WRITE) 408 flags |= IMMU_FLAGS_READ; 409 410 /* 411 * Some buggy drivers specify neither READ or WRITE 412 * For such drivers set both read and write permissions 413 */ 414 if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) { 415 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 416 } 417 #endif 418 419 return (flags); 420 } 421 422 /*ARGSUSED*/ 423 int 424 pgtable_ctor(void *buf, void *arg, int kmflag) 425 { 426 size_t actual_size = 0; 427 pgtable_t *pgtable; 428 int (*dmafp)(caddr_t); 429 caddr_t vaddr; 430 void *next; 431 uint_t flags; 432 immu_t *immu = arg; 433 434 pgtable = (pgtable_t *)buf; 435 436 dmafp = (kmflag & KM_NOSLEEP) ? 
	    DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;

	next = kmem_zalloc(IMMU_PAGESIZE, kmflag);
	if (next == NULL) {
		return (-1);
	}

	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
		kmem_free(next, IMMU_PAGESIZE);
		return (-1);
	}

	flags = DDI_DMA_CONSISTENT;
	if (!immu->immu_dvma_coherent)
		flags |= IOMEM_DATA_UC_WR_COMBINE;

	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
	    &immu_acc_attr, flags,
	    dmafp, NULL, &vaddr, &actual_size,
	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
		kmem_free(next, IMMU_PAGESIZE);
		return (-1);
	}

	/*
	 * Memory allocation failure. Maybe a temporary condition, so
	 * return an error rather than panic so we can try again later.
	 */
	if (actual_size < IMMU_PAGESIZE) {
		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
		kmem_free(next, IMMU_PAGESIZE);
		return (-1);
	}

	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
	pgtable->hwpg_vaddr = vaddr;
	pgtable->swpg_next_array = next;

	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);

	return (0);
}

/*ARGSUSED*/
void
pgtable_dtor(void *buf, void *arg)
{
	pgtable_t *pgtable;

	pgtable = (pgtable_t *)buf;

	/* destroy will panic if lock is held. */
	rw_destroy(&(pgtable->swpg_rwlock));

	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
	kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
}

/*
 * pgtable_alloc()
 *	allocate an IOMMU pgtable structure.
 *	This same struct is used for root and context tables as well.
 *	This routine allocates the following:
 *	- a pgtable_t struct
 *	- a HW page which holds PTEs/entries which is accessed by HW,
 *	  so we set up DMA for this page
 *	- a SW page which is only for our bookkeeping
 *	  (for example to hold pointers to the next level pgtable),
 *	  so a simple kmem_alloc suffices
 */
static pgtable_t *
pgtable_alloc(immu_t *immu, immu_flags_t immu_flags)
{
	pgtable_t *pgtable;
	int kmflags;

	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ?
KM_NOSLEEP : KM_SLEEP; 517 518 pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags); 519 if (pgtable == NULL) { 520 return (NULL); 521 } 522 return (pgtable); 523 } 524 525 static void 526 pgtable_zero(pgtable_t *pgtable) 527 { 528 bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE); 529 bzero(pgtable->swpg_next_array, IMMU_PAGESIZE); 530 } 531 532 static void 533 pgtable_free(immu_t *immu, pgtable_t *pgtable) 534 { 535 kmem_cache_free(immu->immu_pgtable_cache, pgtable); 536 } 537 538 /* 539 * Function to identify a display device from the PCI class code 540 */ 541 static boolean_t 542 device_is_display(uint_t classcode) 543 { 544 static uint_t disp_classes[] = { 545 0x000100, 546 0x030000, 547 0x030001 548 }; 549 int i, nclasses = sizeof (disp_classes) / sizeof (uint_t); 550 551 for (i = 0; i < nclasses; i++) { 552 if (classcode == disp_classes[i]) 553 return (B_TRUE); 554 } 555 return (B_FALSE); 556 } 557 558 /* 559 * Function that determines if device is PCIEX and/or PCIEX bridge 560 */ 561 static boolean_t 562 device_is_pciex( 563 uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib) 564 { 565 ushort_t cap; 566 ushort_t capsp; 567 ushort_t cap_count = PCI_CAP_MAX_PTR; 568 ushort_t status; 569 boolean_t is_pciex = B_FALSE; 570 571 *is_pcib = B_FALSE; 572 573 status = pci_getw_func(bus, dev, func, PCI_CONF_STAT); 574 if (!(status & PCI_STAT_CAP)) 575 return (B_FALSE); 576 577 capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR); 578 while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) { 579 capsp &= PCI_CAP_PTR_MASK; 580 cap = pci_getb_func(bus, dev, func, capsp); 581 582 if (cap == PCI_CAP_ID_PCI_E) { 583 status = pci_getw_func(bus, dev, func, capsp + 2); 584 /* 585 * See section 7.8.2 of PCI-Express Base Spec v1.0a 586 * for Device/Port Type. 587 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the 588 * device is a PCIE2PCI bridge 589 */ 590 *is_pcib = 591 ((status & PCIE_PCIECAP_DEV_TYPE_MASK) == 592 PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE; 593 is_pciex = B_TRUE; 594 } 595 596 capsp = (*pci_getb_func)(bus, dev, func, 597 capsp + PCI_CAP_NEXT_PTR); 598 } 599 600 return (is_pciex); 601 } 602 603 static boolean_t 604 device_use_premap(uint_t classcode) 605 { 606 if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET) 607 return (B_TRUE); 608 return (B_FALSE); 609 } 610 611 612 /* 613 * immu_dvma_get_immu() 614 * get the immu unit structure for a dev_info node 615 */ 616 immu_t * 617 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags) 618 { 619 immu_devi_t *immu_devi; 620 immu_t *immu; 621 622 /* 623 * check if immu unit was already found earlier. 624 * If yes, then it will be stashed in immu_devi struct. 625 */ 626 immu_devi = immu_devi_get(dip); 627 if (immu_devi == NULL) { 628 if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) { 629 /* 630 * May fail because of low memory. 
Return error rather
			 * than panic as we want the driver to retry later
			 */
			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
			    "No immu_devi structure");
			/*NOTREACHED*/
		}
		immu_devi = immu_devi_get(dip);
	}

	mutex_enter(&(DEVI(dip)->devi_lock));
	if (immu_devi->imd_immu) {
		immu = immu_devi->imd_immu;
		mutex_exit(&(DEVI(dip)->devi_lock));
		return (immu);
	}
	mutex_exit(&(DEVI(dip)->devi_lock));

	immu = immu_dmar_get_immu(dip);
	if (immu == NULL) {
		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
		    "Cannot find immu_t for device");
		/*NOTREACHED*/
	}

	/*
	 * Check if some other thread found immu
	 * while lock was not held
	 */
	immu_devi = immu_devi_get(dip);
	/* immu_devi should be present as we found it earlier */
	if (immu_devi == NULL) {
		ddi_err(DER_PANIC, dip,
		    "immu_dvma_get_immu: No immu_devi structure");
		/*NOTREACHED*/
	}

	mutex_enter(&(DEVI(dip)->devi_lock));
	if (immu_devi->imd_immu == NULL) {
		/* nobody else set it, so we should do it */
		immu_devi->imd_immu = immu;
		immu_devi_set_spclist(dip, immu);
	} else {
		/*
		 * if some other thread got immu before
		 * us, it should get the same results
		 */
		if (immu_devi->imd_immu != immu) {
			ddi_err(DER_PANIC, dip, "Multiple "
			    "immu units found for device. Expected (%p), "
			    "actual (%p)", (void *)immu,
			    (void *)immu_devi->imd_immu);
			mutex_exit(&(DEVI(dip)->devi_lock));
			/*NOTREACHED*/
		}
	}
	mutex_exit(&(DEVI(dip)->devi_lock));

	return (immu);
}


/* ############################# IMMU_DEVI code ############################ */

/*
 * Allocate an immu_devi structure and initialize it
 */
static immu_devi_t *
create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
    immu_flags_t immu_flags)
{
	uchar_t baseclass, subclass;
	uint_t classcode, revclass;
	immu_devi_t *immu_devi;
	boolean_t pciex = B_FALSE;
	int kmflags;
	boolean_t is_pcib = B_FALSE;

	/* bus == -1 indicates a non-PCI device (no BDF) */
	ASSERT(bus == -1 || bus >= 0);
	ASSERT(dev >= 0);
	ASSERT(func >= 0);

	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ?
KM_NOSLEEP : KM_SLEEP; 714 immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags); 715 if (immu_devi == NULL) { 716 ddi_err(DER_WARN, rdip, "Failed to allocate memory for " 717 "Intel IOMMU immu_devi structure"); 718 return (NULL); 719 } 720 immu_devi->imd_dip = rdip; 721 immu_devi->imd_seg = 0; /* Currently seg can only be 0 */ 722 immu_devi->imd_bus = bus; 723 immu_devi->imd_pcib_type = IMMU_PCIB_BAD; 724 725 if (bus == -1) { 726 immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF; 727 return (immu_devi); 728 } 729 730 immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func); 731 immu_devi->imd_sec = 0; 732 immu_devi->imd_sub = 0; 733 734 revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID); 735 736 classcode = IMMU_PCI_REV2CLASS(revclass); 737 baseclass = IMMU_PCI_CLASS2BASE(classcode); 738 subclass = IMMU_PCI_CLASS2SUB(classcode); 739 740 if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) { 741 742 immu_devi->imd_sec = pci_getb_func(bus, dev, func, 743 PCI_BCNF_SECBUS); 744 immu_devi->imd_sub = pci_getb_func(bus, dev, func, 745 PCI_BCNF_SUBBUS); 746 747 pciex = device_is_pciex(bus, dev, func, &is_pcib); 748 if (pciex == B_TRUE && is_pcib == B_TRUE) { 749 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI; 750 } else if (pciex == B_TRUE) { 751 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE; 752 } else { 753 immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI; 754 } 755 } else { 756 immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT; 757 } 758 759 /* check for certain special devices */ 760 immu_devi->imd_display = device_is_display(classcode); 761 immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) && 762 (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE; 763 immu_devi->imd_use_premap = device_use_premap(classcode); 764 765 immu_devi->imd_domain = NULL; 766 767 immu_devi->imd_dvma_flags = immu_global_dvma_flags; 768 769 return (immu_devi); 770 } 771 772 static void 773 destroy_immu_devi(immu_devi_t *immu_devi) 774 { 775 kmem_free(immu_devi, sizeof (immu_devi_t)); 776 } 777 778 static domain_t * 779 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp) 780 { 781 immu_devi_t *immu_devi; 782 domain_t *domain; 783 dev_info_t *ddip; 784 785 *ddipp = NULL; 786 787 immu_devi = immu_devi_get(rdip); 788 if (immu_devi == NULL) { 789 return (NULL); 790 } 791 792 mutex_enter(&(DEVI(rdip)->devi_lock)); 793 domain = immu_devi->imd_domain; 794 ddip = immu_devi->imd_ddip; 795 mutex_exit(&(DEVI(rdip)->devi_lock)); 796 797 if (domain) 798 *ddipp = ddip; 799 800 return (domain); 801 802 } 803 804 /* ############################# END IMMU_DEVI code ######################## */ 805 /* ############################# DOMAIN code ############################### */ 806 807 /* 808 * This routine always succeeds 809 */ 810 static int 811 did_alloc(immu_t *immu, dev_info_t *rdip, 812 dev_info_t *ddip, immu_flags_t immu_flags) 813 { 814 int did; 815 816 did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1, 817 (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP); 818 819 if (did == 0) { 820 ddi_err(DER_WARN, rdip, "device domain-id alloc error" 821 " domain-device: %s%d. immu unit is %s. 
Using "
		    "unity domain with domain-id (%d)",
		    ddi_driver_name(ddip), ddi_get_instance(ddip),
		    immu->immu_name, immu->immu_unity_domain->dom_did);
		did = immu->immu_unity_domain->dom_did;
	}

	return (did);
}

static int
get_branch_domain(dev_info_t *pdip, void *arg)
{
	immu_devi_t *immu_devi;
	domain_t *domain;
	dev_info_t *ddip;
	immu_t *immu;
	dvma_arg_t *dvp = (dvma_arg_t *)arg;

	/*
	 * The field dvp->dva_ddip is a work-in-progress
	 * and gets updated as we walk up the ancestor
	 * tree. The final ddip is set only when we reach
	 * the top of the tree. So the dvp->dva_ddip field cannot
	 * be relied on until we reach the top of the tree.
	 */

	/* immu_devi may not be set. */
	immu_devi = immu_devi_get(pdip);
	if (immu_devi == NULL) {
		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
			dvp->dva_error = DDI_FAILURE;
			return (DDI_WALK_TERMINATE);
		}
	}

	immu_devi = immu_devi_get(pdip);
	immu = immu_devi->imd_immu;
	if (immu == NULL)
		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);

	/*
	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
	 * terminate the walk (since the device under the PCIE bridge
	 * is a PCIE device and has an independent entry in the
	 * root/context table)
	 */
	if (dvp->dva_rdip != pdip &&
	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
		return (DDI_WALK_TERMINATE);
	}

	/*
	 * In order to be a domain-dip, it must be a PCI device, i.e.
	 * it must have a valid BDF. This also eliminates the root complex.
	 */
	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
		ASSERT(immu_devi->imd_bus >= 0);
		ASSERT(immu_devi->imd_devfunc >= 0);
		dvp->dva_ddip = pdip;
	}

	if (immu_devi->imd_display == B_TRUE ||
	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
		dvp->dva_domain = immu->immu_unity_domain;
		/* continue walking to find ddip */
		return (DDI_WALK_CONTINUE);
	}

	mutex_enter(&(DEVI(pdip)->devi_lock));
	domain = immu_devi->imd_domain;
	ddip = immu_devi->imd_ddip;
	mutex_exit(&(DEVI(pdip)->devi_lock));

	if (domain && ddip) {
		/* if domain is set, it must be the same */
		if (dvp->dva_domain) {
			ASSERT(domain == dvp->dva_domain);
		}
		dvp->dva_domain = domain;
		dvp->dva_ddip = ddip;
		return (DDI_WALK_TERMINATE);
	}

	/* Domain may already be set, continue walking so that ddip gets set */
	if (dvp->dva_domain) {
		return (DDI_WALK_CONTINUE);
	}

	/* domain is not set in either immu_devi or dvp */
	domain = bdf_domain_lookup(immu_devi);
	if (domain == NULL) {
		return (DDI_WALK_CONTINUE);
	}

	/* ok, the BDF hash had a domain for this BDF. */

	/* Grab lock again to check if something else set immu_devi fields */
	mutex_enter(&(DEVI(pdip)->devi_lock));
	if (immu_devi->imd_domain != NULL) {
		dvp->dva_domain = domain;
	} else {
		dvp->dva_domain = domain;
	}
	mutex_exit(&(DEVI(pdip)->devi_lock));

	/*
	 * walk upwards until the topmost PCI bridge is found
	 */
	return (DDI_WALK_CONTINUE);

}

static void
map_unity_domain(domain_t *domain)
{
	struct memlist *mp;
	uint64_t start;
	uint64_t npages;
	immu_dcookie_t dcookies[1] = {0};
	int dcount = 0;

	/*
	 * UNITY arenas are a mirror of the physical memory
	 * installed on the system.
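	 * Every span in phys_install, and further below the BIOS
	 * reserved ranges in bios_rsvd, is mapped with both read and
	 * write permission so that devices in the unity domain can
	 * DMA to any physical address.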
947 */ 948 949 #ifdef BUGGY_DRIVERS 950 /* 951 * Dont skip page0. Some broken HW/FW access it. 952 */ 953 dcookies[0].dck_paddr = 0; 954 dcookies[0].dck_npages = 1; 955 dcount = 1; 956 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, 957 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); 958 #endif 959 960 memlist_read_lock(); 961 962 mp = phys_install; 963 964 if (mp->ml_address == 0) { 965 /* since we already mapped page1 above */ 966 start = IMMU_PAGESIZE; 967 } else { 968 start = mp->ml_address; 969 } 970 npages = mp->ml_size/IMMU_PAGESIZE + 1; 971 972 dcookies[0].dck_paddr = start; 973 dcookies[0].dck_npages = npages; 974 dcount = 1; 975 (void) dvma_map(domain, start, npages, dcookies, 976 dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 977 978 ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64 979 " - 0x%" PRIx64 "]", start, start + mp->ml_size); 980 981 mp = mp->ml_next; 982 while (mp) { 983 ddi_err(DER_LOG, domain->dom_dip, 984 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", 985 mp->ml_address, mp->ml_address + mp->ml_size); 986 987 start = mp->ml_address; 988 npages = mp->ml_size/IMMU_PAGESIZE + 1; 989 990 dcookies[0].dck_paddr = start; 991 dcookies[0].dck_npages = npages; 992 dcount = 1; 993 (void) dvma_map(domain, start, npages, 994 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 995 mp = mp->ml_next; 996 } 997 998 mp = bios_rsvd; 999 while (mp) { 1000 ddi_err(DER_LOG, domain->dom_dip, 1001 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", 1002 mp->ml_address, mp->ml_address + mp->ml_size); 1003 1004 start = mp->ml_address; 1005 npages = mp->ml_size/IMMU_PAGESIZE + 1; 1006 1007 dcookies[0].dck_paddr = start; 1008 dcookies[0].dck_npages = npages; 1009 dcount = 1; 1010 (void) dvma_map(domain, start, npages, 1011 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 1012 1013 mp = mp->ml_next; 1014 } 1015 1016 memlist_read_unlock(); 1017 } 1018 1019 /* 1020 * create_xlate_arena() 1021 * Create the dvma arena for a domain with translation 1022 * mapping 1023 */ 1024 static void 1025 create_xlate_arena(immu_t *immu, domain_t *domain, 1026 dev_info_t *rdip, immu_flags_t immu_flags) 1027 { 1028 char *arena_name; 1029 struct memlist *mp; 1030 int vmem_flags; 1031 uint64_t start; 1032 uint_t mgaw; 1033 uint64_t size; 1034 uint64_t maxaddr; 1035 void *vmem_ret; 1036 1037 arena_name = domain->dom_dvma_arena_name; 1038 1039 /* Note, don't do sizeof (arena_name) - it is just a pointer */ 1040 (void) snprintf(arena_name, 1041 sizeof (domain->dom_dvma_arena_name), 1042 "%s-domain-%d-xlate-DVMA-arena", immu->immu_name, 1043 domain->dom_did); 1044 1045 vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? 
VM_NOSLEEP : VM_SLEEP; 1046 1047 /* Restrict mgaddr (max guest addr) to MGAW */ 1048 mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap); 1049 1050 /* 1051 * To ensure we avoid ioapic and PCI MMIO ranges we just 1052 * use the physical memory address range of the system as the 1053 * range 1054 */ 1055 maxaddr = ((uint64_t)1 << mgaw); 1056 1057 memlist_read_lock(); 1058 1059 mp = phys_install; 1060 1061 if (mp->ml_address == 0) 1062 start = MMU_PAGESIZE; 1063 else 1064 start = mp->ml_address; 1065 1066 if (start + mp->ml_size > maxaddr) 1067 size = maxaddr - start; 1068 else 1069 size = mp->ml_size; 1070 1071 ddi_err(DER_VERB, rdip, 1072 "iommu: %s: Creating dvma vmem arena [0x%" PRIx64 1073 " - 0x%" PRIx64 "]", arena_name, start, start + size); 1074 1075 /* 1076 * We always allocate in quanta of IMMU_PAGESIZE 1077 */ 1078 domain->dom_dvma_arena = vmem_create(arena_name, 1079 (void *)(uintptr_t)start, /* start addr */ 1080 size, /* size */ 1081 IMMU_PAGESIZE, /* quantum */ 1082 NULL, /* afunc */ 1083 NULL, /* ffunc */ 1084 NULL, /* source */ 1085 0, /* qcache_max */ 1086 vmem_flags); 1087 1088 if (domain->dom_dvma_arena == NULL) { 1089 ddi_err(DER_PANIC, rdip, 1090 "Failed to allocate DVMA arena(%s) " 1091 "for domain ID (%d)", arena_name, domain->dom_did); 1092 /*NOTREACHED*/ 1093 } 1094 1095 mp = mp->ml_next; 1096 while (mp) { 1097 1098 if (mp->ml_address == 0) 1099 start = MMU_PAGESIZE; 1100 else 1101 start = mp->ml_address; 1102 1103 if (start + mp->ml_size > maxaddr) 1104 size = maxaddr - start; 1105 else 1106 size = mp->ml_size; 1107 1108 ddi_err(DER_VERB, rdip, 1109 "iommu: %s: Adding dvma vmem span [0x%" PRIx64 1110 " - 0x%" PRIx64 "]", arena_name, start, 1111 start + size); 1112 1113 vmem_ret = vmem_add(domain->dom_dvma_arena, 1114 (void *)(uintptr_t)start, size, vmem_flags); 1115 1116 if (vmem_ret == NULL) { 1117 ddi_err(DER_PANIC, rdip, 1118 "Failed to allocate DVMA arena(%s) " 1119 "for domain ID (%d)", 1120 arena_name, domain->dom_did); 1121 /*NOTREACHED*/ 1122 } 1123 mp = mp->ml_next; 1124 } 1125 memlist_read_unlock(); 1126 } 1127 1128 /* ################################### DOMAIN CODE ######################### */ 1129 1130 /* 1131 * Set the domain and domain-dip for a dip 1132 */ 1133 static void 1134 set_domain( 1135 dev_info_t *dip, 1136 dev_info_t *ddip, 1137 domain_t *domain) 1138 { 1139 immu_devi_t *immu_devi; 1140 domain_t *fdomain; 1141 dev_info_t *fddip; 1142 1143 immu_devi = immu_devi_get(dip); 1144 1145 mutex_enter(&(DEVI(dip)->devi_lock)); 1146 fddip = immu_devi->imd_ddip; 1147 fdomain = immu_devi->imd_domain; 1148 1149 if (fddip) { 1150 ASSERT(fddip == ddip); 1151 } else { 1152 immu_devi->imd_ddip = ddip; 1153 } 1154 1155 if (fdomain) { 1156 ASSERT(fdomain == domain); 1157 } else { 1158 immu_devi->imd_domain = domain; 1159 } 1160 mutex_exit(&(DEVI(dip)->devi_lock)); 1161 } 1162 1163 /* 1164 * device_domain() 1165 * Get domain for a device. The domain may be global in which case it 1166 * is shared between all IOMMU units. Due to potential AGAW differences 1167 * between IOMMU units, such global domains *have to be* UNITY mapping 1168 * domains. Alternatively, the domain may be local to a IOMMU unit. 1169 * Local domains may be shared or immu_devi, although the 1170 * scope of sharing 1171 * is restricted to devices controlled by the IOMMU unit to 1172 * which the domain 1173 * belongs. If shared, they (currently) have to be UNITY domains. If 1174 * immu_devi a domain may be either UNITY or translation (XLATE) domain. 
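 * device_domain() below first checks for a domain cached in the
 * immu_devi of the requesting dip, then walks the ancestor tree
 * (get_branch_domain) to inherit or look up a domain, and finally
 * creates a new one with domain_create() if none was found.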
1175 */ 1176 static domain_t * 1177 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags) 1178 { 1179 dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */ 1180 immu_t *immu; 1181 domain_t *domain; 1182 dvma_arg_t dvarg = {0}; 1183 int level; 1184 1185 *ddipp = NULL; 1186 1187 /* 1188 * Check if the domain is already set. This is usually true 1189 * if this is not the first DVMA transaction. 1190 */ 1191 ddip = NULL; 1192 domain = immu_devi_domain(rdip, &ddip); 1193 if (domain) { 1194 *ddipp = ddip; 1195 return (domain); 1196 } 1197 1198 immu = immu_dvma_get_immu(rdip, immu_flags); 1199 if (immu == NULL) { 1200 /* 1201 * possible that there is no IOMMU unit for this device 1202 * - BIOS bugs are one example. 1203 */ 1204 ddi_err(DER_WARN, rdip, "No iommu unit found for device"); 1205 return (NULL); 1206 } 1207 1208 immu_flags |= immu_devi_get(rdip)->imd_dvma_flags; 1209 1210 dvarg.dva_rdip = rdip; 1211 dvarg.dva_ddip = NULL; 1212 dvarg.dva_domain = NULL; 1213 dvarg.dva_flags = immu_flags; 1214 level = 0; 1215 if (immu_walk_ancestor(rdip, NULL, get_branch_domain, 1216 &dvarg, &level, immu_flags) != DDI_SUCCESS) { 1217 /* 1218 * maybe low memory. return error, 1219 * so driver tries again later 1220 */ 1221 return (NULL); 1222 } 1223 1224 /* should have walked at least 1 dip (i.e. edip) */ 1225 ASSERT(level > 0); 1226 1227 ddip = dvarg.dva_ddip; /* must be present */ 1228 domain = dvarg.dva_domain; /* may be NULL */ 1229 1230 /* 1231 * We may find the domain during our ancestor walk on any one of our 1232 * ancestor dips, If the domain is found then the domain-dip 1233 * (i.e. ddip) will also be found in the same immu_devi struct. 1234 * The domain-dip is the highest ancestor dip which shares the 1235 * same domain with edip. 1236 * The domain may or may not be found, but the domain dip must 1237 * be found. 1238 */ 1239 if (ddip == NULL) { 1240 ddi_err(DER_MODE, rdip, "Cannot find domain dip for device."); 1241 return (NULL); 1242 } 1243 1244 /* 1245 * Did we find a domain ? 1246 */ 1247 if (domain) { 1248 goto found; 1249 } 1250 1251 /* nope, so allocate */ 1252 domain = domain_create(immu, ddip, rdip, immu_flags); 1253 if (domain == NULL) { 1254 return (NULL); 1255 } 1256 1257 /*FALLTHROUGH*/ 1258 found: 1259 /* 1260 * We know *domain *is* the right domain, so panic if 1261 * another domain is set for either the request-dip or 1262 * effective dip. 1263 */ 1264 set_domain(ddip, ddip, domain); 1265 set_domain(rdip, ddip, domain); 1266 1267 *ddipp = ddip; 1268 return (domain); 1269 } 1270 1271 static void 1272 create_unity_domain(immu_t *immu) 1273 { 1274 domain_t *domain; 1275 1276 /* domain created during boot and always use sleep flag */ 1277 domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP); 1278 1279 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL); 1280 1281 domain->dom_did = IMMU_UNITY_DID; 1282 domain->dom_maptype = IMMU_MAPTYPE_UNITY; 1283 1284 domain->dom_immu = immu; 1285 immu->immu_unity_domain = domain; 1286 1287 /* 1288 * Setup the domain's initial page table 1289 * should never fail. 1290 */ 1291 domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1292 pgtable_zero(domain->dom_pgtable_root); 1293 1294 /* 1295 * Only map all physical memory in to the unity domain 1296 * if passthrough is not supported. If it is supported, 1297 * passthrough is set in the context entry instead. 
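 * (context_set() and context_create() program TTYPE_PASSTHRU into the
 * context entry when IMMU_ECAP_GET_PT() reports pass-through support.)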
1298 */ 1299 if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1300 map_unity_domain(domain); 1301 1302 1303 /* 1304 * put it on the system-wide UNITY domain list 1305 */ 1306 mutex_enter(&(immu_domain_lock)); 1307 list_insert_tail(&immu_unity_domain_list, domain); 1308 mutex_exit(&(immu_domain_lock)); 1309 } 1310 1311 /* 1312 * ddip is the domain-dip - the topmost dip in a domain 1313 * rdip is the requesting-dip - the device which is 1314 * requesting DVMA setup 1315 * if domain is a non-shared domain rdip == ddip 1316 */ 1317 static domain_t * 1318 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip, 1319 immu_flags_t immu_flags) 1320 { 1321 int kmflags; 1322 domain_t *domain; 1323 char mod_hash_name[128]; 1324 immu_devi_t *immu_devi; 1325 int did; 1326 immu_dcookie_t dcookies[1] = {0}; 1327 int dcount = 0; 1328 1329 immu_devi = immu_devi_get(rdip); 1330 1331 /* 1332 * First allocate a domainid. 1333 * This routine will never fail, since if we run out 1334 * of domains the unity domain will be allocated. 1335 */ 1336 did = did_alloc(immu, rdip, ddip, immu_flags); 1337 if (did == IMMU_UNITY_DID) { 1338 /* domain overflow */ 1339 ASSERT(immu->immu_unity_domain); 1340 return (immu->immu_unity_domain); 1341 } 1342 1343 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 1344 domain = kmem_zalloc(sizeof (domain_t), kmflags); 1345 if (domain == NULL) { 1346 ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain " 1347 "structure for device. IOMMU unit: %s", immu->immu_name); 1348 /*NOTREACHED*/ 1349 } 1350 1351 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL); 1352 1353 (void) snprintf(mod_hash_name, sizeof (mod_hash_name), 1354 "immu%s-domain%d-pava-hash", immu->immu_name, did); 1355 1356 domain->dom_did = did; 1357 domain->dom_immu = immu; 1358 domain->dom_maptype = IMMU_MAPTYPE_XLATE; 1359 domain->dom_dip = ddip; 1360 1361 /* 1362 * Create xlate DVMA arena for this domain. 1363 */ 1364 create_xlate_arena(immu, domain, rdip, immu_flags); 1365 1366 /* 1367 * Setup the domain's initial page table 1368 */ 1369 domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags); 1370 if (domain->dom_pgtable_root == NULL) { 1371 ddi_err(DER_PANIC, rdip, "Failed to alloc root " 1372 "pgtable for domain (%d). IOMMU unit: %s", 1373 domain->dom_did, immu->immu_name); 1374 /*NOTREACHED*/ 1375 } 1376 pgtable_zero(domain->dom_pgtable_root); 1377 1378 /* 1379 * Since this is a immu unit-specific domain, put it on 1380 * the per-immu domain list. 1381 */ 1382 mutex_enter(&(immu->immu_lock)); 1383 list_insert_head(&immu->immu_domain_list, domain); 1384 mutex_exit(&(immu->immu_lock)); 1385 1386 /* 1387 * Also put it on the system-wide xlate domain list 1388 */ 1389 mutex_enter(&(immu_domain_lock)); 1390 list_insert_head(&immu_xlate_domain_list, domain); 1391 mutex_exit(&(immu_domain_lock)); 1392 1393 bdf_domain_insert(immu_devi, domain); 1394 1395 #ifdef BUGGY_DRIVERS 1396 /* 1397 * Map page0. Some broken HW/FW access it. 1398 */ 1399 dcookies[0].dck_paddr = 0; 1400 dcookies[0].dck_npages = 1; 1401 dcount = 1; 1402 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, 1403 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); 1404 #endif 1405 return (domain); 1406 } 1407 1408 /* 1409 * Create domainid arena. 1410 * Domainid 0 is reserved by Vt-d spec and cannot be used by 1411 * system software. 
 * Domainid 1 is reserved by Solaris and used for *all* of the following:
 *	as the "uninitialized" domain - For devices not yet controlled
 *	by Solaris
 *	as the "unity" domain - For devices that will always belong
 *	to the unity domain
 *	as the "overflow" domain - Used for any new device after we
 *	run out of domains
 * All of the above domains map into a single domain with
 * domainid 1 and UNITY DVMA mapping
 * Each IMMU unit has its own unity/uninit/overflow domain
 */
static void
did_init(immu_t *immu)
{
	(void) snprintf(immu->immu_did_arena_name,
	    sizeof (immu->immu_did_arena_name),
	    "%s_domainid_arena", immu->immu_name);

	ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s",
	    immu->immu_did_arena_name);

	immu->immu_did_arena = vmem_create(
	    immu->immu_did_arena_name,
	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),	/* start addr */
	    immu->immu_max_domains - IMMU_UNITY_DID,
	    1,			/* quantum */
	    NULL,		/* afunc */
	    NULL,		/* ffunc */
	    NULL,		/* source */
	    0,			/* qcache_max */
	    VM_SLEEP);

	/* Even with SLEEP flag, vmem_create() can fail */
	if (immu->immu_did_arena == NULL) {
		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
		    "IOMMU domainid allocator: %s", immu->immu_name,
		    immu->immu_did_arena_name);
	}
}

/* ######################### CONTEXT CODE ################################# */

static void
context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
    int bus, int devfunc)
{
	pgtable_t *context;
	pgtable_t *pgtable_root;
	hw_rce_t *hw_rent;
	hw_rce_t *hw_cent;
	hw_rce_t *ctxp;
	int sid;
	krw_t rwtype;
	boolean_t fill_root;
	boolean_t fill_ctx;

	pgtable_root = domain->dom_pgtable_root;

	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
	context = *(pgtable_t **)(ctxp + bus);
	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;

	fill_root = B_FALSE;
	fill_ctx = B_FALSE;

	/* Check the most common case first with reader lock */
	rw_enter(&(immu->immu_ctx_rwlock), RW_READER);
	rwtype = RW_READER;
again:
	if (ROOT_GET_P(hw_rent)) {
		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
		if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) {
			rw_exit(&(immu->immu_ctx_rwlock));
			return;
		} else {
			fill_ctx = B_TRUE;
		}
	} else {
		fill_root = B_TRUE;
		fill_ctx = B_TRUE;
	}

	if (rwtype == RW_READER &&
	    rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) {
		rw_exit(&(immu->immu_ctx_rwlock));
		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
		rwtype = RW_WRITER;
		goto again;
	}
	rwtype = RW_WRITER;

	if (fill_root == B_TRUE) {
		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
		ROOT_SET_P(hw_rent);
		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
	}

	if (fill_ctx == B_TRUE) {
		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
		/* need to disable context entry before reprogramming it */
		bzero(hw_cent, sizeof (hw_rce_t));

		/* flush caches */
		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));

		sid = ((bus << 8) | devfunc);
		immu_flush_context_fsi(immu, 0, sid, domain->dom_did,
		    &immu->immu_ctx_inv_wait);

		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
		CONT_SET_DID(hw_cent, domain->dom_did);
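		/* program address width, page-table root and translation type */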
CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); 1524 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); 1525 if (domain->dom_did == IMMU_UNITY_DID && 1526 IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1527 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); 1528 else 1529 /*LINTED*/ 1530 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); 1531 CONT_SET_P(hw_cent); 1532 if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) { 1533 CONT_SET_EH(hw_cent); 1534 if (immu_use_alh) 1535 CONT_SET_ALH(hw_cent); 1536 } 1537 immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t)); 1538 } 1539 rw_exit(&(immu->immu_ctx_rwlock)); 1540 } 1541 1542 static pgtable_t * 1543 context_create(immu_t *immu) 1544 { 1545 int bus; 1546 int devfunc; 1547 pgtable_t *root_table; 1548 pgtable_t *context; 1549 pgtable_t *pgtable_root; 1550 hw_rce_t *ctxp; 1551 hw_rce_t *hw_rent; 1552 hw_rce_t *hw_cent; 1553 1554 /* Allocate a zeroed root table (4K 256b entries) */ 1555 root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1556 pgtable_zero(root_table); 1557 1558 /* 1559 * Setup context tables for all possible root table entries. 1560 * Start out with unity domains for all entries. 1561 */ 1562 ctxp = (hw_rce_t *)(root_table->swpg_next_array); 1563 hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr); 1564 for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) { 1565 context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1566 pgtable_zero(context); 1567 ROOT_SET_P(hw_rent); 1568 ROOT_SET_CONT(hw_rent, context->hwpg_paddr); 1569 hw_cent = (hw_rce_t *)(context->hwpg_vaddr); 1570 for (devfunc = 0; devfunc < IMMU_CONT_NUM; 1571 devfunc++, hw_cent++) { 1572 pgtable_root = 1573 immu->immu_unity_domain->dom_pgtable_root; 1574 CONT_SET_DID(hw_cent, 1575 immu->immu_unity_domain->dom_did); 1576 CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); 1577 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); 1578 if (IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1579 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); 1580 else 1581 /*LINTED*/ 1582 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); 1583 CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED); 1584 CONT_SET_P(hw_cent); 1585 } 1586 immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE); 1587 *((pgtable_t **)ctxp) = context; 1588 } 1589 1590 return (root_table); 1591 } 1592 1593 /* 1594 * Called during rootnex attach, so no locks needed 1595 */ 1596 static void 1597 context_init(immu_t *immu) 1598 { 1599 rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL); 1600 1601 immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE); 1602 1603 immu_regs_wbf_flush(immu); 1604 1605 immu->immu_ctx_root = context_create(immu); 1606 1607 immu_regs_set_root_table(immu); 1608 1609 rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); 1610 immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait); 1611 immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait); 1612 rw_exit(&(immu->immu_ctx_rwlock)); 1613 } 1614 1615 1616 /* 1617 * Find top pcib 1618 */ 1619 static int 1620 find_top_pcib(dev_info_t *dip, void *arg) 1621 { 1622 immu_devi_t *immu_devi; 1623 dev_info_t **pcibdipp = (dev_info_t **)arg; 1624 1625 immu_devi = immu_devi_get(dip); 1626 1627 if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) { 1628 *pcibdipp = dip; 1629 } 1630 1631 return (DDI_WALK_CONTINUE); 1632 } 1633 1634 static int 1635 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, 1636 dev_info_t *rdip, immu_flags_t immu_flags) 1637 { 1638 immu_devi_t *r_immu_devi; 1639 immu_devi_t *d_immu_devi; 1640 int r_bus; 1641 int d_bus; 1642 int r_devfunc; 1643 int d_devfunc; 1644 immu_pcib_t 
d_pcib_type; 1645 dev_info_t *pcibdip; 1646 1647 if (ddip == NULL || rdip == NULL || 1648 ddip == root_devinfo || rdip == root_devinfo) { 1649 ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or " 1650 "request-dip are NULL or are root devinfo"); 1651 return (DDI_FAILURE); 1652 } 1653 1654 /* 1655 * We need to set the context fields 1656 * based on what type of device rdip and ddip are. 1657 * To do that we need the immu_devi field. 1658 * Set the immu_devi field (if not already set) 1659 */ 1660 if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) { 1661 ddi_err(DER_MODE, rdip, 1662 "immu_context_update: failed to set immu_devi for ddip"); 1663 return (DDI_FAILURE); 1664 } 1665 1666 if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) { 1667 ddi_err(DER_MODE, rdip, 1668 "immu_context_update: failed to set immu_devi for rdip"); 1669 return (DDI_FAILURE); 1670 } 1671 1672 d_immu_devi = immu_devi_get(ddip); 1673 r_immu_devi = immu_devi_get(rdip); 1674 1675 d_bus = d_immu_devi->imd_bus; 1676 d_devfunc = d_immu_devi->imd_devfunc; 1677 d_pcib_type = d_immu_devi->imd_pcib_type; 1678 r_bus = r_immu_devi->imd_bus; 1679 r_devfunc = r_immu_devi->imd_devfunc; 1680 1681 if (rdip == ddip) { 1682 /* rdip is a PCIE device. set context for it only */ 1683 context_set(immu, domain, immu->immu_ctx_root, r_bus, 1684 r_devfunc); 1685 #ifdef BUGGY_DRIVERS 1686 } else if (r_immu_devi == d_immu_devi) { 1687 #ifdef TEST 1688 ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and " 1689 "0x%lx are identical", rdip, ddip); 1690 #endif 1691 /* rdip is a PCIE device. set context for it only */ 1692 context_set(immu, domain, immu->immu_ctx_root, r_bus, 1693 r_devfunc); 1694 #endif 1695 } else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) { 1696 /* 1697 * ddip is a PCIE_PCI bridge. Set context for ddip's 1698 * secondary bus. If rdip is on ddip's secondary 1699 * bus, set context for rdip. Else, set context 1700 * for rdip's PCI bridge on ddip's secondary bus. 1701 */ 1702 context_set(immu, domain, immu->immu_ctx_root, 1703 d_immu_devi->imd_sec, 0); 1704 if (d_immu_devi->imd_sec == r_bus) { 1705 context_set(immu, domain, immu->immu_ctx_root, 1706 r_bus, r_devfunc); 1707 } else { 1708 pcibdip = NULL; 1709 if (immu_walk_ancestor(rdip, ddip, find_top_pcib, 1710 &pcibdip, NULL, immu_flags) == DDI_SUCCESS && 1711 pcibdip != NULL) { 1712 r_immu_devi = immu_devi_get(pcibdip); 1713 r_bus = r_immu_devi->imd_bus; 1714 r_devfunc = r_immu_devi->imd_devfunc; 1715 context_set(immu, domain, immu->immu_ctx_root, 1716 r_bus, r_devfunc); 1717 } else { 1718 ddi_err(DER_PANIC, rdip, "Failed to find PCI " 1719 " bridge for PCI device"); 1720 /*NOTREACHED*/ 1721 } 1722 } 1723 } else if (d_pcib_type == IMMU_PCIB_PCI_PCI) { 1724 context_set(immu, domain, immu->immu_ctx_root, d_bus, 1725 d_devfunc); 1726 } else if (d_pcib_type == IMMU_PCIB_ENDPOINT) { 1727 /* 1728 * ddip is a PCIE device which has a non-PCI device under it 1729 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata 1730 */ 1731 context_set(immu, domain, immu->immu_ctx_root, d_bus, 1732 d_devfunc); 1733 } else { 1734 ddi_err(DER_PANIC, rdip, "unknown device type. 
Cannot " 1735 "set iommu context."); 1736 /*NOTREACHED*/ 1737 } 1738 1739 /* XXX do we need a membar_producer() here */ 1740 return (DDI_SUCCESS); 1741 } 1742 1743 /* ##################### END CONTEXT CODE ################################## */ 1744 /* ##################### MAPPING CODE ################################## */ 1745 1746 1747 #ifdef DEBUG 1748 static boolean_t 1749 PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr, 1750 dev_info_t *rdip, immu_flags_t immu_flags) 1751 { 1752 /* The PDTE must be set i.e. present bit is set */ 1753 if (!PDTE_P(pdte)) { 1754 ddi_err(DER_MODE, rdip, "No present flag"); 1755 return (B_FALSE); 1756 } 1757 1758 /* 1759 * Just assert to check most significant system software field 1760 * (PDTE_SW4) as it is same as present bit and we 1761 * checked that above 1762 */ 1763 ASSERT(PDTE_SW4(pdte)); 1764 1765 /* 1766 * TM field should be clear if not reserved. 1767 * non-leaf is always reserved 1768 */ 1769 if (next == NULL && immu->immu_TM_reserved == B_FALSE) { 1770 if (PDTE_TM(pdte)) { 1771 ddi_err(DER_MODE, rdip, "TM flag set"); 1772 return (B_FALSE); 1773 } 1774 } 1775 1776 /* 1777 * The SW3 field is not used and must be clear 1778 */ 1779 if (PDTE_SW3(pdte)) { 1780 ddi_err(DER_MODE, rdip, "SW3 set"); 1781 return (B_FALSE); 1782 } 1783 1784 /* 1785 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set 1786 */ 1787 if (next == NULL) { 1788 ASSERT(paddr % IMMU_PAGESIZE == 0); 1789 if (PDTE_PADDR(pdte) != paddr) { 1790 ddi_err(DER_MODE, rdip, 1791 "PTE paddr mismatch: %lx != %lx", 1792 PDTE_PADDR(pdte), paddr); 1793 return (B_FALSE); 1794 } 1795 } else { 1796 if (PDTE_PADDR(pdte) != next->hwpg_paddr) { 1797 ddi_err(DER_MODE, rdip, 1798 "PDE paddr mismatch: %lx != %lx", 1799 PDTE_PADDR(pdte), next->hwpg_paddr); 1800 return (B_FALSE); 1801 } 1802 } 1803 1804 /* 1805 * SNP field should be clear if not reserved. 
1806 * non-leaf is always reserved 1807 */ 1808 if (next == NULL && immu->immu_SNP_reserved == B_FALSE) { 1809 if (PDTE_SNP(pdte)) { 1810 ddi_err(DER_MODE, rdip, "SNP set"); 1811 return (B_FALSE); 1812 } 1813 } 1814 1815 /* second field available for system software should be clear */ 1816 if (PDTE_SW2(pdte)) { 1817 ddi_err(DER_MODE, rdip, "SW2 set"); 1818 return (B_FALSE); 1819 } 1820 1821 /* Super pages field should be clear */ 1822 if (PDTE_SP(pdte)) { 1823 ddi_err(DER_MODE, rdip, "SP set"); 1824 return (B_FALSE); 1825 } 1826 1827 /* 1828 * least significant field available for 1829 * system software should be clear 1830 */ 1831 if (PDTE_SW1(pdte)) { 1832 ddi_err(DER_MODE, rdip, "SW1 set"); 1833 return (B_FALSE); 1834 } 1835 1836 if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) { 1837 ddi_err(DER_MODE, rdip, "READ not set"); 1838 return (B_FALSE); 1839 } 1840 1841 if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) { 1842 ddi_err(DER_MODE, rdip, "WRITE not set"); 1843 return (B_FALSE); 1844 } 1845 1846 return (B_TRUE); 1847 } 1848 #endif 1849 1850 /*ARGSUSED*/ 1851 static void 1852 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate, 1853 uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip) 1854 { 1855 uint64_t npages; 1856 uint64_t dvma; 1857 pgtable_t *pgtable; 1858 hw_pdte_t *hwp; 1859 hw_pdte_t *shwp; 1860 int idx; 1861 1862 pgtable = xlate->xlt_pgtable; 1863 idx = xlate->xlt_idx; 1864 1865 dvma = *dvma_ptr; 1866 npages = *npages_ptr; 1867 1868 /* 1869 * since a caller gets a unique dvma for a physical address, 1870 * no other concurrent thread will be writing to the same 1871 * PTE even if it has the same paddr. So no locks needed. 1872 */ 1873 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 1874 1875 hwp = shwp; 1876 for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { 1877 PDTE_CLEAR_P(*hwp); 1878 dvma += IMMU_PAGESIZE; 1879 npages--; 1880 } 1881 1882 *dvma_ptr = dvma; 1883 *npages_ptr = npages; 1884 1885 xlate->xlt_idx = idx; 1886 } 1887 1888 static void 1889 xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels) 1890 { 1891 int level; 1892 uint64_t offbits; 1893 1894 /* 1895 * Skip the first 12 bits which is the offset into 1896 * 4K PFN (phys page frame based on IMMU_PAGESIZE) 1897 */ 1898 offbits = dvma >> IMMU_PAGESHIFT; 1899 1900 /* skip to level 1 i.e. leaf PTE */ 1901 for (level = 1, xlate++; level <= nlevels; level++, xlate++) { 1902 xlate->xlt_level = level; 1903 xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK); 1904 ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX); 1905 xlate->xlt_pgtable = NULL; 1906 offbits >>= IMMU_PGTABLE_LEVEL_STRIDE; 1907 } 1908 } 1909 1910 /* 1911 * Read the pgtables 1912 */ 1913 static boolean_t 1914 PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels) 1915 { 1916 pgtable_t *pgtable; 1917 pgtable_t *next; 1918 uint_t idx; 1919 1920 /* start with highest level pgtable i.e. root */ 1921 xlate += nlevels; 1922 1923 if (xlate->xlt_pgtable == NULL) { 1924 xlate->xlt_pgtable = domain->dom_pgtable_root; 1925 } 1926 1927 for (; xlate->xlt_level > 1; xlate--) { 1928 idx = xlate->xlt_idx; 1929 pgtable = xlate->xlt_pgtable; 1930 1931 if ((xlate - 1)->xlt_pgtable) { 1932 continue; 1933 } 1934 1935 /* Lock the pgtable in read mode */ 1936 rw_enter(&(pgtable->swpg_rwlock), RW_READER); 1937 1938 /* 1939 * since we are unmapping, the pgtable should 1940 * already point to a leafier pgtable. 
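	 * If the next level pointer is NULL, no mapping was ever
	 * created at this address and the caller (e.g.
	 * immu_print_fault_info()) reports that no PTE exists.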
1941 */ 1942 next = *(pgtable->swpg_next_array + idx); 1943 (xlate - 1)->xlt_pgtable = next; 1944 rw_exit(&(pgtable->swpg_rwlock)); 1945 if (next == NULL) 1946 return (B_FALSE); 1947 } 1948 1949 return (B_TRUE); 1950 } 1951 1952 static void 1953 immu_fault_walk(void *arg, void *base, size_t len) 1954 { 1955 uint64_t dvma, start; 1956 1957 dvma = *(uint64_t *)arg; 1958 start = (uint64_t)(uintptr_t)base; 1959 1960 if (dvma >= start && dvma < (start + len)) { 1961 ddi_err(DER_WARN, NULL, 1962 "faulting DVMA address is in vmem arena " 1963 "(%" PRIx64 "-%" PRIx64 ")", 1964 start, start + len); 1965 *(uint64_t *)arg = ~0ULL; 1966 } 1967 } 1968 1969 void 1970 immu_print_fault_info(uint_t sid, uint64_t dvma) 1971 { 1972 int nlevels; 1973 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 1974 xlate_t *xlatep; 1975 hw_pdte_t pte; 1976 domain_t *domain; 1977 immu_t *immu; 1978 uint64_t dvma_arg; 1979 1980 if (mod_hash_find(bdf_domain_hash, 1981 (void *)(uintptr_t)sid, (void *)&domain) != 0) { 1982 ddi_err(DER_WARN, NULL, 1983 "no domain for faulting SID %08x", sid); 1984 return; 1985 } 1986 1987 immu = domain->dom_immu; 1988 1989 dvma_arg = dvma; 1990 vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk, 1991 (void *)&dvma_arg); 1992 if (dvma_arg != ~0ULL) 1993 ddi_err(DER_WARN, domain->dom_dip, 1994 "faulting DVMA address is not in vmem arena"); 1995 1996 nlevels = immu->immu_dvma_nlevels; 1997 xlate_setup(dvma, xlate, nlevels); 1998 1999 if (!PDE_lookup(domain, xlate, nlevels)) { 2000 ddi_err(DER_WARN, domain->dom_dip, 2001 "pte not found in domid %d for faulting addr %" PRIx64, 2002 domain->dom_did, dvma); 2003 return; 2004 } 2005 2006 xlatep = &xlate[1]; 2007 pte = *((hw_pdte_t *) 2008 (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx); 2009 2010 ddi_err(DER_WARN, domain->dom_dip, 2011 "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did, 2012 (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte)); 2013 } 2014 2015 /*ARGSUSED*/ 2016 static void 2017 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr, 2018 dev_info_t *rdip, immu_flags_t immu_flags) 2019 { 2020 hw_pdte_t pte; 2021 2022 #ifndef DEBUG 2023 pte = immu->immu_ptemask; 2024 PDTE_SET_PADDR(pte, paddr); 2025 #else 2026 pte = *hwp; 2027 2028 if (PDTE_P(pte)) { 2029 if (PDTE_PADDR(pte) != paddr) { 2030 ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx", 2031 PDTE_PADDR(pte), paddr); 2032 } 2033 #ifdef BUGGY_DRIVERS 2034 return; 2035 #else 2036 goto out; 2037 #endif 2038 } 2039 2040 /* clear TM field if not reserved */ 2041 if (immu->immu_TM_reserved == B_FALSE) { 2042 PDTE_CLEAR_TM(pte); 2043 } 2044 2045 /* Clear 3rd field for system software - not used */ 2046 PDTE_CLEAR_SW3(pte); 2047 2048 /* Set paddr */ 2049 ASSERT(paddr % IMMU_PAGESIZE == 0); 2050 PDTE_CLEAR_PADDR(pte); 2051 PDTE_SET_PADDR(pte, paddr); 2052 2053 /* clear SNP field if not reserved. */ 2054 if (immu->immu_SNP_reserved == B_FALSE) { 2055 PDTE_CLEAR_SNP(pte); 2056 } 2057 2058 /* Clear SW2 field available for software */ 2059 PDTE_CLEAR_SW2(pte); 2060 2061 2062 /* SP is don't care for PTEs. Clear it for cleanliness */ 2063 PDTE_CLEAR_SP(pte); 2064 2065 /* Clear SW1 field available for software */ 2066 PDTE_CLEAR_SW1(pte); 2067 2068 /* 2069 * Now that we are done writing the PTE 2070 * set the "present" flag. Note this present 2071 * flag is a bit in the PDE/PTE that the 2072 * spec says is available for system software. 2073 * This is an implementation detail of Solaris 2074 * bare-metal Intel IOMMU. 
2075 * The present field in a PDE/PTE is not defined 2076 * by the Vt-d spec 2077 */ 2078 2079 PDTE_SET_P(pte); 2080 2081 pte |= immu->immu_ptemask; 2082 2083 out: 2084 #endif /* DEBUG */ 2085 #ifdef BUGGY_DRIVERS 2086 PDTE_SET_READ(pte); 2087 PDTE_SET_WRITE(pte); 2088 #else 2089 if (immu_flags & IMMU_FLAGS_READ) 2090 PDTE_SET_READ(pte); 2091 if (immu_flags & IMMU_FLAGS_WRITE) 2092 PDTE_SET_WRITE(pte); 2093 #endif /* BUGGY_DRIVERS */ 2094 2095 *hwp = pte; 2096 } 2097 2098 /*ARGSUSED*/ 2099 static void 2100 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, 2101 uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies, 2102 int dcount, dev_info_t *rdip, immu_flags_t immu_flags) 2103 { 2104 paddr_t paddr; 2105 uint64_t nvpages; 2106 uint64_t nppages; 2107 uint64_t dvma; 2108 pgtable_t *pgtable; 2109 hw_pdte_t *hwp; 2110 hw_pdte_t *shwp; 2111 int idx, nset; 2112 int j; 2113 2114 pgtable = xlate->xlt_pgtable; 2115 idx = xlate->xlt_idx; 2116 2117 dvma = *dvma_ptr; 2118 nvpages = *nvpages_ptr; 2119 2120 /* 2121 * since a caller gets a unique dvma for a physical address, 2122 * no other concurrent thread will be writing to the same 2123 * PTE even if it has the same paddr. So no locks needed. 2124 */ 2125 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 2126 2127 hwp = shwp; 2128 for (j = dcount - 1; j >= 0; j--) { 2129 if (nvpages <= dcookies[j].dck_npages) 2130 break; 2131 nvpages -= dcookies[j].dck_npages; 2132 } 2133 2134 nppages = nvpages; 2135 paddr = dcookies[j].dck_paddr + 2136 (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE; 2137 2138 nvpages = *nvpages_ptr; 2139 nset = 0; 2140 for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { 2141 PTE_set_one(immu, hwp, paddr, rdip, immu_flags); 2142 nset++; 2143 2144 ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags) 2145 == B_TRUE); 2146 nppages--; 2147 nvpages--; 2148 paddr += IMMU_PAGESIZE; 2149 dvma += IMMU_PAGESIZE; 2150 2151 if (nppages == 0) { 2152 j++; 2153 } 2154 2155 if (j == dcount) 2156 break; 2157 2158 if (nppages == 0) { 2159 nppages = dcookies[j].dck_npages; 2160 paddr = dcookies[j].dck_paddr; 2161 } 2162 } 2163 2164 if (nvpages) { 2165 *dvma_ptr = dvma; 2166 *nvpages_ptr = nvpages; 2167 } else { 2168 *dvma_ptr = 0; 2169 *nvpages_ptr = 0; 2170 } 2171 2172 xlate->xlt_idx = idx; 2173 } 2174 2175 /*ARGSUSED*/ 2176 static void 2177 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next, 2178 dev_info_t *rdip, immu_flags_t immu_flags) 2179 { 2180 hw_pdte_t pde; 2181 2182 pde = *hwp; 2183 2184 /* if PDE is already set, make sure it is correct */ 2185 if (PDTE_P(pde)) { 2186 ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr); 2187 #ifdef BUGGY_DRIVERS 2188 return; 2189 #else 2190 goto out; 2191 #endif 2192 } 2193 2194 /* Dont touch SW4, it is the present bit */ 2195 2196 /* don't touch TM field it is reserved for PDEs */ 2197 2198 /* 3rd field available for system software is not used */ 2199 PDTE_CLEAR_SW3(pde); 2200 2201 /* Set next level pgtable-paddr for PDE */ 2202 PDTE_CLEAR_PADDR(pde); 2203 PDTE_SET_PADDR(pde, next->hwpg_paddr); 2204 2205 /* don't touch SNP field it is reserved for PDEs */ 2206 2207 /* Clear second field available for system software */ 2208 PDTE_CLEAR_SW2(pde); 2209 2210 /* No super pages for PDEs */ 2211 PDTE_CLEAR_SP(pde); 2212 2213 /* Clear SW1 for software */ 2214 PDTE_CLEAR_SW1(pde); 2215 2216 /* 2217 * Now that we are done writing the PDE 2218 * set the "present" flag. 
Note this present
	 * flag is a bit in the PDE/PTE that the spec says is available
	 * for system software. This is an implementation detail of the
	 * Solaris bare-metal Intel IOMMU; the present field in a PDE/PTE
	 * is not defined by the Vt-d spec.
	 */

out:
#ifdef BUGGY_DRIVERS
	PDTE_SET_READ(pde);
	PDTE_SET_WRITE(pde);
#else
	if (immu_flags & IMMU_FLAGS_READ)
		PDTE_SET_READ(pde);
	if (immu_flags & IMMU_FLAGS_WRITE)
		PDTE_SET_WRITE(pde);
#endif

	PDTE_SET_P(pde);

	*hwp = pde;
}

/*
 * Used to set PDEs
 */
static boolean_t
PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	pgtable_t *pgtable;
	pgtable_t *new;
	pgtable_t *next;
	hw_pdte_t *hwp;
	int level;
	uint_t idx;
	krw_t rwtype;
	boolean_t set = B_FALSE;

	/* start with highest level pgtable i.e. root */
	xlate += nlevels;

	new = NULL;
	xlate->xlt_pgtable = domain->dom_pgtable_root;
	for (level = nlevels; level > 1; level--, xlate--) {
		idx = xlate->xlt_idx;
		pgtable = xlate->xlt_pgtable;

		/* Lock the pgtable in READ mode first */
		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
		rwtype = RW_READER;
again:
		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
		next = (pgtable->swpg_next_array)[idx];

		/*
		 * Check if the leafier level already has a pgtable;
		 * if yes, verify it.
		 */
		if (next == NULL) {
			if (new == NULL) {

				IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *,
				    rdip, int, level);

				new = pgtable_alloc(immu, immu_flags);
				if (new == NULL) {
					ddi_err(DER_PANIC, rdip,
					    "pgtable alloc err");
				}
				pgtable_zero(new);
			}

			/* Change to a write lock */
			if (rwtype == RW_READER &&
			    rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) {
				rw_exit(&(pgtable->swpg_rwlock));
				rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
				rwtype = RW_WRITER;
				goto again;
			}
			rwtype = RW_WRITER;
			next = new;
			(pgtable->swpg_next_array)[idx] = next;
			new = NULL;
			PDE_set_one(immu, hwp, next, rdip, immu_flags);
			set = B_TRUE;
			rw_downgrade(&(pgtable->swpg_rwlock));
			rwtype = RW_READER;
		}
#ifndef BUGGY_DRIVERS
		else {
			hw_pdte_t pde = *hwp;

			/*
			 * With BUGGY_DRIVERS, READ+WRITE permissions are
			 * already set above, so there is nothing to do in
			 * that case.
			 * XXX Check that the read/write perms actually
			 * change before setting them.
Also need to hold lock 2319 */ 2320 if (immu_flags & IMMU_FLAGS_READ) 2321 PDTE_SET_READ(pde); 2322 if (immu_flags & IMMU_FLAGS_WRITE) 2323 PDTE_SET_WRITE(pde); 2324 2325 *hwp = pde; 2326 } 2327 #endif 2328 2329 ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags) 2330 == B_TRUE); 2331 2332 (xlate - 1)->xlt_pgtable = next; 2333 rw_exit(&(pgtable->swpg_rwlock)); 2334 } 2335 2336 if (new) { 2337 pgtable_free(immu, new); 2338 } 2339 2340 return (set); 2341 } 2342 2343 /* 2344 * dvma_map() 2345 * map a contiguous range of DVMA pages 2346 * 2347 * immu: IOMMU unit for which we are generating DVMA cookies 2348 * domain: domain 2349 * sdvma: Starting dvma 2350 * spaddr: Starting paddr 2351 * npages: Number of pages 2352 * rdip: requesting device 2353 * immu_flags: flags 2354 */ 2355 static boolean_t 2356 dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages, 2357 immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, 2358 immu_flags_t immu_flags) 2359 { 2360 uint64_t dvma; 2361 uint64_t n; 2362 immu_t *immu = domain->dom_immu; 2363 int nlevels = immu->immu_dvma_nlevels; 2364 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 2365 boolean_t pde_set = B_FALSE; 2366 2367 n = snvpages; 2368 dvma = sdvma; 2369 2370 while (n > 0) { 2371 xlate_setup(dvma, xlate, nlevels); 2372 2373 /* Lookup or allocate PGDIRs and PGTABLEs if necessary */ 2374 if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags) 2375 == B_TRUE) { 2376 pde_set = B_TRUE; 2377 } 2378 2379 /* set all matching ptes that fit into this leaf pgtable */ 2380 PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies, 2381 dcount, rdip, immu_flags); 2382 } 2383 2384 return (pde_set); 2385 } 2386 2387 /* 2388 * dvma_unmap() 2389 * unmap a range of DVMAs 2390 * 2391 * immu: IOMMU unit state 2392 * domain: domain for requesting device 2393 * ddip: domain-dip 2394 * dvma: starting DVMA 2395 * npages: Number of IMMU pages to be unmapped 2396 * rdip: requesting device 2397 */ 2398 static void 2399 dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages, 2400 dev_info_t *rdip) 2401 { 2402 immu_t *immu = domain->dom_immu; 2403 int nlevels = immu->immu_dvma_nlevels; 2404 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 2405 uint64_t n; 2406 uint64_t dvma; 2407 2408 dvma = sdvma; 2409 n = snpages; 2410 2411 while (n > 0) { 2412 /* setup the xlate array */ 2413 xlate_setup(dvma, xlate, nlevels); 2414 2415 /* just lookup existing pgtables. Should never fail */ 2416 if (!PDE_lookup(domain, xlate, nlevels)) 2417 ddi_err(DER_PANIC, rdip, 2418 "PTE not found for addr %" PRIx64, 2419 (unsigned long long)dvma); 2420 2421 /* clear all matching ptes that fit into this leaf pgtable */ 2422 PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip); 2423 } 2424 2425 /* No need to flush IOTLB after unmap */ 2426 } 2427 2428 static uint64_t 2429 dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf) 2430 { 2431 uint64_t dvma; 2432 size_t xsize, align; 2433 uint64_t minaddr, maxaddr; 2434 2435 /* parameters */ 2436 xsize = npages * IMMU_PAGESIZE; 2437 align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE); 2438 minaddr = dma_attr->dma_attr_addr_lo; 2439 maxaddr = dma_attr->dma_attr_addr_hi + 1; 2440 2441 /* handle the rollover cases */ 2442 if (maxaddr < dma_attr->dma_attr_addr_hi) { 2443 maxaddr = dma_attr->dma_attr_addr_hi; 2444 } 2445 2446 /* 2447 * allocate from vmem arena. 
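	 * A note on the call below: xsize and align were computed from the
	 * caller's DMA attributes above, [minaddr, maxaddr) bounds the
	 * allocation, the phase and nocross constraints are left at 0, and
	 * kmf selects VM_SLEEP vs. VM_NOSLEEP behavior.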
2448 */ 2449 dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena, 2450 xsize, align, 0, 0, (void *)(uintptr_t)minaddr, 2451 (void *)(uintptr_t)maxaddr, kmf); 2452 2453 return (dvma); 2454 } 2455 2456 static void 2457 dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr) 2458 { 2459 int nlevels; 2460 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp; 2461 uint64_t dvma, n; 2462 size_t xsize, align; 2463 uint64_t minaddr, maxaddr, dmamax; 2464 int on, npte, pindex; 2465 hw_pdte_t *shwp; 2466 immu_t *immu; 2467 domain_t *domain; 2468 2469 /* parameters */ 2470 domain = IMMU_DEVI(rdip)->imd_domain; 2471 immu = domain->dom_immu; 2472 nlevels = immu->immu_dvma_nlevels; 2473 xsize = IMMU_NPREPTES * IMMU_PAGESIZE; 2474 align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE); 2475 minaddr = dma_attr->dma_attr_addr_lo; 2476 if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG) 2477 dmamax = dma_attr->dma_attr_seg; 2478 else 2479 dmamax = dma_attr->dma_attr_addr_hi; 2480 maxaddr = dmamax + 1; 2481 2482 if (maxaddr < dmamax) 2483 maxaddr = dmamax; 2484 2485 dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena, 2486 xsize, align, 0, dma_attr->dma_attr_seg + 1, 2487 (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP); 2488 2489 ihp->ihp_predvma = dvma; 2490 ihp->ihp_npremapped = 0; 2491 if (dvma == 0) 2492 return; 2493 2494 n = IMMU_NPREPTES; 2495 pindex = 0; 2496 2497 /* 2498 * Set up a mapping at address 0, just so that all PDPs get allocated 2499 * now. Although this initial mapping should never be used, 2500 * explicitly set it to read-only, just to be safe. 2501 */ 2502 while (n > 0) { 2503 xlate_setup(dvma, xlate, nlevels); 2504 2505 (void) PDE_set_all(immu, domain, xlate, nlevels, rdip, 2506 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 2507 2508 xlp = &xlate[1]; 2509 shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr) 2510 + xlp->xlt_idx; 2511 on = n; 2512 2513 PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie, 2514 1, rdip, IMMU_FLAGS_READ); 2515 2516 npte = on - n; 2517 2518 while (npte > 0) { 2519 ihp->ihp_preptes[pindex++] = shwp; 2520 #ifdef BUGGY_DRIVERS 2521 PDTE_CLEAR_WRITE(*shwp); 2522 #endif 2523 shwp++; 2524 npte--; 2525 } 2526 } 2527 } 2528 2529 static void 2530 dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp) 2531 { 2532 domain_t *domain; 2533 2534 domain = IMMU_DEVI(rdip)->imd_domain; 2535 2536 if (ihp->ihp_predvma != 0) { 2537 dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip); 2538 vmem_free(domain->dom_dvma_arena, 2539 (void *)(uintptr_t)ihp->ihp_predvma, 2540 IMMU_NPREPTES * IMMU_PAGESIZE); 2541 } 2542 } 2543 2544 static void 2545 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages) 2546 { 2547 uint64_t size = npages * IMMU_PAGESIZE; 2548 2549 if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) 2550 return; 2551 2552 vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size); 2553 } 2554 2555 static int 2556 immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle, 2557 immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq, 2558 ddi_dma_obj_t *dma_out) 2559 { 2560 domain_t *domain; 2561 immu_t *immu; 2562 immu_flags_t immu_flags; 2563 ddi_dma_atyp_t buftype; 2564 ddi_dma_obj_t *dmar_object; 2565 ddi_dma_attr_t *attrp; 2566 uint64_t offset, paddr, dvma, sdvma, rwmask; 2567 size_t npages, npgalloc; 2568 uint_t psize, size, pcnt, dmax; 2569 page_t **pparray; 2570 caddr_t vaddr; 2571 page_t *page; 2572 struct as *vas; 2573 immu_dcookie_t *dcookies; 2574 int pde_set; 2575 2576 domain = 
IMMU_DEVI(rdip)->imd_domain;
	immu = domain->dom_immu;
	immu_flags = dma_to_immu_flags(dmareq);

	attrp = &((ddi_dma_impl_t *)handle)->dmai_attr;

	dmar_object = &dmareq->dmar_object;
	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
	buftype = dmar_object->dmao_type;
	size = dmar_object->dmao_size;

	IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t,
	    buftype, uint_t, size);

	dcookies = &ihp->ihp_dcookies[0];

	pcnt = dmax = 0;

	/* retrieve paddr, psize, offset from dmareq */
	if (buftype == DMA_OTYP_PAGES) {
		page = dmar_object->dmao_obj.pp_obj.pp_pp;
		offset = dmar_object->dmao_obj.pp_obj.pp_offset &
		    MMU_PAGEOFFSET;
		paddr = pfn_to_pa(page->p_pagenum) + offset;
		psize = MIN((MMU_PAGESIZE - offset), size);
		page = page->p_next;
	} else {
		/* initialize vas before it is tested and dereferenced */
		vas = dmar_object->dmao_obj.virt_obj.v_as;
		if (vas == NULL) {
			vas = &kas;
		}
		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
		if (pparray != NULL) {
			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
			psize = MIN((MMU_PAGESIZE - offset), size);
			pcnt++;
		} else {
			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat,
			    vaddr)) + offset;
			psize = MIN(size, (MMU_PAGESIZE - offset));
			vaddr += psize;
		}
	}

	npgalloc = IMMU_BTOPR(size + offset);

	if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) {
#ifdef BUGGY_DRIVERS
		rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask;
#else
		rwmask = immu->immu_ptemask;
		if (immu_flags & IMMU_FLAGS_READ)
			rwmask |= PDTE_MASK_R;
		if (immu_flags & IMMU_FLAGS_WRITE)
			rwmask |= PDTE_MASK_W;
#endif
#ifdef DEBUG
		rwmask |= PDTE_MASK_P;
#endif
		sdvma = ihp->ihp_predvma;
		ihp->ihp_npremapped = npgalloc;
		*ihp->ihp_preptes[0] =
		    PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask;
	} else {
		ihp->ihp_npremapped = 0;
		sdvma = dvma_alloc(domain, attrp, npgalloc,
		    dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP);
		if (sdvma == 0)
			return (DDI_DMA_NORESOURCES);

		dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET);
		dcookies[0].dck_npages = 1;
	}

	IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc,
	    uint64_t, sdvma);

	dvma = sdvma;
	pde_set = 0;
	npages = 1;
	size -= psize;
	while (size > 0) {
		/* get the size for this page (i.e. partial or full page) */
		psize = MIN(size, MMU_PAGESIZE);
		if (buftype == DMA_OTYP_PAGES) {
			/* get the paddr from the page_t */
			paddr = pfn_to_pa(page->p_pagenum);
			page = page->p_next;
		} else if (pparray != NULL) {
			/* index into the array of page_t's to get the paddr */
			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
			pcnt++;
		} else {
			/* call into the VM to get the paddr */
			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr));
			vaddr += psize;
		}

		npages++;

		if (ihp->ihp_npremapped > 0) {
			*ihp->ihp_preptes[npages - 1] =
			    PDTE_PADDR(paddr) | rwmask;
		} else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
			dcookies[dmax].dck_npages++;
		} else {
			/* No, we need a new dcookie */
			if (dmax == (IMMU_NDCK - 1)) {
				/*
				 * Ran out of dcookies. Map them now.
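				 * All IMMU_NDCK cookie slots are in use, so
				 * flush the accumulated physically-contiguous
				 * ranges into the page tables, advance dvma
				 * past the pages just mapped, and restart
				 * cookie accumulation at slot 0 for the rest
				 * of the buffer.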
2687 */ 2688 if (dvma_map(domain, dvma, 2689 npages, dcookies, dmax + 1, rdip, 2690 immu_flags)) 2691 pde_set++; 2692 2693 IMMU_DPROBE4(immu__dvmamap__early, 2694 dev_info_t *, rdip, uint64_t, dvma, 2695 uint_t, npages, uint_t, dmax+1); 2696 2697 dvma += (npages << IMMU_PAGESHIFT); 2698 npages = 0; 2699 dmax = 0; 2700 } else 2701 dmax++; 2702 dcookies[dmax].dck_paddr = paddr; 2703 dcookies[dmax].dck_npages = 1; 2704 } 2705 size -= psize; 2706 } 2707 2708 /* 2709 * Finish up, mapping all, or all of the remaining, 2710 * physical memory ranges. 2711 */ 2712 if (ihp->ihp_npremapped == 0 && npages > 0) { 2713 IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \ 2714 uint64_t, dvma, uint_t, npages, uint_t, dmax+1); 2715 2716 if (dvma_map(domain, dvma, npages, dcookies, 2717 dmax + 1, rdip, immu_flags)) 2718 pde_set++; 2719 } 2720 2721 /* Invalidate the IOTLB */ 2722 immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc, 2723 pde_set > 0 ? TLB_IVA_WHOLE : TLB_IVA_LEAF, 2724 &ihp->ihp_inv_wait); 2725 2726 ihp->ihp_ndvseg = 1; 2727 ihp->ihp_dvseg[0].dvs_start = sdvma; 2728 ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size; 2729 2730 dma_out->dmao_size = dmar_object->dmao_size; 2731 dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET; 2732 dma_out->dmao_obj.dvma_obj.dv_nseg = 1; 2733 dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0]; 2734 dma_out->dmao_type = DMA_OTYP_DVADDR; 2735 2736 return (DDI_DMA_MAPPED); 2737 } 2738 2739 static int 2740 immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao) 2741 { 2742 uint64_t dvma, npages; 2743 domain_t *domain; 2744 struct dvmaseg *dvs; 2745 2746 domain = IMMU_DEVI(rdip)->imd_domain; 2747 dvs = dmao->dmao_obj.dvma_obj.dv_seg; 2748 2749 dvma = dvs[0].dvs_start; 2750 npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off); 2751 2752 #ifdef DEBUG 2753 /* Unmap only in DEBUG mode */ 2754 dvma_unmap(domain, dvma, npages, rdip); 2755 #endif 2756 dvma_free(domain, dvma, npages); 2757 2758 IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages, 2759 uint64_t, dvma); 2760 2761 #ifdef DEBUG 2762 /* 2763 * In the DEBUG case, the unmap was actually done, 2764 * but an IOTLB flush was not done. So, an explicit 2765 * write back flush is needed. 
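	 * (In the non-DEBUG build the unmap above is skipped entirely;
	 * the stale leaf PTEs are simply overwritten, and the IOTLB
	 * invalidated, when the DVMA range is next allocated and mapped.)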
	 */
	immu_regs_wbf_flush(domain->dom_immu);
#endif

	return (DDI_SUCCESS);
}

/* ############################# Functions exported ######################## */

/*
 * Set up the DVMA subsystem.
 * This code runs only for the first IOMMU unit.
 */
void
immu_dvma_setup(list_t *listp)
{
	immu_t *immu;
	uint_t kval;
	size_t nchains;

	/* locks */
	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Create lists */
	list_create(&immu_unity_domain_list, sizeof (domain_t),
	    offsetof(domain_t, dom_maptype_node));
	list_create(&immu_xlate_domain_list, sizeof (domain_t),
	    offsetof(domain_t, dom_maptype_node));

	/* Setup BDF domain hash */
	nchains = 0xff;
	kval = mod_hash_iddata_gen(nchains);

	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
	    KM_NOSLEEP);

	immu = list_head(listp);
	for (; immu; immu = list_next(listp, immu)) {
		create_unity_domain(immu);
		did_init(immu);
		context_init(immu);
		immu->immu_dvma_setup = B_TRUE;
	}
}

/*
 * Start up one DVMA unit
 */
void
immu_dvma_startup(immu_t *immu)
{
	if (immu_gfxdvma_enable == B_FALSE &&
	    immu->immu_dvma_gfx_only == B_TRUE) {
		return;
	}

	/*
	 * DVMA will start once IOMMU is "running"
	 */
	immu->immu_dvma_running = B_TRUE;
}

/*
 * immu_dvma_physmem_update()
 *	called when the installed memory on a
 *	system increases, to expand domain DVMA
 *	for domains with UNITY mapping
 */
void
immu_dvma_physmem_update(uint64_t addr, uint64_t size)
{
	uint64_t start;
	uint64_t npages;
	int dcount;
	immu_dcookie_t dcookies[1] = {0};
	domain_t *domain;

	/*
	 * Just walk the system-wide list of domains with
	 * UNITY mapping. Both the list of *all* domains and
	 * the list of *UNITY* domains are protected by the
	 * same single lock.
	 */
	mutex_enter(&immu_domain_lock);
	domain = list_head(&immu_unity_domain_list);
	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
		/*
		 * Nothing to do if the IOMMU supports passthrough.
		 */
		if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap))
			continue;

		/* There is no vmem_arena for unity domains. Just map it */
		ddi_err(DER_LOG, domain->dom_dip,
		    "iommu: unity-domain: Adding map "
		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);

		start = IMMU_ROUNDOWN(addr);
		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;

		dcookies[0].dck_paddr = start;
		dcookies[0].dck_npages = npages;
		dcount = 1;
		(void) dvma_map(domain, start, npages,
		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
	}
	mutex_exit(&immu_domain_lock);
}

int
immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags)
{
	dev_info_t *ddip, *odip;
	immu_t *immu;
	domain_t *domain;

	odip = rdip;

	immu = immu_dvma_get_immu(rdip, immu_flags);
	if (immu == NULL) {
		/*
		 * It is possible that there is no IOMMU unit for this
		 * device - BIOS bugs are one example.
2892 */ 2893 ddi_err(DER_WARN, rdip, "No iommu unit found for device"); 2894 return (DDI_DMA_NORESOURCES); 2895 } 2896 2897 /* 2898 * redirect isa devices attached under lpc to lpc dip 2899 */ 2900 if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) { 2901 rdip = get_lpc_devinfo(immu, rdip, immu_flags); 2902 if (rdip == NULL) { 2903 ddi_err(DER_PANIC, rdip, "iommu redirect failed"); 2904 /*NOTREACHED*/ 2905 } 2906 } 2907 2908 /* Reset immu, as redirection can change IMMU */ 2909 immu = NULL; 2910 2911 /* 2912 * for gart, redirect to the real graphic devinfo 2913 */ 2914 if (strcmp(ddi_node_name(rdip), "agpgart") == 0) { 2915 rdip = get_gfx_devinfo(rdip); 2916 if (rdip == NULL) { 2917 ddi_err(DER_PANIC, rdip, "iommu redirect failed"); 2918 /*NOTREACHED*/ 2919 } 2920 } 2921 2922 /* 2923 * Setup DVMA domain for the device. This does 2924 * work only the first time we do DVMA for a 2925 * device. 2926 */ 2927 ddip = NULL; 2928 domain = device_domain(rdip, &ddip, immu_flags); 2929 if (domain == NULL) { 2930 ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device"); 2931 return (DDI_DMA_NORESOURCES); 2932 } 2933 2934 immu = domain->dom_immu; 2935 2936 /* 2937 * If a domain is found, we must also have a domain dip 2938 * which is the topmost ancestor dip of rdip that shares 2939 * the same domain with rdip. 2940 */ 2941 if (domain->dom_did == 0 || ddip == NULL) { 2942 ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)", 2943 domain->dom_did, ddip); 2944 return (DDI_DMA_NORESOURCES); 2945 } 2946 2947 if (odip != rdip) 2948 set_domain(odip, ddip, domain); 2949 2950 /* 2951 * Update the root and context entries 2952 */ 2953 if (immu_context_update(immu, domain, ddip, rdip, immu_flags) 2954 != DDI_SUCCESS) { 2955 ddi_err(DER_MODE, rdip, "DVMA map: context update failed"); 2956 return (DDI_DMA_NORESOURCES); 2957 } 2958 2959 return (DDI_SUCCESS); 2960 } 2961 2962 int 2963 immu_map_memrange(dev_info_t *rdip, memrng_t *mrng) 2964 { 2965 immu_dcookie_t dcookies[1] = {0}; 2966 boolean_t pde_set; 2967 immu_t *immu; 2968 domain_t *domain; 2969 immu_inv_wait_t iw; 2970 2971 dcookies[0].dck_paddr = mrng->mrng_start; 2972 dcookies[0].dck_npages = mrng->mrng_npages; 2973 2974 domain = IMMU_DEVI(rdip)->imd_domain; 2975 immu = domain->dom_immu; 2976 2977 pde_set = dvma_map(domain, mrng->mrng_start, 2978 mrng->mrng_npages, dcookies, 1, rdip, 2979 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 2980 2981 immu_init_inv_wait(&iw, "memrange", B_TRUE); 2982 2983 immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start, 2984 mrng->mrng_npages, pde_set == B_TRUE ? 2985 TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw); 2986 2987 return (DDI_SUCCESS); 2988 } 2989 2990 immu_devi_t * 2991 immu_devi_get(dev_info_t *rdip) 2992 { 2993 immu_devi_t *immu_devi; 2994 volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu); 2995 2996 /* Just want atomic reads. 
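(atomic_or_64_nv() with a zero mask is simply an atomic 64-bit read of devi_iommu.)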
No need for lock */ 2997 immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr, 2998 0); 2999 return (immu_devi); 3000 } 3001 3002 /*ARGSUSED*/ 3003 int 3004 immu_hdl_priv_ctor(void *buf, void *arg, int kmf) 3005 { 3006 immu_hdl_priv_t *ihp; 3007 3008 ihp = buf; 3009 immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE); 3010 3011 return (0); 3012 } 3013 3014 /* 3015 * iommulib interface functions 3016 */ 3017 static int 3018 immu_probe(iommulib_handle_t handle, dev_info_t *dip) 3019 { 3020 immu_devi_t *immu_devi; 3021 int ret; 3022 3023 if (!immu_enable) 3024 return (DDI_FAILURE); 3025 3026 /* 3027 * Make sure the device has all the IOMMU structures 3028 * initialized. If this device goes through an IOMMU 3029 * unit (e.g. this probe function returns success), 3030 * this will be called at most N times, with N being 3031 * the number of IOMMUs in the system. 3032 * 3033 * After that, when iommulib_nex_open succeeds, 3034 * we can always assume that this device has all 3035 * the structures initialized. IOMMU_USED(dip) will 3036 * be true. There is no need to find the controlling 3037 * IOMMU/domain again. 3038 */ 3039 ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP); 3040 if (ret != DDI_SUCCESS) 3041 return (ret); 3042 3043 immu_devi = IMMU_DEVI(dip); 3044 3045 /* 3046 * For unity domains, there is no need to call in to 3047 * the IOMMU code. 3048 */ 3049 if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID) 3050 return (DDI_FAILURE); 3051 3052 if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle)) 3053 return (DDI_SUCCESS); 3054 3055 return (DDI_FAILURE); 3056 } 3057 3058 /*ARGSUSED*/ 3059 static int 3060 immu_allochdl(iommulib_handle_t handle, 3061 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, 3062 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep) 3063 { 3064 int ret; 3065 immu_hdl_priv_t *ihp; 3066 immu_t *immu; 3067 3068 ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp, 3069 arg, dma_handlep); 3070 if (ret == DDI_SUCCESS) { 3071 immu = IMMU_DEVI(rdip)->imd_immu; 3072 3073 ihp = kmem_cache_alloc(immu->immu_hdl_cache, 3074 waitfp == DDI_DMA_SLEEP ? 
KM_SLEEP : KM_NOSLEEP); 3075 if (ihp == NULL) { 3076 (void) iommulib_iommu_dma_freehdl(dip, rdip, 3077 *dma_handlep); 3078 return (DDI_DMA_NORESOURCES); 3079 } 3080 3081 if (IMMU_DEVI(rdip)->imd_use_premap) 3082 dvma_prealloc(rdip, ihp, attr); 3083 else { 3084 ihp->ihp_npremapped = 0; 3085 ihp->ihp_predvma = 0; 3086 } 3087 ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep, 3088 ihp); 3089 } 3090 return (ret); 3091 } 3092 3093 /*ARGSUSED*/ 3094 static int 3095 immu_freehdl(iommulib_handle_t handle, 3096 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) 3097 { 3098 immu_hdl_priv_t *ihp; 3099 3100 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3101 if (ihp != NULL) { 3102 if (IMMU_DEVI(rdip)->imd_use_premap) 3103 dvma_prefree(rdip, ihp); 3104 kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp); 3105 } 3106 3107 return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle)); 3108 } 3109 3110 3111 /*ARGSUSED*/ 3112 static int 3113 immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, 3114 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 3115 struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep, 3116 uint_t *ccountp) 3117 { 3118 int ret; 3119 immu_hdl_priv_t *ihp; 3120 3121 ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle, 3122 dma_req, cookiep, ccountp); 3123 3124 if (ret == DDI_DMA_MAPPED) { 3125 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3126 immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait); 3127 } 3128 3129 return (ret); 3130 } 3131 3132 /*ARGSUSED*/ 3133 static int 3134 immu_unbindhdl(iommulib_handle_t handle, 3135 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) 3136 { 3137 return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle)); 3138 } 3139 3140 /*ARGSUSED*/ 3141 static int 3142 immu_sync(iommulib_handle_t handle, dev_info_t *dip, 3143 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, 3144 size_t len, uint_t cachefl) 3145 { 3146 return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len, 3147 cachefl)); 3148 } 3149 3150 /*ARGSUSED*/ 3151 static int 3152 immu_win(iommulib_handle_t handle, dev_info_t *dip, 3153 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, 3154 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, 3155 uint_t *ccountp) 3156 { 3157 return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp, 3158 lenp, cookiep, ccountp)); 3159 } 3160 3161 /*ARGSUSED*/ 3162 static int 3163 immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, 3164 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 3165 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao) 3166 { 3167 immu_hdl_priv_t *ihp; 3168 3169 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3170 3171 return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao)); 3172 } 3173 3174 /*ARGSUSED*/ 3175 static int 3176 immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, 3177 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao) 3178 { 3179 immu_hdl_priv_t *ihp; 3180 3181 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3182 if (ihp->ihp_npremapped > 0) 3183 return (DDI_SUCCESS); 3184 return (immu_unmap_dvmaseg(rdip, dmao)); 3185 } 3186 3187 /*ARGSUSED*/ 3188 static int 3189 immu_map(iommulib_handle_t handle, dev_info_t *dip, 3190 dev_info_t *rdip, struct ddi_dma_req *dmareq, 3191 ddi_dma_handle_t *dma_handle) 3192 { 3193 ASSERT(0); 3194 return (DDI_FAILURE); 3195 } 3196 3197 /*ARGSUSED*/ 3198 static int 3199 immu_mctl(iommulib_handle_t handle, dev_info_t 
*dip, 3200 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 3201 enum ddi_dma_ctlops request, off_t *offp, size_t *lenp, 3202 caddr_t *objpp, uint_t cachefl) 3203 { 3204 ASSERT(0); 3205 return (DDI_FAILURE); 3206 } 3207
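
#ifdef TEST
/*
 * Minimal illustrative sketch only, kept under the pre-existing TEST guard
 * (which is #undef'd at the top of this file, so it is compiled out): it
 * dumps the per-level page-table indices that xlate_setup() derives from a
 * DVMA, i.e. the same decomposition that dvma_map()/dvma_unmap() rely on.
 * The function name is chosen here purely for illustration.
 */
static void
immu_dvma_show_xlate(immu_t *immu, uint64_t dvma)
{
	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
	int nlevels = immu->immu_dvma_nlevels;
	int level;

	xlate_setup(dvma, xlate, nlevels);
	for (level = nlevels; level >= 1; level--) {
		/* level 1 is the leaf; higher levels are PDEs/PDPs */
		ddi_err(DER_LOG, NULL,
		    "dvma %" PRIx64 ": level %d index %u",
		    dvma, level, xlate[level].xlt_idx);
	}
}
#endif /* TEST */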