1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Portions Copyright (c) 2010, Oracle and/or its affiliates. 23 * All rights reserved. 24 */ 25 /* 26 * Copyright (c) 2009, Intel Corporation. 27 * All rights reserved. 28 */ 29 /* 30 * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. 31 */ 32 33 /* 34 * DVMA code 35 * This file contains Intel IOMMU code that deals with DVMA 36 * i.e. DMA remapping. 37 */ 38 39 #include <sys/sysmacros.h> 40 #include <sys/pcie.h> 41 #include <sys/pci_cfgspace.h> 42 #include <vm/hat_i86.h> 43 #include <sys/memlist.h> 44 #include <sys/acpi/acpi.h> 45 #include <sys/acpica.h> 46 #include <sys/modhash.h> 47 #include <sys/immu.h> 48 #include <sys/x86_archext.h> 49 #include <sys/archsystm.h> 50 51 #undef TEST 52 53 /* 54 * Macros based on PCI spec 55 */ 56 #define IMMU_PCI_REV2CLASS(r) ((r) >> 8) /* classcode from revid */ 57 #define IMMU_PCI_CLASS2BASE(c) ((c) >> 16) /* baseclass from classcode */ 58 #define IMMU_PCI_CLASS2SUB(c) (((c) >> 8) & 0xff); /* classcode */ 59 60 #define IMMU_CONTIG_PADDR(d, p) \ 61 ((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p)) 62 63 typedef struct dvma_arg { 64 immu_t *dva_immu; 65 dev_info_t *dva_rdip; 66 dev_info_t *dva_ddip; 67 domain_t *dva_domain; 68 int dva_level; 69 immu_flags_t dva_flags; 70 list_t *dva_list; 71 int dva_error; 72 } dvma_arg_t; 73 74 static domain_t *domain_create(immu_t *immu, dev_info_t *ddip, 75 dev_info_t *rdip, immu_flags_t immu_flags); 76 static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus, 77 int dev, int func, immu_flags_t immu_flags); 78 static void destroy_immu_devi(immu_devi_t *immu_devi); 79 static boolean_t dvma_map(domain_t *domain, uint64_t sdvma, 80 uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, 81 immu_flags_t immu_flags); 82 83 /* Extern globals */ 84 extern struct memlist *phys_install; 85 86 /* 87 * iommulib interface functions. 
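 * These implement the DMA operations that the iommulib framework
 * dispatches to this driver through the immulib_ops table defined
 * below: probe, handle alloc/free, bind/unbind, sync, window
 * selection and DMA object map/unmap.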
88 */ 89 static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip); 90 static int immu_allochdl(iommulib_handle_t handle, 91 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, 92 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep); 93 static int immu_freehdl(iommulib_handle_t handle, 94 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); 95 static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, 96 dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req, 97 ddi_dma_cookie_t *cookiep, uint_t *ccountp); 98 static int immu_unbindhdl(iommulib_handle_t handle, 99 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); 100 static int immu_sync(iommulib_handle_t handle, dev_info_t *dip, 101 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len, 102 uint_t cachefl); 103 static int immu_win(iommulib_handle_t handle, dev_info_t *dip, 104 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, 105 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp); 106 static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, 107 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 108 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao); 109 static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, 110 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao); 111 112 /* static Globals */ 113 114 /* 115 * Used to setup DMA objects (memory regions) 116 * for DMA reads by IOMMU units 117 */ 118 static ddi_dma_attr_t immu_dma_attr = { 119 DMA_ATTR_V0, 120 0U, 121 0xffffffffffffffffULL, 122 0xffffffffU, 123 MMU_PAGESIZE, /* MMU page aligned */ 124 0x1, 125 0x1, 126 0xffffffffU, 127 0xffffffffffffffffULL, 128 1, 129 4, 130 0 131 }; 132 133 static ddi_device_acc_attr_t immu_acc_attr = { 134 DDI_DEVICE_ATTR_V0, 135 DDI_NEVERSWAP_ACC, 136 DDI_STRICTORDER_ACC 137 }; 138 139 struct iommulib_ops immulib_ops = { 140 IOMMU_OPS_VERSION, 141 INTEL_IOMMU, 142 "Intel IOMMU", 143 NULL, 144 immu_probe, 145 immu_allochdl, 146 immu_freehdl, 147 immu_bindhdl, 148 immu_unbindhdl, 149 immu_sync, 150 immu_win, 151 immu_mapobject, 152 immu_unmapobject, 153 }; 154 155 /* 156 * Fake physical address range used to set up initial prealloc mappings. 157 * This memory is never actually accessed. It is mapped read-only, 158 * and is overwritten as soon as the first DMA bind operation is 159 * performed. Since 0 is a special case, just start at the 2nd 160 * physical page. 161 */ 162 163 static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES }; 164 165 /* globals private to this file */ 166 static kmutex_t immu_domain_lock; 167 static list_t immu_unity_domain_list; 168 static list_t immu_xlate_domain_list; 169 170 /* structure used to store idx into each level of the page tables */ 171 typedef struct xlate { 172 int xlt_level; 173 uint_t xlt_idx; 174 pgtable_t *xlt_pgtable; 175 } xlate_t; 176 177 /* 0 is reserved by Vt-d spec. 
Solaris reserves 1 */ 178 #define IMMU_UNITY_DID 1 179 180 static mod_hash_t *bdf_domain_hash; 181 182 int immu_use_alh; 183 int immu_use_tm; 184 185 static domain_t * 186 bdf_domain_lookup(immu_devi_t *immu_devi) 187 { 188 domain_t *domain; 189 int16_t seg = immu_devi->imd_seg; 190 int16_t bus = immu_devi->imd_bus; 191 int16_t devfunc = immu_devi->imd_devfunc; 192 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc); 193 194 if (seg < 0 || bus < 0 || devfunc < 0) { 195 return (NULL); 196 } 197 198 domain = NULL; 199 if (mod_hash_find(bdf_domain_hash, 200 (void *)bdf, (void *)&domain) == 0) { 201 ASSERT(domain); 202 ASSERT(domain->dom_did > 0); 203 return (domain); 204 } else { 205 return (NULL); 206 } 207 } 208 209 static void 210 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain) 211 { 212 int16_t seg = immu_devi->imd_seg; 213 int16_t bus = immu_devi->imd_bus; 214 int16_t devfunc = immu_devi->imd_devfunc; 215 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc); 216 217 if (seg < 0 || bus < 0 || devfunc < 0) { 218 return; 219 } 220 221 (void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain); 222 } 223 224 static int 225 match_lpc(dev_info_t *pdip, void *arg) 226 { 227 immu_devi_t *immu_devi; 228 dvma_arg_t *dvap = (dvma_arg_t *)arg; 229 230 if (list_is_empty(dvap->dva_list)) { 231 return (DDI_WALK_TERMINATE); 232 } 233 234 immu_devi = list_head(dvap->dva_list); 235 for (; immu_devi; immu_devi = list_next(dvap->dva_list, 236 immu_devi)) { 237 if (immu_devi->imd_dip == pdip) { 238 dvap->dva_ddip = pdip; 239 dvap->dva_error = DDI_SUCCESS; 240 return (DDI_WALK_TERMINATE); 241 } 242 } 243 244 return (DDI_WALK_CONTINUE); 245 } 246 247 static void 248 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu) 249 { 250 list_t *spclist = NULL; 251 immu_devi_t *immu_devi; 252 253 immu_devi = IMMU_DEVI(dip); 254 if (immu_devi->imd_display == B_TRUE) { 255 spclist = &(immu->immu_dvma_gfx_list); 256 } else if (immu_devi->imd_lpc == B_TRUE) { 257 spclist = &(immu->immu_dvma_lpc_list); 258 } 259 260 if (spclist) { 261 mutex_enter(&(immu->immu_lock)); 262 list_insert_head(spclist, immu_devi); 263 mutex_exit(&(immu->immu_lock)); 264 } 265 } 266 267 /* 268 * Set the immu_devi struct in the immu_devi field of a devinfo node 269 */ 270 int 271 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags) 272 { 273 int bus, dev, func; 274 immu_devi_t *new_imd; 275 immu_devi_t *immu_devi; 276 277 immu_devi = immu_devi_get(dip); 278 if (immu_devi != NULL) { 279 return (DDI_SUCCESS); 280 } 281 282 bus = dev = func = -1; 283 284 /* 285 * Assume a new immu_devi struct is needed 286 */ 287 if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) { 288 /* 289 * No BDF. Set bus = -1 to indicate this. 290 * We still need to create a immu_devi struct 291 * though 292 */ 293 bus = -1; 294 dev = 0; 295 func = 0; 296 } 297 298 new_imd = create_immu_devi(dip, bus, dev, func, immu_flags); 299 if (new_imd == NULL) { 300 ddi_err(DER_WARN, dip, "Failed to create immu_devi " 301 "structure"); 302 return (DDI_FAILURE); 303 } 304 305 /* 306 * Check if some other thread allocated a immu_devi while we 307 * didn't own the lock. 
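 * The test and set below are done under devi_lock; if another thread
 * won the race we keep its immu_devi and free the copy created above.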
308 */ 309 mutex_enter(&(DEVI(dip)->devi_lock)); 310 if (IMMU_DEVI(dip) == NULL) { 311 IMMU_DEVI_SET(dip, new_imd); 312 } else { 313 destroy_immu_devi(new_imd); 314 } 315 mutex_exit(&(DEVI(dip)->devi_lock)); 316 317 return (DDI_SUCCESS); 318 } 319 320 static dev_info_t * 321 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags) 322 { 323 dvma_arg_t dvarg = {0}; 324 dvarg.dva_list = &(immu->immu_dvma_lpc_list); 325 dvarg.dva_rdip = rdip; 326 dvarg.dva_error = DDI_FAILURE; 327 328 if (immu_walk_ancestor(rdip, NULL, match_lpc, 329 &dvarg, NULL, immu_flags) != DDI_SUCCESS) { 330 ddi_err(DER_MODE, rdip, "Could not walk ancestors to " 331 "find lpc_devinfo for ISA device"); 332 return (NULL); 333 } 334 335 if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) { 336 ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for " 337 "ISA device"); 338 return (NULL); 339 } 340 341 return (dvarg.dva_ddip); 342 } 343 344 static dev_info_t * 345 get_gfx_devinfo(dev_info_t *rdip) 346 { 347 immu_t *immu; 348 immu_devi_t *immu_devi; 349 list_t *list_gfx; 350 351 /* 352 * The GFX device may not be on the same iommu unit as "agpgart" 353 * so search globally 354 */ 355 immu_devi = NULL; 356 immu = list_head(&immu_list); 357 for (; immu; immu = list_next(&immu_list, immu)) { 358 list_gfx = &(immu->immu_dvma_gfx_list); 359 if (!list_is_empty(list_gfx)) { 360 immu_devi = list_head(list_gfx); 361 break; 362 } 363 } 364 365 if (immu_devi == NULL) { 366 ddi_err(DER_WARN, rdip, "iommu: No GFX device. " 367 "Cannot redirect agpgart"); 368 return (NULL); 369 } 370 371 ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s", 372 ddi_node_name(immu_devi->imd_dip)); 373 374 return (immu_devi->imd_dip); 375 } 376 377 static immu_flags_t 378 dma_to_immu_flags(struct ddi_dma_req *dmareq) 379 { 380 immu_flags_t flags = 0; 381 382 if (dmareq->dmar_fp == DDI_DMA_SLEEP) { 383 flags |= IMMU_FLAGS_SLEEP; 384 } else { 385 flags |= IMMU_FLAGS_NOSLEEP; 386 } 387 388 #ifdef BUGGY_DRIVERS 389 390 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 391 392 #else 393 /* 394 * Read and write flags need to be reversed. 395 * DMA_READ means read from device and write 396 * to memory. So DMA read means DVMA write. 397 */ 398 if (dmareq->dmar_flags & DDI_DMA_READ) 399 flags |= IMMU_FLAGS_WRITE; 400 401 if (dmareq->dmar_flags & DDI_DMA_WRITE) 402 flags |= IMMU_FLAGS_READ; 403 404 /* 405 * Some buggy drivers specify neither READ or WRITE 406 * For such drivers set both read and write permissions 407 */ 408 if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) { 409 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 410 } 411 #endif 412 413 return (flags); 414 } 415 416 /*ARGSUSED*/ 417 int 418 pgtable_ctor(void *buf, void *arg, int kmflag) 419 { 420 size_t actual_size = 0; 421 pgtable_t *pgtable; 422 int (*dmafp)(caddr_t); 423 caddr_t vaddr; 424 void *next; 425 uint_t flags; 426 immu_t *immu = arg; 427 428 pgtable = (pgtable_t *)buf; 429 430 dmafp = (kmflag & KM_NOSLEEP) ? 
DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; 431 432 next = kmem_zalloc(IMMU_PAGESIZE, kmflag); 433 if (next == NULL) { 434 return (-1); 435 } 436 437 if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr, 438 dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) { 439 kmem_free(next, IMMU_PAGESIZE); 440 return (-1); 441 } 442 443 flags = DDI_DMA_CONSISTENT; 444 if (!immu->immu_dvma_coherent) 445 flags |= IOMEM_DATA_UC_WR_COMBINE; 446 447 if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE, 448 &immu_acc_attr, flags, 449 dmafp, NULL, &vaddr, &actual_size, 450 &pgtable->hwpg_memhdl) != DDI_SUCCESS) { 451 ddi_dma_free_handle(&pgtable->hwpg_dmahdl); 452 kmem_free(next, IMMU_PAGESIZE); 453 return (-1); 454 } 455 456 /* 457 * Memory allocation failure. Maybe a temporary condition 458 * so return error rather than panic, so we can try again 459 */ 460 if (actual_size < IMMU_PAGESIZE) { 461 ddi_dma_mem_free(&pgtable->hwpg_memhdl); 462 ddi_dma_free_handle(&pgtable->hwpg_dmahdl); 463 kmem_free(next, IMMU_PAGESIZE); 464 return (-1); 465 } 466 467 pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr)); 468 pgtable->hwpg_vaddr = vaddr; 469 pgtable->swpg_next_array = next; 470 471 rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL); 472 473 return (0); 474 } 475 476 /*ARGSUSED*/ 477 void 478 pgtable_dtor(void *buf, void *arg) 479 { 480 pgtable_t *pgtable; 481 482 pgtable = (pgtable_t *)buf; 483 484 /* destroy will panic if lock is held. */ 485 rw_destroy(&(pgtable->swpg_rwlock)); 486 487 ddi_dma_mem_free(&pgtable->hwpg_memhdl); 488 ddi_dma_free_handle(&pgtable->hwpg_dmahdl); 489 kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE); 490 } 491 492 /* 493 * pgtable_alloc() 494 * alloc a IOMMU pgtable structure. 495 * This same struct is used for root and context tables as well. 496 * This routine allocs the f/ollowing: 497 * - a pgtable_t struct 498 * - a HW page which holds PTEs/entries which is accesssed by HW 499 * so we set up DMA for this page 500 * - a SW page which is only for our bookeeping 501 * (for example to hold pointers to the next level pgtable). 502 * So a simple kmem_alloc suffices 503 */ 504 static pgtable_t * 505 pgtable_alloc(immu_t *immu, immu_flags_t immu_flags) 506 { 507 pgtable_t *pgtable; 508 int kmflags; 509 510 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? 
KM_NOSLEEP : KM_SLEEP; 511 512 pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags); 513 if (pgtable == NULL) { 514 return (NULL); 515 } 516 return (pgtable); 517 } 518 519 static void 520 pgtable_zero(pgtable_t *pgtable) 521 { 522 bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE); 523 bzero(pgtable->swpg_next_array, IMMU_PAGESIZE); 524 } 525 526 static void 527 pgtable_free(immu_t *immu, pgtable_t *pgtable) 528 { 529 kmem_cache_free(immu->immu_pgtable_cache, pgtable); 530 } 531 532 /* 533 * Function to identify a display device from the PCI class code 534 */ 535 static boolean_t 536 device_is_display(uint_t classcode) 537 { 538 static uint_t disp_classes[] = { 539 0x000100, 540 0x030000, 541 0x030001 542 }; 543 int i, nclasses = sizeof (disp_classes) / sizeof (uint_t); 544 545 for (i = 0; i < nclasses; i++) { 546 if (classcode == disp_classes[i]) 547 return (B_TRUE); 548 } 549 return (B_FALSE); 550 } 551 552 /* 553 * Function that determines if device is PCIEX and/or PCIEX bridge 554 */ 555 static boolean_t 556 device_is_pciex( 557 uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib) 558 { 559 ushort_t cap; 560 ushort_t capsp; 561 ushort_t cap_count = PCI_CAP_MAX_PTR; 562 ushort_t status; 563 boolean_t is_pciex = B_FALSE; 564 565 *is_pcib = B_FALSE; 566 567 status = pci_getw_func(bus, dev, func, PCI_CONF_STAT); 568 if (!(status & PCI_STAT_CAP)) 569 return (B_FALSE); 570 571 capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR); 572 while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) { 573 capsp &= PCI_CAP_PTR_MASK; 574 cap = pci_getb_func(bus, dev, func, capsp); 575 576 if (cap == PCI_CAP_ID_PCI_E) { 577 status = pci_getw_func(bus, dev, func, capsp + 2); 578 /* 579 * See section 7.8.2 of PCI-Express Base Spec v1.0a 580 * for Device/Port Type. 581 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the 582 * device is a PCIE2PCI bridge 583 */ 584 *is_pcib = 585 ((status & PCIE_PCIECAP_DEV_TYPE_MASK) == 586 PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE; 587 is_pciex = B_TRUE; 588 } 589 590 capsp = (*pci_getb_func)(bus, dev, func, 591 capsp + PCI_CAP_NEXT_PTR); 592 } 593 594 return (is_pciex); 595 } 596 597 static boolean_t 598 device_use_premap(uint_t classcode) 599 { 600 if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET) 601 return (B_TRUE); 602 return (B_FALSE); 603 } 604 605 606 /* 607 * immu_dvma_get_immu() 608 * get the immu unit structure for a dev_info node 609 */ 610 immu_t * 611 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags) 612 { 613 immu_devi_t *immu_devi; 614 immu_t *immu; 615 616 /* 617 * check if immu unit was already found earlier. 618 * If yes, then it will be stashed in immu_devi struct. 619 */ 620 immu_devi = immu_devi_get(dip); 621 if (immu_devi == NULL) { 622 if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) { 623 /* 624 * May fail because of low memory. 
Return error rather 625 * than panic as we want driver to rey again later 626 */ 627 ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: " 628 "No immu_devi structure"); 629 /*NOTREACHED*/ 630 } 631 immu_devi = immu_devi_get(dip); 632 } 633 634 mutex_enter(&(DEVI(dip)->devi_lock)); 635 if (immu_devi->imd_immu) { 636 immu = immu_devi->imd_immu; 637 mutex_exit(&(DEVI(dip)->devi_lock)); 638 return (immu); 639 } 640 mutex_exit(&(DEVI(dip)->devi_lock)); 641 642 immu = immu_dmar_get_immu(dip); 643 if (immu == NULL) { 644 ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: " 645 "Cannot find immu_t for device"); 646 /*NOTREACHED*/ 647 } 648 649 /* 650 * Check if some other thread found immu 651 * while lock was not held 652 */ 653 immu_devi = immu_devi_get(dip); 654 /* immu_devi should be present as we found it earlier */ 655 if (immu_devi == NULL) { 656 ddi_err(DER_PANIC, dip, 657 "immu_dvma_get_immu: No immu_devi structure"); 658 /*NOTREACHED*/ 659 } 660 661 mutex_enter(&(DEVI(dip)->devi_lock)); 662 if (immu_devi->imd_immu == NULL) { 663 /* nobody else set it, so we should do it */ 664 immu_devi->imd_immu = immu; 665 immu_devi_set_spclist(dip, immu); 666 } else { 667 /* 668 * if some other thread got immu before 669 * us, it should get the same results 670 */ 671 if (immu_devi->imd_immu != immu) { 672 ddi_err(DER_PANIC, dip, "Multiple " 673 "immu units found for device. Expected (%p), " 674 "actual (%p)", (void *)immu, 675 (void *)immu_devi->imd_immu); 676 mutex_exit(&(DEVI(dip)->devi_lock)); 677 /*NOTREACHED*/ 678 } 679 } 680 mutex_exit(&(DEVI(dip)->devi_lock)); 681 682 return (immu); 683 } 684 685 686 /* ############################# IMMU_DEVI code ############################ */ 687 688 /* 689 * Allocate a immu_devi structure and initialize it 690 */ 691 static immu_devi_t * 692 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func, 693 immu_flags_t immu_flags) 694 { 695 uchar_t baseclass, subclass; 696 uint_t classcode, revclass; 697 immu_devi_t *immu_devi; 698 boolean_t pciex = B_FALSE; 699 int kmflags; 700 boolean_t is_pcib = B_FALSE; 701 702 /* bus == -1 indicate non-PCI device (no BDF) */ 703 ASSERT(bus == -1 || bus >= 0); 704 ASSERT(dev >= 0); 705 ASSERT(func >= 0); 706 707 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? 
KM_NOSLEEP : KM_SLEEP; 708 immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags); 709 if (immu_devi == NULL) { 710 ddi_err(DER_WARN, rdip, "Failed to allocate memory for " 711 "Intel IOMMU immu_devi structure"); 712 return (NULL); 713 } 714 immu_devi->imd_dip = rdip; 715 immu_devi->imd_seg = 0; /* Currently seg can only be 0 */ 716 immu_devi->imd_bus = bus; 717 immu_devi->imd_pcib_type = IMMU_PCIB_BAD; 718 719 if (bus == -1) { 720 immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF; 721 return (immu_devi); 722 } 723 724 immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func); 725 immu_devi->imd_sec = 0; 726 immu_devi->imd_sub = 0; 727 728 revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID); 729 730 classcode = IMMU_PCI_REV2CLASS(revclass); 731 baseclass = IMMU_PCI_CLASS2BASE(classcode); 732 subclass = IMMU_PCI_CLASS2SUB(classcode); 733 734 if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) { 735 736 immu_devi->imd_sec = pci_getb_func(bus, dev, func, 737 PCI_BCNF_SECBUS); 738 immu_devi->imd_sub = pci_getb_func(bus, dev, func, 739 PCI_BCNF_SUBBUS); 740 741 pciex = device_is_pciex(bus, dev, func, &is_pcib); 742 if (pciex == B_TRUE && is_pcib == B_TRUE) { 743 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI; 744 } else if (pciex == B_TRUE) { 745 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE; 746 } else { 747 immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI; 748 } 749 } else { 750 immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT; 751 } 752 753 /* check for certain special devices */ 754 immu_devi->imd_display = device_is_display(classcode); 755 immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) && 756 (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE; 757 immu_devi->imd_use_premap = device_use_premap(classcode); 758 759 immu_devi->imd_domain = NULL; 760 761 immu_devi->imd_dvma_flags = immu_global_dvma_flags; 762 763 return (immu_devi); 764 } 765 766 static void 767 destroy_immu_devi(immu_devi_t *immu_devi) 768 { 769 kmem_free(immu_devi, sizeof (immu_devi_t)); 770 } 771 772 static domain_t * 773 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp) 774 { 775 immu_devi_t *immu_devi; 776 domain_t *domain; 777 dev_info_t *ddip; 778 779 *ddipp = NULL; 780 781 immu_devi = immu_devi_get(rdip); 782 if (immu_devi == NULL) { 783 return (NULL); 784 } 785 786 mutex_enter(&(DEVI(rdip)->devi_lock)); 787 domain = immu_devi->imd_domain; 788 ddip = immu_devi->imd_ddip; 789 mutex_exit(&(DEVI(rdip)->devi_lock)); 790 791 if (domain) 792 *ddipp = ddip; 793 794 return (domain); 795 796 } 797 798 /* ############################# END IMMU_DEVI code ######################## */ 799 /* ############################# DOMAIN code ############################### */ 800 801 /* 802 * This routine always succeeds 803 */ 804 static int 805 did_alloc(immu_t *immu, dev_info_t *rdip, 806 dev_info_t *ddip, immu_flags_t immu_flags) 807 { 808 int did; 809 810 did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1, 811 (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP); 812 813 if (did == 0) { 814 ddi_err(DER_WARN, rdip, "device domain-id alloc error" 815 " domain-device: %s%d. immu unit is %s. 
Using " 816 "unity domain with domain-id (%d)", 817 ddi_driver_name(ddip), ddi_get_instance(ddip), 818 immu->immu_name, immu->immu_unity_domain->dom_did); 819 did = immu->immu_unity_domain->dom_did; 820 } 821 822 return (did); 823 } 824 825 static int 826 get_branch_domain(dev_info_t *pdip, void *arg) 827 { 828 immu_devi_t *immu_devi; 829 domain_t *domain; 830 dev_info_t *ddip; 831 immu_t *immu; 832 dvma_arg_t *dvp = (dvma_arg_t *)arg; 833 834 /* 835 * The field dvp->dva_rdip is a work-in-progress 836 * and gets updated as we walk up the ancestor 837 * tree. The final ddip is set only when we reach 838 * the top of the tree. So the dvp->dva_ddip field cannot 839 * be relied on until we reach the top of the field. 840 */ 841 842 /* immu_devi may not be set. */ 843 immu_devi = immu_devi_get(pdip); 844 if (immu_devi == NULL) { 845 if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) { 846 dvp->dva_error = DDI_FAILURE; 847 return (DDI_WALK_TERMINATE); 848 } 849 } 850 851 immu_devi = immu_devi_get(pdip); 852 immu = immu_devi->imd_immu; 853 if (immu == NULL) 854 immu = immu_dvma_get_immu(pdip, dvp->dva_flags); 855 856 /* 857 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to 858 * terminate the walk (since the device under the PCIE bridge 859 * is a PCIE device and has an independent entry in the 860 * root/context table) 861 */ 862 if (dvp->dva_rdip != pdip && 863 immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) { 864 return (DDI_WALK_TERMINATE); 865 } 866 867 /* 868 * In order to be a domain-dim, it must be a PCI device i.e. 869 * must have valid BDF. This also eliminates the root complex. 870 */ 871 if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD && 872 immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) { 873 ASSERT(immu_devi->imd_bus >= 0); 874 ASSERT(immu_devi->imd_devfunc >= 0); 875 dvp->dva_ddip = pdip; 876 } 877 878 if (immu_devi->imd_display == B_TRUE || 879 (dvp->dva_flags & IMMU_FLAGS_UNITY)) { 880 dvp->dva_domain = immu->immu_unity_domain; 881 /* continue walking to find ddip */ 882 return (DDI_WALK_CONTINUE); 883 } 884 885 mutex_enter(&(DEVI(pdip)->devi_lock)); 886 domain = immu_devi->imd_domain; 887 ddip = immu_devi->imd_ddip; 888 mutex_exit(&(DEVI(pdip)->devi_lock)); 889 890 if (domain && ddip) { 891 /* if domain is set, it must be the same */ 892 if (dvp->dva_domain) { 893 ASSERT(domain == dvp->dva_domain); 894 } 895 dvp->dva_domain = domain; 896 dvp->dva_ddip = ddip; 897 return (DDI_WALK_TERMINATE); 898 } 899 900 /* Domain may already be set, continue walking so that ddip gets set */ 901 if (dvp->dva_domain) { 902 return (DDI_WALK_CONTINUE); 903 } 904 905 /* domain is not set in either immu_devi or dvp */ 906 domain = bdf_domain_lookup(immu_devi); 907 if (domain == NULL) { 908 return (DDI_WALK_CONTINUE); 909 } 910 911 /* ok, the BDF hash had a domain for this BDF. */ 912 913 /* Grab lock again to check if something else set immu_devi fields */ 914 mutex_enter(&(DEVI(pdip)->devi_lock)); 915 if (immu_devi->imd_domain != NULL) { 916 dvp->dva_domain = domain; 917 } else { 918 dvp->dva_domain = domain; 919 } 920 mutex_exit(&(DEVI(pdip)->devi_lock)); 921 922 /* 923 * walk upwards until the topmost PCI bridge is found 924 */ 925 return (DDI_WALK_CONTINUE); 926 927 } 928 929 static void 930 map_unity_domain(domain_t *domain) 931 { 932 struct memlist *mp; 933 uint64_t start; 934 uint64_t npages; 935 immu_dcookie_t dcookies[1] = {0}; 936 int dcount = 0; 937 938 /* 939 * UNITY arenas are a mirror of the physical memory 940 * installed on the system. 
941 */ 942 943 #ifdef BUGGY_DRIVERS 944 /* 945 * Dont skip page0. Some broken HW/FW access it. 946 */ 947 dcookies[0].dck_paddr = 0; 948 dcookies[0].dck_npages = 1; 949 dcount = 1; 950 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, 951 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); 952 #endif 953 954 memlist_read_lock(); 955 956 mp = phys_install; 957 958 if (mp->ml_address == 0) { 959 /* since we already mapped page1 above */ 960 start = IMMU_PAGESIZE; 961 } else { 962 start = mp->ml_address; 963 } 964 npages = mp->ml_size/IMMU_PAGESIZE + 1; 965 966 dcookies[0].dck_paddr = start; 967 dcookies[0].dck_npages = npages; 968 dcount = 1; 969 (void) dvma_map(domain, start, npages, dcookies, 970 dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 971 972 ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64 973 " - 0x%" PRIx64 "]", start, start + mp->ml_size); 974 975 mp = mp->ml_next; 976 while (mp) { 977 ddi_err(DER_LOG, domain->dom_dip, 978 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", 979 mp->ml_address, mp->ml_address + mp->ml_size); 980 981 start = mp->ml_address; 982 npages = mp->ml_size/IMMU_PAGESIZE + 1; 983 984 dcookies[0].dck_paddr = start; 985 dcookies[0].dck_npages = npages; 986 dcount = 1; 987 (void) dvma_map(domain, start, npages, 988 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 989 mp = mp->ml_next; 990 } 991 992 mp = bios_rsvd; 993 while (mp) { 994 ddi_err(DER_LOG, domain->dom_dip, 995 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", 996 mp->ml_address, mp->ml_address + mp->ml_size); 997 998 start = mp->ml_address; 999 npages = mp->ml_size/IMMU_PAGESIZE + 1; 1000 1001 dcookies[0].dck_paddr = start; 1002 dcookies[0].dck_npages = npages; 1003 dcount = 1; 1004 (void) dvma_map(domain, start, npages, 1005 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 1006 1007 mp = mp->ml_next; 1008 } 1009 1010 memlist_read_unlock(); 1011 } 1012 1013 /* 1014 * create_xlate_arena() 1015 * Create the dvma arena for a domain with translation 1016 * mapping 1017 */ 1018 static void 1019 create_xlate_arena(immu_t *immu, domain_t *domain, 1020 dev_info_t *rdip, immu_flags_t immu_flags) 1021 { 1022 char *arena_name; 1023 struct memlist *mp; 1024 int vmem_flags; 1025 uint64_t start; 1026 uint_t mgaw; 1027 uint64_t size; 1028 uint64_t maxaddr; 1029 void *vmem_ret; 1030 1031 arena_name = domain->dom_dvma_arena_name; 1032 1033 /* Note, don't do sizeof (arena_name) - it is just a pointer */ 1034 (void) snprintf(arena_name, 1035 sizeof (domain->dom_dvma_arena_name), 1036 "%s-domain-%d-xlate-DVMA-arena", immu->immu_name, 1037 domain->dom_did); 1038 1039 vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? 
VM_NOSLEEP : VM_SLEEP; 1040 1041 /* Restrict mgaddr (max guest addr) to MGAW */ 1042 mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap); 1043 1044 /* 1045 * To ensure we avoid ioapic and PCI MMIO ranges we just 1046 * use the physical memory address range of the system as the 1047 * range 1048 */ 1049 maxaddr = ((uint64_t)1 << mgaw); 1050 1051 memlist_read_lock(); 1052 1053 mp = phys_install; 1054 1055 if (mp->ml_address == 0) 1056 start = MMU_PAGESIZE; 1057 else 1058 start = mp->ml_address; 1059 1060 if (start + mp->ml_size > maxaddr) 1061 size = maxaddr - start; 1062 else 1063 size = mp->ml_size; 1064 1065 ddi_err(DER_VERB, rdip, 1066 "iommu: %s: Creating dvma vmem arena [0x%" PRIx64 1067 " - 0x%" PRIx64 "]", arena_name, start, start + size); 1068 1069 /* 1070 * We always allocate in quanta of IMMU_PAGESIZE 1071 */ 1072 domain->dom_dvma_arena = vmem_create(arena_name, 1073 (void *)(uintptr_t)start, /* start addr */ 1074 size, /* size */ 1075 IMMU_PAGESIZE, /* quantum */ 1076 NULL, /* afunc */ 1077 NULL, /* ffunc */ 1078 NULL, /* source */ 1079 0, /* qcache_max */ 1080 vmem_flags); 1081 1082 if (domain->dom_dvma_arena == NULL) { 1083 ddi_err(DER_PANIC, rdip, 1084 "Failed to allocate DVMA arena(%s) " 1085 "for domain ID (%d)", arena_name, domain->dom_did); 1086 /*NOTREACHED*/ 1087 } 1088 1089 mp = mp->ml_next; 1090 while (mp) { 1091 1092 if (mp->ml_address == 0) 1093 start = MMU_PAGESIZE; 1094 else 1095 start = mp->ml_address; 1096 1097 if (start + mp->ml_size > maxaddr) 1098 size = maxaddr - start; 1099 else 1100 size = mp->ml_size; 1101 1102 ddi_err(DER_VERB, rdip, 1103 "iommu: %s: Adding dvma vmem span [0x%" PRIx64 1104 " - 0x%" PRIx64 "]", arena_name, start, 1105 start + size); 1106 1107 vmem_ret = vmem_add(domain->dom_dvma_arena, 1108 (void *)(uintptr_t)start, size, vmem_flags); 1109 1110 if (vmem_ret == NULL) { 1111 ddi_err(DER_PANIC, rdip, 1112 "Failed to allocate DVMA arena(%s) " 1113 "for domain ID (%d)", 1114 arena_name, domain->dom_did); 1115 /*NOTREACHED*/ 1116 } 1117 mp = mp->ml_next; 1118 } 1119 memlist_read_unlock(); 1120 } 1121 1122 /* ################################### DOMAIN CODE ######################### */ 1123 1124 /* 1125 * Set the domain and domain-dip for a dip 1126 */ 1127 static void 1128 set_domain( 1129 dev_info_t *dip, 1130 dev_info_t *ddip, 1131 domain_t *domain) 1132 { 1133 immu_devi_t *immu_devi; 1134 domain_t *fdomain; 1135 dev_info_t *fddip; 1136 1137 immu_devi = immu_devi_get(dip); 1138 1139 mutex_enter(&(DEVI(dip)->devi_lock)); 1140 fddip = immu_devi->imd_ddip; 1141 fdomain = immu_devi->imd_domain; 1142 1143 if (fddip) { 1144 ASSERT(fddip == ddip); 1145 } else { 1146 immu_devi->imd_ddip = ddip; 1147 } 1148 1149 if (fdomain) { 1150 ASSERT(fdomain == domain); 1151 } else { 1152 immu_devi->imd_domain = domain; 1153 } 1154 mutex_exit(&(DEVI(dip)->devi_lock)); 1155 } 1156 1157 /* 1158 * device_domain() 1159 * Get domain for a device. The domain may be global in which case it 1160 * is shared between all IOMMU units. Due to potential AGAW differences 1161 * between IOMMU units, such global domains *have to be* UNITY mapping 1162 * domains. Alternatively, the domain may be local to a IOMMU unit. 1163 * Local domains may be shared or immu_devi, although the 1164 * scope of sharing 1165 * is restricted to devices controlled by the IOMMU unit to 1166 * which the domain 1167 * belongs. If shared, they (currently) have to be UNITY domains. If 1168 * immu_devi a domain may be either UNITY or translation (XLATE) domain. 
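 *
 * Roughly, the lookup below proceeds as follows (illustrative sketch
 * only, not the exact code; "flags" stands for the immu_flags argument):
 *
 *	domain = immu_devi_domain(rdip, &ddip);	// cached on the devinfo?
 *	if (domain == NULL) {
 *		// walk ancestors via get_branch_domain() to find the
 *		// domain-dip and possibly an existing domain
 *		if (domain == NULL)
 *			domain = domain_create(immu, ddip, rdip, flags);
 *	}
 *	set_domain(ddip, ddip, domain);
 *	set_domain(rdip, ddip, domain);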
1169 */ 1170 static domain_t * 1171 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags) 1172 { 1173 dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */ 1174 immu_t *immu; 1175 domain_t *domain; 1176 dvma_arg_t dvarg = {0}; 1177 int level; 1178 1179 *ddipp = NULL; 1180 1181 /* 1182 * Check if the domain is already set. This is usually true 1183 * if this is not the first DVMA transaction. 1184 */ 1185 ddip = NULL; 1186 domain = immu_devi_domain(rdip, &ddip); 1187 if (domain) { 1188 *ddipp = ddip; 1189 return (domain); 1190 } 1191 1192 immu = immu_dvma_get_immu(rdip, immu_flags); 1193 if (immu == NULL) { 1194 /* 1195 * possible that there is no IOMMU unit for this device 1196 * - BIOS bugs are one example. 1197 */ 1198 ddi_err(DER_WARN, rdip, "No iommu unit found for device"); 1199 return (NULL); 1200 } 1201 1202 immu_flags |= immu_devi_get(rdip)->imd_dvma_flags; 1203 1204 dvarg.dva_rdip = rdip; 1205 dvarg.dva_ddip = NULL; 1206 dvarg.dva_domain = NULL; 1207 dvarg.dva_flags = immu_flags; 1208 level = 0; 1209 if (immu_walk_ancestor(rdip, NULL, get_branch_domain, 1210 &dvarg, &level, immu_flags) != DDI_SUCCESS) { 1211 /* 1212 * maybe low memory. return error, 1213 * so driver tries again later 1214 */ 1215 return (NULL); 1216 } 1217 1218 /* should have walked at least 1 dip (i.e. edip) */ 1219 ASSERT(level > 0); 1220 1221 ddip = dvarg.dva_ddip; /* must be present */ 1222 domain = dvarg.dva_domain; /* may be NULL */ 1223 1224 /* 1225 * We may find the domain during our ancestor walk on any one of our 1226 * ancestor dips, If the domain is found then the domain-dip 1227 * (i.e. ddip) will also be found in the same immu_devi struct. 1228 * The domain-dip is the highest ancestor dip which shares the 1229 * same domain with edip. 1230 * The domain may or may not be found, but the domain dip must 1231 * be found. 1232 */ 1233 if (ddip == NULL) { 1234 ddi_err(DER_MODE, rdip, "Cannot find domain dip for device."); 1235 return (NULL); 1236 } 1237 1238 /* 1239 * Did we find a domain ? 1240 */ 1241 if (domain) { 1242 goto found; 1243 } 1244 1245 /* nope, so allocate */ 1246 domain = domain_create(immu, ddip, rdip, immu_flags); 1247 if (domain == NULL) { 1248 return (NULL); 1249 } 1250 1251 /*FALLTHROUGH*/ 1252 found: 1253 /* 1254 * We know *domain *is* the right domain, so panic if 1255 * another domain is set for either the request-dip or 1256 * effective dip. 1257 */ 1258 set_domain(ddip, ddip, domain); 1259 set_domain(rdip, ddip, domain); 1260 1261 *ddipp = ddip; 1262 return (domain); 1263 } 1264 1265 static void 1266 create_unity_domain(immu_t *immu) 1267 { 1268 domain_t *domain; 1269 1270 /* domain created during boot and always use sleep flag */ 1271 domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP); 1272 1273 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL); 1274 1275 domain->dom_did = IMMU_UNITY_DID; 1276 domain->dom_maptype = IMMU_MAPTYPE_UNITY; 1277 1278 domain->dom_immu = immu; 1279 immu->immu_unity_domain = domain; 1280 1281 /* 1282 * Setup the domain's initial page table 1283 * should never fail. 1284 */ 1285 domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1286 pgtable_zero(domain->dom_pgtable_root); 1287 1288 /* 1289 * Only map all physical memory in to the unity domain 1290 * if passthrough is not supported. If it is supported, 1291 * passthrough is set in the context entry instead. 
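 * (TTYPE_PASSTHRU is programmed in context_set()/context_create(), so
 * the hardware never walks the unity domain's page table and there is
 * no point populating it.)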
1292 */ 1293 if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1294 map_unity_domain(domain); 1295 1296 1297 /* 1298 * put it on the system-wide UNITY domain list 1299 */ 1300 mutex_enter(&(immu_domain_lock)); 1301 list_insert_tail(&immu_unity_domain_list, domain); 1302 mutex_exit(&(immu_domain_lock)); 1303 } 1304 1305 /* 1306 * ddip is the domain-dip - the topmost dip in a domain 1307 * rdip is the requesting-dip - the device which is 1308 * requesting DVMA setup 1309 * if domain is a non-shared domain rdip == ddip 1310 */ 1311 static domain_t * 1312 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip, 1313 immu_flags_t immu_flags) 1314 { 1315 int kmflags; 1316 domain_t *domain; 1317 char mod_hash_name[128]; 1318 immu_devi_t *immu_devi; 1319 int did; 1320 immu_dcookie_t dcookies[1] = {0}; 1321 int dcount = 0; 1322 1323 immu_devi = immu_devi_get(rdip); 1324 1325 /* 1326 * First allocate a domainid. 1327 * This routine will never fail, since if we run out 1328 * of domains the unity domain will be allocated. 1329 */ 1330 did = did_alloc(immu, rdip, ddip, immu_flags); 1331 if (did == IMMU_UNITY_DID) { 1332 /* domain overflow */ 1333 ASSERT(immu->immu_unity_domain); 1334 return (immu->immu_unity_domain); 1335 } 1336 1337 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 1338 domain = kmem_zalloc(sizeof (domain_t), kmflags); 1339 if (domain == NULL) { 1340 ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain " 1341 "structure for device. IOMMU unit: %s", immu->immu_name); 1342 /*NOTREACHED*/ 1343 } 1344 1345 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL); 1346 1347 (void) snprintf(mod_hash_name, sizeof (mod_hash_name), 1348 "immu%s-domain%d-pava-hash", immu->immu_name, did); 1349 1350 domain->dom_did = did; 1351 domain->dom_immu = immu; 1352 domain->dom_maptype = IMMU_MAPTYPE_XLATE; 1353 domain->dom_dip = ddip; 1354 1355 /* 1356 * Create xlate DVMA arena for this domain. 1357 */ 1358 create_xlate_arena(immu, domain, rdip, immu_flags); 1359 1360 /* 1361 * Setup the domain's initial page table 1362 */ 1363 domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags); 1364 if (domain->dom_pgtable_root == NULL) { 1365 ddi_err(DER_PANIC, rdip, "Failed to alloc root " 1366 "pgtable for domain (%d). IOMMU unit: %s", 1367 domain->dom_did, immu->immu_name); 1368 /*NOTREACHED*/ 1369 } 1370 pgtable_zero(domain->dom_pgtable_root); 1371 1372 /* 1373 * Since this is a immu unit-specific domain, put it on 1374 * the per-immu domain list. 1375 */ 1376 mutex_enter(&(immu->immu_lock)); 1377 list_insert_head(&immu->immu_domain_list, domain); 1378 mutex_exit(&(immu->immu_lock)); 1379 1380 /* 1381 * Also put it on the system-wide xlate domain list 1382 */ 1383 mutex_enter(&(immu_domain_lock)); 1384 list_insert_head(&immu_xlate_domain_list, domain); 1385 mutex_exit(&(immu_domain_lock)); 1386 1387 bdf_domain_insert(immu_devi, domain); 1388 1389 #ifdef BUGGY_DRIVERS 1390 /* 1391 * Map page0. Some broken HW/FW access it. 1392 */ 1393 dcookies[0].dck_paddr = 0; 1394 dcookies[0].dck_npages = 1; 1395 dcount = 1; 1396 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, 1397 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); 1398 #endif 1399 return (domain); 1400 } 1401 1402 /* 1403 * Create domainid arena. 1404 * Domainid 0 is reserved by Vt-d spec and cannot be used by 1405 * system software. 
1406 * Domainid 1 is reserved by solaris and used for *all* of the following: 1407 * as the "uninitialized" domain - For devices not yet controlled 1408 * by Solaris 1409 * as the "unity" domain - For devices that will always belong 1410 * to the unity domain 1411 * as the "overflow" domain - Used for any new device after we 1412 * run out of domains 1413 * All of the above domains map into a single domain with 1414 * domainid 1 and UNITY DVMA mapping 1415 * Each IMMU unity has its own unity/uninit/overflow domain 1416 */ 1417 static void 1418 did_init(immu_t *immu) 1419 { 1420 (void) snprintf(immu->immu_did_arena_name, 1421 sizeof (immu->immu_did_arena_name), 1422 "%s_domainid_arena", immu->immu_name); 1423 1424 ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s", 1425 immu->immu_did_arena_name); 1426 1427 immu->immu_did_arena = vmem_create( 1428 immu->immu_did_arena_name, 1429 (void *)(uintptr_t)(IMMU_UNITY_DID + 1), /* start addr */ 1430 immu->immu_max_domains - IMMU_UNITY_DID, 1431 1, /* quantum */ 1432 NULL, /* afunc */ 1433 NULL, /* ffunc */ 1434 NULL, /* source */ 1435 0, /* qcache_max */ 1436 VM_SLEEP); 1437 1438 /* Even with SLEEP flag, vmem_create() can fail */ 1439 if (immu->immu_did_arena == NULL) { 1440 ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel " 1441 "IOMMU domainid allocator: %s", immu->immu_name, 1442 immu->immu_did_arena_name); 1443 } 1444 } 1445 1446 /* ######################### CONTEXT CODE ################################# */ 1447 1448 static void 1449 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table, 1450 int bus, int devfunc) 1451 { 1452 pgtable_t *context; 1453 pgtable_t *pgtable_root; 1454 hw_rce_t *hw_rent; 1455 hw_rce_t *hw_cent; 1456 hw_rce_t *ctxp; 1457 int sid; 1458 krw_t rwtype; 1459 boolean_t fill_root; 1460 boolean_t fill_ctx; 1461 1462 pgtable_root = domain->dom_pgtable_root; 1463 1464 ctxp = (hw_rce_t *)(root_table->swpg_next_array); 1465 context = *(pgtable_t **)(ctxp + bus); 1466 hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus; 1467 1468 fill_root = B_FALSE; 1469 fill_ctx = B_FALSE; 1470 1471 /* Check the most common case first with reader lock */ 1472 rw_enter(&(immu->immu_ctx_rwlock), RW_READER); 1473 rwtype = RW_READER; 1474 again: 1475 if (ROOT_GET_P(hw_rent)) { 1476 hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc; 1477 if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) { 1478 rw_exit(&(immu->immu_ctx_rwlock)); 1479 return; 1480 } else { 1481 fill_ctx = B_TRUE; 1482 } 1483 } else { 1484 fill_root = B_TRUE; 1485 fill_ctx = B_TRUE; 1486 } 1487 1488 if (rwtype == RW_READER && 1489 rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) { 1490 rw_exit(&(immu->immu_ctx_rwlock)); 1491 rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); 1492 rwtype = RW_WRITER; 1493 goto again; 1494 } 1495 rwtype = RW_WRITER; 1496 1497 if (fill_root == B_TRUE) { 1498 ROOT_SET_CONT(hw_rent, context->hwpg_paddr); 1499 ROOT_SET_P(hw_rent); 1500 immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t)); 1501 } 1502 1503 if (fill_ctx == B_TRUE) { 1504 hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc; 1505 /* need to disable context entry before reprogramming it */ 1506 bzero(hw_cent, sizeof (hw_rce_t)); 1507 1508 /* flush caches */ 1509 immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t)); 1510 1511 sid = ((bus << 8) | devfunc); 1512 immu_flush_context_fsi(immu, 0, sid, domain->dom_did, 1513 &immu->immu_ctx_inv_wait); 1514 1515 CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED); 1516 CONT_SET_DID(hw_cent, domain->dom_did); 1517 
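		/* program address width, page-table root and translation type */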
CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); 1518 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); 1519 if (domain->dom_did == IMMU_UNITY_DID && 1520 IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1521 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); 1522 else 1523 /*LINTED*/ 1524 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); 1525 CONT_SET_P(hw_cent); 1526 if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) { 1527 CONT_SET_EH(hw_cent); 1528 if (immu_use_alh) 1529 CONT_SET_ALH(hw_cent); 1530 } 1531 immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t)); 1532 } 1533 rw_exit(&(immu->immu_ctx_rwlock)); 1534 } 1535 1536 static pgtable_t * 1537 context_create(immu_t *immu) 1538 { 1539 int bus; 1540 int devfunc; 1541 pgtable_t *root_table; 1542 pgtable_t *context; 1543 pgtable_t *pgtable_root; 1544 hw_rce_t *ctxp; 1545 hw_rce_t *hw_rent; 1546 hw_rce_t *hw_cent; 1547 1548 /* Allocate a zeroed root table (4K 256b entries) */ 1549 root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1550 pgtable_zero(root_table); 1551 1552 /* 1553 * Setup context tables for all possible root table entries. 1554 * Start out with unity domains for all entries. 1555 */ 1556 ctxp = (hw_rce_t *)(root_table->swpg_next_array); 1557 hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr); 1558 for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) { 1559 context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); 1560 pgtable_zero(context); 1561 ROOT_SET_P(hw_rent); 1562 ROOT_SET_CONT(hw_rent, context->hwpg_paddr); 1563 hw_cent = (hw_rce_t *)(context->hwpg_vaddr); 1564 for (devfunc = 0; devfunc < IMMU_CONT_NUM; 1565 devfunc++, hw_cent++) { 1566 pgtable_root = 1567 immu->immu_unity_domain->dom_pgtable_root; 1568 CONT_SET_DID(hw_cent, 1569 immu->immu_unity_domain->dom_did); 1570 CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); 1571 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); 1572 if (IMMU_ECAP_GET_PT(immu->immu_regs_excap)) 1573 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); 1574 else 1575 /*LINTED*/ 1576 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); 1577 CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED); 1578 CONT_SET_P(hw_cent); 1579 } 1580 immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE); 1581 *((pgtable_t **)ctxp) = context; 1582 } 1583 1584 return (root_table); 1585 } 1586 1587 /* 1588 * Called during rootnex attach, so no locks needed 1589 */ 1590 static void 1591 context_init(immu_t *immu) 1592 { 1593 rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL); 1594 1595 immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE); 1596 1597 immu_regs_wbf_flush(immu); 1598 1599 immu->immu_ctx_root = context_create(immu); 1600 1601 immu_regs_set_root_table(immu); 1602 1603 rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); 1604 immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait); 1605 immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait); 1606 rw_exit(&(immu->immu_ctx_rwlock)); 1607 } 1608 1609 1610 /* 1611 * Find top pcib 1612 */ 1613 static int 1614 find_top_pcib(dev_info_t *dip, void *arg) 1615 { 1616 immu_devi_t *immu_devi; 1617 dev_info_t **pcibdipp = (dev_info_t **)arg; 1618 1619 immu_devi = immu_devi_get(dip); 1620 1621 if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) { 1622 *pcibdipp = dip; 1623 } 1624 1625 return (DDI_WALK_CONTINUE); 1626 } 1627 1628 static int 1629 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, 1630 dev_info_t *rdip, immu_flags_t immu_flags) 1631 { 1632 immu_devi_t *r_immu_devi; 1633 immu_devi_t *d_immu_devi; 1634 int r_bus; 1635 int d_bus; 1636 int r_devfunc; 1637 int d_devfunc; 1638 immu_pcib_t 
d_pcib_type; 1639 dev_info_t *pcibdip; 1640 1641 if (ddip == NULL || rdip == NULL || 1642 ddip == root_devinfo || rdip == root_devinfo) { 1643 ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or " 1644 "request-dip are NULL or are root devinfo"); 1645 return (DDI_FAILURE); 1646 } 1647 1648 /* 1649 * We need to set the context fields 1650 * based on what type of device rdip and ddip are. 1651 * To do that we need the immu_devi field. 1652 * Set the immu_devi field (if not already set) 1653 */ 1654 if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) { 1655 ddi_err(DER_MODE, rdip, 1656 "immu_context_update: failed to set immu_devi for ddip"); 1657 return (DDI_FAILURE); 1658 } 1659 1660 if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) { 1661 ddi_err(DER_MODE, rdip, 1662 "immu_context_update: failed to set immu_devi for rdip"); 1663 return (DDI_FAILURE); 1664 } 1665 1666 d_immu_devi = immu_devi_get(ddip); 1667 r_immu_devi = immu_devi_get(rdip); 1668 1669 d_bus = d_immu_devi->imd_bus; 1670 d_devfunc = d_immu_devi->imd_devfunc; 1671 d_pcib_type = d_immu_devi->imd_pcib_type; 1672 r_bus = r_immu_devi->imd_bus; 1673 r_devfunc = r_immu_devi->imd_devfunc; 1674 1675 if (rdip == ddip) { 1676 /* rdip is a PCIE device. set context for it only */ 1677 context_set(immu, domain, immu->immu_ctx_root, r_bus, 1678 r_devfunc); 1679 #ifdef BUGGY_DRIVERS 1680 } else if (r_immu_devi == d_immu_devi) { 1681 #ifdef TEST 1682 ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and " 1683 "0x%lx are identical", rdip, ddip); 1684 #endif 1685 /* rdip is a PCIE device. set context for it only */ 1686 context_set(immu, domain, immu->immu_ctx_root, r_bus, 1687 r_devfunc); 1688 #endif 1689 } else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) { 1690 /* 1691 * ddip is a PCIE_PCI bridge. Set context for ddip's 1692 * secondary bus. If rdip is on ddip's secondary 1693 * bus, set context for rdip. Else, set context 1694 * for rdip's PCI bridge on ddip's secondary bus. 1695 */ 1696 context_set(immu, domain, immu->immu_ctx_root, 1697 d_immu_devi->imd_sec, 0); 1698 if (d_immu_devi->imd_sec == r_bus) { 1699 context_set(immu, domain, immu->immu_ctx_root, 1700 r_bus, r_devfunc); 1701 } else { 1702 pcibdip = NULL; 1703 if (immu_walk_ancestor(rdip, ddip, find_top_pcib, 1704 &pcibdip, NULL, immu_flags) == DDI_SUCCESS && 1705 pcibdip != NULL) { 1706 r_immu_devi = immu_devi_get(pcibdip); 1707 r_bus = r_immu_devi->imd_bus; 1708 r_devfunc = r_immu_devi->imd_devfunc; 1709 context_set(immu, domain, immu->immu_ctx_root, 1710 r_bus, r_devfunc); 1711 } else { 1712 ddi_err(DER_PANIC, rdip, "Failed to find PCI " 1713 " bridge for PCI device"); 1714 /*NOTREACHED*/ 1715 } 1716 } 1717 } else if (d_pcib_type == IMMU_PCIB_PCI_PCI) { 1718 context_set(immu, domain, immu->immu_ctx_root, d_bus, 1719 d_devfunc); 1720 } else if (d_pcib_type == IMMU_PCIB_ENDPOINT) { 1721 /* 1722 * ddip is a PCIE device which has a non-PCI device under it 1723 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata 1724 */ 1725 context_set(immu, domain, immu->immu_ctx_root, d_bus, 1726 d_devfunc); 1727 } else { 1728 ddi_err(DER_PANIC, rdip, "unknown device type. 
Cannot " 1729 "set iommu context."); 1730 /*NOTREACHED*/ 1731 } 1732 1733 /* XXX do we need a membar_producer() here */ 1734 return (DDI_SUCCESS); 1735 } 1736 1737 /* ##################### END CONTEXT CODE ################################## */ 1738 /* ##################### MAPPING CODE ################################## */ 1739 1740 1741 #ifdef DEBUG 1742 static boolean_t 1743 PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr, 1744 dev_info_t *rdip, immu_flags_t immu_flags) 1745 { 1746 /* The PDTE must be set i.e. present bit is set */ 1747 if (!PDTE_P(pdte)) { 1748 ddi_err(DER_MODE, rdip, "No present flag"); 1749 return (B_FALSE); 1750 } 1751 1752 /* 1753 * Just assert to check most significant system software field 1754 * (PDTE_SW4) as it is same as present bit and we 1755 * checked that above 1756 */ 1757 ASSERT(PDTE_SW4(pdte)); 1758 1759 /* 1760 * TM field should be clear if not reserved. 1761 * non-leaf is always reserved 1762 */ 1763 if (next == NULL && immu->immu_TM_reserved == B_FALSE) { 1764 if (PDTE_TM(pdte)) { 1765 ddi_err(DER_MODE, rdip, "TM flag set"); 1766 return (B_FALSE); 1767 } 1768 } 1769 1770 /* 1771 * The SW3 field is not used and must be clear 1772 */ 1773 if (PDTE_SW3(pdte)) { 1774 ddi_err(DER_MODE, rdip, "SW3 set"); 1775 return (B_FALSE); 1776 } 1777 1778 /* 1779 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set 1780 */ 1781 if (next == NULL) { 1782 ASSERT(paddr % IMMU_PAGESIZE == 0); 1783 if (PDTE_PADDR(pdte) != paddr) { 1784 ddi_err(DER_MODE, rdip, 1785 "PTE paddr mismatch: %lx != %lx", 1786 PDTE_PADDR(pdte), paddr); 1787 return (B_FALSE); 1788 } 1789 } else { 1790 if (PDTE_PADDR(pdte) != next->hwpg_paddr) { 1791 ddi_err(DER_MODE, rdip, 1792 "PDE paddr mismatch: %lx != %lx", 1793 PDTE_PADDR(pdte), next->hwpg_paddr); 1794 return (B_FALSE); 1795 } 1796 } 1797 1798 /* 1799 * SNP field should be clear if not reserved. 
1800 * non-leaf is always reserved 1801 */ 1802 if (next == NULL && immu->immu_SNP_reserved == B_FALSE) { 1803 if (PDTE_SNP(pdte)) { 1804 ddi_err(DER_MODE, rdip, "SNP set"); 1805 return (B_FALSE); 1806 } 1807 } 1808 1809 /* second field available for system software should be clear */ 1810 if (PDTE_SW2(pdte)) { 1811 ddi_err(DER_MODE, rdip, "SW2 set"); 1812 return (B_FALSE); 1813 } 1814 1815 /* Super pages field should be clear */ 1816 if (PDTE_SP(pdte)) { 1817 ddi_err(DER_MODE, rdip, "SP set"); 1818 return (B_FALSE); 1819 } 1820 1821 /* 1822 * least significant field available for 1823 * system software should be clear 1824 */ 1825 if (PDTE_SW1(pdte)) { 1826 ddi_err(DER_MODE, rdip, "SW1 set"); 1827 return (B_FALSE); 1828 } 1829 1830 if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) { 1831 ddi_err(DER_MODE, rdip, "READ not set"); 1832 return (B_FALSE); 1833 } 1834 1835 if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) { 1836 ddi_err(DER_MODE, rdip, "WRITE not set"); 1837 return (B_FALSE); 1838 } 1839 1840 return (B_TRUE); 1841 } 1842 #endif 1843 1844 /*ARGSUSED*/ 1845 static void 1846 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate, 1847 uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip) 1848 { 1849 uint64_t npages; 1850 uint64_t dvma; 1851 pgtable_t *pgtable; 1852 hw_pdte_t *hwp; 1853 hw_pdte_t *shwp; 1854 int idx; 1855 1856 pgtable = xlate->xlt_pgtable; 1857 idx = xlate->xlt_idx; 1858 1859 dvma = *dvma_ptr; 1860 npages = *npages_ptr; 1861 1862 /* 1863 * since a caller gets a unique dvma for a physical address, 1864 * no other concurrent thread will be writing to the same 1865 * PTE even if it has the same paddr. So no locks needed. 1866 */ 1867 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 1868 1869 hwp = shwp; 1870 for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { 1871 PDTE_CLEAR_P(*hwp); 1872 dvma += IMMU_PAGESIZE; 1873 npages--; 1874 } 1875 1876 *dvma_ptr = dvma; 1877 *npages_ptr = npages; 1878 1879 xlate->xlt_idx = idx; 1880 } 1881 1882 static void 1883 xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels) 1884 { 1885 int level; 1886 uint64_t offbits; 1887 1888 /* 1889 * Skip the first 12 bits which is the offset into 1890 * 4K PFN (phys page frame based on IMMU_PAGESIZE) 1891 */ 1892 offbits = dvma >> IMMU_PAGESHIFT; 1893 1894 /* skip to level 1 i.e. leaf PTE */ 1895 for (level = 1, xlate++; level <= nlevels; level++, xlate++) { 1896 xlate->xlt_level = level; 1897 xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK); 1898 ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX); 1899 xlate->xlt_pgtable = NULL; 1900 offbits >>= IMMU_PGTABLE_LEVEL_STRIDE; 1901 } 1902 } 1903 1904 /* 1905 * Read the pgtables 1906 */ 1907 static boolean_t 1908 PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels) 1909 { 1910 pgtable_t *pgtable; 1911 pgtable_t *next; 1912 uint_t idx; 1913 1914 /* start with highest level pgtable i.e. root */ 1915 xlate += nlevels; 1916 1917 if (xlate->xlt_pgtable == NULL) { 1918 xlate->xlt_pgtable = domain->dom_pgtable_root; 1919 } 1920 1921 for (; xlate->xlt_level > 1; xlate--) { 1922 idx = xlate->xlt_idx; 1923 pgtable = xlate->xlt_pgtable; 1924 1925 if ((xlate - 1)->xlt_pgtable) { 1926 continue; 1927 } 1928 1929 /* Lock the pgtable in read mode */ 1930 rw_enter(&(pgtable->swpg_rwlock), RW_READER); 1931 1932 /* 1933 * since we are unmapping, the pgtable should 1934 * already point to a leafier pgtable. 
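 * If the next-level pgtable is missing, nothing is mapped under this
 * entry; return B_FALSE so the caller can report the address as
 * unmapped.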
1935 */ 1936 next = *(pgtable->swpg_next_array + idx); 1937 (xlate - 1)->xlt_pgtable = next; 1938 rw_exit(&(pgtable->swpg_rwlock)); 1939 if (next == NULL) 1940 return (B_FALSE); 1941 } 1942 1943 return (B_TRUE); 1944 } 1945 1946 static void 1947 immu_fault_walk(void *arg, void *base, size_t len) 1948 { 1949 uint64_t dvma, start; 1950 1951 dvma = *(uint64_t *)arg; 1952 start = (uint64_t)(uintptr_t)base; 1953 1954 if (dvma >= start && dvma < (start + len)) { 1955 ddi_err(DER_WARN, NULL, 1956 "faulting DVMA address is in vmem arena " 1957 "(%" PRIx64 "-%" PRIx64 ")", 1958 start, start + len); 1959 *(uint64_t *)arg = ~0ULL; 1960 } 1961 } 1962 1963 void 1964 immu_print_fault_info(uint_t sid, uint64_t dvma) 1965 { 1966 int nlevels; 1967 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 1968 xlate_t *xlatep; 1969 hw_pdte_t pte; 1970 domain_t *domain; 1971 immu_t *immu; 1972 uint64_t dvma_arg; 1973 1974 if (mod_hash_find(bdf_domain_hash, 1975 (void *)(uintptr_t)sid, (void *)&domain) != 0) { 1976 ddi_err(DER_WARN, NULL, 1977 "no domain for faulting SID %08x", sid); 1978 return; 1979 } 1980 1981 immu = domain->dom_immu; 1982 1983 dvma_arg = dvma; 1984 vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk, 1985 (void *)&dvma_arg); 1986 if (dvma_arg != ~0ULL) 1987 ddi_err(DER_WARN, domain->dom_dip, 1988 "faulting DVMA address is not in vmem arena"); 1989 1990 nlevels = immu->immu_dvma_nlevels; 1991 xlate_setup(dvma, xlate, nlevels); 1992 1993 if (!PDE_lookup(domain, xlate, nlevels)) { 1994 ddi_err(DER_WARN, domain->dom_dip, 1995 "pte not found in domid %d for faulting addr %" PRIx64, 1996 domain->dom_did, dvma); 1997 return; 1998 } 1999 2000 xlatep = &xlate[1]; 2001 pte = *((hw_pdte_t *) 2002 (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx); 2003 2004 ddi_err(DER_WARN, domain->dom_dip, 2005 "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did, 2006 (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte)); 2007 } 2008 2009 /*ARGSUSED*/ 2010 static void 2011 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr, 2012 dev_info_t *rdip, immu_flags_t immu_flags) 2013 { 2014 hw_pdte_t pte; 2015 2016 #ifndef DEBUG 2017 pte = immu->immu_ptemask; 2018 PDTE_SET_PADDR(pte, paddr); 2019 #else 2020 pte = *hwp; 2021 2022 if (PDTE_P(pte)) { 2023 if (PDTE_PADDR(pte) != paddr) { 2024 ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx", 2025 PDTE_PADDR(pte), paddr); 2026 } 2027 #ifdef BUGGY_DRIVERS 2028 return; 2029 #else 2030 goto out; 2031 #endif 2032 } 2033 2034 /* clear TM field if not reserved */ 2035 if (immu->immu_TM_reserved == B_FALSE) { 2036 PDTE_CLEAR_TM(pte); 2037 } 2038 2039 /* Clear 3rd field for system software - not used */ 2040 PDTE_CLEAR_SW3(pte); 2041 2042 /* Set paddr */ 2043 ASSERT(paddr % IMMU_PAGESIZE == 0); 2044 PDTE_CLEAR_PADDR(pte); 2045 PDTE_SET_PADDR(pte, paddr); 2046 2047 /* clear SNP field if not reserved. */ 2048 if (immu->immu_SNP_reserved == B_FALSE) { 2049 PDTE_CLEAR_SNP(pte); 2050 } 2051 2052 /* Clear SW2 field available for software */ 2053 PDTE_CLEAR_SW2(pte); 2054 2055 2056 /* SP is don't care for PTEs. Clear it for cleanliness */ 2057 PDTE_CLEAR_SP(pte); 2058 2059 /* Clear SW1 field available for software */ 2060 PDTE_CLEAR_SW1(pte); 2061 2062 /* 2063 * Now that we are done writing the PTE 2064 * set the "present" flag. Note this present 2065 * flag is a bit in the PDE/PTE that the 2066 * spec says is available for system software. 2067 * This is an implementation detail of Solaris 2068 * bare-metal Intel IOMMU. 
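 * (Hardware instead treats an entry with neither Read nor Write
 * permission as not present.)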
2069 * The present field in a PDE/PTE is not defined 2070 * by the Vt-d spec 2071 */ 2072 2073 PDTE_SET_P(pte); 2074 2075 pte |= immu->immu_ptemask; 2076 2077 out: 2078 #endif /* DEBUG */ 2079 #ifdef BUGGY_DRIVERS 2080 PDTE_SET_READ(pte); 2081 PDTE_SET_WRITE(pte); 2082 #else 2083 if (immu_flags & IMMU_FLAGS_READ) 2084 PDTE_SET_READ(pte); 2085 if (immu_flags & IMMU_FLAGS_WRITE) 2086 PDTE_SET_WRITE(pte); 2087 #endif /* BUGGY_DRIVERS */ 2088 2089 *hwp = pte; 2090 } 2091 2092 /*ARGSUSED*/ 2093 static void 2094 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, 2095 uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies, 2096 int dcount, dev_info_t *rdip, immu_flags_t immu_flags) 2097 { 2098 paddr_t paddr; 2099 uint64_t nvpages; 2100 uint64_t nppages; 2101 uint64_t dvma; 2102 pgtable_t *pgtable; 2103 hw_pdte_t *hwp; 2104 hw_pdte_t *shwp; 2105 int idx, nset; 2106 int j; 2107 2108 pgtable = xlate->xlt_pgtable; 2109 idx = xlate->xlt_idx; 2110 2111 dvma = *dvma_ptr; 2112 nvpages = *nvpages_ptr; 2113 2114 /* 2115 * since a caller gets a unique dvma for a physical address, 2116 * no other concurrent thread will be writing to the same 2117 * PTE even if it has the same paddr. So no locks needed. 2118 */ 2119 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 2120 2121 hwp = shwp; 2122 for (j = dcount - 1; j >= 0; j--) { 2123 if (nvpages <= dcookies[j].dck_npages) 2124 break; 2125 nvpages -= dcookies[j].dck_npages; 2126 } 2127 2128 nppages = nvpages; 2129 paddr = dcookies[j].dck_paddr + 2130 (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE; 2131 2132 nvpages = *nvpages_ptr; 2133 nset = 0; 2134 for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { 2135 PTE_set_one(immu, hwp, paddr, rdip, immu_flags); 2136 nset++; 2137 2138 ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags) 2139 == B_TRUE); 2140 nppages--; 2141 nvpages--; 2142 paddr += IMMU_PAGESIZE; 2143 dvma += IMMU_PAGESIZE; 2144 2145 if (nppages == 0) { 2146 j++; 2147 } 2148 2149 if (j == dcount) 2150 break; 2151 2152 if (nppages == 0) { 2153 nppages = dcookies[j].dck_npages; 2154 paddr = dcookies[j].dck_paddr; 2155 } 2156 } 2157 2158 if (nvpages) { 2159 *dvma_ptr = dvma; 2160 *nvpages_ptr = nvpages; 2161 } else { 2162 *dvma_ptr = 0; 2163 *nvpages_ptr = 0; 2164 } 2165 2166 xlate->xlt_idx = idx; 2167 } 2168 2169 /*ARGSUSED*/ 2170 static void 2171 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next, 2172 dev_info_t *rdip, immu_flags_t immu_flags) 2173 { 2174 hw_pdte_t pde; 2175 2176 pde = *hwp; 2177 2178 /* if PDE is already set, make sure it is correct */ 2179 if (PDTE_P(pde)) { 2180 ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr); 2181 #ifdef BUGGY_DRIVERS 2182 return; 2183 #else 2184 goto out; 2185 #endif 2186 } 2187 2188 /* Dont touch SW4, it is the present bit */ 2189 2190 /* don't touch TM field it is reserved for PDEs */ 2191 2192 /* 3rd field available for system software is not used */ 2193 PDTE_CLEAR_SW3(pde); 2194 2195 /* Set next level pgtable-paddr for PDE */ 2196 PDTE_CLEAR_PADDR(pde); 2197 PDTE_SET_PADDR(pde, next->hwpg_paddr); 2198 2199 /* don't touch SNP field it is reserved for PDEs */ 2200 2201 /* Clear second field available for system software */ 2202 PDTE_CLEAR_SW2(pde); 2203 2204 /* No super pages for PDEs */ 2205 PDTE_CLEAR_SP(pde); 2206 2207 /* Clear SW1 for software */ 2208 PDTE_CLEAR_SW1(pde); 2209 2210 /* 2211 * Now that we are done writing the PDE 2212 * set the "present" flag. 
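(The entry is assembled in the local pde variable and only reaches the pgtable in the single *hwp = pde store below, so hardware never observes a partially constructed PDE.)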
Note this present 2213 * flag is a bit in the PDE/PTE that the 2214 * spec says is available for system software. 2215 * This is an implementation detail of Solaris 2216 * bare-metal Intel IOMMU. 2217 * The present field in a PDE/PTE is not defined 2218 * by the Vt-d spec 2219 */ 2220 2221 out: 2222 #ifdef BUGGY_DRIVERS 2223 PDTE_SET_READ(pde); 2224 PDTE_SET_WRITE(pde); 2225 #else 2226 if (immu_flags & IMMU_FLAGS_READ) 2227 PDTE_SET_READ(pde); 2228 if (immu_flags & IMMU_FLAGS_WRITE) 2229 PDTE_SET_WRITE(pde); 2230 #endif 2231 2232 PDTE_SET_P(pde); 2233 2234 *hwp = pde; 2235 } 2236
2237 /* 2238 * Used to set PDEs 2239 */ 2240 static boolean_t 2241 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels, 2242 dev_info_t *rdip, immu_flags_t immu_flags) 2243 { 2244 pgtable_t *pgtable; 2245 pgtable_t *new; 2246 pgtable_t *next; 2247 hw_pdte_t *hwp; 2248 int level; 2249 uint_t idx; 2250 krw_t rwtype; 2251 boolean_t set = B_FALSE; 2252 2253 /* start with highest level pgtable i.e. root */ 2254 xlate += nlevels; 2255 2256 new = NULL; 2257 xlate->xlt_pgtable = domain->dom_pgtable_root; 2258 for (level = nlevels; level > 1; level--, xlate--) { 2259 idx = xlate->xlt_idx; 2260 pgtable = xlate->xlt_pgtable; 2261 2262 /* Lock the pgtable in READ mode first */ 2263 rw_enter(&(pgtable->swpg_rwlock), RW_READER); 2264 rwtype = RW_READER; 2265 again: 2266 hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; 2267 next = (pgtable->swpg_next_array)[idx]; 2268
2269 /* 2270 * check if the leafier level already has a pgtable; 2271 * if yes, verify it 2272 */ 2273 if (next == NULL) { 2274 if (new == NULL) { 2275 2276 IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *, 2277 rdip, int, level); 2278 2279 new = pgtable_alloc(immu, immu_flags); 2280 if (new == NULL) { 2281 ddi_err(DER_PANIC, rdip, 2282 "pgtable alloc err"); 2283 } 2284 pgtable_zero(new); 2285 } 2286
2287 /* Change to a write lock */ 2288 if (rwtype == RW_READER && 2289 rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) { 2290 rw_exit(&(pgtable->swpg_rwlock)); 2291 rw_enter(&(pgtable->swpg_rwlock), RW_WRITER); 2292 rwtype = RW_WRITER; 2293 goto again; 2294 } 2295 rwtype = RW_WRITER; 2296 next = new; 2297 (pgtable->swpg_next_array)[idx] = next; 2298 new = NULL; 2299 PDE_set_one(immu, hwp, next, rdip, immu_flags); 2300 set = B_TRUE; 2301 rw_downgrade(&(pgtable->swpg_rwlock)); 2302 rwtype = RW_READER; 2303 } 2304 #ifndef BUGGY_DRIVERS 2305 else { 2306 hw_pdte_t pde = *hwp; 2307
2308 /* 2309 * If built for buggy drivers, READ+WRITE permission was 2310 * already set above, so there is nothing to do for that case. 2311 * XXX Check that the read/write perms actually change 2312 * before setting them. 
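As written this is a read-modify-write of the PDE while swpg_rwlock is held only as READER, so two threads mapping into the same domain with different flags could race on the permission bits.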
Also, the write lock should be held while updating them. 2313 */ 2314 if (immu_flags & IMMU_FLAGS_READ) 2315 PDTE_SET_READ(pde); 2316 if (immu_flags & IMMU_FLAGS_WRITE) 2317 PDTE_SET_WRITE(pde); 2318 2319 *hwp = pde; 2320 } 2321 #endif 2322 2323 ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags) 2324 == B_TRUE); 2325 2326 (xlate - 1)->xlt_pgtable = next; 2327 rw_exit(&(pgtable->swpg_rwlock)); 2328 } 2329 2330 if (new) { 2331 pgtable_free(immu, new); 2332 } 2333 2334 return (set); 2335 } 2336
2337 /* 2338 * dvma_map() 2339 * map a contiguous range of DVMA pages 2340 * 2341 * domain: domain to map the range in 2342 * sdvma: starting DVMA 2343 * snvpages: number of DVMA pages to map 2344 * dcookies: array of physically contiguous ranges to map 2345 * dcount: number of entries in dcookies 2346 * rdip: requesting device 2347 * immu_flags: flags 2348 */ 2349 static boolean_t 2350 dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages, 2351 immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, 2352 immu_flags_t immu_flags) 2353 { 2354 uint64_t dvma; 2355 uint64_t n; 2356 immu_t *immu = domain->dom_immu; 2357 int nlevels = immu->immu_dvma_nlevels; 2358 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 2359 boolean_t pde_set = B_FALSE; 2360 2361 n = snvpages; 2362 dvma = sdvma; 2363 2364 while (n > 0) { 2365 xlate_setup(dvma, xlate, nlevels); 2366 2367 /* Lookup or allocate PGDIRs and PGTABLEs if necessary */ 2368 if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags) 2369 == B_TRUE) { 2370 pde_set = B_TRUE; 2371 } 2372 2373 /* set all matching ptes that fit into this leaf pgtable */ 2374 PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies, 2375 dcount, rdip, immu_flags); 2376 } 2377 2378 return (pde_set); 2379 } 2380
2381 /* 2382 * dvma_unmap() 2383 * unmap a range of DVMAs 2384 * 2385 * domain: domain for requesting device 2386 * sdvma: starting DVMA 2387 * snpages: number of IMMU pages to be unmapped 2388 * rdip: requesting device 2389 * 2390 * Leaf PTEs are cleared; the IOTLB is not flushed here. 2391 */ 2392 static void 2393 dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages, 2394 dev_info_t *rdip) 2395 { 2396 immu_t *immu = domain->dom_immu; 2397 int nlevels = immu->immu_dvma_nlevels; 2398 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; 2399 uint64_t n; 2400 uint64_t dvma; 2401 2402 dvma = sdvma; 2403 n = snpages; 2404 2405 while (n > 0) { 2406 /* setup the xlate array */ 2407 xlate_setup(dvma, xlate, nlevels); 2408 2409 /* just look up existing pgtables; this should never fail */ 2410 if (!PDE_lookup(domain, xlate, nlevels)) 2411 ddi_err(DER_PANIC, rdip, 2412 "PTE not found for addr %" PRIx64, 2413 (unsigned long long)dvma); 2414 2415 /* clear all matching ptes that fit into this leaf pgtable */ 2416 PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip); 2417 } 2418 2419 /* No need to flush IOTLB after unmap */ 2420 } 2421
2422 static uint64_t 2423 dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf) 2424 { 2425 uint64_t dvma; 2426 size_t xsize, align; 2427 uint64_t minaddr, maxaddr; 2428 2429 /* parameters */ 2430 xsize = npages * IMMU_PAGESIZE; 2431 align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE); 2432 minaddr = dma_attr->dma_attr_addr_lo; 2433 maxaddr = dma_attr->dma_attr_addr_hi + 1; 2434 2435 /* handle the rollover cases */ 2436 if (maxaddr < dma_attr->dma_attr_addr_hi) { 2437 maxaddr = dma_attr->dma_attr_addr_hi; 2438 } 2439 2440 /* 2441 * allocate from vmem arena. 
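As an illustrative example (assuming the usual 4K IMMU_PAGESIZE), a request for npages == 8 with dma_attr_align == 0x10000 and dma_attr_addr_hi == 0xffffffff turns into vmem_xalloc(arena, 0x8000, 0x10000, 0, 0, minaddr, 0x100000000, kmf), i.e. an 8-page, 64K-aligned DVMA range below 4G.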
2442 */ 2443 dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena, 2444 xsize, align, 0, 0, (void *)(uintptr_t)minaddr, 2445 (void *)(uintptr_t)maxaddr, kmf); 2446 2447 return (dvma); 2448 } 2449 2450 static void 2451 dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr) 2452 { 2453 int nlevels; 2454 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp; 2455 uint64_t dvma, n; 2456 size_t xsize, align; 2457 uint64_t minaddr, maxaddr, dmamax; 2458 int on, npte, pindex; 2459 hw_pdte_t *shwp; 2460 immu_t *immu; 2461 domain_t *domain; 2462 2463 /* parameters */ 2464 domain = IMMU_DEVI(rdip)->imd_domain; 2465 immu = domain->dom_immu; 2466 nlevels = immu->immu_dvma_nlevels; 2467 xsize = IMMU_NPREPTES * IMMU_PAGESIZE; 2468 align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE); 2469 minaddr = dma_attr->dma_attr_addr_lo; 2470 if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG) 2471 dmamax = dma_attr->dma_attr_seg; 2472 else 2473 dmamax = dma_attr->dma_attr_addr_hi; 2474 maxaddr = dmamax + 1; 2475 2476 if (maxaddr < dmamax) 2477 maxaddr = dmamax; 2478 2479 dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena, 2480 xsize, align, 0, dma_attr->dma_attr_seg + 1, 2481 (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP); 2482 2483 ihp->ihp_predvma = dvma; 2484 ihp->ihp_npremapped = 0; 2485 if (dvma == 0) 2486 return; 2487 2488 n = IMMU_NPREPTES; 2489 pindex = 0; 2490 2491 /* 2492 * Set up a mapping at address 0, just so that all PDPs get allocated 2493 * now. Although this initial mapping should never be used, 2494 * explicitly set it to read-only, just to be safe. 2495 */ 2496 while (n > 0) { 2497 xlate_setup(dvma, xlate, nlevels); 2498 2499 (void) PDE_set_all(immu, domain, xlate, nlevels, rdip, 2500 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 2501 2502 xlp = &xlate[1]; 2503 shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr) 2504 + xlp->xlt_idx; 2505 on = n; 2506 2507 PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie, 2508 1, rdip, IMMU_FLAGS_READ); 2509 2510 npte = on - n; 2511 2512 while (npte > 0) { 2513 ihp->ihp_preptes[pindex++] = shwp; 2514 #ifdef BUGGY_DRIVERS 2515 PDTE_CLEAR_WRITE(*shwp); 2516 #endif 2517 shwp++; 2518 npte--; 2519 } 2520 } 2521 } 2522 2523 static void 2524 dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp) 2525 { 2526 domain_t *domain; 2527 2528 domain = IMMU_DEVI(rdip)->imd_domain; 2529 2530 if (ihp->ihp_predvma != 0) { 2531 dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip); 2532 vmem_free(domain->dom_dvma_arena, 2533 (void *)(uintptr_t)ihp->ihp_predvma, 2534 IMMU_NPREPTES * IMMU_PAGESIZE); 2535 } 2536 } 2537 2538 static void 2539 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages) 2540 { 2541 uint64_t size = npages * IMMU_PAGESIZE; 2542 2543 if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) 2544 return; 2545 2546 vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size); 2547 } 2548 2549 static int 2550 immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle, 2551 immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq, 2552 ddi_dma_obj_t *dma_out) 2553 { 2554 domain_t *domain; 2555 immu_t *immu; 2556 immu_flags_t immu_flags; 2557 ddi_dma_atyp_t buftype; 2558 ddi_dma_obj_t *dmar_object; 2559 ddi_dma_attr_t *attrp; 2560 uint64_t offset, paddr, dvma, sdvma, rwmask; 2561 size_t npages, npgalloc; 2562 uint_t psize, size, pcnt, dmax; 2563 page_t **pparray; 2564 caddr_t vaddr; 2565 page_t *page; 2566 struct as *vas; 2567 immu_dcookie_t *dcookies; 2568 int pde_set; 2569 2570 domain = 
IMMU_DEVI(rdip)->imd_domain; 2571 immu = domain->dom_immu; 2572 immu_flags = dma_to_immu_flags(dmareq); 2573 2574 attrp = &((ddi_dma_impl_t *)handle)->dmai_attr; 2575 2576 dmar_object = &dmareq->dmar_object; 2577 pparray = dmar_object->dmao_obj.virt_obj.v_priv; 2578 vaddr = dmar_object->dmao_obj.virt_obj.v_addr; 2579 buftype = dmar_object->dmao_type; 2580 size = dmar_object->dmao_size; 2581 2582 IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t, 2583 buftype, uint_t, size); 2584 2585 dcookies = &ihp->ihp_dcookies[0]; 2586 2587 pcnt = dmax = 0; 2588
2589 /* retrieve paddr, psize, offset from dmareq */ 2590 if (buftype == DMA_OTYP_PAGES) { 2591 page = dmar_object->dmao_obj.pp_obj.pp_pp; 2592 offset = dmar_object->dmao_obj.pp_obj.pp_offset & 2593 MMU_PAGEOFFSET; 2594 paddr = pfn_to_pa(page->p_pagenum) + offset; 2595 psize = MIN((MMU_PAGESIZE - offset), size); 2596 page = page->p_next; 2597 } else { 2598 vas = dmar_object->dmao_obj.virt_obj.v_as; 2599 if (vas == NULL) { 2600 vas = &kas; 2601 } 2602 offset = (uintptr_t)vaddr & MMU_PAGEOFFSET; 2603 if (pparray != NULL) { 2604 paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset; 2605 psize = MIN((MMU_PAGESIZE - offset), size); 2606 pcnt++; 2607 } else { 2608 paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, 2609 vaddr)) + offset; 2610 psize = MIN(size, (MMU_PAGESIZE - offset)); 2611 vaddr += psize; 2612 } 2613 } 2614 2615 npgalloc = IMMU_BTOPR(size + offset); 2616
2617 if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) { 2618 #ifdef BUGGY_DRIVERS 2619 rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask; 2620 #else 2621 rwmask = immu->immu_ptemask; 2622 if (immu_flags & IMMU_FLAGS_READ) 2623 rwmask |= PDTE_MASK_R; 2624 if (immu_flags & IMMU_FLAGS_WRITE) 2625 rwmask |= PDTE_MASK_W; 2626 #endif 2627 #ifdef DEBUG 2628 rwmask |= PDTE_MASK_P; 2629 #endif 2630 sdvma = ihp->ihp_predvma; 2631 ihp->ihp_npremapped = npgalloc; 2632 *ihp->ihp_preptes[0] = 2633 PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask; 2634 } else { 2635 ihp->ihp_npremapped = 0; 2636 sdvma = dvma_alloc(domain, attrp, npgalloc, 2637 dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP); 2638 if (sdvma == 0) 2639 return (DDI_DMA_NORESOURCES); 2640 2641 dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET); 2642 dcookies[0].dck_npages = 1; 2643 } 2644 2645 IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc, 2646 uint64_t, sdvma); 2647
2648 dvma = sdvma; 2649 pde_set = 0; 2650 npages = 1; 2651 size -= psize; 2652 while (size > 0) { 2653 /* get the size for this page (i.e. partial or full page) */ 2654 psize = MIN(size, MMU_PAGESIZE); 2655 if (buftype == DMA_OTYP_PAGES) { 2656 /* get the paddr from the page_t */ 2657 paddr = pfn_to_pa(page->p_pagenum); 2658 page = page->p_next; 2659 } else if (pparray != NULL) { 2660 /* index into the array of page_t's to get the paddr */ 2661 paddr = pfn_to_pa(pparray[pcnt]->p_pagenum); 2662 pcnt++; 2663 } else { 2664 /* call into the VM to get the paddr */ 2665 paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr)); 2666 vaddr += psize; 2667 } 2668 2669 npages++; 2670 2671 if (ihp->ihp_npremapped > 0) { 2672 *ihp->ihp_preptes[npages - 1] = 2673 PDTE_PADDR(paddr) | rwmask; 2674 } else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) { 2675 dcookies[dmax].dck_npages++; 2676 } else { 2677 /* not contiguous, we need a new dcookie */ 2678 if (dmax == (IMMU_NDCK - 1)) { 2679 /* 2680 * Ran out of dcookies. Map them now. 
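Up to IMMU_NDCK physically contiguous ranges are batched at a time; when the array fills up, the npages pages gathered so far are mapped starting at dvma, dvma is advanced past them, and the cookie array is reused for the remainder of the buffer.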
2681 */ 2682 if (dvma_map(domain, dvma, 2683 npages, dcookies, dmax + 1, rdip, 2684 immu_flags)) 2685 pde_set++; 2686 2687 IMMU_DPROBE4(immu__dvmamap__early, 2688 dev_info_t *, rdip, uint64_t, dvma, 2689 uint_t, npages, uint_t, dmax+1); 2690 2691 dvma += (npages << IMMU_PAGESHIFT); 2692 npages = 0; 2693 dmax = 0; 2694 } else 2695 dmax++; 2696 dcookies[dmax].dck_paddr = paddr; 2697 dcookies[dmax].dck_npages = 1; 2698 } 2699 size -= psize; 2700 } 2701 2702 /* 2703 * Finish up, mapping all, or all of the remaining, 2704 * physical memory ranges. 2705 */ 2706 if (ihp->ihp_npremapped == 0 && npages > 0) { 2707 IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \ 2708 uint64_t, dvma, uint_t, npages, uint_t, dmax+1); 2709 2710 if (dvma_map(domain, dvma, npages, dcookies, 2711 dmax + 1, rdip, immu_flags)) 2712 pde_set++; 2713 } 2714 2715 /* Invalidate the IOTLB */ 2716 immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc, 2717 pde_set > 0 ? TLB_IVA_WHOLE : TLB_IVA_LEAF, 2718 &ihp->ihp_inv_wait); 2719 2720 ihp->ihp_ndvseg = 1; 2721 ihp->ihp_dvseg[0].dvs_start = sdvma; 2722 ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size; 2723 2724 dma_out->dmao_size = dmar_object->dmao_size; 2725 dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET; 2726 dma_out->dmao_obj.dvma_obj.dv_nseg = 1; 2727 dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0]; 2728 dma_out->dmao_type = DMA_OTYP_DVADDR; 2729 2730 return (DDI_DMA_MAPPED); 2731 } 2732 2733 static int 2734 immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao) 2735 { 2736 uint64_t dvma, npages; 2737 domain_t *domain; 2738 struct dvmaseg *dvs; 2739 2740 domain = IMMU_DEVI(rdip)->imd_domain; 2741 dvs = dmao->dmao_obj.dvma_obj.dv_seg; 2742 2743 dvma = dvs[0].dvs_start; 2744 npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off); 2745 2746 #ifdef DEBUG 2747 /* Unmap only in DEBUG mode */ 2748 dvma_unmap(domain, dvma, npages, rdip); 2749 #endif 2750 dvma_free(domain, dvma, npages); 2751 2752 IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages, 2753 uint64_t, dvma); 2754 2755 #ifdef DEBUG 2756 /* 2757 * In the DEBUG case, the unmap was actually done, 2758 * but an IOTLB flush was not done. So, an explicit 2759 * write back flush is needed. 
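In non-DEBUG kernels the leaf PTEs are deliberately left in place; they are simply overwritten, and the IOTLB flushed, the next time the DVMA range is allocated and mapped again.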
2760 */ 2761 immu_regs_wbf_flush(domain->dom_immu); 2762 #endif 2763 2764 return (DDI_SUCCESS); 2765 } 2766 2767 /* ############################# Functions exported ######################## */ 2768
2769 /* 2770 * set up the DVMA subsystem 2771 * this code runs only for the first IOMMU unit 2772 */ 2773 void 2774 immu_dvma_setup(list_t *listp) 2775 { 2776 immu_t *immu; 2777 uint_t kval; 2778 size_t nchains; 2779 2780 /* locks */ 2781 mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL); 2782 2783 /* Create lists */ 2784 list_create(&immu_unity_domain_list, sizeof (domain_t), 2785 offsetof(domain_t, dom_maptype_node)); 2786 list_create(&immu_xlate_domain_list, sizeof (domain_t), 2787 offsetof(domain_t, dom_maptype_node)); 2788 2789 /* Setup BDF domain hash */ 2790 nchains = 0xff; 2791 kval = mod_hash_iddata_gen(nchains); 2792 2793 bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH", 2794 nchains, mod_hash_null_keydtor, mod_hash_null_valdtor, 2795 mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp, 2796 KM_NOSLEEP); 2797 2798 immu = list_head(listp); 2799 for (; immu; immu = list_next(listp, immu)) { 2800 create_unity_domain(immu); 2801 did_init(immu); 2802 context_init(immu); 2803 immu->immu_dvma_setup = B_TRUE; 2804 } 2805 } 2806
2807 /* 2808 * Start up one DVMA unit 2809 */ 2810 void 2811 immu_dvma_startup(immu_t *immu) 2812 { 2813 if (immu_gfxdvma_enable == B_FALSE && 2814 immu->immu_dvma_gfx_only == B_TRUE) { 2815 return; 2816 } 2817 2818 /* 2819 * DVMA will start once IOMMU is "running" 2820 */ 2821 immu->immu_dvma_running = B_TRUE; 2822 } 2823
2824 /* 2825 * immu_dvma_physmem_update() 2826 * called when the installed memory on a 2827 * system increases, to expand domain DVMA 2828 * for domains with UNITY mapping 2829 */ 2830 void 2831 immu_dvma_physmem_update(uint64_t addr, uint64_t size) 2832 { 2833 uint64_t start; 2834 uint64_t npages; 2835 int dcount; 2836 immu_dcookie_t dcookies[1] = {0}; 2837 domain_t *domain; 2838 2839 /* 2840 * Just walk the system-wide list of domains with 2841 * UNITY mapping. Both the list of *all* domains 2842 * and the list of *UNITY* domains are protected by 2843 * the same single lock 2844 */ 2845 mutex_enter(&immu_domain_lock); 2846 domain = list_head(&immu_unity_domain_list); 2847 for (; domain; domain = list_next(&immu_unity_domain_list, domain)) { 2848 /* 2849 * Nothing to do if the IOMMU supports passthrough. 2850 */ 2851 if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap)) 2852 continue; 2853 2854 /* There is no vmem_arena for unity domains. Just map it */ 2855 ddi_err(DER_LOG, domain->dom_dip, 2856 "iommu: unity-domain: Adding map " 2857 "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size); 2858 2859 start = IMMU_ROUNDOWN(addr); 2860 npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1; 2861 2862 dcookies[0].dck_paddr = start; 2863 dcookies[0].dck_npages = npages; 2864 dcount = 1; 2865 (void) dvma_map(domain, start, npages, 2866 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 2867 2868 } 2869 mutex_exit(&immu_domain_lock); 2870 } 2871
2872 int 2873 immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags) 2874 { 2875 dev_info_t *ddip, *odip; 2876 immu_t *immu; 2877 domain_t *domain; 2878 2879 odip = rdip; 2880 2881 immu = immu_dvma_get_immu(rdip, immu_flags); 2882 if (immu == NULL) { 2883 /* 2884 * It is possible that there is no IOMMU unit for this 2885 * device; BIOS bugs are one example. 
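A DMAR table whose DRHD device scope entries do not cover the device is a typical case.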
2886 */ 2887 ddi_err(DER_WARN, rdip, "No iommu unit found for device"); 2888 return (DDI_DMA_NORESOURCES); 2889 } 2890 2891 /* 2892 * redirect isa devices attached under lpc to lpc dip 2893 */ 2894 if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) { 2895 rdip = get_lpc_devinfo(immu, rdip, immu_flags); 2896 if (rdip == NULL) { 2897 ddi_err(DER_PANIC, rdip, "iommu redirect failed"); 2898 /*NOTREACHED*/ 2899 } 2900 } 2901 2902 /* Reset immu, as redirection can change IMMU */ 2903 immu = NULL; 2904 2905 /* 2906 * for gart, redirect to the real graphic devinfo 2907 */ 2908 if (strcmp(ddi_node_name(rdip), "agpgart") == 0) { 2909 rdip = get_gfx_devinfo(rdip); 2910 if (rdip == NULL) { 2911 ddi_err(DER_PANIC, rdip, "iommu redirect failed"); 2912 /*NOTREACHED*/ 2913 } 2914 } 2915 2916 /* 2917 * Setup DVMA domain for the device. This does 2918 * work only the first time we do DVMA for a 2919 * device. 2920 */ 2921 ddip = NULL; 2922 domain = device_domain(rdip, &ddip, immu_flags); 2923 if (domain == NULL) { 2924 ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device"); 2925 return (DDI_DMA_NORESOURCES); 2926 } 2927 2928 immu = domain->dom_immu; 2929 2930 /* 2931 * If a domain is found, we must also have a domain dip 2932 * which is the topmost ancestor dip of rdip that shares 2933 * the same domain with rdip. 2934 */ 2935 if (domain->dom_did == 0 || ddip == NULL) { 2936 ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)", 2937 domain->dom_did, ddip); 2938 return (DDI_DMA_NORESOURCES); 2939 } 2940 2941 if (odip != rdip) 2942 set_domain(odip, ddip, domain); 2943 2944 /* 2945 * Update the root and context entries 2946 */ 2947 if (immu_context_update(immu, domain, ddip, rdip, immu_flags) 2948 != DDI_SUCCESS) { 2949 ddi_err(DER_MODE, rdip, "DVMA map: context update failed"); 2950 return (DDI_DMA_NORESOURCES); 2951 } 2952 2953 return (DDI_SUCCESS); 2954 } 2955 2956 int 2957 immu_map_memrange(dev_info_t *rdip, memrng_t *mrng) 2958 { 2959 immu_dcookie_t dcookies[1] = {0}; 2960 boolean_t pde_set; 2961 immu_t *immu; 2962 domain_t *domain; 2963 immu_inv_wait_t iw; 2964 2965 dcookies[0].dck_paddr = mrng->mrng_start; 2966 dcookies[0].dck_npages = mrng->mrng_npages; 2967 2968 domain = IMMU_DEVI(rdip)->imd_domain; 2969 immu = domain->dom_immu; 2970 2971 pde_set = dvma_map(domain, mrng->mrng_start, 2972 mrng->mrng_npages, dcookies, 1, rdip, 2973 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); 2974 2975 immu_init_inv_wait(&iw, "memrange", B_TRUE); 2976 2977 immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start, 2978 mrng->mrng_npages, pde_set == B_TRUE ? 2979 TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw); 2980 2981 return (DDI_SUCCESS); 2982 } 2983 2984 immu_devi_t * 2985 immu_devi_get(dev_info_t *rdip) 2986 { 2987 immu_devi_t *immu_devi; 2988 volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu); 2989 2990 /* Just want atomic reads. 
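The pointer is fetched with a single atomic_or_64_nv() of 0, so a concurrent update can never be observed half-written.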
No need for lock */ 2991 immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr, 2992 0); 2993 return (immu_devi); 2994 } 2995 2996 /*ARGSUSED*/ 2997 int 2998 immu_hdl_priv_ctor(void *buf, void *arg, int kmf) 2999 { 3000 immu_hdl_priv_t *ihp; 3001 3002 ihp = buf; 3003 immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE); 3004 3005 return (0); 3006 } 3007 3008 /* 3009 * iommulib interface functions 3010 */ 3011 static int 3012 immu_probe(iommulib_handle_t handle, dev_info_t *dip) 3013 { 3014 immu_devi_t *immu_devi; 3015 int ret; 3016 3017 if (!immu_enable) 3018 return (DDI_FAILURE); 3019 3020 /* 3021 * Make sure the device has all the IOMMU structures 3022 * initialized. If this device goes through an IOMMU 3023 * unit (e.g. this probe function returns success), 3024 * this will be called at most N times, with N being 3025 * the number of IOMMUs in the system. 3026 * 3027 * After that, when iommulib_nex_open succeeds, 3028 * we can always assume that this device has all 3029 * the structures initialized. IOMMU_USED(dip) will 3030 * be true. There is no need to find the controlling 3031 * IOMMU/domain again. 3032 */ 3033 ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP); 3034 if (ret != DDI_SUCCESS) 3035 return (ret); 3036 3037 immu_devi = IMMU_DEVI(dip); 3038 3039 /* 3040 * For unity domains, there is no need to call in to 3041 * the IOMMU code. 3042 */ 3043 if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID) 3044 return (DDI_FAILURE); 3045 3046 if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle)) 3047 return (DDI_SUCCESS); 3048 3049 return (DDI_FAILURE); 3050 } 3051 3052 /*ARGSUSED*/ 3053 static int 3054 immu_allochdl(iommulib_handle_t handle, 3055 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, 3056 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep) 3057 { 3058 int ret; 3059 immu_hdl_priv_t *ihp; 3060 immu_t *immu; 3061 3062 ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp, 3063 arg, dma_handlep); 3064 if (ret == DDI_SUCCESS) { 3065 immu = IMMU_DEVI(rdip)->imd_immu; 3066 3067 ihp = kmem_cache_alloc(immu->immu_hdl_cache, 3068 waitfp == DDI_DMA_SLEEP ? 
KM_SLEEP : KM_NOSLEEP); 3069 if (ihp == NULL) { 3070 (void) iommulib_iommu_dma_freehdl(dip, rdip, 3071 *dma_handlep); 3072 return (DDI_DMA_NORESOURCES); 3073 } 3074 3075 if (IMMU_DEVI(rdip)->imd_use_premap) 3076 dvma_prealloc(rdip, ihp, attr); 3077 else { 3078 ihp->ihp_npremapped = 0; 3079 ihp->ihp_predvma = 0; 3080 } 3081 ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep, 3082 ihp); 3083 } 3084 return (ret); 3085 } 3086 3087 /*ARGSUSED*/ 3088 static int 3089 immu_freehdl(iommulib_handle_t handle, 3090 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) 3091 { 3092 immu_hdl_priv_t *ihp; 3093 3094 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3095 if (ihp != NULL) { 3096 if (IMMU_DEVI(rdip)->imd_use_premap) 3097 dvma_prefree(rdip, ihp); 3098 kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp); 3099 } 3100 3101 return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle)); 3102 } 3103 3104 3105 /*ARGSUSED*/ 3106 static int 3107 immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, 3108 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 3109 struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep, 3110 uint_t *ccountp) 3111 { 3112 int ret; 3113 immu_hdl_priv_t *ihp; 3114 3115 ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle, 3116 dma_req, cookiep, ccountp); 3117 3118 if (ret == DDI_DMA_MAPPED) { 3119 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3120 immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait); 3121 } 3122 3123 return (ret); 3124 } 3125 3126 /*ARGSUSED*/ 3127 static int 3128 immu_unbindhdl(iommulib_handle_t handle, 3129 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) 3130 { 3131 return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle)); 3132 } 3133 3134 /*ARGSUSED*/ 3135 static int 3136 immu_sync(iommulib_handle_t handle, dev_info_t *dip, 3137 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, 3138 size_t len, uint_t cachefl) 3139 { 3140 return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len, 3141 cachefl)); 3142 } 3143 3144 /*ARGSUSED*/ 3145 static int 3146 immu_win(iommulib_handle_t handle, dev_info_t *dip, 3147 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, 3148 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, 3149 uint_t *ccountp) 3150 { 3151 return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp, 3152 lenp, cookiep, ccountp)); 3153 } 3154 3155 /*ARGSUSED*/ 3156 static int 3157 immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, 3158 dev_info_t *rdip, ddi_dma_handle_t dma_handle, 3159 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao) 3160 { 3161 immu_hdl_priv_t *ihp; 3162 3163 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3164 3165 return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao)); 3166 } 3167 3168 /*ARGSUSED*/ 3169 static int 3170 immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, 3171 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao) 3172 { 3173 immu_hdl_priv_t *ihp; 3174 3175 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); 3176 if (ihp->ihp_npremapped > 0) 3177 return (DDI_SUCCESS); 3178 return (immu_unmap_dvmaseg(rdip, dmao)); 3179 } 3180