xref: /illumos-gate/usr/src/uts/i86pc/io/immu_dvma.c (revision 4f364e7c95ee7fd9d5bbeddc1940e92405bb0e72)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Portions Copyright (c) 2010, Oracle and/or its affiliates.
23  * All rights reserved.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 /*
30  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
31  */
32 
33 /*
34  * DVMA code
35  * This file contains Intel IOMMU code that deals with DVMA
36  * i.e. DMA remapping.
37  */
38 
39 #include <sys/sysmacros.h>
40 #include <sys/pcie.h>
41 #include <sys/pci_cfgspace.h>
42 #include <vm/hat_i86.h>
43 #include <sys/memlist.h>
44 #include <sys/acpi/acpi.h>
45 #include <sys/acpica.h>
46 #include <sys/modhash.h>
47 #include <sys/immu.h>
48 #include <sys/x86_archext.h>
49 #include <sys/archsystm.h>
50 
51 #undef	TEST
52 
53 /*
54  * Macros based on PCI spec
55  */
56 #define	IMMU_PCI_REV2CLASS(r)   ((r) >> 8)  /* classcode from revid */
57 #define	IMMU_PCI_CLASS2BASE(c)  ((c) >> 16) /* baseclass from classcode */
58 #define	IMMU_PCI_CLASS2SUB(c)   (((c) >> 8) & 0xff); /* classcode */
59 
60 #define	IMMU_CONTIG_PADDR(d, p) \
61 	((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p))
62 
63 typedef struct dvma_arg {
64 	immu_t *dva_immu;
65 	dev_info_t *dva_rdip;
66 	dev_info_t *dva_ddip;
67 	domain_t *dva_domain;
68 	int dva_level;
69 	immu_flags_t dva_flags;
70 	list_t *dva_list;
71 	int dva_error;
72 } dvma_arg_t;
73 
74 static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
75     dev_info_t *rdip, immu_flags_t immu_flags);
76 static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
77     int dev, int func, immu_flags_t immu_flags);
78 static void destroy_immu_devi(immu_devi_t *immu_devi);
79 static boolean_t dvma_map(domain_t *domain, uint64_t sdvma,
80     uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
81     immu_flags_t immu_flags);
82 
83 /* Extern globals */
84 extern struct memlist  *phys_install;
85 
86 /*
87  * iommulib interface functions.
88  */
89 static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip);
90 static int immu_allochdl(iommulib_handle_t handle,
91     dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
92     int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep);
93 static int immu_freehdl(iommulib_handle_t handle,
94     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
95 static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
96     dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req,
97     ddi_dma_cookie_t *cookiep, uint_t *ccountp);
98 static int immu_unbindhdl(iommulib_handle_t handle,
99     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
100 static int immu_sync(iommulib_handle_t handle, dev_info_t *dip,
101     dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len,
102     uint_t cachefl);
103 static int immu_win(iommulib_handle_t handle, dev_info_t *dip,
104     dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
105     off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp);
106 static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
107     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
108     struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao);
109 static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
110     dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao);
111 
112 /* static Globals */
113 
114 /*
115  * Used to setup DMA objects (memory regions)
116  * for DMA reads by IOMMU units
117  */
118 static ddi_dma_attr_t immu_dma_attr = {
119 	DMA_ATTR_V0,
120 	0U,
121 	0xffffffffffffffffULL,
122 	0xffffffffU,
123 	MMU_PAGESIZE, /* MMU page aligned */
124 	0x1,
125 	0x1,
126 	0xffffffffU,
127 	0xffffffffffffffffULL,
128 	1,
129 	4,
130 	0
131 };
132 
133 static ddi_device_acc_attr_t immu_acc_attr = {
134 	DDI_DEVICE_ATTR_V0,
135 	DDI_NEVERSWAP_ACC,
136 	DDI_STRICTORDER_ACC
137 };
138 
139 struct iommulib_ops immulib_ops = {
140 	IOMMU_OPS_VERSION,
141 	INTEL_IOMMU,
142 	"Intel IOMMU",
143 	NULL,
144 	immu_probe,
145 	immu_allochdl,
146 	immu_freehdl,
147 	immu_bindhdl,
148 	immu_unbindhdl,
149 	immu_sync,
150 	immu_win,
151 	immu_mapobject,
152 	immu_unmapobject,
153 };
154 
155 /*
156  * Fake physical address range used to set up initial prealloc mappings.
157  * This memory is never actually accessed. It is mapped read-only,
158  * and is overwritten as soon as the first DMA bind operation is
159  * performed. Since 0 is a special case, just start at the 2nd
160  * physical page.
161  */
162 
163 static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES };
164 
165 /* globals private to this file */
166 static kmutex_t immu_domain_lock;
167 static list_t immu_unity_domain_list;
168 static list_t immu_xlate_domain_list;
169 
170 /* structure used to store idx into each level of the page tables */
171 typedef struct xlate {
172 	int xlt_level;
173 	uint_t xlt_idx;
174 	pgtable_t *xlt_pgtable;
175 } xlate_t;
176 
177 /* 0 is reserved by Vt-d spec. Solaris reserves 1 */
178 #define	IMMU_UNITY_DID   1
179 
180 static mod_hash_t *bdf_domain_hash;
181 
182 int immu_use_alh;
183 int immu_use_tm;
184 
185 static domain_t *
186 bdf_domain_lookup(immu_devi_t *immu_devi)
187 {
188 	domain_t *domain;
189 	int16_t seg = immu_devi->imd_seg;
190 	int16_t bus = immu_devi->imd_bus;
191 	int16_t devfunc = immu_devi->imd_devfunc;
192 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
193 
194 	if (seg < 0 || bus < 0 || devfunc < 0) {
195 		return (NULL);
196 	}
197 
198 	domain = NULL;
199 	if (mod_hash_find(bdf_domain_hash,
200 	    (void *)bdf, (void *)&domain) == 0) {
201 		ASSERT(domain);
202 		ASSERT(domain->dom_did > 0);
203 		return (domain);
204 	} else {
205 		return (NULL);
206 	}
207 }
208 
209 static void
210 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
211 {
212 	int16_t seg = immu_devi->imd_seg;
213 	int16_t bus = immu_devi->imd_bus;
214 	int16_t devfunc = immu_devi->imd_devfunc;
215 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
216 
217 	if (seg < 0 || bus < 0 || devfunc < 0) {
218 		return;
219 	}
220 
221 	(void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
222 }
223 
224 static int
225 match_lpc(dev_info_t *pdip, void *arg)
226 {
227 	immu_devi_t *immu_devi;
228 	dvma_arg_t *dvap = (dvma_arg_t *)arg;
229 
230 	if (list_is_empty(dvap->dva_list)) {
231 		return (DDI_WALK_TERMINATE);
232 	}
233 
234 	immu_devi = list_head(dvap->dva_list);
235 	for (; immu_devi; immu_devi = list_next(dvap->dva_list,
236 	    immu_devi)) {
237 		if (immu_devi->imd_dip == pdip) {
238 			dvap->dva_ddip = pdip;
239 			dvap->dva_error = DDI_SUCCESS;
240 			return (DDI_WALK_TERMINATE);
241 		}
242 	}
243 
244 	return (DDI_WALK_CONTINUE);
245 }
246 
247 static void
248 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
249 {
250 	list_t *spclist = NULL;
251 	immu_devi_t *immu_devi;
252 
253 	immu_devi = IMMU_DEVI(dip);
254 	if (immu_devi->imd_display == B_TRUE) {
255 		spclist = &(immu->immu_dvma_gfx_list);
256 	} else if (immu_devi->imd_lpc == B_TRUE) {
257 		spclist = &(immu->immu_dvma_lpc_list);
258 	}
259 
260 	if (spclist) {
261 		mutex_enter(&(immu->immu_lock));
262 		list_insert_head(spclist, immu_devi);
263 		mutex_exit(&(immu->immu_lock));
264 	}
265 }
266 
267 /*
268  * Set the immu_devi struct in the immu_devi field of a devinfo node
269  */
270 int
271 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
272 {
273 	int bus, dev, func;
274 	immu_devi_t *new_imd;
275 	immu_devi_t *immu_devi;
276 
277 	immu_devi = immu_devi_get(dip);
278 	if (immu_devi != NULL) {
279 		return (DDI_SUCCESS);
280 	}
281 
282 	bus = dev = func = -1;
283 
284 	/*
285 	 * Assume a new immu_devi struct is needed
286 	 */
287 	if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
288 		/*
289 		 * No BDF. Set bus = -1 to indicate this.
290 		 * We still need to create a immu_devi struct
291 		 * though
292 		 */
293 		bus = -1;
294 		dev = 0;
295 		func = 0;
296 	}
297 
298 	new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
299 	if (new_imd  == NULL) {
300 		ddi_err(DER_WARN, dip, "Failed to create immu_devi "
301 		    "structure");
302 		return (DDI_FAILURE);
303 	}
304 
305 	/*
306 	 * Check if some other thread allocated a immu_devi while we
307 	 * didn't own the lock.
308 	 */
309 	mutex_enter(&(DEVI(dip)->devi_lock));
310 	if (IMMU_DEVI(dip) == NULL) {
311 		IMMU_DEVI_SET(dip, new_imd);
312 	} else {
313 		destroy_immu_devi(new_imd);
314 	}
315 	mutex_exit(&(DEVI(dip)->devi_lock));
316 
317 	return (DDI_SUCCESS);
318 }
319 
320 static dev_info_t *
321 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
322 {
323 	dvma_arg_t dvarg = {0};
324 	dvarg.dva_list = &(immu->immu_dvma_lpc_list);
325 	dvarg.dva_rdip = rdip;
326 	dvarg.dva_error = DDI_FAILURE;
327 
328 	if (immu_walk_ancestor(rdip, NULL, match_lpc,
329 	    &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
330 		ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
331 		    "find lpc_devinfo for ISA device");
332 		return (NULL);
333 	}
334 
335 	if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
336 		ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
337 		    "ISA device");
338 		return (NULL);
339 	}
340 
341 	return (dvarg.dva_ddip);
342 }
343 
344 static dev_info_t *
345 get_gfx_devinfo(dev_info_t *rdip)
346 {
347 	immu_t *immu;
348 	immu_devi_t *immu_devi;
349 	list_t *list_gfx;
350 
351 	/*
352 	 * The GFX device may not be on the same iommu unit as "agpgart"
353 	 * so search globally
354 	 */
355 	immu_devi = NULL;
356 	immu = list_head(&immu_list);
357 	for (; immu; immu = list_next(&immu_list, immu)) {
358 		list_gfx = &(immu->immu_dvma_gfx_list);
359 		if (!list_is_empty(list_gfx)) {
360 			immu_devi = list_head(list_gfx);
361 			break;
362 		}
363 	}
364 
365 	if (immu_devi == NULL) {
366 		ddi_err(DER_WARN, rdip, "iommu: No GFX device. "
367 		    "Cannot redirect agpgart");
368 		return (NULL);
369 	}
370 
371 	ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s",
372 	    ddi_node_name(immu_devi->imd_dip));
373 
374 	return (immu_devi->imd_dip);
375 }
376 
377 static immu_flags_t
378 dma_to_immu_flags(struct ddi_dma_req *dmareq)
379 {
380 	immu_flags_t flags = 0;
381 
382 	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
383 		flags |= IMMU_FLAGS_SLEEP;
384 	} else {
385 		flags |= IMMU_FLAGS_NOSLEEP;
386 	}
387 
388 #ifdef BUGGY_DRIVERS
389 
390 	flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
391 
392 #else
393 	/*
394 	 * Read and write flags need to be reversed.
395 	 * DMA_READ means read from device and write
396 	 * to memory. So DMA read means DVMA write.
397 	 */
398 	if (dmareq->dmar_flags & DDI_DMA_READ)
399 		flags |= IMMU_FLAGS_WRITE;
400 
401 	if (dmareq->dmar_flags & DDI_DMA_WRITE)
402 		flags |= IMMU_FLAGS_READ;
403 
404 	/*
405 	 * Some buggy drivers specify neither READ or WRITE
406 	 * For such drivers set both read and write permissions
407 	 */
408 	if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
409 		flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
410 	}
411 #endif
412 
413 	return (flags);
414 }
415 
416 /*ARGSUSED*/
417 int
418 pgtable_ctor(void *buf, void *arg, int kmflag)
419 {
420 	size_t actual_size = 0;
421 	pgtable_t *pgtable;
422 	int (*dmafp)(caddr_t);
423 	caddr_t vaddr;
424 	void *next;
425 	uint_t flags;
426 	immu_t *immu = arg;
427 
428 	pgtable = (pgtable_t *)buf;
429 
430 	dmafp = (kmflag & KM_NOSLEEP) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
431 
432 	next = kmem_zalloc(IMMU_PAGESIZE, kmflag);
433 	if (next == NULL) {
434 		return (-1);
435 	}
436 
437 	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
438 	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
439 		kmem_free(next, IMMU_PAGESIZE);
440 		return (-1);
441 	}
442 
443 	flags = DDI_DMA_CONSISTENT;
444 	if (!immu->immu_dvma_coherent)
445 		flags |= IOMEM_DATA_UC_WR_COMBINE;
446 
447 	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
448 	    &immu_acc_attr, flags,
449 	    dmafp, NULL, &vaddr, &actual_size,
450 	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
451 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
452 		kmem_free(next, IMMU_PAGESIZE);
453 		return (-1);
454 	}
455 
456 	/*
457 	 * Memory allocation failure. Maybe a temporary condition
458 	 * so return error rather than panic, so we can try again
459 	 */
460 	if (actual_size < IMMU_PAGESIZE) {
461 		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
462 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
463 		kmem_free(next, IMMU_PAGESIZE);
464 		return (-1);
465 	}
466 
467 	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
468 	pgtable->hwpg_vaddr = vaddr;
469 	pgtable->swpg_next_array = next;
470 
471 	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);
472 
473 	return (0);
474 }
475 
476 /*ARGSUSED*/
477 void
478 pgtable_dtor(void *buf, void *arg)
479 {
480 	pgtable_t *pgtable;
481 
482 	pgtable = (pgtable_t *)buf;
483 
484 	/* destroy will panic if lock is held. */
485 	rw_destroy(&(pgtable->swpg_rwlock));
486 
487 	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
488 	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
489 	kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
490 }
491 
492 /*
493  * pgtable_alloc()
494  *	alloc a IOMMU pgtable structure.
495  *	This same struct is used for root and context tables as well.
496  *	This routine allocs the f/ollowing:
497  *	- a pgtable_t struct
498  *	- a HW page which holds PTEs/entries which is accesssed by HW
499  *        so we set up DMA for this page
500  *	- a SW page which is only for our bookeeping
501  *        (for example to  hold pointers to the next level pgtable).
502  *        So a simple kmem_alloc suffices
503  */
504 static pgtable_t *
505 pgtable_alloc(immu_t *immu, immu_flags_t immu_flags)
506 {
507 	pgtable_t *pgtable;
508 	int kmflags;
509 
510 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
511 
512 	pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags);
513 	if (pgtable == NULL) {
514 		return (NULL);
515 	}
516 	return (pgtable);
517 }
518 
519 static void
520 pgtable_zero(pgtable_t *pgtable)
521 {
522 	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
523 	bzero(pgtable->swpg_next_array, IMMU_PAGESIZE);
524 }
525 
526 static void
527 pgtable_free(immu_t *immu, pgtable_t *pgtable)
528 {
529 	kmem_cache_free(immu->immu_pgtable_cache, pgtable);
530 }
531 
532 /*
533  * Function to identify a display device from the PCI class code
534  */
535 static boolean_t
536 device_is_display(uint_t classcode)
537 {
538 	static uint_t disp_classes[] = {
539 		0x000100,
540 		0x030000,
541 		0x030001
542 	};
543 	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
544 
545 	for (i = 0; i < nclasses; i++) {
546 		if (classcode == disp_classes[i])
547 			return (B_TRUE);
548 	}
549 	return (B_FALSE);
550 }
551 
552 /*
553  * Function that determines if device is PCIEX and/or PCIEX bridge
554  */
555 static boolean_t
556 device_is_pciex(
557 	uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
558 {
559 	ushort_t cap;
560 	ushort_t capsp;
561 	ushort_t cap_count = PCI_CAP_MAX_PTR;
562 	ushort_t status;
563 	boolean_t is_pciex = B_FALSE;
564 
565 	*is_pcib = B_FALSE;
566 
567 	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
568 	if (!(status & PCI_STAT_CAP))
569 		return (B_FALSE);
570 
571 	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
572 	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
573 		capsp &= PCI_CAP_PTR_MASK;
574 		cap = pci_getb_func(bus, dev, func, capsp);
575 
576 		if (cap == PCI_CAP_ID_PCI_E) {
577 			status = pci_getw_func(bus, dev, func, capsp + 2);
578 			/*
579 			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
580 			 * for Device/Port Type.
581 			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
582 			 * device is a PCIE2PCI bridge
583 			 */
584 			*is_pcib =
585 			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
586 			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
587 			is_pciex = B_TRUE;
588 		}
589 
590 		capsp = (*pci_getb_func)(bus, dev, func,
591 		    capsp + PCI_CAP_NEXT_PTR);
592 	}
593 
594 	return (is_pciex);
595 }
596 
597 static boolean_t
598 device_use_premap(uint_t classcode)
599 {
600 	if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET)
601 		return (B_TRUE);
602 	return (B_FALSE);
603 }
604 
605 
606 /*
607  * immu_dvma_get_immu()
608  *   get the immu unit structure for a dev_info node
609  */
610 immu_t *
611 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
612 {
613 	immu_devi_t *immu_devi;
614 	immu_t *immu;
615 
616 	/*
617 	 * check if immu unit was already found earlier.
618 	 * If yes, then it will be stashed in immu_devi struct.
619 	 */
620 	immu_devi = immu_devi_get(dip);
621 	if (immu_devi == NULL) {
622 		if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
623 			/*
624 			 * May fail because of low memory. Return error rather
625 			 * than panic as we want driver to rey again later
626 			 */
627 			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
628 			    "No immu_devi structure");
629 			/*NOTREACHED*/
630 		}
631 		immu_devi = immu_devi_get(dip);
632 	}
633 
634 	mutex_enter(&(DEVI(dip)->devi_lock));
635 	if (immu_devi->imd_immu) {
636 		immu = immu_devi->imd_immu;
637 		mutex_exit(&(DEVI(dip)->devi_lock));
638 		return (immu);
639 	}
640 	mutex_exit(&(DEVI(dip)->devi_lock));
641 
642 	immu = immu_dmar_get_immu(dip);
643 	if (immu == NULL) {
644 		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
645 		    "Cannot find immu_t for device");
646 		/*NOTREACHED*/
647 	}
648 
649 	/*
650 	 * Check if some other thread found immu
651 	 * while lock was not held
652 	 */
653 	immu_devi = immu_devi_get(dip);
654 	/* immu_devi should be present as we found it earlier */
655 	if (immu_devi == NULL) {
656 		ddi_err(DER_PANIC, dip,
657 		    "immu_dvma_get_immu: No immu_devi structure");
658 		/*NOTREACHED*/
659 	}
660 
661 	mutex_enter(&(DEVI(dip)->devi_lock));
662 	if (immu_devi->imd_immu == NULL) {
663 		/* nobody else set it, so we should do it */
664 		immu_devi->imd_immu = immu;
665 		immu_devi_set_spclist(dip, immu);
666 	} else {
667 		/*
668 		 * if some other thread got immu before
669 		 * us, it should get the same results
670 		 */
671 		if (immu_devi->imd_immu != immu) {
672 			ddi_err(DER_PANIC, dip, "Multiple "
673 			    "immu units found for device. Expected (%p), "
674 			    "actual (%p)", (void *)immu,
675 			    (void *)immu_devi->imd_immu);
676 			mutex_exit(&(DEVI(dip)->devi_lock));
677 			/*NOTREACHED*/
678 		}
679 	}
680 	mutex_exit(&(DEVI(dip)->devi_lock));
681 
682 	return (immu);
683 }
684 
685 
686 /* ############################# IMMU_DEVI code ############################ */
687 
688 /*
689  * Allocate a immu_devi structure and initialize it
690  */
691 static immu_devi_t *
692 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
693     immu_flags_t immu_flags)
694 {
695 	uchar_t baseclass, subclass;
696 	uint_t classcode, revclass;
697 	immu_devi_t *immu_devi;
698 	boolean_t pciex = B_FALSE;
699 	int kmflags;
700 	boolean_t is_pcib = B_FALSE;
701 
702 	/* bus ==  -1 indicate non-PCI device (no BDF) */
703 	ASSERT(bus == -1 || bus >= 0);
704 	ASSERT(dev >= 0);
705 	ASSERT(func >= 0);
706 
707 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
708 	immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
709 	if (immu_devi == NULL) {
710 		ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
711 		    "Intel IOMMU immu_devi structure");
712 		return (NULL);
713 	}
714 	immu_devi->imd_dip = rdip;
715 	immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
716 	immu_devi->imd_bus = bus;
717 	immu_devi->imd_pcib_type = IMMU_PCIB_BAD;
718 
719 	if (bus == -1) {
720 		immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
721 		return (immu_devi);
722 	}
723 
724 	immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
725 	immu_devi->imd_sec = 0;
726 	immu_devi->imd_sub = 0;
727 
728 	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
729 
730 	classcode = IMMU_PCI_REV2CLASS(revclass);
731 	baseclass = IMMU_PCI_CLASS2BASE(classcode);
732 	subclass = IMMU_PCI_CLASS2SUB(classcode);
733 
734 	if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {
735 
736 		immu_devi->imd_sec = pci_getb_func(bus, dev, func,
737 		    PCI_BCNF_SECBUS);
738 		immu_devi->imd_sub = pci_getb_func(bus, dev, func,
739 		    PCI_BCNF_SUBBUS);
740 
741 		pciex = device_is_pciex(bus, dev, func, &is_pcib);
742 		if (pciex  == B_TRUE && is_pcib == B_TRUE) {
743 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
744 		} else if (pciex == B_TRUE) {
745 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
746 		} else {
747 			immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
748 		}
749 	} else {
750 		immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
751 	}
752 
753 	/* check for certain special devices */
754 	immu_devi->imd_display = device_is_display(classcode);
755 	immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
756 	    (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;
757 	immu_devi->imd_use_premap = device_use_premap(classcode);
758 
759 	immu_devi->imd_domain = NULL;
760 
761 	immu_devi->imd_dvma_flags = immu_global_dvma_flags;
762 
763 	return (immu_devi);
764 }
765 
766 static void
767 destroy_immu_devi(immu_devi_t *immu_devi)
768 {
769 	kmem_free(immu_devi, sizeof (immu_devi_t));
770 }
771 
772 static domain_t *
773 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
774 {
775 	immu_devi_t *immu_devi;
776 	domain_t *domain;
777 	dev_info_t *ddip;
778 
779 	*ddipp = NULL;
780 
781 	immu_devi = immu_devi_get(rdip);
782 	if (immu_devi == NULL) {
783 		return (NULL);
784 	}
785 
786 	mutex_enter(&(DEVI(rdip)->devi_lock));
787 	domain = immu_devi->imd_domain;
788 	ddip = immu_devi->imd_ddip;
789 	mutex_exit(&(DEVI(rdip)->devi_lock));
790 
791 	if (domain)
792 		*ddipp = ddip;
793 
794 	return (domain);
795 
796 }
797 
798 /* ############################# END IMMU_DEVI code ######################## */
799 /* ############################# DOMAIN code ############################### */
800 
801 /*
802  * This routine always succeeds
803  */
804 static int
805 did_alloc(immu_t *immu, dev_info_t *rdip,
806     dev_info_t *ddip, immu_flags_t immu_flags)
807 {
808 	int did;
809 
810 	did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
811 	    (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);
812 
813 	if (did == 0) {
814 		ddi_err(DER_WARN, rdip, "device domain-id alloc error"
815 		    " domain-device: %s%d. immu unit is %s. Using "
816 		    "unity domain with domain-id (%d)",
817 		    ddi_driver_name(ddip), ddi_get_instance(ddip),
818 		    immu->immu_name, immu->immu_unity_domain->dom_did);
819 		did = immu->immu_unity_domain->dom_did;
820 	}
821 
822 	return (did);
823 }
824 
825 static int
826 get_branch_domain(dev_info_t *pdip, void *arg)
827 {
828 	immu_devi_t *immu_devi;
829 	domain_t *domain;
830 	dev_info_t *ddip;
831 	immu_t *immu;
832 	dvma_arg_t *dvp = (dvma_arg_t *)arg;
833 
834 	/*
835 	 * The field dvp->dva_rdip is a work-in-progress
836 	 * and gets updated as we walk up the ancestor
837 	 * tree. The final ddip is set only when we reach
838 	 * the top of the tree. So the dvp->dva_ddip field cannot
839 	 * be relied on until we reach the top of the field.
840 	 */
841 
842 	/* immu_devi may not be set. */
843 	immu_devi = immu_devi_get(pdip);
844 	if (immu_devi == NULL) {
845 		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
846 			dvp->dva_error = DDI_FAILURE;
847 			return (DDI_WALK_TERMINATE);
848 		}
849 	}
850 
851 	immu_devi = immu_devi_get(pdip);
852 	immu = immu_devi->imd_immu;
853 	if (immu == NULL)
854 		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
855 
856 	/*
857 	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
858 	 * terminate the walk (since the device under the PCIE bridge
859 	 * is a PCIE device and has an independent entry in the
860 	 * root/context table)
861 	 */
862 	if (dvp->dva_rdip != pdip &&
863 	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
864 		return (DDI_WALK_TERMINATE);
865 	}
866 
867 	/*
868 	 * In order to be a domain-dim, it must be a PCI device i.e.
869 	 * must have valid BDF. This also eliminates the root complex.
870 	 */
871 	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
872 	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
873 		ASSERT(immu_devi->imd_bus >= 0);
874 		ASSERT(immu_devi->imd_devfunc >= 0);
875 		dvp->dva_ddip = pdip;
876 	}
877 
878 	if (immu_devi->imd_display == B_TRUE ||
879 	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
880 		dvp->dva_domain = immu->immu_unity_domain;
881 		/* continue walking to find ddip */
882 		return (DDI_WALK_CONTINUE);
883 	}
884 
885 	mutex_enter(&(DEVI(pdip)->devi_lock));
886 	domain = immu_devi->imd_domain;
887 	ddip = immu_devi->imd_ddip;
888 	mutex_exit(&(DEVI(pdip)->devi_lock));
889 
890 	if (domain && ddip) {
891 		/* if domain is set, it must be the same */
892 		if (dvp->dva_domain) {
893 			ASSERT(domain == dvp->dva_domain);
894 		}
895 		dvp->dva_domain = domain;
896 		dvp->dva_ddip = ddip;
897 		return (DDI_WALK_TERMINATE);
898 	}
899 
900 	/* Domain may already be set, continue walking so that ddip gets set */
901 	if (dvp->dva_domain) {
902 		return (DDI_WALK_CONTINUE);
903 	}
904 
905 	/* domain is not set in either immu_devi or dvp */
906 	domain = bdf_domain_lookup(immu_devi);
907 	if (domain == NULL) {
908 		return (DDI_WALK_CONTINUE);
909 	}
910 
911 	/* ok, the BDF hash had a domain for this BDF. */
912 
913 	/* Grab lock again to check if something else set immu_devi fields */
914 	mutex_enter(&(DEVI(pdip)->devi_lock));
915 	if (immu_devi->imd_domain != NULL) {
916 		dvp->dva_domain = domain;
917 	} else {
918 		dvp->dva_domain = domain;
919 	}
920 	mutex_exit(&(DEVI(pdip)->devi_lock));
921 
922 	/*
923 	 * walk upwards until the topmost PCI bridge is found
924 	 */
925 	return (DDI_WALK_CONTINUE);
926 
927 }
928 
929 static void
930 map_unity_domain(domain_t *domain)
931 {
932 	struct memlist *mp;
933 	uint64_t start;
934 	uint64_t npages;
935 	immu_dcookie_t dcookies[1] = {0};
936 	int dcount = 0;
937 
938 	/*
939 	 * UNITY arenas are a mirror of the physical memory
940 	 * installed on the system.
941 	 */
942 
943 #ifdef BUGGY_DRIVERS
944 	/*
945 	 * Dont skip page0. Some broken HW/FW access it.
946 	 */
947 	dcookies[0].dck_paddr = 0;
948 	dcookies[0].dck_npages = 1;
949 	dcount = 1;
950 	(void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
951 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
952 #endif
953 
954 	memlist_read_lock();
955 
956 	mp = phys_install;
957 
958 	if (mp->ml_address == 0) {
959 		/* since we already mapped page1 above */
960 		start = IMMU_PAGESIZE;
961 	} else {
962 		start = mp->ml_address;
963 	}
964 	npages = mp->ml_size/IMMU_PAGESIZE + 1;
965 
966 	dcookies[0].dck_paddr = start;
967 	dcookies[0].dck_npages = npages;
968 	dcount = 1;
969 	(void) dvma_map(domain, start, npages, dcookies,
970 	    dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
971 
972 	ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64
973 	    " - 0x%" PRIx64 "]", start, start + mp->ml_size);
974 
975 	mp = mp->ml_next;
976 	while (mp) {
977 		ddi_err(DER_LOG, domain->dom_dip,
978 		    "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
979 		    mp->ml_address, mp->ml_address + mp->ml_size);
980 
981 		start = mp->ml_address;
982 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
983 
984 		dcookies[0].dck_paddr = start;
985 		dcookies[0].dck_npages = npages;
986 		dcount = 1;
987 		(void) dvma_map(domain, start, npages,
988 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
989 		mp = mp->ml_next;
990 	}
991 
992 	mp = bios_rsvd;
993 	while (mp) {
994 		ddi_err(DER_LOG, domain->dom_dip,
995 		    "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
996 		    mp->ml_address, mp->ml_address + mp->ml_size);
997 
998 		start = mp->ml_address;
999 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
1000 
1001 		dcookies[0].dck_paddr = start;
1002 		dcookies[0].dck_npages = npages;
1003 		dcount = 1;
1004 		(void) dvma_map(domain, start, npages,
1005 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
1006 
1007 		mp = mp->ml_next;
1008 	}
1009 
1010 	memlist_read_unlock();
1011 }
1012 
1013 /*
1014  * create_xlate_arena()
1015  * 	Create the dvma arena for a domain with translation
1016  *	mapping
1017  */
1018 static void
1019 create_xlate_arena(immu_t *immu, domain_t *domain,
1020     dev_info_t *rdip, immu_flags_t immu_flags)
1021 {
1022 	char *arena_name;
1023 	struct memlist *mp;
1024 	int vmem_flags;
1025 	uint64_t start;
1026 	uint_t mgaw;
1027 	uint64_t size;
1028 	uint64_t maxaddr;
1029 	void *vmem_ret;
1030 
1031 	arena_name = domain->dom_dvma_arena_name;
1032 
1033 	/* Note, don't do sizeof (arena_name) - it is just a pointer */
1034 	(void) snprintf(arena_name,
1035 	    sizeof (domain->dom_dvma_arena_name),
1036 	    "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
1037 	    domain->dom_did);
1038 
1039 	vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;
1040 
1041 	/* Restrict mgaddr (max guest addr) to MGAW */
1042 	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
1043 
1044 	/*
1045 	 * To ensure we avoid ioapic and PCI MMIO ranges we just
1046 	 * use the physical memory address range of the system as the
1047 	 * range
1048 	 */
1049 	maxaddr = ((uint64_t)1 << mgaw);
1050 
1051 	memlist_read_lock();
1052 
1053 	mp = phys_install;
1054 
1055 	if (mp->ml_address == 0)
1056 		start = MMU_PAGESIZE;
1057 	else
1058 		start = mp->ml_address;
1059 
1060 	if (start + mp->ml_size > maxaddr)
1061 		size = maxaddr - start;
1062 	else
1063 		size = mp->ml_size;
1064 
1065 	ddi_err(DER_VERB, rdip,
1066 	    "iommu: %s: Creating dvma vmem arena [0x%" PRIx64
1067 	    " - 0x%" PRIx64 "]", arena_name, start, start + size);
1068 
1069 	/*
1070 	 * We always allocate in quanta of IMMU_PAGESIZE
1071 	 */
1072 	domain->dom_dvma_arena = vmem_create(arena_name,
1073 	    (void *)(uintptr_t)start,	/* start addr */
1074 	    size,			/* size */
1075 	    IMMU_PAGESIZE,		/* quantum */
1076 	    NULL,			/* afunc */
1077 	    NULL,			/* ffunc */
1078 	    NULL,			/* source */
1079 	    0,				/* qcache_max */
1080 	    vmem_flags);
1081 
1082 	if (domain->dom_dvma_arena == NULL) {
1083 		ddi_err(DER_PANIC, rdip,
1084 		    "Failed to allocate DVMA arena(%s) "
1085 		    "for domain ID (%d)", arena_name, domain->dom_did);
1086 		/*NOTREACHED*/
1087 	}
1088 
1089 	mp = mp->ml_next;
1090 	while (mp) {
1091 
1092 		if (mp->ml_address == 0)
1093 			start = MMU_PAGESIZE;
1094 		else
1095 			start = mp->ml_address;
1096 
1097 		if (start + mp->ml_size > maxaddr)
1098 			size = maxaddr - start;
1099 		else
1100 			size = mp->ml_size;
1101 
1102 		ddi_err(DER_VERB, rdip,
1103 		    "iommu: %s: Adding dvma vmem span [0x%" PRIx64
1104 		    " - 0x%" PRIx64 "]", arena_name, start,
1105 		    start + size);
1106 
1107 		vmem_ret = vmem_add(domain->dom_dvma_arena,
1108 		    (void *)(uintptr_t)start, size,  vmem_flags);
1109 
1110 		if (vmem_ret == NULL) {
1111 			ddi_err(DER_PANIC, rdip,
1112 			    "Failed to allocate DVMA arena(%s) "
1113 			    "for domain ID (%d)",
1114 			    arena_name, domain->dom_did);
1115 			/*NOTREACHED*/
1116 		}
1117 		mp = mp->ml_next;
1118 	}
1119 	memlist_read_unlock();
1120 }
1121 
1122 /* ################################### DOMAIN CODE ######################### */
1123 
1124 /*
1125  * Set the domain and domain-dip for a dip
1126  */
1127 static void
1128 set_domain(
1129 	dev_info_t *dip,
1130 	dev_info_t *ddip,
1131 	domain_t *domain)
1132 {
1133 	immu_devi_t *immu_devi;
1134 	domain_t *fdomain;
1135 	dev_info_t *fddip;
1136 
1137 	immu_devi = immu_devi_get(dip);
1138 
1139 	mutex_enter(&(DEVI(dip)->devi_lock));
1140 	fddip = immu_devi->imd_ddip;
1141 	fdomain = immu_devi->imd_domain;
1142 
1143 	if (fddip) {
1144 		ASSERT(fddip == ddip);
1145 	} else {
1146 		immu_devi->imd_ddip = ddip;
1147 	}
1148 
1149 	if (fdomain) {
1150 		ASSERT(fdomain == domain);
1151 	} else {
1152 		immu_devi->imd_domain = domain;
1153 	}
1154 	mutex_exit(&(DEVI(dip)->devi_lock));
1155 }
1156 
1157 /*
1158  * device_domain()
1159  * 	Get domain for a device. The domain may be global in which case it
1160  *	is shared between all IOMMU units. Due to potential AGAW differences
1161  *      between IOMMU units, such global domains *have to be* UNITY mapping
1162  *      domains. Alternatively, the domain may be local to a IOMMU unit.
1163  *	Local domains may be shared or immu_devi, although the
1164  *      scope of sharing
1165  *	is restricted to devices controlled by the IOMMU unit to
1166  *      which the domain
1167  *	belongs. If shared, they (currently) have to be UNITY domains. If
1168  *      immu_devi a domain may be either UNITY or translation (XLATE) domain.
1169  */
1170 static domain_t *
1171 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
1172 {
1173 	dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
1174 	immu_t *immu;
1175 	domain_t *domain;
1176 	dvma_arg_t dvarg = {0};
1177 	int level;
1178 
1179 	*ddipp = NULL;
1180 
1181 	/*
1182 	 * Check if the domain is already set. This is usually true
1183 	 * if this is not the first DVMA transaction.
1184 	 */
1185 	ddip = NULL;
1186 	domain = immu_devi_domain(rdip, &ddip);
1187 	if (domain) {
1188 		*ddipp = ddip;
1189 		return (domain);
1190 	}
1191 
1192 	immu = immu_dvma_get_immu(rdip, immu_flags);
1193 	if (immu == NULL) {
1194 		/*
1195 		 * possible that there is no IOMMU unit for this device
1196 		 * - BIOS bugs are one example.
1197 		 */
1198 		ddi_err(DER_WARN, rdip, "No iommu unit found for device");
1199 		return (NULL);
1200 	}
1201 
1202 	immu_flags |= immu_devi_get(rdip)->imd_dvma_flags;
1203 
1204 	dvarg.dva_rdip = rdip;
1205 	dvarg.dva_ddip = NULL;
1206 	dvarg.dva_domain = NULL;
1207 	dvarg.dva_flags = immu_flags;
1208 	level = 0;
1209 	if (immu_walk_ancestor(rdip, NULL, get_branch_domain,
1210 	    &dvarg, &level, immu_flags) != DDI_SUCCESS) {
1211 		/*
1212 		 * maybe low memory. return error,
1213 		 * so driver tries again later
1214 		 */
1215 		return (NULL);
1216 	}
1217 
1218 	/* should have walked at least 1 dip (i.e. edip) */
1219 	ASSERT(level > 0);
1220 
1221 	ddip = dvarg.dva_ddip;	/* must be present */
1222 	domain = dvarg.dva_domain;	/* may be NULL */
1223 
1224 	/*
1225 	 * We may find the domain during our ancestor walk on any one of our
1226 	 * ancestor dips, If the domain is found then the domain-dip
1227 	 * (i.e. ddip) will also be found in the same immu_devi struct.
1228 	 * The domain-dip is the highest ancestor dip which shares the
1229 	 * same domain with edip.
1230 	 * The domain may or may not be found, but the domain dip must
1231 	 * be found.
1232 	 */
1233 	if (ddip == NULL) {
1234 		ddi_err(DER_MODE, rdip, "Cannot find domain dip for device.");
1235 		return (NULL);
1236 	}
1237 
1238 	/*
1239 	 * Did we find a domain ?
1240 	 */
1241 	if (domain) {
1242 		goto found;
1243 	}
1244 
1245 	/* nope, so allocate */
1246 	domain = domain_create(immu, ddip, rdip, immu_flags);
1247 	if (domain == NULL) {
1248 		return (NULL);
1249 	}
1250 
1251 	/*FALLTHROUGH*/
1252 found:
1253 	/*
1254 	 * We know *domain *is* the right domain, so panic if
1255 	 * another domain is set for either the request-dip or
1256 	 * effective dip.
1257 	 */
1258 	set_domain(ddip, ddip, domain);
1259 	set_domain(rdip, ddip, domain);
1260 
1261 	*ddipp = ddip;
1262 	return (domain);
1263 }
1264 
1265 static void
1266 create_unity_domain(immu_t *immu)
1267 {
1268 	domain_t *domain;
1269 
1270 	/* domain created during boot and always use sleep flag */
1271 	domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);
1272 
1273 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1274 
1275 	domain->dom_did = IMMU_UNITY_DID;
1276 	domain->dom_maptype = IMMU_MAPTYPE_UNITY;
1277 
1278 	domain->dom_immu = immu;
1279 	immu->immu_unity_domain = domain;
1280 
1281 	/*
1282 	 * Setup the domain's initial page table
1283 	 * should never fail.
1284 	 */
1285 	domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1286 	pgtable_zero(domain->dom_pgtable_root);
1287 
1288 	/*
1289 	 * Only map all physical memory in to the unity domain
1290 	 * if passthrough is not supported. If it is supported,
1291 	 * passthrough is set in the context entry instead.
1292 	 */
1293 	if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1294 		map_unity_domain(domain);
1295 
1296 
1297 	/*
1298 	 * put it on the system-wide UNITY domain list
1299 	 */
1300 	mutex_enter(&(immu_domain_lock));
1301 	list_insert_tail(&immu_unity_domain_list, domain);
1302 	mutex_exit(&(immu_domain_lock));
1303 }
1304 
1305 /*
1306  * ddip is the domain-dip - the topmost dip in a domain
1307  * rdip is the requesting-dip - the device which is
1308  * requesting DVMA setup
1309  * if domain is a non-shared domain rdip == ddip
1310  */
1311 static domain_t *
1312 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
1313     immu_flags_t immu_flags)
1314 {
1315 	int kmflags;
1316 	domain_t *domain;
1317 	char mod_hash_name[128];
1318 	immu_devi_t *immu_devi;
1319 	int did;
1320 	immu_dcookie_t dcookies[1] = {0};
1321 	int dcount = 0;
1322 
1323 	immu_devi = immu_devi_get(rdip);
1324 
1325 	/*
1326 	 * First allocate a domainid.
1327 	 * This routine will never fail, since if we run out
1328 	 * of domains the unity domain will be allocated.
1329 	 */
1330 	did = did_alloc(immu, rdip, ddip, immu_flags);
1331 	if (did == IMMU_UNITY_DID) {
1332 		/* domain overflow */
1333 		ASSERT(immu->immu_unity_domain);
1334 		return (immu->immu_unity_domain);
1335 	}
1336 
1337 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1338 	domain = kmem_zalloc(sizeof (domain_t), kmflags);
1339 	if (domain == NULL) {
1340 		ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
1341 		    "structure for device. IOMMU unit: %s", immu->immu_name);
1342 		/*NOTREACHED*/
1343 	}
1344 
1345 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1346 
1347 	(void) snprintf(mod_hash_name, sizeof (mod_hash_name),
1348 	    "immu%s-domain%d-pava-hash", immu->immu_name, did);
1349 
1350 	domain->dom_did = did;
1351 	domain->dom_immu = immu;
1352 	domain->dom_maptype = IMMU_MAPTYPE_XLATE;
1353 	domain->dom_dip = ddip;
1354 
1355 	/*
1356 	 * Create xlate DVMA arena for this domain.
1357 	 */
1358 	create_xlate_arena(immu, domain, rdip, immu_flags);
1359 
1360 	/*
1361 	 * Setup the domain's initial page table
1362 	 */
1363 	domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags);
1364 	if (domain->dom_pgtable_root == NULL) {
1365 		ddi_err(DER_PANIC, rdip, "Failed to alloc root "
1366 		    "pgtable for domain (%d). IOMMU unit: %s",
1367 		    domain->dom_did, immu->immu_name);
1368 		/*NOTREACHED*/
1369 	}
1370 	pgtable_zero(domain->dom_pgtable_root);
1371 
1372 	/*
1373 	 * Since this is a immu unit-specific domain, put it on
1374 	 * the per-immu domain list.
1375 	 */
1376 	mutex_enter(&(immu->immu_lock));
1377 	list_insert_head(&immu->immu_domain_list, domain);
1378 	mutex_exit(&(immu->immu_lock));
1379 
1380 	/*
1381 	 * Also put it on the system-wide xlate domain list
1382 	 */
1383 	mutex_enter(&(immu_domain_lock));
1384 	list_insert_head(&immu_xlate_domain_list, domain);
1385 	mutex_exit(&(immu_domain_lock));
1386 
1387 	bdf_domain_insert(immu_devi, domain);
1388 
1389 #ifdef BUGGY_DRIVERS
1390 	/*
1391 	 * Map page0. Some broken HW/FW access it.
1392 	 */
1393 	dcookies[0].dck_paddr = 0;
1394 	dcookies[0].dck_npages = 1;
1395 	dcount = 1;
1396 	(void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
1397 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
1398 #endif
1399 	return (domain);
1400 }
1401 
1402 /*
1403  * Create domainid arena.
1404  * Domainid 0 is reserved by Vt-d spec and cannot be used by
1405  * system software.
1406  * Domainid 1 is reserved by solaris and used for *all* of the following:
1407  *	as the "uninitialized" domain - For devices not yet controlled
1408  *	by Solaris
1409  *	as the "unity" domain - For devices that will always belong
1410  *	to the unity domain
1411  *	as the "overflow" domain - Used for any new device after we
1412  *	run out of domains
1413  * All of the above domains map into a single domain with
1414  * domainid 1 and UNITY DVMA mapping
1415  * Each IMMU unity has its own unity/uninit/overflow domain
1416  */
1417 static void
1418 did_init(immu_t *immu)
1419 {
1420 	(void) snprintf(immu->immu_did_arena_name,
1421 	    sizeof (immu->immu_did_arena_name),
1422 	    "%s_domainid_arena", immu->immu_name);
1423 
1424 	ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s",
1425 	    immu->immu_did_arena_name);
1426 
1427 	immu->immu_did_arena = vmem_create(
1428 	    immu->immu_did_arena_name,
1429 	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),   /* start addr */
1430 	    immu->immu_max_domains - IMMU_UNITY_DID,
1431 	    1,				/* quantum */
1432 	    NULL,			/* afunc */
1433 	    NULL,			/* ffunc */
1434 	    NULL,			/* source */
1435 	    0,				/* qcache_max */
1436 	    VM_SLEEP);
1437 
1438 	/* Even with SLEEP flag, vmem_create() can fail */
1439 	if (immu->immu_did_arena == NULL) {
1440 		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
1441 		    "IOMMU domainid allocator: %s", immu->immu_name,
1442 		    immu->immu_did_arena_name);
1443 	}
1444 }
1445 
1446 /* #########################  CONTEXT CODE ################################# */
1447 
1448 static void
1449 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
1450     int bus, int devfunc)
1451 {
1452 	pgtable_t *context;
1453 	pgtable_t *pgtable_root;
1454 	hw_rce_t *hw_rent;
1455 	hw_rce_t *hw_cent;
1456 	hw_rce_t *ctxp;
1457 	int sid;
1458 	krw_t rwtype;
1459 	boolean_t fill_root;
1460 	boolean_t fill_ctx;
1461 
1462 	pgtable_root = domain->dom_pgtable_root;
1463 
1464 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1465 	context = *(pgtable_t **)(ctxp + bus);
1466 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;
1467 
1468 	fill_root = B_FALSE;
1469 	fill_ctx = B_FALSE;
1470 
1471 	/* Check the most common case first with reader lock */
1472 	rw_enter(&(immu->immu_ctx_rwlock), RW_READER);
1473 	rwtype = RW_READER;
1474 again:
1475 	if (ROOT_GET_P(hw_rent)) {
1476 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1477 		if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) {
1478 			rw_exit(&(immu->immu_ctx_rwlock));
1479 			return;
1480 		} else {
1481 			fill_ctx = B_TRUE;
1482 		}
1483 	} else {
1484 		fill_root = B_TRUE;
1485 		fill_ctx = B_TRUE;
1486 	}
1487 
1488 	if (rwtype == RW_READER &&
1489 	    rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) {
1490 		rw_exit(&(immu->immu_ctx_rwlock));
1491 		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1492 		rwtype = RW_WRITER;
1493 		goto again;
1494 	}
1495 	rwtype = RW_WRITER;
1496 
1497 	if (fill_root == B_TRUE) {
1498 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1499 		ROOT_SET_P(hw_rent);
1500 		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
1501 	}
1502 
1503 	if (fill_ctx == B_TRUE) {
1504 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1505 		/* need to disable context entry before reprogramming it */
1506 		bzero(hw_cent, sizeof (hw_rce_t));
1507 
1508 		/* flush caches */
1509 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1510 
1511 		sid = ((bus << 8) | devfunc);
1512 		immu_flush_context_fsi(immu, 0, sid, domain->dom_did,
1513 		    &immu->immu_ctx_inv_wait);
1514 
1515 		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
1516 		CONT_SET_DID(hw_cent, domain->dom_did);
1517 		CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1518 		CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1519 		if (domain->dom_did == IMMU_UNITY_DID &&
1520 		    IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1521 			CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1522 		else
1523 			/*LINTED*/
1524 			CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1525 		CONT_SET_P(hw_cent);
1526 		if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) {
1527 			CONT_SET_EH(hw_cent);
1528 			if (immu_use_alh)
1529 				CONT_SET_ALH(hw_cent);
1530 		}
1531 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1532 	}
1533 	rw_exit(&(immu->immu_ctx_rwlock));
1534 }
1535 
1536 static pgtable_t *
1537 context_create(immu_t *immu)
1538 {
1539 	int	bus;
1540 	int	devfunc;
1541 	pgtable_t *root_table;
1542 	pgtable_t *context;
1543 	pgtable_t *pgtable_root;
1544 	hw_rce_t *ctxp;
1545 	hw_rce_t *hw_rent;
1546 	hw_rce_t *hw_cent;
1547 
1548 	/* Allocate a zeroed root table (4K 256b entries) */
1549 	root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1550 	pgtable_zero(root_table);
1551 
1552 	/*
1553 	 * Setup context tables for all possible root table entries.
1554 	 * Start out with unity domains for all entries.
1555 	 */
1556 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1557 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
1558 	for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
1559 		context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1560 		pgtable_zero(context);
1561 		ROOT_SET_P(hw_rent);
1562 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1563 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
1564 		for (devfunc = 0; devfunc < IMMU_CONT_NUM;
1565 		    devfunc++, hw_cent++) {
1566 			pgtable_root =
1567 			    immu->immu_unity_domain->dom_pgtable_root;
1568 			CONT_SET_DID(hw_cent,
1569 			    immu->immu_unity_domain->dom_did);
1570 			CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1571 			CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1572 			if (IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1573 				CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1574 			else
1575 				/*LINTED*/
1576 				CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1577 			CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
1578 			CONT_SET_P(hw_cent);
1579 		}
1580 		immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
1581 		*((pgtable_t **)ctxp) = context;
1582 	}
1583 
1584 	return (root_table);
1585 }
1586 
1587 /*
1588  * Called during rootnex attach, so no locks needed
1589  */
1590 static void
1591 context_init(immu_t *immu)
1592 {
1593 	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
1594 
1595 	immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE);
1596 
1597 	immu_regs_wbf_flush(immu);
1598 
1599 	immu->immu_ctx_root = context_create(immu);
1600 
1601 	immu_regs_set_root_table(immu);
1602 
1603 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1604 	immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait);
1605 	immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait);
1606 	rw_exit(&(immu->immu_ctx_rwlock));
1607 }
1608 
1609 
1610 /*
1611  * Find top pcib
1612  */
1613 static int
1614 find_top_pcib(dev_info_t *dip, void *arg)
1615 {
1616 	immu_devi_t *immu_devi;
1617 	dev_info_t **pcibdipp = (dev_info_t **)arg;
1618 
1619 	immu_devi = immu_devi_get(dip);
1620 
1621 	if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
1622 		*pcibdipp = dip;
1623 	}
1624 
1625 	return (DDI_WALK_CONTINUE);
1626 }
1627 
1628 static int
1629 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
1630     dev_info_t *rdip, immu_flags_t immu_flags)
1631 {
1632 	immu_devi_t *r_immu_devi;
1633 	immu_devi_t *d_immu_devi;
1634 	int r_bus;
1635 	int d_bus;
1636 	int r_devfunc;
1637 	int d_devfunc;
1638 	immu_pcib_t d_pcib_type;
1639 	dev_info_t *pcibdip;
1640 
1641 	if (ddip == NULL || rdip == NULL ||
1642 	    ddip == root_devinfo || rdip == root_devinfo) {
1643 		ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or "
1644 		    "request-dip are NULL or are root devinfo");
1645 		return (DDI_FAILURE);
1646 	}
1647 
1648 	/*
1649 	 * We need to set the context fields
1650 	 * based on what type of device rdip and ddip are.
1651 	 * To do that we need the immu_devi field.
1652 	 * Set the immu_devi field (if not already set)
1653 	 */
1654 	if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
1655 		ddi_err(DER_MODE, rdip,
1656 		    "immu_context_update: failed to set immu_devi for ddip");
1657 		return (DDI_FAILURE);
1658 	}
1659 
1660 	if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
1661 		ddi_err(DER_MODE, rdip,
1662 		    "immu_context_update: failed to set immu_devi for rdip");
1663 		return (DDI_FAILURE);
1664 	}
1665 
1666 	d_immu_devi = immu_devi_get(ddip);
1667 	r_immu_devi = immu_devi_get(rdip);
1668 
1669 	d_bus = d_immu_devi->imd_bus;
1670 	d_devfunc = d_immu_devi->imd_devfunc;
1671 	d_pcib_type = d_immu_devi->imd_pcib_type;
1672 	r_bus = r_immu_devi->imd_bus;
1673 	r_devfunc = r_immu_devi->imd_devfunc;
1674 
1675 	if (rdip == ddip) {
1676 		/* rdip is a PCIE device. set context for it only */
1677 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1678 		    r_devfunc);
1679 #ifdef BUGGY_DRIVERS
1680 	} else if (r_immu_devi == d_immu_devi) {
1681 #ifdef TEST
1682 		ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
1683 		    "0x%lx are identical", rdip, ddip);
1684 #endif
1685 		/* rdip is a PCIE device. set context for it only */
1686 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1687 		    r_devfunc);
1688 #endif
1689 	} else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
1690 		/*
1691 		 * ddip is a PCIE_PCI bridge. Set context for ddip's
1692 		 * secondary bus. If rdip is on ddip's secondary
1693 		 * bus, set context for rdip. Else, set context
1694 		 * for rdip's PCI bridge on ddip's secondary bus.
1695 		 */
1696 		context_set(immu, domain, immu->immu_ctx_root,
1697 		    d_immu_devi->imd_sec, 0);
1698 		if (d_immu_devi->imd_sec == r_bus) {
1699 			context_set(immu, domain, immu->immu_ctx_root,
1700 			    r_bus, r_devfunc);
1701 		} else {
1702 			pcibdip = NULL;
1703 			if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
1704 			    &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
1705 			    pcibdip != NULL) {
1706 				r_immu_devi = immu_devi_get(pcibdip);
1707 				r_bus = r_immu_devi->imd_bus;
1708 				r_devfunc = r_immu_devi->imd_devfunc;
1709 				context_set(immu, domain, immu->immu_ctx_root,
1710 				    r_bus, r_devfunc);
1711 			} else {
1712 				ddi_err(DER_PANIC, rdip, "Failed to find PCI "
1713 				    " bridge for PCI device");
1714 				/*NOTREACHED*/
1715 			}
1716 		}
1717 	} else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
1718 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1719 		    d_devfunc);
1720 	} else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
1721 		/*
1722 		 * ddip is a PCIE device which has a non-PCI device under it
1723 		 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
1724 		 */
1725 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1726 		    d_devfunc);
1727 	} else {
1728 		ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
1729 		    "set iommu context.");
1730 		/*NOTREACHED*/
1731 	}
1732 
1733 	/* XXX do we need a membar_producer() here */
1734 	return (DDI_SUCCESS);
1735 }
1736 
1737 /* ##################### END CONTEXT CODE ################################## */
1738 /* ##################### MAPPING CODE ################################## */
1739 
1740 
1741 #ifdef DEBUG
1742 static boolean_t
1743 PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
1744     dev_info_t *rdip, immu_flags_t immu_flags)
1745 {
1746 	/* The PDTE must be set i.e. present bit is set */
1747 	if (!PDTE_P(pdte)) {
1748 		ddi_err(DER_MODE, rdip, "No present flag");
1749 		return (B_FALSE);
1750 	}
1751 
1752 	/*
1753 	 * Just assert to check most significant system software field
1754 	 * (PDTE_SW4) as it is same as present bit and we
1755 	 * checked that above
1756 	 */
1757 	ASSERT(PDTE_SW4(pdte));
1758 
1759 	/*
1760 	 * TM field should be clear if not reserved.
1761 	 * non-leaf is always reserved
1762 	 */
1763 	if (next == NULL && immu->immu_TM_reserved == B_FALSE) {
1764 		if (PDTE_TM(pdte)) {
1765 			ddi_err(DER_MODE, rdip, "TM flag set");
1766 			return (B_FALSE);
1767 		}
1768 	}
1769 
1770 	/*
1771 	 * The SW3 field is not used and must be clear
1772 	 */
1773 	if (PDTE_SW3(pdte)) {
1774 		ddi_err(DER_MODE, rdip, "SW3 set");
1775 		return (B_FALSE);
1776 	}
1777 
1778 	/*
1779 	 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set
1780 	 */
1781 	if (next == NULL) {
1782 		ASSERT(paddr % IMMU_PAGESIZE == 0);
1783 		if (PDTE_PADDR(pdte) != paddr) {
1784 			ddi_err(DER_MODE, rdip,
1785 			    "PTE paddr mismatch: %lx != %lx",
1786 			    PDTE_PADDR(pdte), paddr);
1787 			return (B_FALSE);
1788 		}
1789 	} else {
1790 		if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
1791 			ddi_err(DER_MODE, rdip,
1792 			    "PDE paddr mismatch: %lx != %lx",
1793 			    PDTE_PADDR(pdte), next->hwpg_paddr);
1794 			return (B_FALSE);
1795 		}
1796 	}
1797 
1798 	/*
1799 	 * SNP field should be clear if not reserved.
1800 	 * non-leaf is always reserved
1801 	 */
1802 	if (next == NULL && immu->immu_SNP_reserved == B_FALSE) {
1803 		if (PDTE_SNP(pdte)) {
1804 			ddi_err(DER_MODE, rdip, "SNP set");
1805 			return (B_FALSE);
1806 		}
1807 	}
1808 
1809 	/* second field available for system software should be clear */
1810 	if (PDTE_SW2(pdte)) {
1811 		ddi_err(DER_MODE, rdip, "SW2 set");
1812 		return (B_FALSE);
1813 	}
1814 
1815 	/* Super pages field should be clear */
1816 	if (PDTE_SP(pdte)) {
1817 		ddi_err(DER_MODE, rdip, "SP set");
1818 		return (B_FALSE);
1819 	}
1820 
1821 	/*
1822 	 * least significant field available for
1823 	 * system software should be clear
1824 	 */
1825 	if (PDTE_SW1(pdte)) {
1826 		ddi_err(DER_MODE, rdip, "SW1 set");
1827 		return (B_FALSE);
1828 	}
1829 
1830 	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
1831 		ddi_err(DER_MODE, rdip, "READ not set");
1832 		return (B_FALSE);
1833 	}
1834 
1835 	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
1836 		ddi_err(DER_MODE, rdip, "WRITE not set");
1837 		return (B_FALSE);
1838 	}
1839 
1840 	return (B_TRUE);
1841 }
1842 #endif
1843 
1844 /*ARGSUSED*/
1845 static void
1846 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
1847     uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip)
1848 {
1849 	uint64_t npages;
1850 	uint64_t dvma;
1851 	pgtable_t *pgtable;
1852 	hw_pdte_t *hwp;
1853 	hw_pdte_t *shwp;
1854 	int idx;
1855 
1856 	pgtable = xlate->xlt_pgtable;
1857 	idx = xlate->xlt_idx;
1858 
1859 	dvma = *dvma_ptr;
1860 	npages = *npages_ptr;
1861 
1862 	/*
1863 	 * since a caller gets a unique dvma for a physical address,
1864 	 * no other concurrent thread will be writing to the same
1865 	 * PTE even if it has the same paddr. So no locks needed.
1866 	 */
1867 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
1868 
1869 	hwp = shwp;
1870 	for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
1871 		PDTE_CLEAR_P(*hwp);
1872 		dvma += IMMU_PAGESIZE;
1873 		npages--;
1874 	}
1875 
1876 	*dvma_ptr = dvma;
1877 	*npages_ptr = npages;
1878 
1879 	xlate->xlt_idx = idx;
1880 }
1881 
1882 static void
1883 xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels)
1884 {
1885 	int level;
1886 	uint64_t offbits;
1887 
1888 	/*
1889 	 * Skip the first 12 bits which is the offset into
1890 	 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
1891 	 */
1892 	offbits = dvma >> IMMU_PAGESHIFT;
1893 
1894 	/* skip to level 1 i.e. leaf PTE */
1895 	for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
1896 		xlate->xlt_level = level;
1897 		xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
1898 		ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
1899 		xlate->xlt_pgtable = NULL;
1900 		offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
1901 	}
1902 }
1903 
1904 /*
1905  * Read the pgtables
1906  */
1907 static boolean_t
1908 PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels)
1909 {
1910 	pgtable_t *pgtable;
1911 	pgtable_t *next;
1912 	uint_t idx;
1913 
1914 	/* start with highest level pgtable i.e. root */
1915 	xlate += nlevels;
1916 
1917 	if (xlate->xlt_pgtable == NULL) {
1918 		xlate->xlt_pgtable = domain->dom_pgtable_root;
1919 	}
1920 
1921 	for (; xlate->xlt_level > 1; xlate--) {
1922 		idx = xlate->xlt_idx;
1923 		pgtable = xlate->xlt_pgtable;
1924 
1925 		if ((xlate - 1)->xlt_pgtable) {
1926 			continue;
1927 		}
1928 
1929 		/* Lock the pgtable in read mode */
1930 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
1931 
1932 		/*
1933 		 * since we are unmapping, the pgtable should
1934 		 * already point to a leafier pgtable.
1935 		 */
1936 		next = *(pgtable->swpg_next_array + idx);
1937 		(xlate - 1)->xlt_pgtable = next;
1938 		rw_exit(&(pgtable->swpg_rwlock));
1939 		if (next == NULL)
1940 			return (B_FALSE);
1941 	}
1942 
1943 	return (B_TRUE);
1944 }
1945 
1946 static void
1947 immu_fault_walk(void *arg, void *base, size_t len)
1948 {
1949 	uint64_t dvma, start;
1950 
1951 	dvma = *(uint64_t *)arg;
1952 	start = (uint64_t)(uintptr_t)base;
1953 
1954 	if (dvma >= start && dvma < (start + len)) {
1955 		ddi_err(DER_WARN, NULL,
1956 		    "faulting DVMA address is in vmem arena "
1957 		    "(%" PRIx64 "-%" PRIx64 ")",
1958 		    start, start + len);
1959 		*(uint64_t *)arg = ~0ULL;
1960 	}
1961 }
1962 
1963 void
1964 immu_print_fault_info(uint_t sid, uint64_t dvma)
1965 {
1966 	int nlevels;
1967 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
1968 	xlate_t *xlatep;
1969 	hw_pdte_t pte;
1970 	domain_t *domain;
1971 	immu_t *immu;
1972 	uint64_t dvma_arg;
1973 
1974 	if (mod_hash_find(bdf_domain_hash,
1975 	    (void *)(uintptr_t)sid, (void *)&domain) != 0) {
1976 		ddi_err(DER_WARN, NULL,
1977 		    "no domain for faulting SID %08x", sid);
1978 		return;
1979 	}
1980 
1981 	immu = domain->dom_immu;
1982 
1983 	dvma_arg = dvma;
1984 	vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk,
1985 	    (void *)&dvma_arg);
1986 	if (dvma_arg != ~0ULL)
1987 		ddi_err(DER_WARN, domain->dom_dip,
1988 		    "faulting DVMA address is not in vmem arena");
1989 
1990 	nlevels = immu->immu_dvma_nlevels;
1991 	xlate_setup(dvma, xlate, nlevels);
1992 
1993 	if (!PDE_lookup(domain, xlate, nlevels)) {
1994 		ddi_err(DER_WARN, domain->dom_dip,
1995 		    "pte not found in domid %d for faulting addr %" PRIx64,
1996 		    domain->dom_did, dvma);
1997 		return;
1998 	}
1999 
2000 	xlatep = &xlate[1];
2001 	pte = *((hw_pdte_t *)
2002 	    (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx);
2003 
2004 	ddi_err(DER_WARN, domain->dom_dip,
2005 	    "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did,
2006 	    (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte));
2007 }
2008 
2009 /*ARGSUSED*/
2010 static void
2011 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
2012     dev_info_t *rdip, immu_flags_t immu_flags)
2013 {
2014 	hw_pdte_t pte;
2015 
2016 #ifndef DEBUG
2017 	pte = immu->immu_ptemask;
2018 	PDTE_SET_PADDR(pte, paddr);
2019 #else
2020 	pte = *hwp;
2021 
2022 	if (PDTE_P(pte)) {
2023 		if (PDTE_PADDR(pte) != paddr) {
2024 			ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
2025 			    PDTE_PADDR(pte), paddr);
2026 		}
2027 #ifdef BUGGY_DRIVERS
2028 		return;
2029 #else
2030 		goto out;
2031 #endif
2032 	}
2033 
2034 	/* clear TM field if not reserved */
2035 	if (immu->immu_TM_reserved == B_FALSE) {
2036 		PDTE_CLEAR_TM(pte);
2037 	}
2038 
2039 	/* Clear 3rd field for system software  - not used */
2040 	PDTE_CLEAR_SW3(pte);
2041 
2042 	/* Set paddr */
2043 	ASSERT(paddr % IMMU_PAGESIZE == 0);
2044 	PDTE_CLEAR_PADDR(pte);
2045 	PDTE_SET_PADDR(pte, paddr);
2046 
2047 	/*  clear SNP field if not reserved. */
2048 	if (immu->immu_SNP_reserved == B_FALSE) {
2049 		PDTE_CLEAR_SNP(pte);
2050 	}
2051 
2052 	/* Clear SW2 field available for software */
2053 	PDTE_CLEAR_SW2(pte);
2054 
2055 
2056 	/* SP is don't care for PTEs. Clear it for cleanliness */
2057 	PDTE_CLEAR_SP(pte);
2058 
2059 	/* Clear SW1 field available for software */
2060 	PDTE_CLEAR_SW1(pte);
2061 
2062 	/*
2063 	 * Now that we are done writing the PTE
2064 	 * set the "present" flag. Note this present
2065 	 * flag is a bit in the PDE/PTE that the
2066 	 * spec says is available for system software.
2067 	 * This is an implementation detail of Solaris
2068 	 * bare-metal Intel IOMMU.
2069 	 * The present field in a PDE/PTE is not defined
2070 	 * by the Vt-d spec
2071 	 */
2072 
2073 	PDTE_SET_P(pte);
2074 
2075 	pte |= immu->immu_ptemask;
2076 
2077 out:
2078 #endif /* DEBUG */
2079 #ifdef BUGGY_DRIVERS
2080 	PDTE_SET_READ(pte);
2081 	PDTE_SET_WRITE(pte);
2082 #else
2083 	if (immu_flags & IMMU_FLAGS_READ)
2084 		PDTE_SET_READ(pte);
2085 	if (immu_flags & IMMU_FLAGS_WRITE)
2086 		PDTE_SET_WRITE(pte);
2087 #endif /* BUGGY_DRIVERS */
2088 
2089 	*hwp = pte;
2090 }
2091 
2092 /*ARGSUSED*/
2093 static void
2094 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
2095     uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies,
2096     int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
2097 {
2098 	paddr_t paddr;
2099 	uint64_t nvpages;
2100 	uint64_t nppages;
2101 	uint64_t dvma;
2102 	pgtable_t *pgtable;
2103 	hw_pdte_t *hwp;
2104 	hw_pdte_t *shwp;
2105 	int idx, nset;
2106 	int j;
2107 
2108 	pgtable = xlate->xlt_pgtable;
2109 	idx = xlate->xlt_idx;
2110 
2111 	dvma = *dvma_ptr;
2112 	nvpages = *nvpages_ptr;
2113 
2114 	/*
2115 	 * since a caller gets a unique dvma for a physical address,
2116 	 * no other concurrent thread will be writing to the same
2117 	 * PTE even if it has the same paddr. So no locks needed.
2118 	 */
2119 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2120 
2121 	hwp = shwp;
2122 	for (j = dcount - 1; j >= 0; j--) {
2123 		if (nvpages <= dcookies[j].dck_npages)
2124 			break;
2125 		nvpages -= dcookies[j].dck_npages;
2126 	}
2127 
2128 	nppages = nvpages;
2129 	paddr = dcookies[j].dck_paddr +
2130 	    (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE;
2131 
2132 	nvpages = *nvpages_ptr;
2133 	nset = 0;
2134 	for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
2135 		PTE_set_one(immu, hwp, paddr, rdip, immu_flags);
2136 		nset++;
2137 
2138 		ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
2139 		    == B_TRUE);
2140 		nppages--;
2141 		nvpages--;
2142 		paddr += IMMU_PAGESIZE;
2143 		dvma += IMMU_PAGESIZE;
2144 
2145 		if (nppages == 0) {
2146 			j++;
2147 		}
2148 
2149 		if (j == dcount)
2150 			break;
2151 
2152 		if (nppages == 0) {
2153 			nppages = dcookies[j].dck_npages;
2154 			paddr = dcookies[j].dck_paddr;
2155 		}
2156 	}
2157 
2158 	if (nvpages) {
2159 		*dvma_ptr = dvma;
2160 		*nvpages_ptr = nvpages;
2161 	} else {
2162 		*dvma_ptr = 0;
2163 		*nvpages_ptr = 0;
2164 	}
2165 
2166 	xlate->xlt_idx = idx;
2167 }
2168 
2169 /*ARGSUSED*/
2170 static void
2171 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
2172     dev_info_t *rdip, immu_flags_t immu_flags)
2173 {
2174 	hw_pdte_t pde;
2175 
2176 	pde = *hwp;
2177 
2178 	/* if PDE is already set, make sure it is correct */
2179 	if (PDTE_P(pde)) {
2180 		ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
2181 #ifdef BUGGY_DRIVERS
2182 		return;
2183 #else
2184 		goto out;
2185 #endif
2186 	}
2187 
2188 	/* Dont touch SW4, it is the present bit */
2189 
2190 	/* don't touch TM field it is reserved for PDEs */
2191 
2192 	/* 3rd field available for system software is not used */
2193 	PDTE_CLEAR_SW3(pde);
2194 
2195 	/* Set next level pgtable-paddr for PDE */
2196 	PDTE_CLEAR_PADDR(pde);
2197 	PDTE_SET_PADDR(pde, next->hwpg_paddr);
2198 
2199 	/* don't touch SNP field it is reserved for PDEs */
2200 
2201 	/* Clear second field available for system software */
2202 	PDTE_CLEAR_SW2(pde);
2203 
2204 	/* No super pages for PDEs */
2205 	PDTE_CLEAR_SP(pde);
2206 
2207 	/* Clear SW1 for software */
2208 	PDTE_CLEAR_SW1(pde);
2209 
2210 	/*
2211 	 * Now that we are done writing the PDE
2212 	 * set the "present" flag. Note this present
2213 	 * flag is a bit in the PDE/PTE that the
2214 	 * spec says is available for system software.
2215 	 * This is an implementation detail of Solaris
2216 	 * base-metal Intel IOMMU.
2217 	 * The present field in a PDE/PTE is not defined
2218 	 * by the Vt-d spec
2219 	 */
2220 
2221 out:
2222 #ifdef  BUGGY_DRIVERS
2223 	PDTE_SET_READ(pde);
2224 	PDTE_SET_WRITE(pde);
2225 #else
2226 	if (immu_flags & IMMU_FLAGS_READ)
2227 		PDTE_SET_READ(pde);
2228 	if (immu_flags & IMMU_FLAGS_WRITE)
2229 		PDTE_SET_WRITE(pde);
2230 #endif
2231 
2232 	PDTE_SET_P(pde);
2233 
2234 	*hwp = pde;
2235 }
2236 
2237 /*
2238  * Used to set PDEs
2239  */
2240 static boolean_t
2241 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
2242     dev_info_t *rdip, immu_flags_t immu_flags)
2243 {
2244 	pgtable_t *pgtable;
2245 	pgtable_t *new;
2246 	pgtable_t *next;
2247 	hw_pdte_t *hwp;
2248 	int level;
2249 	uint_t idx;
2250 	krw_t rwtype;
2251 	boolean_t set = B_FALSE;
2252 
2253 	/* start with highest level pgtable i.e. root */
2254 	xlate += nlevels;
2255 
2256 	new = NULL;
2257 	xlate->xlt_pgtable = domain->dom_pgtable_root;
2258 	for (level = nlevels; level > 1; level--, xlate--) {
2259 		idx = xlate->xlt_idx;
2260 		pgtable = xlate->xlt_pgtable;
2261 
2262 		/* Lock the pgtable in READ mode first */
2263 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
2264 		rwtype = RW_READER;
2265 again:
2266 		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2267 		next = (pgtable->swpg_next_array)[idx];
2268 
2269 		/*
2270 		 * check if leafier level already has a pgtable
2271 		 * if yes, verify
2272 		 */
2273 		if (next == NULL) {
2274 			if (new == NULL) {
2275 
2276 				IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *,
2277 				    rdip, int, level);
2278 
2279 				new = pgtable_alloc(immu, immu_flags);
2280 				if (new == NULL) {
2281 					ddi_err(DER_PANIC, rdip,
2282 					    "pgtable alloc err");
2283 				}
2284 				pgtable_zero(new);
2285 			}
2286 
2287 			/* Change to a write lock */
2288 			if (rwtype == RW_READER &&
2289 			    rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) {
2290 				rw_exit(&(pgtable->swpg_rwlock));
2291 				rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
2292 				rwtype = RW_WRITER;
2293 				goto again;
2294 			}
2295 			rwtype = RW_WRITER;
2296 			next = new;
2297 			(pgtable->swpg_next_array)[idx] = next;
2298 			new = NULL;
2299 			PDE_set_one(immu, hwp, next, rdip, immu_flags);
2300 			set = B_TRUE;
2301 			rw_downgrade(&(pgtable->swpg_rwlock));
2302 			rwtype = RW_READER;
2303 		}
2304 #ifndef  BUGGY_DRIVERS
2305 		else {
2306 			hw_pdte_t pde = *hwp;
2307 
2308 			/*
2309 			 * If buggy driver we already set permission
2310 			 * READ+WRITE so nothing to do for that case
2311 			 * XXX Check that read writer perms change before
2312 			 * actually setting perms. Also need to hold lock
2313 			 */
2314 			if (immu_flags & IMMU_FLAGS_READ)
2315 				PDTE_SET_READ(pde);
2316 			if (immu_flags & IMMU_FLAGS_WRITE)
2317 				PDTE_SET_WRITE(pde);
2318 
2319 			*hwp = pde;
2320 		}
2321 #endif
2322 
2323 		ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
2324 		    == B_TRUE);
2325 
2326 		(xlate - 1)->xlt_pgtable = next;
2327 		rw_exit(&(pgtable->swpg_rwlock));
2328 	}
2329 
2330 	if (new) {
2331 		pgtable_free(immu, new);
2332 	}
2333 
2334 	return (set);
2335 }
2336 
2337 /*
2338  * dvma_map()
2339  *     map a contiguous range of DVMA pages
2340  *
2341  *     immu: IOMMU unit for which we are generating DVMA cookies
2342  *   domain: domain
2343  *    sdvma: Starting dvma
2344  *   spaddr: Starting paddr
2345  *   npages: Number of pages
2346  *     rdip: requesting device
2347  *     immu_flags: flags
2348  */
2349 static boolean_t
2350 dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages,
2351     immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
2352     immu_flags_t immu_flags)
2353 {
2354 	uint64_t dvma;
2355 	uint64_t n;
2356 	immu_t *immu = domain->dom_immu;
2357 	int nlevels = immu->immu_dvma_nlevels;
2358 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2359 	boolean_t pde_set = B_FALSE;
2360 
2361 	n = snvpages;
2362 	dvma = sdvma;
2363 
2364 	while (n > 0) {
2365 		xlate_setup(dvma, xlate, nlevels);
2366 
2367 		/* Lookup or allocate PGDIRs and PGTABLEs if necessary */
2368 		if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags)
2369 		    == B_TRUE) {
2370 			pde_set = B_TRUE;
2371 		}
2372 
2373 		/* set all matching ptes that fit into this leaf pgtable */
2374 		PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies,
2375 		    dcount, rdip, immu_flags);
2376 	}
2377 
2378 	return (pde_set);
2379 }
2380 
2381 /*
2382  * dvma_unmap()
2383  *   unmap a range of DVMAs
2384  *
2385  * immu: IOMMU unit state
2386  * domain: domain for requesting device
2387  * ddip: domain-dip
2388  * dvma: starting DVMA
2389  * npages: Number of IMMU pages to be unmapped
2390  * rdip: requesting device
2391  */
2392 static void
2393 dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages,
2394     dev_info_t *rdip)
2395 {
2396 	immu_t *immu = domain->dom_immu;
2397 	int nlevels = immu->immu_dvma_nlevels;
2398 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2399 	uint64_t n;
2400 	uint64_t dvma;
2401 
2402 	dvma = sdvma;
2403 	n = snpages;
2404 
2405 	while (n > 0) {
2406 		/* setup the xlate array */
2407 		xlate_setup(dvma, xlate, nlevels);
2408 
2409 		/* just lookup existing pgtables. Should never fail */
2410 		if (!PDE_lookup(domain, xlate, nlevels))
2411 			ddi_err(DER_PANIC, rdip,
2412 			    "PTE not found for addr %" PRIx64,
2413 			    (unsigned long long)dvma);
2414 
2415 		/* clear all matching ptes that fit into this leaf pgtable */
2416 		PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip);
2417 	}
2418 
2419 	/* No need to flush IOTLB after unmap */
2420 }
2421 
2422 static uint64_t
2423 dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf)
2424 {
2425 	uint64_t dvma;
2426 	size_t xsize, align;
2427 	uint64_t minaddr, maxaddr;
2428 
2429 	/* parameters */
2430 	xsize = npages * IMMU_PAGESIZE;
2431 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2432 	minaddr = dma_attr->dma_attr_addr_lo;
2433 	maxaddr = dma_attr->dma_attr_addr_hi + 1;
2434 
2435 	/* handle the rollover cases */
2436 	if (maxaddr < dma_attr->dma_attr_addr_hi) {
2437 		maxaddr = dma_attr->dma_attr_addr_hi;
2438 	}
2439 
2440 	/*
2441 	 * allocate from vmem arena.
2442 	 */
2443 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2444 	    xsize, align, 0, 0, (void *)(uintptr_t)minaddr,
2445 	    (void *)(uintptr_t)maxaddr, kmf);
2446 
2447 	return (dvma);
2448 }
2449 
2450 static void
2451 dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr)
2452 {
2453 	int nlevels;
2454 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp;
2455 	uint64_t dvma, n;
2456 	size_t xsize, align;
2457 	uint64_t minaddr, maxaddr, dmamax;
2458 	int on, npte, pindex;
2459 	hw_pdte_t *shwp;
2460 	immu_t *immu;
2461 	domain_t *domain;
2462 
2463 	/* parameters */
2464 	domain = IMMU_DEVI(rdip)->imd_domain;
2465 	immu = domain->dom_immu;
2466 	nlevels = immu->immu_dvma_nlevels;
2467 	xsize = IMMU_NPREPTES * IMMU_PAGESIZE;
2468 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2469 	minaddr = dma_attr->dma_attr_addr_lo;
2470 	if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG)
2471 		dmamax = dma_attr->dma_attr_seg;
2472 	else
2473 		dmamax = dma_attr->dma_attr_addr_hi;
2474 	maxaddr = dmamax + 1;
2475 
2476 	if (maxaddr < dmamax)
2477 		maxaddr = dmamax;
2478 
2479 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2480 	    xsize, align, 0, dma_attr->dma_attr_seg + 1,
2481 	    (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP);
2482 
2483 	ihp->ihp_predvma = dvma;
2484 	ihp->ihp_npremapped = 0;
2485 	if (dvma == 0)
2486 		return;
2487 
2488 	n = IMMU_NPREPTES;
2489 	pindex = 0;
2490 
2491 	/*
2492 	 * Set up a mapping at address 0, just so that all PDPs get allocated
2493 	 * now. Although this initial mapping should never be used,
2494 	 * explicitly set it to read-only, just to be safe.
2495 	 */
2496 	while (n > 0) {
2497 		xlate_setup(dvma, xlate, nlevels);
2498 
2499 		(void) PDE_set_all(immu, domain, xlate, nlevels, rdip,
2500 		    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2501 
2502 		xlp = &xlate[1];
2503 		shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr)
2504 		    + xlp->xlt_idx;
2505 		on = n;
2506 
2507 		PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie,
2508 		    1, rdip, IMMU_FLAGS_READ);
2509 
2510 		npte = on - n;
2511 
2512 		while (npte > 0) {
2513 			ihp->ihp_preptes[pindex++] = shwp;
2514 #ifdef BUGGY_DRIVERS
2515 			PDTE_CLEAR_WRITE(*shwp);
2516 #endif
2517 			shwp++;
2518 			npte--;
2519 		}
2520 	}
2521 }
2522 
2523 static void
2524 dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp)
2525 {
2526 	domain_t *domain;
2527 
2528 	domain = IMMU_DEVI(rdip)->imd_domain;
2529 
2530 	if (ihp->ihp_predvma != 0) {
2531 		dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip);
2532 		vmem_free(domain->dom_dvma_arena,
2533 		    (void *)(uintptr_t)ihp->ihp_predvma,
2534 		    IMMU_NPREPTES * IMMU_PAGESIZE);
2535 	}
2536 }
2537 
2538 static void
2539 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
2540 {
2541 	uint64_t size = npages * IMMU_PAGESIZE;
2542 
2543 	if (domain->dom_maptype != IMMU_MAPTYPE_XLATE)
2544 		return;
2545 
2546 	vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
2547 }
2548 
2549 static int
2550 immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle,
2551     immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq,
2552     ddi_dma_obj_t *dma_out)
2553 {
2554 	domain_t *domain;
2555 	immu_t *immu;
2556 	immu_flags_t immu_flags;
2557 	ddi_dma_atyp_t buftype;
2558 	ddi_dma_obj_t *dmar_object;
2559 	ddi_dma_attr_t *attrp;
2560 	uint64_t offset, paddr, dvma, sdvma, rwmask;
2561 	size_t npages, npgalloc;
2562 	uint_t psize, size, pcnt, dmax;
2563 	page_t **pparray;
2564 	caddr_t vaddr;
2565 	page_t *page;
2566 	struct as *vas;
2567 	immu_dcookie_t *dcookies;
2568 	int pde_set;
2569 
2570 	domain = IMMU_DEVI(rdip)->imd_domain;
2571 	immu = domain->dom_immu;
2572 	immu_flags = dma_to_immu_flags(dmareq);
2573 
2574 	attrp = &((ddi_dma_impl_t *)handle)->dmai_attr;
2575 
2576 	dmar_object = &dmareq->dmar_object;
2577 	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
2578 	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
2579 	buftype = dmar_object->dmao_type;
2580 	size = dmar_object->dmao_size;
2581 
2582 	IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t,
2583 	    buftype, uint_t, size);
2584 
2585 	dcookies = &ihp->ihp_dcookies[0];
2586 
2587 	pcnt = dmax = 0;
2588 
2589 	/* retrieve paddr, psize, offset from dmareq */
2590 	if (buftype == DMA_OTYP_PAGES) {
2591 		page = dmar_object->dmao_obj.pp_obj.pp_pp;
2592 		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
2593 		    MMU_PAGEOFFSET;
2594 		paddr = pfn_to_pa(page->p_pagenum) + offset;
2595 		psize = MIN((MMU_PAGESIZE - offset), size);
2596 		page = page->p_next;
2597 		vas = dmar_object->dmao_obj.virt_obj.v_as;
2598 	} else {
2599 		if (vas == NULL) {
2600 			vas = &kas;
2601 		}
2602 		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
2603 		if (pparray != NULL) {
2604 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
2605 			psize = MIN((MMU_PAGESIZE - offset), size);
2606 			pcnt++;
2607 		} else {
2608 			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat,
2609 			    vaddr)) + offset;
2610 			psize = MIN(size, (MMU_PAGESIZE - offset));
2611 			vaddr += psize;
2612 		}
2613 	}
2614 
2615 	npgalloc = IMMU_BTOPR(size + offset);
2616 
2617 	if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) {
2618 #ifdef BUGGY_DRIVERS
2619 		rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask;
2620 #else
2621 		rwmask = immu->immu_ptemask;
2622 		if (immu_flags & IMMU_FLAGS_READ)
2623 			rwmask |= PDTE_MASK_R;
2624 		if (immu_flags & IMMU_FLAGS_WRITE)
2625 			rwmask |= PDTE_MASK_W;
2626 #endif
2627 #ifdef DEBUG
2628 		rwmask |= PDTE_MASK_P;
2629 #endif
2630 		sdvma = ihp->ihp_predvma;
2631 		ihp->ihp_npremapped = npgalloc;
2632 		*ihp->ihp_preptes[0] =
2633 		    PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask;
2634 	} else {
2635 		ihp->ihp_npremapped = 0;
2636 		sdvma = dvma_alloc(domain, attrp, npgalloc,
2637 		    dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP);
2638 		if (sdvma == 0)
2639 			return (DDI_DMA_NORESOURCES);
2640 
2641 		dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET);
2642 		dcookies[0].dck_npages = 1;
2643 	}
2644 
2645 	IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc,
2646 	    uint64_t, sdvma);
2647 
2648 	dvma = sdvma;
2649 	pde_set = 0;
2650 	npages = 1;
2651 	size -= psize;
2652 	while (size > 0) {
2653 		/* get the size for this page (i.e. partial or full page) */
2654 		psize = MIN(size, MMU_PAGESIZE);
2655 		if (buftype == DMA_OTYP_PAGES) {
2656 			/* get the paddr from the page_t */
2657 			paddr = pfn_to_pa(page->p_pagenum);
2658 			page = page->p_next;
2659 		} else if (pparray != NULL) {
2660 			/* index into the array of page_t's to get the paddr */
2661 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
2662 			pcnt++;
2663 		} else {
2664 			/* call into the VM to get the paddr */
2665 			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr));
2666 			vaddr += psize;
2667 		}
2668 
2669 		npages++;
2670 
2671 		if (ihp->ihp_npremapped > 0) {
2672 			*ihp->ihp_preptes[npages - 1] =
2673 			    PDTE_PADDR(paddr) | rwmask;
2674 		} else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
2675 			dcookies[dmax].dck_npages++;
2676 		} else {
2677 			/* No, we need a new dcookie */
2678 			if (dmax == (IMMU_NDCK - 1)) {
2679 				/*
2680 				 * Ran out of dcookies. Map them now.
2681 				 */
2682 				if (dvma_map(domain, dvma,
2683 				    npages, dcookies, dmax + 1, rdip,
2684 				    immu_flags))
2685 					pde_set++;
2686 
2687 				IMMU_DPROBE4(immu__dvmamap__early,
2688 				    dev_info_t *, rdip, uint64_t, dvma,
2689 				    uint_t, npages, uint_t, dmax+1);
2690 
2691 				dvma += (npages << IMMU_PAGESHIFT);
2692 				npages = 0;
2693 				dmax = 0;
2694 			} else
2695 				dmax++;
2696 			dcookies[dmax].dck_paddr = paddr;
2697 			dcookies[dmax].dck_npages = 1;
2698 		}
2699 		size -= psize;
2700 	}
2701 
2702 	/*
2703 	 * Finish up, mapping all, or all of the remaining,
2704 	 * physical memory ranges.
2705 	 */
2706 	if (ihp->ihp_npremapped == 0 && npages > 0) {
2707 		IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \
2708 		    uint64_t, dvma, uint_t, npages, uint_t, dmax+1);
2709 
2710 		if (dvma_map(domain, dvma, npages, dcookies,
2711 		    dmax + 1, rdip, immu_flags))
2712 			pde_set++;
2713 	}
2714 
2715 	/* Invalidate the IOTLB */
2716 	immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc,
2717 	    pde_set > 0 ? TLB_IVA_WHOLE : TLB_IVA_LEAF,
2718 	    &ihp->ihp_inv_wait);
2719 
2720 	ihp->ihp_ndvseg = 1;
2721 	ihp->ihp_dvseg[0].dvs_start = sdvma;
2722 	ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size;
2723 
2724 	dma_out->dmao_size = dmar_object->dmao_size;
2725 	dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET;
2726 	dma_out->dmao_obj.dvma_obj.dv_nseg = 1;
2727 	dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0];
2728 	dma_out->dmao_type = DMA_OTYP_DVADDR;
2729 
2730 	return (DDI_DMA_MAPPED);
2731 }
2732 
2733 static int
2734 immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao)
2735 {
2736 	uint64_t dvma, npages;
2737 	domain_t *domain;
2738 	struct dvmaseg *dvs;
2739 
2740 	domain = IMMU_DEVI(rdip)->imd_domain;
2741 	dvs = dmao->dmao_obj.dvma_obj.dv_seg;
2742 
2743 	dvma = dvs[0].dvs_start;
2744 	npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off);
2745 
2746 #ifdef DEBUG
2747 	/* Unmap only in DEBUG mode */
2748 	dvma_unmap(domain, dvma, npages, rdip);
2749 #endif
2750 	dvma_free(domain, dvma, npages);
2751 
2752 	IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages,
2753 	    uint64_t, dvma);
2754 
2755 #ifdef DEBUG
2756 	/*
2757 	 * In the DEBUG case, the unmap was actually done,
2758 	 * but an IOTLB flush was not done. So, an explicit
2759 	 * write back flush is needed.
2760 	 */
2761 	immu_regs_wbf_flush(domain->dom_immu);
2762 #endif
2763 
2764 	return (DDI_SUCCESS);
2765 }
2766 
2767 /* ############################# Functions exported ######################## */
2768 
2769 /*
2770  * setup the DVMA subsystem
2771  * this code runs only for the first IOMMU unit
2772  */
2773 void
2774 immu_dvma_setup(list_t *listp)
2775 {
2776 	immu_t *immu;
2777 	uint_t kval;
2778 	size_t nchains;
2779 
2780 	/* locks */
2781 	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);
2782 
2783 	/* Create lists */
2784 	list_create(&immu_unity_domain_list, sizeof (domain_t),
2785 	    offsetof(domain_t, dom_maptype_node));
2786 	list_create(&immu_xlate_domain_list, sizeof (domain_t),
2787 	    offsetof(domain_t, dom_maptype_node));
2788 
2789 	/* Setup BDF domain hash */
2790 	nchains = 0xff;
2791 	kval = mod_hash_iddata_gen(nchains);
2792 
2793 	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
2794 	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
2795 	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
2796 	    KM_NOSLEEP);
2797 
2798 	immu = list_head(listp);
2799 	for (; immu; immu = list_next(listp, immu)) {
2800 		create_unity_domain(immu);
2801 		did_init(immu);
2802 		context_init(immu);
2803 		immu->immu_dvma_setup = B_TRUE;
2804 	}
2805 }
2806 
2807 /*
2808  * Startup up one DVMA unit
2809  */
2810 void
2811 immu_dvma_startup(immu_t *immu)
2812 {
2813 	if (immu_gfxdvma_enable == B_FALSE &&
2814 	    immu->immu_dvma_gfx_only == B_TRUE) {
2815 		return;
2816 	}
2817 
2818 	/*
2819 	 * DVMA will start once IOMMU is "running"
2820 	 */
2821 	immu->immu_dvma_running = B_TRUE;
2822 }
2823 
2824 /*
2825  * immu_dvma_physmem_update()
2826  *       called when the installed memory on a
2827  *       system increases, to expand domain DVMA
2828  *       for domains with UNITY mapping
2829  */
2830 void
2831 immu_dvma_physmem_update(uint64_t addr, uint64_t size)
2832 {
2833 	uint64_t start;
2834 	uint64_t npages;
2835 	int dcount;
2836 	immu_dcookie_t dcookies[1] = {0};
2837 	domain_t *domain;
2838 
2839 	/*
2840 	 * Just walk the system-wide list of domains with
2841 	 * UNITY mapping. Both the list of *all* domains
2842 	 * and *UNITY* domains is protected by the same
2843 	 * single lock
2844 	 */
2845 	mutex_enter(&immu_domain_lock);
2846 	domain = list_head(&immu_unity_domain_list);
2847 	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
2848 		/*
2849 		 * Nothing to do if the IOMMU supports passthrough.
2850 		 */
2851 		if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap))
2852 			continue;
2853 
2854 		/* There is no vmem_arena for unity domains. Just map it */
2855 		ddi_err(DER_LOG, domain->dom_dip,
2856 		    "iommu: unity-domain: Adding map "
2857 		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);
2858 
2859 		start = IMMU_ROUNDOWN(addr);
2860 		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;
2861 
2862 		dcookies[0].dck_paddr = start;
2863 		dcookies[0].dck_npages = npages;
2864 		dcount = 1;
2865 		(void) dvma_map(domain, start, npages,
2866 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2867 
2868 	}
2869 	mutex_exit(&immu_domain_lock);
2870 }
2871 
2872 int
2873 immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags)
2874 {
2875 	dev_info_t *ddip, *odip;
2876 	immu_t *immu;
2877 	domain_t *domain;
2878 
2879 	odip = rdip;
2880 
2881 	immu = immu_dvma_get_immu(rdip, immu_flags);
2882 	if (immu == NULL) {
2883 		/*
2884 		 * possible that there is no IOMMU unit for this device
2885 		 * - BIOS bugs are one example.
2886 		 */
2887 		ddi_err(DER_WARN, rdip, "No iommu unit found for device");
2888 		return (DDI_DMA_NORESOURCES);
2889 	}
2890 
2891 	/*
2892 	 * redirect isa devices attached under lpc to lpc dip
2893 	 */
2894 	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
2895 		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
2896 		if (rdip == NULL) {
2897 			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2898 			/*NOTREACHED*/
2899 		}
2900 	}
2901 
2902 	/* Reset immu, as redirection can change IMMU */
2903 	immu = NULL;
2904 
2905 	/*
2906 	 * for gart, redirect to the real graphic devinfo
2907 	 */
2908 	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
2909 		rdip = get_gfx_devinfo(rdip);
2910 		if (rdip == NULL) {
2911 			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2912 			/*NOTREACHED*/
2913 		}
2914 	}
2915 
2916 	/*
2917 	 * Setup DVMA domain for the device. This does
2918 	 * work only the first time we do DVMA for a
2919 	 * device.
2920 	 */
2921 	ddip = NULL;
2922 	domain = device_domain(rdip, &ddip, immu_flags);
2923 	if (domain == NULL) {
2924 		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
2925 		return (DDI_DMA_NORESOURCES);
2926 	}
2927 
2928 	immu = domain->dom_immu;
2929 
2930 	/*
2931 	 * If a domain is found, we must also have a domain dip
2932 	 * which is the topmost ancestor dip of rdip that shares
2933 	 * the same domain with rdip.
2934 	 */
2935 	if (domain->dom_did == 0 || ddip == NULL) {
2936 		ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
2937 		    domain->dom_did, ddip);
2938 		return (DDI_DMA_NORESOURCES);
2939 	}
2940 
2941 	if (odip != rdip)
2942 		set_domain(odip, ddip, domain);
2943 
2944 	/*
2945 	 * Update the root and context entries
2946 	 */
2947 	if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
2948 	    != DDI_SUCCESS) {
2949 		ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
2950 		return (DDI_DMA_NORESOURCES);
2951 	}
2952 
2953 	return (DDI_SUCCESS);
2954 }
2955 
2956 int
2957 immu_map_memrange(dev_info_t *rdip, memrng_t *mrng)
2958 {
2959 	immu_dcookie_t dcookies[1] = {0};
2960 	boolean_t pde_set;
2961 	immu_t *immu;
2962 	domain_t *domain;
2963 	immu_inv_wait_t iw;
2964 
2965 	dcookies[0].dck_paddr = mrng->mrng_start;
2966 	dcookies[0].dck_npages = mrng->mrng_npages;
2967 
2968 	domain = IMMU_DEVI(rdip)->imd_domain;
2969 	immu = domain->dom_immu;
2970 
2971 	pde_set = dvma_map(domain, mrng->mrng_start,
2972 	    mrng->mrng_npages, dcookies, 1, rdip,
2973 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2974 
2975 	immu_init_inv_wait(&iw, "memrange", B_TRUE);
2976 
2977 	immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start,
2978 	    mrng->mrng_npages, pde_set == B_TRUE ?
2979 	    TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw);
2980 
2981 	return (DDI_SUCCESS);
2982 }
2983 
2984 immu_devi_t *
2985 immu_devi_get(dev_info_t *rdip)
2986 {
2987 	immu_devi_t *immu_devi;
2988 	volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu);
2989 
2990 	/* Just want atomic reads. No need for lock */
2991 	immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr,
2992 	    0);
2993 	return (immu_devi);
2994 }
2995 
2996 /*ARGSUSED*/
2997 int
2998 immu_hdl_priv_ctor(void *buf, void *arg, int kmf)
2999 {
3000 	immu_hdl_priv_t *ihp;
3001 
3002 	ihp = buf;
3003 	immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE);
3004 
3005 	return (0);
3006 }
3007 
3008 /*
3009  * iommulib interface functions
3010  */
3011 static int
3012 immu_probe(iommulib_handle_t handle, dev_info_t *dip)
3013 {
3014 	immu_devi_t *immu_devi;
3015 	int ret;
3016 
3017 	if (!immu_enable)
3018 		return (DDI_FAILURE);
3019 
3020 	/*
3021 	 * Make sure the device has all the IOMMU structures
3022 	 * initialized. If this device goes through an IOMMU
3023 	 * unit (e.g. this probe function returns success),
3024 	 * this will be called at most N times, with N being
3025 	 * the number of IOMMUs in the system.
3026 	 *
3027 	 * After that, when iommulib_nex_open succeeds,
3028 	 * we can always assume that this device has all
3029 	 * the structures initialized. IOMMU_USED(dip) will
3030 	 * be true. There is no need to find the controlling
3031 	 * IOMMU/domain again.
3032 	 */
3033 	ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP);
3034 	if (ret != DDI_SUCCESS)
3035 		return (ret);
3036 
3037 	immu_devi = IMMU_DEVI(dip);
3038 
3039 	/*
3040 	 * For unity domains, there is no need to call in to
3041 	 * the IOMMU code.
3042 	 */
3043 	if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID)
3044 		return (DDI_FAILURE);
3045 
3046 	if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle))
3047 		return (DDI_SUCCESS);
3048 
3049 	return (DDI_FAILURE);
3050 }
3051 
3052 /*ARGSUSED*/
3053 static int
3054 immu_allochdl(iommulib_handle_t handle,
3055     dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
3056     int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep)
3057 {
3058 	int ret;
3059 	immu_hdl_priv_t *ihp;
3060 	immu_t *immu;
3061 
3062 	ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp,
3063 	    arg, dma_handlep);
3064 	if (ret == DDI_SUCCESS) {
3065 		immu = IMMU_DEVI(rdip)->imd_immu;
3066 
3067 		ihp = kmem_cache_alloc(immu->immu_hdl_cache,
3068 		    waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP);
3069 		if (ihp == NULL) {
3070 			(void) iommulib_iommu_dma_freehdl(dip, rdip,
3071 			    *dma_handlep);
3072 			return (DDI_DMA_NORESOURCES);
3073 		}
3074 
3075 		if (IMMU_DEVI(rdip)->imd_use_premap)
3076 			dvma_prealloc(rdip, ihp, attr);
3077 		else {
3078 			ihp->ihp_npremapped = 0;
3079 			ihp->ihp_predvma = 0;
3080 		}
3081 		ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep,
3082 		    ihp);
3083 	}
3084 	return (ret);
3085 }
3086 
3087 /*ARGSUSED*/
3088 static int
3089 immu_freehdl(iommulib_handle_t handle,
3090     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3091 {
3092 	immu_hdl_priv_t *ihp;
3093 
3094 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3095 	if (ihp != NULL) {
3096 		if (IMMU_DEVI(rdip)->imd_use_premap)
3097 			dvma_prefree(rdip, ihp);
3098 		kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp);
3099 	}
3100 
3101 	return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle));
3102 }
3103 
3104 
3105 /*ARGSUSED*/
3106 static int
3107 immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
3108     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3109     struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep,
3110     uint_t *ccountp)
3111 {
3112 	int ret;
3113 	immu_hdl_priv_t *ihp;
3114 
3115 	ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle,
3116 	    dma_req, cookiep, ccountp);
3117 
3118 	if (ret == DDI_DMA_MAPPED) {
3119 		ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3120 		immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait);
3121 	}
3122 
3123 	return (ret);
3124 }
3125 
3126 /*ARGSUSED*/
3127 static int
3128 immu_unbindhdl(iommulib_handle_t handle,
3129     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3130 {
3131 	return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle));
3132 }
3133 
3134 /*ARGSUSED*/
3135 static int
3136 immu_sync(iommulib_handle_t handle, dev_info_t *dip,
3137     dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off,
3138     size_t len, uint_t cachefl)
3139 {
3140 	return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len,
3141 	    cachefl));
3142 }
3143 
3144 /*ARGSUSED*/
3145 static int
3146 immu_win(iommulib_handle_t handle, dev_info_t *dip,
3147     dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
3148     off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep,
3149     uint_t *ccountp)
3150 {
3151 	return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp,
3152 	    lenp, cookiep, ccountp));
3153 }
3154 
3155 /*ARGSUSED*/
3156 static int
3157 immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
3158     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3159     struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao)
3160 {
3161 	immu_hdl_priv_t *ihp;
3162 
3163 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3164 
3165 	return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao));
3166 }
3167 
3168 /*ARGSUSED*/
3169 static int
3170 immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
3171     dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao)
3172 {
3173 	immu_hdl_priv_t *ihp;
3174 
3175 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3176 	if (ihp->ihp_npremapped > 0)
3177 		return (DDI_SUCCESS);
3178 	return (immu_unmap_dvmaseg(rdip, dmao));
3179 }
3180