xref: /titanic_51/usr/src/uts/i86pc/io/immu_dvma.c (revision b3697b90e692e3e5d859fb77d285d4c056d99eda)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Portions Copyright (c) 2010, Oracle and/or its affiliates.
23  * All rights reserved.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 /*
31  * DVMA code
32  * This file contains Intel IOMMU code that deals with DVMA
33  * i.e. DMA remapping.
34  */
35 
36 #include <sys/sysmacros.h>
37 #include <sys/pcie.h>
38 #include <sys/pci_cfgspace.h>
39 #include <vm/hat_i86.h>
40 #include <sys/memlist.h>
41 #include <sys/acpi/acpi.h>
42 #include <sys/acpica.h>
43 #include <sys/modhash.h>
44 #include <sys/immu.h>
45 #include <sys/x86_archext.h>
46 #include <sys/archsystm.h>
47 
48 #undef	TEST
49 
50 /*
51  * Macros based on PCI spec
52  */
/*
 * Decode helpers for the 32-bit value read at PCI_CONF_REVID:
 * the low byte is the revision id, the upper 24 bits the class code.
 * Each macro expands to a plain expression (no trailing semicolon) so it
 * can be used inside larger expressions as well as in assignments.
 */
#define	IMMU_PCI_REV2CLASS(r)   ((r) >> 8)  /* classcode from revid */
#define	IMMU_PCI_CLASS2BASE(c)  ((c) >> 16) /* baseclass from classcode */
#define	IMMU_PCI_CLASS2SUB(c)   (((c) >> 8) & 0xff) /* subclass from classcode */

/*
 * True when cookie (d) has a non-zero physical address and physical
 * address (p) lies exactly one IMMU page after it.
 */
#define	IMMU_CONTIG_PADDR(d, p) \
	((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p))
59 
60 typedef struct dvma_arg {
61 	immu_t *dva_immu;
62 	dev_info_t *dva_rdip;
63 	dev_info_t *dva_ddip;
64 	domain_t *dva_domain;
65 	int dva_level;
66 	immu_flags_t dva_flags;
67 	list_t *dva_list;
68 	int dva_error;
69 } dvma_arg_t;
70 
71 static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
72     dev_info_t *rdip, immu_flags_t immu_flags);
73 static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
74     int dev, int func, immu_flags_t immu_flags);
75 static void destroy_immu_devi(immu_devi_t *immu_devi);
76 static boolean_t dvma_map(domain_t *domain, uint64_t sdvma,
77     uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
78     immu_flags_t immu_flags);
79 
80 /* Extern globals */
81 extern struct memlist  *phys_install;
82 
83 /*
84  * iommulib interface functions.
85  */
86 static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip);
87 static int immu_allochdl(iommulib_handle_t handle,
88     dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
89     int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep);
90 static int immu_freehdl(iommulib_handle_t handle,
91     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
92 static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
93     dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req,
94     ddi_dma_cookie_t *cookiep, uint_t *ccountp);
95 static int immu_unbindhdl(iommulib_handle_t handle,
96     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
97 static int immu_sync(iommulib_handle_t handle, dev_info_t *dip,
98     dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len,
99     uint_t cachefl);
100 static int immu_win(iommulib_handle_t handle, dev_info_t *dip,
101     dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
102     off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp);
103 static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
104     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
105     struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao);
106 static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
107     dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao);
108 static int immu_map(iommulib_handle_t handle, dev_info_t *dip,
109     dev_info_t *rdip, struct ddi_dma_req *dmareq,
110     ddi_dma_handle_t *dma_handle);
111 static int immu_mctl(iommulib_handle_t handle, dev_info_t *dip,
112     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
113     enum ddi_dma_ctlops request, off_t *offp, size_t *lenp,
114     caddr_t *objpp, uint_t cachefl);
115 
116 /* static Globals */
117 
118 /*
119  * Used to setup DMA objects (memory regions)
120  * for DMA reads by IOMMU units
121  */
122 static ddi_dma_attr_t immu_dma_attr = {
123 	DMA_ATTR_V0,
124 	0U,
125 	0xffffffffffffffffULL,
126 	0xffffffffU,
127 	MMU_PAGESIZE, /* MMU page aligned */
128 	0x1,
129 	0x1,
130 	0xffffffffU,
131 	0xffffffffffffffffULL,
132 	1,
133 	4,
134 	0
135 };
136 
137 static ddi_device_acc_attr_t immu_acc_attr = {
138 	DDI_DEVICE_ATTR_V0,
139 	DDI_NEVERSWAP_ACC,
140 	DDI_STRICTORDER_ACC
141 };
142 
143 struct iommulib_ops immulib_ops = {
144 	IOMMU_OPS_VERSION,
145 	INTEL_IOMMU,
146 	"Intel IOMMU",
147 	NULL,
148 	immu_probe,
149 	immu_allochdl,
150 	immu_freehdl,
151 	immu_bindhdl,
152 	immu_unbindhdl,
153 	immu_sync,
154 	immu_win,
155 	immu_mapobject,
156 	immu_unmapobject,
157 	immu_map,
158 	immu_mctl
159 };
160 
161 /*
162  * Fake physical address range used to set up initial prealloc mappings.
163  * This memory is never actually accessed. It is mapped read-only,
164  * and is overwritten as soon as the first DMA bind operation is
165  * performed. Since 0 is a special case, just start at the 2nd
166  * physical page.
167  */
168 
169 static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES };
170 
171 /* globals private to this file */
172 static kmutex_t immu_domain_lock;
173 static list_t immu_unity_domain_list;
174 static list_t immu_xlate_domain_list;
175 
176 /* structure used to store idx into each level of the page tables */
177 typedef struct xlate {
178 	int xlt_level;
179 	uint_t xlt_idx;
180 	pgtable_t *xlt_pgtable;
181 } xlate_t;
182 
/*
 * Domain-id used for the unity domain.
 * Domain-id 0 is reserved by the VT-d spec; Solaris reserves 1.
 */
#define	IMMU_UNITY_DID   1
185 
186 static mod_hash_t *bdf_domain_hash;
187 
188 int immu_use_alh;
189 int immu_use_tm;
190 
191 static domain_t *
192 bdf_domain_lookup(immu_devi_t *immu_devi)
193 {
194 	domain_t *domain;
195 	int16_t seg = immu_devi->imd_seg;
196 	int16_t bus = immu_devi->imd_bus;
197 	int16_t devfunc = immu_devi->imd_devfunc;
198 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
199 
200 	if (seg < 0 || bus < 0 || devfunc < 0) {
201 		return (NULL);
202 	}
203 
204 	domain = NULL;
205 	if (mod_hash_find(bdf_domain_hash,
206 	    (void *)bdf, (void *)&domain) == 0) {
207 		ASSERT(domain);
208 		ASSERT(domain->dom_did > 0);
209 		return (domain);
210 	} else {
211 		return (NULL);
212 	}
213 }
214 
215 static void
216 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
217 {
218 	int16_t seg = immu_devi->imd_seg;
219 	int16_t bus = immu_devi->imd_bus;
220 	int16_t devfunc = immu_devi->imd_devfunc;
221 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
222 
223 	if (seg < 0 || bus < 0 || devfunc < 0) {
224 		return;
225 	}
226 
227 	(void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
228 }
229 
230 static int
231 match_lpc(dev_info_t *pdip, void *arg)
232 {
233 	immu_devi_t *immu_devi;
234 	dvma_arg_t *dvap = (dvma_arg_t *)arg;
235 
236 	if (list_is_empty(dvap->dva_list)) {
237 		return (DDI_WALK_TERMINATE);
238 	}
239 
240 	immu_devi = list_head(dvap->dva_list);
241 	for (; immu_devi; immu_devi = list_next(dvap->dva_list,
242 	    immu_devi)) {
243 		if (immu_devi->imd_dip == pdip) {
244 			dvap->dva_ddip = pdip;
245 			dvap->dva_error = DDI_SUCCESS;
246 			return (DDI_WALK_TERMINATE);
247 		}
248 	}
249 
250 	return (DDI_WALK_CONTINUE);
251 }
252 
253 static void
254 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
255 {
256 	list_t *spclist = NULL;
257 	immu_devi_t *immu_devi;
258 
259 	immu_devi = IMMU_DEVI(dip);
260 	if (immu_devi->imd_display == B_TRUE) {
261 		spclist = &(immu->immu_dvma_gfx_list);
262 	} else if (immu_devi->imd_lpc == B_TRUE) {
263 		spclist = &(immu->immu_dvma_lpc_list);
264 	}
265 
266 	if (spclist) {
267 		mutex_enter(&(immu->immu_lock));
268 		list_insert_head(spclist, immu_devi);
269 		mutex_exit(&(immu->immu_lock));
270 	}
271 }
272 
273 /*
274  * Set the immu_devi struct in the immu_devi field of a devinfo node
275  */
276 int
277 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
278 {
279 	int bus, dev, func;
280 	immu_devi_t *new_imd;
281 	immu_devi_t *immu_devi;
282 
283 	immu_devi = immu_devi_get(dip);
284 	if (immu_devi != NULL) {
285 		return (DDI_SUCCESS);
286 	}
287 
288 	bus = dev = func = -1;
289 
290 	/*
291 	 * Assume a new immu_devi struct is needed
292 	 */
293 	if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
294 		/*
295 		 * No BDF. Set bus = -1 to indicate this.
296 		 * We still need to create a immu_devi struct
297 		 * though
298 		 */
299 		bus = -1;
300 		dev = 0;
301 		func = 0;
302 	}
303 
304 	new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
305 	if (new_imd  == NULL) {
306 		ddi_err(DER_WARN, dip, "Failed to create immu_devi "
307 		    "structure");
308 		return (DDI_FAILURE);
309 	}
310 
311 	/*
312 	 * Check if some other thread allocated a immu_devi while we
313 	 * didn't own the lock.
314 	 */
315 	mutex_enter(&(DEVI(dip)->devi_lock));
316 	if (IMMU_DEVI(dip) == NULL) {
317 		IMMU_DEVI_SET(dip, new_imd);
318 	} else {
319 		destroy_immu_devi(new_imd);
320 	}
321 	mutex_exit(&(DEVI(dip)->devi_lock));
322 
323 	return (DDI_SUCCESS);
324 }
325 
326 static dev_info_t *
327 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
328 {
329 	dvma_arg_t dvarg = {0};
330 	dvarg.dva_list = &(immu->immu_dvma_lpc_list);
331 	dvarg.dva_rdip = rdip;
332 	dvarg.dva_error = DDI_FAILURE;
333 
334 	if (immu_walk_ancestor(rdip, NULL, match_lpc,
335 	    &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
336 		ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
337 		    "find lpc_devinfo for ISA device");
338 		return (NULL);
339 	}
340 
341 	if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
342 		ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
343 		    "ISA device");
344 		return (NULL);
345 	}
346 
347 	return (dvarg.dva_ddip);
348 }
349 
350 static dev_info_t *
351 get_gfx_devinfo(dev_info_t *rdip)
352 {
353 	immu_t *immu;
354 	immu_devi_t *immu_devi;
355 	list_t *list_gfx;
356 
357 	/*
358 	 * The GFX device may not be on the same iommu unit as "agpgart"
359 	 * so search globally
360 	 */
361 	immu_devi = NULL;
362 	immu = list_head(&immu_list);
363 	for (; immu; immu = list_next(&immu_list, immu)) {
364 		list_gfx = &(immu->immu_dvma_gfx_list);
365 		if (!list_is_empty(list_gfx)) {
366 			immu_devi = list_head(list_gfx);
367 			break;
368 		}
369 	}
370 
371 	if (immu_devi == NULL) {
372 		ddi_err(DER_WARN, rdip, "iommu: No GFX device. "
373 		    "Cannot redirect agpgart");
374 		return (NULL);
375 	}
376 
377 	ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s",
378 	    ddi_node_name(immu_devi->imd_dip));
379 
380 	return (immu_devi->imd_dip);
381 }
382 
383 static immu_flags_t
384 dma_to_immu_flags(struct ddi_dma_req *dmareq)
385 {
386 	immu_flags_t flags = 0;
387 
388 	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
389 		flags |= IMMU_FLAGS_SLEEP;
390 	} else {
391 		flags |= IMMU_FLAGS_NOSLEEP;
392 	}
393 
394 #ifdef BUGGY_DRIVERS
395 
396 	flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
397 
398 #else
399 	/*
400 	 * Read and write flags need to be reversed.
401 	 * DMA_READ means read from device and write
402 	 * to memory. So DMA read means DVMA write.
403 	 */
404 	if (dmareq->dmar_flags & DDI_DMA_READ)
405 		flags |= IMMU_FLAGS_WRITE;
406 
407 	if (dmareq->dmar_flags & DDI_DMA_WRITE)
408 		flags |= IMMU_FLAGS_READ;
409 
410 	/*
411 	 * Some buggy drivers specify neither READ or WRITE
412 	 * For such drivers set both read and write permissions
413 	 */
414 	if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
415 		flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
416 	}
417 #endif
418 
419 	return (flags);
420 }
421 
422 /*ARGSUSED*/
423 int
424 pgtable_ctor(void *buf, void *arg, int kmflag)
425 {
426 	size_t actual_size = 0;
427 	pgtable_t *pgtable;
428 	int (*dmafp)(caddr_t);
429 	caddr_t vaddr;
430 	void *next;
431 	uint_t flags;
432 	immu_t *immu = arg;
433 
434 	pgtable = (pgtable_t *)buf;
435 
436 	dmafp = (kmflag & KM_NOSLEEP) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
437 
438 	next = kmem_zalloc(IMMU_PAGESIZE, kmflag);
439 	if (next == NULL) {
440 		return (-1);
441 	}
442 
443 	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
444 	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
445 		kmem_free(next, IMMU_PAGESIZE);
446 		return (-1);
447 	}
448 
449 	flags = DDI_DMA_CONSISTENT;
450 	if (!immu->immu_dvma_coherent)
451 		flags |= IOMEM_DATA_UC_WR_COMBINE;
452 
453 	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
454 	    &immu_acc_attr, flags,
455 	    dmafp, NULL, &vaddr, &actual_size,
456 	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
457 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
458 		kmem_free(next, IMMU_PAGESIZE);
459 		return (-1);
460 	}
461 
462 	/*
463 	 * Memory allocation failure. Maybe a temporary condition
464 	 * so return error rather than panic, so we can try again
465 	 */
466 	if (actual_size < IMMU_PAGESIZE) {
467 		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
468 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
469 		kmem_free(next, IMMU_PAGESIZE);
470 		return (-1);
471 	}
472 
473 	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
474 	pgtable->hwpg_vaddr = vaddr;
475 	pgtable->swpg_next_array = next;
476 
477 	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);
478 
479 	return (0);
480 }
481 
482 /*ARGSUSED*/
483 void
484 pgtable_dtor(void *buf, void *arg)
485 {
486 	pgtable_t *pgtable;
487 
488 	pgtable = (pgtable_t *)buf;
489 
490 	/* destroy will panic if lock is held. */
491 	rw_destroy(&(pgtable->swpg_rwlock));
492 
493 	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
494 	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
495 	kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
496 }
497 
498 /*
499  * pgtable_alloc()
500  *	alloc a IOMMU pgtable structure.
501  *	This same struct is used for root and context tables as well.
502  *	This routine allocs the f/ollowing:
503  *	- a pgtable_t struct
504  *	- a HW page which holds PTEs/entries which is accesssed by HW
505  *        so we set up DMA for this page
506  *	- a SW page which is only for our bookeeping
507  *        (for example to  hold pointers to the next level pgtable).
508  *        So a simple kmem_alloc suffices
509  */
510 static pgtable_t *
511 pgtable_alloc(immu_t *immu, immu_flags_t immu_flags)
512 {
513 	pgtable_t *pgtable;
514 	int kmflags;
515 
516 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
517 
518 	pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags);
519 	if (pgtable == NULL) {
520 		return (NULL);
521 	}
522 	return (pgtable);
523 }
524 
525 static void
526 pgtable_zero(pgtable_t *pgtable)
527 {
528 	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
529 	bzero(pgtable->swpg_next_array, IMMU_PAGESIZE);
530 }
531 
532 static void
533 pgtable_free(immu_t *immu, pgtable_t *pgtable)
534 {
535 	kmem_cache_free(immu->immu_pgtable_cache, pgtable);
536 }
537 
538 /*
539  * Function to identify a display device from the PCI class code
540  */
541 static boolean_t
542 device_is_display(uint_t classcode)
543 {
544 	static uint_t disp_classes[] = {
545 		0x000100,
546 		0x030000,
547 		0x030001
548 	};
549 	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
550 
551 	for (i = 0; i < nclasses; i++) {
552 		if (classcode == disp_classes[i])
553 			return (B_TRUE);
554 	}
555 	return (B_FALSE);
556 }
557 
558 /*
559  * Function that determines if device is PCIEX and/or PCIEX bridge
560  */
561 static boolean_t
562 device_is_pciex(
563 	uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
564 {
565 	ushort_t cap;
566 	ushort_t capsp;
567 	ushort_t cap_count = PCI_CAP_MAX_PTR;
568 	ushort_t status;
569 	boolean_t is_pciex = B_FALSE;
570 
571 	*is_pcib = B_FALSE;
572 
573 	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
574 	if (!(status & PCI_STAT_CAP))
575 		return (B_FALSE);
576 
577 	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
578 	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
579 		capsp &= PCI_CAP_PTR_MASK;
580 		cap = pci_getb_func(bus, dev, func, capsp);
581 
582 		if (cap == PCI_CAP_ID_PCI_E) {
583 			status = pci_getw_func(bus, dev, func, capsp + 2);
584 			/*
585 			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
586 			 * for Device/Port Type.
587 			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
588 			 * device is a PCIE2PCI bridge
589 			 */
590 			*is_pcib =
591 			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
592 			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
593 			is_pciex = B_TRUE;
594 		}
595 
596 		capsp = (*pci_getb_func)(bus, dev, func,
597 		    capsp + PCI_CAP_NEXT_PTR);
598 	}
599 
600 	return (is_pciex);
601 }
602 
603 static boolean_t
604 device_use_premap(uint_t classcode)
605 {
606 	if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET)
607 		return (B_TRUE);
608 	return (B_FALSE);
609 }
610 
611 
612 /*
613  * immu_dvma_get_immu()
614  *   get the immu unit structure for a dev_info node
615  */
616 immu_t *
617 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
618 {
619 	immu_devi_t *immu_devi;
620 	immu_t *immu;
621 
622 	/*
623 	 * check if immu unit was already found earlier.
624 	 * If yes, then it will be stashed in immu_devi struct.
625 	 */
626 	immu_devi = immu_devi_get(dip);
627 	if (immu_devi == NULL) {
628 		if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
629 			/*
630 			 * May fail because of low memory. Return error rather
631 			 * than panic as we want driver to rey again later
632 			 */
633 			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
634 			    "No immu_devi structure");
635 			/*NOTREACHED*/
636 		}
637 		immu_devi = immu_devi_get(dip);
638 	}
639 
640 	mutex_enter(&(DEVI(dip)->devi_lock));
641 	if (immu_devi->imd_immu) {
642 		immu = immu_devi->imd_immu;
643 		mutex_exit(&(DEVI(dip)->devi_lock));
644 		return (immu);
645 	}
646 	mutex_exit(&(DEVI(dip)->devi_lock));
647 
648 	immu = immu_dmar_get_immu(dip);
649 	if (immu == NULL) {
650 		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
651 		    "Cannot find immu_t for device");
652 		/*NOTREACHED*/
653 	}
654 
655 	/*
656 	 * Check if some other thread found immu
657 	 * while lock was not held
658 	 */
659 	immu_devi = immu_devi_get(dip);
660 	/* immu_devi should be present as we found it earlier */
661 	if (immu_devi == NULL) {
662 		ddi_err(DER_PANIC, dip,
663 		    "immu_dvma_get_immu: No immu_devi structure");
664 		/*NOTREACHED*/
665 	}
666 
667 	mutex_enter(&(DEVI(dip)->devi_lock));
668 	if (immu_devi->imd_immu == NULL) {
669 		/* nobody else set it, so we should do it */
670 		immu_devi->imd_immu = immu;
671 		immu_devi_set_spclist(dip, immu);
672 	} else {
673 		/*
674 		 * if some other thread got immu before
675 		 * us, it should get the same results
676 		 */
677 		if (immu_devi->imd_immu != immu) {
678 			ddi_err(DER_PANIC, dip, "Multiple "
679 			    "immu units found for device. Expected (%p), "
680 			    "actual (%p)", (void *)immu,
681 			    (void *)immu_devi->imd_immu);
682 			mutex_exit(&(DEVI(dip)->devi_lock));
683 			/*NOTREACHED*/
684 		}
685 	}
686 	mutex_exit(&(DEVI(dip)->devi_lock));
687 
688 	return (immu);
689 }
690 
691 
692 /* ############################# IMMU_DEVI code ############################ */
693 
694 /*
695  * Allocate a immu_devi structure and initialize it
696  */
697 static immu_devi_t *
698 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
699     immu_flags_t immu_flags)
700 {
701 	uchar_t baseclass, subclass;
702 	uint_t classcode, revclass;
703 	immu_devi_t *immu_devi;
704 	boolean_t pciex = B_FALSE;
705 	int kmflags;
706 	boolean_t is_pcib = B_FALSE;
707 
708 	/* bus ==  -1 indicate non-PCI device (no BDF) */
709 	ASSERT(bus == -1 || bus >= 0);
710 	ASSERT(dev >= 0);
711 	ASSERT(func >= 0);
712 
713 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
714 	immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
715 	if (immu_devi == NULL) {
716 		ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
717 		    "Intel IOMMU immu_devi structure");
718 		return (NULL);
719 	}
720 	immu_devi->imd_dip = rdip;
721 	immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
722 	immu_devi->imd_bus = bus;
723 	immu_devi->imd_pcib_type = IMMU_PCIB_BAD;
724 
725 	if (bus == -1) {
726 		immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
727 		return (immu_devi);
728 	}
729 
730 	immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
731 	immu_devi->imd_sec = 0;
732 	immu_devi->imd_sub = 0;
733 
734 	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
735 
736 	classcode = IMMU_PCI_REV2CLASS(revclass);
737 	baseclass = IMMU_PCI_CLASS2BASE(classcode);
738 	subclass = IMMU_PCI_CLASS2SUB(classcode);
739 
740 	if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {
741 
742 		immu_devi->imd_sec = pci_getb_func(bus, dev, func,
743 		    PCI_BCNF_SECBUS);
744 		immu_devi->imd_sub = pci_getb_func(bus, dev, func,
745 		    PCI_BCNF_SUBBUS);
746 
747 		pciex = device_is_pciex(bus, dev, func, &is_pcib);
748 		if (pciex  == B_TRUE && is_pcib == B_TRUE) {
749 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
750 		} else if (pciex == B_TRUE) {
751 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
752 		} else {
753 			immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
754 		}
755 	} else {
756 		immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
757 	}
758 
759 	/* check for certain special devices */
760 	immu_devi->imd_display = device_is_display(classcode);
761 	immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
762 	    (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;
763 	immu_devi->imd_use_premap = device_use_premap(classcode);
764 
765 	immu_devi->imd_domain = NULL;
766 
767 	immu_devi->imd_dvma_flags = immu_global_dvma_flags;
768 
769 	return (immu_devi);
770 }
771 
772 static void
773 destroy_immu_devi(immu_devi_t *immu_devi)
774 {
775 	kmem_free(immu_devi, sizeof (immu_devi_t));
776 }
777 
778 static domain_t *
779 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
780 {
781 	immu_devi_t *immu_devi;
782 	domain_t *domain;
783 	dev_info_t *ddip;
784 
785 	*ddipp = NULL;
786 
787 	immu_devi = immu_devi_get(rdip);
788 	if (immu_devi == NULL) {
789 		return (NULL);
790 	}
791 
792 	mutex_enter(&(DEVI(rdip)->devi_lock));
793 	domain = immu_devi->imd_domain;
794 	ddip = immu_devi->imd_ddip;
795 	mutex_exit(&(DEVI(rdip)->devi_lock));
796 
797 	if (domain)
798 		*ddipp = ddip;
799 
800 	return (domain);
801 
802 }
803 
804 /* ############################# END IMMU_DEVI code ######################## */
805 /* ############################# DOMAIN code ############################### */
806 
807 /*
808  * This routine always succeeds
809  */
810 static int
811 did_alloc(immu_t *immu, dev_info_t *rdip,
812     dev_info_t *ddip, immu_flags_t immu_flags)
813 {
814 	int did;
815 
816 	did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
817 	    (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);
818 
819 	if (did == 0) {
820 		ddi_err(DER_WARN, rdip, "device domain-id alloc error"
821 		    " domain-device: %s%d. immu unit is %s. Using "
822 		    "unity domain with domain-id (%d)",
823 		    ddi_driver_name(ddip), ddi_get_instance(ddip),
824 		    immu->immu_name, immu->immu_unity_domain->dom_did);
825 		did = immu->immu_unity_domain->dom_did;
826 	}
827 
828 	return (did);
829 }
830 
831 static int
832 get_branch_domain(dev_info_t *pdip, void *arg)
833 {
834 	immu_devi_t *immu_devi;
835 	domain_t *domain;
836 	dev_info_t *ddip;
837 	immu_t *immu;
838 	dvma_arg_t *dvp = (dvma_arg_t *)arg;
839 
840 	/*
841 	 * The field dvp->dva_rdip is a work-in-progress
842 	 * and gets updated as we walk up the ancestor
843 	 * tree. The final ddip is set only when we reach
844 	 * the top of the tree. So the dvp->dva_ddip field cannot
845 	 * be relied on until we reach the top of the field.
846 	 */
847 
848 	/* immu_devi may not be set. */
849 	immu_devi = immu_devi_get(pdip);
850 	if (immu_devi == NULL) {
851 		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
852 			dvp->dva_error = DDI_FAILURE;
853 			return (DDI_WALK_TERMINATE);
854 		}
855 	}
856 
857 	immu_devi = immu_devi_get(pdip);
858 	immu = immu_devi->imd_immu;
859 	if (immu == NULL)
860 		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
861 
862 	/*
863 	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
864 	 * terminate the walk (since the device under the PCIE bridge
865 	 * is a PCIE device and has an independent entry in the
866 	 * root/context table)
867 	 */
868 	if (dvp->dva_rdip != pdip &&
869 	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
870 		return (DDI_WALK_TERMINATE);
871 	}
872 
873 	/*
874 	 * In order to be a domain-dim, it must be a PCI device i.e.
875 	 * must have valid BDF. This also eliminates the root complex.
876 	 */
877 	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
878 	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
879 		ASSERT(immu_devi->imd_bus >= 0);
880 		ASSERT(immu_devi->imd_devfunc >= 0);
881 		dvp->dva_ddip = pdip;
882 	}
883 
884 	if (immu_devi->imd_display == B_TRUE ||
885 	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
886 		dvp->dva_domain = immu->immu_unity_domain;
887 		/* continue walking to find ddip */
888 		return (DDI_WALK_CONTINUE);
889 	}
890 
891 	mutex_enter(&(DEVI(pdip)->devi_lock));
892 	domain = immu_devi->imd_domain;
893 	ddip = immu_devi->imd_ddip;
894 	mutex_exit(&(DEVI(pdip)->devi_lock));
895 
896 	if (domain && ddip) {
897 		/* if domain is set, it must be the same */
898 		if (dvp->dva_domain) {
899 			ASSERT(domain == dvp->dva_domain);
900 		}
901 		dvp->dva_domain = domain;
902 		dvp->dva_ddip = ddip;
903 		return (DDI_WALK_TERMINATE);
904 	}
905 
906 	/* Domain may already be set, continue walking so that ddip gets set */
907 	if (dvp->dva_domain) {
908 		return (DDI_WALK_CONTINUE);
909 	}
910 
911 	/* domain is not set in either immu_devi or dvp */
912 	domain = bdf_domain_lookup(immu_devi);
913 	if (domain == NULL) {
914 		return (DDI_WALK_CONTINUE);
915 	}
916 
917 	/* ok, the BDF hash had a domain for this BDF. */
918 
919 	/* Grab lock again to check if something else set immu_devi fields */
920 	mutex_enter(&(DEVI(pdip)->devi_lock));
921 	if (immu_devi->imd_domain != NULL) {
922 		dvp->dva_domain = domain;
923 	} else {
924 		dvp->dva_domain = domain;
925 	}
926 	mutex_exit(&(DEVI(pdip)->devi_lock));
927 
928 	/*
929 	 * walk upwards until the topmost PCI bridge is found
930 	 */
931 	return (DDI_WALK_CONTINUE);
932 
933 }
934 
935 static void
936 map_unity_domain(domain_t *domain)
937 {
938 	struct memlist *mp;
939 	uint64_t start;
940 	uint64_t npages;
941 	immu_dcookie_t dcookies[1] = {0};
942 	int dcount = 0;
943 
944 	/*
945 	 * UNITY arenas are a mirror of the physical memory
946 	 * installed on the system.
947 	 */
948 
949 #ifdef BUGGY_DRIVERS
950 	/*
951 	 * Dont skip page0. Some broken HW/FW access it.
952 	 */
953 	dcookies[0].dck_paddr = 0;
954 	dcookies[0].dck_npages = 1;
955 	dcount = 1;
956 	(void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
957 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
958 #endif
959 
960 	memlist_read_lock();
961 
962 	mp = phys_install;
963 
964 	if (mp->ml_address == 0) {
965 		/* since we already mapped page1 above */
966 		start = IMMU_PAGESIZE;
967 	} else {
968 		start = mp->ml_address;
969 	}
970 	npages = mp->ml_size/IMMU_PAGESIZE + 1;
971 
972 	dcookies[0].dck_paddr = start;
973 	dcookies[0].dck_npages = npages;
974 	dcount = 1;
975 	(void) dvma_map(domain, start, npages, dcookies,
976 	    dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
977 
978 	ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64
979 	    " - 0x%" PRIx64 "]", start, start + mp->ml_size);
980 
981 	mp = mp->ml_next;
982 	while (mp) {
983 		ddi_err(DER_LOG, domain->dom_dip,
984 		    "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
985 		    mp->ml_address, mp->ml_address + mp->ml_size);
986 
987 		start = mp->ml_address;
988 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
989 
990 		dcookies[0].dck_paddr = start;
991 		dcookies[0].dck_npages = npages;
992 		dcount = 1;
993 		(void) dvma_map(domain, start, npages,
994 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
995 		mp = mp->ml_next;
996 	}
997 
998 	mp = bios_rsvd;
999 	while (mp) {
1000 		ddi_err(DER_LOG, domain->dom_dip,
1001 		    "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
1002 		    mp->ml_address, mp->ml_address + mp->ml_size);
1003 
1004 		start = mp->ml_address;
1005 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
1006 
1007 		dcookies[0].dck_paddr = start;
1008 		dcookies[0].dck_npages = npages;
1009 		dcount = 1;
1010 		(void) dvma_map(domain, start, npages,
1011 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
1012 
1013 		mp = mp->ml_next;
1014 	}
1015 
1016 	memlist_read_unlock();
1017 }
1018 
1019 /*
1020  * create_xlate_arena()
1021  * 	Create the dvma arena for a domain with translation
1022  *	mapping
1023  */
1024 static void
1025 create_xlate_arena(immu_t *immu, domain_t *domain,
1026     dev_info_t *rdip, immu_flags_t immu_flags)
1027 {
1028 	char *arena_name;
1029 	struct memlist *mp;
1030 	int vmem_flags;
1031 	uint64_t start;
1032 	uint_t mgaw;
1033 	uint64_t size;
1034 	uint64_t maxaddr;
1035 	void *vmem_ret;
1036 
1037 	arena_name = domain->dom_dvma_arena_name;
1038 
1039 	/* Note, don't do sizeof (arena_name) - it is just a pointer */
1040 	(void) snprintf(arena_name,
1041 	    sizeof (domain->dom_dvma_arena_name),
1042 	    "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
1043 	    domain->dom_did);
1044 
1045 	vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;
1046 
1047 	/* Restrict mgaddr (max guest addr) to MGAW */
1048 	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
1049 
1050 	/*
1051 	 * To ensure we avoid ioapic and PCI MMIO ranges we just
1052 	 * use the physical memory address range of the system as the
1053 	 * range
1054 	 */
1055 	maxaddr = ((uint64_t)1 << mgaw);
1056 
1057 	memlist_read_lock();
1058 
1059 	mp = phys_install;
1060 
1061 	if (mp->ml_address == 0)
1062 		start = MMU_PAGESIZE;
1063 	else
1064 		start = mp->ml_address;
1065 
1066 	if (start + mp->ml_size > maxaddr)
1067 		size = maxaddr - start;
1068 	else
1069 		size = mp->ml_size;
1070 
1071 	ddi_err(DER_VERB, rdip,
1072 	    "iommu: %s: Creating dvma vmem arena [0x%" PRIx64
1073 	    " - 0x%" PRIx64 "]", arena_name, start, start + size);
1074 
1075 	/*
1076 	 * We always allocate in quanta of IMMU_PAGESIZE
1077 	 */
1078 	domain->dom_dvma_arena = vmem_create(arena_name,
1079 	    (void *)(uintptr_t)start,	/* start addr */
1080 	    size,			/* size */
1081 	    IMMU_PAGESIZE,		/* quantum */
1082 	    NULL,			/* afunc */
1083 	    NULL,			/* ffunc */
1084 	    NULL,			/* source */
1085 	    0,				/* qcache_max */
1086 	    vmem_flags);
1087 
1088 	if (domain->dom_dvma_arena == NULL) {
1089 		ddi_err(DER_PANIC, rdip,
1090 		    "Failed to allocate DVMA arena(%s) "
1091 		    "for domain ID (%d)", arena_name, domain->dom_did);
1092 		/*NOTREACHED*/
1093 	}
1094 
1095 	mp = mp->ml_next;
1096 	while (mp) {
1097 
1098 		if (mp->ml_address == 0)
1099 			start = MMU_PAGESIZE;
1100 		else
1101 			start = mp->ml_address;
1102 
1103 		if (start + mp->ml_size > maxaddr)
1104 			size = maxaddr - start;
1105 		else
1106 			size = mp->ml_size;
1107 
1108 		ddi_err(DER_VERB, rdip,
1109 		    "iommu: %s: Adding dvma vmem span [0x%" PRIx64
1110 		    " - 0x%" PRIx64 "]", arena_name, start,
1111 		    start + size);
1112 
1113 		vmem_ret = vmem_add(domain->dom_dvma_arena,
1114 		    (void *)(uintptr_t)start, size,  vmem_flags);
1115 
1116 		if (vmem_ret == NULL) {
1117 			ddi_err(DER_PANIC, rdip,
1118 			    "Failed to allocate DVMA arena(%s) "
1119 			    "for domain ID (%d)",
1120 			    arena_name, domain->dom_did);
1121 			/*NOTREACHED*/
1122 		}
1123 		mp = mp->ml_next;
1124 	}
1125 	memlist_read_unlock();
1126 }
1127 
1128 /* ################################### DOMAIN CODE ######################### */
1129 
1130 /*
1131  * Set the domain and domain-dip for a dip
1132  */
1133 static void
1134 set_domain(
1135 	dev_info_t *dip,
1136 	dev_info_t *ddip,
1137 	domain_t *domain)
1138 {
1139 	immu_devi_t *immu_devi;
1140 	domain_t *fdomain;
1141 	dev_info_t *fddip;
1142 
1143 	immu_devi = immu_devi_get(dip);
1144 
1145 	mutex_enter(&(DEVI(dip)->devi_lock));
1146 	fddip = immu_devi->imd_ddip;
1147 	fdomain = immu_devi->imd_domain;
1148 
1149 	if (fddip) {
1150 		ASSERT(fddip == ddip);
1151 	} else {
1152 		immu_devi->imd_ddip = ddip;
1153 	}
1154 
1155 	if (fdomain) {
1156 		ASSERT(fdomain == domain);
1157 	} else {
1158 		immu_devi->imd_domain = domain;
1159 	}
1160 	mutex_exit(&(DEVI(dip)->devi_lock));
1161 }
1162 
1163 /*
1164  * device_domain()
1165  * 	Get domain for a device. The domain may be global in which case it
1166  *	is shared between all IOMMU units. Due to potential AGAW differences
1167  *      between IOMMU units, such global domains *have to be* UNITY mapping
1168  *      domains. Alternatively, the domain may be local to a IOMMU unit.
1169  *	Local domains may be shared or immu_devi, although the
1170  *      scope of sharing
1171  *	is restricted to devices controlled by the IOMMU unit to
1172  *      which the domain
1173  *	belongs. If shared, they (currently) have to be UNITY domains. If
1174  *      immu_devi a domain may be either UNITY or translation (XLATE) domain.
1175  */
1176 static domain_t *
1177 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
1178 {
1179 	dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
1180 	immu_t *immu;
1181 	domain_t *domain;
1182 	dvma_arg_t dvarg = {0};
1183 	int level;
1184 
1185 	*ddipp = NULL;
1186 
1187 	/*
1188 	 * Check if the domain is already set. This is usually true
1189 	 * if this is not the first DVMA transaction.
1190 	 */
1191 	ddip = NULL;
1192 	domain = immu_devi_domain(rdip, &ddip);
1193 	if (domain) {
1194 		*ddipp = ddip;
1195 		return (domain);
1196 	}
1197 
1198 	immu = immu_dvma_get_immu(rdip, immu_flags);
1199 	if (immu == NULL) {
1200 		/*
1201 		 * possible that there is no IOMMU unit for this device
1202 		 * - BIOS bugs are one example.
1203 		 */
1204 		ddi_err(DER_WARN, rdip, "No iommu unit found for device");
1205 		return (NULL);
1206 	}
1207 
1208 	immu_flags |= immu_devi_get(rdip)->imd_dvma_flags;
1209 
1210 	dvarg.dva_rdip = rdip;
1211 	dvarg.dva_ddip = NULL;
1212 	dvarg.dva_domain = NULL;
1213 	dvarg.dva_flags = immu_flags;
1214 	level = 0;
1215 	if (immu_walk_ancestor(rdip, NULL, get_branch_domain,
1216 	    &dvarg, &level, immu_flags) != DDI_SUCCESS) {
1217 		/*
1218 		 * maybe low memory. return error,
1219 		 * so driver tries again later
1220 		 */
1221 		return (NULL);
1222 	}
1223 
1224 	/* should have walked at least 1 dip (i.e. edip) */
1225 	ASSERT(level > 0);
1226 
1227 	ddip = dvarg.dva_ddip;	/* must be present */
1228 	domain = dvarg.dva_domain;	/* may be NULL */
1229 
1230 	/*
1231 	 * We may find the domain during our ancestor walk on any one of our
1232 	 * ancestor dips, If the domain is found then the domain-dip
1233 	 * (i.e. ddip) will also be found in the same immu_devi struct.
1234 	 * The domain-dip is the highest ancestor dip which shares the
1235 	 * same domain with edip.
1236 	 * The domain may or may not be found, but the domain dip must
1237 	 * be found.
1238 	 */
1239 	if (ddip == NULL) {
1240 		ddi_err(DER_MODE, rdip, "Cannot find domain dip for device.");
1241 		return (NULL);
1242 	}
1243 
1244 	/*
1245 	 * Did we find a domain ?
1246 	 */
1247 	if (domain) {
1248 		goto found;
1249 	}
1250 
1251 	/* nope, so allocate */
1252 	domain = domain_create(immu, ddip, rdip, immu_flags);
1253 	if (domain == NULL) {
1254 		return (NULL);
1255 	}
1256 
1257 	/*FALLTHROUGH*/
1258 found:
1259 	/*
1260 	 * We know *domain *is* the right domain, so panic if
1261 	 * another domain is set for either the request-dip or
1262 	 * effective dip.
1263 	 */
1264 	set_domain(ddip, ddip, domain);
1265 	set_domain(rdip, ddip, domain);
1266 
1267 	*ddipp = ddip;
1268 	return (domain);
1269 }
1270 
1271 static void
1272 create_unity_domain(immu_t *immu)
1273 {
1274 	domain_t *domain;
1275 
1276 	/* domain created during boot and always use sleep flag */
1277 	domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);
1278 
1279 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1280 
1281 	domain->dom_did = IMMU_UNITY_DID;
1282 	domain->dom_maptype = IMMU_MAPTYPE_UNITY;
1283 
1284 	domain->dom_immu = immu;
1285 	immu->immu_unity_domain = domain;
1286 
1287 	/*
1288 	 * Setup the domain's initial page table
1289 	 * should never fail.
1290 	 */
1291 	domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1292 	pgtable_zero(domain->dom_pgtable_root);
1293 
1294 	/*
1295 	 * Only map all physical memory in to the unity domain
1296 	 * if passthrough is not supported. If it is supported,
1297 	 * passthrough is set in the context entry instead.
1298 	 */
1299 	if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1300 		map_unity_domain(domain);
1301 
1302 
1303 	/*
1304 	 * put it on the system-wide UNITY domain list
1305 	 */
1306 	mutex_enter(&(immu_domain_lock));
1307 	list_insert_tail(&immu_unity_domain_list, domain);
1308 	mutex_exit(&(immu_domain_lock));
1309 }
1310 
1311 /*
1312  * ddip is the domain-dip - the topmost dip in a domain
1313  * rdip is the requesting-dip - the device which is
1314  * requesting DVMA setup
1315  * if domain is a non-shared domain rdip == ddip
1316  */
1317 static domain_t *
1318 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
1319     immu_flags_t immu_flags)
1320 {
1321 	int kmflags;
1322 	domain_t *domain;
1323 	char mod_hash_name[128];
1324 	immu_devi_t *immu_devi;
1325 	int did;
1326 	immu_dcookie_t dcookies[1] = {0};
1327 	int dcount = 0;
1328 
1329 	immu_devi = immu_devi_get(rdip);
1330 
1331 	/*
1332 	 * First allocate a domainid.
1333 	 * This routine will never fail, since if we run out
1334 	 * of domains the unity domain will be allocated.
1335 	 */
1336 	did = did_alloc(immu, rdip, ddip, immu_flags);
1337 	if (did == IMMU_UNITY_DID) {
1338 		/* domain overflow */
1339 		ASSERT(immu->immu_unity_domain);
1340 		return (immu->immu_unity_domain);
1341 	}
1342 
1343 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1344 	domain = kmem_zalloc(sizeof (domain_t), kmflags);
1345 	if (domain == NULL) {
1346 		ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
1347 		    "structure for device. IOMMU unit: %s", immu->immu_name);
1348 		/*NOTREACHED*/
1349 	}
1350 
1351 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1352 
1353 	(void) snprintf(mod_hash_name, sizeof (mod_hash_name),
1354 	    "immu%s-domain%d-pava-hash", immu->immu_name, did);
1355 
1356 	domain->dom_did = did;
1357 	domain->dom_immu = immu;
1358 	domain->dom_maptype = IMMU_MAPTYPE_XLATE;
1359 	domain->dom_dip = ddip;
1360 
1361 	/*
1362 	 * Create xlate DVMA arena for this domain.
1363 	 */
1364 	create_xlate_arena(immu, domain, rdip, immu_flags);
1365 
1366 	/*
1367 	 * Setup the domain's initial page table
1368 	 */
1369 	domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags);
1370 	if (domain->dom_pgtable_root == NULL) {
1371 		ddi_err(DER_PANIC, rdip, "Failed to alloc root "
1372 		    "pgtable for domain (%d). IOMMU unit: %s",
1373 		    domain->dom_did, immu->immu_name);
1374 		/*NOTREACHED*/
1375 	}
1376 	pgtable_zero(domain->dom_pgtable_root);
1377 
1378 	/*
1379 	 * Since this is a immu unit-specific domain, put it on
1380 	 * the per-immu domain list.
1381 	 */
1382 	mutex_enter(&(immu->immu_lock));
1383 	list_insert_head(&immu->immu_domain_list, domain);
1384 	mutex_exit(&(immu->immu_lock));
1385 
1386 	/*
1387 	 * Also put it on the system-wide xlate domain list
1388 	 */
1389 	mutex_enter(&(immu_domain_lock));
1390 	list_insert_head(&immu_xlate_domain_list, domain);
1391 	mutex_exit(&(immu_domain_lock));
1392 
1393 	bdf_domain_insert(immu_devi, domain);
1394 
1395 #ifdef BUGGY_DRIVERS
1396 	/*
1397 	 * Map page0. Some broken HW/FW access it.
1398 	 */
1399 	dcookies[0].dck_paddr = 0;
1400 	dcookies[0].dck_npages = 1;
1401 	dcount = 1;
1402 	(void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
1403 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
1404 #endif
1405 	return (domain);
1406 }
1407 
1408 /*
1409  * Create domainid arena.
1410  * Domainid 0 is reserved by Vt-d spec and cannot be used by
1411  * system software.
1412  * Domainid 1 is reserved by solaris and used for *all* of the following:
1413  *	as the "uninitialized" domain - For devices not yet controlled
1414  *	by Solaris
1415  *	as the "unity" domain - For devices that will always belong
1416  *	to the unity domain
1417  *	as the "overflow" domain - Used for any new device after we
1418  *	run out of domains
1419  * All of the above domains map into a single domain with
1420  * domainid 1 and UNITY DVMA mapping
1421  * Each IMMU unity has its own unity/uninit/overflow domain
1422  */
1423 static void
1424 did_init(immu_t *immu)
1425 {
1426 	(void) snprintf(immu->immu_did_arena_name,
1427 	    sizeof (immu->immu_did_arena_name),
1428 	    "%s_domainid_arena", immu->immu_name);
1429 
1430 	ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s",
1431 	    immu->immu_did_arena_name);
1432 
1433 	immu->immu_did_arena = vmem_create(
1434 	    immu->immu_did_arena_name,
1435 	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),   /* start addr */
1436 	    immu->immu_max_domains - IMMU_UNITY_DID,
1437 	    1,				/* quantum */
1438 	    NULL,			/* afunc */
1439 	    NULL,			/* ffunc */
1440 	    NULL,			/* source */
1441 	    0,				/* qcache_max */
1442 	    VM_SLEEP);
1443 
1444 	/* Even with SLEEP flag, vmem_create() can fail */
1445 	if (immu->immu_did_arena == NULL) {
1446 		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
1447 		    "IOMMU domainid allocator: %s", immu->immu_name,
1448 		    immu->immu_did_arena_name);
1449 	}
1450 }
1451 
1452 /* #########################  CONTEXT CODE ################################# */
1453 
1454 static void
1455 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
1456     int bus, int devfunc)
1457 {
1458 	pgtable_t *context;
1459 	pgtable_t *pgtable_root;
1460 	hw_rce_t *hw_rent;
1461 	hw_rce_t *hw_cent;
1462 	hw_rce_t *ctxp;
1463 	int sid;
1464 	krw_t rwtype;
1465 	boolean_t fill_root;
1466 	boolean_t fill_ctx;
1467 
1468 	pgtable_root = domain->dom_pgtable_root;
1469 
1470 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1471 	context = *(pgtable_t **)(ctxp + bus);
1472 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;
1473 
1474 	fill_root = B_FALSE;
1475 	fill_ctx = B_FALSE;
1476 
1477 	/* Check the most common case first with reader lock */
1478 	rw_enter(&(immu->immu_ctx_rwlock), RW_READER);
1479 	rwtype = RW_READER;
1480 again:
1481 	if (ROOT_GET_P(hw_rent)) {
1482 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1483 		if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) {
1484 			rw_exit(&(immu->immu_ctx_rwlock));
1485 			return;
1486 		} else {
1487 			fill_ctx = B_TRUE;
1488 		}
1489 	} else {
1490 		fill_root = B_TRUE;
1491 		fill_ctx = B_TRUE;
1492 	}
1493 
1494 	if (rwtype == RW_READER &&
1495 	    rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) {
1496 		rw_exit(&(immu->immu_ctx_rwlock));
1497 		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1498 		rwtype = RW_WRITER;
1499 		goto again;
1500 	}
1501 	rwtype = RW_WRITER;
1502 
1503 	if (fill_root == B_TRUE) {
1504 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1505 		ROOT_SET_P(hw_rent);
1506 		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
1507 	}
1508 
1509 	if (fill_ctx == B_TRUE) {
1510 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1511 		/* need to disable context entry before reprogramming it */
1512 		bzero(hw_cent, sizeof (hw_rce_t));
1513 
1514 		/* flush caches */
1515 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1516 
1517 		sid = ((bus << 8) | devfunc);
1518 		immu_flush_context_fsi(immu, 0, sid, domain->dom_did,
1519 		    &immu->immu_ctx_inv_wait);
1520 
1521 		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
1522 		CONT_SET_DID(hw_cent, domain->dom_did);
1523 		CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1524 		CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1525 		if (domain->dom_did == IMMU_UNITY_DID &&
1526 		    IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1527 			CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1528 		else
1529 			/*LINTED*/
1530 			CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1531 		CONT_SET_P(hw_cent);
1532 		if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) {
1533 			CONT_SET_EH(hw_cent);
1534 			if (immu_use_alh)
1535 				CONT_SET_ALH(hw_cent);
1536 		}
1537 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1538 	}
1539 	rw_exit(&(immu->immu_ctx_rwlock));
1540 }
1541 
1542 static pgtable_t *
1543 context_create(immu_t *immu)
1544 {
1545 	int	bus;
1546 	int	devfunc;
1547 	pgtable_t *root_table;
1548 	pgtable_t *context;
1549 	pgtable_t *pgtable_root;
1550 	hw_rce_t *ctxp;
1551 	hw_rce_t *hw_rent;
1552 	hw_rce_t *hw_cent;
1553 
1554 	/* Allocate a zeroed root table (4K 256b entries) */
1555 	root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1556 	pgtable_zero(root_table);
1557 
1558 	/*
1559 	 * Setup context tables for all possible root table entries.
1560 	 * Start out with unity domains for all entries.
1561 	 */
1562 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1563 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
1564 	for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
1565 		context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1566 		pgtable_zero(context);
1567 		ROOT_SET_P(hw_rent);
1568 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1569 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
1570 		for (devfunc = 0; devfunc < IMMU_CONT_NUM;
1571 		    devfunc++, hw_cent++) {
1572 			pgtable_root =
1573 			    immu->immu_unity_domain->dom_pgtable_root;
1574 			CONT_SET_DID(hw_cent,
1575 			    immu->immu_unity_domain->dom_did);
1576 			CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1577 			CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1578 			if (IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1579 				CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1580 			else
1581 				/*LINTED*/
1582 				CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1583 			CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
1584 			CONT_SET_P(hw_cent);
1585 		}
1586 		immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
1587 		*((pgtable_t **)ctxp) = context;
1588 	}
1589 
1590 	return (root_table);
1591 }
1592 
1593 /*
1594  * Called during rootnex attach, so no locks needed
1595  */
1596 static void
1597 context_init(immu_t *immu)
1598 {
1599 	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
1600 
1601 	immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE);
1602 
1603 	immu_regs_wbf_flush(immu);
1604 
1605 	immu->immu_ctx_root = context_create(immu);
1606 
1607 	immu_regs_set_root_table(immu);
1608 
1609 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1610 	immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait);
1611 	immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait);
1612 	rw_exit(&(immu->immu_ctx_rwlock));
1613 }
1614 
1615 
1616 /*
1617  * Find top pcib
1618  */
1619 static int
1620 find_top_pcib(dev_info_t *dip, void *arg)
1621 {
1622 	immu_devi_t *immu_devi;
1623 	dev_info_t **pcibdipp = (dev_info_t **)arg;
1624 
1625 	immu_devi = immu_devi_get(dip);
1626 
1627 	if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
1628 		*pcibdipp = dip;
1629 	}
1630 
1631 	return (DDI_WALK_CONTINUE);
1632 }
1633 
1634 static int
1635 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
1636     dev_info_t *rdip, immu_flags_t immu_flags)
1637 {
1638 	immu_devi_t *r_immu_devi;
1639 	immu_devi_t *d_immu_devi;
1640 	int r_bus;
1641 	int d_bus;
1642 	int r_devfunc;
1643 	int d_devfunc;
1644 	immu_pcib_t d_pcib_type;
1645 	dev_info_t *pcibdip;
1646 
1647 	if (ddip == NULL || rdip == NULL ||
1648 	    ddip == root_devinfo || rdip == root_devinfo) {
1649 		ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or "
1650 		    "request-dip are NULL or are root devinfo");
1651 		return (DDI_FAILURE);
1652 	}
1653 
1654 	/*
1655 	 * We need to set the context fields
1656 	 * based on what type of device rdip and ddip are.
1657 	 * To do that we need the immu_devi field.
1658 	 * Set the immu_devi field (if not already set)
1659 	 */
1660 	if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
1661 		ddi_err(DER_MODE, rdip,
1662 		    "immu_context_update: failed to set immu_devi for ddip");
1663 		return (DDI_FAILURE);
1664 	}
1665 
1666 	if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
1667 		ddi_err(DER_MODE, rdip,
1668 		    "immu_context_update: failed to set immu_devi for rdip");
1669 		return (DDI_FAILURE);
1670 	}
1671 
1672 	d_immu_devi = immu_devi_get(ddip);
1673 	r_immu_devi = immu_devi_get(rdip);
1674 
1675 	d_bus = d_immu_devi->imd_bus;
1676 	d_devfunc = d_immu_devi->imd_devfunc;
1677 	d_pcib_type = d_immu_devi->imd_pcib_type;
1678 	r_bus = r_immu_devi->imd_bus;
1679 	r_devfunc = r_immu_devi->imd_devfunc;
1680 
1681 	if (rdip == ddip) {
1682 		/* rdip is a PCIE device. set context for it only */
1683 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1684 		    r_devfunc);
1685 #ifdef BUGGY_DRIVERS
1686 	} else if (r_immu_devi == d_immu_devi) {
1687 #ifdef TEST
1688 		ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
1689 		    "0x%lx are identical", rdip, ddip);
1690 #endif
1691 		/* rdip is a PCIE device. set context for it only */
1692 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1693 		    r_devfunc);
1694 #endif
1695 	} else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
1696 		/*
1697 		 * ddip is a PCIE_PCI bridge. Set context for ddip's
1698 		 * secondary bus. If rdip is on ddip's secondary
1699 		 * bus, set context for rdip. Else, set context
1700 		 * for rdip's PCI bridge on ddip's secondary bus.
1701 		 */
1702 		context_set(immu, domain, immu->immu_ctx_root,
1703 		    d_immu_devi->imd_sec, 0);
1704 		if (d_immu_devi->imd_sec == r_bus) {
1705 			context_set(immu, domain, immu->immu_ctx_root,
1706 			    r_bus, r_devfunc);
1707 		} else {
1708 			pcibdip = NULL;
1709 			if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
1710 			    &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
1711 			    pcibdip != NULL) {
1712 				r_immu_devi = immu_devi_get(pcibdip);
1713 				r_bus = r_immu_devi->imd_bus;
1714 				r_devfunc = r_immu_devi->imd_devfunc;
1715 				context_set(immu, domain, immu->immu_ctx_root,
1716 				    r_bus, r_devfunc);
1717 			} else {
1718 				ddi_err(DER_PANIC, rdip, "Failed to find PCI "
1719 				    " bridge for PCI device");
1720 				/*NOTREACHED*/
1721 			}
1722 		}
1723 	} else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
1724 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1725 		    d_devfunc);
1726 	} else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
1727 		/*
1728 		 * ddip is a PCIE device which has a non-PCI device under it
1729 		 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
1730 		 */
1731 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1732 		    d_devfunc);
1733 	} else {
1734 		ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
1735 		    "set iommu context.");
1736 		/*NOTREACHED*/
1737 	}
1738 
1739 	/* XXX do we need a membar_producer() here */
1740 	return (DDI_SUCCESS);
1741 }
1742 
1743 /* ##################### END CONTEXT CODE ################################## */
1744 /* ##################### MAPPING CODE ################################## */
1745 
1746 
#ifdef DEBUG
/*
 * PDTE_check()
 *	DEBUG-only sanity check of a PDE/PTE value. next is the pgtable a
 *	PDE is expected to point to, or NULL when pdte is a leaf PTE that
 *	is expected to map paddr. Logs the first violation found with
 *	DER_MODE and returns B_FALSE; returns B_TRUE if all checks pass.
 */
static boolean_t
PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	/* TM and SNP are only checked for leaf entries */
	boolean_t leaf = (next == NULL) ? B_TRUE : B_FALSE;

	/* The PDTE must be set i.e. present bit is set */
	if (!PDTE_P(pdte)) {
		ddi_err(DER_MODE, rdip, "No present flag");
		return (B_FALSE);
	}

	/*
	 * Just assert to check most significant system software field
	 * (PDTE_SW4) as it is same as present bit and we
	 * checked that above
	 */
	ASSERT(PDTE_SW4(pdte));

	/*
	 * TM field should be clear if not reserved.
	 * non-leaf is always reserved
	 */
	if (leaf == B_TRUE && immu->immu_TM_reserved == B_FALSE &&
	    (PDTE_TM(pdte))) {
		ddi_err(DER_MODE, rdip, "TM flag set");
		return (B_FALSE);
	}

	/* The SW3 field is not used and must be clear */
	if (PDTE_SW3(pdte)) {
		ddi_err(DER_MODE, rdip, "SW3 set");
		return (B_FALSE);
	}

	/*
	 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set
	 */
	if (next != NULL) {
		if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
			ddi_err(DER_MODE, rdip,
			    "PDE paddr mismatch: %lx != %lx",
			    PDTE_PADDR(pdte), next->hwpg_paddr);
			return (B_FALSE);
		}
	} else {
		ASSERT(paddr % IMMU_PAGESIZE == 0);
		if (PDTE_PADDR(pdte) != paddr) {
			ddi_err(DER_MODE, rdip,
			    "PTE paddr mismatch: %lx != %lx",
			    PDTE_PADDR(pdte), paddr);
			return (B_FALSE);
		}
	}

	/*
	 * SNP field should be clear if not reserved.
	 * non-leaf is always reserved
	 */
	if (leaf == B_TRUE && immu->immu_SNP_reserved == B_FALSE &&
	    (PDTE_SNP(pdte))) {
		ddi_err(DER_MODE, rdip, "SNP set");
		return (B_FALSE);
	}

	/* second field available for system software should be clear */
	if (PDTE_SW2(pdte)) {
		ddi_err(DER_MODE, rdip, "SW2 set");
		return (B_FALSE);
	}

	/* Super pages field should be clear */
	if (PDTE_SP(pdte)) {
		ddi_err(DER_MODE, rdip, "SP set");
		return (B_FALSE);
	}

	/*
	 * least significant field available for
	 * system software should be clear
	 */
	if (PDTE_SW1(pdte)) {
		ddi_err(DER_MODE, rdip, "SW1 set");
		return (B_FALSE);
	}

	/* requested permissions must be granted by the entry */
	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
		ddi_err(DER_MODE, rdip, "READ not set");
		return (B_FALSE);
	}

	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
		ddi_err(DER_MODE, rdip, "WRITE not set");
		return (B_FALSE);
	}

	return (B_TRUE);
}
#endif
1849 
1850 /*ARGSUSED*/
1851 static void
1852 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
1853     uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip)
1854 {
1855 	uint64_t npages;
1856 	uint64_t dvma;
1857 	pgtable_t *pgtable;
1858 	hw_pdte_t *hwp;
1859 	hw_pdte_t *shwp;
1860 	int idx;
1861 
1862 	pgtable = xlate->xlt_pgtable;
1863 	idx = xlate->xlt_idx;
1864 
1865 	dvma = *dvma_ptr;
1866 	npages = *npages_ptr;
1867 
1868 	/*
1869 	 * since a caller gets a unique dvma for a physical address,
1870 	 * no other concurrent thread will be writing to the same
1871 	 * PTE even if it has the same paddr. So no locks needed.
1872 	 */
1873 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
1874 
1875 	hwp = shwp;
1876 	for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
1877 		PDTE_CLEAR_P(*hwp);
1878 		dvma += IMMU_PAGESIZE;
1879 		npages--;
1880 	}
1881 
1882 	*dvma_ptr = dvma;
1883 	*npages_ptr = npages;
1884 
1885 	xlate->xlt_idx = idx;
1886 }
1887 
1888 static void
1889 xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels)
1890 {
1891 	int level;
1892 	uint64_t offbits;
1893 
1894 	/*
1895 	 * Skip the first 12 bits which is the offset into
1896 	 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
1897 	 */
1898 	offbits = dvma >> IMMU_PAGESHIFT;
1899 
1900 	/* skip to level 1 i.e. leaf PTE */
1901 	for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
1902 		xlate->xlt_level = level;
1903 		xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
1904 		ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
1905 		xlate->xlt_pgtable = NULL;
1906 		offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
1907 	}
1908 }
1909 
1910 /*
1911  * Read the pgtables
1912  */
1913 static boolean_t
1914 PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels)
1915 {
1916 	pgtable_t *pgtable;
1917 	pgtable_t *next;
1918 	uint_t idx;
1919 
1920 	/* start with highest level pgtable i.e. root */
1921 	xlate += nlevels;
1922 
1923 	if (xlate->xlt_pgtable == NULL) {
1924 		xlate->xlt_pgtable = domain->dom_pgtable_root;
1925 	}
1926 
1927 	for (; xlate->xlt_level > 1; xlate--) {
1928 		idx = xlate->xlt_idx;
1929 		pgtable = xlate->xlt_pgtable;
1930 
1931 		if ((xlate - 1)->xlt_pgtable) {
1932 			continue;
1933 		}
1934 
1935 		/* Lock the pgtable in read mode */
1936 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
1937 
1938 		/*
1939 		 * since we are unmapping, the pgtable should
1940 		 * already point to a leafier pgtable.
1941 		 */
1942 		next = *(pgtable->swpg_next_array + idx);
1943 		(xlate - 1)->xlt_pgtable = next;
1944 		rw_exit(&(pgtable->swpg_rwlock));
1945 		if (next == NULL)
1946 			return (B_FALSE);
1947 	}
1948 
1949 	return (B_TRUE);
1950 }
1951 
1952 static void
1953 immu_fault_walk(void *arg, void *base, size_t len)
1954 {
1955 	uint64_t dvma, start;
1956 
1957 	dvma = *(uint64_t *)arg;
1958 	start = (uint64_t)(uintptr_t)base;
1959 
1960 	if (dvma >= start && dvma < (start + len)) {
1961 		ddi_err(DER_WARN, NULL,
1962 		    "faulting DVMA address is in vmem arena "
1963 		    "(%" PRIx64 "-%" PRIx64 ")",
1964 		    start, start + len);
1965 		*(uint64_t *)arg = ~0ULL;
1966 	}
1967 }
1968 
1969 void
1970 immu_print_fault_info(uint_t sid, uint64_t dvma)
1971 {
1972 	int nlevels;
1973 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
1974 	xlate_t *xlatep;
1975 	hw_pdte_t pte;
1976 	domain_t *domain;
1977 	immu_t *immu;
1978 	uint64_t dvma_arg;
1979 
1980 	if (mod_hash_find(bdf_domain_hash,
1981 	    (void *)(uintptr_t)sid, (void *)&domain) != 0) {
1982 		ddi_err(DER_WARN, NULL,
1983 		    "no domain for faulting SID %08x", sid);
1984 		return;
1985 	}
1986 
1987 	immu = domain->dom_immu;
1988 
1989 	dvma_arg = dvma;
1990 	vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk,
1991 	    (void *)&dvma_arg);
1992 	if (dvma_arg != ~0ULL)
1993 		ddi_err(DER_WARN, domain->dom_dip,
1994 		    "faulting DVMA address is not in vmem arena");
1995 
1996 	nlevels = immu->immu_dvma_nlevels;
1997 	xlate_setup(dvma, xlate, nlevels);
1998 
1999 	if (!PDE_lookup(domain, xlate, nlevels)) {
2000 		ddi_err(DER_WARN, domain->dom_dip,
2001 		    "pte not found in domid %d for faulting addr %" PRIx64,
2002 		    domain->dom_did, dvma);
2003 		return;
2004 	}
2005 
2006 	xlatep = &xlate[1];
2007 	pte = *((hw_pdte_t *)
2008 	    (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx);
2009 
2010 	ddi_err(DER_WARN, domain->dom_dip,
2011 	    "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did,
2012 	    (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte));
2013 }
2014 
2015 /*ARGSUSED*/
2016 static void
2017 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
2018     dev_info_t *rdip, immu_flags_t immu_flags)
2019 {
2020 	hw_pdte_t pte;
2021 
2022 #ifndef DEBUG
2023 	pte = immu->immu_ptemask;
2024 	PDTE_SET_PADDR(pte, paddr);
2025 #else
2026 	pte = *hwp;
2027 
2028 	if (PDTE_P(pte)) {
2029 		if (PDTE_PADDR(pte) != paddr) {
2030 			ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
2031 			    PDTE_PADDR(pte), paddr);
2032 		}
2033 #ifdef BUGGY_DRIVERS
2034 		return;
2035 #else
2036 		goto out;
2037 #endif
2038 	}
2039 
2040 	/* clear TM field if not reserved */
2041 	if (immu->immu_TM_reserved == B_FALSE) {
2042 		PDTE_CLEAR_TM(pte);
2043 	}
2044 
2045 	/* Clear 3rd field for system software  - not used */
2046 	PDTE_CLEAR_SW3(pte);
2047 
2048 	/* Set paddr */
2049 	ASSERT(paddr % IMMU_PAGESIZE == 0);
2050 	PDTE_CLEAR_PADDR(pte);
2051 	PDTE_SET_PADDR(pte, paddr);
2052 
2053 	/*  clear SNP field if not reserved. */
2054 	if (immu->immu_SNP_reserved == B_FALSE) {
2055 		PDTE_CLEAR_SNP(pte);
2056 	}
2057 
2058 	/* Clear SW2 field available for software */
2059 	PDTE_CLEAR_SW2(pte);
2060 
2061 
2062 	/* SP is don't care for PTEs. Clear it for cleanliness */
2063 	PDTE_CLEAR_SP(pte);
2064 
2065 	/* Clear SW1 field available for software */
2066 	PDTE_CLEAR_SW1(pte);
2067 
2068 	/*
2069 	 * Now that we are done writing the PTE
2070 	 * set the "present" flag. Note this present
2071 	 * flag is a bit in the PDE/PTE that the
2072 	 * spec says is available for system software.
2073 	 * This is an implementation detail of Solaris
2074 	 * bare-metal Intel IOMMU.
2075 	 * The present field in a PDE/PTE is not defined
2076 	 * by the Vt-d spec
2077 	 */
2078 
2079 	PDTE_SET_P(pte);
2080 
2081 	pte |= immu->immu_ptemask;
2082 
2083 out:
2084 #endif /* DEBUG */
2085 #ifdef BUGGY_DRIVERS
2086 	PDTE_SET_READ(pte);
2087 	PDTE_SET_WRITE(pte);
2088 #else
2089 	if (immu_flags & IMMU_FLAGS_READ)
2090 		PDTE_SET_READ(pte);
2091 	if (immu_flags & IMMU_FLAGS_WRITE)
2092 		PDTE_SET_WRITE(pte);
2093 #endif /* BUGGY_DRIVERS */
2094 
2095 	*hwp = pte;
2096 }
2097 
2098 /*ARGSUSED*/
2099 static void
2100 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
2101     uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies,
2102     int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
2103 {
2104 	paddr_t paddr;
2105 	uint64_t nvpages;
2106 	uint64_t nppages;
2107 	uint64_t dvma;
2108 	pgtable_t *pgtable;
2109 	hw_pdte_t *hwp;
2110 	hw_pdte_t *shwp;
2111 	int idx, nset;
2112 	int j;
2113 
2114 	pgtable = xlate->xlt_pgtable;
2115 	idx = xlate->xlt_idx;
2116 
2117 	dvma = *dvma_ptr;
2118 	nvpages = *nvpages_ptr;
2119 
2120 	/*
2121 	 * since a caller gets a unique dvma for a physical address,
2122 	 * no other concurrent thread will be writing to the same
2123 	 * PTE even if it has the same paddr. So no locks needed.
2124 	 */
2125 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2126 
2127 	hwp = shwp;
2128 	for (j = dcount - 1; j >= 0; j--) {
2129 		if (nvpages <= dcookies[j].dck_npages)
2130 			break;
2131 		nvpages -= dcookies[j].dck_npages;
2132 	}
2133 
2134 	nppages = nvpages;
2135 	paddr = dcookies[j].dck_paddr +
2136 	    (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE;
2137 
2138 	nvpages = *nvpages_ptr;
2139 	nset = 0;
2140 	for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
2141 		PTE_set_one(immu, hwp, paddr, rdip, immu_flags);
2142 		nset++;
2143 
2144 		ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
2145 		    == B_TRUE);
2146 		nppages--;
2147 		nvpages--;
2148 		paddr += IMMU_PAGESIZE;
2149 		dvma += IMMU_PAGESIZE;
2150 
2151 		if (nppages == 0) {
2152 			j++;
2153 		}
2154 
2155 		if (j == dcount)
2156 			break;
2157 
2158 		if (nppages == 0) {
2159 			nppages = dcookies[j].dck_npages;
2160 			paddr = dcookies[j].dck_paddr;
2161 		}
2162 	}
2163 
2164 	if (nvpages) {
2165 		*dvma_ptr = dvma;
2166 		*nvpages_ptr = nvpages;
2167 	} else {
2168 		*dvma_ptr = 0;
2169 		*nvpages_ptr = 0;
2170 	}
2171 
2172 	xlate->xlt_idx = idx;
2173 }
2174 
2175 /*ARGSUSED*/
2176 static void
2177 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
2178     dev_info_t *rdip, immu_flags_t immu_flags)
2179 {
2180 	hw_pdte_t pde;
2181 
2182 	pde = *hwp;
2183 
2184 	/* if PDE is already set, make sure it is correct */
2185 	if (PDTE_P(pde)) {
2186 		ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
2187 #ifdef BUGGY_DRIVERS
2188 		return;
2189 #else
2190 		goto out;
2191 #endif
2192 	}
2193 
2194 	/* Dont touch SW4, it is the present bit */
2195 
2196 	/* don't touch TM field it is reserved for PDEs */
2197 
2198 	/* 3rd field available for system software is not used */
2199 	PDTE_CLEAR_SW3(pde);
2200 
2201 	/* Set next level pgtable-paddr for PDE */
2202 	PDTE_CLEAR_PADDR(pde);
2203 	PDTE_SET_PADDR(pde, next->hwpg_paddr);
2204 
2205 	/* don't touch SNP field it is reserved for PDEs */
2206 
2207 	/* Clear second field available for system software */
2208 	PDTE_CLEAR_SW2(pde);
2209 
2210 	/* No super pages for PDEs */
2211 	PDTE_CLEAR_SP(pde);
2212 
2213 	/* Clear SW1 for software */
2214 	PDTE_CLEAR_SW1(pde);
2215 
2216 	/*
2217 	 * Now that we are done writing the PDE
2218 	 * set the "present" flag. Note this present
2219 	 * flag is a bit in the PDE/PTE that the
2220 	 * spec says is available for system software.
2221 	 * This is an implementation detail of Solaris
2222 	 * base-metal Intel IOMMU.
2223 	 * The present field in a PDE/PTE is not defined
2224 	 * by the Vt-d spec
2225 	 */
2226 
2227 out:
2228 #ifdef  BUGGY_DRIVERS
2229 	PDTE_SET_READ(pde);
2230 	PDTE_SET_WRITE(pde);
2231 #else
2232 	if (immu_flags & IMMU_FLAGS_READ)
2233 		PDTE_SET_READ(pde);
2234 	if (immu_flags & IMMU_FLAGS_WRITE)
2235 		PDTE_SET_WRITE(pde);
2236 #endif
2237 
2238 	PDTE_SET_P(pde);
2239 
2240 	*hwp = pde;
2241 }
2242 
2243 /*
2244  * Used to set PDEs
2245  */
2246 static boolean_t
2247 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
2248     dev_info_t *rdip, immu_flags_t immu_flags)
2249 {
2250 	pgtable_t *pgtable;
2251 	pgtable_t *new;
2252 	pgtable_t *next;
2253 	hw_pdte_t *hwp;
2254 	int level;
2255 	uint_t idx;
2256 	krw_t rwtype;
2257 	boolean_t set = B_FALSE;
2258 
2259 	/* start with highest level pgtable i.e. root */
2260 	xlate += nlevels;
2261 
2262 	new = NULL;
2263 	xlate->xlt_pgtable = domain->dom_pgtable_root;
2264 	for (level = nlevels; level > 1; level--, xlate--) {
2265 		idx = xlate->xlt_idx;
2266 		pgtable = xlate->xlt_pgtable;
2267 
2268 		/* Lock the pgtable in READ mode first */
2269 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
2270 		rwtype = RW_READER;
2271 again:
2272 		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2273 		next = (pgtable->swpg_next_array)[idx];
2274 
2275 		/*
2276 		 * check if leafier level already has a pgtable
2277 		 * if yes, verify
2278 		 */
2279 		if (next == NULL) {
2280 			if (new == NULL) {
2281 
2282 				IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *,
2283 				    rdip, int, level);
2284 
2285 				new = pgtable_alloc(immu, immu_flags);
2286 				if (new == NULL) {
2287 					ddi_err(DER_PANIC, rdip,
2288 					    "pgtable alloc err");
2289 				}
2290 				pgtable_zero(new);
2291 			}
2292 
2293 			/* Change to a write lock */
2294 			if (rwtype == RW_READER &&
2295 			    rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) {
2296 				rw_exit(&(pgtable->swpg_rwlock));
2297 				rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
2298 				rwtype = RW_WRITER;
2299 				goto again;
2300 			}
2301 			rwtype = RW_WRITER;
2302 			next = new;
2303 			(pgtable->swpg_next_array)[idx] = next;
2304 			new = NULL;
2305 			PDE_set_one(immu, hwp, next, rdip, immu_flags);
2306 			set = B_TRUE;
2307 			rw_downgrade(&(pgtable->swpg_rwlock));
2308 			rwtype = RW_READER;
2309 		}
2310 #ifndef  BUGGY_DRIVERS
2311 		else {
2312 			hw_pdte_t pde = *hwp;
2313 
2314 			/*
2315 			 * If buggy driver we already set permission
2316 			 * READ+WRITE so nothing to do for that case
2317 			 * XXX Check that read writer perms change before
2318 			 * actually setting perms. Also need to hold lock
2319 			 */
2320 			if (immu_flags & IMMU_FLAGS_READ)
2321 				PDTE_SET_READ(pde);
2322 			if (immu_flags & IMMU_FLAGS_WRITE)
2323 				PDTE_SET_WRITE(pde);
2324 
2325 			*hwp = pde;
2326 		}
2327 #endif
2328 
2329 		ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
2330 		    == B_TRUE);
2331 
2332 		(xlate - 1)->xlt_pgtable = next;
2333 		rw_exit(&(pgtable->swpg_rwlock));
2334 	}
2335 
2336 	if (new) {
2337 		pgtable_free(immu, new);
2338 	}
2339 
2340 	return (set);
2341 }
2342 
2343 /*
2344  * dvma_map()
2345  *     map a contiguous range of DVMA pages
2346  *
2347  *     immu: IOMMU unit for which we are generating DVMA cookies
2348  *   domain: domain
2349  *    sdvma: Starting dvma
2350  *   spaddr: Starting paddr
2351  *   npages: Number of pages
2352  *     rdip: requesting device
2353  *     immu_flags: flags
2354  */
2355 static boolean_t
2356 dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages,
2357     immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
2358     immu_flags_t immu_flags)
2359 {
2360 	uint64_t dvma;
2361 	uint64_t n;
2362 	immu_t *immu = domain->dom_immu;
2363 	int nlevels = immu->immu_dvma_nlevels;
2364 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2365 	boolean_t pde_set = B_FALSE;
2366 
2367 	n = snvpages;
2368 	dvma = sdvma;
2369 
2370 	while (n > 0) {
2371 		xlate_setup(dvma, xlate, nlevels);
2372 
2373 		/* Lookup or allocate PGDIRs and PGTABLEs if necessary */
2374 		if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags)
2375 		    == B_TRUE) {
2376 			pde_set = B_TRUE;
2377 		}
2378 
2379 		/* set all matching ptes that fit into this leaf pgtable */
2380 		PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies,
2381 		    dcount, rdip, immu_flags);
2382 	}
2383 
2384 	return (pde_set);
2385 }
2386 
2387 /*
2388  * dvma_unmap()
2389  *   unmap a range of DVMAs
2390  *
2391  * immu: IOMMU unit state
2392  * domain: domain for requesting device
2393  * ddip: domain-dip
2394  * dvma: starting DVMA
2395  * npages: Number of IMMU pages to be unmapped
2396  * rdip: requesting device
2397  */
2398 static void
2399 dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages,
2400     dev_info_t *rdip)
2401 {
2402 	immu_t *immu = domain->dom_immu;
2403 	int nlevels = immu->immu_dvma_nlevels;
2404 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2405 	uint64_t n;
2406 	uint64_t dvma;
2407 
2408 	dvma = sdvma;
2409 	n = snpages;
2410 
2411 	while (n > 0) {
2412 		/* setup the xlate array */
2413 		xlate_setup(dvma, xlate, nlevels);
2414 
2415 		/* just lookup existing pgtables. Should never fail */
2416 		if (!PDE_lookup(domain, xlate, nlevels))
2417 			ddi_err(DER_PANIC, rdip,
2418 			    "PTE not found for addr %" PRIx64,
2419 			    (unsigned long long)dvma);
2420 
2421 		/* clear all matching ptes that fit into this leaf pgtable */
2422 		PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip);
2423 	}
2424 
2425 	/* No need to flush IOTLB after unmap */
2426 }
2427 
2428 static uint64_t
2429 dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf)
2430 {
2431 	uint64_t dvma;
2432 	size_t xsize, align;
2433 	uint64_t minaddr, maxaddr;
2434 
2435 	/* parameters */
2436 	xsize = npages * IMMU_PAGESIZE;
2437 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2438 	minaddr = dma_attr->dma_attr_addr_lo;
2439 	maxaddr = dma_attr->dma_attr_addr_hi + 1;
2440 
2441 	/* handle the rollover cases */
2442 	if (maxaddr < dma_attr->dma_attr_addr_hi) {
2443 		maxaddr = dma_attr->dma_attr_addr_hi;
2444 	}
2445 
2446 	/*
2447 	 * allocate from vmem arena.
2448 	 */
2449 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2450 	    xsize, align, 0, 0, (void *)(uintptr_t)minaddr,
2451 	    (void *)(uintptr_t)maxaddr, kmf);
2452 
2453 	return (dvma);
2454 }
2455 
2456 static void
2457 dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr)
2458 {
2459 	int nlevels;
2460 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp;
2461 	uint64_t dvma, n;
2462 	size_t xsize, align;
2463 	uint64_t minaddr, maxaddr, dmamax;
2464 	int on, npte, pindex;
2465 	hw_pdte_t *shwp;
2466 	immu_t *immu;
2467 	domain_t *domain;
2468 
2469 	/* parameters */
2470 	domain = IMMU_DEVI(rdip)->imd_domain;
2471 	immu = domain->dom_immu;
2472 	nlevels = immu->immu_dvma_nlevels;
2473 	xsize = IMMU_NPREPTES * IMMU_PAGESIZE;
2474 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2475 	minaddr = dma_attr->dma_attr_addr_lo;
2476 	if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG)
2477 		dmamax = dma_attr->dma_attr_seg;
2478 	else
2479 		dmamax = dma_attr->dma_attr_addr_hi;
2480 	maxaddr = dmamax + 1;
2481 
2482 	if (maxaddr < dmamax)
2483 		maxaddr = dmamax;
2484 
2485 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2486 	    xsize, align, 0, dma_attr->dma_attr_seg + 1,
2487 	    (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP);
2488 
2489 	ihp->ihp_predvma = dvma;
2490 	ihp->ihp_npremapped = 0;
2491 	if (dvma == 0)
2492 		return;
2493 
2494 	n = IMMU_NPREPTES;
2495 	pindex = 0;
2496 
2497 	/*
2498 	 * Set up a mapping at address 0, just so that all PDPs get allocated
2499 	 * now. Although this initial mapping should never be used,
2500 	 * explicitly set it to read-only, just to be safe.
2501 	 */
2502 	while (n > 0) {
2503 		xlate_setup(dvma, xlate, nlevels);
2504 
2505 		(void) PDE_set_all(immu, domain, xlate, nlevels, rdip,
2506 		    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2507 
2508 		xlp = &xlate[1];
2509 		shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr)
2510 		    + xlp->xlt_idx;
2511 		on = n;
2512 
2513 		PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie,
2514 		    1, rdip, IMMU_FLAGS_READ);
2515 
2516 		npte = on - n;
2517 
2518 		while (npte > 0) {
2519 			ihp->ihp_preptes[pindex++] = shwp;
2520 #ifdef BUGGY_DRIVERS
2521 			PDTE_CLEAR_WRITE(*shwp);
2522 #endif
2523 			shwp++;
2524 			npte--;
2525 		}
2526 	}
2527 }
2528 
2529 static void
2530 dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp)
2531 {
2532 	domain_t *domain;
2533 
2534 	domain = IMMU_DEVI(rdip)->imd_domain;
2535 
2536 	if (ihp->ihp_predvma != 0) {
2537 		dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip);
2538 		vmem_free(domain->dom_dvma_arena,
2539 		    (void *)(uintptr_t)ihp->ihp_predvma,
2540 		    IMMU_NPREPTES * IMMU_PAGESIZE);
2541 	}
2542 }
2543 
2544 static void
2545 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
2546 {
2547 	uint64_t size = npages * IMMU_PAGESIZE;
2548 
2549 	if (domain->dom_maptype != IMMU_MAPTYPE_XLATE)
2550 		return;
2551 
2552 	vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
2553 }
2554 
2555 static int
2556 immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle,
2557     immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq,
2558     ddi_dma_obj_t *dma_out)
2559 {
2560 	domain_t *domain;
2561 	immu_t *immu;
2562 	immu_flags_t immu_flags;
2563 	ddi_dma_atyp_t buftype;
2564 	ddi_dma_obj_t *dmar_object;
2565 	ddi_dma_attr_t *attrp;
2566 	uint64_t offset, paddr, dvma, sdvma, rwmask;
2567 	size_t npages, npgalloc;
2568 	uint_t psize, size, pcnt, dmax;
2569 	page_t **pparray;
2570 	caddr_t vaddr;
2571 	page_t *page;
2572 	struct as *vas;
2573 	immu_dcookie_t *dcookies;
2574 	int pde_set;
2575 
2576 	domain = IMMU_DEVI(rdip)->imd_domain;
2577 	immu = domain->dom_immu;
2578 	immu_flags = dma_to_immu_flags(dmareq);
2579 
2580 	attrp = &((ddi_dma_impl_t *)handle)->dmai_attr;
2581 
2582 	dmar_object = &dmareq->dmar_object;
2583 	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
2584 	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
2585 	buftype = dmar_object->dmao_type;
2586 	size = dmar_object->dmao_size;
2587 
2588 	IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t,
2589 	    buftype, uint_t, size);
2590 
2591 	dcookies = &ihp->ihp_dcookies[0];
2592 
2593 	pcnt = dmax = 0;
2594 
2595 	/* retrieve paddr, psize, offset from dmareq */
2596 	if (buftype == DMA_OTYP_PAGES) {
2597 		page = dmar_object->dmao_obj.pp_obj.pp_pp;
2598 		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
2599 		    MMU_PAGEOFFSET;
2600 		paddr = pfn_to_pa(page->p_pagenum) + offset;
2601 		psize = MIN((MMU_PAGESIZE - offset), size);
2602 		page = page->p_next;
2603 		vas = dmar_object->dmao_obj.virt_obj.v_as;
2604 	} else {
2605 		if (vas == NULL) {
2606 			vas = &kas;
2607 		}
2608 		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
2609 		if (pparray != NULL) {
2610 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
2611 			psize = MIN((MMU_PAGESIZE - offset), size);
2612 			pcnt++;
2613 		} else {
2614 			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat,
2615 			    vaddr)) + offset;
2616 			psize = MIN(size, (MMU_PAGESIZE - offset));
2617 			vaddr += psize;
2618 		}
2619 	}
2620 
2621 	npgalloc = IMMU_BTOPR(size + offset);
2622 
2623 	if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) {
2624 #ifdef BUGGY_DRIVERS
2625 		rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask;
2626 #else
2627 		rwmask = immu->immu_ptemask;
2628 		if (immu_flags & IMMU_FLAGS_READ)
2629 			rwmask |= PDTE_MASK_R;
2630 		if (immu_flags & IMMU_FLAGS_WRITE)
2631 			rwmask |= PDTE_MASK_W;
2632 #endif
2633 #ifdef DEBUG
2634 		rwmask |= PDTE_MASK_P;
2635 #endif
2636 		sdvma = ihp->ihp_predvma;
2637 		ihp->ihp_npremapped = npgalloc;
2638 		*ihp->ihp_preptes[0] =
2639 		    PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask;
2640 	} else {
2641 		ihp->ihp_npremapped = 0;
2642 		sdvma = dvma_alloc(domain, attrp, npgalloc,
2643 		    dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP);
2644 		if (sdvma == 0)
2645 			return (DDI_DMA_NORESOURCES);
2646 
2647 		dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET);
2648 		dcookies[0].dck_npages = 1;
2649 	}
2650 
2651 	IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc,
2652 	    uint64_t, sdvma);
2653 
2654 	dvma = sdvma;
2655 	pde_set = 0;
2656 	npages = 1;
2657 	size -= psize;
2658 	while (size > 0) {
2659 		/* get the size for this page (i.e. partial or full page) */
2660 		psize = MIN(size, MMU_PAGESIZE);
2661 		if (buftype == DMA_OTYP_PAGES) {
2662 			/* get the paddr from the page_t */
2663 			paddr = pfn_to_pa(page->p_pagenum);
2664 			page = page->p_next;
2665 		} else if (pparray != NULL) {
2666 			/* index into the array of page_t's to get the paddr */
2667 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
2668 			pcnt++;
2669 		} else {
2670 			/* call into the VM to get the paddr */
2671 			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr));
2672 			vaddr += psize;
2673 		}
2674 
2675 		npages++;
2676 
2677 		if (ihp->ihp_npremapped > 0) {
2678 			*ihp->ihp_preptes[npages - 1] =
2679 			    PDTE_PADDR(paddr) | rwmask;
2680 		} else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
2681 			dcookies[dmax].dck_npages++;
2682 		} else {
2683 			/* No, we need a new dcookie */
2684 			if (dmax == (IMMU_NDCK - 1)) {
2685 				/*
2686 				 * Ran out of dcookies. Map them now.
2687 				 */
2688 				if (dvma_map(domain, dvma,
2689 				    npages, dcookies, dmax + 1, rdip,
2690 				    immu_flags))
2691 					pde_set++;
2692 
2693 				IMMU_DPROBE4(immu__dvmamap__early,
2694 				    dev_info_t *, rdip, uint64_t, dvma,
2695 				    uint_t, npages, uint_t, dmax+1);
2696 
2697 				dvma += (npages << IMMU_PAGESHIFT);
2698 				npages = 0;
2699 				dmax = 0;
2700 			} else
2701 				dmax++;
2702 			dcookies[dmax].dck_paddr = paddr;
2703 			dcookies[dmax].dck_npages = 1;
2704 		}
2705 		size -= psize;
2706 	}
2707 
2708 	/*
2709 	 * Finish up, mapping all, or all of the remaining,
2710 	 * physical memory ranges.
2711 	 */
2712 	if (ihp->ihp_npremapped == 0 && npages > 0) {
2713 		IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \
2714 		    uint64_t, dvma, uint_t, npages, uint_t, dmax+1);
2715 
2716 		if (dvma_map(domain, dvma, npages, dcookies,
2717 		    dmax + 1, rdip, immu_flags))
2718 			pde_set++;
2719 	}
2720 
2721 	/* Invalidate the IOTLB */
2722 	immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc,
2723 	    pde_set > 0 ? TLB_IVA_WHOLE : TLB_IVA_LEAF,
2724 	    &ihp->ihp_inv_wait);
2725 
2726 	ihp->ihp_ndvseg = 1;
2727 	ihp->ihp_dvseg[0].dvs_start = sdvma;
2728 	ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size;
2729 
2730 	dma_out->dmao_size = dmar_object->dmao_size;
2731 	dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET;
2732 	dma_out->dmao_obj.dvma_obj.dv_nseg = 1;
2733 	dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0];
2734 	dma_out->dmao_type = DMA_OTYP_DVADDR;
2735 
2736 	return (DDI_DMA_MAPPED);
2737 }
2738 
2739 static int
2740 immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao)
2741 {
2742 	uint64_t dvma, npages;
2743 	domain_t *domain;
2744 	struct dvmaseg *dvs;
2745 
2746 	domain = IMMU_DEVI(rdip)->imd_domain;
2747 	dvs = dmao->dmao_obj.dvma_obj.dv_seg;
2748 
2749 	dvma = dvs[0].dvs_start;
2750 	npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off);
2751 
2752 #ifdef DEBUG
2753 	/* Unmap only in DEBUG mode */
2754 	dvma_unmap(domain, dvma, npages, rdip);
2755 #endif
2756 	dvma_free(domain, dvma, npages);
2757 
2758 	IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages,
2759 	    uint64_t, dvma);
2760 
2761 #ifdef DEBUG
2762 	/*
2763 	 * In the DEBUG case, the unmap was actually done,
2764 	 * but an IOTLB flush was not done. So, an explicit
2765 	 * write back flush is needed.
2766 	 */
2767 	immu_regs_wbf_flush(domain->dom_immu);
2768 #endif
2769 
2770 	return (DDI_SUCCESS);
2771 }
2772 
2773 /* ############################# Functions exported ######################## */
2774 
2775 /*
2776  * setup the DVMA subsystem
2777  * this code runs only for the first IOMMU unit
2778  */
2779 void
2780 immu_dvma_setup(list_t *listp)
2781 {
2782 	immu_t *immu;
2783 	uint_t kval;
2784 	size_t nchains;
2785 
2786 	/* locks */
2787 	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);
2788 
2789 	/* Create lists */
2790 	list_create(&immu_unity_domain_list, sizeof (domain_t),
2791 	    offsetof(domain_t, dom_maptype_node));
2792 	list_create(&immu_xlate_domain_list, sizeof (domain_t),
2793 	    offsetof(domain_t, dom_maptype_node));
2794 
2795 	/* Setup BDF domain hash */
2796 	nchains = 0xff;
2797 	kval = mod_hash_iddata_gen(nchains);
2798 
2799 	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
2800 	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
2801 	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
2802 	    KM_NOSLEEP);
2803 
2804 	immu = list_head(listp);
2805 	for (; immu; immu = list_next(listp, immu)) {
2806 		create_unity_domain(immu);
2807 		did_init(immu);
2808 		context_init(immu);
2809 		immu->immu_dvma_setup = B_TRUE;
2810 	}
2811 }
2812 
2813 /*
2814  * Startup up one DVMA unit
2815  */
2816 void
2817 immu_dvma_startup(immu_t *immu)
2818 {
2819 	if (immu_gfxdvma_enable == B_FALSE &&
2820 	    immu->immu_dvma_gfx_only == B_TRUE) {
2821 		return;
2822 	}
2823 
2824 	/*
2825 	 * DVMA will start once IOMMU is "running"
2826 	 */
2827 	immu->immu_dvma_running = B_TRUE;
2828 }
2829 
2830 /*
2831  * immu_dvma_physmem_update()
2832  *       called when the installed memory on a
2833  *       system increases, to expand domain DVMA
2834  *       for domains with UNITY mapping
2835  */
2836 void
2837 immu_dvma_physmem_update(uint64_t addr, uint64_t size)
2838 {
2839 	uint64_t start;
2840 	uint64_t npages;
2841 	int dcount;
2842 	immu_dcookie_t dcookies[1] = {0};
2843 	domain_t *domain;
2844 
2845 	/*
2846 	 * Just walk the system-wide list of domains with
2847 	 * UNITY mapping. Both the list of *all* domains
2848 	 * and *UNITY* domains is protected by the same
2849 	 * single lock
2850 	 */
2851 	mutex_enter(&immu_domain_lock);
2852 	domain = list_head(&immu_unity_domain_list);
2853 	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
2854 		/*
2855 		 * Nothing to do if the IOMMU supports passthrough.
2856 		 */
2857 		if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap))
2858 			continue;
2859 
2860 		/* There is no vmem_arena for unity domains. Just map it */
2861 		ddi_err(DER_LOG, domain->dom_dip,
2862 		    "iommu: unity-domain: Adding map "
2863 		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);
2864 
2865 		start = IMMU_ROUNDOWN(addr);
2866 		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;
2867 
2868 		dcookies[0].dck_paddr = start;
2869 		dcookies[0].dck_npages = npages;
2870 		dcount = 1;
2871 		(void) dvma_map(domain, start, npages,
2872 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2873 
2874 	}
2875 	mutex_exit(&immu_domain_lock);
2876 }
2877 
2878 int
2879 immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags)
2880 {
2881 	dev_info_t *ddip, *odip;
2882 	immu_t *immu;
2883 	domain_t *domain;
2884 
2885 	odip = rdip;
2886 
2887 	immu = immu_dvma_get_immu(rdip, immu_flags);
2888 	if (immu == NULL) {
2889 		/*
2890 		 * possible that there is no IOMMU unit for this device
2891 		 * - BIOS bugs are one example.
2892 		 */
2893 		ddi_err(DER_WARN, rdip, "No iommu unit found for device");
2894 		return (DDI_DMA_NORESOURCES);
2895 	}
2896 
2897 	/*
2898 	 * redirect isa devices attached under lpc to lpc dip
2899 	 */
2900 	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
2901 		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
2902 		if (rdip == NULL) {
2903 			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2904 			/*NOTREACHED*/
2905 		}
2906 	}
2907 
2908 	/* Reset immu, as redirection can change IMMU */
2909 	immu = NULL;
2910 
2911 	/*
2912 	 * for gart, redirect to the real graphic devinfo
2913 	 */
2914 	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
2915 		rdip = get_gfx_devinfo(rdip);
2916 		if (rdip == NULL) {
2917 			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2918 			/*NOTREACHED*/
2919 		}
2920 	}
2921 
2922 	/*
2923 	 * Setup DVMA domain for the device. This does
2924 	 * work only the first time we do DVMA for a
2925 	 * device.
2926 	 */
2927 	ddip = NULL;
2928 	domain = device_domain(rdip, &ddip, immu_flags);
2929 	if (domain == NULL) {
2930 		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
2931 		return (DDI_DMA_NORESOURCES);
2932 	}
2933 
2934 	immu = domain->dom_immu;
2935 
2936 	/*
2937 	 * If a domain is found, we must also have a domain dip
2938 	 * which is the topmost ancestor dip of rdip that shares
2939 	 * the same domain with rdip.
2940 	 */
2941 	if (domain->dom_did == 0 || ddip == NULL) {
2942 		ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
2943 		    domain->dom_did, ddip);
2944 		return (DDI_DMA_NORESOURCES);
2945 	}
2946 
2947 	if (odip != rdip)
2948 		set_domain(odip, ddip, domain);
2949 
2950 	/*
2951 	 * Update the root and context entries
2952 	 */
2953 	if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
2954 	    != DDI_SUCCESS) {
2955 		ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
2956 		return (DDI_DMA_NORESOURCES);
2957 	}
2958 
2959 	return (DDI_SUCCESS);
2960 }
2961 
2962 int
2963 immu_map_memrange(dev_info_t *rdip, memrng_t *mrng)
2964 {
2965 	immu_dcookie_t dcookies[1] = {0};
2966 	boolean_t pde_set;
2967 	immu_t *immu;
2968 	domain_t *domain;
2969 	immu_inv_wait_t iw;
2970 
2971 	dcookies[0].dck_paddr = mrng->mrng_start;
2972 	dcookies[0].dck_npages = mrng->mrng_npages;
2973 
2974 	domain = IMMU_DEVI(rdip)->imd_domain;
2975 	immu = domain->dom_immu;
2976 
2977 	pde_set = dvma_map(domain, mrng->mrng_start,
2978 	    mrng->mrng_npages, dcookies, 1, rdip,
2979 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2980 
2981 	immu_init_inv_wait(&iw, "memrange", B_TRUE);
2982 
2983 	immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start,
2984 	    mrng->mrng_npages, pde_set == B_TRUE ?
2985 	    TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw);
2986 
2987 	return (DDI_SUCCESS);
2988 }
2989 
2990 immu_devi_t *
2991 immu_devi_get(dev_info_t *rdip)
2992 {
2993 	immu_devi_t *immu_devi;
2994 	volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu);
2995 
2996 	/* Just want atomic reads. No need for lock */
2997 	immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr,
2998 	    0);
2999 	return (immu_devi);
3000 }
3001 
3002 /*ARGSUSED*/
3003 int
3004 immu_hdl_priv_ctor(void *buf, void *arg, int kmf)
3005 {
3006 	immu_hdl_priv_t *ihp;
3007 
3008 	ihp = buf;
3009 	immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE);
3010 
3011 	return (0);
3012 }
3013 
3014 /*
3015  * iommulib interface functions
3016  */
3017 static int
3018 immu_probe(iommulib_handle_t handle, dev_info_t *dip)
3019 {
3020 	immu_devi_t *immu_devi;
3021 	int ret;
3022 
3023 	if (!immu_enable)
3024 		return (DDI_FAILURE);
3025 
3026 	/*
3027 	 * Make sure the device has all the IOMMU structures
3028 	 * initialized. If this device goes through an IOMMU
3029 	 * unit (e.g. this probe function returns success),
3030 	 * this will be called at most N times, with N being
3031 	 * the number of IOMMUs in the system.
3032 	 *
3033 	 * After that, when iommulib_nex_open succeeds,
3034 	 * we can always assume that this device has all
3035 	 * the structures initialized. IOMMU_USED(dip) will
3036 	 * be true. There is no need to find the controlling
3037 	 * IOMMU/domain again.
3038 	 */
3039 	ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP);
3040 	if (ret != DDI_SUCCESS)
3041 		return (ret);
3042 
3043 	immu_devi = IMMU_DEVI(dip);
3044 
3045 	/*
3046 	 * For unity domains, there is no need to call in to
3047 	 * the IOMMU code.
3048 	 */
3049 	if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID)
3050 		return (DDI_FAILURE);
3051 
3052 	if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle))
3053 		return (DDI_SUCCESS);
3054 
3055 	return (DDI_FAILURE);
3056 }
3057 
3058 /*ARGSUSED*/
3059 static int
3060 immu_allochdl(iommulib_handle_t handle,
3061     dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
3062     int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep)
3063 {
3064 	int ret;
3065 	immu_hdl_priv_t *ihp;
3066 	immu_t *immu;
3067 
3068 	ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp,
3069 	    arg, dma_handlep);
3070 	if (ret == DDI_SUCCESS) {
3071 		immu = IMMU_DEVI(rdip)->imd_immu;
3072 
3073 		ihp = kmem_cache_alloc(immu->immu_hdl_cache,
3074 		    waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP);
3075 		if (ihp == NULL) {
3076 			(void) iommulib_iommu_dma_freehdl(dip, rdip,
3077 			    *dma_handlep);
3078 			return (DDI_DMA_NORESOURCES);
3079 		}
3080 
3081 		if (IMMU_DEVI(rdip)->imd_use_premap)
3082 			dvma_prealloc(rdip, ihp, attr);
3083 		else {
3084 			ihp->ihp_npremapped = 0;
3085 			ihp->ihp_predvma = 0;
3086 		}
3087 		ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep,
3088 		    ihp);
3089 	}
3090 	return (ret);
3091 }
3092 
3093 /*ARGSUSED*/
3094 static int
3095 immu_freehdl(iommulib_handle_t handle,
3096     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3097 {
3098 	immu_hdl_priv_t *ihp;
3099 
3100 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3101 	if (ihp != NULL) {
3102 		if (IMMU_DEVI(rdip)->imd_use_premap)
3103 			dvma_prefree(rdip, ihp);
3104 		kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp);
3105 	}
3106 
3107 	return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle));
3108 }
3109 
3110 
3111 /*ARGSUSED*/
3112 static int
3113 immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
3114     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3115     struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep,
3116     uint_t *ccountp)
3117 {
3118 	int ret;
3119 	immu_hdl_priv_t *ihp;
3120 
3121 	ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle,
3122 	    dma_req, cookiep, ccountp);
3123 
3124 	if (ret == DDI_DMA_MAPPED) {
3125 		ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3126 		immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait);
3127 	}
3128 
3129 	return (ret);
3130 }
3131 
3132 /*ARGSUSED*/
3133 static int
3134 immu_unbindhdl(iommulib_handle_t handle,
3135     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3136 {
3137 	return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle));
3138 }
3139 
3140 /*ARGSUSED*/
3141 static int
3142 immu_sync(iommulib_handle_t handle, dev_info_t *dip,
3143     dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off,
3144     size_t len, uint_t cachefl)
3145 {
3146 	return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len,
3147 	    cachefl));
3148 }
3149 
3150 /*ARGSUSED*/
3151 static int
3152 immu_win(iommulib_handle_t handle, dev_info_t *dip,
3153     dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
3154     off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep,
3155     uint_t *ccountp)
3156 {
3157 	return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp,
3158 	    lenp, cookiep, ccountp));
3159 }
3160 
3161 /*ARGSUSED*/
3162 static int
3163 immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
3164     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3165     struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao)
3166 {
3167 	immu_hdl_priv_t *ihp;
3168 
3169 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3170 
3171 	return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao));
3172 }
3173 
3174 /*ARGSUSED*/
3175 static int
3176 immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
3177     dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao)
3178 {
3179 	immu_hdl_priv_t *ihp;
3180 
3181 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3182 	if (ihp->ihp_npremapped > 0)
3183 		return (DDI_SUCCESS);
3184 	return (immu_unmap_dvmaseg(rdip, dmao));
3185 }
3186 
3187 /*ARGSUSED*/
3188 static int
3189 immu_map(iommulib_handle_t handle, dev_info_t *dip,
3190     dev_info_t *rdip, struct ddi_dma_req *dmareq,
3191     ddi_dma_handle_t *dma_handle)
3192 {
3193 	ASSERT(0);
3194 	return (DDI_FAILURE);
3195 }
3196 
3197 /*ARGSUSED*/
3198 static int
3199 immu_mctl(iommulib_handle_t handle, dev_info_t *dip,
3200     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3201     enum ddi_dma_ctlops request, off_t *offp, size_t *lenp,
3202     caddr_t *objpp, uint_t cachefl)
3203 {
3204 	ASSERT(0);
3205 	return (DDI_FAILURE);
3206 }
3207