xref: /titanic_52/usr/src/uts/i86pc/io/immu_dvma.c (revision 9aed162131f1840d0bc1cd0275f4d7144f3690f0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 /*
31  * DVMA code
32  * This file contains Intel IOMMU code that deals with DVMA
33  * i.e. DMA remapping.
34  */
35 
36 #include <sys/sysmacros.h>
37 #include <sys/pcie.h>
38 #include <sys/pci_cfgspace.h>
39 #include <vm/hat_i86.h>
40 #include <sys/memlist.h>
41 #include <sys/acpi/acpi.h>
42 #include <sys/acpica.h>
43 #include <sys/modhash.h>
44 #include <sys/immu.h>
45 
46 #undef	TEST
47 
48 /*
49  * Macros based on PCI spec
50  */
/*
 * The 32-bit dword read at PCI_CONF_REVID holds the revision id in its low
 * byte and the 24-bit class code above it. These macros peel the pieces
 * apart. Each one expands to a plain parenthesized expression (no trailing
 * semicolon) so that it can be used anywhere an expression is allowed.
 */
#define	IMMU_PCI_REV2CLASS(r)   ((r) >> 8)  /* classcode from revid */
#define	IMMU_PCI_CLASS2BASE(c)  ((c) >> 16) /* baseclass from classcode */
#define	IMMU_PCI_CLASS2SUB(c)   (((c) >> 8) & 0xff) /* subclass from class */
54 
/*
 * Evaluates to true when (d).dck_paddr is non-zero and physical address (p)
 * lies exactly one IMMU page after it, i.e. (p) directly follows the page
 * recorded in (d). Note that (d) is evaluated twice.
 */
#define	IMMU_CONTIG_PADDR(d, p) \
	((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p))
57 
/*
 * Argument block handed to the ancestor-walk callbacks in this file
 * (match_lpc(), get_branch_domain()). The callbacks read the inputs and
 * report their findings back through the same structure.
 */
typedef struct dvma_arg {
	immu_t *dva_immu;	/* NOTE(review): not referenced in visible code */
	dev_info_t *dva_rdip;	/* in: dip the walk was started for */
	dev_info_t *dva_ddip;	/* out: domain-dip chosen by the callback */
	domain_t *dva_domain;	/* out: domain found during the walk */
	int dva_level;		/* NOTE(review): not referenced in visible code */
	immu_flags_t dva_flags;	/* in: IMMU_FLAGS_* passed on to helpers */
	list_t *dva_list;	/* in: immu_devi list searched by match_lpc() */
	int dva_error;		/* out: DDI_SUCCESS or DDI_FAILURE */
} dvma_arg_t;
68 
/*
 * Forward declarations for file-local routines that are used before their
 * definitions, plus the externally defined list of installed physical
 * memory spans.
 */
static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
    dev_info_t *rdip, immu_flags_t immu_flags);
static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
    int dev, int func, immu_flags_t immu_flags);
static void destroy_immu_devi(immu_devi_t *immu_devi);
static void dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma,
    uint64_t spaddr, uint64_t npages, dev_info_t *rdip,
    immu_flags_t immu_flags);
extern struct memlist  *phys_install;
78 
79 
80 
81 /* static Globals */
82 
/*
 * Used to setup DMA objects (memory regions)
 * for DMA reads by IOMMU units.
 *
 * The initializer is positional; the trailing comments name the
 * ddi_dma_attr(9S) member each value lands in.
 */
static ddi_dma_attr_t immu_dma_attr = {
	DMA_ATTR_V0,	/* dma_attr_version */
	0U,		/* dma_attr_addr_lo */
	0xffffffffU,	/* dma_attr_addr_hi */
	0xffffffffU,	/* dma_attr_count_max */
	MMU_PAGESIZE, /* MMU page aligned */
	0x1,		/* dma_attr_burstsizes */
	0x1,		/* dma_attr_minxfer */
	0xffffffffU,	/* dma_attr_maxxfer */
	0xffffffffU,	/* dma_attr_seg */
	1,		/* dma_attr_sgllen */
	4,		/* dma_attr_granular */
	0		/* dma_attr_flags */
};
101 
/*
 * Access attributes used when allocating the DMA memory that backs a
 * hardware page table (see pgtable_alloc()): no byte swapping and strict
 * ordering of accesses.
 */
static ddi_device_acc_attr_t immu_acc_attr = {
	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};
107 
108 
/*
 * globals private to this file
 *
 * NOTE(review): presumably immu_domain_lock guards the two domain lists
 * below; their users are outside the visible part of the file - confirm.
 */
static kmutex_t immu_domain_lock;
static list_t immu_unity_domain_list;
static list_t immu_xlate_domain_list;
113 
/*
 * structure used to store idx into each level of the page tables
 *
 * One xlate_t describes a single page-table level: which level it is, the
 * index selected within that level, and the pgtable_t for that level.
 */
typedef struct xlate {
	int xlt_level;		/* page-table level this entry describes */
	uint_t xlt_idx;		/* index into the table at this level */
	pgtable_t *xlt_pgtable;	/* page table for this level */
} xlate_t;
120 
/* 0 is reserved by the VT-d spec. Solaris reserves 1 for the unity domain */
#define	IMMU_UNITY_DID   1

/*
 * Hash from a packed segment/bus/devfunc integer (built in
 * bdf_domain_lookup() and bdf_domain_insert()) to the domain_t for that
 * device.
 */
static mod_hash_t *bdf_domain_hash;
125 
126 static domain_t *
127 bdf_domain_lookup(immu_devi_t *immu_devi)
128 {
129 	domain_t *domain;
130 	int16_t seg = immu_devi->imd_seg;
131 	int16_t bus = immu_devi->imd_bus;
132 	int16_t devfunc = immu_devi->imd_devfunc;
133 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
134 
135 	if (seg < 0 || bus < 0 || devfunc < 0) {
136 		return (NULL);
137 	}
138 
139 	domain = NULL;
140 	if (mod_hash_find(bdf_domain_hash,
141 	    (void *)bdf, (void *)&domain) == 0) {
142 		ASSERT(domain);
143 		ASSERT(domain->dom_did > 0);
144 		return (domain);
145 	} else {
146 		return (NULL);
147 	}
148 }
149 
150 static void
151 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
152 {
153 	int16_t seg = immu_devi->imd_seg;
154 	int16_t bus = immu_devi->imd_bus;
155 	int16_t devfunc = immu_devi->imd_devfunc;
156 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
157 	int r;
158 
159 	if (seg < 0 || bus < 0 || devfunc < 0) {
160 		return;
161 	}
162 
163 	r = mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
164 	ASSERT(r != MH_ERR_DUPLICATE);
165 	ASSERT(r == 0);
166 }
167 
168 static int
169 match_lpc(dev_info_t *pdip, void *arg)
170 {
171 	immu_devi_t *immu_devi;
172 	dvma_arg_t *dvap = (dvma_arg_t *)arg;
173 
174 	ASSERT(dvap->dva_error == DDI_FAILURE);
175 	ASSERT(dvap->dva_ddip == NULL);
176 	ASSERT(dvap->dva_list);
177 
178 	if (list_is_empty(dvap->dva_list)) {
179 		return (DDI_WALK_TERMINATE);
180 	}
181 
182 	immu_devi = list_head(dvap->dva_list);
183 	for (; immu_devi; immu_devi = list_next(dvap->dva_list,
184 	    immu_devi)) {
185 		ASSERT(immu_devi->imd_dip);
186 		if (immu_devi->imd_dip == pdip) {
187 			dvap->dva_ddip = pdip;
188 			dvap->dva_error = DDI_SUCCESS;
189 			return (DDI_WALK_TERMINATE);
190 		}
191 	}
192 
193 	return (DDI_WALK_CONTINUE);
194 }
195 
196 static void
197 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
198 {
199 	list_t *spclist = NULL;
200 	immu_devi_t *immu_devi;
201 
202 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_lock)));
203 
204 	immu_devi = IMMU_DEVI(dip);
205 	if (immu_devi->imd_display == B_TRUE) {
206 		spclist = &(immu->immu_dvma_gfx_list);
207 	} else if (immu_devi->imd_lpc == B_TRUE) {
208 		spclist = &(immu->immu_dvma_lpc_list);
209 	}
210 
211 	if (spclist) {
212 		mutex_enter(&(immu->immu_lock));
213 		list_insert_head(spclist, immu_devi);
214 		mutex_exit(&(immu->immu_lock));
215 	}
216 }
217 
218 /*
219  * Set the immu_devi struct in the immu_devi field of a devinfo node
220  */
221 int
222 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
223 {
224 	int bus, dev, func;
225 	immu_devi_t *new_imd;
226 	immu_devi_t *immu_devi;
227 
228 	ASSERT(root_devinfo);
229 	ASSERT(dip);
230 	ASSERT(dip != root_devinfo);
231 
232 	immu_devi = immu_devi_get(dip);
233 	if (immu_devi != NULL) {
234 		return (DDI_SUCCESS);
235 	}
236 
237 	bus = dev = func = -1;
238 
239 	/*
240 	 * Assume a new immu_devi struct is needed
241 	 */
242 	if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
243 		/*
244 		 * No BDF. Set bus = -1 to indicate this.
245 		 * We still need to create a immu_devi struct
246 		 * though
247 		 */
248 		bus = -1;
249 		dev = 0;
250 		func = 0;
251 	}
252 
253 	new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
254 	if (new_imd  == NULL) {
255 		ddi_err(DER_WARN, dip, "Failed to create immu_devi "
256 		    "structure");
257 		return (DDI_FAILURE);
258 	}
259 
260 	/*
261 	 * Check if some other thread allocated a immu_devi while we
262 	 * didn't own the lock.
263 	 */
264 	mutex_enter(&(DEVI(dip)->devi_lock));
265 	if (IMMU_DEVI(dip) == NULL) {
266 		IMMU_DEVI_SET(dip, new_imd);
267 	} else {
268 		destroy_immu_devi(new_imd);
269 	}
270 	mutex_exit(&(DEVI(dip)->devi_lock));
271 
272 	return (DDI_SUCCESS);
273 }
274 
275 static dev_info_t *
276 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
277 {
278 	dvma_arg_t dvarg = {0};
279 	dvarg.dva_list = &(immu->immu_dvma_lpc_list);
280 	dvarg.dva_rdip = rdip;
281 	dvarg.dva_error = DDI_FAILURE;
282 
283 	if (immu_walk_ancestor(rdip, NULL, match_lpc,
284 	    &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
285 		ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
286 		    "find lpc_devinfo for ISA device");
287 		return (NULL);
288 	}
289 
290 	if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
291 		ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
292 		    "ISA device");
293 		return (NULL);
294 	}
295 
296 	return (dvarg.dva_ddip);
297 }
298 
299 static dev_info_t *
300 get_gfx_devinfo(dev_info_t *rdip)
301 {
302 	immu_t *immu;
303 	immu_devi_t *immu_devi;
304 	list_t *list_gfx;
305 
306 	/*
307 	 * The GFX device may not be on the same IMMU unit as "agpgart"
308 	 * so search globally
309 	 */
310 	immu_devi = NULL;
311 	immu = list_head(&immu_list);
312 	for (; immu; immu = list_next(&immu_list, immu)) {
313 		list_gfx = &(immu->immu_dvma_gfx_list);
314 		if (!list_is_empty(list_gfx)) {
315 			immu_devi = list_head(list_gfx);
316 			break;
317 		}
318 	}
319 
320 	if (immu_devi == NULL) {
321 		ddi_err(DER_WARN, rdip, "IMMU: No GFX device. "
322 		    "Cannot redirect agpgart",
323 		    ddi_node_name(immu_devi->imd_dip));
324 		return (NULL);
325 	}
326 
327 	/* list is not empty we checked above */
328 	ASSERT(immu_devi);
329 	ASSERT(immu_devi->imd_dip);
330 
331 	ddi_err(DER_LOG, rdip, "IMMU: GFX redirect to %s",
332 	    ddi_node_name(immu_devi->imd_dip));
333 
334 	return (immu_devi->imd_dip);
335 }
336 
337 static immu_flags_t
338 dma_to_immu_flags(struct ddi_dma_req *dmareq)
339 {
340 	immu_flags_t flags = 0;
341 
342 	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
343 		flags |= IMMU_FLAGS_SLEEP;
344 	} else {
345 		flags |= IMMU_FLAGS_NOSLEEP;
346 	}
347 
348 	/*
349 	 * Read and write flags need to be reversed.
350 	 * DMA_READ means read from device and write
351 	 * to memory. So DMA read means DVMA write.
352 	 */
353 	if (dmareq->dmar_flags & DDI_DMA_READ)
354 		flags |= IMMU_FLAGS_WRITE;
355 
356 	if (dmareq->dmar_flags & DDI_DMA_WRITE)
357 		flags |= IMMU_FLAGS_READ;
358 
359 #ifdef BUGGY_DRIVERS
360 	/*
361 	 * Some buggy drivers specify neither READ or WRITE
362 	 * For such drivers set both read and write permissions
363 	 */
364 	if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
365 		flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
366 	}
367 #endif
368 
369 	return (flags);
370 }
371 
372 /*
373  * pgtable_alloc()
374  *	alloc a IOMMU pgtable structure.
375  *	This same struct is used for root and context tables as well.
376  *	This routine allocs the f/ollowing:
377  *	- a pgtable_t struct
378  *	- a HW page which holds PTEs/entries which is accesssed by HW
379  *        so we set up DMA for this page
380  *	- a SW page which is only for our bookeeping
381  *        (for example to  hold pointers to the next level pgtable).
382  *        So a simple kmem_alloc suffices
383  */
384 static pgtable_t *
385 pgtable_alloc(immu_t *immu, domain_t *domain, immu_flags_t immu_flags)
386 {
387 	size_t actual_size = 0;
388 	pgtable_t *pgtable;
389 	int (*dmafp)(caddr_t);
390 	caddr_t vaddr;
391 	int kmflags;
392 
393 	/* TO DO cache freed pgtables as it is expensive to create em */
394 	ASSERT(immu);
395 
396 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ?
397 	    KM_NOSLEEP : KM_SLEEP;
398 
399 	dmafp = (immu_flags & IMMU_FLAGS_NOSLEEP) ?
400 	    DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
401 
402 	pgtable = kmem_zalloc(sizeof (pgtable_t), kmflags);
403 	if (pgtable == NULL) {
404 		return (NULL);
405 	}
406 
407 	pgtable->swpg_next_array = kmem_zalloc(IMMU_PAGESIZE, kmflags);
408 	if (pgtable->swpg_next_array == NULL) {
409 		kmem_free(pgtable, sizeof (pgtable_t));
410 		return (NULL);
411 	}
412 
413 	ASSERT(root_devinfo);
414 	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
415 	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
416 		kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
417 		kmem_free(pgtable, sizeof (pgtable_t));
418 		return (NULL);
419 	}
420 
421 	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
422 	    &immu_acc_attr, DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
423 	    dmafp, NULL, &vaddr, &actual_size,
424 	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
425 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
426 		kmem_free((void *)(pgtable->swpg_next_array),
427 		    IMMU_PAGESIZE);
428 		kmem_free(pgtable, sizeof (pgtable_t));
429 		return (NULL);
430 	}
431 
432 	/*
433 	 * Memory allocation failure. Maybe a temporary condition
434 	 * so return error rather than panic, so we can try again
435 	 */
436 	if (actual_size < IMMU_PAGESIZE) {
437 		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
438 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
439 		kmem_free((void *)(pgtable->swpg_next_array),
440 		    IMMU_PAGESIZE);
441 		kmem_free(pgtable, sizeof (pgtable_t));
442 		return (NULL);
443 	}
444 
445 	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
446 	pgtable->hwpg_vaddr = vaddr;
447 
448 	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
449 
450 	/* Use immu directly as domain may be NULL, cant use dom_immu field */
451 	immu_regs_cpu_flush(immu, pgtable->hwpg_vaddr, IMMU_PAGESIZE);
452 
453 	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);
454 
455 	if (domain) {
456 		rw_enter(&(domain->dom_pgtable_rwlock), RW_WRITER);
457 		list_insert_head(&(domain->dom_pglist), pgtable);
458 		rw_exit(&(domain->dom_pgtable_rwlock));
459 	}
460 
461 	return (pgtable);
462 }
463 
464 static void
465 pgtable_free(immu_t *immu, pgtable_t *pgtable, domain_t *domain)
466 {
467 	ASSERT(immu);
468 	ASSERT(pgtable);
469 
470 	if (domain) {
471 		rw_enter(&(domain->dom_pgtable_rwlock), RW_WRITER);
472 		list_remove(&(domain->dom_pglist), pgtable);
473 		rw_exit(&(domain->dom_pgtable_rwlock));
474 	}
475 
476 	/* destroy will panic if lock is held. */
477 	rw_destroy(&(pgtable->swpg_rwlock));
478 
479 	/* Zero out the HW page being freed to catch errors */
480 	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
481 	immu_regs_cpu_flush(immu, pgtable->hwpg_vaddr, IMMU_PAGESIZE);
482 	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
483 	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
484 	/* don't zero out the soft pages for debugging */
485 	if (pgtable->swpg_next_array)
486 		kmem_free((void *)(pgtable->swpg_next_array), IMMU_PAGESIZE);
487 	kmem_free(pgtable, sizeof (pgtable_t));
488 }
489 
490 /*
491  * Function to identify a display device from the PCI class code
492  */
493 static boolean_t
494 device_is_display(uint_t classcode)
495 {
496 	static uint_t disp_classes[] = {
497 		0x000100,
498 		0x030000,
499 		0x030001
500 	};
501 	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
502 
503 	for (i = 0; i < nclasses; i++) {
504 		if (classcode == disp_classes[i])
505 			return (B_TRUE);
506 	}
507 	return (B_FALSE);
508 }
509 
510 /*
511  * Function that determines if device is PCIEX and/or PCIEX bridge
512  */
513 static boolean_t
514 device_is_pciex(
515 	uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
516 {
517 	ushort_t cap;
518 	ushort_t capsp;
519 	ushort_t cap_count = PCI_CAP_MAX_PTR;
520 	ushort_t status;
521 	boolean_t is_pciex = B_FALSE;
522 
523 	*is_pcib = B_FALSE;
524 
525 	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
526 	if (!(status & PCI_STAT_CAP))
527 		return (B_FALSE);
528 
529 	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
530 	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
531 		capsp &= PCI_CAP_PTR_MASK;
532 		cap = pci_getb_func(bus, dev, func, capsp);
533 
534 		if (cap == PCI_CAP_ID_PCI_E) {
535 			status = pci_getw_func(bus, dev, func, capsp + 2);
536 			/*
537 			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
538 			 * for Device/Port Type.
539 			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
540 			 * device is a PCIE2PCI bridge
541 			 */
542 			*is_pcib =
543 			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
544 			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
545 			is_pciex = B_TRUE;
546 		}
547 
548 		capsp = (*pci_getb_func)(bus, dev, func,
549 		    capsp + PCI_CAP_NEXT_PTR);
550 	}
551 
552 	return (is_pciex);
553 }
554 
555 
556 /*
557  * immu_dvma_get_immu()
558  *   get the immu unit structure for a dev_info node
559  */
560 immu_t *
561 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
562 {
563 	immu_devi_t *immu_devi;
564 	immu_t *immu;
565 
566 	/*
567 	 * check if immu unit was already found earlier.
568 	 * If yes, then it will be stashed in immu_devi struct.
569 	 */
570 	immu_devi = immu_devi_get(dip);
571 	if (immu_devi == NULL) {
572 		if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
573 			/*
574 			 * May fail because of low memory. Return error rather
575 			 * than panic as we want driver to rey again later
576 			 */
577 			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
578 			    "No immu_devi structure");
579 			/*NOTREACHED*/
580 		}
581 		immu_devi = immu_devi_get(dip);
582 		ASSERT(immu_devi);
583 	}
584 
585 	mutex_enter(&(DEVI(dip)->devi_lock));
586 	if (immu_devi->imd_immu) {
587 		immu = immu_devi->imd_immu;
588 		mutex_exit(&(DEVI(dip)->devi_lock));
589 		return (immu);
590 	}
591 	mutex_exit(&(DEVI(dip)->devi_lock));
592 
593 	immu = immu_dmar_get_immu(dip);
594 	if (immu == NULL) {
595 		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
596 		    "Cannot find immu_t for device");
597 		/*NOTREACHED*/
598 	}
599 
600 	/*
601 	 * Check if some other thread found immu
602 	 * while lock was not held
603 	 */
604 	immu_devi = immu_devi_get(dip);
605 	/* immu_devi should be present as we found it earlier */
606 	if (immu_devi == NULL) {
607 		ddi_err(DER_PANIC, dip,
608 		    "immu_dvma_get_immu: No immu_devi structure");
609 		/*NOTREACHED*/
610 	}
611 
612 	mutex_enter(&(DEVI(dip)->devi_lock));
613 	if (immu_devi->imd_immu == NULL) {
614 		/* nobody else set it, so we should do it */
615 		immu_devi->imd_immu = immu;
616 		immu_devi_set_spclist(dip, immu);
617 	} else {
618 		/*
619 		 * if some other thread got immu before
620 		 * us, it should get the same results
621 		 */
622 		if (immu_devi->imd_immu != immu) {
623 			ddi_err(DER_PANIC, dip, "Multiple "
624 			    "immu units found for device. Expected (%p), "
625 			    "actual (%p)", (void *)immu,
626 			    (void *)immu_devi->imd_immu);
627 			mutex_exit(&(DEVI(dip)->devi_lock));
628 			/*NOTREACHED*/
629 		}
630 	}
631 	mutex_exit(&(DEVI(dip)->devi_lock));
632 
633 	return (immu);
634 }
635 
636 
637 /* ############################# IMMU_DEVI code ############################ */
638 
639 /*
640  * Allocate a immu_devi structure and initialize it
641  */
642 static immu_devi_t *
643 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
644     immu_flags_t immu_flags)
645 {
646 	uchar_t baseclass, subclass;
647 	uint_t classcode, revclass;
648 	immu_devi_t *immu_devi;
649 	boolean_t pciex = B_FALSE;
650 	int kmflags;
651 	boolean_t is_pcib = B_FALSE;
652 
653 	/* bus ==  -1 indicate non-PCI device (no BDF) */
654 	ASSERT(bus == -1 || bus >= 0);
655 	ASSERT(dev >= 0);
656 	ASSERT(func >= 0);
657 
658 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
659 	immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
660 	if (immu_devi == NULL) {
661 		ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
662 		    "Intel IOMMU immu_devi structure");
663 		return (NULL);
664 	}
665 	immu_devi->imd_dip = rdip;
666 	immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
667 	immu_devi->imd_bus = bus;
668 	immu_devi->imd_pcib_type = IMMU_PCIB_BAD;
669 
670 	if (bus == -1) {
671 		immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
672 		return (immu_devi);
673 	}
674 
675 	immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
676 	immu_devi->imd_sec = 0;
677 	immu_devi->imd_sub = 0;
678 
679 	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
680 
681 	classcode = IMMU_PCI_REV2CLASS(revclass);
682 	baseclass = IMMU_PCI_CLASS2BASE(classcode);
683 	subclass = IMMU_PCI_CLASS2SUB(classcode);
684 
685 	if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {
686 
687 		immu_devi->imd_sec = pci_getb_func(bus, dev, func,
688 		    PCI_BCNF_SECBUS);
689 		immu_devi->imd_sub = pci_getb_func(bus, dev, func,
690 		    PCI_BCNF_SUBBUS);
691 
692 		pciex = device_is_pciex(bus, dev, func, &is_pcib);
693 		if (pciex  == B_TRUE && is_pcib == B_TRUE) {
694 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
695 		} else if (pciex == B_TRUE) {
696 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
697 		} else {
698 			immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
699 		}
700 	} else {
701 		immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
702 	}
703 
704 	/* check for certain special devices */
705 	immu_devi->imd_display = device_is_display(classcode);
706 
707 	immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
708 	    (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;
709 
710 	immu_devi->imd_domain = NULL;
711 
712 	return (immu_devi);
713 }
714 
715 static void
716 destroy_immu_devi(immu_devi_t *immu_devi)
717 {
718 	kmem_free(immu_devi, sizeof (immu_devi_t));
719 }
720 
721 static domain_t *
722 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
723 {
724 	immu_devi_t *immu_devi;
725 	domain_t *domain;
726 	dev_info_t *ddip;
727 
728 	ASSERT(rdip);
729 	ASSERT(ddipp);
730 
731 	*ddipp = NULL;
732 
733 	immu_devi = immu_devi_get(rdip);
734 	if (immu_devi == NULL) {
735 		return (NULL);
736 	}
737 
738 	mutex_enter(&(DEVI(rdip)->devi_lock));
739 	domain = immu_devi->imd_domain;
740 	ddip = immu_devi->imd_ddip;
741 	mutex_exit(&(DEVI(rdip)->devi_lock));
742 
743 	if (domain) {
744 		ASSERT(domain->dom_did > 0);
745 		ASSERT(ddip);
746 		*ddipp = ddip;
747 	}
748 
749 	return (domain);
750 
751 }
752 
753 /* ############################# END IMMU_DEVI code ######################## */
754 /* ############################# DOMAIN code ############################### */
755 
756 /*
757  * This routine always succeeds
758  */
759 static int
760 did_alloc(immu_t *immu, dev_info_t *rdip,
761     dev_info_t *ddip, immu_flags_t immu_flags)
762 {
763 	int did;
764 
765 	ASSERT(immu);
766 	ASSERT(rdip);
767 	ASSERT(rdip != root_devinfo);
768 
769 	did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
770 	    (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);
771 
772 	if (did == 0) {
773 		ASSERT(immu->immu_unity_domain);
774 		ASSERT(immu->immu_unity_domain->dom_did > 0);
775 		ddi_err(DER_WARN, rdip, "device domain-id alloc error"
776 		    " domain-device: %s%d. immu unit is %s. Using "
777 		    "unity domain with domain-id (%d)",
778 		    ddi_driver_name(ddip), ddi_get_instance(ddip),
779 		    immu->immu_name, immu->immu_unity_domain->dom_did);
780 		did = immu->immu_unity_domain->dom_did;
781 	}
782 
783 	return (did);
784 }
785 
786 static int
787 get_branch_domain(dev_info_t *pdip, void *arg)
788 {
789 	immu_devi_t *immu_devi;
790 	domain_t *domain;
791 	dev_info_t *ddip;
792 	immu_t *immu;
793 	dvma_arg_t *dvp = (dvma_arg_t *)arg;
794 
795 	ASSERT(pdip);
796 	ASSERT(dvp);
797 	ASSERT(dvp->dva_rdip);
798 
799 	/*
800 	 * The field dvp->dva_rdip is a work-in-progress
801 	 * and gets updated as we walk up the ancestor
802 	 * tree. The final ddip is set only when we reach
803 	 * the top of the tree. So the dvp->dva_ddip field cannot
804 	 * be relied on until we reach the top of the field.
805 	 */
806 
807 	/* immu_devi may not be set. */
808 	immu_devi = immu_devi_get(pdip);
809 	if (immu_devi == NULL) {
810 		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
811 			dvp->dva_error = DDI_FAILURE;
812 			return (DDI_WALK_TERMINATE);
813 		}
814 	}
815 
816 	immu_devi = immu_devi_get(pdip);
817 	ASSERT(immu_devi);
818 	immu = immu_devi->imd_immu;
819 	if (immu == NULL) {
820 		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
821 		ASSERT(immu);
822 	}
823 
824 	/*
825 	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
826 	 * terminate the walk (since the device under the PCIE bridge
827 	 * is a PCIE device and has an independent entry in the
828 	 * root/context table)
829 	 */
830 	if (dvp->dva_rdip != pdip &&
831 	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
832 		return (DDI_WALK_TERMINATE);
833 	}
834 
835 	/*
836 	 * In order to be a domain-dim, it must be a PCI device i.e.
837 	 * must have valid BDF. This also eliminates the root complex.
838 	 */
839 	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
840 	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
841 		ASSERT(immu_devi->imd_bus >= 0);
842 		ASSERT(immu_devi->imd_devfunc >= 0);
843 		dvp->dva_ddip = pdip;
844 	}
845 
846 	if (immu_devi->imd_display == B_TRUE ||
847 	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
848 		dvp->dva_domain = immu->immu_unity_domain;
849 		/* continue walking to find ddip */
850 		return (DDI_WALK_CONTINUE);
851 	}
852 
853 	mutex_enter(&(DEVI(pdip)->devi_lock));
854 	domain = immu_devi->imd_domain;
855 	ddip = immu_devi->imd_ddip;
856 	mutex_exit(&(DEVI(pdip)->devi_lock));
857 
858 	if (domain && ddip) {
859 		/* if domain is set, it must be the same */
860 		if (dvp->dva_domain) {
861 			ASSERT(domain == dvp->dva_domain);
862 		}
863 		dvp->dva_domain = domain;
864 		dvp->dva_ddip = ddip;
865 		return (DDI_WALK_TERMINATE);
866 	}
867 
868 	/* immu_devi either has both set or both clear */
869 	ASSERT(domain == NULL);
870 	ASSERT(ddip == NULL);
871 
872 	/* Domain may already be set, continue walking so that ddip gets set */
873 	if (dvp->dva_domain) {
874 		return (DDI_WALK_CONTINUE);
875 	}
876 
877 	/* domain is not set in either immu_devi or dvp */
878 	domain = bdf_domain_lookup(immu_devi);
879 	if (domain == NULL) {
880 		return (DDI_WALK_CONTINUE);
881 	}
882 
883 	/* ok, the BDF hash had a domain for this BDF. */
884 
885 	/* Grab lock again to check if something else set immu_devi fields */
886 	mutex_enter(&(DEVI(pdip)->devi_lock));
887 	if (immu_devi->imd_domain != NULL) {
888 		ASSERT(immu_devi->imd_domain == domain);
889 		dvp->dva_domain = domain;
890 	} else {
891 		dvp->dva_domain = domain;
892 	}
893 	mutex_exit(&(DEVI(pdip)->devi_lock));
894 
895 	/*
896 	 * walk upwards until the topmost PCI bridge is found
897 	 */
898 	return (DDI_WALK_CONTINUE);
899 }
900 
901 static void
902 map_unity_domain(domain_t *domain)
903 {
904 	struct memlist *mp;
905 	uint64_t start;
906 	uint64_t npages;
907 
908 	ASSERT(domain);
909 	ASSERT(domain->dom_did == IMMU_UNITY_DID);
910 
911 	/*
912 	 * We call into routines that grab the lock so we should
913 	 * not be called with the lock held. This does not matter
914 	 * much since, no else has a reference to this domain
915 	 */
916 	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));
917 
918 	/*
919 	 * UNITY arenas are a mirror of the physical memory
920 	 * installed on the system.
921 	 */
922 
923 #ifdef BUGGY_DRIVERS
924 	/*
925 	 * Dont skip page0. Some broken HW/FW access it.
926 	 */
927 	dvma_map(domain->dom_immu, domain, 0, 0, 1, NULL,
928 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
929 #endif
930 
931 	memlist_read_lock();
932 
933 	mp = phys_install;
934 
935 	if (mp->ml_address == 0) {
936 		/* since we already mapped page1 above */
937 		start = IMMU_PAGESIZE;
938 	} else {
939 		start = mp->ml_address;
940 	}
941 	npages = mp->ml_size/IMMU_PAGESIZE + 1;
942 
943 	dvma_map(domain->dom_immu, domain, start, start, npages, NULL,
944 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
945 
946 	ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
947 	    " - 0x%" PRIx64 "]", start, start + mp->ml_size);
948 
949 	mp = mp->ml_next;
950 	while (mp) {
951 		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
952 		    " - 0x%" PRIx64 "]", mp->ml_address,
953 		    mp->ml_address + mp->ml_size);
954 
955 		start = mp->ml_address;
956 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
957 
958 		dvma_map(domain->dom_immu, domain, start, start,
959 		    npages, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
960 
961 		mp = mp->ml_next;
962 	}
963 
964 	mp = bios_rsvd;
965 	while (mp) {
966 		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
967 		    " - 0x%" PRIx64 "]", mp->ml_address,
968 		    mp->ml_address + mp->ml_size);
969 
970 		start = mp->ml_address;
971 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
972 
973 		dvma_map(domain->dom_immu, domain, start, start,
974 		    npages, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
975 
976 		mp = mp->ml_next;
977 	}
978 
979 	memlist_read_unlock();
980 }
981 
982 /*
983  * create_xlate_arena()
984  * 	Create the dvma arena for a domain with translation
985  *	mapping
986  */
987 static void
988 create_xlate_arena(immu_t *immu, domain_t *domain,
989     dev_info_t *rdip, immu_flags_t immu_flags)
990 {
991 	char *arena_name;
992 	struct memlist *mp;
993 	int vmem_flags;
994 	uint64_t start;
995 	uint_t mgaw;
996 	uint64_t size;
997 	uint64_t maxaddr;
998 	void *vmem_ret;
999 
1000 	arena_name = domain->dom_dvma_arena_name;
1001 
1002 	/* Note, don't do sizeof (arena_name) - it is just a pointer */
1003 	(void) snprintf(arena_name,
1004 	    sizeof (domain->dom_dvma_arena_name),
1005 	    "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
1006 	    domain->dom_did);
1007 
1008 	vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;
1009 
1010 	/*
1011 	 * No one else has access to this domain.
1012 	 * So no domain locks needed
1013 	 */
1014 	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));
1015 
1016 	/* Restrict mgaddr (max guest addr) to MGAW */
1017 	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
1018 
1019 	/*
1020 	 * To ensure we avoid ioapic and PCI MMIO ranges we just
1021 	 * use the physical memory address range of the system as the
1022 	 * range
1023 	 * Implementing above causes graphics device to barf on
1024 	 * Lenovo X301 hence the toggle switch immu_mmio_safe.
1025 	 */
1026 	maxaddr = ((uint64_t)1 << mgaw);
1027 
1028 	if (immu_mmio_safe == B_FALSE) {
1029 
1030 		start = MMU_PAGESIZE;
1031 		size = maxaddr - start;
1032 
1033 		ddi_err(DER_VERB, rdip,
1034 		    "%s: Creating dvma vmem arena [0x%" PRIx64
1035 		    " - 0x%" PRIx64 "]", arena_name, start, start + size);
1036 
1037 		ASSERT(domain->dom_dvma_arena == NULL);
1038 
1039 		/*
1040 		 * We always allocate in quanta of IMMU_PAGESIZE
1041 		 */
1042 		domain->dom_dvma_arena = vmem_create(arena_name,
1043 		    (void *)(uintptr_t)start,	/* start addr */
1044 		    size,			/* size */
1045 		    IMMU_PAGESIZE,		/* quantum */
1046 		    NULL,			/* afunc */
1047 		    NULL,			/* ffunc */
1048 		    NULL,			/* source */
1049 		    0,				/* qcache_max */
1050 		    vmem_flags);
1051 
1052 		if (domain->dom_dvma_arena == NULL) {
1053 			ddi_err(DER_PANIC, rdip,
1054 			    "Failed to allocate DVMA arena(%s) "
1055 			    "for domain ID (%d)", arena_name, domain->dom_did);
1056 			/*NOTREACHED*/
1057 		}
1058 
1059 	} else {
1060 
1061 		memlist_read_lock();
1062 
1063 		mp = phys_install;
1064 
1065 		if (mp->ml_address == 0)
1066 			start = MMU_PAGESIZE;
1067 		else
1068 			start = mp->ml_address;
1069 
1070 		if (start + mp->ml_size > maxaddr)
1071 			size = maxaddr - start;
1072 		else
1073 			size = mp->ml_size;
1074 
1075 		ddi_err(DER_VERB, rdip,
1076 		    "%s: Creating dvma vmem arena [0x%" PRIx64
1077 		    " - 0x%" PRIx64 "]", arena_name, start, start + size);
1078 
1079 		ASSERT(domain->dom_dvma_arena == NULL);
1080 
1081 		/*
1082 		 * We always allocate in quanta of IMMU_PAGESIZE
1083 		 */
1084 		domain->dom_dvma_arena = vmem_create(arena_name,
1085 		    (void *)(uintptr_t)start,	/* start addr */
1086 		    size,			/* size */
1087 		    IMMU_PAGESIZE,		/* quantum */
1088 		    NULL,			/* afunc */
1089 		    NULL,			/* ffunc */
1090 		    NULL,			/* source */
1091 		    0,				/* qcache_max */
1092 		    vmem_flags);
1093 
1094 		if (domain->dom_dvma_arena == NULL) {
1095 			ddi_err(DER_PANIC, rdip,
1096 			    "Failed to allocate DVMA arena(%s) "
1097 			    "for domain ID (%d)", arena_name, domain->dom_did);
1098 			/*NOTREACHED*/
1099 		}
1100 
1101 		mp = mp->ml_next;
1102 		while (mp) {
1103 
1104 			if (mp->ml_address == 0)
1105 				start = MMU_PAGESIZE;
1106 			else
1107 				start = mp->ml_address;
1108 
1109 			if (start + mp->ml_size > maxaddr)
1110 				size = maxaddr - start;
1111 			else
1112 				size = mp->ml_size;
1113 
1114 			ddi_err(DER_VERB, rdip,
1115 			    "%s: Adding dvma vmem span [0x%" PRIx64
1116 			    " - 0x%" PRIx64 "]", arena_name, start,
1117 			    start + size);
1118 
1119 			vmem_ret = vmem_add(domain->dom_dvma_arena,
1120 			    (void *)(uintptr_t)start, size,  vmem_flags);
1121 
1122 			if (vmem_ret == NULL) {
1123 				ddi_err(DER_PANIC, rdip,
1124 				    "Failed to allocate DVMA arena(%s) "
1125 				    "for domain ID (%d)",
1126 				    arena_name, domain->dom_did);
1127 				/*NOTREACHED*/
1128 			}
1129 
1130 			mp = mp->ml_next;
1131 		}
1132 		memlist_read_unlock();
1133 	}
1134 }
1135 
1136 /* ################################### DOMAIN CODE ######################### */
1137 
1138 /*
1139  * Set the domain and domain-dip for a dip
1140  */
1141 static void
1142 set_domain(
1143 	dev_info_t *dip,
1144 	dev_info_t *ddip,
1145 	domain_t *domain)
1146 {
1147 	immu_devi_t *immu_devi;
1148 	domain_t *fdomain;
1149 	dev_info_t *fddip;
1150 
1151 	ASSERT(dip);
1152 	ASSERT(ddip);
1153 	ASSERT(domain);
1154 	ASSERT(domain->dom_did > 0); /* must be an initialized domain */
1155 
1156 	immu_devi = immu_devi_get(dip);
1157 	ASSERT(immu_devi);
1158 
1159 	mutex_enter(&(DEVI(dip)->devi_lock));
1160 	fddip = immu_devi->imd_ddip;
1161 	fdomain = immu_devi->imd_domain;
1162 
1163 	if (fddip) {
1164 		ASSERT(fddip == ddip);
1165 	} else {
1166 		immu_devi->imd_ddip = ddip;
1167 	}
1168 
1169 	if (fdomain) {
1170 		ASSERT(fdomain == domain);
1171 	} else {
1172 		immu_devi->imd_domain = domain;
1173 	}
1174 	mutex_exit(&(DEVI(dip)->devi_lock));
1175 }
1176 
1177 /*
1178  * device_domain()
1179  * 	Get domain for a device. The domain may be global in which case it
1180  *	is shared between all IOMMU units. Due to potential AGAW differences
1181  *      between IOMMU units, such global domains *have to be* UNITY mapping
1182  *      domains. Alternatively, the domain may be local to a IOMMU unit.
1183  *	Local domains may be shared or immu_devi, although the
1184  *      scope of sharing
1185  *	is restricted to devices controlled by the IOMMU unit to
1186  *      which the domain
1187  *	belongs. If shared, they (currently) have to be UNITY domains. If
1188  *      immu_devi a domain may be either UNITY or translation (XLATE) domain.
1189  */
1190 static domain_t *
1191 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
1192 {
1193 	dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
1194 	dev_info_t *edip; /* effective dip used for finding domain */
1195 	immu_t *immu;
1196 	domain_t *domain;
1197 	dvma_arg_t dvarg = {0};
1198 	int level;
1199 
1200 	ASSERT(rdip);
1201 
1202 	*ddipp = NULL;
1203 
1204 	/*
1205 	 * Check if the domain is already set. This is usually true
1206 	 * if this is not the first DVMA transaction.
1207 	 */
1208 	ddip = NULL;
1209 	domain = immu_devi_domain(rdip, &ddip);
1210 	if (domain) {
1211 		ASSERT(domain->dom_did > 0);
1212 		ASSERT(ddip);
1213 		*ddipp = ddip;
1214 		return (domain);
1215 	}
1216 
1217 	immu = immu_dvma_get_immu(rdip, immu_flags);
1218 	if (immu == NULL) {
1219 		/*
1220 		 * possible that there is no IOMMU unit for this device
1221 		 * - BIOS bugs are one example.
1222 		 */
1223 		return (NULL);
1224 	}
1225 
1226 	/*
1227 	 * Some devices need to be redirected
1228 	 */
1229 	edip = rdip;
1230 
1231 	/*
1232 	 * for isa devices attached under lpc
1233 	 */
1234 	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
1235 		edip = get_lpc_devinfo(immu, rdip, immu_flags);
1236 	}
1237 
1238 	/*
1239 	 * for gart, use the real graphic devinfo
1240 	 */
1241 	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
1242 		edip = get_gfx_devinfo(rdip);
1243 	}
1244 
1245 	if (edip == NULL) {
1246 		ddi_err(DER_MODE, rdip, "IMMU redirect failed");
1247 		return (NULL);
1248 	}
1249 
1250 	dvarg.dva_rdip = edip;
1251 	dvarg.dva_ddip = NULL;
1252 	dvarg.dva_domain = NULL;
1253 	dvarg.dva_flags = immu_flags;
1254 	level = 0;
1255 	if (immu_walk_ancestor(edip, NULL, get_branch_domain,
1256 	    &dvarg, &level, immu_flags) != DDI_SUCCESS) {
1257 		/*
1258 		 * maybe low memory. return error,
1259 		 * so driver tries again later
1260 		 */
1261 		return (NULL);
1262 	}
1263 
1264 	/* should have walked at least 1 dip (i.e. edip) */
1265 	ASSERT(level > 0);
1266 
1267 	ddip = dvarg.dva_ddip;	/* must be present */
1268 	domain = dvarg.dva_domain;	/* may be NULL */
1269 
1270 	/*
1271 	 * We may find the domain during our ancestor walk on any one of our
1272 	 * ancestor dips, If the domain is found then the domain-dip
1273 	 * (i.e. ddip) will also be found in the same immu_devi struct.
1274 	 * The domain-dip is the highest ancestor dip which shares the
1275 	 * same domain with edip.
1276 	 * The domain may or may not be found, but the domain dip must
1277 	 * be found.
1278 	 */
1279 	if (ddip == NULL) {
1280 		ddi_err(DER_MODE, rdip, "Cannot find domain dip for device. "
1281 		    "Effective dip (%s%d)", ddi_driver_name(edip),
1282 		    ddi_get_instance(edip));
1283 		return (NULL);
1284 	}
1285 
1286 	/*
1287 	 * Did we find a domain ?
1288 	 */
1289 	if (domain) {
1290 		goto found;
1291 	}
1292 
1293 	/* nope, so allocate */
1294 	domain = domain_create(immu, ddip, rdip, immu_flags);
1295 	if (domain == NULL) {
1296 		return (NULL);
1297 	}
1298 	ASSERT(domain->dom_did > 0);
1299 
1300 	/*FALLTHROUGH*/
1301 found:
1302 	/*
1303 	 * We know *domain *is* the right domain, so panic if
1304 	 * another domain is set for either the request-dip or
1305 	 * effective dip.
1306 	 */
1307 	set_domain(ddip, ddip, domain);
1308 	set_domain(edip, ddip, domain);
1309 	set_domain(rdip, ddip, domain);
1310 
1311 	*ddipp = ddip;
1312 	return (domain);
1313 }
1314 
1315 static void
1316 create_unity_domain(immu_t *immu)
1317 {
1318 	domain_t *domain;
1319 
1320 	/* 0 is reserved by Vt-d */
1321 	/*LINTED*/
1322 	ASSERT(IMMU_UNITY_DID > 0);
1323 
1324 	/* domain created during boot and always use sleep flag */
1325 	domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);
1326 
1327 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1328 	list_create(&(domain->dom_pglist), sizeof (pgtable_t),
1329 	    offsetof(pgtable_t, swpg_domain_node));
1330 
1331 	domain->dom_did = IMMU_UNITY_DID;
1332 	domain->dom_maptype = IMMU_MAPTYPE_UNITY;
1333 
1334 	domain->dom_immu = immu;
1335 	immu->immu_unity_domain = domain;
1336 
1337 	/*
1338 	 * Setup the domain's initial page table
1339 	 * should never fail.
1340 	 */
1341 	domain->dom_pgtable_root = pgtable_alloc(immu, domain,
1342 	    IMMU_FLAGS_SLEEP);
1343 
1344 	ASSERT(domain->dom_pgtable_root);
1345 
1346 	map_unity_domain(domain);
1347 
1348 	/*
1349 	 * put it on the system-wide UNITY domain list
1350 	 */
1351 	mutex_enter(&(immu_domain_lock));
1352 	list_insert_tail(&immu_unity_domain_list, domain);
1353 	mutex_exit(&(immu_domain_lock));
1354 }
1355 
1356 /*
1357  * ddip is the domain-dip - the topmost dip in a domain
1358  * rdip is the requesting-dip - the device which is
1359  * requesting DVMA setup
1360  * if domain is a non-shared domain rdip == ddip
1361  */
1362 static domain_t *
1363 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
1364     immu_flags_t immu_flags)
1365 {
1366 	int kmflags;
1367 	domain_t *domain;
1368 	char mod_hash_name[128];
1369 	immu_devi_t *immu_devi;
1370 	int did;
1371 
1372 	ASSERT(immu);
1373 	ASSERT(ddip);
1374 
1375 	immu_devi = immu_devi_get(rdip);
1376 
1377 	ASSERT(immu_devi);
1378 
1379 	/*
1380 	 * First allocate a domainid.
1381 	 * This routine will never fail, since if we run out
1382 	 * of domains the unity domain will be allocated.
1383 	 */
1384 	did = did_alloc(immu, rdip, ddip, immu_flags);
1385 	ASSERT(did > 0);
1386 	if (did == IMMU_UNITY_DID) {
1387 		/* domain overflow */
1388 		ASSERT(immu->immu_unity_domain);
1389 		return (immu->immu_unity_domain);
1390 	}
1391 
1392 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1393 	domain = kmem_zalloc(sizeof (domain_t), kmflags);
1394 	if (domain == NULL) {
1395 		ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
1396 		    "structure for device. IOMMU unit: %s", immu->immu_name);
1397 		/*NOTREACHED*/
1398 	}
1399 
1400 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1401 	list_create(&(domain->dom_pglist), sizeof (pgtable_t),
1402 	    offsetof(pgtable_t, swpg_domain_node));
1403 
1404 	(void) snprintf(mod_hash_name, sizeof (mod_hash_name),
1405 	    "immu%s-domain%d-pava-hash", immu->immu_name, did);
1406 
1407 	domain->dom_did = did;
1408 	domain->dom_immu = immu;
1409 	domain->dom_maptype = IMMU_MAPTYPE_XLATE;
1410 
1411 	/*
1412 	 * Create xlate DVMA arena for this domain.
1413 	 */
1414 	create_xlate_arena(immu, domain, rdip, immu_flags);
1415 
1416 	/*
1417 	 * Setup the domain's initial page table
1418 	 */
1419 	domain->dom_pgtable_root = pgtable_alloc(immu, domain, immu_flags);
1420 	if (domain->dom_pgtable_root == NULL) {
1421 		ddi_err(DER_PANIC, rdip, "Failed to alloc root "
1422 		    "pgtable for domain (%d). IOMMU unit: %s",
1423 		    domain->dom_did, immu->immu_name);
1424 		/*NOTREACHED*/
1425 	}
1426 
1427 	/*
1428 	 * Since this is a immu unit-specific domain, put it on
1429 	 * the per-immu domain list.
1430 	 */
1431 	mutex_enter(&(immu->immu_lock));
1432 	list_insert_head(&immu->immu_domain_list, domain);
1433 	mutex_exit(&(immu->immu_lock));
1434 
1435 	/*
1436 	 * Also put it on the system-wide xlate domain list
1437 	 */
1438 	mutex_enter(&(immu_domain_lock));
1439 	list_insert_head(&immu_xlate_domain_list, domain);
1440 	mutex_exit(&(immu_domain_lock));
1441 
1442 	bdf_domain_insert(immu_devi, domain);
1443 
1444 #ifdef BUGGY_DRIVERS
1445 	/*
1446 	 * Map page0. Some broken HW/FW access it.
1447 	 */
1448 	dvma_map(domain->dom_immu, domain, 0, 0, 1, NULL,
1449 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
1450 #endif
1451 
1452 	return (domain);
1453 }
1454 
1455 /*
1456  * Create domainid arena.
1457  * Domainid 0 is reserved by Vt-d spec and cannot be used by
1458  * system software.
1459  * Domainid 1 is reserved by solaris and used for *all* of the following:
1460  *	as the "uninitialized" domain - For devices not yet controlled
1461  *	by Solaris
1462  *	as the "unity" domain - For devices that will always belong
1463  *	to the unity domain
1464  *	as the "overflow" domain - Used for any new device after we
1465  *	run out of domains
1466  * All of the above domains map into a single domain with
1467  * domainid 1 and UNITY DVMA mapping
1468  * Each IMMU unity has its own unity/uninit/overflow domain
1469  */
1470 static void
1471 did_init(immu_t *immu)
1472 {
1473 	(void) snprintf(immu->immu_did_arena_name,
1474 	    sizeof (immu->immu_did_arena_name),
1475 	    "%s_domainid_arena", immu->immu_name);
1476 
1477 	ddi_err(DER_VERB, NULL, "%s: Creating domainid arena %s",
1478 	    immu->immu_name, immu->immu_did_arena_name);
1479 
1480 	immu->immu_did_arena = vmem_create(
1481 	    immu->immu_did_arena_name,
1482 	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),   /* start addr */
1483 	    immu->immu_max_domains - IMMU_UNITY_DID,
1484 	    1,				/* quantum */
1485 	    NULL,			/* afunc */
1486 	    NULL,			/* ffunc */
1487 	    NULL,			/* source */
1488 	    0,				/* qcache_max */
1489 	    VM_SLEEP);
1490 
1491 	/* Even with SLEEP flag, vmem_create() can fail */
1492 	if (immu->immu_did_arena == NULL) {
1493 		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
1494 		    "IOMMU domainid allocator: %s", immu->immu_name,
1495 		    immu->immu_did_arena_name);
1496 	}
1497 }
1498 
1499 /* #########################  CONTEXT CODE ################################# */
1500 
1501 static void
1502 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
1503     int bus, int devfunc)
1504 {
1505 	pgtable_t *context;
1506 	pgtable_t *pgtable_root;
1507 	pgtable_t *unity_pgtable_root;
1508 	hw_rce_t *hw_rent;
1509 	hw_rce_t *hw_cent;
1510 	hw_rce_t *ctxp;
1511 
1512 	ASSERT(rw_write_held(&(immu->immu_ctx_rwlock)));
1513 
1514 	ASSERT(immu);
1515 	ASSERT(domain);
1516 	ASSERT(root_table);
1517 	ASSERT(bus >= 0);
1518 	ASSERT(devfunc >= 0);
1519 	ASSERT(domain->dom_pgtable_root);
1520 
1521 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1522 	context = *(pgtable_t **)(ctxp + bus);
1523 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;
1524 	if (ROOT_GET_P(hw_rent)) {
1525 		ASSERT(ROOT_GET_CONT(hw_rent) == context->hwpg_paddr);
1526 	} else {
1527 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1528 		ROOT_SET_P(hw_rent);
1529 		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
1530 	}
1531 	hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1532 
1533 	pgtable_root = domain->dom_pgtable_root;
1534 	unity_pgtable_root = immu->immu_unity_domain->dom_pgtable_root;
1535 	if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_UNINITED) {
1536 		ASSERT(CONT_GET_P(hw_cent));
1537 		ASSERT(CONT_GET_DID(hw_cent) ==
1538 		    immu->immu_unity_domain->dom_did);
1539 		ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
1540 		ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY);
1541 		ASSERT(CONT_GET_ASR(hw_cent) ==
1542 		    unity_pgtable_root->hwpg_paddr);
1543 
1544 		/* need to disable context entry before reprogramming it */
1545 		bzero(hw_cent, sizeof (hw_rce_t));
1546 
1547 		/* flush caches */
1548 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1549 		ASSERT(rw_write_held(&(immu->immu_ctx_rwlock)));
1550 		immu_regs_context_flush(immu, 0, 0,
1551 		    immu->immu_unity_domain->dom_did, CONTEXT_DSI);
1552 		immu_regs_context_flush(immu, 0, 0, domain->dom_did,
1553 		    CONTEXT_DSI);
1554 		immu_regs_iotlb_flush(immu, immu->immu_unity_domain->dom_did,
1555 		    0, 0, TLB_IVA_WHOLE, IOTLB_DSI);
1556 		immu_regs_iotlb_flush(immu, domain->dom_did, 0, 0,
1557 		    TLB_IVA_WHOLE, IOTLB_DSI);
1558 		immu_regs_wbf_flush(immu);
1559 
1560 		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
1561 		CONT_SET_DID(hw_cent, domain->dom_did);
1562 		CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1563 		CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1564 		/*LINTED*/
1565 		CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1566 		CONT_SET_P(hw_cent);
1567 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1568 	} else {
1569 		ASSERT(CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED);
1570 		ASSERT(CONT_GET_P(hw_cent));
1571 		ASSERT(CONT_GET_DID(hw_cent) == domain->dom_did);
1572 		ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
1573 		ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY);
1574 		ASSERT(CONT_GET_ASR(hw_cent) == pgtable_root->hwpg_paddr);
1575 	}
1576 }
1577 
1578 static pgtable_t *
1579 context_create(immu_t *immu)
1580 {
1581 	int	bus;
1582 	int	devfunc;
1583 	pgtable_t *root_table;
1584 	pgtable_t *context;
1585 	pgtable_t *pgtable_root;
1586 	hw_rce_t *ctxp;
1587 	hw_rce_t *hw_rent;
1588 	hw_rce_t *hw_cent;
1589 
1590 	/* Allocate a zeroed root table (4K 256b entries) */
1591 	root_table = pgtable_alloc(immu, NULL, IMMU_FLAGS_SLEEP);
1592 
1593 	/*
1594 	 * Setup context tables for all possible root table entries.
1595 	 * Start out with unity domains for all entries.
1596 	 */
1597 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1598 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
1599 	for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
1600 		context = pgtable_alloc(immu, NULL, IMMU_FLAGS_SLEEP);
1601 		ASSERT(ROOT_GET_P(hw_rent) == 0);
1602 		ROOT_SET_P(hw_rent);
1603 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1604 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
1605 		for (devfunc = 0; devfunc < IMMU_CONT_NUM;
1606 		    devfunc++, hw_cent++) {
1607 			ASSERT(CONT_GET_P(hw_cent) == 0);
1608 			pgtable_root =
1609 			    immu->immu_unity_domain->dom_pgtable_root;
1610 			CONT_SET_DID(hw_cent,
1611 			    immu->immu_unity_domain->dom_did);
1612 			CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1613 			CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1614 			/*LINTED*/
1615 			CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1616 			CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
1617 			CONT_SET_P(hw_cent);
1618 		}
1619 		immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
1620 		*((pgtable_t **)ctxp) = context;
1621 	}
1622 	immu_regs_cpu_flush(immu, root_table->hwpg_vaddr, IMMU_PAGESIZE);
1623 
1624 	return (root_table);
1625 }
1626 
1627 /*
1628  * Called during rootnex attach, so no locks needed
1629  */
1630 static void
1631 context_init(immu_t *immu)
1632 {
1633 	ASSERT(immu);
1634 	ASSERT(immu->immu_ctx_root == NULL);
1635 
1636 	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
1637 
1638 	immu_regs_wbf_flush(immu);
1639 
1640 	immu->immu_ctx_root = context_create(immu);
1641 
1642 	immu_regs_set_root_table(immu);
1643 
1644 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1645 	immu_regs_context_flush(immu, 0, 0, 0, CONTEXT_GLOBAL);
1646 	rw_exit(&(immu->immu_ctx_rwlock));
1647 	immu_regs_iotlb_flush(immu, 0, 0, 0, 0, IOTLB_GLOBAL);
1648 	immu_regs_wbf_flush(immu);
1649 }
1650 
1651 
1652 /*
1653  * Find top pcib
1654  */
1655 static int
1656 find_top_pcib(dev_info_t *dip, void *arg)
1657 {
1658 	immu_devi_t *immu_devi;
1659 	dev_info_t **pcibdipp = (dev_info_t **)arg;
1660 
1661 	ASSERT(dip);
1662 
1663 	immu_devi = immu_devi_get(dip);
1664 	ASSERT(immu_devi);
1665 
1666 	if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
1667 		*pcibdipp = dip;
1668 	}
1669 
1670 	return (DDI_WALK_CONTINUE);
1671 }
1672 
1673 static int
1674 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
1675     dev_info_t *rdip, immu_flags_t immu_flags)
1676 {
1677 	immu_devi_t *r_immu_devi;
1678 	immu_devi_t *d_immu_devi;
1679 	int r_bus;
1680 	int d_bus;
1681 	int r_devfunc;
1682 	int d_devfunc;
1683 	immu_pcib_t d_pcib_type;
1684 	immu_pcib_t r_pcib_type;
1685 	dev_info_t *pcibdip;
1686 
1687 	if (ddip == NULL || rdip == NULL ||
1688 	    ddip == root_devinfo || rdip == root_devinfo) {
1689 		ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or "
1690 		    "request-dip are NULL or are root devinfo");
1691 		return (DDI_FAILURE);
1692 	}
1693 
1694 	/*
1695 	 * We need to set the context fields
1696 	 * based on what type of device rdip and ddip are.
1697 	 * To do that we need the immu_devi field.
1698 	 * Set the immu_devi field (if not already set)
1699 	 */
1700 	if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
1701 		ddi_err(DER_MODE, rdip,
1702 		    "immu_context_update: failed to set immu_devi for ddip");
1703 		return (DDI_FAILURE);
1704 	}
1705 
1706 	if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
1707 		ddi_err(DER_MODE, rdip,
1708 		    "immu_context_update: failed to set immu_devi for rdip");
1709 		return (DDI_FAILURE);
1710 	}
1711 
1712 	d_immu_devi = immu_devi_get(ddip);
1713 	r_immu_devi = immu_devi_get(rdip);
1714 	ASSERT(r_immu_devi);
1715 	ASSERT(d_immu_devi);
1716 
1717 	d_bus = d_immu_devi->imd_bus;
1718 	d_devfunc = d_immu_devi->imd_devfunc;
1719 	d_pcib_type = d_immu_devi->imd_pcib_type;
1720 	r_bus = r_immu_devi->imd_bus;
1721 	r_devfunc = r_immu_devi->imd_devfunc;
1722 	r_pcib_type = r_immu_devi->imd_pcib_type;
1723 
1724 	ASSERT(d_bus >= 0);
1725 
1726 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1727 	if (rdip == ddip) {
1728 		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT ||
1729 		    d_pcib_type == IMMU_PCIB_PCIE_PCIE);
1730 		ASSERT(r_bus >= 0);
1731 		ASSERT(r_devfunc >= 0);
1732 		/* rdip is a PCIE device. set context for it only */
1733 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1734 		    r_devfunc);
1735 #ifdef BUGGY_DRIVERS
1736 	} else if (r_immu_devi == d_immu_devi) {
1737 #ifdef TEST
1738 		ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
1739 		    "0x%lx are identical", rdip, ddip);
1740 #endif
1741 		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT);
1742 		ASSERT(r_bus >= 0);
1743 		ASSERT(r_devfunc >= 0);
1744 		/* rdip is a PCIE device. set context for it only */
1745 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1746 		    r_devfunc);
1747 #endif
1748 	} else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
1749 		/*
1750 		 * ddip is a PCIE_PCI bridge. Set context for ddip's
1751 		 * secondary bus. If rdip is on ddip's secondary
1752 		 * bus, set context for rdip. Else, set context
1753 		 * for rdip's PCI bridge on ddip's secondary bus.
1754 		 */
1755 		context_set(immu, domain, immu->immu_ctx_root,
1756 		    d_immu_devi->imd_sec, 0);
1757 		if (d_immu_devi->imd_sec == r_bus) {
1758 			context_set(immu, domain, immu->immu_ctx_root,
1759 			    r_bus, r_devfunc);
1760 		} else {
1761 			pcibdip = NULL;
1762 			if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
1763 			    &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
1764 			    pcibdip != NULL) {
1765 				ASSERT(pcibdip);
1766 				r_immu_devi = immu_devi_get(pcibdip);
1767 				ASSERT(d_immu_devi);
1768 				ASSERT(d_immu_devi->imd_pcib_type ==
1769 				    IMMU_PCIB_PCI_PCI);
1770 				r_bus = r_immu_devi->imd_bus;
1771 				r_devfunc = r_immu_devi->imd_devfunc;
1772 				context_set(immu, domain, immu->immu_ctx_root,
1773 				    r_bus, r_devfunc);
1774 			} else {
1775 				ddi_err(DER_PANIC, rdip, "Failed to find PCI "
1776 				    " bridge for PCI device");
1777 				/*NOTREACHED*/
1778 			}
1779 		}
1780 	} else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
1781 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1782 		    d_devfunc);
1783 	} else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
1784 		ASSERT(r_pcib_type == IMMU_PCIB_NOBDF);
1785 		/*
1786 		 * ddip is a PCIE device which has a non-PCI device under it
1787 		 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
1788 		 */
1789 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1790 		    d_devfunc);
1791 	} else {
1792 		ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
1793 		    "set IMMU context.");
1794 		/*NOTREACHED*/
1795 	}
1796 	rw_exit(&(immu->immu_ctx_rwlock));
1797 
1798 	/* XXX do we need a membar_producer() here */
1799 	return (DDI_SUCCESS);
1800 }
1801 
1802 /* ##################### END CONTEXT CODE ################################## */
1803 /* ##################### MAPPING CODE ################################## */
1804 
1805 
1806 static boolean_t
1807 PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
1808     dev_info_t *rdip, immu_flags_t immu_flags)
1809 {
1810 	if (immu_flags & IMMU_FLAGS_PAGE1) {
1811 		ASSERT(paddr == 0);
1812 	} else {
1813 		ASSERT((next == NULL) ^ (paddr == 0));
1814 	}
1815 
1816 	/* The PDTE must be set i.e. present bit is set */
1817 	if (!PDTE_P(pdte)) {
1818 		ddi_err(DER_MODE, rdip, "No present flag");
1819 		return (B_FALSE);
1820 	}
1821 
1822 	/*
1823 	 * Just assert to check most significant system software field
1824 	 * (PDTE_SW4) as it is same as present bit and we
1825 	 * checked that above
1826 	 */
1827 	ASSERT(PDTE_SW4(pdte));
1828 
1829 	/*
1830 	 * TM field should be clear if not reserved.
1831 	 * non-leaf is always reserved
1832 	 */
1833 	if (next == NULL && immu_regs_is_TM_reserved(immu) == B_FALSE) {
1834 		if (PDTE_TM(pdte)) {
1835 			ddi_err(DER_MODE, rdip, "TM flag set");
1836 			return (B_FALSE);
1837 		}
1838 	}
1839 
1840 	/*
1841 	 * The SW3 field is not used and must be clear
1842 	 */
1843 	if (PDTE_SW3(pdte)) {
1844 		ddi_err(DER_MODE, rdip, "SW3 set");
1845 		return (B_FALSE);
1846 	}
1847 
1848 	/*
1849 	 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set
1850 	 */
1851 	if (next == NULL) {
1852 		ASSERT(paddr % IMMU_PAGESIZE == 0);
1853 		if (PDTE_PADDR(pdte) != paddr) {
1854 			ddi_err(DER_MODE, rdip,
1855 			    "PTE paddr mismatch: %lx != %lx",
1856 			    PDTE_PADDR(pdte), paddr);
1857 			return (B_FALSE);
1858 		}
1859 	} else {
1860 		if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
1861 			ddi_err(DER_MODE, rdip,
1862 			    "PDE paddr mismatch: %lx != %lx",
1863 			    PDTE_PADDR(pdte), next->hwpg_paddr);
1864 			return (B_FALSE);
1865 		}
1866 	}
1867 
1868 	/*
1869 	 * SNP field should be clear if not reserved.
1870 	 * non-leaf is always reserved
1871 	 */
1872 	if (next == NULL && immu_regs_is_SNP_reserved(immu) == B_FALSE) {
1873 		if (PDTE_SNP(pdte)) {
1874 			ddi_err(DER_MODE, rdip, "SNP set");
1875 			return (B_FALSE);
1876 		}
1877 	}
1878 
1879 	/* second field available for system software should be clear */
1880 	if (PDTE_SW2(pdte)) {
1881 		ddi_err(DER_MODE, rdip, "SW2 set");
1882 		return (B_FALSE);
1883 	}
1884 
1885 	/* Super pages field should be clear */
1886 	if (PDTE_SP(pdte)) {
1887 		ddi_err(DER_MODE, rdip, "SP set");
1888 		return (B_FALSE);
1889 	}
1890 
1891 	/*
1892 	 * least significant field available for
1893 	 * system software should be clear
1894 	 */
1895 	if (PDTE_SW1(pdte)) {
1896 		ddi_err(DER_MODE, rdip, "SW1 set");
1897 		return (B_FALSE);
1898 	}
1899 
1900 	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
1901 		ddi_err(DER_MODE, rdip, "READ not set");
1902 		return (B_FALSE);
1903 	}
1904 
1905 	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
1906 		ddi_err(DER_MODE, rdip, "WRITE not set");
1907 		return (B_FALSE);
1908 	}
1909 
1910 	return (B_TRUE);
1911 }
1912 /*ARGSUSED*/
1913 static void
1914 PTE_clear_one(immu_t *immu, domain_t *domain, xlate_t *xlate, uint64_t dvma,
1915     dev_info_t *rdip)
1916 {
1917 	hw_pdte_t *hwp;
1918 	pgtable_t *pgtable;
1919 	int idx;
1920 	hw_pdte_t pte;
1921 
1922 	ASSERT(xlate->xlt_level == 1);
1923 
1924 	idx = xlate->xlt_idx;
1925 	pgtable = xlate->xlt_pgtable;
1926 
1927 	ASSERT(dvma % IMMU_PAGESIZE == 0);
1928 	ASSERT(pgtable);
1929 	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
1930 
1931 	/*
1932 	 * since we are clearing PTEs, lock the
1933 	 * page table write mode
1934 	 */
1935 	rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
1936 
1937 	/*
1938 	 * We are at the leaf - next level array must be NULL
1939 	 */
1940 	ASSERT(pgtable->swpg_next_array == NULL);
1941 
1942 	hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
1943 
1944 	pte = *hwp;
1945 	/* Cannot clear a HW PTE that is aleady clear */
1946 	ASSERT(PDTE_P(pte));
1947 	PDTE_CLEAR_P(pte);
1948 	*hwp = pte;
1949 
1950 	/* flush writes to HW PTE table */
1951 	immu_regs_cpu_flush(immu, (caddr_t)hwp, sizeof (hw_pdte_t));
1952 
1953 	rw_exit(&(xlate->xlt_pgtable->swpg_rwlock));
1954 }
1955 
1956 /*ARGSUSED*/
1957 static void
1958 xlate_setup(immu_t *immu, uint64_t dvma, xlate_t *xlate,
1959     int nlevels, dev_info_t *rdip)
1960 {
1961 	int level;
1962 	uint64_t offbits;
1963 
1964 	/* level 0 is never used. Sanity check */
1965 	ASSERT(xlate->xlt_level == 0);
1966 	ASSERT(xlate->xlt_idx == 0);
1967 	ASSERT(xlate->xlt_pgtable == NULL);
1968 	ASSERT(dvma % IMMU_PAGESIZE == 0);
1969 
1970 	/*
1971 	 * Skip the first 12 bits which is the offset into
1972 	 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
1973 	 */
1974 	offbits = dvma >> IMMU_PAGESHIFT;
1975 
1976 	/* skip to level 1 i.e. leaf PTE */
1977 	for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
1978 		xlate->xlt_level = level;
1979 		xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
1980 		ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
1981 		xlate->xlt_pgtable = NULL;
1982 		offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
1983 	}
1984 }
1985 
1986 /*
1987  * Read the pgtables
1988  */
1989 static void
1990 PDE_lookup(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
1991     dev_info_t *rdip)
1992 {
1993 	pgtable_t *pgtable;
1994 	pgtable_t *next;
1995 	hw_pdte_t pde;
1996 	uint_t idx;
1997 
1998 	/* xlate should be at level 0 */
1999 	ASSERT(xlate->xlt_level == 0);
2000 	ASSERT(xlate->xlt_idx == 0);
2001 
2002 	/* start with highest level pgtable i.e. root */
2003 	xlate += nlevels;
2004 	ASSERT(xlate->xlt_level == nlevels);
2005 
2006 	if (xlate->xlt_pgtable == NULL) {
2007 		xlate->xlt_pgtable = domain->dom_pgtable_root;
2008 	}
2009 
2010 	for (; xlate->xlt_level > 1; xlate--) {
2011 
2012 		idx = xlate->xlt_idx;
2013 		pgtable = xlate->xlt_pgtable;
2014 
2015 		ASSERT(pgtable);
2016 		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
2017 
2018 		if ((xlate - 1)->xlt_pgtable) {
2019 			continue;
2020 		}
2021 
2022 		/* xlate's leafier level is not set, set it now */
2023 
2024 		/* Lock the pgtable in read mode */
2025 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
2026 
2027 		/*
2028 		 * since we are unmapping, the pgtable should
2029 		 * already point to a leafier pgtable.
2030 		 */
2031 		next = *(pgtable->swpg_next_array + idx);
2032 		ASSERT(next);
2033 
2034 		pde = *((hw_pdte_t *)(pgtable->hwpg_vaddr) + idx);
2035 
2036 		ASSERT(PDTE_check(immu, pde, next, 0, rdip, 0) == B_TRUE);
2037 
2038 		(xlate - 1)->xlt_pgtable = next;
2039 
2040 		rw_exit(&(pgtable->swpg_rwlock));
2041 	}
2042 }
2043 
2044 static void
2045 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
2046     dev_info_t *rdip, immu_flags_t immu_flags)
2047 {
2048 	hw_pdte_t pte;
2049 
2050 	pte = *hwp;
2051 
2052 	if (PDTE_P(pte)) {
2053 		if (PDTE_PADDR(pte) != paddr) {
2054 			ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
2055 			    PDTE_PADDR(pte), paddr);
2056 		}
2057 		goto out;
2058 	}
2059 
2060 
2061 	/* Don't touch SW4. It is the present field */
2062 
2063 	/* clear TM field if not reserved */
2064 	if (immu_regs_is_TM_reserved(immu) == B_FALSE) {
2065 		PDTE_CLEAR_TM(pte);
2066 	}
2067 
2068 	/* Clear 3rd field for system software  - not used */
2069 	PDTE_CLEAR_SW3(pte);
2070 
2071 	/* Set paddr */
2072 	ASSERT(paddr % IMMU_PAGESIZE == 0);
2073 	PDTE_CLEAR_PADDR(pte);
2074 	PDTE_SET_PADDR(pte, paddr);
2075 
2076 	/*  clear SNP field if not reserved. */
2077 	if (immu_regs_is_SNP_reserved(immu) == B_FALSE) {
2078 		PDTE_CLEAR_SNP(pte);
2079 	}
2080 
2081 	/* Clear SW2 field available for software */
2082 	PDTE_CLEAR_SW2(pte);
2083 
2084 	/* SP is don't care for PTEs. Clear it for cleanliness */
2085 	PDTE_CLEAR_SP(pte);
2086 
2087 	/* Clear SW1 field available for software */
2088 	PDTE_CLEAR_SW1(pte);
2089 
2090 	/*
2091 	 * Now that we are done writing the PTE
2092 	 * set the "present" flag. Note this present
2093 	 * flag is a bit in the PDE/PTE that the
2094 	 * spec says is available for system software.
2095 	 * This is an implementation detail of Solaris
2096 	 * bare-metal Intel IOMMU.
2097 	 * The present field in a PDE/PTE is not defined
2098 	 * by the Vt-d spec
2099 	 */
2100 
2101 	PDTE_SET_P(pte);
2102 
2103 out:
2104 	if (immu_flags & IMMU_FLAGS_READ)
2105 		PDTE_SET_READ(pte);
2106 	if (immu_flags & IMMU_FLAGS_WRITE)
2107 		PDTE_SET_WRITE(pte);
2108 
2109 #ifdef BUGGY_DRIVERS
2110 	PDTE_SET_READ(pte);
2111 	PDTE_SET_WRITE(pte);
2112 #endif
2113 
2114 	*hwp = pte;
2115 }
2116 
2117 /*ARGSUSED*/
2118 static void
2119 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
2120     uint64_t *dvma_ptr, paddr_t *paddr_ptr, uint64_t *npages_ptr,
2121     dev_info_t *rdip, immu_flags_t immu_flags)
2122 {
2123 	paddr_t paddr;
2124 	uint64_t npages;
2125 	uint64_t dvma;
2126 	pgtable_t *pgtable;
2127 	hw_pdte_t *hwp;
2128 	hw_pdte_t *shwp;
2129 	int idx;
2130 
2131 	ASSERT(xlate->xlt_level == 1);
2132 
2133 	pgtable = xlate->xlt_pgtable;
2134 	idx = xlate->xlt_idx;
2135 
2136 	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
2137 	ASSERT(pgtable);
2138 
2139 	dvma = *dvma_ptr;
2140 	paddr = *paddr_ptr;
2141 	npages = *npages_ptr;
2142 
2143 	ASSERT(paddr || (immu_flags & IMMU_FLAGS_PAGE1));
2144 	ASSERT(dvma || (immu_flags & IMMU_FLAGS_PAGE1));
2145 	ASSERT(npages);
2146 
2147 	/*
2148 	 * since we are setting PTEs, lock the page table in
2149 	 * write mode
2150 	 */
2151 	rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
2152 
2153 	/*
2154 	 * we are at the leaf pgtable - no further levels.
2155 	 * The next_array field should be NULL.
2156 	 */
2157 	ASSERT(pgtable->swpg_next_array == NULL);
2158 
2159 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2160 
2161 	hwp = shwp;
2162 	for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
2163 
2164 		PTE_set_one(immu, hwp, paddr, rdip, immu_flags);
2165 
2166 		ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
2167 		    == B_TRUE);
2168 
2169 		paddr += IMMU_PAGESIZE;
2170 		dvma += IMMU_PAGESIZE;
2171 		npages--;
2172 	}
2173 
2174 	/* flush writes to HW PTE table */
2175 	immu_regs_cpu_flush(immu, (caddr_t)shwp, (hwp - shwp) *
2176 	    sizeof (hw_pdte_t));
2177 
2178 	*dvma_ptr = dvma;
2179 	*paddr_ptr = paddr;
2180 	*npages_ptr = npages;
2181 	xlate->xlt_idx = idx;
2182 
2183 	rw_exit(&(pgtable->swpg_rwlock));
2184 }
2185 
2186 /*ARGSUSED*/
2187 static void
2188 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
2189     dev_info_t *rdip, immu_flags_t immu_flags)
2190 {
2191 	hw_pdte_t pde;
2192 
2193 	pde = *hwp;
2194 
2195 	/* if PDE is already set, make sure it is correct */
2196 	if (PDTE_P(pde)) {
2197 		ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
2198 		goto out;
2199 	}
2200 
2201 	/* Dont touch SW4, it is the present bit */
2202 
2203 	/* don't touch TM field it is reserved for PDEs */
2204 
2205 	/* 3rd field available for system software is not used */
2206 	PDTE_CLEAR_SW3(pde);
2207 
2208 	/* Set next level pgtable-paddr for PDE */
2209 	ASSERT(next->hwpg_paddr % IMMU_PAGESIZE == 0);
2210 	PDTE_CLEAR_PADDR(pde);
2211 	PDTE_SET_PADDR(pde, next->hwpg_paddr);
2212 
2213 	/* don't touch SNP field it is reserved for PDEs */
2214 
2215 	/* Clear second field available for system software */
2216 	PDTE_CLEAR_SW2(pde);
2217 
2218 	/* No super pages for PDEs */
2219 	PDTE_CLEAR_SP(pde);
2220 
2221 	/* Clear SW1 for software */
2222 	PDTE_CLEAR_SW1(pde);
2223 
2224 	/*
2225 	 * Now that we are done writing the PDE
2226 	 * set the "present" flag. Note this present
2227 	 * flag is a bit in the PDE/PTE that the
2228 	 * spec says is available for system software.
2229 	 * This is an implementation detail of Solaris
2230 	 * base-metal Intel IOMMU.
2231 	 * The present field in a PDE/PTE is not defined
2232 	 * by the Vt-d spec
2233 	 */
2234 out:
2235 
2236 	if (immu_flags & IMMU_FLAGS_READ)
2237 		PDTE_SET_READ(pde);
2238 	if (immu_flags & IMMU_FLAGS_WRITE)
2239 		PDTE_SET_WRITE(pde);
2240 
2241 #ifdef  BUGGY_DRIVERS
2242 	PDTE_SET_READ(pde);
2243 	PDTE_SET_WRITE(pde);
2244 #endif
2245 
2246 	PDTE_SET_P(pde);
2247 
2248 	*hwp = pde;
2249 
2250 	immu_regs_cpu_flush(immu, (caddr_t)hwp, sizeof (hw_pdte_t));
2251 }
2252 
2253 /*
2254  * Used to set PDEs
2255  */
2256 static void
2257 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
2258     dev_info_t *rdip, immu_flags_t immu_flags)
2259 {
2260 	pgtable_t *pgtable;
2261 	pgtable_t *new;
2262 	pgtable_t *next;
2263 	hw_pdte_t *hwp;
2264 	int level;
2265 	uint_t idx;
2266 
2267 	/* xlate should be at level 0 */
2268 	ASSERT(xlate->xlt_level == 0);
2269 	ASSERT(xlate->xlt_idx == 0);
2270 
2271 	/* start with highest level pgtable i.e. root */
2272 	xlate += nlevels;
2273 	ASSERT(xlate->xlt_level == nlevels);
2274 
2275 	new = NULL;
2276 	xlate->xlt_pgtable = domain->dom_pgtable_root;
2277 	for (level = nlevels; level > 1; level--, xlate--) {
2278 
2279 		ASSERT(xlate->xlt_level == level);
2280 
2281 		idx = xlate->xlt_idx;
2282 		pgtable = xlate->xlt_pgtable;
2283 
2284 		ASSERT(pgtable);
2285 		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
2286 
2287 		/* speculative alloc */
2288 		if (new == NULL) {
2289 			new = pgtable_alloc(immu, domain, immu_flags);
2290 			if (new == NULL) {
2291 				ddi_err(DER_PANIC, rdip, "pgtable alloc err");
2292 			}
2293 
2294 		}
2295 
2296 		/* Alway lock the pgtable in write mode */
2297 		rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
2298 
2299 		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2300 
2301 		ASSERT(pgtable->swpg_next_array);
2302 
2303 		next = (pgtable->swpg_next_array)[idx];
2304 
2305 		/*
2306 		 * check if leafier level already has a pgtable
2307 		 * if yes, verify
2308 		 */
2309 		if (next == NULL) {
2310 			next = new;
2311 			new = NULL;
2312 			if (level == 2) {
2313 				/* leaf cannot have next_array */
2314 				kmem_free(next->swpg_next_array,
2315 				    IMMU_PAGESIZE);
2316 				next->swpg_next_array = NULL;
2317 			}
2318 			(pgtable->swpg_next_array)[idx] = next;
2319 			PDE_set_one(immu, hwp, next, rdip, immu_flags);
2320 		} else {
2321 			hw_pdte_t pde = *hwp;
2322 
2323 			if (immu_flags & IMMU_FLAGS_READ)
2324 				PDTE_SET_READ(pde);
2325 			if (immu_flags & IMMU_FLAGS_WRITE)
2326 				PDTE_SET_WRITE(pde);
2327 
2328 #ifdef  BUGGY_DRIVERS
2329 /* If buggy driver we already set permission READ+WRITE so nothing to do */
2330 #endif
2331 
2332 			*hwp = pde;
2333 		}
2334 
2335 		ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
2336 		    == B_TRUE);
2337 
2338 		(xlate - 1)->xlt_pgtable = next;
2339 
2340 		rw_exit(&(pgtable->swpg_rwlock));
2341 	}
2342 
2343 	if (new) {
2344 		pgtable_free(immu, new, domain);
2345 	}
2346 }
2347 
2348 /*
2349  * dvma_map()
2350  *     map a contiguous range of DVMA pages
2351  *
2352  *     immu: IOMMU unit for which we are generating DVMA cookies
2353  *   domain: domain
2354  *    sdvma: Starting dvma
2355  *   spaddr: Starting paddr
2356  *   npages: Number of pages
2357  *     rdip: requesting device
2358  *     immu_flags: flags
2359  */
2360 static void
2361 dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t spaddr,
2362     uint64_t npages, dev_info_t *rdip, immu_flags_t immu_flags)
2363 {
2364 	uint64_t dvma;
2365 	paddr_t paddr;
2366 	uint64_t n;
2367 	int nlevels = immu->immu_dvma_nlevels;
2368 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2369 
2370 	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
2371 	ASSERT(spaddr % IMMU_PAGESIZE == 0);
2372 	ASSERT(sdvma % IMMU_PAGESIZE == 0);
2373 	ASSERT(npages);
2374 
2375 	n = npages;
2376 	dvma = sdvma;
2377 	paddr = spaddr;
2378 
2379 	while (n > 0) {
2380 		xlate_setup(immu, dvma, xlate, nlevels, rdip);
2381 
2382 		/* Lookup or allocate PGDIRs and PGTABLEs if necessary */
2383 		PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags);
2384 
2385 		/* set all matching ptes that fit into this leaf pgtable */
2386 		PTE_set_all(immu, domain, &xlate[1], &dvma, &paddr, &n, rdip,
2387 		    immu_flags);
2388 	}
2389 }
2390 
2391 /*
2392  * dvma_unmap()
2393  *   unmap a range of DVMAs
2394  *
2395  * immu: IOMMU unit state
2396  * domain: domain for requesting device
2397  * ddip: domain-dip
2398  * dvma: starting DVMA
2399  * npages: Number of IMMU pages to be unmapped
2400  * rdip: requesting device
2401  */
2402 static void
2403 dvma_unmap(immu_t *immu, domain_t *domain, uint64_t dvma, uint64_t snpages,
2404     dev_info_t *rdip)
2405 {
2406 	int nlevels = immu->immu_dvma_nlevels;
2407 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2408 	uint64_t npages;
2409 
2410 	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
2411 	ASSERT(dvma != 0);
2412 	ASSERT(dvma % IMMU_PAGESIZE == 0);
2413 	ASSERT(snpages);
2414 
2415 	for (npages = snpages; npages > 0; npages--) {
2416 		/* setup the xlate array */
2417 		xlate_setup(immu, dvma, xlate, nlevels, rdip);
2418 
2419 		/* just lookup existing pgtables. Should never fail */
2420 		PDE_lookup(immu, domain, xlate, nlevels, rdip);
2421 
2422 		/* XXX should be more efficient - batch clear */
2423 		PTE_clear_one(immu, domain, &xlate[1], dvma, rdip);
2424 
2425 		dvma += IMMU_PAGESIZE;
2426 	}
2427 }
2428 
2429 static uint64_t
2430 dvma_alloc(ddi_dma_impl_t *hp, domain_t *domain, uint_t npages)
2431 {
2432 	ddi_dma_attr_t *dma_attr;
2433 	uint64_t dvma;
2434 	size_t xsize, align, nocross;
2435 	uint64_t minaddr, maxaddr;
2436 
2437 	ASSERT(domain->dom_maptype != IMMU_MAPTYPE_UNITY);
2438 
2439 	/* shotcuts */
2440 	dma_attr = &(hp->dmai_attr);
2441 
2442 	/* parameters */
2443 	xsize = npages * IMMU_PAGESIZE;
2444 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2445 	nocross = (size_t)(dma_attr->dma_attr_seg + 1);
2446 	minaddr = dma_attr->dma_attr_addr_lo;
2447 	maxaddr = dma_attr->dma_attr_addr_hi + 1;
2448 
2449 	/* handle the rollover cases */
2450 	if (maxaddr < dma_attr->dma_attr_addr_hi) {
2451 		maxaddr = dma_attr->dma_attr_addr_hi;
2452 	}
2453 
2454 	/*
2455 	 * allocate from vmem arena.
2456 	 */
2457 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2458 	    xsize, align, 0, nocross, (void *)(uintptr_t)minaddr,
2459 	    (void *)(uintptr_t)maxaddr, VM_NOSLEEP);
2460 
2461 	ASSERT(dvma);
2462 	ASSERT(dvma >= minaddr);
2463 	ASSERT(dvma + xsize - 1 < maxaddr);
2464 
2465 	return (dvma);
2466 }
2467 
2468 static void
2469 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
2470 {
2471 	uint64_t size = npages * IMMU_PAGESIZE;
2472 
2473 	ASSERT(domain);
2474 	ASSERT(domain->dom_did > 0);
2475 	ASSERT(dvma);
2476 	ASSERT(npages);
2477 
2478 	if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) {
2479 		ASSERT(domain->dom_maptype == IMMU_MAPTYPE_UNITY);
2480 		return;
2481 	}
2482 
2483 	vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
2484 }
2485 /*ARGSUSED*/
2486 static void
2487 cookie_free(rootnex_dma_t *dma, immu_t *immu, domain_t *domain,
2488     dev_info_t *ddip, dev_info_t *rdip)
2489 {
2490 	int i;
2491 	uint64_t dvma;
2492 	uint64_t npages;
2493 	dvcookie_t  *dvcookies = dma->dp_dvcookies;
2494 	uint64_t dvmax =  dma->dp_dvmax;
2495 
2496 	ASSERT(dma->dp_max_cookies);
2497 	ASSERT(dma->dp_max_dcookies);
2498 	ASSERT(dma->dp_dvmax < dma->dp_max_cookies);
2499 	ASSERT(dma->dp_dmax < dma->dp_max_dcookies);
2500 
2501 	for (i = 0; i <= dvmax; i++) {
2502 		dvma = dvcookies[i].dvck_dvma;
2503 		npages = dvcookies[i].dvck_npages;
2504 		dvma_unmap(immu, domain, dvma, npages, rdip);
2505 		dvma_free(domain, dvma, npages);
2506 	}
2507 
2508 	kmem_free(dma->dp_dvcookies, sizeof (dvcookie_t) * dma->dp_max_cookies);
2509 	dma->dp_dvcookies = NULL;
2510 	kmem_free(dma->dp_dcookies, sizeof (dcookie_t) * dma->dp_max_dcookies);
2511 	dma->dp_dcookies = NULL;
2512 	if (dma->dp_need_to_free_cookie == B_TRUE) {
2513 		kmem_free(dma->dp_cookies, sizeof (ddi_dma_cookie_t) *
2514 		    dma->dp_max_cookies);
2515 		dma->dp_dcookies = NULL;
2516 		dma->dp_need_to_free_cookie = B_FALSE;
2517 	}
2518 
2519 	dma->dp_max_cookies = 0;
2520 	dma->dp_max_dcookies = 0;
2521 	dma->dp_cookie_size = 0;
2522 	dma->dp_dvmax = 0;
2523 	dma->dp_dmax = 0;
2524 }
2525 
2526 /*
2527  * cookie_alloc()
2528  */
2529 static int
2530 cookie_alloc(rootnex_dma_t *dma, struct ddi_dma_req *dmareq,
2531     ddi_dma_attr_t *attr, uint_t prealloc)
2532 {
2533 	int kmflag;
2534 	rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo);
2535 	dvcookie_t *dvcookies = dma->dp_dvcookies;
2536 	dcookie_t *dcookies = dma->dp_dcookies;
2537 	ddi_dma_cookie_t *cookies = dma->dp_cookies;
2538 	uint64_t max_cookies;
2539 	uint64_t max_dcookies;
2540 	uint64_t cookie_size;
2541 
2542 	/* we need to allocate new array */
2543 	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
2544 		kmflag =  KM_SLEEP;
2545 	} else {
2546 		kmflag =  KM_NOSLEEP;
2547 	}
2548 
2549 	/*
2550 	 * XXX make sure cookies size doen't exceed sinfo->si_max_cookie_size;
2551 	 */
2552 
2553 	/*
2554 	 * figure out the rough estimate of array size
2555 	 * At a minimum, each cookie must hold 1 page.
2556 	 * At a maximum, it cannot exceed dma_attr_sgllen
2557 	 */
2558 	max_dcookies = dmareq->dmar_object.dmao_size + IMMU_PAGEOFFSET;
2559 	max_dcookies /= IMMU_PAGESIZE;
2560 	max_dcookies++;
2561 	max_cookies = MIN(max_dcookies, attr->dma_attr_sgllen);
2562 
2563 	/* allocate the dvma cookie array */
2564 	dvcookies = kmem_zalloc(sizeof (dvcookie_t) * max_cookies, kmflag);
2565 	if (dvcookies == NULL) {
2566 		return (DDI_FAILURE);
2567 	}
2568 
2569 	/* allocate the "phys" cookie array */
2570 	dcookies = kmem_zalloc(sizeof (dcookie_t) * max_dcookies, kmflag);
2571 	if (dcookies == NULL) {
2572 		kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies);
2573 		dvcookies = NULL;
2574 		return (DDI_FAILURE);
2575 	}
2576 
2577 	/* allocate the "real" cookie array  - the one given to users */
2578 	cookie_size = sizeof (ddi_dma_cookie_t) * max_cookies;
2579 	if (max_cookies > prealloc) {
2580 		cookies = kmem_zalloc(cookie_size, kmflag);
2581 		if (cookies == NULL) {
2582 			kmem_free(dvcookies, sizeof (dvcookie_t) *
2583 			    max_cookies);
2584 			kmem_free(dcookies, sizeof (dcookie_t) *
2585 			    max_dcookies);
2586 			goto fail;
2587 		}
2588 		dma->dp_need_to_free_cookie = B_TRUE;
2589 	} else {
2590 		/* the preallocated buffer fits this size */
2591 		cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer;
2592 		bzero(cookies, sizeof (ddi_dma_cookie_t) * max_cookies);
2593 		dma->dp_need_to_free_cookie = B_FALSE;
2594 	}
2595 
2596 	dma->dp_dvcookies = dvcookies;
2597 	dma->dp_dcookies = dcookies;
2598 	dma->dp_cookies = cookies;
2599 	dma->dp_cookie_size = cookie_size;
2600 	dma->dp_max_cookies = max_cookies;
2601 	dma->dp_max_dcookies = max_dcookies;
2602 	dma->dp_dvmax = 0;
2603 	dma->dp_dmax = 0;
2604 
2605 	sinfo->si_max_pages = dma->dp_max_cookies;
2606 
2607 	return (DDI_SUCCESS);
2608 
2609 fail:
2610 	dma->dp_dvcookies = NULL;
2611 	dma->dp_dcookies = NULL;
2612 	dma->dp_cookies = NULL;
2613 	dma->dp_cookie_size = 0;
2614 	dma->dp_max_cookies = 0;
2615 	dma->dp_max_dcookies = 0;
2616 	dma->dp_dvmax = 0;
2617 	dma->dp_dmax = 0;
2618 	dma->dp_need_to_free_cookie = B_FALSE;
2619 	sinfo->si_max_pages = 0;
2620 	return (DDI_FAILURE);
2621 }
2622 
2623 /*ARGSUSED*/
2624 static void
2625 cookie_update(domain_t *domain, rootnex_dma_t *dma, paddr_t paddr,
2626     int64_t psize, uint64_t maxseg)
2627 {
2628 	dvcookie_t *dvcookies = dma->dp_dvcookies;
2629 	dcookie_t *dcookies = dma->dp_dcookies;
2630 	ddi_dma_cookie_t *cookies = dma->dp_cookies;
2631 	uint64_t dvmax = dma->dp_dvmax;
2632 	uint64_t dmax = dma->dp_dmax;
2633 
2634 	ASSERT(dvmax < dma->dp_max_cookies);
2635 	ASSERT(dmax < dma->dp_max_dcookies);
2636 
2637 	paddr &= IMMU_PAGEMASK;
2638 
2639 	ASSERT(paddr);
2640 	ASSERT(psize);
2641 	ASSERT(maxseg);
2642 
2643 	/*
2644 	 * check to see if this page would put us
2645 	 * over the max cookie size
2646 	 */
2647 	if (cookies[dvmax].dmac_size + psize > maxseg) {
2648 		dvcookies[dvmax].dvck_eidx = dmax;
2649 		dvmax++;    /* use the next dvcookie */
2650 		dmax++;    /* also mean we use the next dcookie */
2651 		dvcookies[dvmax].dvck_sidx = dmax;
2652 
2653 		ASSERT(dvmax < dma->dp_max_cookies);
2654 		ASSERT(dmax < dma->dp_max_dcookies);
2655 	}
2656 
2657 	/*
2658 	 * If the cookie is mapped or empty
2659 	 */
2660 	if (dvcookies[dvmax].dvck_dvma != 0 ||
2661 	    dvcookies[dvmax].dvck_npages == 0) {
2662 		/* if mapped, we need a new empty one */
2663 		if (dvcookies[dvmax].dvck_dvma != 0) {
2664 			dvcookies[dvmax].dvck_eidx = dmax;
2665 			dvmax++;
2666 			dmax++;
2667 			dvcookies[dvmax].dvck_sidx = dma->dp_dmax;
2668 			ASSERT(dvmax < dma->dp_max_cookies);
2669 			ASSERT(dmax < dma->dp_max_dcookies);
2670 		}
2671 
2672 		/* ok, we have an empty cookie */
2673 		ASSERT(cookies[dvmax].dmac_size == 0);
2674 		ASSERT(dvcookies[dvmax].dvck_dvma == 0);
2675 		ASSERT(dvcookies[dvmax].dvck_npages
2676 		    == 0);
2677 		ASSERT(dcookies[dmax].dck_paddr == 0);
2678 		ASSERT(dcookies[dmax].dck_npages == 0);
2679 
2680 		dvcookies[dvmax].dvck_dvma = 0;
2681 		dvcookies[dvmax].dvck_npages = 1;
2682 		dcookies[dmax].dck_paddr = paddr;
2683 		dcookies[dmax].dck_npages = 1;
2684 		cookies[dvmax].dmac_size = psize;
2685 	} else {
2686 		/* Unmapped cookie but not empty. Add to it */
2687 		cookies[dma->dp_dvmax].dmac_size += psize;
2688 		ASSERT(dvcookies[dma->dp_dvmax].dvck_dvma == 0);
2689 		dvcookies[dma->dp_dvmax].dvck_npages++;
2690 		ASSERT(dcookies[dmax].dck_paddr != 0);
2691 		ASSERT(dcookies[dmax].dck_npages != 0);
2692 
2693 		/* Check if this paddr is contiguous */
2694 		if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
2695 			dcookies[dmax].dck_npages++;
2696 		} else {
2697 			/* No, we need a new dcookie */
2698 			dmax++;
2699 			ASSERT(dcookies[dmax].dck_paddr == 0);
2700 			ASSERT(dcookies[dmax].dck_npages == 0);
2701 			dcookies[dmax].dck_paddr = paddr;
2702 			dcookies[dmax].dck_npages = 1;
2703 		}
2704 	}
2705 
2706 	dma->dp_dvmax = dvmax;
2707 	dma->dp_dmax = dmax;
2708 }
2709 
2710 static void
2711 cookie_finalize(ddi_dma_impl_t *hp, immu_t *immu, domain_t *domain,
2712     dev_info_t *rdip, immu_flags_t immu_flags)
2713 {
2714 	int i;
2715 	int j;
2716 	rootnex_dma_t *dma = (rootnex_dma_t *)hp->dmai_private;
2717 	dvcookie_t *dvcookies = dma->dp_dvcookies;
2718 	dcookie_t *dcookies = dma->dp_dcookies;
2719 	ddi_dma_cookie_t *cookies = dma->dp_cookies;
2720 	paddr_t paddr;
2721 	uint64_t npages;
2722 	uint64_t dvma;
2723 
2724 	for (i = 0; i <= dma->dp_dvmax; i++) {
2725 		/* Finish up the last cookie */
2726 		if (i == dma->dp_dvmax) {
2727 			dvcookies[i].dvck_eidx = dma->dp_dmax;
2728 		}
2729 		if ((dvma = dvcookies[i].dvck_dvma) != 0) {
2730 			cookies[i].dmac_laddress = dvma;
2731 			ASSERT(cookies[i].dmac_size != 0);
2732 			cookies[i].dmac_type = 0;
2733 			for (j = dvcookies[i].dvck_sidx;
2734 			    j <= dvcookies[i].dvck_eidx; j++) {
2735 				ASSERT(dcookies[j].dck_paddr != 0);
2736 				ASSERT(dcookies[j].dck_npages != 0);
2737 			}
2738 			continue;
2739 		}
2740 
2741 		dvma = dvma_alloc(hp, domain, dvcookies[i].dvck_npages);
2742 
2743 		dvcookies[i].dvck_dvma = dvma;
2744 
2745 		/* Set "real" cookies addr, cookie size already set */
2746 		cookies[i].dmac_laddress = dvma;
2747 		ASSERT(cookies[i].dmac_size != 0);
2748 		cookies[i].dmac_type = 0;
2749 
2750 		for (j = dvcookies[i].dvck_sidx;
2751 		    j <= dvcookies[i].dvck_eidx; j++) {
2752 
2753 			paddr = dcookies[j].dck_paddr;
2754 			npages = dcookies[j].dck_npages;
2755 
2756 			ASSERT(paddr);
2757 			ASSERT(npages);
2758 
2759 			dvma_map(immu, domain, dvma, paddr, npages,
2760 			    rdip, immu_flags);
2761 			dvma += npages * IMMU_PAGESIZE;
2762 		}
2763 	}
2764 }
2765 
2766 /*
2767  * cookie_create()
2768  */
2769 static int
2770 cookie_create(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq,
2771     ddi_dma_attr_t *a, immu_t *immu, domain_t *domain, dev_info_t *rdip,
2772     uint_t prealloc_count, immu_flags_t immu_flags)
2773 {
2774 
2775 	ddi_dma_atyp_t buftype;
2776 	uint64_t offset;
2777 	page_t **pparray;
2778 	uint64_t paddr;
2779 	uint_t psize;
2780 	uint_t size;
2781 	uint64_t maxseg;
2782 	caddr_t vaddr;
2783 	uint_t pcnt;
2784 	page_t *page;
2785 	rootnex_sglinfo_t *sglinfo;
2786 	ddi_dma_obj_t *dmar_object;
2787 	rootnex_dma_t *dma;
2788 
2789 	dma = (rootnex_dma_t *)hp->dmai_private;
2790 	sglinfo = &(dma->dp_sglinfo);
2791 	dmar_object = &(dmareq->dmar_object);
2792 	maxseg = sglinfo->si_max_cookie_size;
2793 	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
2794 	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
2795 	buftype = dmar_object->dmao_type;
2796 	size = dmar_object->dmao_size;
2797 
2798 	/*
2799 	 * Allocate cookie, dvcookie and dcookie
2800 	 */
2801 	if (cookie_alloc(dma, dmareq, a, prealloc_count) != DDI_SUCCESS) {
2802 		return (DDI_FAILURE);
2803 	}
2804 	hp->dmai_cookie = dma->dp_cookies;
2805 
2806 	pcnt = 0;
2807 
2808 	/* retrieve paddr, psize, offset from dmareq */
2809 	if (buftype == DMA_OTYP_PAGES) {
2810 		page = dmar_object->dmao_obj.pp_obj.pp_pp;
2811 		ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
2812 		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
2813 		    MMU_PAGEOFFSET;
2814 		paddr = pfn_to_pa(page->p_pagenum) + offset;
2815 		psize = MIN((MMU_PAGESIZE - offset), size);
2816 		sglinfo->si_asp = NULL;
2817 		page = page->p_next;
2818 	} else {
2819 		ASSERT((buftype == DMA_OTYP_VADDR) ||
2820 		    (buftype == DMA_OTYP_BUFVADDR));
2821 		sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as;
2822 		if (sglinfo->si_asp == NULL) {
2823 			sglinfo->si_asp = &kas;
2824 		}
2825 		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
2826 		if (pparray != NULL) {
2827 			ASSERT(!PP_ISFREE(pparray[pcnt]));
2828 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
2829 			psize = MIN((MMU_PAGESIZE - offset), size);
2830 			pcnt++;
2831 		} else {
2832 			paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat,
2833 			    vaddr)) + offset;
2834 			psize = MIN(size, (MMU_PAGESIZE - offset));
2835 			vaddr += psize;
2836 		}
2837 	}
2838 
2839 	/* save the iommu page offset */
2840 	sglinfo->si_buf_offset = offset & IMMU_PAGEOFFSET;
2841 
2842 	/*
2843 	 * setup dvcookie and dcookie for [paddr, paddr+psize)
2844 	 */
2845 	cookie_update(domain, dma, paddr, psize, maxseg);
2846 
2847 	size -= psize;
2848 	while (size > 0) {
2849 		/* get the size for this page (i.e. partial or full page) */
2850 		psize = MIN(size, MMU_PAGESIZE);
2851 		if (buftype == DMA_OTYP_PAGES) {
2852 			/* get the paddr from the page_t */
2853 			ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
2854 			paddr = pfn_to_pa(page->p_pagenum);
2855 			page = page->p_next;
2856 		} else if (pparray != NULL) {
2857 			/* index into the array of page_t's to get the paddr */
2858 			ASSERT(!PP_ISFREE(pparray[pcnt]));
2859 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
2860 			pcnt++;
2861 		} else {
2862 			/* call into the VM to get the paddr */
2863 			paddr = pfn_to_pa(hat_getpfnum
2864 			    (sglinfo->si_asp->a_hat, vaddr));
2865 			vaddr += psize;
2866 		}
2867 		/*
2868 		 * set dvcookie and dcookie for [paddr, paddr+psize)
2869 		 */
2870 		cookie_update(domain, dma, paddr, psize, maxseg);
2871 		size -= psize;
2872 	}
2873 
2874 	cookie_finalize(hp, immu, domain, rdip, immu_flags);
2875 
2876 	/* take account in the offset into the first page */
2877 	dma->dp_cookies[0].dmac_laddress += sglinfo->si_buf_offset;
2878 
2879 	/* save away how many cookies we have */
2880 	sglinfo->si_sgl_size = dma->dp_dvmax + 1;
2881 
2882 	return (DDI_SUCCESS);
2883 }
2884 
2885 /* ############################# Functions exported ######################## */
2886 
2887 /*
2888  * setup the DVMA subsystem
2889  * this code runs only for the first IOMMU unit
2890  */
2891 void
2892 immu_dvma_setup(list_t *listp)
2893 {
2894 	immu_t *immu;
2895 	uint_t kval;
2896 	size_t nchains;
2897 
2898 	/* locks */
2899 	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);
2900 
2901 	/* Create lists */
2902 	list_create(&immu_unity_domain_list, sizeof (domain_t),
2903 	    offsetof(domain_t, dom_maptype_node));
2904 	list_create(&immu_xlate_domain_list, sizeof (domain_t),
2905 	    offsetof(domain_t, dom_maptype_node));
2906 
2907 	/* Setup BDF domain hash */
2908 	nchains = 0xff;
2909 	kval = mod_hash_iddata_gen(nchains);
2910 
2911 	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
2912 	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
2913 	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
2914 	    KM_NOSLEEP);
2915 	ASSERT(bdf_domain_hash);
2916 
2917 	immu = list_head(listp);
2918 	for (; immu; immu = list_next(listp, immu)) {
2919 		create_unity_domain(immu);
2920 		did_init(immu);
2921 		context_init(immu);
2922 		immu->immu_dvma_setup = B_TRUE;
2923 	}
2924 }
2925 
2926 /*
2927  * Startup up one DVMA unit
2928  */
2929 void
2930 immu_dvma_startup(immu_t *immu)
2931 {
2932 	ASSERT(immu);
2933 	ASSERT(immu->immu_dvma_running == B_FALSE);
2934 
2935 	if (immu_gfxdvma_enable == B_FALSE &&
2936 	    immu->immu_dvma_gfx_only == B_TRUE) {
2937 		return;
2938 	}
2939 
2940 	/*
2941 	 * DVMA will start once IOMMU is "running"
2942 	 */
2943 	ASSERT(immu->immu_dvma_running == B_FALSE);
2944 	immu->immu_dvma_running = B_TRUE;
2945 }
2946 
2947 /*
2948  * immu_dvma_physmem_update()
2949  *       called when the installed memory on a
2950  *       system increases, to expand domain DVMA
2951  *       for domains with UNITY mapping
2952  */
2953 void
2954 immu_dvma_physmem_update(uint64_t addr, uint64_t size)
2955 {
2956 	uint64_t start;
2957 	uint64_t npages;
2958 	domain_t *domain;
2959 
2960 	/*
2961 	 * Just walk the system-wide list of domains with
2962 	 * UNITY mapping. Both the list of *all* domains
2963 	 * and *UNITY* domains is protected by the same
2964 	 * single lock
2965 	 */
2966 	mutex_enter(&immu_domain_lock);
2967 	domain = list_head(&immu_unity_domain_list);
2968 	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
2969 
2970 		/* There is no vmem_arena for unity domains. Just map it */
2971 		ddi_err(DER_LOG, NULL, "IMMU: unity-domain: Adding map "
2972 		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);
2973 
2974 		start = IMMU_ROUNDOWN(addr);
2975 		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;
2976 
2977 		dvma_map(domain->dom_immu, domain, start, start,
2978 		    npages, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2979 
2980 	}
2981 	mutex_exit(&immu_domain_lock);
2982 }
2983 
2984 int
2985 immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng,
2986     uint_t prealloc_count, dev_info_t *rdip, immu_flags_t immu_flags)
2987 {
2988 	ddi_dma_attr_t *attr;
2989 	dev_info_t *ddip;
2990 	domain_t *domain;
2991 	immu_t *immu;
2992 	int r = DDI_FAILURE;
2993 
2994 	ASSERT(immu_enable == B_TRUE);
2995 	ASSERT(immu_running == B_TRUE || !(immu_flags & IMMU_FLAGS_DMAHDL));
2996 	ASSERT(hp || !(immu_flags & IMMU_FLAGS_DMAHDL));
2997 
2998 	/*
2999 	 * Intel IOMMU will only be turned on if IOMMU
3000 	 * page size is a multiple of IOMMU page size
3001 	 */
3002 
3003 	/*LINTED*/
3004 	ASSERT(MMU_PAGESIZE % IMMU_PAGESIZE == 0);
3005 
3006 	/* Can only do DVMA if dip is attached */
3007 	if (rdip == NULL) {
3008 		ddi_err(DER_PANIC, rdip, "DVMA map: No device specified");
3009 		/*NOTREACHED*/
3010 	}
3011 
3012 	immu_flags |= dma_to_immu_flags(dmareq);
3013 
3014 
3015 	/*
3016 	 * Setup DVMA domain for the device. This does
3017 	 * work only the first time we do DVMA for a
3018 	 * device.
3019 	 */
3020 	ddip = NULL;
3021 	domain = device_domain(rdip, &ddip, immu_flags);
3022 	if (domain == NULL) {
3023 		ASSERT(ddip == NULL);
3024 		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
3025 		return (DDI_DMA_NORESOURCES);
3026 	}
3027 
3028 	/*
3029 	 * If a domain is found, we must also have a domain dip
3030 	 * which is the topmost ancestor dip of rdip that shares
3031 	 * the same domain with rdip.
3032 	 */
3033 	if (domain->dom_did == 0 || ddip == NULL) {
3034 		ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
3035 		    domain->dom_did, ddip);
3036 		return (DDI_DMA_NORESOURCES);
3037 	}
3038 
3039 	immu = domain->dom_immu;
3040 	ASSERT(immu);
3041 	if (domain->dom_did == IMMU_UNITY_DID) {
3042 		ASSERT(domain == immu->immu_unity_domain);
3043 
3044 		/* mapping already done. Let rootnex create cookies */
3045 		r = DDI_DMA_USE_PHYSICAL;
3046 	} else  if (immu_flags & IMMU_FLAGS_DMAHDL) {
3047 
3048 		/* if we have a DMA handle, the IOMMUs must be running */
3049 		ASSERT(immu->immu_regs_running == B_TRUE);
3050 		ASSERT(immu->immu_dvma_running == B_TRUE);
3051 
3052 		attr = &hp->dmai_attr;
3053 		if (attr == NULL) {
3054 			ddi_err(DER_PANIC, rdip,
3055 			    "DMA handle (%p): NULL attr", hp);
3056 			/*NOTREACHED*/
3057 		}
3058 		if (cookie_create(hp, dmareq, attr, immu, domain, rdip,
3059 		    prealloc_count, immu_flags) != DDI_SUCCESS) {
3060 			ddi_err(DER_MODE, rdip, "dvcookie_alloc: failed");
3061 			return (DDI_DMA_NORESOURCES);
3062 		}
3063 
3064 		/* flush write buffer */
3065 		immu_regs_wbf_flush(immu);
3066 		r = DDI_DMA_MAPPED;
3067 	} else if (immu_flags & IMMU_FLAGS_MEMRNG) {
3068 		dvma_map(immu, domain, mrng->mrng_start, mrng->mrng_start,
3069 		    mrng->mrng_npages, rdip, immu_flags);
3070 		r = DDI_DMA_MAPPED;
3071 	} else {
3072 		ddi_err(DER_PANIC, rdip, "invalid flags for immu_dvma_map()");
3073 		/*NOTREACHED*/
3074 	}
3075 
3076 	/*
3077 	 * Update the root and context entries
3078 	 */
3079 	if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
3080 	    != DDI_SUCCESS) {
3081 		ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
3082 		return (DDI_DMA_NORESOURCES);
3083 	}
3084 
3085 	/* flush caches */
3086 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
3087 	immu_regs_context_flush(immu, 0, 0, domain->dom_did, CONTEXT_DSI);
3088 	rw_exit(&(immu->immu_ctx_rwlock));
3089 	immu_regs_iotlb_flush(immu, domain->dom_did, 0, 0, TLB_IVA_WHOLE,
3090 	    IOTLB_DSI);
3091 	immu_regs_wbf_flush(immu);
3092 
3093 	return (r);
3094 }
3095 
3096 int
3097 immu_dvma_unmap(ddi_dma_impl_t *hp, dev_info_t *rdip)
3098 {
3099 	ddi_dma_attr_t *attr;
3100 	rootnex_dma_t *dma;
3101 	domain_t *domain;
3102 	immu_t *immu;
3103 	dev_info_t *ddip;
3104 	immu_flags_t immu_flags;
3105 
3106 	ASSERT(immu_enable == B_TRUE);
3107 	ASSERT(immu_running == B_TRUE);
3108 	ASSERT(hp);
3109 
3110 	/*
3111 	 * Intel IOMMU will only be turned on if IOMMU
3112 	 * page size is same as MMU page size
3113 	 */
3114 	/*LINTED*/
3115 	ASSERT(MMU_PAGESIZE == IMMU_PAGESIZE);
3116 
3117 	/* rdip need not be attached */
3118 	if (rdip == NULL) {
3119 		ddi_err(DER_PANIC, rdip, "DVMA unmap: No device specified");
3120 		return (DDI_DMA_NORESOURCES);
3121 	}
3122 
3123 	/*
3124 	 * Get the device domain, this should always
3125 	 * succeed since there had to be a domain to
3126 	 * setup DVMA.
3127 	 */
3128 	dma = (rootnex_dma_t *)hp->dmai_private;
3129 	attr = &hp->dmai_attr;
3130 	if (attr == NULL) {
3131 		ddi_err(DER_PANIC, rdip, "DMA handle (%p) has NULL attr", hp);
3132 		/*NOTREACHED*/
3133 	}
3134 	immu_flags = dma->dp_sleep_flags;
3135 
3136 	ddip = NULL;
3137 	domain = device_domain(rdip, &ddip, immu_flags);
3138 	if (domain == NULL || domain->dom_did == 0 || ddip == NULL) {
3139 		ddi_err(DER_MODE, rdip, "Attempt to unmap DVMA for "
3140 		    "a device without domain or with an uninitialized "
3141 		    "domain");
3142 		return (DDI_DMA_NORESOURCES);
3143 	}
3144 
3145 	/*
3146 	 * immu must be set in the domain.
3147 	 */
3148 	immu = domain->dom_immu;
3149 	ASSERT(immu);
3150 	if (domain->dom_did == IMMU_UNITY_DID) {
3151 		ASSERT(domain == immu->immu_unity_domain);
3152 		/*
3153 		 * domain is unity, nothing to do here, let the rootnex
3154 		 * code free the cookies.
3155 		 */
3156 		return (DDI_DMA_USE_PHYSICAL);
3157 	}
3158 
3159 	dma = hp->dmai_private;
3160 	if (dma == NULL) {
3161 		ddi_err(DER_PANIC, rdip, "DVMA unmap: DMA handle (%p) has "
3162 		    "no private dma structure", hp);
3163 		/*NOTREACHED*/
3164 	}
3165 
3166 	/* free all cookies */
3167 	cookie_free(dma, immu, domain, ddip, rdip);
3168 
3169 	/* flush caches */
3170 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
3171 	immu_regs_context_flush(immu, 0, 0, domain->dom_did, CONTEXT_DSI);
3172 	rw_exit(&(immu->immu_ctx_rwlock));
3173 	immu_regs_iotlb_flush(immu, domain->dom_did, 0, 0, TLB_IVA_WHOLE,
3174 	    IOTLB_DSI);
3175 	immu_regs_wbf_flush(immu);
3176 
3177 	return (DDI_SUCCESS);
3178 }
3179 
3180 immu_devi_t *
3181 immu_devi_get(dev_info_t *rdip)
3182 {
3183 	immu_devi_t *immu_devi;
3184 
3185 	mutex_enter(&DEVI(rdip)->devi_lock);
3186 	immu_devi = DEVI(rdip)->devi_iommu;
3187 	mutex_exit(&DEVI(rdip)->devi_lock);
3188 
3189 	return (immu_devi);
3190 }
3191