xref: /illumos-gate/usr/src/uts/i86pc/io/immu_dvma.c (revision 33efde4275d24731ef87927237b0ffb0630b6b2d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Portions Copyright (c) 2010, Oracle and/or its affiliates.
23  * All rights reserved.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 /*
30  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
31  * Copyright 2017 Joyent, Inc.
32  */
33 
34 /*
35  * DVMA code
36  * This file contains Intel IOMMU code that deals with DVMA
37  * i.e. DMA remapping.
38  */
39 
40 #include <sys/sysmacros.h>
41 #include <sys/pcie.h>
42 #include <sys/pci_cfgspace.h>
43 #include <vm/hat_i86.h>
44 #include <sys/memlist.h>
45 #include <sys/acpi/acpi.h>
46 #include <sys/acpica.h>
47 #include <sys/modhash.h>
48 #include <sys/immu.h>
49 #include <sys/x86_archext.h>
50 #include <sys/archsystm.h>
51 
52 #undef	TEST
53 
54 /*
55  * Macros based on PCI spec
56  */
/*
 * (r) is the 32-bit value read at PCI_CONF_REVID; (c) is the 24-bit
 * class code produced by IMMU_PCI_REV2CLASS().  Each macro expands to a
 * plain expression (no trailing semicolon) so it can be used anywhere
 * an expression is allowed.
 */
#define	IMMU_PCI_REV2CLASS(r)   ((r) >> 8)  /* classcode from revid */
#define	IMMU_PCI_CLASS2BASE(c)  ((c) >> 16) /* baseclass from classcode */
#define	IMMU_PCI_CLASS2SUB(c)   (((c) >> 8) & 0xff) /* subclass from class */
60 
/*
 * Evaluates to non-zero when cookie (d) has a non-zero dck_paddr and
 * physical address (p) is exactly the address that follows the
 * dck_npages pages (of IMMU_PAGESIZE bytes each) starting at dck_paddr.
 */
#define	IMMU_CONTIG_PADDR(d, p) \
	(((d).dck_paddr != 0) && \
	    ((p) == (d).dck_paddr + (d).dck_npages * IMMU_PAGESIZE))
64 
/*
 * Argument block handed (as void *arg) to the devinfo ancestor-walk
 * callbacks in this file (match_lpc, get_branch_domain).  It carries
 * the inputs of the walk and collects its results.
 */
typedef struct dvma_arg {
	immu_t *dva_immu;	/* NOTE(review): not used in the code seen */
	dev_info_t *dva_rdip;	/* in: dip the walk was started for */
	dev_info_t *dva_ddip;	/* out: domain-dip chosen by the walk */
	domain_t *dva_domain;	/* out: domain found by the walk, if any */
	int dva_level;		/* NOTE(review): not used in the code seen */
	immu_flags_t dva_flags;	/* in: flags passed on to immu_devi_set() */
	list_t *dva_list;	/* in: immu_devi_t list that match_lpc scans */
	int dva_error;		/* out: DDI_SUCCESS or DDI_FAILURE */
} dvma_arg_t;
75 
76 static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
77     dev_info_t *rdip, immu_flags_t immu_flags);
78 static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
79     int dev, int func, immu_flags_t immu_flags);
80 static void destroy_immu_devi(immu_devi_t *immu_devi);
81 static boolean_t dvma_map(domain_t *domain, uint64_t sdvma,
82     uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
83     immu_flags_t immu_flags);
84 
85 /* Extern globals */
86 extern struct memlist  *phys_install;
87 
88 /*
89  * iommulib interface functions.
90  */
91 static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip);
92 static int immu_allochdl(iommulib_handle_t handle,
93     dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
94     int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep);
95 static int immu_freehdl(iommulib_handle_t handle,
96     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
97 static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
98     dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req,
99     ddi_dma_cookie_t *cookiep, uint_t *ccountp);
100 static int immu_unbindhdl(iommulib_handle_t handle,
101     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
102 static int immu_sync(iommulib_handle_t handle, dev_info_t *dip,
103     dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len,
104     uint_t cachefl);
105 static int immu_win(iommulib_handle_t handle, dev_info_t *dip,
106     dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
107     off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp);
108 static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
109     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
110     struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao);
111 static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
112     dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao);
113 
114 /* static Globals */
115 
116 /*
117  * Used to setup DMA objects (memory regions)
118  * for DMA reads by IOMMU units
119  */
/* Field names below follow the member order of ddi_dma_attr(9S). */
static ddi_dma_attr_t immu_dma_attr = {
	DMA_ATTR_V0,			/* dma_attr_version */
	0U,				/* dma_attr_addr_lo */
	0xffffffffffffffffULL,		/* dma_attr_addr_hi */
	0xffffffffU,			/* dma_attr_count_max */
	MMU_PAGESIZE, /* dma_attr_align: MMU page aligned */
	0x1,				/* dma_attr_burstsizes */
	0x1,				/* dma_attr_minxfer */
	0xffffffffU,			/* dma_attr_maxxfer */
	0xffffffffffffffffULL,		/* dma_attr_seg */
	1,				/* dma_attr_sgllen: one cookie */
	4,				/* dma_attr_granular */
	0				/* dma_attr_flags */
};
134 
/*
 * Access attributes passed to ddi_dma_mem_alloc() when pgtable_ctor()
 * allocates the hardware page of a page table.
 */
static ddi_device_acc_attr_t immu_acc_attr = {
	DDI_DEVICE_ATTR_V0,	/* attribute structure version */
	DDI_NEVERSWAP_ACC,	/* no byte swapping */
	DDI_STRICTORDER_ACC	/* strict access ordering */
};
140 
/*
 * iommulib operations vector for the Intel IOMMU.  It names the
 * iommulib interface functions declared earlier in this file.  The
 * structure is not static, so it is presumably referenced from another
 * file -- TODO confirm where it is registered.
 */
struct iommulib_ops immulib_ops = {
	IOMMU_OPS_VERSION,
	INTEL_IOMMU,
	"Intel IOMMU",
	NULL,	/* NOTE(review): presumably a private-data slot; confirm */
	immu_probe,
	immu_allochdl,
	immu_freehdl,
	immu_bindhdl,
	immu_unbindhdl,
	immu_sync,
	immu_win,
	immu_mapobject,
	immu_unmapobject,
};
156 
157 /*
158  * Fake physical address range used to set up initial prealloc mappings.
159  * This memory is never actually accessed. It is mapped read-only,
160  * and is overwritten as soon as the first DMA bind operation is
161  * performed. Since 0 is a special case, just start at the 2nd
162  * physical page.
163  */
164 
165 static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES };
166 
167 /* globals private to this file */
168 static kmutex_t immu_domain_lock;
169 static list_t immu_unity_domain_list;
170 static list_t immu_xlate_domain_list;
171 
172 /* structure used to store idx into each level of the page tables */
typedef struct xlate {
	int xlt_level;		/* page table level this entry describes */
	uint_t xlt_idx;		/* index into the page table at that level */
	pgtable_t *xlt_pgtable;	/* page table associated with that level */
} xlate_t;
178 
179 /* 0 is reserved by Vt-d spec. Solaris reserves 1 */
180 #define	IMMU_UNITY_DID   1
181 
182 static mod_hash_t *bdf_domain_hash;
183 
184 int immu_use_alh;
185 int immu_use_tm;
186 
187 static domain_t *
bdf_domain_lookup(immu_devi_t * immu_devi)188 bdf_domain_lookup(immu_devi_t *immu_devi)
189 {
190 	domain_t *domain;
191 	int16_t seg = immu_devi->imd_seg;
192 	int16_t bus = immu_devi->imd_bus;
193 	int16_t devfunc = immu_devi->imd_devfunc;
194 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
195 
196 	if (seg < 0 || bus < 0 || devfunc < 0) {
197 		return (NULL);
198 	}
199 
200 	domain = NULL;
201 	if (mod_hash_find(bdf_domain_hash,
202 	    (void *)bdf, (void *)&domain) == 0) {
203 		ASSERT(domain);
204 		ASSERT(domain->dom_did > 0);
205 		return (domain);
206 	} else {
207 		return (NULL);
208 	}
209 }
210 
211 static void
bdf_domain_insert(immu_devi_t * immu_devi,domain_t * domain)212 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
213 {
214 	int16_t seg = immu_devi->imd_seg;
215 	int16_t bus = immu_devi->imd_bus;
216 	int16_t devfunc = immu_devi->imd_devfunc;
217 	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
218 
219 	if (seg < 0 || bus < 0 || devfunc < 0) {
220 		return;
221 	}
222 
223 	(void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
224 }
225 
226 static int
match_lpc(dev_info_t * pdip,void * arg)227 match_lpc(dev_info_t *pdip, void *arg)
228 {
229 	immu_devi_t *immu_devi;
230 	dvma_arg_t *dvap = (dvma_arg_t *)arg;
231 
232 	if (list_is_empty(dvap->dva_list)) {
233 		return (DDI_WALK_TERMINATE);
234 	}
235 
236 	immu_devi = list_head(dvap->dva_list);
237 	for (; immu_devi; immu_devi = list_next(dvap->dva_list,
238 	    immu_devi)) {
239 		if (immu_devi->imd_dip == pdip) {
240 			dvap->dva_ddip = pdip;
241 			dvap->dva_error = DDI_SUCCESS;
242 			return (DDI_WALK_TERMINATE);
243 		}
244 	}
245 
246 	return (DDI_WALK_CONTINUE);
247 }
248 
249 static void
immu_devi_set_spclist(dev_info_t * dip,immu_t * immu)250 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
251 {
252 	list_t *spclist = NULL;
253 	immu_devi_t *immu_devi;
254 
255 	immu_devi = IMMU_DEVI(dip);
256 	if (immu_devi->imd_display == B_TRUE) {
257 		spclist = &(immu->immu_dvma_gfx_list);
258 	} else if (immu_devi->imd_lpc == B_TRUE) {
259 		spclist = &(immu->immu_dvma_lpc_list);
260 	}
261 
262 	if (spclist) {
263 		mutex_enter(&(immu->immu_lock));
264 		list_insert_head(spclist, immu_devi);
265 		mutex_exit(&(immu->immu_lock));
266 	}
267 }
268 
269 /*
270  * Set the immu_devi struct in the immu_devi field of a devinfo node
271  */
272 int
immu_devi_set(dev_info_t * dip,immu_flags_t immu_flags)273 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
274 {
275 	int bus, dev, func;
276 	immu_devi_t *new_imd;
277 	immu_devi_t *immu_devi;
278 
279 	immu_devi = immu_devi_get(dip);
280 	if (immu_devi != NULL) {
281 		return (DDI_SUCCESS);
282 	}
283 
284 	bus = dev = func = -1;
285 
286 	/*
287 	 * Assume a new immu_devi struct is needed
288 	 */
289 	if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
290 		/*
291 		 * No BDF. Set bus = -1 to indicate this.
292 		 * We still need to create a immu_devi struct
293 		 * though
294 		 */
295 		bus = -1;
296 		dev = 0;
297 		func = 0;
298 	}
299 
300 	new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
301 	if (new_imd  == NULL) {
302 		ddi_err(DER_WARN, dip, "Failed to create immu_devi "
303 		    "structure");
304 		return (DDI_FAILURE);
305 	}
306 
307 	/*
308 	 * Check if some other thread allocated a immu_devi while we
309 	 * didn't own the lock.
310 	 */
311 	mutex_enter(&(DEVI(dip)->devi_lock));
312 	if (IMMU_DEVI(dip) == NULL) {
313 		IMMU_DEVI_SET(dip, new_imd);
314 	} else {
315 		destroy_immu_devi(new_imd);
316 	}
317 	mutex_exit(&(DEVI(dip)->devi_lock));
318 
319 	return (DDI_SUCCESS);
320 }
321 
322 static dev_info_t *
get_lpc_devinfo(immu_t * immu,dev_info_t * rdip,immu_flags_t immu_flags)323 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
324 {
325 	dvma_arg_t dvarg = {0};
326 	dvarg.dva_list = &(immu->immu_dvma_lpc_list);
327 	dvarg.dva_rdip = rdip;
328 	dvarg.dva_error = DDI_FAILURE;
329 
330 	if (immu_walk_ancestor(rdip, NULL, match_lpc,
331 	    &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
332 		ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
333 		    "find lpc_devinfo for ISA device");
334 		return (NULL);
335 	}
336 
337 	if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
338 		ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
339 		    "ISA device");
340 		return (NULL);
341 	}
342 
343 	return (dvarg.dva_ddip);
344 }
345 
346 static dev_info_t *
get_gfx_devinfo(dev_info_t * rdip)347 get_gfx_devinfo(dev_info_t *rdip)
348 {
349 	immu_t *immu;
350 	immu_devi_t *immu_devi;
351 	list_t *list_gfx;
352 
353 	/*
354 	 * The GFX device may not be on the same iommu unit as "agpgart"
355 	 * so search globally
356 	 */
357 	immu_devi = NULL;
358 	immu = list_head(&immu_list);
359 	for (; immu; immu = list_next(&immu_list, immu)) {
360 		list_gfx = &(immu->immu_dvma_gfx_list);
361 		if (!list_is_empty(list_gfx)) {
362 			immu_devi = list_head(list_gfx);
363 			break;
364 		}
365 	}
366 
367 	if (immu_devi == NULL) {
368 		ddi_err(DER_WARN, rdip, "iommu: No GFX device. "
369 		    "Cannot redirect agpgart");
370 		return (NULL);
371 	}
372 
373 	ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s",
374 	    ddi_node_name(immu_devi->imd_dip));
375 
376 	return (immu_devi->imd_dip);
377 }
378 
379 static immu_flags_t
dma_to_immu_flags(struct ddi_dma_req * dmareq)380 dma_to_immu_flags(struct ddi_dma_req *dmareq)
381 {
382 	immu_flags_t flags = 0;
383 
384 	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
385 		flags |= IMMU_FLAGS_SLEEP;
386 	} else {
387 		flags |= IMMU_FLAGS_NOSLEEP;
388 	}
389 
390 #ifdef BUGGY_DRIVERS
391 
392 	flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
393 
394 #else
395 	/*
396 	 * Read and write flags need to be reversed.
397 	 * DMA_READ means read from device and write
398 	 * to memory. So DMA read means DVMA write.
399 	 */
400 	if (dmareq->dmar_flags & DDI_DMA_READ)
401 		flags |= IMMU_FLAGS_WRITE;
402 
403 	if (dmareq->dmar_flags & DDI_DMA_WRITE)
404 		flags |= IMMU_FLAGS_READ;
405 
406 	/*
407 	 * Some buggy drivers specify neither READ or WRITE
408 	 * For such drivers set both read and write permissions
409 	 */
410 	if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
411 		flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
412 	}
413 #endif
414 
415 	return (flags);
416 }
417 
418 /*ARGSUSED*/
419 int
pgtable_ctor(void * buf,void * arg,int kmflag)420 pgtable_ctor(void *buf, void *arg, int kmflag)
421 {
422 	size_t actual_size = 0;
423 	pgtable_t *pgtable;
424 	int (*dmafp)(caddr_t);
425 	caddr_t vaddr;
426 	void *next;
427 	uint_t flags;
428 	immu_t *immu = arg;
429 
430 	pgtable = (pgtable_t *)buf;
431 
432 	dmafp = (kmflag & KM_NOSLEEP) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
433 
434 	next = kmem_zalloc(IMMU_PAGESIZE, kmflag);
435 	if (next == NULL) {
436 		return (-1);
437 	}
438 
439 	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
440 	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
441 		kmem_free(next, IMMU_PAGESIZE);
442 		return (-1);
443 	}
444 
445 	flags = DDI_DMA_CONSISTENT;
446 	if (!immu->immu_dvma_coherent)
447 		flags |= IOMEM_DATA_UC_WR_COMBINE;
448 
449 	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
450 	    &immu_acc_attr, flags,
451 	    dmafp, NULL, &vaddr, &actual_size,
452 	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
453 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
454 		kmem_free(next, IMMU_PAGESIZE);
455 		return (-1);
456 	}
457 
458 	/*
459 	 * Memory allocation failure. Maybe a temporary condition
460 	 * so return error rather than panic, so we can try again
461 	 */
462 	if (actual_size < IMMU_PAGESIZE) {
463 		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
464 		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
465 		kmem_free(next, IMMU_PAGESIZE);
466 		return (-1);
467 	}
468 
469 	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
470 	pgtable->hwpg_vaddr = vaddr;
471 	pgtable->swpg_next_array = next;
472 
473 	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);
474 
475 	return (0);
476 }
477 
478 /*ARGSUSED*/
479 void
pgtable_dtor(void * buf,void * arg)480 pgtable_dtor(void *buf, void *arg)
481 {
482 	pgtable_t *pgtable;
483 
484 	pgtable = (pgtable_t *)buf;
485 
486 	/* destroy will panic if lock is held. */
487 	rw_destroy(&(pgtable->swpg_rwlock));
488 
489 	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
490 	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
491 	kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
492 }
493 
494 /*
495  * pgtable_alloc()
496  *	alloc a IOMMU pgtable structure.
497  *	This same struct is used for root and context tables as well.
498  *	This routine allocs the f/ollowing:
499  *	- a pgtable_t struct
500  *	- a HW page which holds PTEs/entries which is accesssed by HW
501  *        so we set up DMA for this page
502  *	- a SW page which is only for our bookeeping
503  *        (for example to  hold pointers to the next level pgtable).
504  *        So a simple kmem_alloc suffices
505  */
506 static pgtable_t *
pgtable_alloc(immu_t * immu,immu_flags_t immu_flags)507 pgtable_alloc(immu_t *immu, immu_flags_t immu_flags)
508 {
509 	pgtable_t *pgtable;
510 	int kmflags;
511 
512 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
513 
514 	pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags);
515 	if (pgtable == NULL) {
516 		return (NULL);
517 	}
518 	return (pgtable);
519 }
520 
521 static void
pgtable_zero(pgtable_t * pgtable)522 pgtable_zero(pgtable_t *pgtable)
523 {
524 	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
525 	bzero(pgtable->swpg_next_array, IMMU_PAGESIZE);
526 }
527 
/*
 * pgtable_free()
 *	Return a pgtable obtained from pgtable_alloc() to immu's pgtable
 *	cache.
 */
static void
pgtable_free(immu_t *immu, pgtable_t *pgtable)
{
	kmem_cache_free(immu->immu_pgtable_cache, pgtable);
}
533 
534 /*
535  * Function to identify a display device from the PCI class code
536  */
537 static boolean_t
device_is_display(uint_t classcode)538 device_is_display(uint_t classcode)
539 {
540 	static uint_t disp_classes[] = {
541 		0x000100,
542 		0x030000,
543 		0x030001
544 	};
545 	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
546 
547 	for (i = 0; i < nclasses; i++) {
548 		if (classcode == disp_classes[i])
549 			return (B_TRUE);
550 	}
551 	return (B_FALSE);
552 }
553 
554 /*
555  * Function that determines if device is PCIEX and/or PCIEX bridge
556  */
557 static boolean_t
device_is_pciex(uchar_t bus,uchar_t dev,uchar_t func,boolean_t * is_pcib)558 device_is_pciex(
559 	uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
560 {
561 	ushort_t cap;
562 	ushort_t capsp;
563 	ushort_t cap_count = PCI_CAP_MAX_PTR;
564 	ushort_t status;
565 	boolean_t is_pciex = B_FALSE;
566 
567 	*is_pcib = B_FALSE;
568 
569 	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
570 	if (!(status & PCI_STAT_CAP))
571 		return (B_FALSE);
572 
573 	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
574 	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
575 		capsp &= PCI_CAP_PTR_MASK;
576 		cap = pci_getb_func(bus, dev, func, capsp);
577 
578 		if (cap == PCI_CAP_ID_PCI_E) {
579 			status = pci_getw_func(bus, dev, func, capsp + 2);
580 			/*
581 			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
582 			 * for Device/Port Type.
583 			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
584 			 * device is a PCIE2PCI bridge
585 			 */
586 			*is_pcib =
587 			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
588 			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
589 			is_pciex = B_TRUE;
590 		}
591 
592 		capsp = (*pci_getb_func)(bus, dev, func,
593 		    capsp + PCI_CAP_NEXT_PTR);
594 	}
595 
596 	return (is_pciex);
597 }
598 
599 static boolean_t
device_use_premap(uint_t classcode)600 device_use_premap(uint_t classcode)
601 {
602 	if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET)
603 		return (B_TRUE);
604 	return (B_FALSE);
605 }
606 
607 
608 /*
609  * immu_dvma_get_immu()
610  *   get the immu unit structure for a dev_info node
611  */
612 immu_t *
immu_dvma_get_immu(dev_info_t * dip,immu_flags_t immu_flags)613 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
614 {
615 	immu_devi_t *immu_devi;
616 	immu_t *immu;
617 
618 	/*
619 	 * check if immu unit was already found earlier.
620 	 * If yes, then it will be stashed in immu_devi struct.
621 	 */
622 	immu_devi = immu_devi_get(dip);
623 	if (immu_devi == NULL) {
624 		if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
625 			/*
626 			 * May fail because of low memory. Return error rather
627 			 * than panic as we want driver to rey again later
628 			 */
629 			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
630 			    "No immu_devi structure");
631 			/*NOTREACHED*/
632 		}
633 		immu_devi = immu_devi_get(dip);
634 	}
635 
636 	mutex_enter(&(DEVI(dip)->devi_lock));
637 	if (immu_devi->imd_immu) {
638 		immu = immu_devi->imd_immu;
639 		mutex_exit(&(DEVI(dip)->devi_lock));
640 		return (immu);
641 	}
642 	mutex_exit(&(DEVI(dip)->devi_lock));
643 
644 	immu = immu_dmar_get_immu(dip);
645 	if (immu == NULL) {
646 		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
647 		    "Cannot find immu_t for device");
648 		/*NOTREACHED*/
649 	}
650 
651 	/*
652 	 * Check if some other thread found immu
653 	 * while lock was not held
654 	 */
655 	immu_devi = immu_devi_get(dip);
656 	/* immu_devi should be present as we found it earlier */
657 	if (immu_devi == NULL) {
658 		ddi_err(DER_PANIC, dip,
659 		    "immu_dvma_get_immu: No immu_devi structure");
660 		/*NOTREACHED*/
661 	}
662 
663 	mutex_enter(&(DEVI(dip)->devi_lock));
664 	if (immu_devi->imd_immu == NULL) {
665 		/* nobody else set it, so we should do it */
666 		immu_devi->imd_immu = immu;
667 		immu_devi_set_spclist(dip, immu);
668 	} else {
669 		/*
670 		 * if some other thread got immu before
671 		 * us, it should get the same results
672 		 */
673 		if (immu_devi->imd_immu != immu) {
674 			ddi_err(DER_PANIC, dip, "Multiple "
675 			    "immu units found for device. Expected (%p), "
676 			    "actual (%p)", (void *)immu,
677 			    (void *)immu_devi->imd_immu);
678 			/*NOTREACHED*/
679 		}
680 	}
681 	mutex_exit(&(DEVI(dip)->devi_lock));
682 
683 	return (immu);
684 }
685 
686 
687 /* ############################# IMMU_DEVI code ############################ */
688 
689 /*
690  * Allocate a immu_devi structure and initialize it
691  */
692 static immu_devi_t *
create_immu_devi(dev_info_t * rdip,int bus,int dev,int func,immu_flags_t immu_flags)693 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
694     immu_flags_t immu_flags)
695 {
696 	uchar_t baseclass, subclass;
697 	uint_t classcode, revclass;
698 	immu_devi_t *immu_devi;
699 	boolean_t pciex = B_FALSE;
700 	int kmflags;
701 	boolean_t is_pcib = B_FALSE;
702 
703 	/* bus ==  -1 indicate non-PCI device (no BDF) */
704 	ASSERT(bus == -1 || bus >= 0);
705 	ASSERT(dev >= 0);
706 	ASSERT(func >= 0);
707 
708 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
709 	immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
710 	if (immu_devi == NULL) {
711 		ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
712 		    "Intel IOMMU immu_devi structure");
713 		return (NULL);
714 	}
715 	immu_devi->imd_dip = rdip;
716 	immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
717 	immu_devi->imd_bus = bus;
718 	immu_devi->imd_pcib_type = IMMU_PCIB_BAD;
719 
720 	if (bus == -1) {
721 		immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
722 		return (immu_devi);
723 	}
724 
725 	immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
726 	immu_devi->imd_sec = 0;
727 	immu_devi->imd_sub = 0;
728 
729 	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
730 
731 	classcode = IMMU_PCI_REV2CLASS(revclass);
732 	baseclass = IMMU_PCI_CLASS2BASE(classcode);
733 	subclass = IMMU_PCI_CLASS2SUB(classcode);
734 
735 	if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {
736 
737 		immu_devi->imd_sec = pci_getb_func(bus, dev, func,
738 		    PCI_BCNF_SECBUS);
739 		immu_devi->imd_sub = pci_getb_func(bus, dev, func,
740 		    PCI_BCNF_SUBBUS);
741 
742 		pciex = device_is_pciex(bus, dev, func, &is_pcib);
743 		if (pciex  == B_TRUE && is_pcib == B_TRUE) {
744 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
745 		} else if (pciex == B_TRUE) {
746 			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
747 		} else {
748 			immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
749 		}
750 	} else {
751 		immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
752 	}
753 
754 	/* check for certain special devices */
755 	immu_devi->imd_display = device_is_display(classcode);
756 	immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
757 	    (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;
758 	immu_devi->imd_use_premap = device_use_premap(classcode);
759 
760 	immu_devi->imd_domain = NULL;
761 
762 	immu_devi->imd_dvma_flags = immu_global_dvma_flags;
763 
764 	return (immu_devi);
765 }
766 
767 static void
destroy_immu_devi(immu_devi_t * immu_devi)768 destroy_immu_devi(immu_devi_t *immu_devi)
769 {
770 	kmem_free(immu_devi, sizeof (immu_devi_t));
771 }
772 
773 static domain_t *
immu_devi_domain(dev_info_t * rdip,dev_info_t ** ddipp)774 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
775 {
776 	immu_devi_t *immu_devi;
777 	domain_t *domain;
778 	dev_info_t *ddip;
779 
780 	*ddipp = NULL;
781 
782 	immu_devi = immu_devi_get(rdip);
783 	if (immu_devi == NULL) {
784 		return (NULL);
785 	}
786 
787 	mutex_enter(&(DEVI(rdip)->devi_lock));
788 	domain = immu_devi->imd_domain;
789 	ddip = immu_devi->imd_ddip;
790 	mutex_exit(&(DEVI(rdip)->devi_lock));
791 
792 	if (domain)
793 		*ddipp = ddip;
794 
795 	return (domain);
796 
797 }
798 
799 /* ############################# END IMMU_DEVI code ######################## */
800 /* ############################# DOMAIN code ############################### */
801 
802 /*
803  * This routine always succeeds
804  */
805 static int
did_alloc(immu_t * immu,dev_info_t * rdip,dev_info_t * ddip,immu_flags_t immu_flags)806 did_alloc(immu_t *immu, dev_info_t *rdip,
807     dev_info_t *ddip, immu_flags_t immu_flags)
808 {
809 	int did;
810 
811 	did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
812 	    (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);
813 
814 	if (did == 0) {
815 		ddi_err(DER_WARN, rdip, "device domain-id alloc error"
816 		    " domain-device: %s%d. immu unit is %s. Using "
817 		    "unity domain with domain-id (%d)",
818 		    ddi_driver_name(ddip), ddi_get_instance(ddip),
819 		    immu->immu_name, immu->immu_unity_domain->dom_did);
820 		did = immu->immu_unity_domain->dom_did;
821 	}
822 
823 	return (did);
824 }
825 
826 static int
get_branch_domain(dev_info_t * pdip,void * arg)827 get_branch_domain(dev_info_t *pdip, void *arg)
828 {
829 	immu_devi_t *immu_devi;
830 	domain_t *domain;
831 	dev_info_t *ddip;
832 	immu_t *immu;
833 	dvma_arg_t *dvp = (dvma_arg_t *)arg;
834 
835 	/*
836 	 * The field dvp->dva_rdip is a work-in-progress
837 	 * and gets updated as we walk up the ancestor
838 	 * tree. The final ddip is set only when we reach
839 	 * the top of the tree. So the dvp->dva_ddip field cannot
840 	 * be relied on until we reach the top of the field.
841 	 */
842 
843 	/* immu_devi may not be set. */
844 	immu_devi = immu_devi_get(pdip);
845 	if (immu_devi == NULL) {
846 		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
847 			dvp->dva_error = DDI_FAILURE;
848 			return (DDI_WALK_TERMINATE);
849 		}
850 	}
851 
852 	immu_devi = immu_devi_get(pdip);
853 	immu = immu_devi->imd_immu;
854 	if (immu == NULL)
855 		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
856 
857 	/*
858 	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
859 	 * terminate the walk (since the device under the PCIE bridge
860 	 * is a PCIE device and has an independent entry in the
861 	 * root/context table)
862 	 */
863 	if (dvp->dva_rdip != pdip &&
864 	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
865 		return (DDI_WALK_TERMINATE);
866 	}
867 
868 	/*
869 	 * In order to be a domain-dim, it must be a PCI device i.e.
870 	 * must have valid BDF. This also eliminates the root complex.
871 	 */
872 	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
873 	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
874 		ASSERT(immu_devi->imd_bus >= 0);
875 		ASSERT(immu_devi->imd_devfunc >= 0);
876 		dvp->dva_ddip = pdip;
877 	}
878 
879 	if (immu_devi->imd_display == B_TRUE ||
880 	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
881 		dvp->dva_domain = immu->immu_unity_domain;
882 		/* continue walking to find ddip */
883 		return (DDI_WALK_CONTINUE);
884 	}
885 
886 	mutex_enter(&(DEVI(pdip)->devi_lock));
887 	domain = immu_devi->imd_domain;
888 	ddip = immu_devi->imd_ddip;
889 	mutex_exit(&(DEVI(pdip)->devi_lock));
890 
891 	if (domain && ddip) {
892 		/* if domain is set, it must be the same */
893 		if (dvp->dva_domain) {
894 			ASSERT(domain == dvp->dva_domain);
895 		}
896 		dvp->dva_domain = domain;
897 		dvp->dva_ddip = ddip;
898 		return (DDI_WALK_TERMINATE);
899 	}
900 
901 	/* Domain may already be set, continue walking so that ddip gets set */
902 	if (dvp->dva_domain) {
903 		return (DDI_WALK_CONTINUE);
904 	}
905 
906 	/* domain is not set in either immu_devi or dvp */
907 	domain = bdf_domain_lookup(immu_devi);
908 	if (domain == NULL) {
909 		return (DDI_WALK_CONTINUE);
910 	}
911 
912 	/* ok, the BDF hash had a domain for this BDF. */
913 
914 	/* Grab lock again to check if something else set immu_devi fields */
915 	mutex_enter(&(DEVI(pdip)->devi_lock));
916 	if (immu_devi->imd_domain != NULL) {
917 		dvp->dva_domain = domain;
918 	} else {
919 		dvp->dva_domain = domain;
920 	}
921 	mutex_exit(&(DEVI(pdip)->devi_lock));
922 
923 	/*
924 	 * walk upwards until the topmost PCI bridge is found
925 	 */
926 	return (DDI_WALK_CONTINUE);
927 
928 }
929 
930 static void
map_unity_domain(domain_t * domain)931 map_unity_domain(domain_t *domain)
932 {
933 	struct memlist *mp;
934 	uint64_t start;
935 	uint64_t npages;
936 	immu_dcookie_t dcookies[1] = {0};
937 	int dcount = 0;
938 
939 	/*
940 	 * UNITY arenas are a mirror of the physical memory
941 	 * installed on the system.
942 	 */
943 
944 #ifdef BUGGY_DRIVERS
945 	/*
946 	 * Dont skip page0. Some broken HW/FW access it.
947 	 */
948 	dcookies[0].dck_paddr = 0;
949 	dcookies[0].dck_npages = 1;
950 	dcount = 1;
951 	(void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
952 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
953 #endif
954 
955 	memlist_read_lock();
956 
957 	mp = phys_install;
958 
959 	if (mp->ml_address == 0) {
960 		/* since we already mapped page1 above */
961 		start = IMMU_PAGESIZE;
962 	} else {
963 		start = mp->ml_address;
964 	}
965 	npages = mp->ml_size/IMMU_PAGESIZE + 1;
966 
967 	dcookies[0].dck_paddr = start;
968 	dcookies[0].dck_npages = npages;
969 	dcount = 1;
970 	(void) dvma_map(domain, start, npages, dcookies,
971 	    dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
972 
973 	ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64
974 	    " - 0x%" PRIx64 "]", start, start + mp->ml_size);
975 
976 	mp = mp->ml_next;
977 	while (mp) {
978 		ddi_err(DER_LOG, domain->dom_dip,
979 		    "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
980 		    mp->ml_address, mp->ml_address + mp->ml_size);
981 
982 		start = mp->ml_address;
983 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
984 
985 		dcookies[0].dck_paddr = start;
986 		dcookies[0].dck_npages = npages;
987 		dcount = 1;
988 		(void) dvma_map(domain, start, npages,
989 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
990 		mp = mp->ml_next;
991 	}
992 
993 	mp = bios_rsvd;
994 	while (mp) {
995 		ddi_err(DER_LOG, domain->dom_dip,
996 		    "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
997 		    mp->ml_address, mp->ml_address + mp->ml_size);
998 
999 		start = mp->ml_address;
1000 		npages = mp->ml_size/IMMU_PAGESIZE + 1;
1001 
1002 		dcookies[0].dck_paddr = start;
1003 		dcookies[0].dck_npages = npages;
1004 		dcount = 1;
1005 		(void) dvma_map(domain, start, npages,
1006 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
1007 
1008 		mp = mp->ml_next;
1009 	}
1010 
1011 	memlist_read_unlock();
1012 }
1013 
1014 /*
1015  * create_xlate_arena()
1016  *	Create the dvma arena for a domain with translation
1017  *	mapping
1018  */
1019 static void
create_xlate_arena(immu_t * immu,domain_t * domain,dev_info_t * rdip,immu_flags_t immu_flags)1020 create_xlate_arena(immu_t *immu, domain_t *domain,
1021     dev_info_t *rdip, immu_flags_t immu_flags)
1022 {
1023 	char *arena_name;
1024 	struct memlist *mp;
1025 	int vmem_flags;
1026 	uint64_t start;
1027 	uint_t mgaw;
1028 	uint64_t size;
1029 	uint64_t maxaddr;
1030 	void *vmem_ret;
1031 
1032 	arena_name = domain->dom_dvma_arena_name;
1033 
1034 	/* Note, don't do sizeof (arena_name) - it is just a pointer */
1035 	(void) snprintf(arena_name,
1036 	    sizeof (domain->dom_dvma_arena_name),
1037 	    "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
1038 	    domain->dom_did);
1039 
1040 	vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;
1041 
1042 	/* Restrict mgaddr (max guest addr) to MGAW */
1043 	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
1044 
1045 	/*
1046 	 * To ensure we avoid ioapic and PCI MMIO ranges we just
1047 	 * use the physical memory address range of the system as the
1048 	 * range
1049 	 */
1050 	maxaddr = ((uint64_t)1 << mgaw);
1051 
1052 	memlist_read_lock();
1053 
1054 	mp = phys_install;
1055 
1056 	if (mp->ml_address == 0)
1057 		start = MMU_PAGESIZE;
1058 	else
1059 		start = mp->ml_address;
1060 
1061 	if (start + mp->ml_size > maxaddr)
1062 		size = maxaddr - start;
1063 	else
1064 		size = mp->ml_size;
1065 
1066 	ddi_err(DER_VERB, rdip,
1067 	    "iommu: %s: Creating dvma vmem arena [0x%" PRIx64
1068 	    " - 0x%" PRIx64 "]", arena_name, start, start + size);
1069 
1070 	/*
1071 	 * We always allocate in quanta of IMMU_PAGESIZE
1072 	 */
1073 	domain->dom_dvma_arena = vmem_create(arena_name,
1074 	    (void *)(uintptr_t)start,	/* start addr */
1075 	    size,			/* size */
1076 	    IMMU_PAGESIZE,		/* quantum */
1077 	    NULL,			/* afunc */
1078 	    NULL,			/* ffunc */
1079 	    NULL,			/* source */
1080 	    0,				/* qcache_max */
1081 	    vmem_flags);
1082 
1083 	if (domain->dom_dvma_arena == NULL) {
1084 		ddi_err(DER_PANIC, rdip,
1085 		    "Failed to allocate DVMA arena(%s) "
1086 		    "for domain ID (%d)", arena_name, domain->dom_did);
1087 		/*NOTREACHED*/
1088 	}
1089 
1090 	mp = mp->ml_next;
1091 	while (mp) {
1092 
1093 		if (mp->ml_address == 0)
1094 			start = MMU_PAGESIZE;
1095 		else
1096 			start = mp->ml_address;
1097 
1098 		if (start + mp->ml_size > maxaddr)
1099 			size = maxaddr - start;
1100 		else
1101 			size = mp->ml_size;
1102 
1103 		ddi_err(DER_VERB, rdip,
1104 		    "iommu: %s: Adding dvma vmem span [0x%" PRIx64
1105 		    " - 0x%" PRIx64 "]", arena_name, start,
1106 		    start + size);
1107 
1108 		vmem_ret = vmem_add(domain->dom_dvma_arena,
1109 		    (void *)(uintptr_t)start, size,  vmem_flags);
1110 
1111 		if (vmem_ret == NULL) {
1112 			ddi_err(DER_PANIC, rdip,
1113 			    "Failed to allocate DVMA arena(%s) "
1114 			    "for domain ID (%d)",
1115 			    arena_name, domain->dom_did);
1116 			/*NOTREACHED*/
1117 		}
1118 		mp = mp->ml_next;
1119 	}
1120 	memlist_read_unlock();
1121 }
1122 
1123 /* ################################### DOMAIN CODE ######################### */
1124 
1125 /*
1126  * Set the domain and domain-dip for a dip
1127  */
1128 static void
set_domain(dev_info_t * dip,dev_info_t * ddip,domain_t * domain)1129 set_domain(
1130 	dev_info_t *dip,
1131 	dev_info_t *ddip,
1132 	domain_t *domain)
1133 {
1134 	immu_devi_t *immu_devi;
1135 	domain_t *fdomain;
1136 	dev_info_t *fddip;
1137 
1138 	immu_devi = immu_devi_get(dip);
1139 
1140 	mutex_enter(&(DEVI(dip)->devi_lock));
1141 	fddip = immu_devi->imd_ddip;
1142 	fdomain = immu_devi->imd_domain;
1143 
1144 	if (fddip) {
1145 		ASSERT(fddip == ddip);
1146 	} else {
1147 		immu_devi->imd_ddip = ddip;
1148 	}
1149 
1150 	if (fdomain) {
1151 		ASSERT(fdomain == domain);
1152 	} else {
1153 		immu_devi->imd_domain = domain;
1154 	}
1155 	mutex_exit(&(DEVI(dip)->devi_lock));
1156 }
1157 
1158 /*
1159  * device_domain()
1160  *	Get domain for a device. The domain may be global in which case it
1161  *	is shared between all IOMMU units. Due to potential AGAW differences
1162  *      between IOMMU units, such global domains *have to be* UNITY mapping
1163  *      domains. Alternatively, the domain may be local to a IOMMU unit.
1164  *	Local domains may be shared or immu_devi, although the
1165  *      scope of sharing
1166  *	is restricted to devices controlled by the IOMMU unit to
1167  *      which the domain
1168  *	belongs. If shared, they (currently) have to be UNITY domains. If
1169  *      immu_devi a domain may be either UNITY or translation (XLATE) domain.
1170  */
1171 static domain_t *
device_domain(dev_info_t * rdip,dev_info_t ** ddipp,immu_flags_t immu_flags)1172 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
1173 {
1174 	dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
1175 	immu_t *immu;
1176 	domain_t *domain;
1177 	dvma_arg_t dvarg = {0};
1178 	int level;
1179 
1180 	*ddipp = NULL;
1181 
1182 	/*
1183 	 * Check if the domain is already set. This is usually true
1184 	 * if this is not the first DVMA transaction.
1185 	 */
1186 	ddip = NULL;
1187 	domain = immu_devi_domain(rdip, &ddip);
1188 	if (domain) {
1189 		*ddipp = ddip;
1190 		return (domain);
1191 	}
1192 
1193 	immu = immu_dvma_get_immu(rdip, immu_flags);
1194 	if (immu == NULL) {
1195 		/*
1196 		 * possible that there is no IOMMU unit for this device
1197 		 * - BIOS bugs are one example.
1198 		 */
1199 		ddi_err(DER_WARN, rdip, "No iommu unit found for device");
1200 		return (NULL);
1201 	}
1202 
1203 	immu_flags |= immu_devi_get(rdip)->imd_dvma_flags;
1204 
1205 	dvarg.dva_rdip = rdip;
1206 	dvarg.dva_ddip = NULL;
1207 	dvarg.dva_domain = NULL;
1208 	dvarg.dva_flags = immu_flags;
1209 	level = 0;
1210 	if (immu_walk_ancestor(rdip, NULL, get_branch_domain,
1211 	    &dvarg, &level, immu_flags) != DDI_SUCCESS) {
1212 		/*
1213 		 * maybe low memory. return error,
1214 		 * so driver tries again later
1215 		 */
1216 		return (NULL);
1217 	}
1218 
1219 	/* should have walked at least 1 dip (i.e. edip) */
1220 	ASSERT(level > 0);
1221 
1222 	ddip = dvarg.dva_ddip;	/* must be present */
1223 	domain = dvarg.dva_domain;	/* may be NULL */
1224 
1225 	/*
1226 	 * We may find the domain during our ancestor walk on any one of our
1227 	 * ancestor dips, If the domain is found then the domain-dip
1228 	 * (i.e. ddip) will also be found in the same immu_devi struct.
1229 	 * The domain-dip is the highest ancestor dip which shares the
1230 	 * same domain with edip.
1231 	 * The domain may or may not be found, but the domain dip must
1232 	 * be found.
1233 	 */
1234 	if (ddip == NULL) {
1235 		ddi_err(DER_MODE, rdip, "Cannot find domain dip for device.");
1236 		return (NULL);
1237 	}
1238 
1239 	/*
1240 	 * Did we find a domain ?
1241 	 */
1242 	if (domain) {
1243 		goto found;
1244 	}
1245 
1246 	/* nope, so allocate */
1247 	domain = domain_create(immu, ddip, rdip, immu_flags);
1248 	if (domain == NULL) {
1249 		return (NULL);
1250 	}
1251 
1252 	/*FALLTHROUGH*/
1253 found:
1254 	/*
1255 	 * We know *domain *is* the right domain, so panic if
1256 	 * another domain is set for either the request-dip or
1257 	 * effective dip.
1258 	 */
1259 	set_domain(ddip, ddip, domain);
1260 	set_domain(rdip, ddip, domain);
1261 
1262 	*ddipp = ddip;
1263 	return (domain);
1264 }
1265 
1266 static void
create_unity_domain(immu_t * immu)1267 create_unity_domain(immu_t *immu)
1268 {
1269 	domain_t *domain;
1270 
1271 	/* domain created during boot and always use sleep flag */
1272 	domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);
1273 
1274 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1275 
1276 	domain->dom_did = IMMU_UNITY_DID;
1277 	domain->dom_maptype = IMMU_MAPTYPE_UNITY;
1278 
1279 	domain->dom_immu = immu;
1280 	immu->immu_unity_domain = domain;
1281 
1282 	/*
1283 	 * Setup the domain's initial page table
1284 	 * should never fail.
1285 	 */
1286 	domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1287 	pgtable_zero(domain->dom_pgtable_root);
1288 
1289 	/*
1290 	 * Only map all physical memory in to the unity domain
1291 	 * if passthrough is not supported. If it is supported,
1292 	 * passthrough is set in the context entry instead.
1293 	 */
1294 	if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1295 		map_unity_domain(domain);
1296 
1297 
1298 	/*
1299 	 * put it on the system-wide UNITY domain list
1300 	 */
1301 	mutex_enter(&(immu_domain_lock));
1302 	list_insert_tail(&immu_unity_domain_list, domain);
1303 	mutex_exit(&(immu_domain_lock));
1304 }
1305 
1306 /*
1307  * ddip is the domain-dip - the topmost dip in a domain
1308  * rdip is the requesting-dip - the device which is
1309  * requesting DVMA setup
1310  * if domain is a non-shared domain rdip == ddip
1311  */
1312 static domain_t *
domain_create(immu_t * immu,dev_info_t * ddip,dev_info_t * rdip,immu_flags_t immu_flags)1313 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
1314     immu_flags_t immu_flags)
1315 {
1316 	int kmflags;
1317 	domain_t *domain;
1318 	char mod_hash_name[128];
1319 	immu_devi_t *immu_devi;
1320 	int did;
1321 	immu_dcookie_t dcookies[1] = {0};
1322 	int dcount = 0;
1323 
1324 	immu_devi = immu_devi_get(rdip);
1325 
1326 	/*
1327 	 * First allocate a domainid.
1328 	 * This routine will never fail, since if we run out
1329 	 * of domains the unity domain will be allocated.
1330 	 */
1331 	did = did_alloc(immu, rdip, ddip, immu_flags);
1332 	if (did == IMMU_UNITY_DID) {
1333 		/* domain overflow */
1334 		ASSERT(immu->immu_unity_domain);
1335 		return (immu->immu_unity_domain);
1336 	}
1337 
1338 	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1339 	domain = kmem_zalloc(sizeof (domain_t), kmflags);
1340 	if (domain == NULL) {
1341 		ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
1342 		    "structure for device. IOMMU unit: %s", immu->immu_name);
1343 		/*NOTREACHED*/
1344 	}
1345 
1346 	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1347 
1348 	(void) snprintf(mod_hash_name, sizeof (mod_hash_name),
1349 	    "immu%s-domain%d-pava-hash", immu->immu_name, did);
1350 
1351 	domain->dom_did = did;
1352 	domain->dom_immu = immu;
1353 	domain->dom_maptype = IMMU_MAPTYPE_XLATE;
1354 	domain->dom_dip = ddip;
1355 
1356 	/*
1357 	 * Create xlate DVMA arena for this domain.
1358 	 */
1359 	create_xlate_arena(immu, domain, rdip, immu_flags);
1360 
1361 	/*
1362 	 * Setup the domain's initial page table
1363 	 */
1364 	domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags);
1365 	if (domain->dom_pgtable_root == NULL) {
1366 		ddi_err(DER_PANIC, rdip, "Failed to alloc root "
1367 		    "pgtable for domain (%d). IOMMU unit: %s",
1368 		    domain->dom_did, immu->immu_name);
1369 		/*NOTREACHED*/
1370 	}
1371 	pgtable_zero(domain->dom_pgtable_root);
1372 
1373 	/*
1374 	 * Since this is a immu unit-specific domain, put it on
1375 	 * the per-immu domain list.
1376 	 */
1377 	mutex_enter(&(immu->immu_lock));
1378 	list_insert_head(&immu->immu_domain_list, domain);
1379 	mutex_exit(&(immu->immu_lock));
1380 
1381 	/*
1382 	 * Also put it on the system-wide xlate domain list
1383 	 */
1384 	mutex_enter(&(immu_domain_lock));
1385 	list_insert_head(&immu_xlate_domain_list, domain);
1386 	mutex_exit(&(immu_domain_lock));
1387 
1388 	bdf_domain_insert(immu_devi, domain);
1389 
1390 #ifdef BUGGY_DRIVERS
1391 	/*
1392 	 * Map page0. Some broken HW/FW access it.
1393 	 */
1394 	dcookies[0].dck_paddr = 0;
1395 	dcookies[0].dck_npages = 1;
1396 	dcount = 1;
1397 	(void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
1398 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
1399 #endif
1400 	return (domain);
1401 }
1402 
1403 /*
1404  * Create domainid arena.
1405  * Domainid 0 is reserved by Vt-d spec and cannot be used by
1406  * system software.
1407  * Domainid 1 is reserved by solaris and used for *all* of the following:
1408  *	as the "uninitialized" domain - For devices not yet controlled
1409  *	by Solaris
1410  *	as the "unity" domain - For devices that will always belong
1411  *	to the unity domain
1412  *	as the "overflow" domain - Used for any new device after we
1413  *	run out of domains
1414  * All of the above domains map into a single domain with
1415  * domainid 1 and UNITY DVMA mapping
1416  * Each IMMU unity has its own unity/uninit/overflow domain
1417  */
1418 static void
did_init(immu_t * immu)1419 did_init(immu_t *immu)
1420 {
1421 	(void) snprintf(immu->immu_did_arena_name,
1422 	    sizeof (immu->immu_did_arena_name),
1423 	    "%s_domainid_arena", immu->immu_name);
1424 
1425 	ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s",
1426 	    immu->immu_did_arena_name);
1427 
1428 	immu->immu_did_arena = vmem_create(
1429 	    immu->immu_did_arena_name,
1430 	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),   /* start addr */
1431 	    immu->immu_max_domains - IMMU_UNITY_DID,
1432 	    1,				/* quantum */
1433 	    NULL,			/* afunc */
1434 	    NULL,			/* ffunc */
1435 	    NULL,			/* source */
1436 	    0,				/* qcache_max */
1437 	    VM_SLEEP);
1438 
1439 	/* Even with SLEEP flag, vmem_create() can fail */
1440 	if (immu->immu_did_arena == NULL) {
1441 		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
1442 		    "IOMMU domainid allocator: %s", immu->immu_name,
1443 		    immu->immu_did_arena_name);
1444 	}
1445 }
1446 
1447 /* #########################  CONTEXT CODE ################################# */
1448 
1449 static void
context_set(immu_t * immu,domain_t * domain,pgtable_t * root_table,int bus,int devfunc)1450 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
1451     int bus, int devfunc)
1452 {
1453 	pgtable_t *context;
1454 	pgtable_t *pgtable_root;
1455 	hw_rce_t *hw_rent;
1456 	hw_rce_t *hw_cent;
1457 	hw_rce_t *ctxp;
1458 	int sid;
1459 	krw_t rwtype;
1460 	boolean_t fill_root;
1461 	boolean_t fill_ctx;
1462 
1463 	pgtable_root = domain->dom_pgtable_root;
1464 
1465 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1466 	context = *(pgtable_t **)(ctxp + bus);
1467 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;
1468 
1469 	fill_root = B_FALSE;
1470 	fill_ctx = B_FALSE;
1471 
1472 	/* Check the most common case first with reader lock */
1473 	rw_enter(&(immu->immu_ctx_rwlock), RW_READER);
1474 	rwtype = RW_READER;
1475 again:
1476 	if (ROOT_GET_P(hw_rent)) {
1477 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1478 		if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) {
1479 			rw_exit(&(immu->immu_ctx_rwlock));
1480 			return;
1481 		} else {
1482 			fill_ctx = B_TRUE;
1483 		}
1484 	} else {
1485 		fill_root = B_TRUE;
1486 		fill_ctx = B_TRUE;
1487 	}
1488 
1489 	if (rwtype == RW_READER &&
1490 	    rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) {
1491 		rw_exit(&(immu->immu_ctx_rwlock));
1492 		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1493 		rwtype = RW_WRITER;
1494 		goto again;
1495 	}
1496 	rwtype = RW_WRITER;
1497 
1498 	if (fill_root == B_TRUE) {
1499 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1500 		ROOT_SET_P(hw_rent);
1501 		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
1502 	}
1503 
1504 	if (fill_ctx == B_TRUE) {
1505 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1506 		/* need to disable context entry before reprogramming it */
1507 		bzero(hw_cent, sizeof (hw_rce_t));
1508 
1509 		/* flush caches */
1510 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1511 
1512 		sid = ((bus << 8) | devfunc);
1513 		immu_flush_context_fsi(immu, 0, sid, domain->dom_did,
1514 		    &immu->immu_ctx_inv_wait);
1515 
1516 		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
1517 		CONT_SET_DID(hw_cent, domain->dom_did);
1518 		CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1519 		CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1520 		if (domain->dom_did == IMMU_UNITY_DID &&
1521 		    IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1522 			CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1523 		else
1524 			/*LINTED*/
1525 			CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1526 		CONT_SET_P(hw_cent);
1527 		if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) {
1528 			CONT_SET_EH(hw_cent);
1529 			if (immu_use_alh)
1530 				CONT_SET_ALH(hw_cent);
1531 		}
1532 		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1533 	}
1534 	rw_exit(&(immu->immu_ctx_rwlock));
1535 }
1536 
1537 static pgtable_t *
context_create(immu_t * immu)1538 context_create(immu_t *immu)
1539 {
1540 	int	bus;
1541 	int	devfunc;
1542 	pgtable_t *root_table;
1543 	pgtable_t *context;
1544 	pgtable_t *pgtable_root;
1545 	hw_rce_t *ctxp;
1546 	hw_rce_t *hw_rent;
1547 	hw_rce_t *hw_cent;
1548 
1549 	/* Allocate a zeroed root table (4K 256b entries) */
1550 	root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1551 	pgtable_zero(root_table);
1552 
1553 	/*
1554 	 * Setup context tables for all possible root table entries.
1555 	 * Start out with unity domains for all entries.
1556 	 */
1557 	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1558 	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
1559 	for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
1560 		context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1561 		pgtable_zero(context);
1562 		ROOT_SET_P(hw_rent);
1563 		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1564 		hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
1565 		for (devfunc = 0; devfunc < IMMU_CONT_NUM;
1566 		    devfunc++, hw_cent++) {
1567 			pgtable_root =
1568 			    immu->immu_unity_domain->dom_pgtable_root;
1569 			CONT_SET_DID(hw_cent,
1570 			    immu->immu_unity_domain->dom_did);
1571 			CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1572 			CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1573 			if (IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1574 				CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1575 			else
1576 				/*LINTED*/
1577 				CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1578 			CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
1579 			CONT_SET_P(hw_cent);
1580 		}
1581 		immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
1582 		*((pgtable_t **)ctxp) = context;
1583 	}
1584 
1585 	return (root_table);
1586 }
1587 
1588 /*
1589  * Called during rootnex attach, so no locks needed
1590  */
1591 static void
context_init(immu_t * immu)1592 context_init(immu_t *immu)
1593 {
1594 	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
1595 
1596 	immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE);
1597 
1598 	immu_regs_wbf_flush(immu);
1599 
1600 	immu->immu_ctx_root = context_create(immu);
1601 
1602 	immu_regs_set_root_table(immu);
1603 
1604 	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1605 	immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait);
1606 	immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait);
1607 	rw_exit(&(immu->immu_ctx_rwlock));
1608 }
1609 
1610 
1611 /*
1612  * Find top pcib
1613  */
1614 static int
find_top_pcib(dev_info_t * dip,void * arg)1615 find_top_pcib(dev_info_t *dip, void *arg)
1616 {
1617 	immu_devi_t *immu_devi;
1618 	dev_info_t **pcibdipp = (dev_info_t **)arg;
1619 
1620 	immu_devi = immu_devi_get(dip);
1621 
1622 	if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
1623 		*pcibdipp = dip;
1624 	}
1625 
1626 	return (DDI_WALK_CONTINUE);
1627 }
1628 
1629 static int
immu_context_update(immu_t * immu,domain_t * domain,dev_info_t * ddip,dev_info_t * rdip,immu_flags_t immu_flags)1630 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
1631     dev_info_t *rdip, immu_flags_t immu_flags)
1632 {
1633 	immu_devi_t *r_immu_devi;
1634 	immu_devi_t *d_immu_devi;
1635 	int r_bus;
1636 	int d_bus;
1637 	int r_devfunc;
1638 	int d_devfunc;
1639 	immu_pcib_t d_pcib_type;
1640 	dev_info_t *pcibdip;
1641 
1642 	if (ddip == NULL || rdip == NULL ||
1643 	    ddip == root_devinfo || rdip == root_devinfo) {
1644 		ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or "
1645 		    "request-dip are NULL or are root devinfo");
1646 		return (DDI_FAILURE);
1647 	}
1648 
1649 	/*
1650 	 * We need to set the context fields
1651 	 * based on what type of device rdip and ddip are.
1652 	 * To do that we need the immu_devi field.
1653 	 * Set the immu_devi field (if not already set)
1654 	 */
1655 	if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
1656 		ddi_err(DER_MODE, rdip,
1657 		    "immu_context_update: failed to set immu_devi for ddip");
1658 		return (DDI_FAILURE);
1659 	}
1660 
1661 	if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
1662 		ddi_err(DER_MODE, rdip,
1663 		    "immu_context_update: failed to set immu_devi for rdip");
1664 		return (DDI_FAILURE);
1665 	}
1666 
1667 	d_immu_devi = immu_devi_get(ddip);
1668 	r_immu_devi = immu_devi_get(rdip);
1669 
1670 	d_bus = d_immu_devi->imd_bus;
1671 	d_devfunc = d_immu_devi->imd_devfunc;
1672 	d_pcib_type = d_immu_devi->imd_pcib_type;
1673 	r_bus = r_immu_devi->imd_bus;
1674 	r_devfunc = r_immu_devi->imd_devfunc;
1675 
1676 	if (rdip == ddip) {
1677 		/* rdip is a PCIE device. set context for it only */
1678 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1679 		    r_devfunc);
1680 #ifdef BUGGY_DRIVERS
1681 	} else if (r_immu_devi == d_immu_devi) {
1682 #ifdef TEST
1683 		ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
1684 		    "0x%lx are identical", rdip, ddip);
1685 #endif
1686 		/* rdip is a PCIE device. set context for it only */
1687 		context_set(immu, domain, immu->immu_ctx_root, r_bus,
1688 		    r_devfunc);
1689 #endif
1690 	} else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
1691 		/*
1692 		 * ddip is a PCIE_PCI bridge. Set context for ddip's
1693 		 * secondary bus. If rdip is on ddip's secondary
1694 		 * bus, set context for rdip. Else, set context
1695 		 * for rdip's PCI bridge on ddip's secondary bus.
1696 		 */
1697 		context_set(immu, domain, immu->immu_ctx_root,
1698 		    d_immu_devi->imd_sec, 0);
1699 		if (d_immu_devi->imd_sec == r_bus) {
1700 			context_set(immu, domain, immu->immu_ctx_root,
1701 			    r_bus, r_devfunc);
1702 		} else {
1703 			pcibdip = NULL;
1704 			if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
1705 			    &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
1706 			    pcibdip != NULL) {
1707 				r_immu_devi = immu_devi_get(pcibdip);
1708 				r_bus = r_immu_devi->imd_bus;
1709 				r_devfunc = r_immu_devi->imd_devfunc;
1710 				context_set(immu, domain, immu->immu_ctx_root,
1711 				    r_bus, r_devfunc);
1712 			} else {
1713 				ddi_err(DER_PANIC, rdip, "Failed to find PCI "
1714 				    " bridge for PCI device");
1715 				/*NOTREACHED*/
1716 			}
1717 		}
1718 	} else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
1719 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1720 		    d_devfunc);
1721 	} else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
1722 		/*
1723 		 * ddip is a PCIE device which has a non-PCI device under it
1724 		 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
1725 		 */
1726 		context_set(immu, domain, immu->immu_ctx_root, d_bus,
1727 		    d_devfunc);
1728 	} else {
1729 		ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
1730 		    "set iommu context.");
1731 		/*NOTREACHED*/
1732 	}
1733 
1734 	/* XXX do we need a membar_producer() here */
1735 	return (DDI_SUCCESS);
1736 }
1737 
1738 /* ##################### END CONTEXT CODE ################################## */
1739 /* ##################### MAPPING CODE ################################## */
1740 
1741 
1742 #ifdef DEBUG
1743 static boolean_t
PDTE_check(immu_t * immu,hw_pdte_t pdte,pgtable_t * next,paddr_t paddr,dev_info_t * rdip,immu_flags_t immu_flags)1744 PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
1745     dev_info_t *rdip, immu_flags_t immu_flags)
1746 {
1747 	/* The PDTE must be set i.e. present bit is set */
1748 	if (!PDTE_P(pdte)) {
1749 		ddi_err(DER_MODE, rdip, "No present flag");
1750 		return (B_FALSE);
1751 	}
1752 
1753 	/*
1754 	 * Just assert to check most significant system software field
1755 	 * (PDTE_SW4) as it is same as present bit and we
1756 	 * checked that above
1757 	 */
1758 	ASSERT(PDTE_SW4(pdte));
1759 
1760 	/*
1761 	 * TM field should be clear if not reserved.
1762 	 * non-leaf is always reserved
1763 	 */
1764 	if (next == NULL && immu->immu_TM_reserved == B_FALSE) {
1765 		if (PDTE_TM(pdte)) {
1766 			ddi_err(DER_MODE, rdip, "TM flag set");
1767 			return (B_FALSE);
1768 		}
1769 	}
1770 
1771 	/*
1772 	 * The SW3 field is not used and must be clear
1773 	 */
1774 	if (PDTE_SW3(pdte)) {
1775 		ddi_err(DER_MODE, rdip, "SW3 set");
1776 		return (B_FALSE);
1777 	}
1778 
1779 	/*
1780 	 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set
1781 	 */
1782 	if (next == NULL) {
1783 		ASSERT(paddr % IMMU_PAGESIZE == 0);
1784 		if (PDTE_PADDR(pdte) != paddr) {
1785 			ddi_err(DER_MODE, rdip,
1786 			    "PTE paddr mismatch: %lx != %lx",
1787 			    PDTE_PADDR(pdte), paddr);
1788 			return (B_FALSE);
1789 		}
1790 	} else {
1791 		if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
1792 			ddi_err(DER_MODE, rdip,
1793 			    "PDE paddr mismatch: %lx != %lx",
1794 			    PDTE_PADDR(pdte), next->hwpg_paddr);
1795 			return (B_FALSE);
1796 		}
1797 	}
1798 
1799 	/*
1800 	 * SNP field should be clear if not reserved.
1801 	 * non-leaf is always reserved
1802 	 */
1803 	if (next == NULL && immu->immu_SNP_reserved == B_FALSE) {
1804 		if (PDTE_SNP(pdte)) {
1805 			ddi_err(DER_MODE, rdip, "SNP set");
1806 			return (B_FALSE);
1807 		}
1808 	}
1809 
1810 	/* second field available for system software should be clear */
1811 	if (PDTE_SW2(pdte)) {
1812 		ddi_err(DER_MODE, rdip, "SW2 set");
1813 		return (B_FALSE);
1814 	}
1815 
1816 	/* Super pages field should be clear */
1817 	if (PDTE_SP(pdte)) {
1818 		ddi_err(DER_MODE, rdip, "SP set");
1819 		return (B_FALSE);
1820 	}
1821 
1822 	/*
1823 	 * least significant field available for
1824 	 * system software should be clear
1825 	 */
1826 	if (PDTE_SW1(pdte)) {
1827 		ddi_err(DER_MODE, rdip, "SW1 set");
1828 		return (B_FALSE);
1829 	}
1830 
1831 	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
1832 		ddi_err(DER_MODE, rdip, "READ not set");
1833 		return (B_FALSE);
1834 	}
1835 
1836 	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
1837 		ddi_err(DER_MODE, rdip, "WRITE not set");
1838 		return (B_FALSE);
1839 	}
1840 
1841 	return (B_TRUE);
1842 }
1843 #endif
1844 
1845 /*ARGSUSED*/
1846 static void
PTE_clear_all(immu_t * immu,domain_t * domain,xlate_t * xlate,uint64_t * dvma_ptr,uint64_t * npages_ptr,dev_info_t * rdip)1847 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
1848     uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip)
1849 {
1850 	uint64_t npages;
1851 	uint64_t dvma;
1852 	pgtable_t *pgtable;
1853 	hw_pdte_t *hwp;
1854 	hw_pdte_t *shwp;
1855 	int idx;
1856 
1857 	pgtable = xlate->xlt_pgtable;
1858 	idx = xlate->xlt_idx;
1859 
1860 	dvma = *dvma_ptr;
1861 	npages = *npages_ptr;
1862 
1863 	/*
1864 	 * since a caller gets a unique dvma for a physical address,
1865 	 * no other concurrent thread will be writing to the same
1866 	 * PTE even if it has the same paddr. So no locks needed.
1867 	 */
1868 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
1869 
1870 	hwp = shwp;
1871 	for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
1872 		PDTE_CLEAR_P(*hwp);
1873 		dvma += IMMU_PAGESIZE;
1874 		npages--;
1875 	}
1876 
1877 	*dvma_ptr = dvma;
1878 	*npages_ptr = npages;
1879 
1880 	xlate->xlt_idx = idx;
1881 }
1882 
1883 static void
xlate_setup(uint64_t dvma,xlate_t * xlate,int nlevels)1884 xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels)
1885 {
1886 	int level;
1887 	uint64_t offbits;
1888 
1889 	/*
1890 	 * Skip the first 12 bits which is the offset into
1891 	 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
1892 	 */
1893 	offbits = dvma >> IMMU_PAGESHIFT;
1894 
1895 	/* skip to level 1 i.e. leaf PTE */
1896 	for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
1897 		xlate->xlt_level = level;
1898 		xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
1899 		ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
1900 		xlate->xlt_pgtable = NULL;
1901 		offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
1902 	}
1903 }
1904 
1905 /*
1906  * Read the pgtables
1907  */
1908 static boolean_t
PDE_lookup(domain_t * domain,xlate_t * xlate,int nlevels)1909 PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels)
1910 {
1911 	pgtable_t *pgtable;
1912 	pgtable_t *next;
1913 	uint_t idx;
1914 
1915 	/* start with highest level pgtable i.e. root */
1916 	xlate += nlevels;
1917 
1918 	if (xlate->xlt_pgtable == NULL) {
1919 		xlate->xlt_pgtable = domain->dom_pgtable_root;
1920 	}
1921 
1922 	for (; xlate->xlt_level > 1; xlate--) {
1923 		idx = xlate->xlt_idx;
1924 		pgtable = xlate->xlt_pgtable;
1925 
1926 		if ((xlate - 1)->xlt_pgtable) {
1927 			continue;
1928 		}
1929 
1930 		/* Lock the pgtable in read mode */
1931 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
1932 
1933 		/*
1934 		 * since we are unmapping, the pgtable should
1935 		 * already point to a leafier pgtable.
1936 		 */
1937 		next = *(pgtable->swpg_next_array + idx);
1938 		(xlate - 1)->xlt_pgtable = next;
1939 		rw_exit(&(pgtable->swpg_rwlock));
1940 		if (next == NULL)
1941 			return (B_FALSE);
1942 	}
1943 
1944 	return (B_TRUE);
1945 }
1946 
1947 static void
immu_fault_walk(void * arg,void * base,size_t len)1948 immu_fault_walk(void *arg, void *base, size_t len)
1949 {
1950 	uint64_t dvma, start;
1951 
1952 	dvma = *(uint64_t *)arg;
1953 	start = (uint64_t)(uintptr_t)base;
1954 
1955 	if (dvma >= start && dvma < (start + len)) {
1956 		ddi_err(DER_WARN, NULL,
1957 		    "faulting DVMA address is in vmem arena "
1958 		    "(%" PRIx64 "-%" PRIx64 ")",
1959 		    start, start + len);
1960 		*(uint64_t *)arg = ~0ULL;
1961 	}
1962 }
1963 
1964 void
immu_print_fault_info(uint_t sid,uint64_t dvma)1965 immu_print_fault_info(uint_t sid, uint64_t dvma)
1966 {
1967 	int nlevels;
1968 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
1969 	xlate_t *xlatep;
1970 	hw_pdte_t pte;
1971 	domain_t *domain;
1972 	immu_t *immu;
1973 	uint64_t dvma_arg;
1974 
1975 	if (mod_hash_find(bdf_domain_hash,
1976 	    (void *)(uintptr_t)sid, (void *)&domain) != 0) {
1977 		ddi_err(DER_WARN, NULL,
1978 		    "no domain for faulting SID %08x", sid);
1979 		return;
1980 	}
1981 
1982 	immu = domain->dom_immu;
1983 
1984 	dvma_arg = dvma;
1985 	vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk,
1986 	    (void *)&dvma_arg);
1987 	if (dvma_arg != ~0ULL)
1988 		ddi_err(DER_WARN, domain->dom_dip,
1989 		    "faulting DVMA address is not in vmem arena");
1990 
1991 	nlevels = immu->immu_dvma_nlevels;
1992 	xlate_setup(dvma, xlate, nlevels);
1993 
1994 	if (!PDE_lookup(domain, xlate, nlevels)) {
1995 		ddi_err(DER_WARN, domain->dom_dip,
1996 		    "pte not found in domid %d for faulting addr %" PRIx64,
1997 		    domain->dom_did, dvma);
1998 		return;
1999 	}
2000 
2001 	xlatep = &xlate[1];
2002 	pte = *((hw_pdte_t *)
2003 	    (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx);
2004 
2005 	ddi_err(DER_WARN, domain->dom_dip,
2006 	    "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did,
2007 	    (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte));
2008 }
2009 
2010 /*ARGSUSED*/
2011 static void
PTE_set_one(immu_t * immu,hw_pdte_t * hwp,paddr_t paddr,dev_info_t * rdip,immu_flags_t immu_flags)2012 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
2013     dev_info_t *rdip, immu_flags_t immu_flags)
2014 {
2015 	hw_pdte_t pte;
2016 
2017 #ifndef DEBUG
2018 	pte = immu->immu_ptemask;
2019 	PDTE_SET_PADDR(pte, paddr);
2020 #else
2021 	pte = *hwp;
2022 
2023 	if (PDTE_P(pte)) {
2024 		if (PDTE_PADDR(pte) != paddr) {
2025 			ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
2026 			    PDTE_PADDR(pte), paddr);
2027 		}
2028 #ifdef BUGGY_DRIVERS
2029 		return;
2030 #else
2031 		goto out;
2032 #endif
2033 	}
2034 
2035 	/* clear TM field if not reserved */
2036 	if (immu->immu_TM_reserved == B_FALSE) {
2037 		PDTE_CLEAR_TM(pte);
2038 	}
2039 
2040 	/* Clear 3rd field for system software  - not used */
2041 	PDTE_CLEAR_SW3(pte);
2042 
2043 	/* Set paddr */
2044 	ASSERT(paddr % IMMU_PAGESIZE == 0);
2045 	PDTE_CLEAR_PADDR(pte);
2046 	PDTE_SET_PADDR(pte, paddr);
2047 
2048 	/*  clear SNP field if not reserved. */
2049 	if (immu->immu_SNP_reserved == B_FALSE) {
2050 		PDTE_CLEAR_SNP(pte);
2051 	}
2052 
2053 	/* Clear SW2 field available for software */
2054 	PDTE_CLEAR_SW2(pte);
2055 
2056 
2057 	/* SP is don't care for PTEs. Clear it for cleanliness */
2058 	PDTE_CLEAR_SP(pte);
2059 
2060 	/* Clear SW1 field available for software */
2061 	PDTE_CLEAR_SW1(pte);
2062 
2063 	/*
2064 	 * Now that we are done writing the PTE
2065 	 * set the "present" flag. Note this present
2066 	 * flag is a bit in the PDE/PTE that the
2067 	 * spec says is available for system software.
2068 	 * This is an implementation detail of Solaris
2069 	 * bare-metal Intel IOMMU.
2070 	 * The present field in a PDE/PTE is not defined
2071 	 * by the Vt-d spec
2072 	 */
2073 
2074 	PDTE_SET_P(pte);
2075 
2076 	pte |= immu->immu_ptemask;
2077 
2078 #ifndef BUGGY_DRIVERS
2079 out:
2080 #endif
2081 #endif /* DEBUG */
2082 #ifdef BUGGY_DRIVERS
2083 	PDTE_SET_READ(pte);
2084 	PDTE_SET_WRITE(pte);
2085 #else
2086 	if (immu_flags & IMMU_FLAGS_READ)
2087 		PDTE_SET_READ(pte);
2088 	if (immu_flags & IMMU_FLAGS_WRITE)
2089 		PDTE_SET_WRITE(pte);
2090 #endif /* BUGGY_DRIVERS */
2091 
2092 	*hwp = pte;
2093 }
2094 
2095 /*ARGSUSED*/
2096 static void
PTE_set_all(immu_t * immu,domain_t * domain,xlate_t * xlate,uint64_t * dvma_ptr,uint64_t * nvpages_ptr,immu_dcookie_t * dcookies,int dcount,dev_info_t * rdip,immu_flags_t immu_flags)2097 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
2098     uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies,
2099     int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
2100 {
2101 	paddr_t paddr;
2102 	uint64_t nvpages;
2103 	uint64_t nppages;
2104 	uint64_t dvma;
2105 	pgtable_t *pgtable;
2106 	hw_pdte_t *hwp;
2107 	hw_pdte_t *shwp;
2108 	int idx, nset;
2109 	int j;
2110 
2111 	pgtable = xlate->xlt_pgtable;
2112 	idx = xlate->xlt_idx;
2113 
2114 	dvma = *dvma_ptr;
2115 	nvpages = *nvpages_ptr;
2116 
2117 	/*
2118 	 * since a caller gets a unique dvma for a physical address,
2119 	 * no other concurrent thread will be writing to the same
2120 	 * PTE even if it has the same paddr. So no locks needed.
2121 	 */
2122 	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2123 
2124 	hwp = shwp;
2125 	for (j = dcount - 1; j >= 0; j--) {
2126 		if (nvpages <= dcookies[j].dck_npages)
2127 			break;
2128 		nvpages -= dcookies[j].dck_npages;
2129 	}
2130 
2131 	VERIFY(j >= 0);
2132 	nppages = nvpages;
2133 	paddr = dcookies[j].dck_paddr +
2134 	    (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE;
2135 
2136 	nvpages = *nvpages_ptr;
2137 	nset = 0;
2138 	for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
2139 		PTE_set_one(immu, hwp, paddr, rdip, immu_flags);
2140 		nset++;
2141 
2142 		ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
2143 		    == B_TRUE);
2144 		nppages--;
2145 		nvpages--;
2146 		paddr += IMMU_PAGESIZE;
2147 		dvma += IMMU_PAGESIZE;
2148 
2149 		if (nppages == 0) {
2150 			j++;
2151 		}
2152 
2153 		if (j == dcount)
2154 			break;
2155 
2156 		if (nppages == 0) {
2157 			nppages = dcookies[j].dck_npages;
2158 			paddr = dcookies[j].dck_paddr;
2159 		}
2160 	}
2161 
2162 	if (nvpages) {
2163 		*dvma_ptr = dvma;
2164 		*nvpages_ptr = nvpages;
2165 	} else {
2166 		*dvma_ptr = 0;
2167 		*nvpages_ptr = 0;
2168 	}
2169 
2170 	xlate->xlt_idx = idx;
2171 }
2172 
2173 /*ARGSUSED*/
2174 static void
PDE_set_one(immu_t * immu,hw_pdte_t * hwp,pgtable_t * next,dev_info_t * rdip,immu_flags_t immu_flags)2175 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
2176     dev_info_t *rdip, immu_flags_t immu_flags)
2177 {
2178 	hw_pdte_t pde;
2179 
2180 	pde = *hwp;
2181 
2182 	/* if PDE is already set, make sure it is correct */
2183 	if (PDTE_P(pde)) {
2184 		ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
2185 #ifdef BUGGY_DRIVERS
2186 		return;
2187 #else
2188 		goto out;
2189 #endif
2190 	}
2191 
2192 	/* Dont touch SW4, it is the present bit */
2193 
2194 	/* don't touch TM field it is reserved for PDEs */
2195 
2196 	/* 3rd field available for system software is not used */
2197 	PDTE_CLEAR_SW3(pde);
2198 
2199 	/* Set next level pgtable-paddr for PDE */
2200 	PDTE_CLEAR_PADDR(pde);
2201 	PDTE_SET_PADDR(pde, next->hwpg_paddr);
2202 
2203 	/* don't touch SNP field it is reserved for PDEs */
2204 
2205 	/* Clear second field available for system software */
2206 	PDTE_CLEAR_SW2(pde);
2207 
2208 	/* No super pages for PDEs */
2209 	PDTE_CLEAR_SP(pde);
2210 
2211 	/* Clear SW1 for software */
2212 	PDTE_CLEAR_SW1(pde);
2213 
2214 	/*
2215 	 * Now that we are done writing the PDE
2216 	 * set the "present" flag. Note this present
2217 	 * flag is a bit in the PDE/PTE that the
2218 	 * spec says is available for system software.
2219 	 * This is an implementation detail of Solaris
2220 	 * base-metal Intel IOMMU.
2221 	 * The present field in a PDE/PTE is not defined
2222 	 * by the Vt-d spec
2223 	 */
2224 
2225 #ifndef  BUGGY_DRIVERS
2226 out:
2227 #endif
2228 #ifdef  BUGGY_DRIVERS
2229 	PDTE_SET_READ(pde);
2230 	PDTE_SET_WRITE(pde);
2231 #else
2232 	if (immu_flags & IMMU_FLAGS_READ)
2233 		PDTE_SET_READ(pde);
2234 	if (immu_flags & IMMU_FLAGS_WRITE)
2235 		PDTE_SET_WRITE(pde);
2236 #endif
2237 
2238 	PDTE_SET_P(pde);
2239 
2240 	*hwp = pde;
2241 }
2242 
2243 /*
2244  * Used to set PDEs
2245  */
2246 static boolean_t
PDE_set_all(immu_t * immu,domain_t * domain,xlate_t * xlate,int nlevels,dev_info_t * rdip,immu_flags_t immu_flags)2247 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
2248     dev_info_t *rdip, immu_flags_t immu_flags)
2249 {
2250 	pgtable_t *pgtable;
2251 	pgtable_t *new;
2252 	pgtable_t *next;
2253 	hw_pdte_t *hwp;
2254 	int level;
2255 	uint_t idx;
2256 	krw_t rwtype;
2257 	boolean_t set = B_FALSE;
2258 
2259 	/* start with highest level pgtable i.e. root */
2260 	xlate += nlevels;
2261 
2262 	new = NULL;
2263 	xlate->xlt_pgtable = domain->dom_pgtable_root;
2264 	for (level = nlevels; level > 1; level--, xlate--) {
2265 		idx = xlate->xlt_idx;
2266 		pgtable = xlate->xlt_pgtable;
2267 
2268 		/* Lock the pgtable in READ mode first */
2269 		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
2270 		rwtype = RW_READER;
2271 again:
2272 		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2273 		next = (pgtable->swpg_next_array)[idx];
2274 
2275 		/*
2276 		 * check if leafier level already has a pgtable
2277 		 * if yes, verify
2278 		 */
2279 		if (next == NULL) {
2280 			if (new == NULL) {
2281 
2282 				IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *,
2283 				    rdip, int, level);
2284 
2285 				new = pgtable_alloc(immu, immu_flags);
2286 				if (new == NULL) {
2287 					ddi_err(DER_PANIC, rdip,
2288 					    "pgtable alloc err");
2289 				}
2290 				pgtable_zero(new);
2291 			}
2292 
2293 			/* Change to a write lock */
2294 			if (rwtype == RW_READER &&
2295 			    rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) {
2296 				rw_exit(&(pgtable->swpg_rwlock));
2297 				rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
2298 				rwtype = RW_WRITER;
2299 				goto again;
2300 			}
2301 			rwtype = RW_WRITER;
2302 			next = new;
2303 			(pgtable->swpg_next_array)[idx] = next;
2304 			new = NULL;
2305 			PDE_set_one(immu, hwp, next, rdip, immu_flags);
2306 			set = B_TRUE;
2307 			rw_downgrade(&(pgtable->swpg_rwlock));
2308 			rwtype = RW_READER;
2309 		}
2310 #ifndef  BUGGY_DRIVERS
2311 		else {
2312 			hw_pdte_t pde = *hwp;
2313 
2314 			/*
2315 			 * If buggy driver we already set permission
2316 			 * READ+WRITE so nothing to do for that case
2317 			 * XXX Check that read writer perms change before
2318 			 * actually setting perms. Also need to hold lock
2319 			 */
2320 			if (immu_flags & IMMU_FLAGS_READ)
2321 				PDTE_SET_READ(pde);
2322 			if (immu_flags & IMMU_FLAGS_WRITE)
2323 				PDTE_SET_WRITE(pde);
2324 
2325 			*hwp = pde;
2326 		}
2327 #endif
2328 
2329 		ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
2330 		    == B_TRUE);
2331 
2332 		(xlate - 1)->xlt_pgtable = next;
2333 		rw_exit(&(pgtable->swpg_rwlock));
2334 	}
2335 
2336 	if (new) {
2337 		pgtable_free(immu, new);
2338 	}
2339 
2340 	return (set);
2341 }
2342 
2343 /*
2344  * dvma_map()
2345  *     map a contiguous range of DVMA pages
2346  *
2347  *     immu: IOMMU unit for which we are generating DVMA cookies
2348  *   domain: domain
2349  *    sdvma: Starting dvma
2350  *   spaddr: Starting paddr
2351  *   npages: Number of pages
2352  *     rdip: requesting device
2353  *     immu_flags: flags
2354  */
2355 static boolean_t
dvma_map(domain_t * domain,uint64_t sdvma,uint64_t snvpages,immu_dcookie_t * dcookies,int dcount,dev_info_t * rdip,immu_flags_t immu_flags)2356 dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages,
2357     immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
2358     immu_flags_t immu_flags)
2359 {
2360 	uint64_t dvma;
2361 	uint64_t n;
2362 	immu_t *immu = domain->dom_immu;
2363 	int nlevels = immu->immu_dvma_nlevels;
2364 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2365 	boolean_t pde_set = B_FALSE;
2366 
2367 	n = snvpages;
2368 	dvma = sdvma;
2369 
2370 	while (n > 0) {
2371 		xlate_setup(dvma, xlate, nlevels);
2372 
2373 		/* Lookup or allocate PGDIRs and PGTABLEs if necessary */
2374 		if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags)
2375 		    == B_TRUE) {
2376 			pde_set = B_TRUE;
2377 		}
2378 
2379 		/* set all matching ptes that fit into this leaf pgtable */
2380 		PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies,
2381 		    dcount, rdip, immu_flags);
2382 	}
2383 
2384 	return (pde_set);
2385 }
2386 
2387 /*
2388  * dvma_unmap()
2389  *   unmap a range of DVMAs
2390  *
2391  * immu: IOMMU unit state
2392  * domain: domain for requesting device
2393  * ddip: domain-dip
2394  * dvma: starting DVMA
2395  * npages: Number of IMMU pages to be unmapped
2396  * rdip: requesting device
2397  */
2398 static void
dvma_unmap(domain_t * domain,uint64_t sdvma,uint64_t snpages,dev_info_t * rdip)2399 dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages,
2400     dev_info_t *rdip)
2401 {
2402 	immu_t *immu = domain->dom_immu;
2403 	int nlevels = immu->immu_dvma_nlevels;
2404 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2405 	uint64_t n;
2406 	uint64_t dvma;
2407 
2408 	dvma = sdvma;
2409 	n = snpages;
2410 
2411 	while (n > 0) {
2412 		/* setup the xlate array */
2413 		xlate_setup(dvma, xlate, nlevels);
2414 
2415 		/* just lookup existing pgtables. Should never fail */
2416 		if (!PDE_lookup(domain, xlate, nlevels))
2417 			ddi_err(DER_PANIC, rdip,
2418 			    "PTE not found for addr %" PRIx64,
2419 			    (unsigned long long)dvma);
2420 
2421 		/* clear all matching ptes that fit into this leaf pgtable */
2422 		PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip);
2423 	}
2424 
2425 	/* No need to flush IOTLB after unmap */
2426 }
2427 
2428 static uint64_t
dvma_alloc(domain_t * domain,ddi_dma_attr_t * dma_attr,uint_t npages,int kmf)2429 dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf)
2430 {
2431 	uint64_t dvma;
2432 	size_t xsize, align;
2433 	uint64_t minaddr, maxaddr;
2434 
2435 	/* parameters */
2436 	xsize = npages * IMMU_PAGESIZE;
2437 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2438 	minaddr = dma_attr->dma_attr_addr_lo;
2439 	maxaddr = dma_attr->dma_attr_addr_hi + 1;
2440 
2441 	/* handle the rollover cases */
2442 	if (maxaddr < dma_attr->dma_attr_addr_hi) {
2443 		maxaddr = dma_attr->dma_attr_addr_hi;
2444 	}
2445 
2446 	/*
2447 	 * allocate from vmem arena.
2448 	 */
2449 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2450 	    xsize, align, 0, 0, (void *)(uintptr_t)minaddr,
2451 	    (void *)(uintptr_t)maxaddr, kmf);
2452 
2453 	return (dvma);
2454 }
2455 
2456 static void
dvma_prealloc(dev_info_t * rdip,immu_hdl_priv_t * ihp,ddi_dma_attr_t * dma_attr)2457 dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr)
2458 {
2459 	int nlevels;
2460 	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp;
2461 	uint64_t dvma, n;
2462 	size_t xsize, align;
2463 	uint64_t minaddr, maxaddr, dmamax;
2464 	int on, npte, pindex;
2465 	hw_pdte_t *shwp;
2466 	immu_t *immu;
2467 	domain_t *domain;
2468 
2469 	/* parameters */
2470 	domain = IMMU_DEVI(rdip)->imd_domain;
2471 	immu = domain->dom_immu;
2472 	nlevels = immu->immu_dvma_nlevels;
2473 	xsize = IMMU_NPREPTES * IMMU_PAGESIZE;
2474 	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2475 	minaddr = dma_attr->dma_attr_addr_lo;
2476 	if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG)
2477 		dmamax = dma_attr->dma_attr_seg;
2478 	else
2479 		dmamax = dma_attr->dma_attr_addr_hi;
2480 	maxaddr = dmamax + 1;
2481 
2482 	if (maxaddr < dmamax)
2483 		maxaddr = dmamax;
2484 
2485 	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2486 	    xsize, align, 0, dma_attr->dma_attr_seg + 1,
2487 	    (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP);
2488 
2489 	ihp->ihp_predvma = dvma;
2490 	ihp->ihp_npremapped = 0;
2491 	if (dvma == 0)
2492 		return;
2493 
2494 	n = IMMU_NPREPTES;
2495 	pindex = 0;
2496 
2497 	/*
2498 	 * Set up a mapping at address 0, just so that all PDPs get allocated
2499 	 * now. Although this initial mapping should never be used,
2500 	 * explicitly set it to read-only, just to be safe.
2501 	 */
2502 	while (n > 0) {
2503 		xlate_setup(dvma, xlate, nlevels);
2504 
2505 		(void) PDE_set_all(immu, domain, xlate, nlevels, rdip,
2506 		    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2507 
2508 		xlp = &xlate[1];
2509 		shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr)
2510 		    + xlp->xlt_idx;
2511 		on = n;
2512 
2513 		PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie,
2514 		    1, rdip, IMMU_FLAGS_READ);
2515 
2516 		npte = on - n;
2517 
2518 		while (npte > 0) {
2519 			ihp->ihp_preptes[pindex++] = shwp;
2520 #ifdef BUGGY_DRIVERS
2521 			PDTE_CLEAR_WRITE(*shwp);
2522 #endif
2523 			shwp++;
2524 			npte--;
2525 		}
2526 	}
2527 }
2528 
2529 static void
dvma_prefree(dev_info_t * rdip,immu_hdl_priv_t * ihp)2530 dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp)
2531 {
2532 	domain_t *domain;
2533 
2534 	domain = IMMU_DEVI(rdip)->imd_domain;
2535 
2536 	if (ihp->ihp_predvma != 0) {
2537 		dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip);
2538 		vmem_free(domain->dom_dvma_arena,
2539 		    (void *)(uintptr_t)ihp->ihp_predvma,
2540 		    IMMU_NPREPTES * IMMU_PAGESIZE);
2541 	}
2542 }
2543 
2544 static void
dvma_free(domain_t * domain,uint64_t dvma,uint64_t npages)2545 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
2546 {
2547 	uint64_t size = npages * IMMU_PAGESIZE;
2548 
2549 	if (domain->dom_maptype != IMMU_MAPTYPE_XLATE)
2550 		return;
2551 
2552 	vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
2553 }
2554 
2555 static int
immu_map_dvmaseg(dev_info_t * rdip,ddi_dma_handle_t handle,immu_hdl_priv_t * ihp,struct ddi_dma_req * dmareq,ddi_dma_obj_t * dma_out)2556 immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle,
2557     immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq,
2558     ddi_dma_obj_t *dma_out)
2559 {
2560 	domain_t *domain;
2561 	immu_t *immu;
2562 	immu_flags_t immu_flags;
2563 	ddi_dma_atyp_t buftype;
2564 	ddi_dma_obj_t *dmar_object;
2565 	ddi_dma_attr_t *attrp;
2566 	uint64_t offset, paddr, dvma, sdvma, rwmask;
2567 	size_t npages, npgalloc;
2568 	uint_t psize, size, pcnt, dmax;
2569 	page_t **pparray;
2570 	caddr_t vaddr;
2571 	page_t *page;
2572 	struct as *vas;
2573 	immu_dcookie_t *dcookies;
2574 	int pde_set;
2575 
2576 	rwmask = 0;
2577 	page = NULL;
2578 	domain = IMMU_DEVI(rdip)->imd_domain;
2579 	immu = domain->dom_immu;
2580 	immu_flags = dma_to_immu_flags(dmareq);
2581 
2582 	attrp = &((ddi_dma_impl_t *)handle)->dmai_attr;
2583 
2584 	dmar_object = &dmareq->dmar_object;
2585 	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
2586 	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
2587 	buftype = dmar_object->dmao_type;
2588 	size = dmar_object->dmao_size;
2589 
2590 	IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t,
2591 	    buftype, uint_t, size);
2592 
2593 	dcookies = &ihp->ihp_dcookies[0];
2594 
2595 	pcnt = dmax = 0;
2596 
2597 	/* retrieve paddr, psize, offset from dmareq */
2598 	if (buftype == DMA_OTYP_PAGES) {
2599 		page = dmar_object->dmao_obj.pp_obj.pp_pp;
2600 		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
2601 		    MMU_PAGEOFFSET;
2602 		paddr = pfn_to_pa(page->p_pagenum) + offset;
2603 		psize = MIN((MMU_PAGESIZE - offset), size);
2604 		page = page->p_next;
2605 		vas = dmar_object->dmao_obj.virt_obj.v_as;
2606 	} else {
2607 		if (vas == NULL) {
2608 			vas = &kas;
2609 		}
2610 		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
2611 		if (pparray != NULL) {
2612 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
2613 			psize = MIN((MMU_PAGESIZE - offset), size);
2614 			pcnt++;
2615 		} else {
2616 			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat,
2617 			    vaddr)) + offset;
2618 			psize = MIN(size, (MMU_PAGESIZE - offset));
2619 			vaddr += psize;
2620 		}
2621 	}
2622 
2623 	npgalloc = IMMU_BTOPR(size + offset);
2624 
2625 	if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) {
2626 #ifdef BUGGY_DRIVERS
2627 		rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask;
2628 #else
2629 		rwmask = immu->immu_ptemask;
2630 		if (immu_flags & IMMU_FLAGS_READ)
2631 			rwmask |= PDTE_MASK_R;
2632 		if (immu_flags & IMMU_FLAGS_WRITE)
2633 			rwmask |= PDTE_MASK_W;
2634 #endif
2635 #ifdef DEBUG
2636 		rwmask |= PDTE_MASK_P;
2637 #endif
2638 		sdvma = ihp->ihp_predvma;
2639 		ihp->ihp_npremapped = npgalloc;
2640 		*ihp->ihp_preptes[0] =
2641 		    PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask;
2642 	} else {
2643 		ihp->ihp_npremapped = 0;
2644 		sdvma = dvma_alloc(domain, attrp, npgalloc,
2645 		    dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP);
2646 		if (sdvma == 0)
2647 			return (DDI_DMA_NORESOURCES);
2648 
2649 		dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET);
2650 		dcookies[0].dck_npages = 1;
2651 	}
2652 
2653 	IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc,
2654 	    uint64_t, sdvma);
2655 
2656 	dvma = sdvma;
2657 	pde_set = 0;
2658 	npages = 1;
2659 	size -= psize;
2660 	while (size > 0) {
2661 		/* get the size for this page (i.e. partial or full page) */
2662 		psize = MIN(size, MMU_PAGESIZE);
2663 		if (buftype == DMA_OTYP_PAGES) {
2664 			/* get the paddr from the page_t */
2665 			paddr = pfn_to_pa(page->p_pagenum);
2666 			page = page->p_next;
2667 		} else if (pparray != NULL) {
2668 			/* index into the array of page_t's to get the paddr */
2669 			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
2670 			pcnt++;
2671 		} else {
2672 			/* call into the VM to get the paddr */
2673 			paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr));
2674 			vaddr += psize;
2675 		}
2676 
2677 		if (ihp->ihp_npremapped > 0) {
2678 			*ihp->ihp_preptes[npages] =
2679 			    PDTE_PADDR(paddr) | rwmask;
2680 		} else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
2681 			dcookies[dmax].dck_npages++;
2682 		} else {
2683 			/* No, we need a new dcookie */
2684 			if (dmax == (IMMU_NDCK - 1)) {
2685 				/*
2686 				 * Ran out of dcookies. Map them now.
2687 				 */
2688 				if (dvma_map(domain, dvma,
2689 				    npages, dcookies, dmax + 1, rdip,
2690 				    immu_flags))
2691 					pde_set++;
2692 
2693 				IMMU_DPROBE4(immu__dvmamap__early,
2694 				    dev_info_t *, rdip, uint64_t, dvma,
2695 				    uint_t, npages, uint_t, dmax+1);
2696 
2697 				dvma += (npages << IMMU_PAGESHIFT);
2698 				npages = 0;
2699 				dmax = 0;
2700 			} else {
2701 				dmax++;
2702 			}
2703 			dcookies[dmax].dck_paddr = paddr;
2704 			dcookies[dmax].dck_npages = 1;
2705 		}
2706 		size -= psize;
2707 		if (npages != 0)
2708 			npages++;
2709 	}
2710 
2711 	/*
2712 	 * Finish up, mapping all, or all of the remaining,
2713 	 * physical memory ranges.
2714 	 */
2715 	if (ihp->ihp_npremapped == 0 && npages > 0) {
2716 		IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \
2717 		    uint64_t, dvma, uint_t, npages, uint_t, dmax+1);
2718 
2719 		if (dvma_map(domain, dvma, npages, dcookies,
2720 		    dmax + 1, rdip, immu_flags))
2721 			pde_set++;
2722 	}
2723 
2724 	/* Invalidate the IOTLB */
2725 	immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc,
2726 	    pde_set > 0 ? TLB_IVA_WHOLE : TLB_IVA_LEAF,
2727 	    &ihp->ihp_inv_wait);
2728 
2729 	ihp->ihp_ndvseg = 1;
2730 	ihp->ihp_dvseg[0].dvs_start = sdvma;
2731 	ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size;
2732 
2733 	dma_out->dmao_size = dmar_object->dmao_size;
2734 	dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET;
2735 	dma_out->dmao_obj.dvma_obj.dv_nseg = 1;
2736 	dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0];
2737 	dma_out->dmao_type = DMA_OTYP_DVADDR;
2738 
2739 	return (DDI_DMA_MAPPED);
2740 }
2741 
2742 static int
immu_unmap_dvmaseg(dev_info_t * rdip,ddi_dma_obj_t * dmao)2743 immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao)
2744 {
2745 	uint64_t dvma, npages;
2746 	domain_t *domain;
2747 	struct dvmaseg *dvs;
2748 
2749 	domain = IMMU_DEVI(rdip)->imd_domain;
2750 	dvs = dmao->dmao_obj.dvma_obj.dv_seg;
2751 
2752 	dvma = dvs[0].dvs_start;
2753 	npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off);
2754 
2755 #ifdef DEBUG
2756 	/* Unmap only in DEBUG mode */
2757 	dvma_unmap(domain, dvma, npages, rdip);
2758 #endif
2759 	dvma_free(domain, dvma, npages);
2760 
2761 	IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages,
2762 	    uint64_t, dvma);
2763 
2764 #ifdef DEBUG
2765 	/*
2766 	 * In the DEBUG case, the unmap was actually done,
2767 	 * but an IOTLB flush was not done. So, an explicit
2768 	 * write back flush is needed.
2769 	 */
2770 	immu_regs_wbf_flush(domain->dom_immu);
2771 #endif
2772 
2773 	return (DDI_SUCCESS);
2774 }
2775 
2776 /* ############################# Functions exported ######################## */
2777 
2778 /*
2779  * setup the DVMA subsystem
2780  * this code runs only for the first IOMMU unit
2781  */
2782 void
immu_dvma_setup(list_t * listp)2783 immu_dvma_setup(list_t *listp)
2784 {
2785 	immu_t *immu;
2786 	uint_t kval;
2787 	size_t nchains;
2788 
2789 	/* locks */
2790 	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);
2791 
2792 	/* Create lists */
2793 	list_create(&immu_unity_domain_list, sizeof (domain_t),
2794 	    offsetof(domain_t, dom_maptype_node));
2795 	list_create(&immu_xlate_domain_list, sizeof (domain_t),
2796 	    offsetof(domain_t, dom_maptype_node));
2797 
2798 	/* Setup BDF domain hash */
2799 	nchains = 0xff;
2800 	kval = mod_hash_iddata_gen(nchains);
2801 
2802 	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
2803 	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
2804 	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
2805 	    KM_NOSLEEP);
2806 
2807 	immu = list_head(listp);
2808 	for (; immu; immu = list_next(listp, immu)) {
2809 		create_unity_domain(immu);
2810 		did_init(immu);
2811 		context_init(immu);
2812 		immu->immu_dvma_setup = B_TRUE;
2813 	}
2814 }
2815 
2816 /*
2817  * Startup up one DVMA unit
2818  */
2819 void
immu_dvma_startup(immu_t * immu)2820 immu_dvma_startup(immu_t *immu)
2821 {
2822 	if (immu_gfxdvma_enable == B_FALSE &&
2823 	    immu->immu_dvma_gfx_only == B_TRUE) {
2824 		return;
2825 	}
2826 
2827 	/*
2828 	 * DVMA will start once IOMMU is "running"
2829 	 */
2830 	immu->immu_dvma_running = B_TRUE;
2831 }
2832 
2833 /*
2834  * immu_dvma_physmem_update()
2835  *       called when the installed memory on a
2836  *       system increases, to expand domain DVMA
2837  *       for domains with UNITY mapping
2838  */
2839 void
immu_dvma_physmem_update(uint64_t addr,uint64_t size)2840 immu_dvma_physmem_update(uint64_t addr, uint64_t size)
2841 {
2842 	uint64_t start;
2843 	uint64_t npages;
2844 	int dcount;
2845 	immu_dcookie_t dcookies[1] = {0};
2846 	domain_t *domain;
2847 
2848 	/*
2849 	 * Just walk the system-wide list of domains with
2850 	 * UNITY mapping. Both the list of *all* domains
2851 	 * and *UNITY* domains is protected by the same
2852 	 * single lock
2853 	 */
2854 	mutex_enter(&immu_domain_lock);
2855 	domain = list_head(&immu_unity_domain_list);
2856 	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
2857 		/*
2858 		 * Nothing to do if the IOMMU supports passthrough.
2859 		 */
2860 		if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap))
2861 			continue;
2862 
2863 		/* There is no vmem_arena for unity domains. Just map it */
2864 		ddi_err(DER_LOG, domain->dom_dip,
2865 		    "iommu: unity-domain: Adding map "
2866 		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);
2867 
2868 		start = IMMU_ROUNDOWN(addr);
2869 		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;
2870 
2871 		dcookies[0].dck_paddr = start;
2872 		dcookies[0].dck_npages = npages;
2873 		dcount = 1;
2874 		(void) dvma_map(domain, start, npages,
2875 		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2876 
2877 	}
2878 	mutex_exit(&immu_domain_lock);
2879 }
2880 
2881 int
immu_dvma_device_setup(dev_info_t * rdip,immu_flags_t immu_flags)2882 immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags)
2883 {
2884 	dev_info_t *ddip, *odip;
2885 	immu_t *immu;
2886 	domain_t *domain;
2887 
2888 	odip = rdip;
2889 
2890 	immu = immu_dvma_get_immu(rdip, immu_flags);
2891 	if (immu == NULL) {
2892 		/*
2893 		 * possible that there is no IOMMU unit for this device
2894 		 * - BIOS bugs are one example.
2895 		 */
2896 		ddi_err(DER_WARN, rdip, "No iommu unit found for device");
2897 		return (DDI_DMA_NORESOURCES);
2898 	}
2899 
2900 	/*
2901 	 * redirect isa devices attached under lpc to lpc dip
2902 	 */
2903 	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
2904 		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
2905 		if (rdip == NULL) {
2906 			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2907 			/*NOTREACHED*/
2908 		}
2909 	}
2910 
2911 	/* Reset immu, as redirection can change IMMU */
2912 	immu = NULL;
2913 
2914 	/*
2915 	 * for gart, redirect to the real graphic devinfo
2916 	 */
2917 	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
2918 		rdip = get_gfx_devinfo(rdip);
2919 		if (rdip == NULL) {
2920 			ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2921 			/*NOTREACHED*/
2922 		}
2923 	}
2924 
2925 	/*
2926 	 * Setup DVMA domain for the device. This does
2927 	 * work only the first time we do DVMA for a
2928 	 * device.
2929 	 */
2930 	ddip = NULL;
2931 	domain = device_domain(rdip, &ddip, immu_flags);
2932 	if (domain == NULL) {
2933 		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
2934 		return (DDI_DMA_NORESOURCES);
2935 	}
2936 
2937 	immu = domain->dom_immu;
2938 
2939 	/*
2940 	 * If a domain is found, we must also have a domain dip
2941 	 * which is the topmost ancestor dip of rdip that shares
2942 	 * the same domain with rdip.
2943 	 */
2944 	if (domain->dom_did == 0 || ddip == NULL) {
2945 		ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
2946 		    domain->dom_did, ddip);
2947 		return (DDI_DMA_NORESOURCES);
2948 	}
2949 
2950 	if (odip != rdip)
2951 		set_domain(odip, ddip, domain);
2952 
2953 	/*
2954 	 * Update the root and context entries
2955 	 */
2956 	if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
2957 	    != DDI_SUCCESS) {
2958 		ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
2959 		return (DDI_DMA_NORESOURCES);
2960 	}
2961 
2962 	return (DDI_SUCCESS);
2963 }
2964 
2965 int
immu_map_memrange(dev_info_t * rdip,memrng_t * mrng)2966 immu_map_memrange(dev_info_t *rdip, memrng_t *mrng)
2967 {
2968 	immu_dcookie_t dcookies[1] = {0};
2969 	boolean_t pde_set;
2970 	immu_t *immu;
2971 	domain_t *domain;
2972 	immu_inv_wait_t iw;
2973 
2974 	dcookies[0].dck_paddr = mrng->mrng_start;
2975 	dcookies[0].dck_npages = mrng->mrng_npages;
2976 
2977 	domain = IMMU_DEVI(rdip)->imd_domain;
2978 	immu = domain->dom_immu;
2979 
2980 	pde_set = dvma_map(domain, mrng->mrng_start,
2981 	    mrng->mrng_npages, dcookies, 1, rdip,
2982 	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2983 
2984 	immu_init_inv_wait(&iw, "memrange", B_TRUE);
2985 
2986 	immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start,
2987 	    mrng->mrng_npages, pde_set == B_TRUE ?
2988 	    TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw);
2989 
2990 	return (DDI_SUCCESS);
2991 }
2992 
2993 immu_devi_t *
immu_devi_get(dev_info_t * rdip)2994 immu_devi_get(dev_info_t *rdip)
2995 {
2996 	immu_devi_t *immu_devi;
2997 	volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu);
2998 
2999 	/* Just want atomic reads. No need for lock */
3000 	immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr,
3001 	    0);
3002 	return (immu_devi);
3003 }
3004 
3005 /*ARGSUSED*/
3006 int
immu_hdl_priv_ctor(void * buf,void * arg,int kmf)3007 immu_hdl_priv_ctor(void *buf, void *arg, int kmf)
3008 {
3009 	immu_hdl_priv_t *ihp;
3010 
3011 	ihp = buf;
3012 	immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE);
3013 
3014 	return (0);
3015 }
3016 
3017 /*
3018  * iommulib interface functions
3019  */
3020 static int
immu_probe(iommulib_handle_t handle,dev_info_t * dip)3021 immu_probe(iommulib_handle_t handle, dev_info_t *dip)
3022 {
3023 	immu_devi_t *immu_devi;
3024 	int ret;
3025 
3026 	if (!immu_enable)
3027 		return (DDI_FAILURE);
3028 
3029 	/*
3030 	 * Make sure the device has all the IOMMU structures
3031 	 * initialized. If this device goes through an IOMMU
3032 	 * unit (e.g. this probe function returns success),
3033 	 * this will be called at most N times, with N being
3034 	 * the number of IOMMUs in the system.
3035 	 *
3036 	 * After that, when iommulib_nex_open succeeds,
3037 	 * we can always assume that this device has all
3038 	 * the structures initialized. IOMMU_USED(dip) will
3039 	 * be true. There is no need to find the controlling
3040 	 * IOMMU/domain again.
3041 	 */
3042 	ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP);
3043 	if (ret != DDI_SUCCESS)
3044 		return (ret);
3045 
3046 	immu_devi = IMMU_DEVI(dip);
3047 
3048 	/*
3049 	 * For unity domains, there is no need to call in to
3050 	 * the IOMMU code.
3051 	 */
3052 	if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID)
3053 		return (DDI_FAILURE);
3054 
3055 	if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle))
3056 		return (DDI_SUCCESS);
3057 
3058 	return (DDI_FAILURE);
3059 }
3060 
3061 /*ARGSUSED*/
3062 static int
immu_allochdl(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_attr_t * attr,int (* waitfp)(caddr_t),caddr_t arg,ddi_dma_handle_t * dma_handlep)3063 immu_allochdl(iommulib_handle_t handle,
3064     dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
3065     int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep)
3066 {
3067 	int ret;
3068 	immu_hdl_priv_t *ihp;
3069 	immu_t *immu;
3070 
3071 	ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp,
3072 	    arg, dma_handlep);
3073 	if (ret == DDI_SUCCESS) {
3074 		immu = IMMU_DEVI(rdip)->imd_immu;
3075 
3076 		ihp = kmem_cache_alloc(immu->immu_hdl_cache,
3077 		    waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP);
3078 		if (ihp == NULL) {
3079 			(void) iommulib_iommu_dma_freehdl(dip, rdip,
3080 			    *dma_handlep);
3081 			return (DDI_DMA_NORESOURCES);
3082 		}
3083 
3084 		if (IMMU_DEVI(rdip)->imd_use_premap)
3085 			dvma_prealloc(rdip, ihp, attr);
3086 		else {
3087 			ihp->ihp_npremapped = 0;
3088 			ihp->ihp_predvma = 0;
3089 		}
3090 		ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep,
3091 		    ihp);
3092 	}
3093 	return (ret);
3094 }
3095 
3096 /*ARGSUSED*/
3097 static int
immu_freehdl(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_handle_t dma_handle)3098 immu_freehdl(iommulib_handle_t handle,
3099     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3100 {
3101 	immu_hdl_priv_t *ihp;
3102 
3103 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3104 	if (ihp != NULL) {
3105 		if (IMMU_DEVI(rdip)->imd_use_premap)
3106 			dvma_prefree(rdip, ihp);
3107 		kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp);
3108 	}
3109 
3110 	return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle));
3111 }
3112 
3113 
3114 /*ARGSUSED*/
3115 static int
immu_bindhdl(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_handle_t dma_handle,struct ddi_dma_req * dma_req,ddi_dma_cookie_t * cookiep,uint_t * ccountp)3116 immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
3117     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3118     struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep,
3119     uint_t *ccountp)
3120 {
3121 	int ret;
3122 	immu_hdl_priv_t *ihp;
3123 
3124 	ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle,
3125 	    dma_req, cookiep, ccountp);
3126 
3127 	if (ret == DDI_DMA_MAPPED) {
3128 		ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3129 		immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait);
3130 	}
3131 
3132 	return (ret);
3133 }
3134 
3135 /*ARGSUSED*/
3136 static int
immu_unbindhdl(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_handle_t dma_handle)3137 immu_unbindhdl(iommulib_handle_t handle,
3138     dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3139 {
3140 	return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle));
3141 }
3142 
3143 /*ARGSUSED*/
3144 static int
immu_sync(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_handle_t dma_handle,off_t off,size_t len,uint_t cachefl)3145 immu_sync(iommulib_handle_t handle, dev_info_t *dip,
3146     dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off,
3147     size_t len, uint_t cachefl)
3148 {
3149 	return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len,
3150 	    cachefl));
3151 }
3152 
3153 /*ARGSUSED*/
3154 static int
immu_win(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_handle_t dma_handle,uint_t win,off_t * offp,size_t * lenp,ddi_dma_cookie_t * cookiep,uint_t * ccountp)3155 immu_win(iommulib_handle_t handle, dev_info_t *dip,
3156     dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
3157     off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep,
3158     uint_t *ccountp)
3159 {
3160 	return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp,
3161 	    lenp, cookiep, ccountp));
3162 }
3163 
3164 /*ARGSUSED*/
3165 static int
immu_mapobject(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_handle_t dma_handle,struct ddi_dma_req * dmareq,ddi_dma_obj_t * dmao)3166 immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
3167     dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3168     struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao)
3169 {
3170 	immu_hdl_priv_t *ihp;
3171 
3172 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3173 
3174 	return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao));
3175 }
3176 
3177 /*ARGSUSED*/
3178 static int
immu_unmapobject(iommulib_handle_t handle,dev_info_t * dip,dev_info_t * rdip,ddi_dma_handle_t dma_handle,ddi_dma_obj_t * dmao)3179 immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
3180     dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao)
3181 {
3182 	immu_hdl_priv_t *ihp;
3183 
3184 	ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3185 	if (ihp->ihp_npremapped > 0)
3186 		return (DDI_SUCCESS);
3187 	return (immu_unmap_dvmaseg(rdip, dmao));
3188 }
3189