xref: /titanic_50/usr/src/uts/sun4/io/px/px_dma.c (revision cc4ec4394cda0c382f50cf9d771b6fcdeffa8c8d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
27  */
28 
29 /*
30  * PCI Express nexus DVMA and DMA core routines:
31  *	dma_map/dma_bind_handle implementation
32  *	bypass and peer-to-peer support
33  *	fast track DVMA space allocation
34  *	runtime DVMA debug
35  */
36 #include <sys/types.h>
37 #include <sys/kmem.h>
38 #include <sys/async.h>
39 #include <sys/sysmacros.h>
40 #include <sys/sunddi.h>
41 #include <sys/ddi_impldefs.h>
42 #include "px_obj.h"
43 
44 /*LINTLIBRARY*/
45 
46 /*
47  * px_dma_allocmp - Allocate a pci dma implementation structure
48  *
49  * An extra ddi_dma_attr structure is bundled with the usual ddi_dma_impl
50  * to hold unmodified device limits. The ddi_dma_attr inside the
51  * ddi_dma_impl structure is augumented with system limits to enhance
52  * DVMA performance at runtime. The unaugumented device limits saved
53  * right after (accessed through (ddi_dma_attr_t *)(mp + 1)) is used
54  * strictly for peer-to-peer transfers which do not obey system limits.
55  *
56  * return: DDI_SUCCESS DDI_DMA_NORESOURCES
57  */
58 ddi_dma_impl_t *
59 px_dma_allocmp(dev_info_t *dip, dev_info_t *rdip, int (*waitfp)(caddr_t),
60 	caddr_t arg)
61 {
62 	register ddi_dma_impl_t *mp;
63 	int sleep = (waitfp == DDI_DMA_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
64 
65 	/* Caution: we don't use zalloc to enhance performance! */
66 	if ((mp = kmem_alloc(sizeof (px_dma_hdl_t), sleep)) == 0) {
67 		DBG(DBG_DMA_MAP, dip, "can't alloc dma_handle\n");
68 		if (waitfp != DDI_DMA_DONTWAIT) {
69 			DBG(DBG_DMA_MAP, dip, "alloc_mp kmem cb\n");
70 			ddi_set_callback(waitfp, arg, &px_kmem_clid);
71 		}
72 		return (mp);
73 	}
74 
75 	mp->dmai_rdip = rdip;
76 	mp->dmai_flags = 0;
77 	mp->dmai_pfnlst = NULL;
78 	mp->dmai_winlst = NULL;
79 
80 	/*
81 	 * kmem_alloc debug: the following fields are not zero-ed
82 	 * mp->dmai_mapping = 0;
83 	 * mp->dmai_size = 0;
84 	 * mp->dmai_offset = 0;
85 	 * mp->dmai_minxfer = 0;
86 	 * mp->dmai_burstsizes = 0;
87 	 * mp->dmai_ndvmapages = 0;
88 	 * mp->dmai_pool/roffset = 0;
89 	 * mp->dmai_rflags = 0;
90 	 * mp->dmai_inuse/flags
91 	 * mp->dmai_nwin = 0;
92 	 * mp->dmai_winsize = 0;
93 	 * mp->dmai_nexus_private/tte = 0;
94 	 * mp->dmai_iopte/pfnlst
95 	 * mp->dmai_sbi/pfn0 = 0;
96 	 * mp->dmai_minfo/winlst/fdvma
97 	 * mp->dmai_rdip
98 	 * bzero(&mp->dmai_object, sizeof (ddi_dma_obj_t));
99 	 * bzero(&mp->dmai_attr, sizeof (ddi_dma_attr_t));
100 	 * mp->dmai_cookie = 0;
101 	 */
102 
103 	mp->dmai_attr.dma_attr_version = (uint_t)DMA_ATTR_VERSION;
104 	mp->dmai_attr.dma_attr_flags = (uint_t)0;
105 	mp->dmai_fault = 0;
106 	mp->dmai_fault_check = NULL;
107 	mp->dmai_fault_notify = NULL;
108 
109 	mp->dmai_error.err_ena = 0;
110 	mp->dmai_error.err_status = DDI_FM_OK;
111 	mp->dmai_error.err_expected = DDI_FM_ERR_UNEXPECTED;
112 	mp->dmai_error.err_ontrap = NULL;
113 	mp->dmai_error.err_fep = NULL;
114 	mp->dmai_error.err_cf = NULL;
115 
116 	/*
117 	 * The bdf protection value is set to immediate child
118 	 * at first. It gets modified by switch/bridge drivers
119 	 * as the code traverses down the fabric topology.
120 	 *
121 	 * XXX No IOMMU protection for broken devices.
122 	 */
123 	ASSERT((intptr_t)ddi_get_parent_data(rdip) >> 1 == 0);
124 	mp->dmai_bdf = ((intptr_t)ddi_get_parent_data(rdip) == 1) ?
125 	    PCIE_INVALID_BDF : pcie_get_bdf_for_dma_xfer(dip, rdip);
126 
127 	ndi_fmc_insert(rdip, DMA_HANDLE, mp, NULL);
128 	return (mp);
129 }
130 
131 void
132 px_dma_freemp(ddi_dma_impl_t *mp)
133 {
134 	ndi_fmc_remove(mp->dmai_rdip, DMA_HANDLE, mp);
135 	if (mp->dmai_ndvmapages > 1)
136 		px_dma_freepfn(mp);
137 	if (mp->dmai_winlst)
138 		px_dma_freewin(mp);
139 	kmem_free(mp, sizeof (px_dma_hdl_t));
140 }
141 
142 void
143 px_dma_freepfn(ddi_dma_impl_t *mp)
144 {
145 	void *addr = mp->dmai_pfnlst;
146 	if (addr) {
147 		size_t npages = mp->dmai_ndvmapages;
148 		if (npages > 1)
149 			kmem_free(addr, npages * sizeof (px_iopfn_t));
150 		mp->dmai_pfnlst = NULL;
151 	}
152 	mp->dmai_ndvmapages = 0;
153 }
154 
155 /*
156  * px_dma_lmts2hdl - alloate a ddi_dma_impl_t, validate practical limits
157  *			and convert dmareq->dmar_limits to mp->dmai_attr
158  *
159  * ddi_dma_impl_t member modified     input
160  * ------------------------------------------------------------------------
161  * mp->dmai_minxfer		    - dev
162  * mp->dmai_burstsizes		    - dev
163  * mp->dmai_flags		    - no limit? peer-to-peer only?
164  *
165  * ddi_dma_attr member modified       input
166  * ------------------------------------------------------------------------
167  * mp->dmai_attr.dma_attr_addr_lo   - dev lo, sys lo
168  * mp->dmai_attr.dma_attr_addr_hi   - dev hi, sys hi
169  * mp->dmai_attr.dma_attr_count_max - dev count max, dev/sys lo/hi delta
170  * mp->dmai_attr.dma_attr_seg       - 0         (no nocross   restriction)
171  * mp->dmai_attr.dma_attr_align     - 1         (no alignment restriction)
172  *
173  * The dlim_dmaspeed member of dmareq->dmar_limits is ignored.
174  */
175 ddi_dma_impl_t *
176 px_dma_lmts2hdl(dev_info_t *dip, dev_info_t *rdip, px_mmu_t *mmu_p,
177 	ddi_dma_req_t *dmareq)
178 {
179 	ddi_dma_impl_t *mp;
180 	ddi_dma_attr_t *attr_p;
181 	uint64_t syslo		= mmu_p->mmu_dvma_base;
182 	uint64_t syshi		= mmu_p->mmu_dvma_end;
183 	uint64_t fasthi		= mmu_p->mmu_dvma_fast_end;
184 	ddi_dma_lim_t *lim_p	= dmareq->dmar_limits;
185 	uint32_t count_max	= lim_p->dlim_cntr_max;
186 	uint64_t lo		= lim_p->dlim_addr_lo;
187 	uint64_t hi		= lim_p->dlim_addr_hi;
188 	if (hi <= lo) {
189 		DBG(DBG_DMA_MAP, dip, "Bad limits\n");
190 		return ((ddi_dma_impl_t *)DDI_DMA_NOMAPPING);
191 	}
192 	if (!count_max)
193 		count_max--;
194 
195 	if (!(mp = px_dma_allocmp(dip, rdip, dmareq->dmar_fp,
196 	    dmareq->dmar_arg)))
197 		return (NULL);
198 
199 	/* store original dev input at the 2nd ddi_dma_attr */
200 	attr_p = PX_DEV_ATTR(mp);
201 	SET_DMAATTR(attr_p, lo, hi, -1, count_max);
202 	SET_DMAALIGN(attr_p, 1);
203 
204 	lo = MAX(lo, syslo);
205 	hi = MIN(hi, syshi);
206 	if (hi <= lo)
207 		mp->dmai_flags |= PX_DMAI_FLAGS_PEER_ONLY;
208 	count_max = MIN(count_max, hi - lo);
209 
210 	if (PX_DEV_NOSYSLIMIT(lo, hi, syslo, fasthi, 1))
211 		mp->dmai_flags |= PX_DMAI_FLAGS_NOFASTLIMIT |
212 		    PX_DMAI_FLAGS_NOSYSLIMIT;
213 	else {
214 		if (PX_DEV_NOFASTLIMIT(lo, hi, syslo, syshi, 1))
215 			mp->dmai_flags |= PX_DMAI_FLAGS_NOFASTLIMIT;
216 	}
217 	if (PX_DMA_NOCTX(rdip))
218 		mp->dmai_flags |= PX_DMAI_FLAGS_NOCTX;
219 
220 	/* store augumented dev input to mp->dmai_attr */
221 	mp->dmai_burstsizes	= lim_p->dlim_burstsizes;
222 	attr_p = &mp->dmai_attr;
223 	SET_DMAATTR(attr_p, lo, hi, -1, count_max);
224 	SET_DMAALIGN(attr_p, 1);
225 	return (mp);
226 }
227 
228 /*
229  * Called from px_attach to check for bypass dma support and set
230  * flags accordingly.
231  */
232 int
233 px_dma_attach(px_t *px_p)
234 {
235 	uint64_t baddr;
236 
237 	if (px_lib_iommu_getbypass(px_p->px_dip, 0ull,
238 	    PCI_MAP_ATTR_WRITE|PCI_MAP_ATTR_READ,
239 	    &baddr) != DDI_ENOTSUP)
240 		/* ignore all other errors */
241 		px_p->px_dev_caps |= PX_BYPASS_DMA_ALLOWED;
242 
243 	px_p->px_dma_sync_opt = ddi_prop_get_int(DDI_DEV_T_ANY,
244 	    px_p->px_dip, DDI_PROP_DONTPASS, "dma-sync-options", 0);
245 
246 	if (px_p->px_dma_sync_opt != 0)
247 		px_p->px_dev_caps |= PX_DMA_SYNC_REQUIRED;
248 
249 	return (DDI_SUCCESS);
250 }
251 
/*
 * px_dma_attr2hdl
 *
 * This routine is called from the alloc handle entry point to sanity check the
 * dma attribute structure.  It reads the unmodified device attributes kept
 * at PX_DEV_ATTR(mp), clamps them against the applicable system limits and
 * stores the result in mp->dmai_attr, setting the matching mp->dmai_flags
 * (BYPASSREQ, PEER_ONLY, NOSYSLIMIT, NOFASTLIMIT, NOCTX) along the way.
 *
 * use by: px_dma_allochdl()
 *
 * return value:
 *
 *	DDI_SUCCESS		- on success
 *	DDI_DMA_BADATTR		- bypass requested but not supported, bad
 *				  segment/alignment combination, or address
 *				  limits that are empty or exclude the usable
 *				  range
 *	other			- whatever px_lib_dma_bypass_rngchk() returned
 *				  when the bypass range check failed
 */
int
px_dma_attr2hdl(px_t *px_p, ddi_dma_impl_t *mp)
{
	px_mmu_t *mmu_p = px_p->px_mmu_p;
	uint64_t syslo, syshi;
	int	ret;
	ddi_dma_attr_t *attrp		= PX_DEV_ATTR(mp);
	uint64_t hi			= attrp->dma_attr_addr_hi;
	uint64_t lo			= attrp->dma_attr_addr_lo;
	uint64_t align			= attrp->dma_attr_align;
	uint64_t nocross		= attrp->dma_attr_seg;
	uint64_t count_max		= attrp->dma_attr_count_max;

	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "attrp=%p cntr_max=%x.%08x\n",
	    attrp, HI32(count_max), LO32(count_max));
	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "hi=%x.%08x lo=%x.%08x\n",
	    HI32(hi), LO32(hi), HI32(lo), LO32(lo));
	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "seg=%x.%08x align=%x.%08x\n",
	    HI32(nocross), LO32(nocross), HI32(align), LO32(align));

	/* a segment boundary of 0 means "none": wrap it to UINT64_MAX */
	if (!nocross)
		nocross--;
	if (attrp->dma_attr_flags & DDI_DMA_FORCE_PHYSICAL) { /* BYPASS */

		DBG(DBG_DMA_ALLOCH, px_p->px_dip, "bypass mode\n");
		/*
		 * If Bypass DMA is not supported, return error so that
		 * target driver can fall back to dvma mode of operation
		 */
		if (!(px_p->px_dev_caps & PX_BYPASS_DMA_ALLOWED))
			return (DDI_DMA_BADATTR);
		mp->dmai_flags |= PX_DMAI_FLAGS_BYPASSREQ;
		/* bypass accepts no segment boundary ... */
		if (nocross != UINT64_MAX)
			return (DDI_DMA_BADATTR);
		/* ... and no alignment beyond one MMU page */
		if (align && (align > MMU_PAGE_SIZE))
			return (DDI_DMA_BADATTR);
		align = 1; /* align on 1 page boundary */

		/* do a range check and get the limits */
		ret = px_lib_dma_bypass_rngchk(px_p->px_dip, attrp,
		    &syslo, &syshi);
		if (ret != DDI_SUCCESS)
			return (ret);
	} else { /* MMU_XLATE or PEER_TO_PEER */
		/* alignment is at least one MMU page; turn it into a mask */
		align = MAX(align, MMU_PAGE_SIZE) - 1;
		/* every bit of the alignment mask must be set in nocross */
		if ((align & nocross) != align) {
			dev_info_t *rdip = mp->dmai_rdip;
			cmn_err(CE_WARN, "%s%d dma_attr_seg not aligned",
			    NAMEINST(rdip));
			return (DDI_DMA_BADATTR);
		}
		/* from here on align is expressed in MMU pages */
		align = MMU_BTOP(align + 1);
		syslo = mmu_p->mmu_dvma_base;
		syshi = mmu_p->mmu_dvma_end;
	}
	/* the device's own limits must describe a non-empty range */
	if (hi <= lo) {
		dev_info_t *rdip = mp->dmai_rdip;
		cmn_err(CE_WARN, "%s%d limits out of range", NAMEINST(rdip));
		return (DDI_DMA_BADATTR);
	}
	/* clamp the device range to the system range selected above */
	lo = MAX(lo, syslo);
	hi = MIN(hi, syshi);
	/* a counter max of 0 means "no limit": wrap it to UINT64_MAX */
	if (!count_max)
		count_max--;

	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "hi=%x.%08x, lo=%x.%08x\n",
	    HI32(hi), LO32(hi), HI32(lo), LO32(lo));
	if (hi <= lo) {
		/*
		 * If this is an IOMMU bypass access, the caller can't use
		 * the required addresses, so fail it.  Otherwise, it's
		 * peer-to-peer; ensure that the caller has no alignment or
		 * segment size restrictions.
		 */
		if ((mp->dmai_flags & PX_DMAI_FLAGS_BYPASSREQ) ||
		    (nocross < UINT32_MAX) || (align > 1))
			return (DDI_DMA_BADATTR);

		mp->dmai_flags |= PX_DMAI_FLAGS_PEER_ONLY;
	} else /* set practical counter_max value */
		count_max = MIN(count_max, hi - lo);

	/*
	 * First test against the full system range; if that fails, retry
	 * against the fast track range (syshi is reused for its end).
	 */
	if (PX_DEV_NOSYSLIMIT(lo, hi, syslo, syshi, align))
		mp->dmai_flags |= PX_DMAI_FLAGS_NOSYSLIMIT |
		    PX_DMAI_FLAGS_NOFASTLIMIT;
	else {
		syshi = mmu_p->mmu_dvma_fast_end;
		if (PX_DEV_NOFASTLIMIT(lo, hi, syslo, syshi, align))
			mp->dmai_flags |= PX_DMAI_FLAGS_NOFASTLIMIT;
	}
	if (PX_DMA_NOCTX(mp->dmai_rdip))
		mp->dmai_flags |= PX_DMAI_FLAGS_NOCTX;

	/* publish the clamped values in the augmented attribute */
	mp->dmai_burstsizes	= attrp->dma_attr_burstsizes;
	attrp = &mp->dmai_attr;
	SET_DMAATTR(attrp, lo, hi, nocross, count_max);
	return (DDI_SUCCESS);
}
364 
/*
 * True (1) when pfn lies in the closed range [bgn, end], else 0.
 * Every argument is parenthesized so that expression arguments keep their
 * own precedence; pfn is evaluated twice, so it must be free of side
 * effects.
 */
#define	TGT_PFN_INBETWEEN(pfn, bgn, end) \
	(((pfn) >= (bgn)) && ((pfn) <= (end)))
366 
/*
 * px_dma_type - determine which of the three types DMA (peer-to-peer,
 *		mmu bypass, or mmu translate) we are asked to do.
 *		Also checks pfn0 and rejects any non-peer-to-peer
 *		requests for peer-only devices.
 *
 *	return values:
 *		DDI_DMA_NOMAPPING - can't get valid pfn0, or bad dma type
 *		DDI_SUCCESS
 *
 *	dma handle members affected (set on exit):
 *	mp->dmai_object		- dmareq->dmar_object
 *	mp->dmai_rflags		- consistent?, nosync?, dmareq->dmar_flags
 *	mp->dmai_flags		- DMA type
 *	mp->dmai_pfn0		- 1st page pfn
 *	mp->dmai_roffset	- initialized to starting MMU page offset
 *	mp->dmai_ndvmapages	- # of total MMU pages of entire object
 *
 *	Note that mp->dmai_rflags (and possibly mp->dmai_flags) have
 *	already been modified when DDI_DMA_NOMAPPING is returned.
 */
int
px_dma_type(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
{
	dev_info_t *dip = px_p->px_dip;
	ddi_dma_obj_t *dobj_p = &dmareq->dmar_object;
	px_pec_t *pec_p = px_p->px_pec_p;
	uint32_t offset;
	pfn_t pfn0;
	uint_t redzone;

	/* keep only the DDI-defined request flags */
	mp->dmai_rflags = dmareq->dmar_flags & DMP_DDIFLAGS;

	/* syncs are skipped unless px_dma_attach() found them required */
	if (!(px_p->px_dev_caps & PX_DMA_SYNC_REQUIRED))
		mp->dmai_rflags |= DMP_NOSYNC;

	/* work out the first pfn and the offset into the first page */
	switch (dobj_p->dmao_type) {
	case DMA_OTYP_BUFVADDR:
	case DMA_OTYP_VADDR: {
		page_t **pplist = dobj_p->dmao_obj.virt_obj.v_priv;
		caddr_t vaddr = dobj_p->dmao_obj.virt_obj.v_addr;

		DBG(DBG_DMA_MAP, dip, "vaddr=%p pplist=%p\n", vaddr, pplist);
		offset = (ulong_t)vaddr & MMU_PAGE_OFFSET;
		if (pplist) {				/* shadow list */
			mp->dmai_flags |= PX_DMAI_FLAGS_PGPFN;
			pfn0 = page_pptonum(*pplist);
		} else {
			/* no shadow list: translate through the hat */
			/* (kas is used when no address space is given) */
			struct as *as_p = dobj_p->dmao_obj.virt_obj.v_as;
			struct hat *hat_p = as_p ? as_p->a_hat : kas.a_hat;
			pfn0 = hat_getpfnum(hat_p, vaddr);
		}
		}
		break;

	case DMA_OTYP_PAGES:
		offset = dobj_p->dmao_obj.pp_obj.pp_offset;
		mp->dmai_flags |= PX_DMAI_FLAGS_PGPFN;
		pfn0 = page_pptonum(dobj_p->dmao_obj.pp_obj.pp_pp);
		break;

	case DMA_OTYP_PADDR:
	default:
		cmn_err(CE_WARN, "%s%d requested unsupported dma type %x",
		    NAMEINST(mp->dmai_rdip), dobj_p->dmao_type);
		return (DDI_DMA_NOMAPPING);
	}
	if (pfn0 == PFN_INVALID) {
		cmn_err(CE_WARN, "%s%d: invalid pfn0 for DMA object %p",
		    NAMEINST(dip), dobj_p);
		return (DDI_DMA_NOMAPPING);
	}
	/*
	 * A first pfn inside either of the pec's pfn ranges (base32..last32
	 * or base64..last64) makes this a peer-to-peer request.
	 */
	if (TGT_PFN_INBETWEEN(pfn0, pec_p->pec_base32_pfn,
	    pec_p->pec_last32_pfn)) {
		mp->dmai_flags |= PX_DMAI_FLAGS_PTP|PX_DMAI_FLAGS_PTP32;
		goto done;	/* leave bypass and dvma flag as 0 */
	} else if (TGT_PFN_INBETWEEN(pfn0, pec_p->pec_base64_pfn,
	    pec_p->pec_last64_pfn)) {
		mp->dmai_flags |= PX_DMAI_FLAGS_PTP|PX_DMAI_FLAGS_PTP64;
		goto done;	/* leave bypass and dvma flag as 0 */
	}
	/* not peer-to-peer: a peer-only handle cannot go any further */
	if (PX_DMA_ISPEERONLY(mp)) {
		dev_info_t *rdip = mp->dmai_rdip;
		cmn_err(CE_WARN, "Bad peer-to-peer req %s%d", NAMEINST(rdip));
		return (DDI_DMA_NOMAPPING);
	}

	/*
	 * "||" binds tighter than "?:", so the redzone flag is chosen when
	 * either DDI_DMA_REDZONE or PX_DMAI_FLAGS_MAP_BUFZONE is set.
	 */
	redzone = (mp->dmai_rflags & DDI_DMA_REDZONE) ||
	    (mp->dmai_flags & PX_DMAI_FLAGS_MAP_BUFZONE) ?
	    PX_DMAI_FLAGS_REDZONE : 0;

	/* bypass if it was requested at alloc time, else DVMA (+ redzone) */
	mp->dmai_flags |= (mp->dmai_flags & PX_DMAI_FLAGS_BYPASSREQ) ?
	    PX_DMAI_FLAGS_BYPASS : (PX_DMAI_FLAGS_DVMA | redzone);
done:
	mp->dmai_object	 = *dobj_p;			/* whole object    */
	mp->dmai_pfn0	 = (void *)pfn0;		/* cache pfn0	   */
	mp->dmai_roffset = offset;			/* win0 pg0 offset */
	mp->dmai_ndvmapages = MMU_BTOPR(offset + mp->dmai_object.dmao_size);
	return (DDI_SUCCESS);
}
464 
465 /*
466  * px_dma_pgpfn - set up pfnlst array according to pages
467  *	VA/size pair: <shadow IO, bypass, peer-to-peer>, or OTYP_PAGES
468  */
469 /*ARGSUSED*/
470 static int
471 px_dma_pgpfn(px_t *px_p, ddi_dma_impl_t *mp, uint_t npages)
472 {
473 	int i;
474 	dev_info_t *dip = px_p->px_dip;
475 
476 	switch (mp->dmai_object.dmao_type) {
477 	case DMA_OTYP_BUFVADDR:
478 	case DMA_OTYP_VADDR: {
479 		page_t **pplist = mp->dmai_object.dmao_obj.virt_obj.v_priv;
480 		DBG(DBG_DMA_MAP, dip, "shadow pplist=%p, %x pages, pfns=",
481 		    pplist, npages);
482 		for (i = 1; i < npages; i++) {
483 			px_iopfn_t pfn = page_pptonum(pplist[i]);
484 			PX_SET_MP_PFN1(mp, i, pfn);
485 			DBG(DBG_DMA_MAP|DBG_CONT, dip, "%x ", pfn);
486 		}
487 		DBG(DBG_DMA_MAP|DBG_CONT, dip, "\n");
488 		}
489 		break;
490 
491 	case DMA_OTYP_PAGES: {
492 		page_t *pp = mp->dmai_object.dmao_obj.pp_obj.pp_pp->p_next;
493 		DBG(DBG_DMA_MAP, dip, "pp=%p pfns=", pp);
494 		for (i = 1; i < npages; i++, pp = pp->p_next) {
495 			px_iopfn_t pfn = page_pptonum(pp);
496 			PX_SET_MP_PFN1(mp, i, pfn);
497 			DBG(DBG_DMA_MAP|DBG_CONT, dip, "%x ", pfn);
498 		}
499 		DBG(DBG_DMA_MAP|DBG_CONT, dip, "\n");
500 		}
501 		break;
502 
503 	default:	/* check is already done by px_dma_type */
504 		ASSERT(0);
505 		break;
506 	}
507 	return (DDI_SUCCESS);
508 }
509 
510 /*
511  * px_dma_vapfn - set up pfnlst array according to VA
512  *	VA/size pair: <normal, bypass, peer-to-peer>
513  *	pfn0 is skipped as it is already done.
514  *	In this case, the cached pfn0 is used to fill pfnlst[0]
515  */
516 static int
517 px_dma_vapfn(px_t *px_p, ddi_dma_impl_t *mp, uint_t npages)
518 {
519 	dev_info_t *dip = px_p->px_dip;
520 	int i;
521 	caddr_t vaddr = (caddr_t)mp->dmai_object.dmao_obj.virt_obj.v_as;
522 	struct hat *hat_p = vaddr ? ((struct as *)vaddr)->a_hat : kas.a_hat;
523 
524 	vaddr = mp->dmai_object.dmao_obj.virt_obj.v_addr + MMU_PAGE_SIZE;
525 	for (i = 1; i < npages; i++, vaddr += MMU_PAGE_SIZE) {
526 		px_iopfn_t pfn = hat_getpfnum(hat_p, vaddr);
527 		if (pfn == PFN_INVALID)
528 			goto err_badpfn;
529 		PX_SET_MP_PFN1(mp, i, pfn);
530 		DBG(DBG_DMA_BINDH, dip, "px_dma_vapfn: mp=%p pfnlst[%x]=%x\n",
531 		    mp, i, pfn);
532 	}
533 	return (DDI_SUCCESS);
534 err_badpfn:
535 	cmn_err(CE_WARN, "%s%d: bad page frame vaddr=%p", NAMEINST(dip), vaddr);
536 	return (DDI_DMA_NOMAPPING);
537 }
538 
539 /*
540  * px_dma_pfn - Fills pfn list for all pages being DMA-ed.
541  *
542  * dependencies:
543  *	mp->dmai_ndvmapages	- set to total # of dma pages
544  *
545  * return value:
546  *	DDI_SUCCESS
547  *	DDI_DMA_NOMAPPING
548  */
549 int
550 px_dma_pfn(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
551 {
552 	uint32_t npages = mp->dmai_ndvmapages;
553 	int (*waitfp)(caddr_t) = dmareq->dmar_fp;
554 	int i, ret, peer = PX_DMA_ISPTP(mp);
555 	int peer32 = PX_DMA_ISPTP32(mp);
556 	dev_info_t *dip = px_p->px_dip;
557 
558 	px_pec_t *pec_p = px_p->px_pec_p;
559 	px_iopfn_t pfn_base = peer32 ? pec_p->pec_base32_pfn :
560 	    pec_p->pec_base64_pfn;
561 	px_iopfn_t pfn_last = peer32 ? pec_p->pec_last32_pfn :
562 	    pec_p->pec_last64_pfn;
563 	px_iopfn_t pfn_adj = peer ? pfn_base : 0;
564 
565 	DBG(DBG_DMA_BINDH, dip, "px_dma_pfn: mp=%p pfn0=%x\n",
566 	    mp, PX_MP_PFN0(mp) - pfn_adj);
567 	/* 1 page: no array alloc/fill, no mixed mode check */
568 	if (npages == 1) {
569 		PX_SET_MP_PFN(mp, 0, PX_MP_PFN0(mp) - pfn_adj);
570 		return (DDI_SUCCESS);
571 	}
572 	/* allocate pfn array */
573 	if (!(mp->dmai_pfnlst = kmem_alloc(npages * sizeof (px_iopfn_t),
574 	    waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP))) {
575 		if (waitfp != DDI_DMA_DONTWAIT)
576 			ddi_set_callback(waitfp, dmareq->dmar_arg,
577 			    &px_kmem_clid);
578 		return (DDI_DMA_NORESOURCES);
579 	}
580 	/* fill pfn array */
581 	PX_SET_MP_PFN(mp, 0, PX_MP_PFN0(mp) - pfn_adj);	/* pfnlst[0] */
582 	if ((ret = PX_DMA_ISPGPFN(mp) ? px_dma_pgpfn(px_p, mp, npages) :
583 	    px_dma_vapfn(px_p, mp, npages)) != DDI_SUCCESS)
584 		goto err;
585 
586 	/* skip pfn0, check mixed mode and adjust peer to peer pfn */
587 	for (i = 1; i < npages; i++) {
588 		px_iopfn_t pfn = PX_GET_MP_PFN1(mp, i);
589 		if (peer ^ TGT_PFN_INBETWEEN(pfn, pfn_base, pfn_last)) {
590 			cmn_err(CE_WARN, "%s%d mixed mode DMA %lx %lx",
591 			    NAMEINST(mp->dmai_rdip), PX_MP_PFN0(mp), pfn);
592 			ret = DDI_DMA_NOMAPPING;	/* mixed mode */
593 			goto err;
594 		}
595 		DBG(DBG_DMA_MAP, dip,
596 		    "px_dma_pfn: pfnlst[%x]=%x-%x\n", i, pfn, pfn_adj);
597 		if (pfn_adj)
598 			PX_SET_MP_PFN1(mp, i, pfn - pfn_adj);
599 	}
600 	return (DDI_SUCCESS);
601 err:
602 	px_dma_freepfn(mp);
603 	return (ret);
604 }
605 
606 /*
607  * px_dvma_win() - trim requested DVMA size down to window size
608  *	The 1st window starts from offset and ends at page-aligned boundary.
609  *	From the 2nd window on, each window starts and ends at page-aligned
610  *	boundary except the last window ends at wherever requested.
611  *
612  *	accesses the following mp-> members:
613  *	mp->dmai_attr.dma_attr_count_max
614  *	mp->dmai_attr.dma_attr_seg
615  *	mp->dmai_roffset   - start offset of 1st window
616  *	mp->dmai_rflags (redzone)
617  *	mp->dmai_ndvmapages (for 1 page fast path)
618  *
619  *	sets the following mp-> members:
620  *	mp->dmai_size	   - xfer size, != winsize if 1st/last win  (not fixed)
621  *	mp->dmai_winsize   - window size (no redzone), n * page size    (fixed)
622  *	mp->dmai_nwin	   - # of DMA windows of entire object		(fixed)
623  *	mp->dmai_rflags	   - remove partial flag if nwin == 1		(fixed)
624  *	mp->dmai_winlst	   - NULL, window objects not used for DVMA	(fixed)
625  *
626  *	fixed - not changed across different DMA windows
627  */
628 /*ARGSUSED*/
629 int
630 px_dvma_win(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
631 {
632 	uint32_t redzone_sz	= PX_HAS_REDZONE(mp) ? MMU_PAGE_SIZE : 0;
633 	size_t obj_sz		= mp->dmai_object.dmao_size;
634 	size_t xfer_sz;
635 	ulong_t pg_off;
636 
637 	if ((mp->dmai_ndvmapages == 1) && !redzone_sz) {
638 		mp->dmai_rflags &= ~DDI_DMA_PARTIAL;
639 		mp->dmai_size = obj_sz;
640 		mp->dmai_winsize = MMU_PAGE_SIZE;
641 		mp->dmai_nwin = 1;
642 		goto done;
643 	}
644 
645 	pg_off	= mp->dmai_roffset;
646 	xfer_sz	= obj_sz + redzone_sz;
647 
648 	/* include redzone in nocross check */	{
649 		uint64_t nocross = mp->dmai_attr.dma_attr_seg;
650 		if (xfer_sz + pg_off - 1 > nocross)
651 			xfer_sz = nocross - pg_off + 1;
652 		if (redzone_sz && (xfer_sz <= redzone_sz)) {
653 			DBG(DBG_DMA_MAP, px_p->px_dip,
654 			    "nocross too small: "
655 			    "%lx(%lx)+%lx+%lx < %llx\n",
656 			    xfer_sz, obj_sz, pg_off, redzone_sz, nocross);
657 			return (DDI_DMA_TOOBIG);
658 		}
659 	}
660 	xfer_sz -= redzone_sz;		/* restore transfer size  */
661 	/* check counter max */	{
662 		uint32_t count_max = mp->dmai_attr.dma_attr_count_max;
663 		if (xfer_sz - 1 > count_max)
664 			xfer_sz = count_max + 1;
665 	}
666 	if (xfer_sz >= obj_sz) {
667 		mp->dmai_rflags &= ~DDI_DMA_PARTIAL;
668 		mp->dmai_size = xfer_sz;
669 		mp->dmai_winsize = P2ROUNDUP(xfer_sz + pg_off, MMU_PAGE_SIZE);
670 		mp->dmai_nwin = 1;
671 		goto done;
672 	}
673 	if (!(dmareq->dmar_flags & DDI_DMA_PARTIAL)) {
674 		DBG(DBG_DMA_MAP, px_p->px_dip, "too big: %lx+%lx+%lx > %lx\n",
675 		    obj_sz, pg_off, redzone_sz, xfer_sz);
676 		return (DDI_DMA_TOOBIG);
677 	}
678 
679 	xfer_sz = MMU_PTOB(MMU_BTOP(xfer_sz + pg_off)); /* page align */
680 	mp->dmai_size = xfer_sz - pg_off;	/* 1st window xferrable size */
681 	mp->dmai_winsize = xfer_sz;		/* redzone not in winsize */
682 	mp->dmai_nwin = (obj_sz + pg_off + xfer_sz - 1) / xfer_sz;
683 done:
684 	mp->dmai_winlst = NULL;
685 	px_dump_dma_handle(DBG_DMA_MAP, px_p->px_dip, mp);
686 	return (DDI_SUCCESS);
687 }
688 
/*
 * fast track cache entry to mmu context: the entry is shifted left by 3,
 * its low 3 bits are copied back into bits 0-2, and bits 3-5 are forced
 * to 1 by the 0x38 mask.
 */
#define	MMU_FCE_TO_CTX(i)	(((i) << 3) | ((i) & 0x7) | 0x38)
694 
695 /*
696  * px_dvma_map_fast - attempts to map fast trackable DVMA
697  */
698 /*ARGSUSED*/
699 int
700 px_dvma_map_fast(px_mmu_t *mmu_p, ddi_dma_impl_t *mp)
701 {
702 	uint_t clustsz = px_dvma_page_cache_clustsz;
703 	uint_t entries = px_dvma_page_cache_entries;
704 	io_attributes_t attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
705 	    mp->dmai_attr.dma_attr_flags);
706 	int i = mmu_p->mmu_dvma_addr_scan_start;
707 	uint8_t *lock_addr = mmu_p->mmu_dvma_cache_locks + i;
708 	px_dvma_addr_t dvma_pg;
709 	size_t npages = MMU_BTOP(mp->dmai_winsize);
710 	dev_info_t *dip = mmu_p->mmu_px_p->px_dip;
711 
712 	extern uint8_t ldstub(uint8_t *);
713 	ASSERT(MMU_PTOB(npages) == mp->dmai_winsize);
714 	ASSERT(npages + PX_HAS_REDZONE(mp) <= clustsz);
715 
716 	for (; i < entries && ldstub(lock_addr); i++, lock_addr++)
717 		;
718 	if (i >= entries) {
719 		lock_addr = mmu_p->mmu_dvma_cache_locks;
720 		i = 0;
721 		for (; i < entries && ldstub(lock_addr); i++, lock_addr++)
722 			;
723 		if (i >= entries) {
724 #ifdef	PX_DMA_PROF
725 			px_dvmaft_exhaust++;
726 #endif	/* PX_DMA_PROF */
727 			return (DDI_DMA_NORESOURCES);
728 		}
729 	}
730 	mmu_p->mmu_dvma_addr_scan_start = (i + 1) & (entries - 1);
731 
732 	i *= clustsz;
733 	dvma_pg = mmu_p->dvma_base_pg + i;
734 
735 	if (px_lib_iommu_map(dip, PCI_TSBID(0, i), npages,
736 	    PX_ADD_ATTR_EXTNS(attr, mp->dmai_bdf), (void *)mp, 0,
737 	    MMU_MAP_PFN) != DDI_SUCCESS) {
738 		DBG(DBG_MAP_WIN, dip, "px_dvma_map_fast: "
739 		    "px_lib_iommu_map failed\n");
740 		return (DDI_FAILURE);
741 	}
742 
743 	if (!PX_MAP_BUFZONE(mp))
744 		goto done;
745 
746 	DBG(DBG_MAP_WIN, dip, "px_dvma_map_fast: redzone pg=%x\n", i + npages);
747 
748 	ASSERT(PX_HAS_REDZONE(mp));
749 
750 	if (px_lib_iommu_map(dip, PCI_TSBID(0, i + npages), 1,
751 	    PX_ADD_ATTR_EXTNS(attr, mp->dmai_bdf), (void *)mp, npages - 1,
752 	    MMU_MAP_PFN) != DDI_SUCCESS) {
753 		DBG(DBG_MAP_WIN, dip, "px_dvma_map_fast: "
754 		    "mapping REDZONE page failed\n");
755 
756 		(void) px_lib_iommu_demap(dip, PCI_TSBID(0, i), npages);
757 		return (DDI_FAILURE);
758 	}
759 
760 done:
761 #ifdef PX_DMA_PROF
762 	px_dvmaft_success++;
763 #endif
764 	mp->dmai_mapping = mp->dmai_roffset | MMU_PTOB(dvma_pg);
765 	mp->dmai_offset = 0;
766 	mp->dmai_flags |= PX_DMAI_FLAGS_FASTTRACK;
767 	PX_SAVE_MP_TTE(mp, attr);	/* save TTE template for unmapping */
768 	if (PX_DVMA_DBG_ON(mmu_p))
769 		px_dvma_alloc_debug(mmu_p, (char *)mp->dmai_mapping,
770 		    mp->dmai_size, mp);
771 	return (DDI_SUCCESS);
772 }
773 
/*
 * px_dvma_map: map non-fasttrack DMA
 *		Use quantum cache if single page DMA.
 *
 *	DVMA space for the first window (plus a redzone page when the
 *	handle has one) is taken from mmu_p->mmu_dvma_map and mapped with
 *	px_mmu_map_pages().
 *
 *	return value:
 *	DDI_SUCCESS		- mp->dmai_mapping/dmai_offset set, TTE saved
 *	DDI_DMA_NORESOURCES	- no DVMA space; a callback is registered on
 *				  mmu_p->mmu_dvma_clid unless dmar_fp is
 *				  DDI_DMA_DONTWAIT
 *	other			- px_mmu_map_pages() failure code; the DVMA
 *				  space has been given back
 */
int
px_dvma_map(ddi_dma_impl_t *mp, ddi_dma_req_t *dmareq, px_mmu_t *mmu_p)
{
	uint_t npages = PX_DMA_WINNPGS(mp);
	px_dvma_addr_t dvma_pg, dvma_pg_index;
	void *dvma_addr;
	io_attributes_t attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
	    mp->dmai_attr.dma_attr_flags);
	int sleep = dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP;
	dev_info_t *dip = mp->dmai_rdip;
	int	ret = DDI_SUCCESS;

	/*
	 * allocate dvma space resource and map in the first window.
	 * (vmem_t *vmp, size_t size,
	 *	size_t align, size_t phase, size_t nocross,
	 *	void *minaddr, void *maxaddr, int vmflag)
	 */
	if ((npages == 1) && !PX_HAS_REDZONE(mp) && PX_HAS_NOSYSLIMIT(mp)) {
		/* single unconstrained page: plain vmem_alloc() */
		dvma_addr = vmem_alloc(mmu_p->mmu_dvma_map,
		    MMU_PAGE_SIZE, sleep);
		/* remembered so that the matching vmem_free() is used */
		mp->dmai_flags |= PX_DMAI_FLAGS_VMEMCACHE;
#ifdef	PX_DMA_PROF
		px_dvma_vmem_alloc++;
#endif	/* PX_DMA_PROF */
	} else {
		/* constrained by alignment, segment and address limits */
		dvma_addr = vmem_xalloc(mmu_p->mmu_dvma_map,
		    MMU_PTOB(npages + PX_HAS_REDZONE(mp)),
		    MAX(mp->dmai_attr.dma_attr_align, MMU_PAGE_SIZE),
		    0,
		    mp->dmai_attr.dma_attr_seg + 1,
		    (void *)mp->dmai_attr.dma_attr_addr_lo,
		    (void *)(mp->dmai_attr.dma_attr_addr_hi + 1),
		    sleep);
#ifdef	PX_DMA_PROF
		px_dvma_vmem_xalloc++;
#endif	/* PX_DMA_PROF */
	}
	dvma_pg = MMU_BTOP((ulong_t)dvma_addr);
	dvma_pg_index = dvma_pg - mmu_p->dvma_base_pg;
	DBG(DBG_DMA_MAP, dip, "fallback dvma_pages: dvma_pg=%x index=%x\n",
	    dvma_pg, dvma_pg_index);
	/* page 0 is what a failed (NULL) allocation translates to */
	if (dvma_pg == 0)
		goto noresource;

	mp->dmai_mapping = mp->dmai_roffset | MMU_PTOB(dvma_pg);
	mp->dmai_offset = 0;
	PX_SAVE_MP_TTE(mp, attr);	/* mp->dmai_tte = tte */

	if ((ret = px_mmu_map_pages(mmu_p,
	    mp, dvma_pg, npages, 0)) != DDI_SUCCESS) {
		/* mapping failed: return the space the way it was obtained */
		if (mp->dmai_flags & PX_DMAI_FLAGS_VMEMCACHE) {
			vmem_free(mmu_p->mmu_dvma_map, (void *)dvma_addr,
			    MMU_PAGE_SIZE);
#ifdef PX_DMA_PROF
			px_dvma_vmem_free++;
#endif /* PX_DMA_PROF */
		} else {
			vmem_xfree(mmu_p->mmu_dvma_map, (void *)dvma_addr,
			    MMU_PTOB(npages + PX_HAS_REDZONE(mp)));
#ifdef PX_DMA_PROF
			px_dvma_vmem_xfree++;
#endif /* PX_DMA_PROF */
		}
	}

	return (ret);
noresource:
	/* let the caller be called back when DVMA space is available */
	if (dmareq->dmar_fp != DDI_DMA_DONTWAIT) {
		DBG(DBG_DMA_MAP, dip, "dvma_pg 0 - set callback\n");
		ddi_set_callback(dmareq->dmar_fp, dmareq->dmar_arg,
		    &mmu_p->mmu_dvma_clid);
	}
	DBG(DBG_DMA_MAP, dip, "vmem_xalloc - DDI_DMA_NORESOURCES\n");
	return (DDI_DMA_NORESOURCES);
}
854 
855 void
856 px_dvma_unmap(px_mmu_t *mmu_p, ddi_dma_impl_t *mp)
857 {
858 	px_dvma_addr_t dvma_addr = (px_dvma_addr_t)mp->dmai_mapping;
859 	px_dvma_addr_t dvma_pg = MMU_BTOP(dvma_addr);
860 	dvma_addr = MMU_PTOB(dvma_pg);
861 
862 	if (mp->dmai_flags & PX_DMAI_FLAGS_FASTTRACK) {
863 		px_iopfn_t index = dvma_pg - mmu_p->dvma_base_pg;
864 		ASSERT(index % px_dvma_page_cache_clustsz == 0);
865 		index /= px_dvma_page_cache_clustsz;
866 		ASSERT(index < px_dvma_page_cache_entries);
867 		mmu_p->mmu_dvma_cache_locks[index] = 0;
868 #ifdef	PX_DMA_PROF
869 		px_dvmaft_free++;
870 #endif	/* PX_DMA_PROF */
871 		return;
872 	}
873 
874 	if (mp->dmai_flags & PX_DMAI_FLAGS_VMEMCACHE) {
875 		vmem_free(mmu_p->mmu_dvma_map, (void *)dvma_addr,
876 		    MMU_PAGE_SIZE);
877 #ifdef PX_DMA_PROF
878 		px_dvma_vmem_free++;
879 #endif /* PX_DMA_PROF */
880 	} else {
881 		size_t npages = MMU_BTOP(mp->dmai_winsize) + PX_HAS_REDZONE(mp);
882 		vmem_xfree(mmu_p->mmu_dvma_map, (void *)dvma_addr,
883 		    MMU_PTOB(npages));
884 #ifdef PX_DMA_PROF
885 		px_dvma_vmem_xfree++;
886 #endif /* PX_DMA_PROF */
887 	}
888 }
889 
890 /*
891  * DVMA mappings may have multiple windows, but each window always have
892  * one segment.
893  */
894 int
895 px_dvma_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_impl_t *mp,
896 	enum ddi_dma_ctlops cmd, off_t *offp, size_t *lenp, caddr_t *objp,
897 	uint_t cache_flags)
898 {
899 	switch (cmd) {
900 	default:
901 		DBG(DBG_DMA_CTL, dip, "unknown command (%x): rdip=%s%d\n",
902 		    cmd, ddi_driver_name(rdip), ddi_get_instance(rdip));
903 		break;
904 	}
905 	return (DDI_FAILURE);
906 }
907 
908 void
909 px_dma_freewin(ddi_dma_impl_t *mp)
910 {
911 	px_dma_win_t *win_p = mp->dmai_winlst, *win2_p;
912 	for (win2_p = win_p; win_p; win2_p = win_p) {
913 		win_p = win2_p->win_next;
914 		kmem_free(win2_p, sizeof (px_dma_win_t) +
915 		    sizeof (ddi_dma_cookie_t) * win2_p->win_ncookies);
916 	}
917 	mp->dmai_nwin = 0;
918 	mp->dmai_winlst = NULL;
919 }
920 
921 /*
922  * px_dma_newwin - create a dma window object and cookies
923  *
924  *	After the initial scan in px_dma_physwin(), which identifies
925  *	a portion of the pfn array that belongs to a dma window,
926  *	we are called to allocate and initialize representing memory
927  *	resources. We know from the 1st scan the number of cookies
928  *	or dma segment in this window so we can allocate a contiguous
929  *	memory array for the dma cookies (The implementation of
930  *	ddi_dma_nextcookie(9f) dictates dma cookies be contiguous).
931  *
932  *	A second round scan is done on the pfn array to identify
933  *	each dma segment and initialize its corresponding dma cookie.
934  *	We don't need to do all the safety checking and we know they
935  *	all belong to the same dma window.
936  *
937  *	Input:	cookie_no - # of cookies identified by the 1st scan
938  *		start_idx - subscript of the pfn array for the starting pfn
939  *		end_idx   - subscript of the last pfn in dma window
940  *		win_pp    - pointer to win_next member of previous window
941  *	Return:	DDI_SUCCESS - with **win_pp as newly created window object
942  *		DDI_DMA_NORESROUCE - caller frees all previous window objs
943  *	Note:	Each cookie and window size are all initialized on page
944  *		boundary. This is not true for the 1st cookie of the 1st
945  *		window and the last cookie of the last window.
946  *		We fix that later in upper layer which has access to size
947  *		and offset info.
948  *
949  */
950 /*ARGSUSED*/
951 static int
952 px_dma_newwin(dev_info_t *dip, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp,
953 	uint32_t cookie_no, uint32_t start_idx, uint32_t end_idx,
954 	px_dma_win_t **win_pp, uint64_t count_max, uint64_t bypass)
955 {
956 	int (*waitfp)(caddr_t) = dmareq->dmar_fp;
957 	ddi_dma_cookie_t *cookie_p;
958 	uint32_t pfn_no = 1;
959 	px_iopfn_t pfn = PX_GET_MP_PFN(mp, start_idx);
960 	px_iopfn_t prev_pfn = pfn;
961 	uint64_t baddr, seg_pfn0 = pfn;
962 	size_t sz = cookie_no * sizeof (ddi_dma_cookie_t);
963 	px_dma_win_t *win_p = kmem_zalloc(sizeof (px_dma_win_t) + sz,
964 	    waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP);
965 	io_attributes_t	attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
966 	    mp->dmai_attr.dma_attr_flags);
967 
968 	if (!win_p)
969 		goto noresource;
970 
971 	win_p->win_next = NULL;
972 	win_p->win_ncookies = cookie_no;
973 	win_p->win_curseg = 0;	/* start from segment 0 */
974 	win_p->win_size = MMU_PTOB(end_idx - start_idx + 1);
975 	/* win_p->win_offset is left uninitialized */
976 
977 	cookie_p = (ddi_dma_cookie_t *)(win_p + 1);
978 	start_idx++;
979 	for (; start_idx <= end_idx; start_idx++, prev_pfn = pfn, pfn_no++) {
980 		pfn = PX_GET_MP_PFN1(mp, start_idx);
981 		if ((pfn == prev_pfn + 1) &&
982 		    (MMU_PTOB(pfn_no + 1) - 1 <= count_max))
983 			continue;
984 
985 		/* close up the cookie up to (including) prev_pfn */
986 		baddr = MMU_PTOB(seg_pfn0);
987 		if (bypass) {
988 			if (px_lib_iommu_getbypass(dip, baddr, attr, &baddr)
989 			    == DDI_SUCCESS)
990 				baddr = px_lib_ro_bypass(dip, attr, baddr);
991 			else
992 				return (DDI_FAILURE);
993 		}
994 
995 		MAKE_DMA_COOKIE(cookie_p, baddr, MMU_PTOB(pfn_no));
996 		DBG(DBG_BYPASS, mp->dmai_rdip, "cookie %p (%x pages)\n",
997 		    MMU_PTOB(seg_pfn0), pfn_no);
998 
999 		cookie_p++;	/* advance to next available cookie cell */
1000 		pfn_no = 0;
1001 		seg_pfn0 = pfn;	/* start a new segment from current pfn */
1002 	}
1003 
1004 	baddr = MMU_PTOB(seg_pfn0);
1005 	if (bypass) {
1006 		if (px_lib_iommu_getbypass(dip, baddr, attr, &baddr)
1007 		    == DDI_SUCCESS)
1008 			baddr = px_lib_ro_bypass(dip, attr, baddr);
1009 		else
1010 			return (DDI_FAILURE);
1011 	}
1012 
1013 	MAKE_DMA_COOKIE(cookie_p, baddr, MMU_PTOB(pfn_no));
1014 	DBG(DBG_BYPASS, mp->dmai_rdip, "cookie %p (%x pages) of total %x\n",
1015 	    MMU_PTOB(seg_pfn0), pfn_no, cookie_no);
1016 #ifdef	DEBUG
1017 	cookie_p++;
1018 	ASSERT((cookie_p - (ddi_dma_cookie_t *)(win_p + 1)) == cookie_no);
1019 #endif	/* DEBUG */
1020 	*win_pp = win_p;
1021 	return (DDI_SUCCESS);
1022 noresource:
1023 	if (waitfp != DDI_DMA_DONTWAIT)
1024 		ddi_set_callback(waitfp, dmareq->dmar_arg, &px_kmem_clid);
1025 	return (DDI_DMA_NORESOURCES);
1026 }
1027 
1028 /*
1029  * px_dma_adjust - adjust 1st and last cookie and window sizes
1030  *	remove initial dma page offset from 1st cookie and window size
1031  *	remove last dma page remainder from last cookie and window size
1032  *	fill win_offset of each dma window according to just fixed up
1033  *		each window sizes
1034  *	px_dma_win_t members modified:
1035  *	win_p->win_offset - this window's offset within entire DMA object
1036  *	win_p->win_size	  - xferrable size (in bytes) for this window
1037  *
1038  *	ddi_dma_impl_t members modified:
1039  *	mp->dmai_size	  - 1st window xferrable size
1040  *	mp->dmai_offset   - 0, which is the dma offset of the 1st window
1041  *
1042  *	ddi_dma_cookie_t members modified:
1043  *	cookie_p->dmac_size - 1st and last cookie remove offset or remainder
1044  *	cookie_p->dmac_laddress - 1st cookie add page offset
1045  */
1046 static void
1047 px_dma_adjust(ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp, px_dma_win_t *win_p)
1048 {
1049 	ddi_dma_cookie_t *cookie_p = (ddi_dma_cookie_t *)(win_p + 1);
1050 	size_t pg_offset = mp->dmai_roffset;
1051 	size_t win_offset = 0;
1052 
1053 	cookie_p->dmac_size -= pg_offset;
1054 	cookie_p->dmac_laddress |= pg_offset;
1055 	win_p->win_size -= pg_offset;
1056 	DBG(DBG_BYPASS, mp->dmai_rdip, "pg0 adjust %lx\n", pg_offset);
1057 
1058 	mp->dmai_size = win_p->win_size;
1059 	mp->dmai_offset = 0;
1060 
1061 	pg_offset += mp->dmai_object.dmao_size;
1062 	pg_offset &= MMU_PAGE_OFFSET;
1063 	if (pg_offset)
1064 		pg_offset = MMU_PAGE_SIZE - pg_offset;
1065 	DBG(DBG_BYPASS, mp->dmai_rdip, "last pg adjust %lx\n", pg_offset);
1066 
1067 	for (; win_p->win_next; win_p = win_p->win_next) {
1068 		DBG(DBG_BYPASS, mp->dmai_rdip, "win off %p\n", win_offset);
1069 		win_p->win_offset = win_offset;
1070 		win_offset += win_p->win_size;
1071 	}
1072 	/* last window */
1073 	win_p->win_offset = win_offset;
1074 	cookie_p = (ddi_dma_cookie_t *)(win_p + 1);
1075 	cookie_p[win_p->win_ncookies - 1].dmac_size -= pg_offset;
1076 	win_p->win_size -= pg_offset;
1077 	ASSERT((win_offset + win_p->win_size) == mp->dmai_object.dmao_size);
1078 }
1079 
1080 /*
1081  * px_dma_physwin() - carve up dma windows using physical addresses.
1082  *	Called to handle mmu bypass and pci peer-to-peer transfers.
1083  *	Calls px_dma_newwin() to allocate window objects.
1084  *
1085  * Dependency: mp->dmai_pfnlst points to an array of pfns
1086  *
 * 1. Each dma window is represented by a px_dma_win_t object.
 *	The object will be cast to ddi_dma_win_t and returned
 *	to leaf driver through the DDI interface.
 * 2. Each dma window can have several dma segments, with each
 *	segment representing a physically contiguous range of either
 *	memory space (if we are doing an mmu bypass transfer) or pci
 *	address space (if we are doing a peer-to-peer transfer).
1094  * 3. Each segment has a DMA cookie to program the DMA engine.
1095  *	The cookies within each DMA window must be located in a
1096  *	contiguous array per ddi_dma_nextcookie(9f).
1097  * 4. The number of DMA segments within each DMA window cannot exceed
1098  *	mp->dmai_attr.dma_attr_sgllen. If the transfer size is
1099  *	too large to fit in the sgllen, the rest needs to be
1100  *	relocated to the next dma window.
1101  * 5. Peer-to-peer DMA segment follows device hi, lo, count_max,
1102  *	and nocross restrictions while bypass DMA follows the set of
1103  *	restrictions with system limits factored in.
1104  *
1105  * Return:
 *	mp->dmai_winlst	 - points to a linked list of px_dma_win_t objects.
 *		Each px_dma_win_t object on the linked list contains
 *		information such as its window size (in bytes),
 *		starting offset (also see Restriction), an array of
 *		DMA cookies, and # of cookies in the array.
1111  *	mp->dmai_pfnlst	 - NULL, the pfn list is freed to conserve memory.
1112  *	mp->dmai_nwin	 - # of total DMA windows on mp->dmai_winlst.
1113  *	mp->dmai_mapping - starting cookie address
1114  *	mp->dmai_rflags	 - consistent, nosync, no redzone
1115  *	mp->dmai_cookie	 - start of cookie table of the 1st DMA window
1116  *
 * Restriction:
 *	Each px_dma_win_t object can theoretically start from any offset
 *	since the mmu is not involved. However, this implementation
 *	always makes windows start from a page aligned offset (except
 *	the 1st window, which follows the requested offset) due to the
 *	fact that we are handed a pfn list. This does require the device's
 *	count_max and attr_seg to be at least MMU_PAGE_SIZE aligned.
1124  */
1125 int
1126 px_dma_physwin(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
1127 {
1128 	uint_t npages = mp->dmai_ndvmapages;
1129 	int ret, sgllen = mp->dmai_attr.dma_attr_sgllen;
1130 	px_iopfn_t pfn_lo, pfn_hi, prev_pfn;
1131 	px_iopfn_t pfn = PX_GET_MP_PFN(mp, 0);
1132 	uint32_t i, win_no = 0, pfn_no = 1, win_pfn0_index = 0, cookie_no = 0;
1133 	uint64_t count_max, bypass_addr = 0;
1134 	px_dma_win_t **win_pp = (px_dma_win_t **)&mp->dmai_winlst;
1135 	ddi_dma_cookie_t *cookie0_p;
1136 	io_attributes_t attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
1137 	    mp->dmai_attr.dma_attr_flags);
1138 	dev_info_t *dip = px_p->px_dip;
1139 
1140 	ASSERT(PX_DMA_ISPTP(mp) || PX_DMA_ISBYPASS(mp));
1141 	if (PX_DMA_ISPTP(mp)) { /* ignore sys limits for peer-to-peer */
1142 		ddi_dma_attr_t *dev_attr_p = PX_DEV_ATTR(mp);
1143 		uint64_t nocross = dev_attr_p->dma_attr_seg;
1144 		px_pec_t *pec_p = px_p->px_pec_p;
1145 		px_iopfn_t pfn_last = PX_DMA_ISPTP32(mp) ?
1146 		    pec_p->pec_last32_pfn - pec_p->pec_base32_pfn :
1147 		    pec_p->pec_last64_pfn - pec_p->pec_base64_pfn;
1148 
1149 		if (nocross && (nocross < UINT32_MAX))
1150 			return (DDI_DMA_NOMAPPING);
1151 		if (dev_attr_p->dma_attr_align > MMU_PAGE_SIZE)
1152 			return (DDI_DMA_NOMAPPING);
1153 		pfn_lo = MMU_BTOP(dev_attr_p->dma_attr_addr_lo);
1154 		pfn_hi = MMU_BTOP(dev_attr_p->dma_attr_addr_hi);
1155 		pfn_hi = MIN(pfn_hi, pfn_last);
1156 		if ((pfn_lo > pfn_hi) || (pfn < pfn_lo))
1157 			return (DDI_DMA_NOMAPPING);
1158 
1159 		count_max = dev_attr_p->dma_attr_count_max;
1160 		count_max = MIN(count_max, nocross);
1161 		/*
1162 		 * the following count_max trim is not done because we are
1163 		 * making sure pfn_lo <= pfn <= pfn_hi inside the loop
1164 		 * count_max=MIN(count_max, MMU_PTOB(pfn_hi - pfn_lo + 1)-1);
1165 		 */
1166 	} else { /* bypass hi/lo/count_max have been processed by attr2hdl() */
1167 		count_max = mp->dmai_attr.dma_attr_count_max;
1168 		pfn_lo = MMU_BTOP(mp->dmai_attr.dma_attr_addr_lo);
1169 		pfn_hi = MMU_BTOP(mp->dmai_attr.dma_attr_addr_hi);
1170 
1171 		if (px_lib_iommu_getbypass(dip, MMU_PTOB(pfn),
1172 		    attr, &bypass_addr) != DDI_SUCCESS) {
1173 			DBG(DBG_BYPASS, mp->dmai_rdip,
1174 			    "bypass cookie failure %lx\n", pfn);
1175 			return (DDI_DMA_NOMAPPING);
1176 		}
1177 		pfn = MMU_BTOP(bypass_addr);
1178 	}
1179 
1180 	/* pfn: absolute (bypass mode) or relative (p2p mode) */
1181 	for (prev_pfn = pfn, i = 1; i < npages;
1182 	    i++, prev_pfn = pfn, pfn_no++) {
1183 		pfn = PX_GET_MP_PFN1(mp, i);
1184 		if (bypass_addr) {
1185 			if (px_lib_iommu_getbypass(dip, MMU_PTOB(pfn), attr,
1186 			    &bypass_addr) != DDI_SUCCESS) {
1187 				ret = DDI_DMA_NOMAPPING;
1188 				goto err;
1189 			}
1190 			pfn = MMU_BTOP(bypass_addr);
1191 		}
1192 		if ((pfn == prev_pfn + 1) &&
1193 		    (MMU_PTOB(pfn_no + 1) - 1 <= count_max))
1194 			continue;
1195 		if ((pfn < pfn_lo) || (prev_pfn > pfn_hi)) {
1196 			ret = DDI_DMA_NOMAPPING;
1197 			goto err;
1198 		}
1199 		cookie_no++;
1200 		pfn_no = 0;
1201 		if (cookie_no < sgllen)
1202 			continue;
1203 
1204 		DBG(DBG_BYPASS, mp->dmai_rdip, "newwin pfn[%x-%x] %x cks\n",
1205 		    win_pfn0_index, i - 1, cookie_no);
1206 		if (ret = px_dma_newwin(dip, dmareq, mp, cookie_no,
1207 		    win_pfn0_index, i - 1, win_pp, count_max, bypass_addr))
1208 			goto err;
1209 
1210 		win_pp = &(*win_pp)->win_next;	/* win_pp = *(win_pp) */
1211 		win_no++;
1212 		win_pfn0_index = i;
1213 		cookie_no = 0;
1214 	}
1215 	if (pfn > pfn_hi) {
1216 		ret = DDI_DMA_NOMAPPING;
1217 		goto err;
1218 	}
1219 	cookie_no++;
1220 	DBG(DBG_BYPASS, mp->dmai_rdip, "newwin pfn[%x-%x] %x cks\n",
1221 	    win_pfn0_index, i - 1, cookie_no);
1222 	if (ret = px_dma_newwin(dip, dmareq, mp, cookie_no, win_pfn0_index,
1223 	    i - 1, win_pp, count_max, bypass_addr))
1224 		goto err;
1225 	win_no++;
1226 	px_dma_adjust(dmareq, mp, mp->dmai_winlst);
1227 	mp->dmai_nwin = win_no;
1228 	mp->dmai_rflags |= DDI_DMA_CONSISTENT | DMP_NOSYNC;
1229 	mp->dmai_rflags &= ~DDI_DMA_REDZONE;
1230 	mp->dmai_flags |= PX_DMAI_FLAGS_NOSYNC;
1231 	cookie0_p = (ddi_dma_cookie_t *)(PX_WINLST(mp) + 1);
1232 	mp->dmai_cookie = PX_WINLST(mp)->win_ncookies > 1 ? cookie0_p + 1 : 0;
1233 	mp->dmai_mapping = cookie0_p->dmac_laddress;
1234 
1235 	px_dma_freepfn(mp);
1236 	return (DDI_DMA_MAPPED);
1237 err:
1238 	px_dma_freewin(mp);
1239 	return (ret);
1240 }
1241 
1242 int
1243 px_dma_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_impl_t *mp,
1244 	enum ddi_dma_ctlops cmd, off_t *offp, size_t *lenp, caddr_t *objp,
1245 	uint_t cache_flags)
1246 {
1247 	switch (cmd) {
1248 	default:
1249 		DBG(DBG_DMA_CTL, dip, "unknown command (%x): rdip=%s%d\n",
1250 		    cmd, ddi_driver_name(rdip), ddi_get_instance(rdip));
1251 		break;
1252 	}
1253 	return (DDI_FAILURE);
1254 }
1255 
1256 static void
1257 px_dvma_debug_init(px_mmu_t *mmu_p)
1258 {
1259 	size_t sz = sizeof (struct px_dvma_rec) * px_dvma_debug_rec;
1260 	ASSERT(MUTEX_HELD(&mmu_p->dvma_debug_lock));
1261 	cmn_err(CE_NOTE, "PCI Express DVMA %p stat ON", mmu_p);
1262 
1263 	mmu_p->dvma_alloc_rec = kmem_alloc(sz, KM_SLEEP);
1264 	mmu_p->dvma_free_rec = kmem_alloc(sz, KM_SLEEP);
1265 
1266 	mmu_p->dvma_active_list = NULL;
1267 	mmu_p->dvma_alloc_rec_index = 0;
1268 	mmu_p->dvma_free_rec_index = 0;
1269 	mmu_p->dvma_active_count = 0;
1270 }
1271 
1272 void
1273 px_dvma_debug_fini(px_mmu_t *mmu_p)
1274 {
1275 	struct px_dvma_rec *prev, *ptr;
1276 	size_t sz = sizeof (struct px_dvma_rec) * px_dvma_debug_rec;
1277 	uint64_t mask = ~(1ull << mmu_p->mmu_inst);
1278 	cmn_err(CE_NOTE, "PCI Express DVMA %p stat OFF", mmu_p);
1279 
1280 	if (mmu_p->dvma_alloc_rec) {
1281 		kmem_free(mmu_p->dvma_alloc_rec, sz);
1282 		mmu_p->dvma_alloc_rec = NULL;
1283 	}
1284 	if (mmu_p->dvma_free_rec) {
1285 		kmem_free(mmu_p->dvma_free_rec, sz);
1286 		mmu_p->dvma_free_rec = NULL;
1287 	}
1288 
1289 	prev = mmu_p->dvma_active_list;
1290 	if (!prev)
1291 		return;
1292 	for (ptr = prev->next; ptr; prev = ptr, ptr = ptr->next)
1293 		kmem_free(prev, sizeof (struct px_dvma_rec));
1294 	kmem_free(prev, sizeof (struct px_dvma_rec));
1295 
1296 	mmu_p->dvma_active_list = NULL;
1297 	mmu_p->dvma_alloc_rec_index = 0;
1298 	mmu_p->dvma_free_rec_index = 0;
1299 	mmu_p->dvma_active_count = 0;
1300 
1301 	px_dvma_debug_off &= mask;
1302 	px_dvma_debug_on &= mask;
1303 }
1304 
1305 void
1306 px_dvma_alloc_debug(px_mmu_t *mmu_p, char *address, uint_t len,
1307 	ddi_dma_impl_t *mp)
1308 {
1309 	struct px_dvma_rec *ptr;
1310 	mutex_enter(&mmu_p->dvma_debug_lock);
1311 
1312 	if (!mmu_p->dvma_alloc_rec)
1313 		px_dvma_debug_init(mmu_p);
1314 	if (PX_DVMA_DBG_OFF(mmu_p)) {
1315 		px_dvma_debug_fini(mmu_p);
1316 		goto done;
1317 	}
1318 
1319 	ptr = &mmu_p->dvma_alloc_rec[mmu_p->dvma_alloc_rec_index];
1320 	ptr->dvma_addr = address;
1321 	ptr->len = len;
1322 	ptr->mp = mp;
1323 	if (++mmu_p->dvma_alloc_rec_index == px_dvma_debug_rec)
1324 		mmu_p->dvma_alloc_rec_index = 0;
1325 
1326 	ptr = kmem_alloc(sizeof (struct px_dvma_rec), KM_SLEEP);
1327 	ptr->dvma_addr = address;
1328 	ptr->len = len;
1329 	ptr->mp = mp;
1330 
1331 	ptr->next = mmu_p->dvma_active_list;
1332 	mmu_p->dvma_active_list = ptr;
1333 	mmu_p->dvma_active_count++;
1334 done:
1335 	mutex_exit(&mmu_p->dvma_debug_lock);
1336 }
1337 
1338 void
1339 px_dvma_free_debug(px_mmu_t *mmu_p, char *address, uint_t len,
1340     ddi_dma_impl_t *mp)
1341 {
1342 	struct px_dvma_rec *ptr, *ptr_save;
1343 	mutex_enter(&mmu_p->dvma_debug_lock);
1344 
1345 	if (!mmu_p->dvma_alloc_rec)
1346 		px_dvma_debug_init(mmu_p);
1347 	if (PX_DVMA_DBG_OFF(mmu_p)) {
1348 		px_dvma_debug_fini(mmu_p);
1349 		goto done;
1350 	}
1351 
1352 	ptr = &mmu_p->dvma_free_rec[mmu_p->dvma_free_rec_index];
1353 	ptr->dvma_addr = address;
1354 	ptr->len = len;
1355 	ptr->mp = mp;
1356 	if (++mmu_p->dvma_free_rec_index == px_dvma_debug_rec)
1357 		mmu_p->dvma_free_rec_index = 0;
1358 
1359 	ptr_save = mmu_p->dvma_active_list;
1360 	for (ptr = ptr_save; ptr; ptr = ptr->next) {
1361 		if ((ptr->dvma_addr == address) && (ptr->len = len))
1362 			break;
1363 		ptr_save = ptr;
1364 	}
1365 	if (!ptr) {
1366 		cmn_err(CE_WARN, "bad dvma free addr=%lx len=%x",
1367 		    (long)address, len);
1368 		goto done;
1369 	}
1370 	if (ptr == mmu_p->dvma_active_list)
1371 		mmu_p->dvma_active_list = ptr->next;
1372 	else
1373 		ptr_save->next = ptr->next;
1374 	kmem_free(ptr, sizeof (struct px_dvma_rec));
1375 	mmu_p->dvma_active_count--;
1376 done:
1377 	mutex_exit(&mmu_p->dvma_debug_lock);
1378 }
1379 
#ifdef	DEBUG
/*
 * px_dump_dma_handle - dump the interesting members of a dma handle
 *
 *	flag - debug flag the output is issued under
 *	dip  - devinfo node the output is attributed to
 *	hp   - dma handle to dump
 *
 * The handle is only read. Output goes through DBG() in four pieces,
 * the last three flagged DBG_CONT.
 */
void
px_dump_dma_handle(uint64_t flag, dev_info_t *dip, ddi_dma_impl_t *hp)
{
	DBG(flag, dip, "mp(%p): flags=%x mapping=%lx xfer_size=%x\n",
	    hp, hp->dmai_inuse, hp->dmai_mapping, hp->dmai_size);
	DBG(flag|DBG_CONT, dip, "\tnpages=%x roffset=%x rflags=%x nwin=%x\n",
	    hp->dmai_ndvmapages, hp->dmai_roffset, hp->dmai_rflags,
	    hp->dmai_nwin);
	DBG(flag|DBG_CONT, dip, "\twinsize=%x tte=%p pfnlst=%p pfn0=%p\n",
	    hp->dmai_winsize, hp->dmai_tte, hp->dmai_pfnlst, hp->dmai_pfn0);
	/* dmai_winlst is a pointer, so print it with %p like its peers */
	DBG(flag|DBG_CONT, dip, "\twinlst=%p obj=%p attr=%p ckp=%p\n",
	    hp->dmai_winlst, &hp->dmai_object, &hp->dmai_attr,
	    hp->dmai_cookie);
}
#endif	/* DEBUG */
1396