xref: /illumos-gate/usr/src/uts/sun4/io/px/px_dma.c (revision b3619796d92b4472acfed6b7c813f83cef335013)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
27  */
28 
29 /*
30  * PCI Express nexus DVMA and DMA core routines:
31  *	dma_map/dma_bind_handle implementation
32  *	bypass and peer-to-peer support
33  *	fast track DVMA space allocation
34  *	runtime DVMA debug
35  */
36 #include <sys/types.h>
37 #include <sys/kmem.h>
38 #include <sys/async.h>
39 #include <sys/sysmacros.h>
40 #include <sys/sunddi.h>
41 #include <sys/ddi_impldefs.h>
42 #include "px_obj.h"
43 
44 /*LINTLIBRARY*/
45 
46 /*
47  * px_dma_allocmp - Allocate a pci dma implementation structure
48  *
49  * An extra ddi_dma_attr structure is bundled with the usual ddi_dma_impl
50  * to hold unmodified device limits. The ddi_dma_attr inside the
51  * ddi_dma_impl structure is augumented with system limits to enhance
52  * DVMA performance at runtime. The unaugumented device limits saved
53  * right after (accessed through (ddi_dma_attr_t *)(mp + 1)) is used
54  * strictly for peer-to-peer transfers which do not obey system limits.
55  *
56  * return: DDI_SUCCESS DDI_DMA_NORESOURCES
57  */
58 ddi_dma_impl_t *
59 px_dma_allocmp(dev_info_t *dip, dev_info_t *rdip, int (*waitfp)(caddr_t),
60 	caddr_t arg)
61 {
62 	register ddi_dma_impl_t *mp;
63 	int sleep = (waitfp == DDI_DMA_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
64 
65 	/* Caution: we don't use zalloc to enhance performance! */
66 	if ((mp = kmem_alloc(sizeof (px_dma_hdl_t), sleep)) == 0) {
67 		DBG(DBG_DMA_MAP, dip, "can't alloc dma_handle\n");
68 		if (waitfp != DDI_DMA_DONTWAIT) {
69 			DBG(DBG_DMA_MAP, dip, "alloc_mp kmem cb\n");
70 			ddi_set_callback(waitfp, arg, &px_kmem_clid);
71 		}
72 		return (mp);
73 	}
74 
75 	mp->dmai_rdip = rdip;
76 	mp->dmai_flags = 0;
77 	mp->dmai_pfnlst = NULL;
78 	mp->dmai_winlst = NULL;
79 	mp->dmai_ncookies = 0;
80 	mp->dmai_curcookie = 0;
81 
82 	/*
83 	 * kmem_alloc debug: the following fields are not zero-ed
84 	 * mp->dmai_mapping = 0;
85 	 * mp->dmai_size = 0;
86 	 * mp->dmai_offset = 0;
87 	 * mp->dmai_minxfer = 0;
88 	 * mp->dmai_burstsizes = 0;
89 	 * mp->dmai_ndvmapages = 0;
90 	 * mp->dmai_pool/roffset = 0;
91 	 * mp->dmai_rflags = 0;
92 	 * mp->dmai_inuse/flags
93 	 * mp->dmai_nwin = 0;
94 	 * mp->dmai_winsize = 0;
95 	 * mp->dmai_nexus_private/tte = 0;
96 	 * mp->dmai_iopte/pfnlst
97 	 * mp->dmai_sbi/pfn0 = 0;
98 	 * mp->dmai_minfo/winlst/fdvma
99 	 * mp->dmai_rdip
100 	 * bzero(&mp->dmai_object, sizeof (ddi_dma_obj_t));
101 	 * bzero(&mp->dmai_attr, sizeof (ddi_dma_attr_t));
102 	 * mp->dmai_cookie = 0;
103 	 */
104 
105 	mp->dmai_attr.dma_attr_version = (uint_t)DMA_ATTR_VERSION;
106 	mp->dmai_attr.dma_attr_flags = (uint_t)0;
107 	mp->dmai_fault = 0;
108 	mp->dmai_fault_check = NULL;
109 	mp->dmai_fault_notify = NULL;
110 
111 	mp->dmai_error.err_ena = 0;
112 	mp->dmai_error.err_status = DDI_FM_OK;
113 	mp->dmai_error.err_expected = DDI_FM_ERR_UNEXPECTED;
114 	mp->dmai_error.err_ontrap = NULL;
115 	mp->dmai_error.err_fep = NULL;
116 	mp->dmai_error.err_cf = NULL;
117 
118 	/*
119 	 * The bdf protection value is set to immediate child
120 	 * at first. It gets modified by switch/bridge drivers
121 	 * as the code traverses down the fabric topology.
122 	 *
123 	 * XXX No IOMMU protection for broken devices.
124 	 */
125 	ASSERT((intptr_t)ddi_get_parent_data(rdip) >> 1 == 0);
126 	mp->dmai_bdf = ((intptr_t)ddi_get_parent_data(rdip) == 1) ?
127 	    PCIE_INVALID_BDF : pcie_get_bdf_for_dma_xfer(dip, rdip);
128 
129 	ndi_fmc_insert(rdip, DMA_HANDLE, mp, NULL);
130 	return (mp);
131 }
132 
133 void
134 px_dma_freemp(ddi_dma_impl_t *mp)
135 {
136 	ndi_fmc_remove(mp->dmai_rdip, DMA_HANDLE, mp);
137 	if (mp->dmai_ndvmapages > 1)
138 		px_dma_freepfn(mp);
139 	if (mp->dmai_winlst)
140 		px_dma_freewin(mp);
141 	kmem_free(mp, sizeof (px_dma_hdl_t));
142 }
143 
144 void
145 px_dma_freepfn(ddi_dma_impl_t *mp)
146 {
147 	void *addr = mp->dmai_pfnlst;
148 	if (addr) {
149 		size_t npages = mp->dmai_ndvmapages;
150 		if (npages > 1)
151 			kmem_free(addr, npages * sizeof (px_iopfn_t));
152 		mp->dmai_pfnlst = NULL;
153 	}
154 	mp->dmai_ndvmapages = 0;
155 }
156 
157 /*
158  * px_dma_lmts2hdl - alloate a ddi_dma_impl_t, validate practical limits
159  *			and convert dmareq->dmar_limits to mp->dmai_attr
160  *
161  * ddi_dma_impl_t member modified     input
162  * ------------------------------------------------------------------------
163  * mp->dmai_minxfer		    - dev
164  * mp->dmai_burstsizes		    - dev
165  * mp->dmai_flags		    - no limit? peer-to-peer only?
166  *
167  * ddi_dma_attr member modified       input
168  * ------------------------------------------------------------------------
169  * mp->dmai_attr.dma_attr_addr_lo   - dev lo, sys lo
170  * mp->dmai_attr.dma_attr_addr_hi   - dev hi, sys hi
171  * mp->dmai_attr.dma_attr_count_max - dev count max, dev/sys lo/hi delta
172  * mp->dmai_attr.dma_attr_seg       - 0         (no nocross   restriction)
173  * mp->dmai_attr.dma_attr_align     - 1         (no alignment restriction)
174  *
175  * The dlim_dmaspeed member of dmareq->dmar_limits is ignored.
176  */
177 ddi_dma_impl_t *
178 px_dma_lmts2hdl(dev_info_t *dip, dev_info_t *rdip, px_mmu_t *mmu_p,
179 	ddi_dma_req_t *dmareq)
180 {
181 	ddi_dma_impl_t *mp;
182 	ddi_dma_attr_t *attr_p;
183 	uint64_t syslo		= mmu_p->mmu_dvma_base;
184 	uint64_t syshi		= mmu_p->mmu_dvma_end;
185 	uint64_t fasthi		= mmu_p->mmu_dvma_fast_end;
186 	ddi_dma_lim_t *lim_p	= dmareq->dmar_limits;
187 	uint32_t count_max	= lim_p->dlim_cntr_max;
188 	uint64_t lo		= lim_p->dlim_addr_lo;
189 	uint64_t hi		= lim_p->dlim_addr_hi;
190 	if (hi <= lo) {
191 		DBG(DBG_DMA_MAP, dip, "Bad limits\n");
192 		return ((ddi_dma_impl_t *)DDI_DMA_NOMAPPING);
193 	}
194 	if (!count_max)
195 		count_max--;
196 
197 	if (!(mp = px_dma_allocmp(dip, rdip, dmareq->dmar_fp,
198 	    dmareq->dmar_arg)))
199 		return (NULL);
200 
201 	/* store original dev input at the 2nd ddi_dma_attr */
202 	attr_p = PX_DEV_ATTR(mp);
203 	SET_DMAATTR(attr_p, lo, hi, -1, count_max);
204 	SET_DMAALIGN(attr_p, 1);
205 
206 	lo = MAX(lo, syslo);
207 	hi = MIN(hi, syshi);
208 	if (hi <= lo)
209 		mp->dmai_flags |= PX_DMAI_FLAGS_PEER_ONLY;
210 	count_max = MIN(count_max, hi - lo);
211 
212 	if (PX_DEV_NOSYSLIMIT(lo, hi, syslo, fasthi, 1))
213 		mp->dmai_flags |= PX_DMAI_FLAGS_NOFASTLIMIT |
214 		    PX_DMAI_FLAGS_NOSYSLIMIT;
215 	else {
216 		if (PX_DEV_NOFASTLIMIT(lo, hi, syslo, syshi, 1))
217 			mp->dmai_flags |= PX_DMAI_FLAGS_NOFASTLIMIT;
218 	}
219 	if (PX_DMA_NOCTX(rdip))
220 		mp->dmai_flags |= PX_DMAI_FLAGS_NOCTX;
221 
222 	/* store augumented dev input to mp->dmai_attr */
223 	mp->dmai_burstsizes	= lim_p->dlim_burstsizes;
224 	attr_p = &mp->dmai_attr;
225 	SET_DMAATTR(attr_p, lo, hi, -1, count_max);
226 	SET_DMAALIGN(attr_p, 1);
227 	return (mp);
228 }
229 
230 /*
231  * Called from px_attach to check for bypass dma support and set
232  * flags accordingly.
233  */
234 int
235 px_dma_attach(px_t *px_p)
236 {
237 	uint64_t baddr;
238 
239 	if (px_lib_iommu_getbypass(px_p->px_dip, 0ull,
240 	    PCI_MAP_ATTR_WRITE|PCI_MAP_ATTR_READ,
241 	    &baddr) != DDI_ENOTSUP)
242 		/* ignore all other errors */
243 		px_p->px_dev_caps |= PX_BYPASS_DMA_ALLOWED;
244 
245 	px_p->px_dma_sync_opt = ddi_prop_get_int(DDI_DEV_T_ANY,
246 	    px_p->px_dip, DDI_PROP_DONTPASS, "dma-sync-options", 0);
247 
248 	if (px_p->px_dma_sync_opt != 0)
249 		px_p->px_dev_caps |= PX_DMA_SYNC_REQUIRED;
250 
251 	return (DDI_SUCCESS);
252 }
253 
254 /*
255  * px_dma_attr2hdl
256  *
257  * This routine is called from the alloc handle entry point to sanity check the
258  * dma attribute structure.
259  *
260  * use by: px_dma_allochdl()
261  *
262  * return value:
263  *
264  *	DDI_SUCCESS		- on success
265  *	DDI_DMA_BADATTR		- attribute has invalid version number
266  *				  or address limits exclude dvma space
267  */
268 int
269 px_dma_attr2hdl(px_t *px_p, ddi_dma_impl_t *mp)
270 {
271 	px_mmu_t *mmu_p = px_p->px_mmu_p;
272 	uint64_t syslo, syshi;
273 	int	ret;
274 	ddi_dma_attr_t *attrp		= PX_DEV_ATTR(mp);
275 	uint64_t hi			= attrp->dma_attr_addr_hi;
276 	uint64_t lo			= attrp->dma_attr_addr_lo;
277 	uint64_t align			= attrp->dma_attr_align;
278 	uint64_t nocross		= attrp->dma_attr_seg;
279 	uint64_t count_max		= attrp->dma_attr_count_max;
280 
281 	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "attrp=%p cntr_max=%x.%08x\n",
282 	    attrp, HI32(count_max), LO32(count_max));
283 	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "hi=%x.%08x lo=%x.%08x\n",
284 	    HI32(hi), LO32(hi), HI32(lo), LO32(lo));
285 	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "seg=%x.%08x align=%x.%08x\n",
286 	    HI32(nocross), LO32(nocross), HI32(align), LO32(align));
287 
288 	if (!nocross)
289 		nocross--;
290 	if (attrp->dma_attr_flags & DDI_DMA_FORCE_PHYSICAL) { /* BYPASS */
291 
292 		DBG(DBG_DMA_ALLOCH, px_p->px_dip, "bypass mode\n");
293 		/*
294 		 * If Bypass DMA is not supported, return error so that
295 		 * target driver can fall back to dvma mode of operation
296 		 */
297 		if (!(px_p->px_dev_caps & PX_BYPASS_DMA_ALLOWED))
298 			return (DDI_DMA_BADATTR);
299 		mp->dmai_flags |= PX_DMAI_FLAGS_BYPASSREQ;
300 		if (nocross != UINT64_MAX)
301 			return (DDI_DMA_BADATTR);
302 		if (align && (align > MMU_PAGE_SIZE))
303 			return (DDI_DMA_BADATTR);
304 		align = 1; /* align on 1 page boundary */
305 
306 		/* do a range check and get the limits */
307 		ret = px_lib_dma_bypass_rngchk(px_p->px_dip, attrp,
308 		    &syslo, &syshi);
309 		if (ret != DDI_SUCCESS)
310 			return (ret);
311 	} else { /* MMU_XLATE or PEER_TO_PEER */
312 		align = MAX(align, MMU_PAGE_SIZE) - 1;
313 		if ((align & nocross) != align) {
314 			dev_info_t *rdip = mp->dmai_rdip;
315 			cmn_err(CE_WARN, "%s%d dma_attr_seg not aligned",
316 			    NAMEINST(rdip));
317 			return (DDI_DMA_BADATTR);
318 		}
319 		align = MMU_BTOP(align + 1);
320 		syslo = mmu_p->mmu_dvma_base;
321 		syshi = mmu_p->mmu_dvma_end;
322 	}
323 	if (hi <= lo) {
324 		dev_info_t *rdip = mp->dmai_rdip;
325 		cmn_err(CE_WARN, "%s%d limits out of range", NAMEINST(rdip));
326 		return (DDI_DMA_BADATTR);
327 	}
328 	lo = MAX(lo, syslo);
329 	hi = MIN(hi, syshi);
330 	if (!count_max)
331 		count_max--;
332 
333 	DBG(DBG_DMA_ALLOCH, px_p->px_dip, "hi=%x.%08x, lo=%x.%08x\n",
334 	    HI32(hi), LO32(hi), HI32(lo), LO32(lo));
335 	if (hi <= lo) {
336 		/*
337 		 * If this is an IOMMU bypass access, the caller can't use
338 		 * the required addresses, so fail it.  Otherwise, it's
339 		 * peer-to-peer; ensure that the caller has no alignment or
340 		 * segment size restrictions.
341 		 */
342 		if ((mp->dmai_flags & PX_DMAI_FLAGS_BYPASSREQ) ||
343 		    (nocross < UINT32_MAX) || (align > 1))
344 			return (DDI_DMA_BADATTR);
345 
346 		mp->dmai_flags |= PX_DMAI_FLAGS_PEER_ONLY;
347 	} else /* set practical counter_max value */
348 		count_max = MIN(count_max, hi - lo);
349 
350 	if (PX_DEV_NOSYSLIMIT(lo, hi, syslo, syshi, align))
351 		mp->dmai_flags |= PX_DMAI_FLAGS_NOSYSLIMIT |
352 		    PX_DMAI_FLAGS_NOFASTLIMIT;
353 	else {
354 		syshi = mmu_p->mmu_dvma_fast_end;
355 		if (PX_DEV_NOFASTLIMIT(lo, hi, syslo, syshi, align))
356 			mp->dmai_flags |= PX_DMAI_FLAGS_NOFASTLIMIT;
357 	}
358 	if (PX_DMA_NOCTX(mp->dmai_rdip))
359 		mp->dmai_flags |= PX_DMAI_FLAGS_NOCTX;
360 
361 	mp->dmai_burstsizes	= attrp->dma_attr_burstsizes;
362 	attrp = &mp->dmai_attr;
363 	SET_DMAATTR(attrp, lo, hi, nocross, count_max);
364 	return (DDI_SUCCESS);
365 }
366 
/*
 * TGT_PFN_INBETWEEN - non-zero when pfn lies in the inclusive range
 * [bgn, end].  Every argument is parenthesized so that expression
 * arguments (e.g. "x & mask" or "c ? a : b") keep their meaning.
 * Note that pfn is evaluated twice; do not pass side effects.
 */
#define	TGT_PFN_INBETWEEN(pfn, bgn, end) \
	(((pfn) >= (bgn)) && ((pfn) <= (end)))
368 
369 /*
370  * px_dma_type - determine which of the three types DMA (peer-to-peer,
371  *		mmu bypass, or mmu translate) we are asked to do.
372  *		Also checks pfn0 and rejects any non-peer-to-peer
373  *		requests for peer-only devices.
374  *
375  *	return values:
376  *		DDI_DMA_NOMAPPING - can't get valid pfn0, or bad dma type
377  *		DDI_SUCCESS
378  *
379  *	dma handle members affected (set on exit):
380  *	mp->dmai_object		- dmareq->dmar_object
381  *	mp->dmai_rflags		- consistent?, nosync?, dmareq->dmar_flags
382  *	mp->dmai_flags   	- DMA type
383  *	mp->dmai_pfn0   	- 1st page pfn (if va/size pair and not shadow)
384  *	mp->dmai_roffset 	- initialized to starting MMU page offset
385  *	mp->dmai_ndvmapages	- # of total MMU pages of entire object
386  */
387 int
388 px_dma_type(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
389 {
390 	dev_info_t *dip = px_p->px_dip;
391 	ddi_dma_obj_t *dobj_p = &dmareq->dmar_object;
392 	px_pec_t *pec_p = px_p->px_pec_p;
393 	uint32_t offset;
394 	pfn_t pfn0;
395 	uint_t redzone;
396 
397 	mp->dmai_rflags = dmareq->dmar_flags & DMP_DDIFLAGS;
398 
399 	if (!(px_p->px_dev_caps & PX_DMA_SYNC_REQUIRED))
400 		mp->dmai_rflags |= DMP_NOSYNC;
401 
402 	switch (dobj_p->dmao_type) {
403 	case DMA_OTYP_BUFVADDR:
404 	case DMA_OTYP_VADDR: {
405 		page_t **pplist = dobj_p->dmao_obj.virt_obj.v_priv;
406 		caddr_t vaddr = dobj_p->dmao_obj.virt_obj.v_addr;
407 
408 		DBG(DBG_DMA_MAP, dip, "vaddr=%p pplist=%p\n", vaddr, pplist);
409 		offset = (ulong_t)vaddr & MMU_PAGE_OFFSET;
410 		if (pplist) {				/* shadow list */
411 			mp->dmai_flags |= PX_DMAI_FLAGS_PGPFN;
412 			pfn0 = page_pptonum(*pplist);
413 		} else {
414 			struct as *as_p = dobj_p->dmao_obj.virt_obj.v_as;
415 			struct hat *hat_p = as_p ? as_p->a_hat : kas.a_hat;
416 			pfn0 = hat_getpfnum(hat_p, vaddr);
417 		}
418 		}
419 		break;
420 
421 	case DMA_OTYP_PAGES:
422 		offset = dobj_p->dmao_obj.pp_obj.pp_offset;
423 		mp->dmai_flags |= PX_DMAI_FLAGS_PGPFN;
424 		pfn0 = page_pptonum(dobj_p->dmao_obj.pp_obj.pp_pp);
425 		break;
426 
427 	case DMA_OTYP_PADDR:
428 	default:
429 		cmn_err(CE_WARN, "%s%d requested unsupported dma type %x",
430 		    NAMEINST(mp->dmai_rdip), dobj_p->dmao_type);
431 		return (DDI_DMA_NOMAPPING);
432 	}
433 	if (pfn0 == PFN_INVALID) {
434 		cmn_err(CE_WARN, "%s%d: invalid pfn0 for DMA object %p",
435 		    NAMEINST(dip), dobj_p);
436 		return (DDI_DMA_NOMAPPING);
437 	}
438 	if (TGT_PFN_INBETWEEN(pfn0, pec_p->pec_base32_pfn,
439 	    pec_p->pec_last32_pfn)) {
440 		mp->dmai_flags |= PX_DMAI_FLAGS_PTP|PX_DMAI_FLAGS_PTP32;
441 		goto done;	/* leave bypass and dvma flag as 0 */
442 	} else if (TGT_PFN_INBETWEEN(pfn0, pec_p->pec_base64_pfn,
443 	    pec_p->pec_last64_pfn)) {
444 		mp->dmai_flags |= PX_DMAI_FLAGS_PTP|PX_DMAI_FLAGS_PTP64;
445 		goto done;	/* leave bypass and dvma flag as 0 */
446 	}
447 	if (PX_DMA_ISPEERONLY(mp)) {
448 		dev_info_t *rdip = mp->dmai_rdip;
449 		cmn_err(CE_WARN, "Bad peer-to-peer req %s%d", NAMEINST(rdip));
450 		return (DDI_DMA_NOMAPPING);
451 	}
452 
453 	redzone = (mp->dmai_rflags & DDI_DMA_REDZONE) ||
454 	    (mp->dmai_flags & PX_DMAI_FLAGS_MAP_BUFZONE) ?
455 	    PX_DMAI_FLAGS_REDZONE : 0;
456 
457 	mp->dmai_flags |= (mp->dmai_flags & PX_DMAI_FLAGS_BYPASSREQ) ?
458 	    PX_DMAI_FLAGS_BYPASS : (PX_DMAI_FLAGS_DVMA | redzone);
459 done:
460 	mp->dmai_object	 = *dobj_p;			/* whole object    */
461 	mp->dmai_pfn0	 = (void *)pfn0;		/* cache pfn0	   */
462 	mp->dmai_roffset = offset;			/* win0 pg0 offset */
463 	mp->dmai_ndvmapages = MMU_BTOPR(offset + mp->dmai_object.dmao_size);
464 	return (DDI_SUCCESS);
465 }
466 
467 /*
468  * px_dma_pgpfn - set up pfnlst array according to pages
469  *	VA/size pair: <shadow IO, bypass, peer-to-peer>, or OTYP_PAGES
470  */
471 /*ARGSUSED*/
472 static int
473 px_dma_pgpfn(px_t *px_p, ddi_dma_impl_t *mp, uint_t npages)
474 {
475 	int i;
476 	dev_info_t *dip = px_p->px_dip;
477 
478 	switch (mp->dmai_object.dmao_type) {
479 	case DMA_OTYP_BUFVADDR:
480 	case DMA_OTYP_VADDR: {
481 		page_t **pplist = mp->dmai_object.dmao_obj.virt_obj.v_priv;
482 		DBG(DBG_DMA_MAP, dip, "shadow pplist=%p, %x pages, pfns=",
483 		    pplist, npages);
484 		for (i = 1; i < npages; i++) {
485 			px_iopfn_t pfn = page_pptonum(pplist[i]);
486 			PX_SET_MP_PFN1(mp, i, pfn);
487 			DBG(DBG_DMA_MAP|DBG_CONT, dip, "%x ", pfn);
488 		}
489 		DBG(DBG_DMA_MAP|DBG_CONT, dip, "\n");
490 		}
491 		break;
492 
493 	case DMA_OTYP_PAGES: {
494 		page_t *pp = mp->dmai_object.dmao_obj.pp_obj.pp_pp->p_next;
495 		DBG(DBG_DMA_MAP, dip, "pp=%p pfns=", pp);
496 		for (i = 1; i < npages; i++, pp = pp->p_next) {
497 			px_iopfn_t pfn = page_pptonum(pp);
498 			PX_SET_MP_PFN1(mp, i, pfn);
499 			DBG(DBG_DMA_MAP|DBG_CONT, dip, "%x ", pfn);
500 		}
501 		DBG(DBG_DMA_MAP|DBG_CONT, dip, "\n");
502 		}
503 		break;
504 
505 	default:	/* check is already done by px_dma_type */
506 		ASSERT(0);
507 		break;
508 	}
509 	return (DDI_SUCCESS);
510 }
511 
512 /*
513  * px_dma_vapfn - set up pfnlst array according to VA
514  *	VA/size pair: <normal, bypass, peer-to-peer>
515  *	pfn0 is skipped as it is already done.
516  *	In this case, the cached pfn0 is used to fill pfnlst[0]
517  */
518 static int
519 px_dma_vapfn(px_t *px_p, ddi_dma_impl_t *mp, uint_t npages)
520 {
521 	dev_info_t *dip = px_p->px_dip;
522 	int i;
523 	caddr_t vaddr = (caddr_t)mp->dmai_object.dmao_obj.virt_obj.v_as;
524 	struct hat *hat_p = vaddr ? ((struct as *)vaddr)->a_hat : kas.a_hat;
525 
526 	vaddr = mp->dmai_object.dmao_obj.virt_obj.v_addr + MMU_PAGE_SIZE;
527 	for (i = 1; i < npages; i++, vaddr += MMU_PAGE_SIZE) {
528 		px_iopfn_t pfn = hat_getpfnum(hat_p, vaddr);
529 		if (pfn == PFN_INVALID)
530 			goto err_badpfn;
531 		PX_SET_MP_PFN1(mp, i, pfn);
532 		DBG(DBG_DMA_BINDH, dip, "px_dma_vapfn: mp=%p pfnlst[%x]=%x\n",
533 		    mp, i, pfn);
534 	}
535 	return (DDI_SUCCESS);
536 err_badpfn:
537 	cmn_err(CE_WARN, "%s%d: bad page frame vaddr=%p", NAMEINST(dip), vaddr);
538 	return (DDI_DMA_NOMAPPING);
539 }
540 
541 /*
542  * px_dma_pfn - Fills pfn list for all pages being DMA-ed.
543  *
544  * dependencies:
545  *	mp->dmai_ndvmapages	- set to total # of dma pages
546  *
547  * return value:
548  *	DDI_SUCCESS
549  *	DDI_DMA_NOMAPPING
550  */
551 int
552 px_dma_pfn(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
553 {
554 	uint32_t npages = mp->dmai_ndvmapages;
555 	int (*waitfp)(caddr_t) = dmareq->dmar_fp;
556 	int i, ret, peer = PX_DMA_ISPTP(mp);
557 	int peer32 = PX_DMA_ISPTP32(mp);
558 	dev_info_t *dip = px_p->px_dip;
559 
560 	px_pec_t *pec_p = px_p->px_pec_p;
561 	px_iopfn_t pfn_base = peer32 ? pec_p->pec_base32_pfn :
562 	    pec_p->pec_base64_pfn;
563 	px_iopfn_t pfn_last = peer32 ? pec_p->pec_last32_pfn :
564 	    pec_p->pec_last64_pfn;
565 	px_iopfn_t pfn_adj = peer ? pfn_base : 0;
566 
567 	DBG(DBG_DMA_BINDH, dip, "px_dma_pfn: mp=%p pfn0=%x\n",
568 	    mp, PX_MP_PFN0(mp) - pfn_adj);
569 	/* 1 page: no array alloc/fill, no mixed mode check */
570 	if (npages == 1) {
571 		PX_SET_MP_PFN(mp, 0, PX_MP_PFN0(mp) - pfn_adj);
572 		return (DDI_SUCCESS);
573 	}
574 	/* allocate pfn array */
575 	if (!(mp->dmai_pfnlst = kmem_alloc(npages * sizeof (px_iopfn_t),
576 	    waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP))) {
577 		if (waitfp != DDI_DMA_DONTWAIT)
578 			ddi_set_callback(waitfp, dmareq->dmar_arg,
579 			    &px_kmem_clid);
580 		return (DDI_DMA_NORESOURCES);
581 	}
582 	/* fill pfn array */
583 	PX_SET_MP_PFN(mp, 0, PX_MP_PFN0(mp) - pfn_adj);	/* pfnlst[0] */
584 	if ((ret = PX_DMA_ISPGPFN(mp) ? px_dma_pgpfn(px_p, mp, npages) :
585 	    px_dma_vapfn(px_p, mp, npages)) != DDI_SUCCESS)
586 		goto err;
587 
588 	/* skip pfn0, check mixed mode and adjust peer to peer pfn */
589 	for (i = 1; i < npages; i++) {
590 		px_iopfn_t pfn = PX_GET_MP_PFN1(mp, i);
591 		if (peer ^ TGT_PFN_INBETWEEN(pfn, pfn_base, pfn_last)) {
592 			cmn_err(CE_WARN, "%s%d mixed mode DMA %lx %lx",
593 			    NAMEINST(mp->dmai_rdip), PX_MP_PFN0(mp), pfn);
594 			ret = DDI_DMA_NOMAPPING;	/* mixed mode */
595 			goto err;
596 		}
597 		DBG(DBG_DMA_MAP, dip,
598 		    "px_dma_pfn: pfnlst[%x]=%x-%x\n", i, pfn, pfn_adj);
599 		if (pfn_adj)
600 			PX_SET_MP_PFN1(mp, i, pfn - pfn_adj);
601 	}
602 	return (DDI_SUCCESS);
603 err:
604 	px_dma_freepfn(mp);
605 	return (ret);
606 }
607 
608 /*
609  * px_dvma_win() - trim requested DVMA size down to window size
610  *	The 1st window starts from offset and ends at page-aligned boundary.
611  *	From the 2nd window on, each window starts and ends at page-aligned
612  *	boundary except the last window ends at wherever requested.
613  *
614  *	accesses the following mp-> members:
615  *	mp->dmai_attr.dma_attr_count_max
616  *	mp->dmai_attr.dma_attr_seg
617  *	mp->dmai_roffset   - start offset of 1st window
618  *	mp->dmai_rflags (redzone)
619  *	mp->dmai_ndvmapages (for 1 page fast path)
620  *
621  *	sets the following mp-> members:
622  *	mp->dmai_size	   - xfer size, != winsize if 1st/last win  (not fixed)
623  *	mp->dmai_winsize   - window size (no redzone), n * page size    (fixed)
624  *	mp->dmai_nwin	   - # of DMA windows of entire object		(fixed)
625  *	mp->dmai_rflags	   - remove partial flag if nwin == 1		(fixed)
626  *	mp->dmai_winlst	   - NULL, window objects not used for DVMA	(fixed)
627  *
628  *	fixed - not changed across different DMA windows
629  */
630 /*ARGSUSED*/
631 int
632 px_dvma_win(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
633 {
634 	uint32_t redzone_sz	= PX_HAS_REDZONE(mp) ? MMU_PAGE_SIZE : 0;
635 	size_t obj_sz		= mp->dmai_object.dmao_size;
636 	size_t xfer_sz;
637 	ulong_t pg_off;
638 
639 	if ((mp->dmai_ndvmapages == 1) && !redzone_sz) {
640 		mp->dmai_rflags &= ~DDI_DMA_PARTIAL;
641 		mp->dmai_size = obj_sz;
642 		mp->dmai_winsize = MMU_PAGE_SIZE;
643 		mp->dmai_nwin = 1;
644 		goto done;
645 	}
646 
647 	pg_off	= mp->dmai_roffset;
648 	xfer_sz	= obj_sz + redzone_sz;
649 
650 	/* include redzone in nocross check */	{
651 		uint64_t nocross = mp->dmai_attr.dma_attr_seg;
652 		if (xfer_sz + pg_off - 1 > nocross)
653 			xfer_sz = nocross - pg_off + 1;
654 		if (redzone_sz && (xfer_sz <= redzone_sz)) {
655 			DBG(DBG_DMA_MAP, px_p->px_dip,
656 			    "nocross too small: "
657 			    "%lx(%lx)+%lx+%lx < %llx\n",
658 			    xfer_sz, obj_sz, pg_off, redzone_sz, nocross);
659 			return (DDI_DMA_TOOBIG);
660 		}
661 	}
662 	xfer_sz -= redzone_sz;		/* restore transfer size  */
663 	/* check counter max */	{
664 		uint32_t count_max = mp->dmai_attr.dma_attr_count_max;
665 		if (xfer_sz - 1 > count_max)
666 			xfer_sz = count_max + 1;
667 	}
668 	if (xfer_sz >= obj_sz) {
669 		mp->dmai_rflags &= ~DDI_DMA_PARTIAL;
670 		mp->dmai_size = xfer_sz;
671 		mp->dmai_winsize = P2ROUNDUP(xfer_sz + pg_off, MMU_PAGE_SIZE);
672 		mp->dmai_nwin = 1;
673 		goto done;
674 	}
675 	if (!(dmareq->dmar_flags & DDI_DMA_PARTIAL)) {
676 		DBG(DBG_DMA_MAP, px_p->px_dip, "too big: %lx+%lx+%lx > %lx\n",
677 		    obj_sz, pg_off, redzone_sz, xfer_sz);
678 		return (DDI_DMA_TOOBIG);
679 	}
680 
681 	xfer_sz = MMU_PTOB(MMU_BTOP(xfer_sz + pg_off)); /* page align */
682 	mp->dmai_size = xfer_sz - pg_off;	/* 1st window xferrable size */
683 	mp->dmai_winsize = xfer_sz;		/* redzone not in winsize */
684 	mp->dmai_nwin = (obj_sz + pg_off + xfer_sz - 1) / xfer_sz;
685 done:
686 	mp->dmai_winlst = NULL;
687 	px_dump_dma_handle(DBG_DMA_MAP, px_p->px_dip, mp);
688 	return (DDI_SUCCESS);
689 }
690 
/*
 * fast track cache entry to mmu context: the entry is shifted left by
 * 3 bits, its low 3 bits are kept in bits 0-2 of the result, and the
 * three bits in between (mask 0x38) are always set.
 */
#define	MMU_FCE_TO_CTX(i)	((((i) & 0x7) | 0x38) | ((i) << 3))
696 
697 /*
698  * px_dvma_map_fast - attempts to map fast trackable DVMA
699  */
700 /*ARGSUSED*/
701 int
702 px_dvma_map_fast(px_mmu_t *mmu_p, ddi_dma_impl_t *mp)
703 {
704 	uint_t clustsz = px_dvma_page_cache_clustsz;
705 	uint_t entries = px_dvma_page_cache_entries;
706 	io_attributes_t attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
707 	    mp->dmai_attr.dma_attr_flags);
708 	int i = mmu_p->mmu_dvma_addr_scan_start;
709 	uint8_t *lock_addr = mmu_p->mmu_dvma_cache_locks + i;
710 	px_dvma_addr_t dvma_pg;
711 	size_t npages = MMU_BTOP(mp->dmai_winsize);
712 	dev_info_t *dip = mmu_p->mmu_px_p->px_dip;
713 
714 	extern uint8_t ldstub(uint8_t *);
715 	ASSERT(MMU_PTOB(npages) == mp->dmai_winsize);
716 	ASSERT(npages + PX_HAS_REDZONE(mp) <= clustsz);
717 
718 	for (; i < entries && ldstub(lock_addr); i++, lock_addr++)
719 		;
720 	if (i >= entries) {
721 		lock_addr = mmu_p->mmu_dvma_cache_locks;
722 		i = 0;
723 		for (; i < entries && ldstub(lock_addr); i++, lock_addr++)
724 			;
725 		if (i >= entries) {
726 #ifdef	PX_DMA_PROF
727 			px_dvmaft_exhaust++;
728 #endif	/* PX_DMA_PROF */
729 			return (DDI_DMA_NORESOURCES);
730 		}
731 	}
732 	mmu_p->mmu_dvma_addr_scan_start = (i + 1) & (entries - 1);
733 
734 	i *= clustsz;
735 	dvma_pg = mmu_p->dvma_base_pg + i;
736 
737 	if (px_lib_iommu_map(dip, PCI_TSBID(0, i), npages,
738 	    PX_ADD_ATTR_EXTNS(attr, mp->dmai_bdf), (void *)mp, 0,
739 	    MMU_MAP_PFN) != DDI_SUCCESS) {
740 		DBG(DBG_MAP_WIN, dip, "px_dvma_map_fast: "
741 		    "px_lib_iommu_map failed\n");
742 		return (DDI_FAILURE);
743 	}
744 
745 	if (!PX_MAP_BUFZONE(mp))
746 		goto done;
747 
748 	DBG(DBG_MAP_WIN, dip, "px_dvma_map_fast: redzone pg=%x\n", i + npages);
749 
750 	ASSERT(PX_HAS_REDZONE(mp));
751 
752 	if (px_lib_iommu_map(dip, PCI_TSBID(0, i + npages), 1,
753 	    PX_ADD_ATTR_EXTNS(attr, mp->dmai_bdf), (void *)mp, npages - 1,
754 	    MMU_MAP_PFN) != DDI_SUCCESS) {
755 		DBG(DBG_MAP_WIN, dip, "px_dvma_map_fast: "
756 		    "mapping REDZONE page failed\n");
757 
758 		(void) px_lib_iommu_demap(dip, PCI_TSBID(0, i), npages);
759 		return (DDI_FAILURE);
760 	}
761 
762 done:
763 #ifdef PX_DMA_PROF
764 	px_dvmaft_success++;
765 #endif
766 	mp->dmai_mapping = mp->dmai_roffset | MMU_PTOB(dvma_pg);
767 	mp->dmai_offset = 0;
768 	mp->dmai_flags |= PX_DMAI_FLAGS_FASTTRACK;
769 	PX_SAVE_MP_TTE(mp, attr);	/* save TTE template for unmapping */
770 	if (PX_DVMA_DBG_ON(mmu_p))
771 		px_dvma_alloc_debug(mmu_p, (char *)mp->dmai_mapping,
772 		    mp->dmai_size, mp);
773 	return (DDI_SUCCESS);
774 }
775 
776 /*
777  * px_dvma_map: map non-fasttrack DMA
778  *		Use quantum cache if single page DMA.
779  */
780 int
781 px_dvma_map(ddi_dma_impl_t *mp, ddi_dma_req_t *dmareq, px_mmu_t *mmu_p)
782 {
783 	uint_t npages = PX_DMA_WINNPGS(mp);
784 	px_dvma_addr_t dvma_pg, dvma_pg_index;
785 	void *dvma_addr;
786 	io_attributes_t attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
787 	    mp->dmai_attr.dma_attr_flags);
788 	int sleep = dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP;
789 	dev_info_t *dip = mp->dmai_rdip;
790 	int	ret = DDI_SUCCESS;
791 
792 	/*
793 	 * allocate dvma space resource and map in the first window.
794 	 * (vmem_t *vmp, size_t size,
795 	 *	size_t align, size_t phase, size_t nocross,
796 	 *	void *minaddr, void *maxaddr, int vmflag)
797 	 */
798 	if ((npages == 1) && !PX_HAS_REDZONE(mp) && PX_HAS_NOSYSLIMIT(mp)) {
799 		dvma_addr = vmem_alloc(mmu_p->mmu_dvma_map,
800 		    MMU_PAGE_SIZE, sleep);
801 		mp->dmai_flags |= PX_DMAI_FLAGS_VMEMCACHE;
802 #ifdef	PX_DMA_PROF
803 		px_dvma_vmem_alloc++;
804 #endif	/* PX_DMA_PROF */
805 	} else {
806 		dvma_addr = vmem_xalloc(mmu_p->mmu_dvma_map,
807 		    MMU_PTOB(npages + PX_HAS_REDZONE(mp)),
808 		    MAX(mp->dmai_attr.dma_attr_align, MMU_PAGE_SIZE),
809 		    0,
810 		    mp->dmai_attr.dma_attr_seg + 1,
811 		    (void *)mp->dmai_attr.dma_attr_addr_lo,
812 		    (void *)(mp->dmai_attr.dma_attr_addr_hi + 1),
813 		    sleep);
814 #ifdef	PX_DMA_PROF
815 		px_dvma_vmem_xalloc++;
816 #endif	/* PX_DMA_PROF */
817 	}
818 	dvma_pg = MMU_BTOP((ulong_t)dvma_addr);
819 	dvma_pg_index = dvma_pg - mmu_p->dvma_base_pg;
820 	DBG(DBG_DMA_MAP, dip, "fallback dvma_pages: dvma_pg=%x index=%x\n",
821 	    dvma_pg, dvma_pg_index);
822 	if (dvma_pg == 0)
823 		goto noresource;
824 
825 	mp->dmai_mapping = mp->dmai_roffset | MMU_PTOB(dvma_pg);
826 	mp->dmai_offset = 0;
827 	PX_SAVE_MP_TTE(mp, attr);	/* mp->dmai_tte = tte */
828 
829 	if ((ret = px_mmu_map_pages(mmu_p,
830 	    mp, dvma_pg, npages, 0)) != DDI_SUCCESS) {
831 		if (mp->dmai_flags & PX_DMAI_FLAGS_VMEMCACHE) {
832 			vmem_free(mmu_p->mmu_dvma_map, (void *)dvma_addr,
833 			    MMU_PAGE_SIZE);
834 #ifdef PX_DMA_PROF
835 			px_dvma_vmem_free++;
836 #endif /* PX_DMA_PROF */
837 		} else {
838 			vmem_xfree(mmu_p->mmu_dvma_map, (void *)dvma_addr,
839 			    MMU_PTOB(npages + PX_HAS_REDZONE(mp)));
840 #ifdef PX_DMA_PROF
841 			px_dvma_vmem_xfree++;
842 #endif /* PX_DMA_PROF */
843 		}
844 	}
845 
846 	return (ret);
847 noresource:
848 	if (dmareq->dmar_fp != DDI_DMA_DONTWAIT) {
849 		DBG(DBG_DMA_MAP, dip, "dvma_pg 0 - set callback\n");
850 		ddi_set_callback(dmareq->dmar_fp, dmareq->dmar_arg,
851 		    &mmu_p->mmu_dvma_clid);
852 	}
853 	DBG(DBG_DMA_MAP, dip, "vmem_xalloc - DDI_DMA_NORESOURCES\n");
854 	return (DDI_DMA_NORESOURCES);
855 }
856 
857 void
858 px_dvma_unmap(px_mmu_t *mmu_p, ddi_dma_impl_t *mp)
859 {
860 	px_dvma_addr_t dvma_addr = (px_dvma_addr_t)mp->dmai_mapping;
861 	px_dvma_addr_t dvma_pg = MMU_BTOP(dvma_addr);
862 	dvma_addr = MMU_PTOB(dvma_pg);
863 
864 	if (mp->dmai_flags & PX_DMAI_FLAGS_FASTTRACK) {
865 		px_iopfn_t index = dvma_pg - mmu_p->dvma_base_pg;
866 		ASSERT(index % px_dvma_page_cache_clustsz == 0);
867 		index /= px_dvma_page_cache_clustsz;
868 		ASSERT(index < px_dvma_page_cache_entries);
869 		mmu_p->mmu_dvma_cache_locks[index] = 0;
870 #ifdef	PX_DMA_PROF
871 		px_dvmaft_free++;
872 #endif	/* PX_DMA_PROF */
873 		return;
874 	}
875 
876 	if (mp->dmai_flags & PX_DMAI_FLAGS_VMEMCACHE) {
877 		vmem_free(mmu_p->mmu_dvma_map, (void *)dvma_addr,
878 		    MMU_PAGE_SIZE);
879 #ifdef PX_DMA_PROF
880 		px_dvma_vmem_free++;
881 #endif /* PX_DMA_PROF */
882 	} else {
883 		size_t npages = MMU_BTOP(mp->dmai_winsize) + PX_HAS_REDZONE(mp);
884 		vmem_xfree(mmu_p->mmu_dvma_map, (void *)dvma_addr,
885 		    MMU_PTOB(npages));
886 #ifdef PX_DMA_PROF
887 		px_dvma_vmem_xfree++;
888 #endif /* PX_DMA_PROF */
889 	}
890 }
891 
892 /*
893  * DVMA mappings may have multiple windows, but each window always have
894  * one segment.
895  */
896 int
897 px_dvma_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_impl_t *mp,
898 	enum ddi_dma_ctlops cmd, off_t *offp, size_t *lenp, caddr_t *objp,
899 	uint_t cache_flags)
900 {
901 	switch (cmd) {
902 	default:
903 		DBG(DBG_DMA_CTL, dip, "unknown command (%x): rdip=%s%d\n",
904 		    cmd, ddi_driver_name(rdip), ddi_get_instance(rdip));
905 		break;
906 	}
907 	return (DDI_FAILURE);
908 }
909 
910 void
911 px_dma_freewin(ddi_dma_impl_t *mp)
912 {
913 	px_dma_win_t *win_p = mp->dmai_winlst, *win2_p;
914 	for (win2_p = win_p; win_p; win2_p = win_p) {
915 		win_p = win2_p->win_next;
916 		kmem_free(win2_p, sizeof (px_dma_win_t) +
917 		    sizeof (ddi_dma_cookie_t) * win2_p->win_ncookies);
918 	}
919 	mp->dmai_nwin = 0;
920 	mp->dmai_winlst = NULL;
921 }
922 
923 /*
924  * px_dma_newwin - create a dma window object and cookies
925  *
926  *	After the initial scan in px_dma_physwin(), which identifies
927  *	a portion of the pfn array that belongs to a dma window,
928  *	we are called to allocate and initialize representing memory
929  *	resources. We know from the 1st scan the number of cookies
930  *	or dma segment in this window so we can allocate a contiguous
931  *	memory array for the dma cookies (The implementation of
932  *	ddi_dma_nextcookie(9f) dictates dma cookies be contiguous).
933  *
934  *	A second round scan is done on the pfn array to identify
935  *	each dma segment and initialize its corresponding dma cookie.
936  *	We don't need to do all the safety checking and we know they
937  *	all belong to the same dma window.
938  *
939  *	Input:	cookie_no - # of cookies identified by the 1st scan
940  *		start_idx - subscript of the pfn array for the starting pfn
941  *		end_idx   - subscript of the last pfn in dma window
942  *		win_pp    - pointer to win_next member of previous window
943  *	Return:	DDI_SUCCESS - with **win_pp as newly created window object
944  *		DDI_DMA_NORESROUCE - caller frees all previous window objs
945  *	Note:	Each cookie and window size are all initialized on page
946  *		boundary. This is not true for the 1st cookie of the 1st
947  *		window and the last cookie of the last window.
948  *		We fix that later in upper layer which has access to size
949  *		and offset info.
950  *
951  */
952 /*ARGSUSED*/
953 static int
954 px_dma_newwin(dev_info_t *dip, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp,
955 	uint32_t cookie_no, uint32_t start_idx, uint32_t end_idx,
956 	px_dma_win_t **win_pp, uint64_t count_max, uint64_t bypass)
957 {
958 	int (*waitfp)(caddr_t) = dmareq->dmar_fp;
959 	ddi_dma_cookie_t *cookie_p;
960 	uint32_t pfn_no = 1;
961 	px_iopfn_t pfn = PX_GET_MP_PFN(mp, start_idx);
962 	px_iopfn_t prev_pfn = pfn;
963 	uint64_t baddr, seg_pfn0 = pfn;
964 	size_t sz = cookie_no * sizeof (ddi_dma_cookie_t);
965 	px_dma_win_t *win_p = kmem_zalloc(sizeof (px_dma_win_t) + sz,
966 	    waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP);
967 	io_attributes_t	attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
968 	    mp->dmai_attr.dma_attr_flags);
969 
970 	if (!win_p)
971 		goto noresource;
972 
973 	win_p->win_next = NULL;
974 	win_p->win_ncookies = cookie_no;
975 	win_p->win_curseg = 0;	/* start from segment 0 */
976 	win_p->win_size = MMU_PTOB(end_idx - start_idx + 1);
977 	/* win_p->win_offset is left uninitialized */
978 
979 	cookie_p = (ddi_dma_cookie_t *)(win_p + 1);
980 	start_idx++;
981 	for (; start_idx <= end_idx; start_idx++, prev_pfn = pfn, pfn_no++) {
982 		pfn = PX_GET_MP_PFN1(mp, start_idx);
983 		if ((pfn == prev_pfn + 1) &&
984 		    (MMU_PTOB(pfn_no + 1) - 1 <= count_max))
985 			continue;
986 
987 		/* close up the cookie up to (including) prev_pfn */
988 		baddr = MMU_PTOB(seg_pfn0);
989 		if (bypass) {
990 			if (px_lib_iommu_getbypass(dip, baddr, attr, &baddr)
991 			    == DDI_SUCCESS)
992 				baddr = px_lib_ro_bypass(dip, attr, baddr);
993 			else
994 				return (DDI_FAILURE);
995 		}
996 
997 		MAKE_DMA_COOKIE(cookie_p, baddr, MMU_PTOB(pfn_no));
998 		DBG(DBG_BYPASS, mp->dmai_rdip, "cookie %p (%x pages)\n",
999 		    MMU_PTOB(seg_pfn0), pfn_no);
1000 
1001 		cookie_p++;	/* advance to next available cookie cell */
1002 		pfn_no = 0;
1003 		seg_pfn0 = pfn;	/* start a new segment from current pfn */
1004 	}
1005 
1006 	baddr = MMU_PTOB(seg_pfn0);
1007 	if (bypass) {
1008 		if (px_lib_iommu_getbypass(dip, baddr, attr, &baddr)
1009 		    == DDI_SUCCESS)
1010 			baddr = px_lib_ro_bypass(dip, attr, baddr);
1011 		else
1012 			return (DDI_FAILURE);
1013 	}
1014 
1015 	MAKE_DMA_COOKIE(cookie_p, baddr, MMU_PTOB(pfn_no));
1016 	DBG(DBG_BYPASS, mp->dmai_rdip, "cookie %p (%x pages) of total %x\n",
1017 	    MMU_PTOB(seg_pfn0), pfn_no, cookie_no);
1018 #ifdef	DEBUG
1019 	cookie_p++;
1020 	ASSERT((cookie_p - (ddi_dma_cookie_t *)(win_p + 1)) == cookie_no);
1021 #endif	/* DEBUG */
1022 	*win_pp = win_p;
1023 	return (DDI_SUCCESS);
1024 noresource:
1025 	if (waitfp != DDI_DMA_DONTWAIT)
1026 		ddi_set_callback(waitfp, dmareq->dmar_arg, &px_kmem_clid);
1027 	return (DDI_DMA_NORESOURCES);
1028 }
1029 
1030 /*
1031  * px_dma_adjust - adjust 1st and last cookie and window sizes
1032  *	remove initial dma page offset from 1st cookie and window size
1033  *	remove last dma page remainder from last cookie and window size
1034  *	fill win_offset of each dma window according to just fixed up
1035  *		each window sizes
1036  *	px_dma_win_t members modified:
1037  *	win_p->win_offset - this window's offset within entire DMA object
1038  *	win_p->win_size	  - xferrable size (in bytes) for this window
1039  *
1040  *	ddi_dma_impl_t members modified:
1041  *	mp->dmai_size	  - 1st window xferrable size
1042  *	mp->dmai_offset   - 0, which is the dma offset of the 1st window
1043  *
1044  *	ddi_dma_cookie_t members modified:
1045  *	cookie_p->dmac_size - 1st and last cookie remove offset or remainder
1046  *	cookie_p->dmac_laddress - 1st cookie add page offset
1047  */
1048 static void
1049 px_dma_adjust(ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp, px_dma_win_t *win_p)
1050 {
1051 	ddi_dma_cookie_t *cookie_p = (ddi_dma_cookie_t *)(win_p + 1);
1052 	size_t pg_offset = mp->dmai_roffset;
1053 	size_t win_offset = 0;
1054 
1055 	cookie_p->dmac_size -= pg_offset;
1056 	cookie_p->dmac_laddress |= pg_offset;
1057 	win_p->win_size -= pg_offset;
1058 	DBG(DBG_BYPASS, mp->dmai_rdip, "pg0 adjust %lx\n", pg_offset);
1059 
1060 	mp->dmai_size = win_p->win_size;
1061 	mp->dmai_offset = 0;
1062 
1063 	pg_offset += mp->dmai_object.dmao_size;
1064 	pg_offset &= MMU_PAGE_OFFSET;
1065 	if (pg_offset)
1066 		pg_offset = MMU_PAGE_SIZE - pg_offset;
1067 	DBG(DBG_BYPASS, mp->dmai_rdip, "last pg adjust %lx\n", pg_offset);
1068 
1069 	for (; win_p->win_next; win_p = win_p->win_next) {
1070 		DBG(DBG_BYPASS, mp->dmai_rdip, "win off %p\n", win_offset);
1071 		win_p->win_offset = win_offset;
1072 		win_offset += win_p->win_size;
1073 	}
1074 	/* last window */
1075 	win_p->win_offset = win_offset;
1076 	cookie_p = (ddi_dma_cookie_t *)(win_p + 1);
1077 	cookie_p[win_p->win_ncookies - 1].dmac_size -= pg_offset;
1078 	win_p->win_size -= pg_offset;
1079 	ASSERT((win_offset + win_p->win_size) == mp->dmai_object.dmao_size);
1080 }
1081 
1082 /*
1083  * px_dma_physwin() - carve up dma windows using physical addresses.
1084  *	Called to handle mmu bypass and pci peer-to-peer transfers.
1085  *	Calls px_dma_newwin() to allocate window objects.
1086  *
1087  * Dependency: mp->dmai_pfnlst points to an array of pfns
1088  *
1089  * 1. Each dma window is represented by a px_dma_win_t object.
1090  *	The object will be casted to ddi_dma_win_t and returned
1091  *	to leaf driver through the DDI interface.
1092  * 2. Each dma window can have several dma segments with each
1093  *	segment representing a physically contiguous either memory
1094  *	space (if we are doing an mmu bypass transfer) or pci address
1095  *	space (if we are doing a peer-to-peer transfer).
1096  * 3. Each segment has a DMA cookie to program the DMA engine.
1097  *	The cookies within each DMA window must be located in a
1098  *	contiguous array per ddi_dma_nextcookie(9f).
1099  * 4. The number of DMA segments within each DMA window cannot exceed
1100  *	mp->dmai_attr.dma_attr_sgllen. If the transfer size is
1101  *	too large to fit in the sgllen, the rest needs to be
1102  *	relocated to the next dma window.
1103  * 5. Peer-to-peer DMA segment follows device hi, lo, count_max,
1104  *	and nocross restrictions while bypass DMA follows the set of
1105  *	restrictions with system limits factored in.
1106  *
1107  * Return:
1108  *	mp->dmai_winlst	 - points to a link list of px_dma_win_t objects.
1109  *		Each px_dma_win_t object on the link list contains
1110  *		infomation such as its window size (# of pages),
1111  *		starting offset (also see Restriction), an array of
1112  *		DMA cookies, and # of cookies in the array.
1113  *	mp->dmai_pfnlst	 - NULL, the pfn list is freed to conserve memory.
1114  *	mp->dmai_nwin	 - # of total DMA windows on mp->dmai_winlst.
1115  *	mp->dmai_mapping - starting cookie address
1116  *	mp->dmai_rflags	 - consistent, nosync, no redzone
1117  *	mp->dmai_cookie	 - start of cookie table of the 1st DMA window
1118  *
1119  * Restriction:
1120  *	Each px_dma_win_t object can theoratically start from any offset
1121  *	since the mmu is not involved. However, this implementation
1122  *	always make windows start from page aligned offset (except
1123  *	the 1st window, which follows the requested offset) due to the
1124  *	fact that we are handed a pfn list. This does require device's
1125  *	count_max and attr_seg to be at least MMU_PAGE_SIZE aligned.
1126  */
1127 int
1128 px_dma_physwin(px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_impl_t *mp)
1129 {
1130 	uint_t npages = mp->dmai_ndvmapages;
1131 	int ret, sgllen = mp->dmai_attr.dma_attr_sgllen;
1132 	px_iopfn_t pfn_lo, pfn_hi, prev_pfn;
1133 	px_iopfn_t pfn = PX_GET_MP_PFN(mp, 0);
1134 	uint32_t i, win_no = 0, pfn_no = 1, win_pfn0_index = 0, cookie_no = 0;
1135 	uint64_t count_max, bypass_addr = 0;
1136 	px_dma_win_t **win_pp = (px_dma_win_t **)&mp->dmai_winlst;
1137 	ddi_dma_cookie_t *cookie0_p;
1138 	io_attributes_t attr = PX_GET_TTE_ATTR(mp->dmai_rflags,
1139 	    mp->dmai_attr.dma_attr_flags);
1140 	dev_info_t *dip = px_p->px_dip;
1141 
1142 	ASSERT(PX_DMA_ISPTP(mp) || PX_DMA_ISBYPASS(mp));
1143 	if (PX_DMA_ISPTP(mp)) { /* ignore sys limits for peer-to-peer */
1144 		ddi_dma_attr_t *dev_attr_p = PX_DEV_ATTR(mp);
1145 		uint64_t nocross = dev_attr_p->dma_attr_seg;
1146 		px_pec_t *pec_p = px_p->px_pec_p;
1147 		px_iopfn_t pfn_last = PX_DMA_ISPTP32(mp) ?
1148 		    pec_p->pec_last32_pfn - pec_p->pec_base32_pfn :
1149 		    pec_p->pec_last64_pfn - pec_p->pec_base64_pfn;
1150 
1151 		if (nocross && (nocross < UINT32_MAX))
1152 			return (DDI_DMA_NOMAPPING);
1153 		if (dev_attr_p->dma_attr_align > MMU_PAGE_SIZE)
1154 			return (DDI_DMA_NOMAPPING);
1155 		pfn_lo = MMU_BTOP(dev_attr_p->dma_attr_addr_lo);
1156 		pfn_hi = MMU_BTOP(dev_attr_p->dma_attr_addr_hi);
1157 		pfn_hi = MIN(pfn_hi, pfn_last);
1158 		if ((pfn_lo > pfn_hi) || (pfn < pfn_lo))
1159 			return (DDI_DMA_NOMAPPING);
1160 
1161 		count_max = dev_attr_p->dma_attr_count_max;
1162 		count_max = MIN(count_max, nocross);
1163 		/*
1164 		 * the following count_max trim is not done because we are
1165 		 * making sure pfn_lo <= pfn <= pfn_hi inside the loop
1166 		 * count_max=MIN(count_max, MMU_PTOB(pfn_hi - pfn_lo + 1)-1);
1167 		 */
1168 	} else { /* bypass hi/lo/count_max have been processed by attr2hdl() */
1169 		count_max = mp->dmai_attr.dma_attr_count_max;
1170 		pfn_lo = MMU_BTOP(mp->dmai_attr.dma_attr_addr_lo);
1171 		pfn_hi = MMU_BTOP(mp->dmai_attr.dma_attr_addr_hi);
1172 
1173 		if (px_lib_iommu_getbypass(dip, MMU_PTOB(pfn),
1174 		    attr, &bypass_addr) != DDI_SUCCESS) {
1175 			DBG(DBG_BYPASS, mp->dmai_rdip,
1176 			    "bypass cookie failure %lx\n", pfn);
1177 			return (DDI_DMA_NOMAPPING);
1178 		}
1179 		pfn = MMU_BTOP(bypass_addr);
1180 	}
1181 
1182 	/* pfn: absolute (bypass mode) or relative (p2p mode) */
1183 	for (prev_pfn = pfn, i = 1; i < npages;
1184 	    i++, prev_pfn = pfn, pfn_no++) {
1185 		pfn = PX_GET_MP_PFN1(mp, i);
1186 		if (bypass_addr) {
1187 			if (px_lib_iommu_getbypass(dip, MMU_PTOB(pfn), attr,
1188 			    &bypass_addr) != DDI_SUCCESS) {
1189 				ret = DDI_DMA_NOMAPPING;
1190 				goto err;
1191 			}
1192 			pfn = MMU_BTOP(bypass_addr);
1193 		}
1194 		if ((pfn == prev_pfn + 1) &&
1195 		    (MMU_PTOB(pfn_no + 1) - 1 <= count_max))
1196 			continue;
1197 		if ((pfn < pfn_lo) || (prev_pfn > pfn_hi)) {
1198 			ret = DDI_DMA_NOMAPPING;
1199 			goto err;
1200 		}
1201 		cookie_no++;
1202 		pfn_no = 0;
1203 		if (cookie_no < sgllen)
1204 			continue;
1205 
1206 		DBG(DBG_BYPASS, mp->dmai_rdip, "newwin pfn[%x-%x] %x cks\n",
1207 		    win_pfn0_index, i - 1, cookie_no);
1208 		if (ret = px_dma_newwin(dip, dmareq, mp, cookie_no,
1209 		    win_pfn0_index, i - 1, win_pp, count_max, bypass_addr))
1210 			goto err;
1211 
1212 		win_pp = &(*win_pp)->win_next;	/* win_pp = *(win_pp) */
1213 		win_no++;
1214 		win_pfn0_index = i;
1215 		cookie_no = 0;
1216 	}
1217 	if (pfn > pfn_hi) {
1218 		ret = DDI_DMA_NOMAPPING;
1219 		goto err;
1220 	}
1221 	cookie_no++;
1222 	DBG(DBG_BYPASS, mp->dmai_rdip, "newwin pfn[%x-%x] %x cks\n",
1223 	    win_pfn0_index, i - 1, cookie_no);
1224 	if (ret = px_dma_newwin(dip, dmareq, mp, cookie_no, win_pfn0_index,
1225 	    i - 1, win_pp, count_max, bypass_addr))
1226 		goto err;
1227 	win_no++;
1228 	px_dma_adjust(dmareq, mp, mp->dmai_winlst);
1229 	mp->dmai_nwin = win_no;
1230 	mp->dmai_rflags |= DDI_DMA_CONSISTENT | DMP_NOSYNC;
1231 	mp->dmai_rflags &= ~DDI_DMA_REDZONE;
1232 	mp->dmai_flags |= PX_DMAI_FLAGS_NOSYNC;
1233 	cookie0_p = (ddi_dma_cookie_t *)(PX_WINLST(mp) + 1);
1234 	mp->dmai_cookie = cookie0_p + 1;
1235 	mp->dmai_curcookie = 1;
1236 	mp->dmai_ncookies = PX_WINLST(mp)->win_ncookies;
1237 	mp->dmai_mapping = cookie0_p->dmac_laddress;
1238 
1239 	px_dma_freepfn(mp);
1240 	return (DDI_DMA_MAPPED);
1241 err:
1242 	px_dma_freewin(mp);
1243 	return (ret);
1244 }
1245 
1246 int
1247 px_dma_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_impl_t *mp,
1248 	enum ddi_dma_ctlops cmd, off_t *offp, size_t *lenp, caddr_t *objp,
1249 	uint_t cache_flags)
1250 {
1251 	switch (cmd) {
1252 	default:
1253 		DBG(DBG_DMA_CTL, dip, "unknown command (%x): rdip=%s%d\n",
1254 		    cmd, ddi_driver_name(rdip), ddi_get_instance(rdip));
1255 		break;
1256 	}
1257 	return (DDI_FAILURE);
1258 }
1259 
1260 static void
1261 px_dvma_debug_init(px_mmu_t *mmu_p)
1262 {
1263 	size_t sz = sizeof (struct px_dvma_rec) * px_dvma_debug_rec;
1264 	ASSERT(MUTEX_HELD(&mmu_p->dvma_debug_lock));
1265 	cmn_err(CE_NOTE, "PCI Express DVMA %p stat ON", mmu_p);
1266 
1267 	mmu_p->dvma_alloc_rec = kmem_alloc(sz, KM_SLEEP);
1268 	mmu_p->dvma_free_rec = kmem_alloc(sz, KM_SLEEP);
1269 
1270 	mmu_p->dvma_active_list = NULL;
1271 	mmu_p->dvma_alloc_rec_index = 0;
1272 	mmu_p->dvma_free_rec_index = 0;
1273 	mmu_p->dvma_active_count = 0;
1274 }
1275 
1276 void
1277 px_dvma_debug_fini(px_mmu_t *mmu_p)
1278 {
1279 	struct px_dvma_rec *prev, *ptr;
1280 	size_t sz = sizeof (struct px_dvma_rec) * px_dvma_debug_rec;
1281 	uint64_t mask = ~(1ull << mmu_p->mmu_inst);
1282 	cmn_err(CE_NOTE, "PCI Express DVMA %p stat OFF", mmu_p);
1283 
1284 	if (mmu_p->dvma_alloc_rec) {
1285 		kmem_free(mmu_p->dvma_alloc_rec, sz);
1286 		mmu_p->dvma_alloc_rec = NULL;
1287 	}
1288 	if (mmu_p->dvma_free_rec) {
1289 		kmem_free(mmu_p->dvma_free_rec, sz);
1290 		mmu_p->dvma_free_rec = NULL;
1291 	}
1292 
1293 	prev = mmu_p->dvma_active_list;
1294 	if (!prev)
1295 		return;
1296 	for (ptr = prev->next; ptr; prev = ptr, ptr = ptr->next)
1297 		kmem_free(prev, sizeof (struct px_dvma_rec));
1298 	kmem_free(prev, sizeof (struct px_dvma_rec));
1299 
1300 	mmu_p->dvma_active_list = NULL;
1301 	mmu_p->dvma_alloc_rec_index = 0;
1302 	mmu_p->dvma_free_rec_index = 0;
1303 	mmu_p->dvma_active_count = 0;
1304 
1305 	px_dvma_debug_off &= mask;
1306 	px_dvma_debug_on &= mask;
1307 }
1308 
1309 void
1310 px_dvma_alloc_debug(px_mmu_t *mmu_p, char *address, uint_t len,
1311 	ddi_dma_impl_t *mp)
1312 {
1313 	struct px_dvma_rec *ptr;
1314 	mutex_enter(&mmu_p->dvma_debug_lock);
1315 
1316 	if (!mmu_p->dvma_alloc_rec)
1317 		px_dvma_debug_init(mmu_p);
1318 	if (PX_DVMA_DBG_OFF(mmu_p)) {
1319 		px_dvma_debug_fini(mmu_p);
1320 		goto done;
1321 	}
1322 
1323 	ptr = &mmu_p->dvma_alloc_rec[mmu_p->dvma_alloc_rec_index];
1324 	ptr->dvma_addr = address;
1325 	ptr->len = len;
1326 	ptr->mp = mp;
1327 	if (++mmu_p->dvma_alloc_rec_index == px_dvma_debug_rec)
1328 		mmu_p->dvma_alloc_rec_index = 0;
1329 
1330 	ptr = kmem_alloc(sizeof (struct px_dvma_rec), KM_SLEEP);
1331 	ptr->dvma_addr = address;
1332 	ptr->len = len;
1333 	ptr->mp = mp;
1334 
1335 	ptr->next = mmu_p->dvma_active_list;
1336 	mmu_p->dvma_active_list = ptr;
1337 	mmu_p->dvma_active_count++;
1338 done:
1339 	mutex_exit(&mmu_p->dvma_debug_lock);
1340 }
1341 
1342 void
1343 px_dvma_free_debug(px_mmu_t *mmu_p, char *address, uint_t len,
1344     ddi_dma_impl_t *mp)
1345 {
1346 	struct px_dvma_rec *ptr, *ptr_save;
1347 	mutex_enter(&mmu_p->dvma_debug_lock);
1348 
1349 	if (!mmu_p->dvma_alloc_rec)
1350 		px_dvma_debug_init(mmu_p);
1351 	if (PX_DVMA_DBG_OFF(mmu_p)) {
1352 		px_dvma_debug_fini(mmu_p);
1353 		goto done;
1354 	}
1355 
1356 	ptr = &mmu_p->dvma_free_rec[mmu_p->dvma_free_rec_index];
1357 	ptr->dvma_addr = address;
1358 	ptr->len = len;
1359 	ptr->mp = mp;
1360 	if (++mmu_p->dvma_free_rec_index == px_dvma_debug_rec)
1361 		mmu_p->dvma_free_rec_index = 0;
1362 
1363 	ptr_save = mmu_p->dvma_active_list;
1364 	for (ptr = ptr_save; ptr; ptr = ptr->next) {
1365 		if ((ptr->dvma_addr == address) && (ptr->len = len))
1366 			break;
1367 		ptr_save = ptr;
1368 	}
1369 	if (!ptr) {
1370 		cmn_err(CE_WARN, "bad dvma free addr=%lx len=%x",
1371 		    (long)address, len);
1372 		goto done;
1373 	}
1374 	if (ptr == mmu_p->dvma_active_list)
1375 		mmu_p->dvma_active_list = ptr->next;
1376 	else
1377 		ptr_save->next = ptr->next;
1378 	kmem_free(ptr, sizeof (struct px_dvma_rec));
1379 	mmu_p->dvma_active_count--;
1380 done:
1381 	mutex_exit(&mmu_p->dvma_debug_lock);
1382 }
1383 
1384 #ifdef	DEBUG
1385 void
1386 px_dump_dma_handle(uint64_t flag, dev_info_t *dip, ddi_dma_impl_t *hp)
1387 {
1388 	DBG(flag, dip, "mp(%p): flags=%x mapping=%lx xfer_size=%x\n",
1389 	    hp, hp->dmai_inuse, hp->dmai_mapping, hp->dmai_size);
1390 	DBG(flag|DBG_CONT, dip, "\tnpages=%x roffset=%x rflags=%x nwin=%x\n",
1391 	    hp->dmai_ndvmapages, hp->dmai_roffset, hp->dmai_rflags,
1392 	    hp->dmai_nwin);
1393 	DBG(flag|DBG_CONT, dip, "\twinsize=%x tte=%p pfnlst=%p pfn0=%p\n",
1394 	    hp->dmai_winsize, hp->dmai_tte, hp->dmai_pfnlst, hp->dmai_pfn0);
1395 	DBG(flag|DBG_CONT, dip, "\twinlst=%x obj=%p attr=%p ckp=%p\n",
1396 	    hp->dmai_winlst, &hp->dmai_object, &hp->dmai_attr,
1397 	    hp->dmai_cookie);
1398 }
1399 #endif	/* DEBUG */
1400