xref: /illumos-gate/usr/src/uts/i86pc/os/pmem.c (revision 4b9db4f6425b1a08fca4390f446072c4a6aae8d5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * PMEM - Direct mapping physical memory pages to userland process
28  *
29  * Provide functions used for directly (w/o occupying kernel virtual address
30  * space) allocating and exporting physical memory pages to userland.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/mutex.h>
35 #include <sys/sunddi.h>
36 #include <sys/ddidevmap.h>
37 #include <sys/vnode.h>
38 #include <sys/sysmacros.h>
39 #include <vm/seg_dev.h>
40 #include <sys/pmem.h>
41 #include <vm/hat_i86.h>
42 #include <sys/task.h>
43 #include <sys/sdt.h>
44 
45 /*
46  * The routines in this file allocate memory which will be accessed through
47  * the AGP GART hardware.  The GART is programmed with the PFNs for this
48  * memory, and the only mechanism for removing these entries is by an
49  * explicit process operation (ioctl/close of the driver, or process exit).
50  * As such, the pages need to remain locked to ensure that they won't be
51  * relocated or paged out.
52  *
53  * To prevent these locked pages from getting in the way of page
54  * coalescing, we try to allocate large pages from the system, and carve
55  * them up to satisfy pmem allocation requests.  This will keep the locked
56  * pages within a constrained area of physical memory, limiting the number
57  * of large pages that would be pinned by our locked pages.  This is, of
58  * course, another take on the infamous kernel cage, and it has many of the
59  * downsides of the original cage.  It also interferes with system-wide
60  * resource management decisions, as it maintains its own pool of unused
61  * pages which can't be easily reclaimed and used during low-memory
62  * situations.
63  *
64  * The right solution is for pmem to register a callback that the VM system
65  * could call, which would temporarily remove any GART entries for pages
66  * that were being relocated.  This would let us leave the pages unlocked,
67  * which would remove the need for using large pages, which would simplify
68  * this code a great deal.  Unfortunately, the support for these callbacks
69  * only exists on some SPARC platforms right now.
70  *
71  * Note that this is the *only* reason that large pages are used here.  The
72  * GART can't perform large-page translations, and the code appropriately
73  * falls back to using small pages if page_create_va_large() fails.
74  */
75 
/*
 * Take / drop dhp->dh_lock, but only for handles that carry
 * DEVMAP_ALLOW_REMAP in dh_flags.  Both macros expand to a single statement
 * (do { } while (0)) so that they can be used safely as the body of an
 * if/else, and the argument is parenthesized so that any pointer expression
 * may be passed.
 */
#define	HOLD_DHP_LOCK(dhp) \
	do { \
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
			mutex_enter(&(dhp)->dh_lock); \
	} while (0)

#define	RELE_DHP_LOCK(dhp) \
	do { \
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
			mutex_exit(&(dhp)->dh_lock); \
	} while (0)

/* Non-zero if the small page pp has a non-zero page size code. */
#define	FROM_LPG(pp) ((pp)->p_szc != 0)
/* Index of small page pp inside its large page (pmem_pgcnt is a power of 2). */
#define	PFIND(pp) (page_pptonum(pp) & (pmem_pgcnt - 1))
84 
85 /*
86  * Structs and static variables used for pmem only.
87  */
88 typedef struct pmem_lpg {
89 	page_t	*pl_pp;		/* start pp */
90 	ulong_t	*pl_bitmap;	/* allocation status for each page */
91 	ushort_t pl_pfree;	/* this large page might be fully freed */
92 	struct pmem_lpg *pl_next;
93 	struct pmem_lpg *pl_prev;
94 } pmem_lpg_t;
95 
96 static size_t	pmem_lpgsize;	/* the size of one large page */
97 static pgcnt_t	pmem_pgcnt;	/* the number of small pages in a large page */
98 static uint_t	pmem_lszc;	/* page size code of the large page */
99 /* The segment to be associated with all the allocated pages. */
100 static struct seg	pmem_seg;
101 /* Fully occupied large pages allocated for pmem. */
102 static pmem_lpg_t *pmem_occ_lpgs;
103 /* Memory pool to store residual small pages from large pages. */
104 static page_t	*pmem_mpool = NULL;
105 /* Number of small pages reside in pmem_mpool currently. */
106 static pgcnt_t	pmem_nmpages = 0;
107 /* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
108 kmutex_t	pmem_mutex;
109 
110 static int lpg_isfree(pmem_lpg_t *);
111 static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
112 static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
113 static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
114 static pmem_lpg_t *pmem_lpg_alloc(uint_t);
115 static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
116 static void lpg_free(page_t *spp);
117 static pgcnt_t mpool_break(page_t **, pgcnt_t);
118 static void mpool_append(page_t **, pgcnt_t);
119 static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
120 static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
121 static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
122     vnode_t *, u_offset_t *, uint_t);
123 static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
124 static void tlist_out(page_t *, pgcnt_t);
125 static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
126 static int pmem_lock(pgcnt_t, proc_t *p);
127 
128 /*
129  * Called by driver devmap routine to pass physical memory mapping info to
130  * seg_dev framework, used only for physical memory allocated from
131  * devmap_pmem_alloc().
132  */
133 /* ARGSUSED */
134 int
135 devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
136     struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
137     offset_t off, size_t len, uint_t maxprot, uint_t flags,
138     const ddi_device_acc_attr_t *accattrp)
139 {
140 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
141 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
142 	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
143 
144 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
145 		return (DDI_FAILURE);
146 
147 	/*
148 	 * First to check if this function has been called for this dhp.
149 	 */
150 	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
151 		return (DDI_FAILURE);
152 
153 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
154 		return (DDI_FAILURE);
155 
156 	/*
157 	 * Check if the cache attributes are supported. Need to pay
158 	 * attention that only uncachable or write-combining is
159 	 * permitted for pmem.
160 	 */
161 	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
162 	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
163 		return (DDI_FAILURE);
164 
165 	if (flags & DEVMAP_MAPPING_INVALID) {
166 		/*
167 		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
168 		 * remap permission.
169 		 */
170 		if (!(flags & DEVMAP_ALLOW_REMAP))
171 			return (DDI_FAILURE);
172 	} else {
173 		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
174 		/* dh_roff is the offset inside the dh_pcookie. */
175 		dhp->dh_roff = ptob(btop(off));
176 		/* Set the cache attributes correctly */
177 		i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
178 	}
179 
180 	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
181 	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
182 	dhp->dh_len = ptob(btopr(len));
183 
184 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
185 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
186 
187 	if (callbackops != NULL) {
188 		bcopy(callbackops, &dhp->dh_callbackops,
189 		    sizeof (struct devmap_callback_ctl));
190 	}
191 
192 	/*
193 	 * Initialize dh_lock if we want to do remap.
194 	 */
195 	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
196 		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
197 		dhp->dh_flags |= DEVMAP_LOCK_INITED;
198 	}
199 
200 	dhp->dh_flags |= DEVMAP_SETUP_DONE;
201 
202 	return (DDI_SUCCESS);
203 }
204 
205 /*
206  * Replace existing mapping using a new cookie, mainly gets called when doing
207  * fork(). Should be called in associated devmap_dup(9E).
208  */
209 /* ARGSUSED */
210 int
211 devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
212     devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
213     uint_t flags, const ddi_device_acc_attr_t *accattrp)
214 {
215 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
216 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
217 	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
218 
219 	/*
220 	 * Reture failure if setup has not been done or no remap permission
221 	 * has been granted during the setup.
222 	 */
223 	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
224 	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
225 		return (DDI_FAILURE);
226 
227 	/* No flags supported for remap yet. */
228 	if (flags != 0)
229 		return (DDI_FAILURE);
230 
231 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
232 		return (DDI_FAILURE);
233 
234 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
235 		return (DDI_FAILURE);
236 
237 	/*
238 	 * Check if the cache attributes are supported. Need to pay
239 	 * attention that only uncachable or write-combining is
240 	 * permitted for pmem.
241 	 */
242 	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
243 	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
244 		return (DDI_FAILURE);
245 
246 	HOLD_DHP_LOCK(dhp);
247 	/*
248 	 * Unload the old mapping of pages reloated with this dhp, so next
249 	 * fault will setup the new mappings. It is in segdev_faultpage that
250 	 * calls hat_devload to establish the mapping. Do this while holding
251 	 * the dhp lock so other faults dont reestablish the mappings.
252 	 */
253 	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
254 	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
255 
256 	/* Set the cache attributes correctly */
257 	i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
258 
259 	dhp->dh_pcookie = cookie;
260 	dhp->dh_roff = ptob(btop(off));
261 	dhp->dh_len = ptob(btopr(len));
262 
263 	/* Clear the large page size flag. */
264 	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
265 
266 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
267 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
268 	RELE_DHP_LOCK(dhp);
269 	return (DDI_SUCCESS);
270 }
271 
272 /*
273  * Directly (i.e., without occupying kernel virtual address space) allocate
274  * 'npages' physical memory pages for exporting to user land. The allocated
275  * page_t pointer will be recorded in cookie.
276  */
277 int
278 devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
279 {
280 	u_offset_t	pmem_off = 0;
281 	page_t		*pp = NULL;
282 	page_t		*lpp = NULL;
283 	page_t		*tlist = NULL;
284 	pgcnt_t		i = 0;
285 	pgcnt_t		rpages = 0;
286 	pgcnt_t		lpages = 0;
287 	pgcnt_t		tpages = 0;
288 	pgcnt_t		npages = btopr(size);
289 	pmem_lpg_t	*plp = NULL;
290 	struct devmap_pmem_cookie	*pcp;
291 	uint_t		reserved = 0;
292 	uint_t		locked = 0;
293 	uint_t		pflags, kflags;
294 
295 	*cookiep = NULL;
296 
297 	/*
298 	 * Number larger than this will cause page_create_va() to loop
299 	 * infinitely.
300 	 */
301 	if (npages == 0 || npages >= total_pages / 2)
302 		return (DDI_FAILURE);
303 	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
304 		return (DDI_FAILURE);
305 	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
306 	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;
307 
308 	/* Allocate pmem cookie. */
309 	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
310 		return (DDI_FAILURE);
311 	pcp->dp_npages = npages;
312 
313 	/*
314 	 * See if the requested memory can be locked.
315 	 */
316 	pcp->dp_proc = curproc;
317 	if (pmem_lock(npages, curproc) == DDI_FAILURE)
318 		goto alloc_fail;
319 	locked = 1;
320 	/*
321 	 * First, grab as many as possible from pmem_mpool. If pages in
322 	 * pmem_mpool are enough for this request, we are done.
323 	 */
324 	mutex_enter(&pmem_mutex);
325 	tpages = mpool_break(&tlist, npages);
326 	/* IOlock and hashin them into the new offset. */
327 	if (tpages)
328 		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
329 	mutex_exit(&pmem_mutex);
330 
331 	if (tpages == npages)
332 		goto done;
333 
334 	rpages = npages - tpages;
335 	/* Quit now if memory cannot be reserved. */
336 	if (!page_resv(rpages, kflags))
337 		goto alloc_fail;
338 	reserved = 1;
339 
340 	/* If we have large pages */
341 	if (pmem_lpgsize > PAGESIZE) {
342 		/* Try to alloc large pages first to decrease fragmentation. */
343 		i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
344 		if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
345 		    kflags) == DDI_FAILURE)
346 			goto alloc_fail;
347 		ASSERT(lpages == 0 ? lpp == NULL : 1);
348 	}
349 
350 	/*
351 	 * Pages in large pages is more than the request, put the residual
352 	 * pages into pmem_mpool.
353 	 */
354 	if (lpages >= rpages) {
355 		lpp_break(&lpp, lpages, lpages - rpages, plp);
356 		goto done;
357 	}
358 
359 	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
360 	i =  rpages - lpages;
361 	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
362 	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
363 		goto alloc_fail;
364 
365 done:
366 	page_list_concat(&tlist, &lpp);
367 	page_list_concat(&tlist, &pp);
368 	/* Set those small pages from large pages as allocated. */
369 	mutex_enter(&pmem_mutex);
370 	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
371 	mutex_exit(&pmem_mutex);
372 
373 	/*
374 	 * Now tlist holds all the pages for this cookie. Record these pages in
375 	 * pmem cookie.
376 	 */
377 	for (pp = tlist, i = 0; i < npages; i++) {
378 		pcp->dp_pparray[i] = pp;
379 		page_io_unlock(pp);
380 		pp = pp->p_next;
381 		page_sub(&tlist, pp->p_prev);
382 	}
383 	ASSERT(tlist == NULL);
384 	*cookiep = (devmap_pmem_cookie_t)pcp;
385 
386 	return (DDI_SUCCESS);
387 
388 alloc_fail:
389 	DTRACE_PROBE(pmem__alloc__fail);
390 	/* Free large pages and the associated allocation records. */
391 	if (lpp)
392 		lpp_free(lpp, lpages / pmem_pgcnt, &plp);
393 	if (reserved == 1)
394 		page_unresv(rpages);
395 	/* Put those pages in tlist back into pmem_mpool. */
396 	if (tpages != 0) {
397 		mutex_enter(&pmem_mutex);
398 		/* IOunlock, hashout and update the allocation records. */
399 		tlist_out(tlist, tpages);
400 		mpool_append(&tlist, tpages);
401 		mutex_exit(&pmem_mutex);
402 	}
403 	if (locked == 1)
404 		i_ddi_decr_locked_memory(pcp->dp_proc, ptob(pcp->dp_npages));
405 	/* Freeing pmem_cookie. */
406 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
407 	kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
408 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
409 	return (DDI_FAILURE);
410 }
411 
412 /*
413  * Free all small pages inside cookie, and return pages from large pages into
414  * mpool, if all the pages from one large page is in mpool, free it as a whole.
415  */
416 void
417 devmap_pmem_free(devmap_pmem_cookie_t cookie)
418 {
419 	struct	devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
420 	pgcnt_t		i;
421 	pgcnt_t		tpages = 0;
422 	page_t		*pp;
423 	pmem_lpg_t	*pl1, *plp;
424 	pmem_lpg_t	*pf_lpgs = NULL;
425 	uint_t		npls = 0;
426 	pmem_lpg_t *last_pl = NULL;
427 	pmem_lpg_t *plast_pl = NULL;
428 
429 	ASSERT(pcp);
430 	mutex_enter(&pmem_mutex);
431 	/* Free small pages and return them to memory pool. */
432 	for (i = pcp->dp_npages; i > 0; i--) {
433 		pp = pcp->dp_pparray[i - 1];
434 		page_hashout(pp, NULL);
435 		/*
436 		 * Remove the mapping of this single page, this mapping is
437 		 * created using hat_devload() in segdev_faultpage().
438 		 */
439 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
440 		if (!FROM_LPG(pp)) {
441 			/* Normal small page. */
442 			page_free(pp, 1);
443 			page_unresv(1);
444 		} else {
445 			/* Small page from large pages. */
446 			plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
447 			if (plp && !(plp->pl_pfree)) {
448 				/*
449 				 * Move this record to pf_lpgs list, this large
450 				 * page may be able to be freed as a whole.
451 				 */
452 				pmem_lpg_sub(&pmem_occ_lpgs, plp);
453 				pmem_lpg_concat(&pf_lpgs, &plp);
454 				plp->pl_pfree = 1;
455 				npls++;
456 				last_pl = NULL;
457 			} else {
458 				/* Search in pf_lpgs list. */
459 				plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
460 			}
461 			ASSERT(plp);
462 			/* Mark this page as free. */
463 			BT_SET(plp->pl_bitmap, PFIND(pp));
464 			/* Record this page in pmem_mpool. */
465 			mpool_append(&pp, 1);
466 		}
467 	}
468 
469 	/*
470 	 * Find out the large pages whose pages have been freed, remove them
471 	 * from plp list, free them and the associated pmem_lpg struct.
472 	 */
473 	for (plp = pf_lpgs; npls != 0; npls--) {
474 		pl1 = plp;
475 		plp = plp->pl_next;
476 		if (lpg_isfree(pl1)) {
477 			/*
478 			 * Get one free large page.  Find all pages in this
479 			 * large page and remove them from pmem_mpool.
480 			 */
481 			lpg_free(pl1->pl_pp);
482 			/* Remove associated allocation records. */
483 			pmem_lpg_sub(&pf_lpgs, pl1);
484 			pmem_lpg_free(&pf_lpgs, pl1);
485 			tpages -= pmem_pgcnt;
486 		} else
487 			pl1->pl_pfree = 0;
488 	}
489 	/* Update allocation records accordingly. */
490 	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
491 	mutex_exit(&pmem_mutex);
492 
493 	if (curproc == pcp->dp_proc)
494 		i_ddi_decr_locked_memory(curproc, ptob(pcp->dp_npages));
495 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
496 	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
497 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
498 }
499 
500 /*
501  * To extract page frame number from specified range in a cookie.
502  */
503 int
504 devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
505     pfn_t *pfnarray)
506 {
507 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
508 	pgcnt_t i;
509 
510 	if (pcp == NULL || start + npages > pcp->dp_npages)
511 		return (DDI_FAILURE);
512 
513 	for (i = start; i < start + npages; i++)
514 		pfnarray[i - start] = pfn_to_mfn(pcp->dp_pparray[i]->p_pagenum);
515 
516 	return (DDI_SUCCESS);
517 }
518 
519 void
520 pmem_init()
521 {
522 	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
523 	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
524 	pmem_lpgsize = page_get_pagesize(pmem_lszc);
525 	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
526 	bzero(&pmem_seg, sizeof (struct seg));
527 	pmem_seg.s_as = &kas;
528 }
529 
530 /* Allocate kernel memory for one pmem cookie with n pages. */
531 static int
532 pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
533 {
534 	struct devmap_pmem_cookie *pcp;
535 
536 	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
537 	    kflags)) == NULL)
538 		return (DDI_FAILURE);
539 	pcp = *pcpp;
540 	if ((pcp->dp_vnp =
541 	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
542 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
543 		return (DDI_FAILURE);
544 	}
545 	if ((pcp->dp_pparray =
546 	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
547 		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
548 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
549 		return (DDI_FAILURE);
550 	}
551 	return (DDI_SUCCESS);
552 }
553 
554 /* Try to lock down n pages resource */
555 static int
556 pmem_lock(pgcnt_t n, proc_t *p)
557 {
558 	if (i_ddi_incr_locked_memory(p, ptob(n)) != 0) {
559 		return (DDI_FAILURE);
560 	}
561 	return (DDI_SUCCESS);
562 }
563 
564 /* To check if all the pages in a large page are freed. */
565 static int
566 lpg_isfree(pmem_lpg_t *plp)
567 {
568 	uint_t i;
569 
570 	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
571 		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
572 			return (0);
573 	/* All 1 means all pages are freed. */
574 	return (1);
575 }
576 
577 /*
578  * Using pp to get the associated large page allocation record, searching in
579  * the splp linked list with *last as the heuristic pointer. Return NULL if
580  * not found.
581  */
582 static pmem_lpg_t *
583 pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
584 {
585 	pmem_lpg_t *plp;
586 	pgcnt_t root_pfn;
587 
588 	ASSERT(pp);
589 	if (splp == NULL)
590 		return (NULL);
591 	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);
592 
593 	/* Try last winner first. */
594 	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
595 		goto pl_found;
596 
597 	/* Else search the whole pmem_lpg list. */
598 	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
599 		plp = plp->pl_next;
600 		if (plp == splp) {
601 			plp = NULL;
602 			break;
603 		}
604 		ASSERT(plp->pl_pp);
605 	}
606 
607 	*last = plp;
608 
609 pl_found:
610 	return (*last);
611 }
612 
613 /*
614  *  Remove one pmem_lpg plp from the oplpp list.
615  */
616 static void
617 pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
618 {
619 	if (*oplpp == plp)
620 		*oplpp = plp->pl_next;		/* go to next pmem_lpg */
621 
622 	if (*oplpp == plp)
623 		*oplpp = NULL;			/* pmem_lpg list is gone */
624 	else {
625 		plp->pl_prev->pl_next = plp->pl_next;
626 		plp->pl_next->pl_prev = plp->pl_prev;
627 	}
628 	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
629 }
630 
631 /*
632  * Concatenate page list nplpp onto the end of list plpp.
633  */
634 static void
635 pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
636 {
637 	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;
638 
639 	if (*nplpp == NULL) {
640 		return;
641 	}
642 	if (*plpp == NULL) {
643 		*plpp = *nplpp;
644 		return;
645 	}
646 	s1p = *plpp;
647 	e1p =  s1p->pl_prev;
648 	s2p = *nplpp;
649 	e2p = s2p->pl_prev;
650 	s1p->pl_prev = e2p;
651 	e2p->pl_next = s1p;
652 	e1p->pl_next = s2p;
653 	s2p->pl_prev = e1p;
654 }
655 
656 /*
657  * Allocate and initialize the allocation record of one large page, the init
658  * value is 'allocated'.
659  */
660 static pmem_lpg_t *
661 pmem_lpg_alloc(uint_t kflags)
662 {
663 	pmem_lpg_t *plp;
664 
665 	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
666 	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
667 	if (plp == NULL)
668 		return (NULL);
669 	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
670 	if (plp->pl_bitmap == NULL) {
671 		kmem_free(plp, sizeof (*plp));
672 		return (NULL);
673 	}
674 	plp->pl_next = plp->pl_prev = plp;
675 	return (plp);
676 }
677 
678 /* Free one allocation record pointed by oplp. */
679 static void
680 pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
681 {
682 	if (*headp == plp)
683 		*headp = plp->pl_next;		/* go to next pmem_lpg_t */
684 
685 	if (*headp == plp)
686 		*headp = NULL;			/* this list is gone */
687 	else {
688 		plp->pl_prev->pl_next = plp->pl_next;
689 		plp->pl_next->pl_prev = plp->pl_prev;
690 	}
691 	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
692 	kmem_free(plp, sizeof (*plp));
693 }
694 
695 /* Free one large page headed by spp from pmem_mpool. */
696 static void
697 lpg_free(page_t *spp)
698 {
699 	page_t *pp1 = spp;
700 	uint_t i;
701 
702 	ASSERT(MUTEX_HELD(&pmem_mutex));
703 	for (i = 0; i < pmem_pgcnt; i++) {
704 		/* Break pp1 from pmem_mpool. */
705 		page_sub(&pmem_mpool, pp1);
706 		pp1++;
707 	}
708 	/* Free pages in this large page. */
709 	page_free_pages(spp);
710 	page_unresv(pmem_pgcnt);
711 	pmem_nmpages -= pmem_pgcnt;
712 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
713 }
714 
715 /* Put n pages in *ppp list back into pmem_mpool. */
716 static void
717 mpool_append(page_t **ppp, pgcnt_t n)
718 {
719 	ASSERT(MUTEX_HELD(&pmem_mutex));
720 	/* Put back pages. */
721 	page_list_concat(&pmem_mpool, ppp);
722 	pmem_nmpages += n;
723 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
724 }
725 
726 /*
727  * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into *ppp
728  * list, and return the number of grabbed pages.
729  */
730 static pgcnt_t
731 mpool_break(page_t **ppp, pgcnt_t n)
732 {
733 	pgcnt_t i;
734 
735 	ASSERT(MUTEX_HELD(&pmem_mutex));
736 	/* Grab the pages. */
737 	i = MIN(pmem_nmpages, n);
738 	*ppp = pmem_mpool;
739 	page_list_break(ppp, &pmem_mpool, i);
740 	pmem_nmpages -= i;
741 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
742 	return (i);
743 }
744 
745 /*
746  * Create n large pages, lpages and plpp contains the number of small pages and
747  * allocation records list respectively.
748  */
749 static int
750 lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
751     vnode_t *vnp, u_offset_t *offp, uint_t kflags)
752 {
753 	pgcnt_t i;
754 	pmem_lpg_t *plp;
755 	page_t *pp;
756 
757 	for (i = 0, *lpages = 0; i < n; i++) {
758 		/* Allocte one large page each time. */
759 		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
760 		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
761 		if (pp == NULL)
762 			break;
763 		*offp += pmem_lpgsize;
764 		page_list_concat(lppp, &pp);
765 		*lpages += pmem_pgcnt;
766 		/* Add one allocation record for this large page. */
767 		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
768 			return (DDI_FAILURE);
769 		plp->pl_pp = pp;
770 		pmem_lpg_concat(plpp, &plp);
771 	}
772 	return (DDI_SUCCESS);
773 }
774 
775 /*
776  * Break the last r small pages from the large page list *lppp (with totally n
777  * small pages) and put them into pmem_mpool.
778  */
779 static void
780 lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
781 {
782 	page_t *pp, *pp1;
783 	pgcnt_t i;
784 	pmem_lpg_t *plp;
785 
786 	if (r == 0)
787 		return;
788 	ASSERT(*lppp != NULL && r < pmem_pgcnt);
789 	page_list_break(lppp, &pp, n - r);
790 
791 	/* The residual should reside in the last large page.  */
792 	plp = oplp->pl_prev;
793 	/* IOunlock and hashout the residual pages. */
794 	for (pp1 = pp, i = 0; i < r; i++) {
795 		page_io_unlock(pp1);
796 		page_hashout(pp1, NULL);
797 		/* Mark this page as free. */
798 		BT_SET(plp->pl_bitmap, PFIND(pp1));
799 		pp1 = pp1->p_next;
800 	}
801 	ASSERT(pp1 == pp);
802 	/* Put these residual pages into memory pool. */
803 	mutex_enter(&pmem_mutex);
804 	mpool_append(&pp, r);
805 	mutex_exit(&pmem_mutex);
806 }
807 
808 /* Freeing large pages in lpp and the associated allocation records in plp. */
809 static void
810 lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
811 {
812 	pgcnt_t i, j;
813 	page_t *pp = lpp, *pp1;
814 	pmem_lpg_t *plp1, *plp2;
815 
816 	for (i = 0; i < lpgs; i++) {
817 		for (j = 0; j < pmem_pgcnt; j++) {
818 			/* IO unlock and hashout this small page. */
819 			page_io_unlock(pp);
820 			page_hashout(pp, NULL);
821 			pp1 = pp->p_next;
822 			pp->p_prev = pp->p_next = pp;
823 			pp = pp1;
824 		}
825 		/* Free one large page at one time. */
826 		page_free_pages(lpp);
827 		lpp = pp;
828 	}
829 	/* Free associate pmem large page allocation records. */
830 	for (plp1 = *plpp; *plpp; plp1 = plp2) {
831 		plp2 = plp1->pl_next;
832 		pmem_lpg_free(plpp, plp1);
833 	}
834 }
835 
836 /*
837  * IOlock and hashin all pages in tlist, associate them with vnode *pvnp
838  * and offset starting with *poffp. Update allocation records accordingly at
839  * the same time.
840  */
841 static void
842 tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
843 {
844 	page_t *pp;
845 	pgcnt_t i = 0;
846 	pmem_lpg_t *plp, *last_pl = NULL;
847 
848 	ASSERT(MUTEX_HELD(&pmem_mutex));
849 	for (pp = tlist; i < tpages; i++) {
850 		ASSERT(FROM_LPG(pp));
851 		page_io_lock(pp);
852 		(void) page_hashin(pp, pvnp, *poffp, NULL);
853 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
854 		/* Mark this page as allocated. */
855 		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
856 		*poffp += PAGESIZE;
857 		pp = pp->p_next;
858 	}
859 	ASSERT(pp == tlist);
860 }
861 
862 /*
863  * IOunlock and hashout all pages in tlist, update allocation records
864  * accordingly at the same time.
865  */
866 static void
867 tlist_out(page_t *tlist, pgcnt_t tpages)
868 {
869 	page_t *pp;
870 	pgcnt_t i = 0;
871 	pmem_lpg_t *plp, *last_pl = NULL;
872 
873 	ASSERT(MUTEX_HELD(&pmem_mutex));
874 	for (pp = tlist; i < tpages; i++) {
875 		ASSERT(FROM_LPG(pp));
876 		page_io_unlock(pp);
877 		page_hashout(pp, NULL);
878 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
879 		/* Mark this page as free. */
880 		BT_SET(plp->pl_bitmap, PFIND(pp));
881 		pp = pp->p_next;
882 	}
883 	ASSERT(pp == tlist);
884 }
885