xref: /titanic_51/usr/src/uts/i86pc/os/pmem.c (revision ebd1706e95186ddae1d4c0d63c47544cf33832ee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
/*
 * PMEM - direct mapping of physical memory pages into userland processes.
 *
 * Provides functions for allocating physical memory pages directly (i.e.
 * without occupying kernel virtual address space) and exporting them to
 * userland.
 */
35 
36 #include <sys/types.h>
37 #include <sys/mutex.h>
38 #include <sys/sunddi.h>
39 #include <sys/ddidevmap.h>
40 #include <sys/vnode.h>
41 #include <sys/sysmacros.h>
42 #include <sys/project.h>
43 #include <vm/seg_dev.h>
44 #include <sys/pmem.h>
45 #include <vm/hat_i86.h>
46 #include <sys/task.h>
47 #include <sys/sdt.h>
48 
49 /*
50  * The routines in this file allocate memory which will be accessed through
51  * the AGP GART hardware.  The GART is programmed with the PFNs for this
52  * memory, and the only mechanism for removing these entries is by an
53  * explicit process operation (ioctl/close of the driver, or process exit).
54  * As such, the pages need to remain locked to ensure that they won't be
55  * relocated or paged out.
56  *
57  * To prevent these locked pages from getting in the way of page
58  * coalescing, we try to allocate large pages from the system, and carve
59  * them up to satisfy pmem allocation requests.  This will keep the locked
60  * pages within a constrained area of physical memory, limiting the number
61  * of large pages that would be pinned by our locked pages.  This is, of
62  * course, another take on the infamous kernel cage, and it has many of the
63  * downsides of the original cage.  It also interferes with system-wide
64  * resource management decisions, as it maintains its own pool of unused
65  * pages which can't be easily reclaimed and used during low-memory
66  * situations.
67  *
68  * The right solution is for pmem to register a callback that the VM system
69  * could call, which would temporarily remove any GART entries for pages
70  * that were being relocated.  This would let us leave the pages unlocked,
71  * which would remove the need for using large pages, which would simplify
72  * this code a great deal.  Unfortunately, the support for these callbacks
73  * only exists on some SPARC platforms right now.
74  *
75  * Note that this is the *only* reason that large pages are used here.  The
76  * GART can't perform large-page translations, and the code appropriately
77  * falls back to using small pages if page_create_va_large() fails.
78  */
79 
/*
 * Acquire dhp->dh_lock, but only for handles that allow remapping
 * (DEVMAP_ALLOW_REMAP).  The do/while (0) wrapper makes the macro expand to
 * exactly one statement, so it is safe inside an unbraced if/else.
 */
#define	HOLD_DHP_LOCK(dhp) \
	do { \
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
			mutex_enter(&(dhp)->dh_lock); \
	} while (0)
82 
/*
 * Release dhp->dh_lock, but only for handles that allow remapping
 * (DEVMAP_ALLOW_REMAP).  The do/while (0) wrapper makes the macro expand to
 * exactly one statement, so it is safe inside an unbraced if/else.
 */
#define	RELE_DHP_LOCK(dhp) \
	do { \
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
			mutex_exit(&(dhp)->dh_lock); \
	} while (0)
85 
/* Nonzero when pp has a non-zero page size code (p_szc). */
#define	FROM_LPG(pp)	((pp)->p_szc != 0)
/*
 * Index of pp inside its large page, obtained by masking the page frame
 * number with pmem_pgcnt - 1 (assumes pmem_pgcnt is a power of two).
 */
#define	PFIND(pp)	(page_pptonum(pp) & (pmem_pgcnt - 1))
88 
89 /*
90  * Structs and static variables used for pmem only.
91  */
92 typedef struct pmem_lpg {
93 	page_t	*pl_pp;		/* start pp */
94 	ulong_t	*pl_bitmap;	/* allocation status for each page */
95 	ushort_t pl_pfree;	/* this large page might be fully freed */
96 	struct pmem_lpg *pl_next;
97 	struct pmem_lpg *pl_prev;
98 } pmem_lpg_t;
99 
100 static size_t	pmem_lpgsize;	/* the size of one large page */
101 static pgcnt_t	pmem_pgcnt;	/* the number of small pages in a large page */
102 static uint_t	pmem_lszc;	/* page size code of the large page */
103 /* The segment to be associated with all the allocated pages. */
104 static struct seg	pmem_seg;
105 /* Fully occupied large pages allocated for pmem. */
106 static pmem_lpg_t *pmem_occ_lpgs;
107 /* Memory pool to store residual small pages from large pages. */
108 static page_t	*pmem_mpool = NULL;
109 /* Number of small pages reside in pmem_mpool currently. */
110 static pgcnt_t	pmem_nmpages = 0;
111 /* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
112 kmutex_t	pmem_mutex;
113 
/*
 * Forward declarations of the file-local helpers.
 */

/* Large page allocation records (pmem_lpg_t lists). */
static int lpg_isfree(pmem_lpg_t *);
static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_alloc(uint_t);
static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
static void lpg_free(page_t *spp);
/* pmem_mpool, the pool of residual small pages. */
static pgcnt_t mpool_break(page_t **, pgcnt_t);
static void mpool_append(page_t **, pgcnt_t);
/* Lists of freshly created large pages. */
static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
    vnode_t *, u_offset_t *, uint_t);
/* Hashing pages taken from pmem_mpool in and out. */
static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
static void tlist_out(page_t *, pgcnt_t);
/* Cookie allocation and locked-memory accounting. */
static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
static int pmem_lock(pgcnt_t, kproject_t **);
131 
132 /*
133  * Called by driver devmap routine to pass physical memory mapping info to
134  * seg_dev framework, used only for physical memory allocated from
135  * devmap_pmem_alloc().
136  */
137 /* ARGSUSED */
138 int
139 devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
140     struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
141     offset_t off, size_t len, uint_t maxprot, uint_t flags,
142     ddi_device_acc_attr_t *accattrp)
143 {
144 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
145 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
146 
147 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
148 		return (DDI_FAILURE);
149 
150 	/*
151 	 * First to check if this function has been called for this dhp.
152 	 */
153 	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
154 		return (DDI_FAILURE);
155 
156 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
157 		return (DDI_FAILURE);
158 
159 	if (flags & DEVMAP_MAPPING_INVALID) {
160 		/*
161 		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
162 		 * remap permission.
163 		 */
164 		if (!(flags & DEVMAP_ALLOW_REMAP))
165 			return (DDI_FAILURE);
166 	} else {
167 		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
168 		/* dh_roff is the offset inside the dh_pcookie. */
169 		dhp->dh_roff = ptob(btop(off));
170 	}
171 
172 	/*
173 	 * Only "No Cache" and "Write Combining" are supported. If any other
174 	 * cache type is specified, override with "No Cache".
175 	 */
176 	if (accattrp->devacc_attr_dataorder == DDI_MERGING_OK_ACC)
177 		dhp->dh_hat_attr = HAT_PLAT_NOCACHE | HAT_MERGING_OK;
178 	else
179 		dhp->dh_hat_attr = HAT_PLAT_NOCACHE | HAT_STRICTORDER;
180 	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
181 	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
182 	dhp->dh_len = ptob(btopr(len));
183 
184 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
185 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
186 
187 	if (callbackops != NULL) {
188 		bcopy(callbackops, &dhp->dh_callbackops,
189 		    sizeof (struct devmap_callback_ctl));
190 	}
191 
192 	/*
193 	 * Initialize dh_lock if we want to do remap.
194 	 */
195 	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
196 		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
197 		dhp->dh_flags |= DEVMAP_LOCK_INITED;
198 	}
199 
200 	dhp->dh_flags |= DEVMAP_SETUP_DONE;
201 
202 	return (DDI_SUCCESS);
203 }
204 
205 /*
206  * Replace existing mapping using a new cookie, mainly gets called when doing
207  * fork(). Should be called in associated devmap_dup(9E).
208  */
209 /* ARGSUSED */
210 int
211 devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
212     devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
213     uint_t flags, ddi_device_acc_attr_t *accattrp)
214 {
215 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
216 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
217 
218 	/*
219 	 * Reture failure if setup has not been done or no remap permission
220 	 * has been granted during the setup.
221 	 */
222 	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
223 	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
224 		return (DDI_FAILURE);
225 
226 	/* No flags supported for remap yet. */
227 	if (flags != 0)
228 		return (DDI_FAILURE);
229 
230 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
231 		return (DDI_FAILURE);
232 
233 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
234 		return (DDI_FAILURE);
235 
236 	HOLD_DHP_LOCK(dhp);
237 	/*
238 	 * Unload the old mapping of pages reloated with this dhp, so next
239 	 * fault will setup the new mappings. It is in segdev_faultpage that
240 	 * calls hat_devload to establish the mapping. Do this while holding
241 	 * the dhp lock so other faults dont reestablish the mappings.
242 	 */
243 	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
244 	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
245 
246 	/*
247 	 * Only "No Cache" and "Write Combining" are supported, if other cache
248 	 * type is specified, override with "No Cache".
249 	 */
250 	if (accattrp->devacc_attr_dataorder == DDI_MERGING_OK_ACC)
251 		dhp->dh_hat_attr = HAT_MERGING_OK;
252 	else
253 		dhp->dh_hat_attr = HAT_STRICTORDER;
254 	dhp->dh_pcookie = cookie;
255 	dhp->dh_roff = ptob(btop(off));
256 	dhp->dh_len = ptob(btopr(len));
257 
258 	/* Clear the large page size flag. */
259 	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
260 
261 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
262 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
263 	RELE_DHP_LOCK(dhp);
264 	return (DDI_SUCCESS);
265 }
266 
267 /*
268  * Directly (i.e., without occupying kernel virtual address space) allocate
269  * 'npages' physical memory pages for exporting to user land. The allocated
270  * page_t pointer will be recorded in cookie.
271  */
272 int
273 devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
274 {
275 	u_offset_t	pmem_off = 0;
276 	page_t		*pp = NULL;
277 	page_t		*lpp = NULL;
278 	page_t		*tlist = NULL;
279 	pgcnt_t		i = 0;
280 	pgcnt_t		rpages = 0;
281 	pgcnt_t		lpages = 0;
282 	pgcnt_t		tpages = 0;
283 	pgcnt_t		npages = btopr(size);
284 	pmem_lpg_t	*plp = NULL;
285 	struct devmap_pmem_cookie	*pcp;
286 	uint_t		reserved = 0;
287 	uint_t		locked = 0;
288 	uint_t		pflags, kflags;
289 
290 	*cookiep = NULL;
291 
292 	/*
293 	 * Number larger than this will cause page_create_va() to loop
294 	 * infinitely.
295 	 */
296 	if (npages == 0 || npages >= total_pages / 2)
297 		return (DDI_FAILURE);
298 	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
299 		return (DDI_FAILURE);
300 	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
301 	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;
302 
303 	/* Allocate pmem cookie. */
304 	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
305 		return (DDI_FAILURE);
306 	pcp->dp_npages = npages;
307 
308 	/*
309 	 * See if the requested memory can be locked. Currently we do resource
310 	 * controls on the project levlel only.
311 	 */
312 	if (pmem_lock(npages, &(pcp->dp_projp)) == DDI_FAILURE)
313 		goto alloc_fail;
314 	locked = 1;
315 
316 	/*
317 	 * First, grab as many as possible from pmem_mpool. If pages in
318 	 * pmem_mpool are enough for this request, we are done.
319 	 */
320 	mutex_enter(&pmem_mutex);
321 	tpages = mpool_break(&tlist, npages);
322 	/* IOlock and hashin them into the new offset. */
323 	if (tpages)
324 		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
325 	mutex_exit(&pmem_mutex);
326 
327 	if (tpages == npages)
328 		goto done;
329 
330 	rpages = npages - tpages;
331 	/* Quit now if memory cannot be reserved. */
332 	if (!page_resv(rpages, kflags))
333 		goto alloc_fail;
334 	reserved = 1;
335 
336 	/* Try to allocate large pages first to decrease fragmentation. */
337 	i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
338 	if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
339 	    kflags) == DDI_FAILURE)
340 		goto alloc_fail;
341 	ASSERT(lpages == 0 ? lpp == NULL : 1);
342 
343 	/*
344 	 * Pages in large pages is more than the request, put the residual
345 	 * pages into pmem_mpool.
346 	 */
347 	if (lpages >= rpages) {
348 		lpp_break(&lpp, lpages, lpages - rpages, plp);
349 		goto done;
350 	}
351 
352 	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
353 	i =  rpages - lpages;
354 	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
355 	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
356 		goto alloc_fail;
357 
358 done:
359 	page_list_concat(&tlist, &lpp);
360 	page_list_concat(&tlist, &pp);
361 	/* Set those small pages from large pages as allocated. */
362 	mutex_enter(&pmem_mutex);
363 	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
364 	mutex_exit(&pmem_mutex);
365 
366 	/*
367 	 * Now tlist holds all the pages for this cookie. Record these pages in
368 	 * pmem cookie.
369 	 */
370 	for (pp = tlist, i = 0; i < npages; i++) {
371 		pcp->dp_pparray[i] = pp;
372 		page_io_unlock(pp);
373 		pp = pp->p_next;
374 		page_sub(&tlist, pp->p_prev);
375 	}
376 	ASSERT(tlist == NULL);
377 	*cookiep = (devmap_pmem_cookie_t)pcp;
378 
379 	return (DDI_SUCCESS);
380 
381 alloc_fail:
382 	DTRACE_PROBE(pmem__alloc__fail);
383 	/* Free large pages and the associated allocation records. */
384 	if (lpp)
385 		lpp_free(lpp, lpages / pmem_pgcnt, &plp);
386 	if (reserved == 1)
387 		page_unresv(rpages);
388 	/* Put those pages in tlist back into pmem_mpool. */
389 	if (tpages != 0) {
390 		mutex_enter(&pmem_mutex);
391 		/* IOunlock, hashout and update the allocation records. */
392 		tlist_out(tlist, tpages);
393 		mpool_append(&tlist, tpages);
394 		mutex_exit(&pmem_mutex);
395 	}
396 	if (locked == 1)
397 		i_ddi_decr_locked_memory(NULL, NULL, pcp->dp_projp, NULL,
398 		    ptob(pcp->dp_npages));
399 	/* Freeing pmem_cookie. */
400 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
401 	kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
402 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
403 	return (DDI_FAILURE);
404 }
405 
406 /*
407  * Free all small pages inside cookie, and return pages from large pages into
408  * mpool, if all the pages from one large page is in mpool, free it as a whole.
409  */
410 void
411 devmap_pmem_free(devmap_pmem_cookie_t cookie)
412 {
413 	struct	devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
414 	pgcnt_t		i;
415 	pgcnt_t		tpages = 0;
416 	page_t		*pp;
417 	pmem_lpg_t 	*pl1, *plp;
418 	pmem_lpg_t	*pf_lpgs = NULL;
419 	uint_t		npls = 0;
420 	pmem_lpg_t *last_pl = NULL;
421 	pmem_lpg_t *plast_pl = NULL;
422 
423 	ASSERT(pcp);
424 	mutex_enter(&pmem_mutex);
425 	/* Free small pages and return them to memory pool. */
426 	for (i = pcp->dp_npages; i > 0; i--) {
427 		pp = pcp->dp_pparray[i - 1];
428 		page_hashout(pp, NULL);
429 		/*
430 		 * Remove the mapping of this single page, this mapping is
431 		 * created using hat_devload() in segdev_faultpage().
432 		 */
433 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
434 		if (!FROM_LPG(pp)) {
435 			/* Normal small page. */
436 			page_free(pp, 1);
437 			page_unresv(1);
438 		} else {
439 			/* Small page from large pages. */
440 			plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
441 			if (plp && !(plp->pl_pfree)) {
442 				/*
443 				 * Move this record to pf_lpgs list, this large
444 				 * page may be able to be freed as a whole.
445 				 */
446 				pmem_lpg_sub(&pmem_occ_lpgs, plp);
447 				pmem_lpg_concat(&pf_lpgs, &plp);
448 				plp->pl_pfree = 1;
449 				npls++;
450 				last_pl = NULL;
451 			} else {
452 				/* Search in pf_lpgs list. */
453 				plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
454 			}
455 			ASSERT(plp);
456 			/* Mark this page as free. */
457 			BT_SET(plp->pl_bitmap, PFIND(pp));
458 			/* Record this page in pmem_mpool. */
459 			mpool_append(&pp, 1);
460 		}
461 	}
462 
463 	/*
464 	 * Find out the large pages whose pages have been freed, remove them
465 	 * from plp list, free them and the associated pmem_lpg struct.
466 	 */
467 	for (plp = pf_lpgs; npls != 0; npls--) {
468 		pl1 = plp;
469 		plp = plp->pl_next;
470 		if (lpg_isfree(pl1)) {
471 			/*
472 			 * Get one free large page.  Find all pages in this
473 			 * large page and remove them from pmem_mpool.
474 			 */
475 			lpg_free(pl1->pl_pp);
476 			/* Remove associated allocation records. */
477 			pmem_lpg_sub(&pf_lpgs, pl1);
478 			pmem_lpg_free(&pf_lpgs, pl1);
479 			tpages -= pmem_pgcnt;
480 		} else
481 			pl1->pl_pfree = 0;
482 	}
483 	/* Update allocation records accordingly. */
484 	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
485 	mutex_exit(&pmem_mutex);
486 
487 	i_ddi_decr_locked_memory(NULL, NULL, (kproject_t *)pcp->dp_projp, NULL,
488 	    ptob(pcp->dp_npages));
489 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
490 	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
491 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
492 }
493 
494 /*
495  * To extract page frame number from specified range in a cookie.
496  */
497 int
498 devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
499     pfn_t *pfnarray)
500 {
501 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
502 	pgcnt_t i;
503 
504 	if (pcp == NULL || start + npages > pcp->dp_npages)
505 		return (DDI_FAILURE);
506 
507 	for (i = start; i < start + npages; i++)
508 		pfnarray[i - start] = pcp->dp_pparray[i]->p_pagenum;
509 	return (DDI_SUCCESS);
510 }
511 
512 void
513 pmem_init()
514 {
515 	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
516 	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
517 	pmem_lpgsize = page_get_pagesize(pmem_lszc);
518 	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
519 	bzero(&pmem_seg, sizeof (struct seg));
520 	pmem_seg.s_as = &kas;
521 }
522 
523 /* Allocate kernel memory for one pmem cookie with n pages. */
524 static int
525 pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
526 {
527 	struct devmap_pmem_cookie *pcp;
528 
529 	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
530 	    kflags)) == NULL)
531 		return (DDI_FAILURE);
532 	pcp = *pcpp;
533 	if ((pcp->dp_vnp =
534 	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
535 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
536 		return (DDI_FAILURE);
537 	}
538 	if ((pcp->dp_pparray =
539 	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
540 		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
541 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
542 		return (DDI_FAILURE);
543 	}
544 	return (DDI_SUCCESS);
545 }
546 
547 /* Try to lock down n pages resource for current project. */
548 static int
549 pmem_lock(pgcnt_t n, kproject_t **prjpp)
550 {
551 	mutex_enter(&curproc->p_lock);
552 	if (i_ddi_incr_locked_memory(curproc, NULL, NULL, NULL,
553 	    ptob(n)) != 0) {
554 		mutex_exit(&curproc->p_lock);
555 		return (DDI_FAILURE);
556 	}
557 	/* Store this project in cookie for later lock/unlock. */
558 	*prjpp = curproc->p_task->tk_proj;
559 	mutex_exit(&curproc->p_lock);
560 	return (DDI_SUCCESS);
561 }
562 
563 /* To check if all the pages in a large page are freed. */
564 static int
565 lpg_isfree(pmem_lpg_t *plp)
566 {
567 	uint_t i;
568 
569 	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
570 		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
571 			return (0);
572 	/* All 1 means all pages are freed. */
573 	return (1);
574 }
575 
576 /*
577  * Using pp to get the associated large page allocation record, searching in
578  * the splp linked list with *last as the heuristic pointer. Return NULL if
579  * not found.
580  */
581 static pmem_lpg_t *
582 pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
583 {
584 	pmem_lpg_t *plp;
585 	pgcnt_t root_pfn;
586 
587 	ASSERT(pp);
588 	if (splp == NULL)
589 		return (NULL);
590 	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);
591 
592 	/* Try last winner first. */
593 	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
594 		goto pl_found;
595 
596 	/* Else search the whole pmem_lpg list. */
597 	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
598 		plp = plp->pl_next;
599 		if (plp == splp) {
600 			plp = NULL;
601 			break;
602 		}
603 		ASSERT(plp->pl_pp);
604 	}
605 
606 	*last = plp;
607 
608 pl_found:
609 	return (*last);
610 }
611 
612 /*
613  *  Remove one pmem_lpg plp from the oplpp list.
614  */
615 static void
616 pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
617 {
618 	if (*oplpp == plp)
619 		*oplpp = plp->pl_next;		/* go to next pmem_lpg */
620 
621 	if (*oplpp == plp)
622 		*oplpp = NULL;			/* pmem_lpg list is gone */
623 	else {
624 		plp->pl_prev->pl_next = plp->pl_next;
625 		plp->pl_next->pl_prev = plp->pl_prev;
626 	}
627 	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
628 }
629 
630 /*
631  * Concatenate page list nplpp onto the end of list plpp.
632  */
633 static void
634 pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
635 {
636 	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;
637 
638 	if (*nplpp == NULL) {
639 		return;
640 	}
641 	if (*plpp == NULL) {
642 		*plpp = *nplpp;
643 		return;
644 	}
645 	s1p = *plpp;
646 	e1p =  s1p->pl_prev;
647 	s2p = *nplpp;
648 	e2p = s2p->pl_prev;
649 	s1p->pl_prev = e2p;
650 	e2p->pl_next = s1p;
651 	e1p->pl_next = s2p;
652 	s2p->pl_prev = e1p;
653 }
654 
655 /*
656  * Allocate and initialize the allocation record of one large page, the init
657  * value is 'allocated'.
658  */
659 static pmem_lpg_t *
660 pmem_lpg_alloc(uint_t kflags)
661 {
662 	pmem_lpg_t *plp;
663 
664 	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
665 	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
666 	if (plp == NULL)
667 		return (NULL);
668 	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
669 	if (plp->pl_bitmap == NULL) {
670 		kmem_free(plp, sizeof (*plp));
671 		return (NULL);
672 	}
673 	plp->pl_next = plp->pl_prev = plp;
674 	return (plp);
675 }
676 
677 /* Free one allocation record pointed by oplp. */
678 static void
679 pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
680 {
681 	if (*headp == plp)
682 		*headp = plp->pl_next;		/* go to next pmem_lpg_t */
683 
684 	if (*headp == plp)
685 		*headp = NULL;			/* this list is gone */
686 	else {
687 		plp->pl_prev->pl_next = plp->pl_next;
688 		plp->pl_next->pl_prev = plp->pl_prev;
689 	}
690 	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
691 	kmem_free(plp, sizeof (*plp));
692 }
693 
694 /* Free one large page headed by spp from pmem_mpool. */
695 static void
696 lpg_free(page_t *spp)
697 {
698 	page_t *pp1 = spp;
699 	uint_t i;
700 
701 	ASSERT(MUTEX_HELD(&pmem_mutex));
702 	for (i = 0; i < pmem_pgcnt; i++) {
703 		/* Break pp1 from pmem_mpool. */
704 		page_sub(&pmem_mpool, pp1);
705 		pp1++;
706 	}
707 	/* Free pages in this large page. */
708 	page_free_pages(spp);
709 	page_unresv(pmem_pgcnt);
710 	pmem_nmpages -= pmem_pgcnt;
711 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
712 }
713 
714 /* Put n pages in *ppp list back into pmem_mpool. */
715 static void
716 mpool_append(page_t **ppp, pgcnt_t n)
717 {
718 	ASSERT(MUTEX_HELD(&pmem_mutex));
719 	/* Put back pages. */
720 	page_list_concat(&pmem_mpool, ppp);
721 	pmem_nmpages += n;
722 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
723 }
724 
725 /*
726  * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into *ppp
727  * list, and return the number of grabbed pages.
728  */
729 static pgcnt_t
730 mpool_break(page_t **ppp, pgcnt_t n)
731 {
732 	pgcnt_t i;
733 
734 	ASSERT(MUTEX_HELD(&pmem_mutex));
735 	/* Grab the pages. */
736 	i = MIN(pmem_nmpages, n);
737 	*ppp = pmem_mpool;
738 	page_list_break(ppp, &pmem_mpool, i);
739 	pmem_nmpages -= i;
740 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
741 	return (i);
742 }
743 
744 /*
745  * Create n large pages, lpages and plpp contains the number of small pages and
746  * allocation records list respectively.
747  */
748 static int
749 lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
750     vnode_t *vnp, u_offset_t *offp, uint_t kflags)
751 {
752 	pgcnt_t i;
753 	pmem_lpg_t *plp;
754 	page_t *pp;
755 
756 	for (i = 0, *lpages = 0; i < n; i++) {
757 		/* Allocte one large page each time. */
758 		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
759 		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
760 		if (pp == NULL)
761 			break;
762 		*offp += pmem_lpgsize;
763 		page_list_concat(lppp, &pp);
764 		*lpages += pmem_pgcnt;
765 		/* Add one allocation record for this large page. */
766 		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
767 			return (DDI_FAILURE);
768 		plp->pl_pp = pp;
769 		pmem_lpg_concat(plpp, &plp);
770 	}
771 	return (DDI_SUCCESS);
772 }
773 
774 /*
775  * Break the last r small pages from the large page list *lppp (with totally n
776  * small pages) and put them into pmem_mpool.
777  */
778 static void
779 lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
780 {
781 	page_t *pp, *pp1;
782 	pgcnt_t i;
783 	pmem_lpg_t *plp;
784 
785 	if (r == 0)
786 		return;
787 	ASSERT(*lppp != NULL && r < pmem_pgcnt);
788 	page_list_break(lppp, &pp, n - r);
789 
790 	/* The residual should reside in the last large page.  */
791 	plp = oplp->pl_prev;
792 	/* IOunlock and hashout the residual pages. */
793 	for (pp1 = pp, i = 0; i < r; i++) {
794 		page_io_unlock(pp1);
795 		page_hashout(pp1, NULL);
796 		/* Mark this page as free. */
797 		BT_SET(plp->pl_bitmap, PFIND(pp1));
798 		pp1 = pp1->p_next;
799 	}
800 	ASSERT(pp1 == pp);
801 	/* Put these residual pages into memory pool. */
802 	mutex_enter(&pmem_mutex);
803 	mpool_append(&pp, r);
804 	mutex_exit(&pmem_mutex);
805 }
806 
807 /* Freeing large pages in lpp and the associated allocation records in plp. */
808 static void
809 lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
810 {
811 	pgcnt_t i, j;
812 	page_t *pp = lpp, *pp1;
813 	pmem_lpg_t *plp1, *plp2;
814 
815 	for (i = 0; i < lpgs; i++) {
816 		for (j = 0; j < pmem_pgcnt; j++) {
817 			/* IO unlock and hashout this small page. */
818 			page_io_unlock(pp);
819 			page_hashout(pp, NULL);
820 			pp1 = pp->p_next;
821 			pp->p_prev = pp->p_next = pp;
822 			pp = pp1;
823 		}
824 		/* Free one large page at one time. */
825 		page_free_pages(lpp);
826 		lpp = pp;
827 	}
828 	/* Free associate pmem large page allocation records. */
829 	for (plp1 = *plpp; *plpp; plp1 = plp2) {
830 		plp2 = plp1->pl_next;
831 		pmem_lpg_free(plpp, plp1);
832 	}
833 }
834 
835 /*
836  * IOlock and hashin all pages in tlist, associate them with vnode *pvnp
837  * and offset starting with *poffp. Update allocation records accordingly at
838  * the same time.
839  */
840 static void
841 tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
842 {
843 	page_t *pp;
844 	pgcnt_t i = 0;
845 	pmem_lpg_t *plp, *last_pl = NULL;
846 
847 	ASSERT(MUTEX_HELD(&pmem_mutex));
848 	for (pp = tlist; i < tpages; i++) {
849 		ASSERT(FROM_LPG(pp));
850 		page_io_lock(pp);
851 		(void) page_hashin(pp, pvnp, *poffp, NULL);
852 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
853 		/* Mark this page as allocated. */
854 		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
855 		*poffp += PAGESIZE;
856 		pp = pp->p_next;
857 	}
858 	ASSERT(pp == tlist);
859 }
860 
861 /*
862  * IOunlock and hashout all pages in tlist, update allocation records
863  * accordingly at the same time.
864  */
865 static void
866 tlist_out(page_t *tlist, pgcnt_t tpages)
867 {
868 	page_t *pp;
869 	pgcnt_t i = 0;
870 	pmem_lpg_t *plp, *last_pl = NULL;
871 
872 	ASSERT(MUTEX_HELD(&pmem_mutex));
873 	for (pp = tlist; i < tpages; i++) {
874 		ASSERT(FROM_LPG(pp));
875 		page_io_unlock(pp);
876 		page_hashout(pp, NULL);
877 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
878 		/* Mark this page as free. */
879 		BT_SET(plp->pl_bitmap, PFIND(pp));
880 		pp = pp->p_next;
881 	}
882 	ASSERT(pp == tlist);
883 }
884