xref: /titanic_51/usr/src/uts/i86pc/os/pmem.c (revision 672986541be54a7a471bb088e60780c37e371d7e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * PMEM - Direct mapping physical memory pages to userland process
30  *
31  * Provide functions used for directly (w/o occupying kernel virtual address
32  * space) allocating and exporting physical memory pages to userland.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/mutex.h>
37 #include <sys/sunddi.h>
38 #include <sys/ddidevmap.h>
39 #include <sys/vnode.h>
40 #include <sys/sysmacros.h>
41 #include <vm/seg_dev.h>
42 #include <sys/pmem.h>
43 #include <vm/hat_i86.h>
44 #include <sys/task.h>
45 #include <sys/sdt.h>
46 
47 /*
48  * The routines in this file allocate memory which will be accessed through
49  * the AGP GART hardware.  The GART is programmed with the PFNs for this
50  * memory, and the only mechanism for removing these entries is by an
51  * explicit process operation (ioctl/close of the driver, or process exit).
52  * As such, the pages need to remain locked to ensure that they won't be
53  * relocated or paged out.
54  *
55  * To prevent these locked pages from getting in the way of page
56  * coalescing, we try to allocate large pages from the system, and carve
57  * them up to satisfy pmem allocation requests.  This will keep the locked
58  * pages within a constrained area of physical memory, limiting the number
59  * of large pages that would be pinned by our locked pages.  This is, of
60  * course, another take on the infamous kernel cage, and it has many of the
61  * downsides of the original cage.  It also interferes with system-wide
62  * resource management decisions, as it maintains its own pool of unused
63  * pages which can't be easily reclaimed and used during low-memory
64  * situations.
65  *
66  * The right solution is for pmem to register a callback that the VM system
67  * could call, which would temporarily remove any GART entries for pages
68  * that were being relocated.  This would let us leave the pages unlocked,
69  * which would remove the need for using large pages, which would simplify
70  * this code a great deal.  Unfortunately, the support for these callbacks
71  * only exists on some SPARC platforms right now.
72  *
73  * Note that this is the *only* reason that large pages are used here.  The
74  * GART can't perform large-page translations, and the code appropriately
75  * falls back to using small pages if page_create_va_large() fails.
76  */
77 
/*
 * Take / drop the per-handle lock.  The lock is only touched for handles
 * that carry DEVMAP_ALLOW_REMAP, which are the only handles for which
 * devmap_pmem_setup() initializes dh_lock.
 *
 * Both macros are wrapped in do { } while (0) so that they behave as a
 * single statement (e.g. as the body of an if/else), and the argument is
 * parenthesized so that any pointer expression may be passed.
 */
#define	HOLD_DHP_LOCK(dhp) \
	do { \
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
			mutex_enter(&(dhp)->dh_lock); \
	} while (0)

#define	RELE_DHP_LOCK(dhp) \
	do { \
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
			mutex_exit(&(dhp)->dh_lock); \
	} while (0)

/* Non-zero when pp has a non-zero page size code (p_szc). */
#define	FROM_LPG(pp)	((pp)->p_szc != 0)
/* Index of small page pp inside its large page (pmem_pgcnt is a power of 2). */
#define	PFIND(pp)	(page_pptonum(pp) & (pmem_pgcnt - 1))
86 
87 /*
88  * Structs and static variables used for pmem only.
89  */
90 typedef struct pmem_lpg {
91 	page_t	*pl_pp;		/* start pp */
92 	ulong_t	*pl_bitmap;	/* allocation status for each page */
93 	ushort_t pl_pfree;	/* this large page might be fully freed */
94 	struct pmem_lpg *pl_next;
95 	struct pmem_lpg *pl_prev;
96 } pmem_lpg_t;
97 
98 static size_t	pmem_lpgsize;	/* the size of one large page */
99 static pgcnt_t	pmem_pgcnt;	/* the number of small pages in a large page */
100 static uint_t	pmem_lszc;	/* page size code of the large page */
101 /* The segment to be associated with all the allocated pages. */
102 static struct seg	pmem_seg;
103 /* Fully occupied large pages allocated for pmem. */
104 static pmem_lpg_t *pmem_occ_lpgs;
105 /* Memory pool to store residual small pages from large pages. */
106 static page_t	*pmem_mpool = NULL;
107 /* Number of small pages reside in pmem_mpool currently. */
108 static pgcnt_t	pmem_nmpages = 0;
109 /* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
110 kmutex_t	pmem_mutex;
111 
112 static int lpg_isfree(pmem_lpg_t *);
113 static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
114 static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
115 static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
116 static pmem_lpg_t *pmem_lpg_alloc(uint_t);
117 static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
118 static void lpg_free(page_t *spp);
119 static pgcnt_t mpool_break(page_t **, pgcnt_t);
120 static void mpool_append(page_t **, pgcnt_t);
121 static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
122 static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
123 static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
124     vnode_t *, u_offset_t *, uint_t);
125 static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
126 static void tlist_out(page_t *, pgcnt_t);
127 static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
128 static int pmem_lock(pgcnt_t, proc_t *p);
129 
130 /*
131  * Called by driver devmap routine to pass physical memory mapping info to
132  * seg_dev framework, used only for physical memory allocated from
133  * devmap_pmem_alloc().
134  */
135 /* ARGSUSED */
136 int
137 devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
138     struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
139     offset_t off, size_t len, uint_t maxprot, uint_t flags,
140     ddi_device_acc_attr_t *accattrp)
141 {
142 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
143 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
144 	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
145 
146 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
147 		return (DDI_FAILURE);
148 
149 	/*
150 	 * First to check if this function has been called for this dhp.
151 	 */
152 	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
153 		return (DDI_FAILURE);
154 
155 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
156 		return (DDI_FAILURE);
157 
158 	/*
159 	 * Check if the cache attributes are supported. Need to pay
160 	 * attention that only uncachable or write-combining is
161 	 * permitted for pmem.
162 	 */
163 	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
164 	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
165 		return (DDI_FAILURE);
166 
167 	if (flags & DEVMAP_MAPPING_INVALID) {
168 		/*
169 		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
170 		 * remap permission.
171 		 */
172 		if (!(flags & DEVMAP_ALLOW_REMAP))
173 			return (DDI_FAILURE);
174 	} else {
175 		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
176 		/* dh_roff is the offset inside the dh_pcookie. */
177 		dhp->dh_roff = ptob(btop(off));
178 		/* Set the cache attributes correctly */
179 		i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
180 	}
181 
182 	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
183 	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
184 	dhp->dh_len = ptob(btopr(len));
185 
186 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
187 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
188 
189 	if (callbackops != NULL) {
190 		bcopy(callbackops, &dhp->dh_callbackops,
191 		    sizeof (struct devmap_callback_ctl));
192 	}
193 
194 	/*
195 	 * Initialize dh_lock if we want to do remap.
196 	 */
197 	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
198 		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
199 		dhp->dh_flags |= DEVMAP_LOCK_INITED;
200 	}
201 
202 	dhp->dh_flags |= DEVMAP_SETUP_DONE;
203 
204 	return (DDI_SUCCESS);
205 }
206 
207 /*
208  * Replace existing mapping using a new cookie, mainly gets called when doing
209  * fork(). Should be called in associated devmap_dup(9E).
210  */
211 /* ARGSUSED */
212 int
213 devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
214     devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
215     uint_t flags, ddi_device_acc_attr_t *accattrp)
216 {
217 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
218 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
219 	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
220 
221 	/*
222 	 * Reture failure if setup has not been done or no remap permission
223 	 * has been granted during the setup.
224 	 */
225 	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
226 	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
227 		return (DDI_FAILURE);
228 
229 	/* No flags supported for remap yet. */
230 	if (flags != 0)
231 		return (DDI_FAILURE);
232 
233 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
234 		return (DDI_FAILURE);
235 
236 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
237 		return (DDI_FAILURE);
238 
239 	/*
240 	 * Check if the cache attributes are supported. Need to pay
241 	 * attention that only uncachable or write-combining is
242 	 * permitted for pmem.
243 	 */
244 	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
245 	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
246 		return (DDI_FAILURE);
247 
248 	HOLD_DHP_LOCK(dhp);
249 	/*
250 	 * Unload the old mapping of pages reloated with this dhp, so next
251 	 * fault will setup the new mappings. It is in segdev_faultpage that
252 	 * calls hat_devload to establish the mapping. Do this while holding
253 	 * the dhp lock so other faults dont reestablish the mappings.
254 	 */
255 	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
256 	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
257 
258 	/* Set the cache attributes correctly */
259 	i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
260 
261 	dhp->dh_pcookie = cookie;
262 	dhp->dh_roff = ptob(btop(off));
263 	dhp->dh_len = ptob(btopr(len));
264 
265 	/* Clear the large page size flag. */
266 	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
267 
268 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
269 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
270 	RELE_DHP_LOCK(dhp);
271 	return (DDI_SUCCESS);
272 }
273 
274 /*
275  * Directly (i.e., without occupying kernel virtual address space) allocate
276  * 'npages' physical memory pages for exporting to user land. The allocated
277  * page_t pointer will be recorded in cookie.
278  */
279 int
280 devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
281 {
282 	u_offset_t	pmem_off = 0;
283 	page_t		*pp = NULL;
284 	page_t		*lpp = NULL;
285 	page_t		*tlist = NULL;
286 	pgcnt_t		i = 0;
287 	pgcnt_t		rpages = 0;
288 	pgcnt_t		lpages = 0;
289 	pgcnt_t		tpages = 0;
290 	pgcnt_t		npages = btopr(size);
291 	pmem_lpg_t	*plp = NULL;
292 	struct devmap_pmem_cookie	*pcp;
293 	uint_t		reserved = 0;
294 	uint_t		locked = 0;
295 	uint_t		pflags, kflags;
296 
297 	*cookiep = NULL;
298 
299 	/*
300 	 * Number larger than this will cause page_create_va() to loop
301 	 * infinitely.
302 	 */
303 	if (npages == 0 || npages >= total_pages / 2)
304 		return (DDI_FAILURE);
305 	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
306 		return (DDI_FAILURE);
307 	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
308 	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;
309 
310 	/* Allocate pmem cookie. */
311 	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
312 		return (DDI_FAILURE);
313 	pcp->dp_npages = npages;
314 
315 	/*
316 	 * See if the requested memory can be locked.
317 	 */
318 	pcp->dp_proc = curproc;
319 	if (pmem_lock(npages, curproc) == DDI_FAILURE)
320 		goto alloc_fail;
321 	locked = 1;
322 	/*
323 	 * First, grab as many as possible from pmem_mpool. If pages in
324 	 * pmem_mpool are enough for this request, we are done.
325 	 */
326 	mutex_enter(&pmem_mutex);
327 	tpages = mpool_break(&tlist, npages);
328 	/* IOlock and hashin them into the new offset. */
329 	if (tpages)
330 		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
331 	mutex_exit(&pmem_mutex);
332 
333 	if (tpages == npages)
334 		goto done;
335 
336 	rpages = npages - tpages;
337 	/* Quit now if memory cannot be reserved. */
338 	if (!page_resv(rpages, kflags))
339 		goto alloc_fail;
340 	reserved = 1;
341 
342 	/* Try to allocate large pages first to decrease fragmentation. */
343 	i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
344 	if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
345 	    kflags) == DDI_FAILURE)
346 		goto alloc_fail;
347 	ASSERT(lpages == 0 ? lpp == NULL : 1);
348 
349 	/*
350 	 * Pages in large pages is more than the request, put the residual
351 	 * pages into pmem_mpool.
352 	 */
353 	if (lpages >= rpages) {
354 		lpp_break(&lpp, lpages, lpages - rpages, plp);
355 		goto done;
356 	}
357 
358 	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
359 	i =  rpages - lpages;
360 	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
361 	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
362 		goto alloc_fail;
363 
364 done:
365 	page_list_concat(&tlist, &lpp);
366 	page_list_concat(&tlist, &pp);
367 	/* Set those small pages from large pages as allocated. */
368 	mutex_enter(&pmem_mutex);
369 	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
370 	mutex_exit(&pmem_mutex);
371 
372 	/*
373 	 * Now tlist holds all the pages for this cookie. Record these pages in
374 	 * pmem cookie.
375 	 */
376 	for (pp = tlist, i = 0; i < npages; i++) {
377 		pcp->dp_pparray[i] = pp;
378 		page_io_unlock(pp);
379 		pp = pp->p_next;
380 		page_sub(&tlist, pp->p_prev);
381 	}
382 	ASSERT(tlist == NULL);
383 	*cookiep = (devmap_pmem_cookie_t)pcp;
384 
385 	return (DDI_SUCCESS);
386 
387 alloc_fail:
388 	DTRACE_PROBE(pmem__alloc__fail);
389 	/* Free large pages and the associated allocation records. */
390 	if (lpp)
391 		lpp_free(lpp, lpages / pmem_pgcnt, &plp);
392 	if (reserved == 1)
393 		page_unresv(rpages);
394 	/* Put those pages in tlist back into pmem_mpool. */
395 	if (tpages != 0) {
396 		mutex_enter(&pmem_mutex);
397 		/* IOunlock, hashout and update the allocation records. */
398 		tlist_out(tlist, tpages);
399 		mpool_append(&tlist, tpages);
400 		mutex_exit(&pmem_mutex);
401 	}
402 	if (locked == 1)
403 		i_ddi_decr_locked_memory(pcp->dp_proc, ptob(pcp->dp_npages));
404 	/* Freeing pmem_cookie. */
405 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
406 	kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
407 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
408 	return (DDI_FAILURE);
409 }
410 
411 /*
412  * Free all small pages inside cookie, and return pages from large pages into
413  * mpool, if all the pages from one large page is in mpool, free it as a whole.
414  */
415 void
416 devmap_pmem_free(devmap_pmem_cookie_t cookie)
417 {
418 	struct	devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
419 	pgcnt_t		i;
420 	pgcnt_t		tpages = 0;
421 	page_t		*pp;
422 	pmem_lpg_t 	*pl1, *plp;
423 	pmem_lpg_t	*pf_lpgs = NULL;
424 	uint_t		npls = 0;
425 	pmem_lpg_t *last_pl = NULL;
426 	pmem_lpg_t *plast_pl = NULL;
427 
428 	ASSERT(pcp);
429 	mutex_enter(&pmem_mutex);
430 	/* Free small pages and return them to memory pool. */
431 	for (i = pcp->dp_npages; i > 0; i--) {
432 		pp = pcp->dp_pparray[i - 1];
433 		page_hashout(pp, NULL);
434 		/*
435 		 * Remove the mapping of this single page, this mapping is
436 		 * created using hat_devload() in segdev_faultpage().
437 		 */
438 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
439 		if (!FROM_LPG(pp)) {
440 			/* Normal small page. */
441 			page_free(pp, 1);
442 			page_unresv(1);
443 		} else {
444 			/* Small page from large pages. */
445 			plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
446 			if (plp && !(plp->pl_pfree)) {
447 				/*
448 				 * Move this record to pf_lpgs list, this large
449 				 * page may be able to be freed as a whole.
450 				 */
451 				pmem_lpg_sub(&pmem_occ_lpgs, plp);
452 				pmem_lpg_concat(&pf_lpgs, &plp);
453 				plp->pl_pfree = 1;
454 				npls++;
455 				last_pl = NULL;
456 			} else {
457 				/* Search in pf_lpgs list. */
458 				plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
459 			}
460 			ASSERT(plp);
461 			/* Mark this page as free. */
462 			BT_SET(plp->pl_bitmap, PFIND(pp));
463 			/* Record this page in pmem_mpool. */
464 			mpool_append(&pp, 1);
465 		}
466 	}
467 
468 	/*
469 	 * Find out the large pages whose pages have been freed, remove them
470 	 * from plp list, free them and the associated pmem_lpg struct.
471 	 */
472 	for (plp = pf_lpgs; npls != 0; npls--) {
473 		pl1 = plp;
474 		plp = plp->pl_next;
475 		if (lpg_isfree(pl1)) {
476 			/*
477 			 * Get one free large page.  Find all pages in this
478 			 * large page and remove them from pmem_mpool.
479 			 */
480 			lpg_free(pl1->pl_pp);
481 			/* Remove associated allocation records. */
482 			pmem_lpg_sub(&pf_lpgs, pl1);
483 			pmem_lpg_free(&pf_lpgs, pl1);
484 			tpages -= pmem_pgcnt;
485 		} else
486 			pl1->pl_pfree = 0;
487 	}
488 	/* Update allocation records accordingly. */
489 	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
490 	mutex_exit(&pmem_mutex);
491 
492 	if (curproc == pcp->dp_proc)
493 		i_ddi_decr_locked_memory(curproc, ptob(pcp->dp_npages));
494 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
495 	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
496 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
497 }
498 
499 /*
500  * To extract page frame number from specified range in a cookie.
501  */
502 int
503 devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
504     pfn_t *pfnarray)
505 {
506 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
507 	pgcnt_t i;
508 
509 	if (pcp == NULL || start + npages > pcp->dp_npages)
510 		return (DDI_FAILURE);
511 
512 	for (i = start; i < start + npages; i++)
513 		pfnarray[i - start] = pcp->dp_pparray[i]->p_pagenum;
514 	return (DDI_SUCCESS);
515 }
516 
517 void
518 pmem_init()
519 {
520 	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
521 	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
522 	pmem_lpgsize = page_get_pagesize(pmem_lszc);
523 	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
524 	bzero(&pmem_seg, sizeof (struct seg));
525 	pmem_seg.s_as = &kas;
526 }
527 
528 /* Allocate kernel memory for one pmem cookie with n pages. */
529 static int
530 pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
531 {
532 	struct devmap_pmem_cookie *pcp;
533 
534 	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
535 	    kflags)) == NULL)
536 		return (DDI_FAILURE);
537 	pcp = *pcpp;
538 	if ((pcp->dp_vnp =
539 	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
540 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
541 		return (DDI_FAILURE);
542 	}
543 	if ((pcp->dp_pparray =
544 	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
545 		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
546 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
547 		return (DDI_FAILURE);
548 	}
549 	return (DDI_SUCCESS);
550 }
551 
552 /* Try to lock down n pages resource */
553 static int
554 pmem_lock(pgcnt_t n, proc_t *p)
555 {
556 	if (i_ddi_incr_locked_memory(p, ptob(n)) != 0) {
557 		return (DDI_FAILURE);
558 	}
559 	return (DDI_SUCCESS);
560 }
561 
562 /* To check if all the pages in a large page are freed. */
563 static int
564 lpg_isfree(pmem_lpg_t *plp)
565 {
566 	uint_t i;
567 
568 	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
569 		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
570 			return (0);
571 	/* All 1 means all pages are freed. */
572 	return (1);
573 }
574 
575 /*
576  * Using pp to get the associated large page allocation record, searching in
577  * the splp linked list with *last as the heuristic pointer. Return NULL if
578  * not found.
579  */
580 static pmem_lpg_t *
581 pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
582 {
583 	pmem_lpg_t *plp;
584 	pgcnt_t root_pfn;
585 
586 	ASSERT(pp);
587 	if (splp == NULL)
588 		return (NULL);
589 	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);
590 
591 	/* Try last winner first. */
592 	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
593 		goto pl_found;
594 
595 	/* Else search the whole pmem_lpg list. */
596 	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
597 		plp = plp->pl_next;
598 		if (plp == splp) {
599 			plp = NULL;
600 			break;
601 		}
602 		ASSERT(plp->pl_pp);
603 	}
604 
605 	*last = plp;
606 
607 pl_found:
608 	return (*last);
609 }
610 
611 /*
612  *  Remove one pmem_lpg plp from the oplpp list.
613  */
614 static void
615 pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
616 {
617 	if (*oplpp == plp)
618 		*oplpp = plp->pl_next;		/* go to next pmem_lpg */
619 
620 	if (*oplpp == plp)
621 		*oplpp = NULL;			/* pmem_lpg list is gone */
622 	else {
623 		plp->pl_prev->pl_next = plp->pl_next;
624 		plp->pl_next->pl_prev = plp->pl_prev;
625 	}
626 	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
627 }
628 
629 /*
630  * Concatenate page list nplpp onto the end of list plpp.
631  */
632 static void
633 pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
634 {
635 	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;
636 
637 	if (*nplpp == NULL) {
638 		return;
639 	}
640 	if (*plpp == NULL) {
641 		*plpp = *nplpp;
642 		return;
643 	}
644 	s1p = *plpp;
645 	e1p =  s1p->pl_prev;
646 	s2p = *nplpp;
647 	e2p = s2p->pl_prev;
648 	s1p->pl_prev = e2p;
649 	e2p->pl_next = s1p;
650 	e1p->pl_next = s2p;
651 	s2p->pl_prev = e1p;
652 }
653 
654 /*
655  * Allocate and initialize the allocation record of one large page, the init
656  * value is 'allocated'.
657  */
658 static pmem_lpg_t *
659 pmem_lpg_alloc(uint_t kflags)
660 {
661 	pmem_lpg_t *plp;
662 
663 	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
664 	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
665 	if (plp == NULL)
666 		return (NULL);
667 	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
668 	if (plp->pl_bitmap == NULL) {
669 		kmem_free(plp, sizeof (*plp));
670 		return (NULL);
671 	}
672 	plp->pl_next = plp->pl_prev = plp;
673 	return (plp);
674 }
675 
676 /* Free one allocation record pointed by oplp. */
677 static void
678 pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
679 {
680 	if (*headp == plp)
681 		*headp = plp->pl_next;		/* go to next pmem_lpg_t */
682 
683 	if (*headp == plp)
684 		*headp = NULL;			/* this list is gone */
685 	else {
686 		plp->pl_prev->pl_next = plp->pl_next;
687 		plp->pl_next->pl_prev = plp->pl_prev;
688 	}
689 	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
690 	kmem_free(plp, sizeof (*plp));
691 }
692 
693 /* Free one large page headed by spp from pmem_mpool. */
694 static void
695 lpg_free(page_t *spp)
696 {
697 	page_t *pp1 = spp;
698 	uint_t i;
699 
700 	ASSERT(MUTEX_HELD(&pmem_mutex));
701 	for (i = 0; i < pmem_pgcnt; i++) {
702 		/* Break pp1 from pmem_mpool. */
703 		page_sub(&pmem_mpool, pp1);
704 		pp1++;
705 	}
706 	/* Free pages in this large page. */
707 	page_free_pages(spp);
708 	page_unresv(pmem_pgcnt);
709 	pmem_nmpages -= pmem_pgcnt;
710 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
711 }
712 
713 /* Put n pages in *ppp list back into pmem_mpool. */
714 static void
715 mpool_append(page_t **ppp, pgcnt_t n)
716 {
717 	ASSERT(MUTEX_HELD(&pmem_mutex));
718 	/* Put back pages. */
719 	page_list_concat(&pmem_mpool, ppp);
720 	pmem_nmpages += n;
721 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
722 }
723 
724 /*
725  * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into *ppp
726  * list, and return the number of grabbed pages.
727  */
728 static pgcnt_t
729 mpool_break(page_t **ppp, pgcnt_t n)
730 {
731 	pgcnt_t i;
732 
733 	ASSERT(MUTEX_HELD(&pmem_mutex));
734 	/* Grab the pages. */
735 	i = MIN(pmem_nmpages, n);
736 	*ppp = pmem_mpool;
737 	page_list_break(ppp, &pmem_mpool, i);
738 	pmem_nmpages -= i;
739 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
740 	return (i);
741 }
742 
743 /*
744  * Create n large pages, lpages and plpp contains the number of small pages and
745  * allocation records list respectively.
746  */
747 static int
748 lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
749     vnode_t *vnp, u_offset_t *offp, uint_t kflags)
750 {
751 	pgcnt_t i;
752 	pmem_lpg_t *plp;
753 	page_t *pp;
754 
755 	for (i = 0, *lpages = 0; i < n; i++) {
756 		/* Allocte one large page each time. */
757 		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
758 		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
759 		if (pp == NULL)
760 			break;
761 		*offp += pmem_lpgsize;
762 		page_list_concat(lppp, &pp);
763 		*lpages += pmem_pgcnt;
764 		/* Add one allocation record for this large page. */
765 		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
766 			return (DDI_FAILURE);
767 		plp->pl_pp = pp;
768 		pmem_lpg_concat(plpp, &plp);
769 	}
770 	return (DDI_SUCCESS);
771 }
772 
773 /*
774  * Break the last r small pages from the large page list *lppp (with totally n
775  * small pages) and put them into pmem_mpool.
776  */
777 static void
778 lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
779 {
780 	page_t *pp, *pp1;
781 	pgcnt_t i;
782 	pmem_lpg_t *plp;
783 
784 	if (r == 0)
785 		return;
786 	ASSERT(*lppp != NULL && r < pmem_pgcnt);
787 	page_list_break(lppp, &pp, n - r);
788 
789 	/* The residual should reside in the last large page.  */
790 	plp = oplp->pl_prev;
791 	/* IOunlock and hashout the residual pages. */
792 	for (pp1 = pp, i = 0; i < r; i++) {
793 		page_io_unlock(pp1);
794 		page_hashout(pp1, NULL);
795 		/* Mark this page as free. */
796 		BT_SET(plp->pl_bitmap, PFIND(pp1));
797 		pp1 = pp1->p_next;
798 	}
799 	ASSERT(pp1 == pp);
800 	/* Put these residual pages into memory pool. */
801 	mutex_enter(&pmem_mutex);
802 	mpool_append(&pp, r);
803 	mutex_exit(&pmem_mutex);
804 }
805 
806 /* Freeing large pages in lpp and the associated allocation records in plp. */
807 static void
808 lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
809 {
810 	pgcnt_t i, j;
811 	page_t *pp = lpp, *pp1;
812 	pmem_lpg_t *plp1, *plp2;
813 
814 	for (i = 0; i < lpgs; i++) {
815 		for (j = 0; j < pmem_pgcnt; j++) {
816 			/* IO unlock and hashout this small page. */
817 			page_io_unlock(pp);
818 			page_hashout(pp, NULL);
819 			pp1 = pp->p_next;
820 			pp->p_prev = pp->p_next = pp;
821 			pp = pp1;
822 		}
823 		/* Free one large page at one time. */
824 		page_free_pages(lpp);
825 		lpp = pp;
826 	}
827 	/* Free associate pmem large page allocation records. */
828 	for (plp1 = *plpp; *plpp; plp1 = plp2) {
829 		plp2 = plp1->pl_next;
830 		pmem_lpg_free(plpp, plp1);
831 	}
832 }
833 
834 /*
835  * IOlock and hashin all pages in tlist, associate them with vnode *pvnp
836  * and offset starting with *poffp. Update allocation records accordingly at
837  * the same time.
838  */
839 static void
840 tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
841 {
842 	page_t *pp;
843 	pgcnt_t i = 0;
844 	pmem_lpg_t *plp, *last_pl = NULL;
845 
846 	ASSERT(MUTEX_HELD(&pmem_mutex));
847 	for (pp = tlist; i < tpages; i++) {
848 		ASSERT(FROM_LPG(pp));
849 		page_io_lock(pp);
850 		(void) page_hashin(pp, pvnp, *poffp, NULL);
851 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
852 		/* Mark this page as allocated. */
853 		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
854 		*poffp += PAGESIZE;
855 		pp = pp->p_next;
856 	}
857 	ASSERT(pp == tlist);
858 }
859 
860 /*
861  * IOunlock and hashout all pages in tlist, update allocation records
862  * accordingly at the same time.
863  */
864 static void
865 tlist_out(page_t *tlist, pgcnt_t tpages)
866 {
867 	page_t *pp;
868 	pgcnt_t i = 0;
869 	pmem_lpg_t *plp, *last_pl = NULL;
870 
871 	ASSERT(MUTEX_HELD(&pmem_mutex));
872 	for (pp = tlist; i < tpages; i++) {
873 		ASSERT(FROM_LPG(pp));
874 		page_io_unlock(pp);
875 		page_hashout(pp, NULL);
876 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
877 		/* Mark this page as free. */
878 		BT_SET(plp->pl_bitmap, PFIND(pp));
879 		pp = pp->p_next;
880 	}
881 	ASSERT(pp == tlist);
882 }
883