xref: /titanic_51/usr/src/uts/i86pc/os/pmem.c (revision ac4d633f367252125bb35e97c5725d2aa68c1291)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
/*
 * PMEM - Direct mapping of physical memory pages into userland processes
 *
 * Provides functions for allocating physical memory pages directly (i.e.
 * without occupying kernel virtual address space) and exporting them to
 * userland.
 */
34 
35 #include <sys/types.h>
36 #include <sys/mutex.h>
37 #include <sys/sunddi.h>
38 #include <sys/ddidevmap.h>
39 #include <sys/vnode.h>
40 #include <sys/sysmacros.h>
41 #include <sys/project.h>
42 #include <vm/seg_dev.h>
43 #include <sys/pmem.h>
44 #include <vm/hat_i86.h>
45 #include <sys/task.h>
46 #include <sys/sdt.h>
47 
48 /*
49  * The routines in this file allocate memory which will be accessed through
50  * the AGP GART hardware.  The GART is programmed with the PFNs for this
51  * memory, and the only mechanism for removing these entries is by an
52  * explicit process operation (ioctl/close of the driver, or process exit).
53  * As such, the pages need to remain locked to ensure that they won't be
54  * relocated or paged out.
55  *
56  * To prevent these locked pages from getting in the way of page
57  * coalescing, we try to allocate large pages from the system, and carve
58  * them up to satisfy pmem allocation requests.  This will keep the locked
59  * pages within a constrained area of physical memory, limiting the number
60  * of large pages that would be pinned by our locked pages.  This is, of
61  * course, another take on the infamous kernel cage, and it has many of the
62  * downsides of the original cage.  It also interferes with system-wide
63  * resource management decisions, as it maintains its own pool of unused
64  * pages which can't be easily reclaimed and used during low-memory
65  * situations.
66  *
67  * The right solution is for pmem to register a callback that the VM system
68  * could call, which would temporarily remove any GART entries for pages
69  * that were being relocated.  This would let us leave the pages unlocked,
70  * which would remove the need for using large pages, which would simplify
71  * this code a great deal.  Unfortunately, the support for these callbacks
72  * only exists on some SPARC platforms right now.
73  *
74  * Note that this is the *only* reason that large pages are used here.  The
75  * GART can't perform large-page translations, and the code appropriately
76  * falls back to using small pages if page_create_va_large() fails.
77  */
78 
/*
 * Take / drop dhp->dh_lock, but only for handles that have
 * DEVMAP_ALLOW_REMAP set in dh_flags.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly one
 * statement and can safely be used as the body of an if/else, and the
 * argument is parenthesized so that any pointer expression may be passed.
 */
#define	HOLD_DHP_LOCK(dhp)	do { \
	if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
		mutex_enter(&(dhp)->dh_lock); \
} while (0)

#define	RELE_DHP_LOCK(dhp)	do { \
	if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP) \
		mutex_exit(&(dhp)->dh_lock); \
} while (0)

/* Non-zero if pp has a non-zero page size code, i.e. is part of a large page. */
#define	FROM_LPG(pp) ((pp)->p_szc != 0)
/* Index of pp inside its large page (relies on pmem_pgcnt being 2^n). */
#define	PFIND(pp) (page_pptonum(pp) & (pmem_pgcnt - 1))
87 
88 /*
89  * Structs and static variables used for pmem only.
90  */
/*
 * Allocation record for one large page that pmem carves into small pages.
 * Records are linked into circular, doubly-linked lists through
 * pl_next/pl_prev (a freshly allocated record points at itself).
 */
typedef struct pmem_lpg {
	page_t	*pl_pp;		/* first (root) page_t of the large page */
	ulong_t	*pl_bitmap;	/* one bit per small page; bit set == free */
	ushort_t pl_pfree;	/* set while the record sits on a "possibly */
				/* fully freed" list in devmap_pmem_free() */
	struct pmem_lpg *pl_next;
	struct pmem_lpg *pl_prev;
} pmem_lpg_t;
98 
static size_t	pmem_lpgsize;	/* size in bytes of one large page */
static pgcnt_t	pmem_pgcnt;	/* number of small pages in a large page */
static uint_t	pmem_lszc;	/* page size code of the large page */
/* The segment passed to the page creation routines for every pmem page. */
static struct seg	pmem_seg;
/*
 * Allocation records of the large pages that pmem has carved up and not yet
 * freed as a whole.
 */
static pmem_lpg_t *pmem_occ_lpgs;
/* Pool of residual small pages (from large pages) available for reuse. */
static page_t	*pmem_mpool = NULL;
/* Number of small pages currently residing in pmem_mpool. */
static pgcnt_t	pmem_nmpages = 0;
/* Protects pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
kmutex_t	pmem_mutex;
112 
/*
 * Forward declarations of the file-local helpers: large page allocation
 * records (lpg_*, pmem_lpg_*), the small page pool (mpool_*), large page
 * lists (lpp_*), pool page (re)hashing (tlist_*) and cookie/resource setup.
 */
static int lpg_isfree(pmem_lpg_t *);
static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_alloc(uint_t);
static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
static void lpg_free(page_t *spp);
static pgcnt_t mpool_break(page_t **, pgcnt_t);
static void mpool_append(page_t **, pgcnt_t);
static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
    vnode_t *, u_offset_t *, uint_t);
static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
static void tlist_out(page_t *, pgcnt_t);
static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
static int pmem_lock(pgcnt_t, kproject_t **);
130 
131 /*
132  * Called by driver devmap routine to pass physical memory mapping info to
133  * seg_dev framework, used only for physical memory allocated from
134  * devmap_pmem_alloc().
135  */
136 /* ARGSUSED */
137 int
138 devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
139     struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
140     offset_t off, size_t len, uint_t maxprot, uint_t flags,
141     ddi_device_acc_attr_t *accattrp)
142 {
143 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
144 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
145 	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
146 
147 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
148 		return (DDI_FAILURE);
149 
150 	/*
151 	 * First to check if this function has been called for this dhp.
152 	 */
153 	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
154 		return (DDI_FAILURE);
155 
156 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
157 		return (DDI_FAILURE);
158 
159 	/*
160 	 * Check if the cache attributes are supported. Need to pay
161 	 * attention that only uncachable or write-combining is
162 	 * permitted for pmem.
163 	 */
164 	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
165 	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
166 		return (DDI_FAILURE);
167 
168 	if (flags & DEVMAP_MAPPING_INVALID) {
169 		/*
170 		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
171 		 * remap permission.
172 		 */
173 		if (!(flags & DEVMAP_ALLOW_REMAP))
174 			return (DDI_FAILURE);
175 	} else {
176 		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
177 		/* dh_roff is the offset inside the dh_pcookie. */
178 		dhp->dh_roff = ptob(btop(off));
179 		/* Set the cache attributes correctly */
180 		i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
181 	}
182 
183 	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
184 	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
185 	dhp->dh_len = ptob(btopr(len));
186 
187 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
188 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
189 
190 	if (callbackops != NULL) {
191 		bcopy(callbackops, &dhp->dh_callbackops,
192 		    sizeof (struct devmap_callback_ctl));
193 	}
194 
195 	/*
196 	 * Initialize dh_lock if we want to do remap.
197 	 */
198 	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
199 		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
200 		dhp->dh_flags |= DEVMAP_LOCK_INITED;
201 	}
202 
203 	dhp->dh_flags |= DEVMAP_SETUP_DONE;
204 
205 	return (DDI_SUCCESS);
206 }
207 
208 /*
209  * Replace existing mapping using a new cookie, mainly gets called when doing
210  * fork(). Should be called in associated devmap_dup(9E).
211  */
212 /* ARGSUSED */
213 int
214 devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
215     devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
216     uint_t flags, ddi_device_acc_attr_t *accattrp)
217 {
218 	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
219 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
220 	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
221 
222 	/*
223 	 * Reture failure if setup has not been done or no remap permission
224 	 * has been granted during the setup.
225 	 */
226 	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
227 	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
228 		return (DDI_FAILURE);
229 
230 	/* No flags supported for remap yet. */
231 	if (flags != 0)
232 		return (DDI_FAILURE);
233 
234 	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
235 		return (DDI_FAILURE);
236 
237 	if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
238 		return (DDI_FAILURE);
239 
240 	/*
241 	 * Check if the cache attributes are supported. Need to pay
242 	 * attention that only uncachable or write-combining is
243 	 * permitted for pmem.
244 	 */
245 	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
246 	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
247 		return (DDI_FAILURE);
248 
249 	HOLD_DHP_LOCK(dhp);
250 	/*
251 	 * Unload the old mapping of pages reloated with this dhp, so next
252 	 * fault will setup the new mappings. It is in segdev_faultpage that
253 	 * calls hat_devload to establish the mapping. Do this while holding
254 	 * the dhp lock so other faults dont reestablish the mappings.
255 	 */
256 	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
257 	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
258 
259 	/* Set the cache attributes correctly */
260 	i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
261 
262 	dhp->dh_pcookie = cookie;
263 	dhp->dh_roff = ptob(btop(off));
264 	dhp->dh_len = ptob(btopr(len));
265 
266 	/* Clear the large page size flag. */
267 	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
268 
269 	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
270 	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
271 	RELE_DHP_LOCK(dhp);
272 	return (DDI_SUCCESS);
273 }
274 
275 /*
276  * Directly (i.e., without occupying kernel virtual address space) allocate
277  * 'npages' physical memory pages for exporting to user land. The allocated
278  * page_t pointer will be recorded in cookie.
279  */
280 int
281 devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
282 {
283 	u_offset_t	pmem_off = 0;
284 	page_t		*pp = NULL;
285 	page_t		*lpp = NULL;
286 	page_t		*tlist = NULL;
287 	pgcnt_t		i = 0;
288 	pgcnt_t		rpages = 0;
289 	pgcnt_t		lpages = 0;
290 	pgcnt_t		tpages = 0;
291 	pgcnt_t		npages = btopr(size);
292 	pmem_lpg_t	*plp = NULL;
293 	struct devmap_pmem_cookie	*pcp;
294 	uint_t		reserved = 0;
295 	uint_t		locked = 0;
296 	uint_t		pflags, kflags;
297 
298 	*cookiep = NULL;
299 
300 	/*
301 	 * Number larger than this will cause page_create_va() to loop
302 	 * infinitely.
303 	 */
304 	if (npages == 0 || npages >= total_pages / 2)
305 		return (DDI_FAILURE);
306 	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
307 		return (DDI_FAILURE);
308 	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
309 	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;
310 
311 	/* Allocate pmem cookie. */
312 	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
313 		return (DDI_FAILURE);
314 	pcp->dp_npages = npages;
315 
316 	/*
317 	 * See if the requested memory can be locked. Currently we do resource
318 	 * controls on the project levlel only.
319 	 */
320 	if (pmem_lock(npages, &(pcp->dp_projp)) == DDI_FAILURE)
321 		goto alloc_fail;
322 	locked = 1;
323 
324 	/*
325 	 * First, grab as many as possible from pmem_mpool. If pages in
326 	 * pmem_mpool are enough for this request, we are done.
327 	 */
328 	mutex_enter(&pmem_mutex);
329 	tpages = mpool_break(&tlist, npages);
330 	/* IOlock and hashin them into the new offset. */
331 	if (tpages)
332 		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
333 	mutex_exit(&pmem_mutex);
334 
335 	if (tpages == npages)
336 		goto done;
337 
338 	rpages = npages - tpages;
339 	/* Quit now if memory cannot be reserved. */
340 	if (!page_resv(rpages, kflags))
341 		goto alloc_fail;
342 	reserved = 1;
343 
344 	/* Try to allocate large pages first to decrease fragmentation. */
345 	i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
346 	if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
347 	    kflags) == DDI_FAILURE)
348 		goto alloc_fail;
349 	ASSERT(lpages == 0 ? lpp == NULL : 1);
350 
351 	/*
352 	 * Pages in large pages is more than the request, put the residual
353 	 * pages into pmem_mpool.
354 	 */
355 	if (lpages >= rpages) {
356 		lpp_break(&lpp, lpages, lpages - rpages, plp);
357 		goto done;
358 	}
359 
360 	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
361 	i =  rpages - lpages;
362 	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
363 	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
364 		goto alloc_fail;
365 
366 done:
367 	page_list_concat(&tlist, &lpp);
368 	page_list_concat(&tlist, &pp);
369 	/* Set those small pages from large pages as allocated. */
370 	mutex_enter(&pmem_mutex);
371 	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
372 	mutex_exit(&pmem_mutex);
373 
374 	/*
375 	 * Now tlist holds all the pages for this cookie. Record these pages in
376 	 * pmem cookie.
377 	 */
378 	for (pp = tlist, i = 0; i < npages; i++) {
379 		pcp->dp_pparray[i] = pp;
380 		page_io_unlock(pp);
381 		pp = pp->p_next;
382 		page_sub(&tlist, pp->p_prev);
383 	}
384 	ASSERT(tlist == NULL);
385 	*cookiep = (devmap_pmem_cookie_t)pcp;
386 
387 	return (DDI_SUCCESS);
388 
389 alloc_fail:
390 	DTRACE_PROBE(pmem__alloc__fail);
391 	/* Free large pages and the associated allocation records. */
392 	if (lpp)
393 		lpp_free(lpp, lpages / pmem_pgcnt, &plp);
394 	if (reserved == 1)
395 		page_unresv(rpages);
396 	/* Put those pages in tlist back into pmem_mpool. */
397 	if (tpages != 0) {
398 		mutex_enter(&pmem_mutex);
399 		/* IOunlock, hashout and update the allocation records. */
400 		tlist_out(tlist, tpages);
401 		mpool_append(&tlist, tpages);
402 		mutex_exit(&pmem_mutex);
403 	}
404 	if (locked == 1)
405 		i_ddi_decr_locked_memory(NULL, NULL, pcp->dp_projp, NULL,
406 		    ptob(pcp->dp_npages));
407 	/* Freeing pmem_cookie. */
408 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
409 	kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
410 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
411 	return (DDI_FAILURE);
412 }
413 
414 /*
415  * Free all small pages inside cookie, and return pages from large pages into
416  * mpool, if all the pages from one large page is in mpool, free it as a whole.
417  */
418 void
419 devmap_pmem_free(devmap_pmem_cookie_t cookie)
420 {
421 	struct	devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
422 	pgcnt_t		i;
423 	pgcnt_t		tpages = 0;
424 	page_t		*pp;
425 	pmem_lpg_t 	*pl1, *plp;
426 	pmem_lpg_t	*pf_lpgs = NULL;
427 	uint_t		npls = 0;
428 	pmem_lpg_t *last_pl = NULL;
429 	pmem_lpg_t *plast_pl = NULL;
430 
431 	ASSERT(pcp);
432 	mutex_enter(&pmem_mutex);
433 	/* Free small pages and return them to memory pool. */
434 	for (i = pcp->dp_npages; i > 0; i--) {
435 		pp = pcp->dp_pparray[i - 1];
436 		page_hashout(pp, NULL);
437 		/*
438 		 * Remove the mapping of this single page, this mapping is
439 		 * created using hat_devload() in segdev_faultpage().
440 		 */
441 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
442 		if (!FROM_LPG(pp)) {
443 			/* Normal small page. */
444 			page_free(pp, 1);
445 			page_unresv(1);
446 		} else {
447 			/* Small page from large pages. */
448 			plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
449 			if (plp && !(plp->pl_pfree)) {
450 				/*
451 				 * Move this record to pf_lpgs list, this large
452 				 * page may be able to be freed as a whole.
453 				 */
454 				pmem_lpg_sub(&pmem_occ_lpgs, plp);
455 				pmem_lpg_concat(&pf_lpgs, &plp);
456 				plp->pl_pfree = 1;
457 				npls++;
458 				last_pl = NULL;
459 			} else {
460 				/* Search in pf_lpgs list. */
461 				plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
462 			}
463 			ASSERT(plp);
464 			/* Mark this page as free. */
465 			BT_SET(plp->pl_bitmap, PFIND(pp));
466 			/* Record this page in pmem_mpool. */
467 			mpool_append(&pp, 1);
468 		}
469 	}
470 
471 	/*
472 	 * Find out the large pages whose pages have been freed, remove them
473 	 * from plp list, free them and the associated pmem_lpg struct.
474 	 */
475 	for (plp = pf_lpgs; npls != 0; npls--) {
476 		pl1 = plp;
477 		plp = plp->pl_next;
478 		if (lpg_isfree(pl1)) {
479 			/*
480 			 * Get one free large page.  Find all pages in this
481 			 * large page and remove them from pmem_mpool.
482 			 */
483 			lpg_free(pl1->pl_pp);
484 			/* Remove associated allocation records. */
485 			pmem_lpg_sub(&pf_lpgs, pl1);
486 			pmem_lpg_free(&pf_lpgs, pl1);
487 			tpages -= pmem_pgcnt;
488 		} else
489 			pl1->pl_pfree = 0;
490 	}
491 	/* Update allocation records accordingly. */
492 	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
493 	mutex_exit(&pmem_mutex);
494 
495 	i_ddi_decr_locked_memory(NULL, NULL, (kproject_t *)pcp->dp_projp, NULL,
496 	    ptob(pcp->dp_npages));
497 	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
498 	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
499 	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
500 }
501 
502 /*
503  * To extract page frame number from specified range in a cookie.
504  */
505 int
506 devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
507     pfn_t *pfnarray)
508 {
509 	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
510 	pgcnt_t i;
511 
512 	if (pcp == NULL || start + npages > pcp->dp_npages)
513 		return (DDI_FAILURE);
514 
515 	for (i = start; i < start + npages; i++)
516 		pfnarray[i - start] = pcp->dp_pparray[i]->p_pagenum;
517 	return (DDI_SUCCESS);
518 }
519 
520 void
521 pmem_init()
522 {
523 	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
524 	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
525 	pmem_lpgsize = page_get_pagesize(pmem_lszc);
526 	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
527 	bzero(&pmem_seg, sizeof (struct seg));
528 	pmem_seg.s_as = &kas;
529 }
530 
531 /* Allocate kernel memory for one pmem cookie with n pages. */
532 static int
533 pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
534 {
535 	struct devmap_pmem_cookie *pcp;
536 
537 	if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
538 	    kflags)) == NULL)
539 		return (DDI_FAILURE);
540 	pcp = *pcpp;
541 	if ((pcp->dp_vnp =
542 	    kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
543 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
544 		return (DDI_FAILURE);
545 	}
546 	if ((pcp->dp_pparray =
547 	    kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
548 		kmem_free(pcp->dp_vnp, sizeof (vnode_t));
549 		kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
550 		return (DDI_FAILURE);
551 	}
552 	return (DDI_SUCCESS);
553 }
554 
555 /* Try to lock down n pages resource for current project. */
556 static int
557 pmem_lock(pgcnt_t n, kproject_t **prjpp)
558 {
559 	mutex_enter(&curproc->p_lock);
560 	if (i_ddi_incr_locked_memory(curproc, NULL, NULL, NULL,
561 	    ptob(n)) != 0) {
562 		mutex_exit(&curproc->p_lock);
563 		return (DDI_FAILURE);
564 	}
565 	/* Store this project in cookie for later lock/unlock. */
566 	*prjpp = curproc->p_task->tk_proj;
567 	mutex_exit(&curproc->p_lock);
568 	return (DDI_SUCCESS);
569 }
570 
571 /* To check if all the pages in a large page are freed. */
572 static int
573 lpg_isfree(pmem_lpg_t *plp)
574 {
575 	uint_t i;
576 
577 	for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
578 		if (plp->pl_bitmap[i] != BT_ULMAXMASK)
579 			return (0);
580 	/* All 1 means all pages are freed. */
581 	return (1);
582 }
583 
584 /*
585  * Using pp to get the associated large page allocation record, searching in
586  * the splp linked list with *last as the heuristic pointer. Return NULL if
587  * not found.
588  */
589 static pmem_lpg_t *
590 pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
591 {
592 	pmem_lpg_t *plp;
593 	pgcnt_t root_pfn;
594 
595 	ASSERT(pp);
596 	if (splp == NULL)
597 		return (NULL);
598 	root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);
599 
600 	/* Try last winner first. */
601 	if (*last && root_pfn == page_pptonum((*last)->pl_pp))
602 		goto pl_found;
603 
604 	/* Else search the whole pmem_lpg list. */
605 	for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
606 		plp = plp->pl_next;
607 		if (plp == splp) {
608 			plp = NULL;
609 			break;
610 		}
611 		ASSERT(plp->pl_pp);
612 	}
613 
614 	*last = plp;
615 
616 pl_found:
617 	return (*last);
618 }
619 
620 /*
621  *  Remove one pmem_lpg plp from the oplpp list.
622  */
623 static void
624 pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
625 {
626 	if (*oplpp == plp)
627 		*oplpp = plp->pl_next;		/* go to next pmem_lpg */
628 
629 	if (*oplpp == plp)
630 		*oplpp = NULL;			/* pmem_lpg list is gone */
631 	else {
632 		plp->pl_prev->pl_next = plp->pl_next;
633 		plp->pl_next->pl_prev = plp->pl_prev;
634 	}
635 	plp->pl_prev = plp->pl_next = plp;	/* make plp a list of one */
636 }
637 
638 /*
639  * Concatenate page list nplpp onto the end of list plpp.
640  */
641 static void
642 pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
643 {
644 	pmem_lpg_t *s1p, *s2p, *e1p, *e2p;
645 
646 	if (*nplpp == NULL) {
647 		return;
648 	}
649 	if (*plpp == NULL) {
650 		*plpp = *nplpp;
651 		return;
652 	}
653 	s1p = *plpp;
654 	e1p =  s1p->pl_prev;
655 	s2p = *nplpp;
656 	e2p = s2p->pl_prev;
657 	s1p->pl_prev = e2p;
658 	e2p->pl_next = s1p;
659 	e1p->pl_next = s2p;
660 	s2p->pl_prev = e1p;
661 }
662 
663 /*
664  * Allocate and initialize the allocation record of one large page, the init
665  * value is 'allocated'.
666  */
667 static pmem_lpg_t *
668 pmem_lpg_alloc(uint_t kflags)
669 {
670 	pmem_lpg_t *plp;
671 
672 	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
673 	plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
674 	if (plp == NULL)
675 		return (NULL);
676 	plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
677 	if (plp->pl_bitmap == NULL) {
678 		kmem_free(plp, sizeof (*plp));
679 		return (NULL);
680 	}
681 	plp->pl_next = plp->pl_prev = plp;
682 	return (plp);
683 }
684 
685 /* Free one allocation record pointed by oplp. */
686 static void
687 pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
688 {
689 	if (*headp == plp)
690 		*headp = plp->pl_next;		/* go to next pmem_lpg_t */
691 
692 	if (*headp == plp)
693 		*headp = NULL;			/* this list is gone */
694 	else {
695 		plp->pl_prev->pl_next = plp->pl_next;
696 		plp->pl_next->pl_prev = plp->pl_prev;
697 	}
698 	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
699 	kmem_free(plp, sizeof (*plp));
700 }
701 
702 /* Free one large page headed by spp from pmem_mpool. */
703 static void
704 lpg_free(page_t *spp)
705 {
706 	page_t *pp1 = spp;
707 	uint_t i;
708 
709 	ASSERT(MUTEX_HELD(&pmem_mutex));
710 	for (i = 0; i < pmem_pgcnt; i++) {
711 		/* Break pp1 from pmem_mpool. */
712 		page_sub(&pmem_mpool, pp1);
713 		pp1++;
714 	}
715 	/* Free pages in this large page. */
716 	page_free_pages(spp);
717 	page_unresv(pmem_pgcnt);
718 	pmem_nmpages -= pmem_pgcnt;
719 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
720 }
721 
722 /* Put n pages in *ppp list back into pmem_mpool. */
723 static void
724 mpool_append(page_t **ppp, pgcnt_t n)
725 {
726 	ASSERT(MUTEX_HELD(&pmem_mutex));
727 	/* Put back pages. */
728 	page_list_concat(&pmem_mpool, ppp);
729 	pmem_nmpages += n;
730 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
731 }
732 
733 /*
734  * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into *ppp
735  * list, and return the number of grabbed pages.
736  */
737 static pgcnt_t
738 mpool_break(page_t **ppp, pgcnt_t n)
739 {
740 	pgcnt_t i;
741 
742 	ASSERT(MUTEX_HELD(&pmem_mutex));
743 	/* Grab the pages. */
744 	i = MIN(pmem_nmpages, n);
745 	*ppp = pmem_mpool;
746 	page_list_break(ppp, &pmem_mpool, i);
747 	pmem_nmpages -= i;
748 	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
749 	return (i);
750 }
751 
752 /*
753  * Create n large pages, lpages and plpp contains the number of small pages and
754  * allocation records list respectively.
755  */
756 static int
757 lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
758     vnode_t *vnp, u_offset_t *offp, uint_t kflags)
759 {
760 	pgcnt_t i;
761 	pmem_lpg_t *plp;
762 	page_t *pp;
763 
764 	for (i = 0, *lpages = 0; i < n; i++) {
765 		/* Allocte one large page each time. */
766 		pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
767 		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
768 		if (pp == NULL)
769 			break;
770 		*offp += pmem_lpgsize;
771 		page_list_concat(lppp, &pp);
772 		*lpages += pmem_pgcnt;
773 		/* Add one allocation record for this large page. */
774 		if ((plp = pmem_lpg_alloc(kflags)) == NULL)
775 			return (DDI_FAILURE);
776 		plp->pl_pp = pp;
777 		pmem_lpg_concat(plpp, &plp);
778 	}
779 	return (DDI_SUCCESS);
780 }
781 
782 /*
783  * Break the last r small pages from the large page list *lppp (with totally n
784  * small pages) and put them into pmem_mpool.
785  */
786 static void
787 lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
788 {
789 	page_t *pp, *pp1;
790 	pgcnt_t i;
791 	pmem_lpg_t *plp;
792 
793 	if (r == 0)
794 		return;
795 	ASSERT(*lppp != NULL && r < pmem_pgcnt);
796 	page_list_break(lppp, &pp, n - r);
797 
798 	/* The residual should reside in the last large page.  */
799 	plp = oplp->pl_prev;
800 	/* IOunlock and hashout the residual pages. */
801 	for (pp1 = pp, i = 0; i < r; i++) {
802 		page_io_unlock(pp1);
803 		page_hashout(pp1, NULL);
804 		/* Mark this page as free. */
805 		BT_SET(plp->pl_bitmap, PFIND(pp1));
806 		pp1 = pp1->p_next;
807 	}
808 	ASSERT(pp1 == pp);
809 	/* Put these residual pages into memory pool. */
810 	mutex_enter(&pmem_mutex);
811 	mpool_append(&pp, r);
812 	mutex_exit(&pmem_mutex);
813 }
814 
815 /* Freeing large pages in lpp and the associated allocation records in plp. */
816 static void
817 lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
818 {
819 	pgcnt_t i, j;
820 	page_t *pp = lpp, *pp1;
821 	pmem_lpg_t *plp1, *plp2;
822 
823 	for (i = 0; i < lpgs; i++) {
824 		for (j = 0; j < pmem_pgcnt; j++) {
825 			/* IO unlock and hashout this small page. */
826 			page_io_unlock(pp);
827 			page_hashout(pp, NULL);
828 			pp1 = pp->p_next;
829 			pp->p_prev = pp->p_next = pp;
830 			pp = pp1;
831 		}
832 		/* Free one large page at one time. */
833 		page_free_pages(lpp);
834 		lpp = pp;
835 	}
836 	/* Free associate pmem large page allocation records. */
837 	for (plp1 = *plpp; *plpp; plp1 = plp2) {
838 		plp2 = plp1->pl_next;
839 		pmem_lpg_free(plpp, plp1);
840 	}
841 }
842 
843 /*
844  * IOlock and hashin all pages in tlist, associate them with vnode *pvnp
845  * and offset starting with *poffp. Update allocation records accordingly at
846  * the same time.
847  */
848 static void
849 tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
850 {
851 	page_t *pp;
852 	pgcnt_t i = 0;
853 	pmem_lpg_t *plp, *last_pl = NULL;
854 
855 	ASSERT(MUTEX_HELD(&pmem_mutex));
856 	for (pp = tlist; i < tpages; i++) {
857 		ASSERT(FROM_LPG(pp));
858 		page_io_lock(pp);
859 		(void) page_hashin(pp, pvnp, *poffp, NULL);
860 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
861 		/* Mark this page as allocated. */
862 		BT_CLEAR(plp->pl_bitmap, PFIND(pp));
863 		*poffp += PAGESIZE;
864 		pp = pp->p_next;
865 	}
866 	ASSERT(pp == tlist);
867 }
868 
869 /*
870  * IOunlock and hashout all pages in tlist, update allocation records
871  * accordingly at the same time.
872  */
873 static void
874 tlist_out(page_t *tlist, pgcnt_t tpages)
875 {
876 	page_t *pp;
877 	pgcnt_t i = 0;
878 	pmem_lpg_t *plp, *last_pl = NULL;
879 
880 	ASSERT(MUTEX_HELD(&pmem_mutex));
881 	for (pp = tlist; i < tpages; i++) {
882 		ASSERT(FROM_LPG(pp));
883 		page_io_unlock(pp);
884 		page_hashout(pp, NULL);
885 		plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
886 		/* Mark this page as free. */
887 		BT_SET(plp->pl_bitmap, PFIND(pp));
888 		pp = pp->p_next;
889 	}
890 	ASSERT(pp == tlist);
891 }
892