i86pc/os/pmem.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * PMEM - Directly map physical memory pages into userland processes
 *
 * Provides functions for allocating physical memory pages and exporting
 * them to userland directly, i.e. without occupying kernel virtual address
 * space.
 */

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/sunddi.h>
#include <sys/ddidevmap.h>
#include <sys/vnode.h>
#include <sys/sysmacros.h>
#include <sys/project.h>
#include <vm/seg_dev.h>
#include <sys/pmem.h>
#include <vm/hat_i86.h>
#include <sys/task.h>
#include <sys/sdt.h>

/*
 * The routines in this file allocate memory which will be accessed through
 * the AGP GART hardware.  The GART is programmed with the PFNs for this
 * memory, and the only mechanism for removing these entries is by an
 * explicit process operation (ioctl/close of the driver, or process exit).
 * As such, the pages need to remain locked to ensure that they won't be
 * relocated or paged out.
 *
 * To prevent these locked pages from getting in the way of page
 * coalescing, we try to allocate large pages from the system, and carve
 * them up to satisfy pmem allocation requests.  This will keep the locked
 * pages within a constrained area of physical memory, limiting the number
 * of large pages that would be pinned by our locked pages.  This is, of
 * course, another take on the infamous kernel cage, and it has many of the
 * downsides of the original cage.  It also interferes with system-wide
 * resource management decisions, as it maintains its own pool of unused
 * pages which can't be easily reclaimed and used during low-memory
 * situations.
 *
 * The right solution is for pmem to register a callback that the VM system
 * could call, which would temporarily remove any GART entries for pages
 * that were being relocated.  This would let us leave the pages unlocked,
 * which would remove the need for using large pages, which would simplify
 * this code a great deal.  Unfortunately, the support for these callbacks
 * only exists on some SPARC platforms right now.
 *
 * Note that this is the *only* reason that large pages are used here.  The
 * GART can't perform large-page translations, and the code appropriately
 * falls back to using small pages if page_create_va_large() fails.
 */

/*
 * Take/drop dh_lock, but only for handles that were set up with
 * DEVMAP_ALLOW_REMAP: devmap_pmem_setup() initializes dh_lock for those
 * handles only.  The do/while (0) wrapper makes each macro behave as a
 * single statement, so it can be used safely as the body of an if/else.
 */
#define	HOLD_DHP_LOCK(dhp)						\
	do {								\
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP)		\
			mutex_enter(&(dhp)->dh_lock);			\
	} while (0)

#define	RELE_DHP_LOCK(dhp)						\
	do {								\
		if ((dhp)->dh_flags & DEVMAP_ALLOW_REMAP)		\
			mutex_exit(&(dhp)->dh_lock);			\
	} while (0)

/* True if the small page pp is a constituent of a large page. */
#define	FROM_LPG(pp) ((pp)->p_szc != 0)
/*
 * Index of the small page pp inside its large page; this is the bit number
 * used in pl_bitmap.  Assumes pmem_pgcnt is a power of two.
 */
#define	PFIND(pp) (page_pptonum(pp) & (pmem_pgcnt - 1))

/*
 * Structs and static variables used for pmem only.
 */
/*
 * Allocation record for one large page that pmem has carved up into small
 * pages.  Records live on circular, doubly-linked lists; a freshly allocated
 * record is a list of one (see pmem_lpg_alloc()).
 */
typedef struct pmem_lpg {
	page_t	*pl_pp;		/* first small page (page_t) of the large page */
	/*
	 * One bit per small page, indexed by PFIND(): a set bit means the
	 * page is free, a clear bit means it is allocated.
	 */
	ulong_t	*pl_bitmap;	/* allocation status for each page */
	/*
	 * Set by devmap_pmem_free() while the record sits on its private
	 * list of large pages that may turn out to be completely free.
	 */
	ushort_t pl_pfree;	/* this large page might be fully freed */
	struct pmem_lpg *pl_next;	/* next record on the circular list */
	struct pmem_lpg *pl_prev;	/* previous record on the circular list */
} pmem_lpg_t;

/* The three values below are computed once in pmem_init(). */
static size_t	pmem_lpgsize;	/* the size of one large page, in bytes */
static pgcnt_t	pmem_pgcnt;	/* the number of small pages in a large page */
static uint_t	pmem_lszc;	/* page size code of the large page */
/* The segment to be associated with all the allocated pages. */
static struct seg	pmem_seg;
/*
 * Allocation records of the large pages pmem currently owns.  A record stays
 * on this list while any of its small pages are handed out; its bitmap says
 * which of them are free (and therefore in pmem_mpool).
 */
static pmem_lpg_t *pmem_occ_lpgs;
/* Memory pool to store residual small pages from large pages. */
static page_t	*pmem_mpool = NULL;
/* Number of small pages currently residing in pmem_mpool. */
static pgcnt_t	pmem_nmpages = 0;
/* Protects pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
kmutex_t	pmem_mutex;

/*
 * Forward declarations of the file-local helpers.
 *
 * pmem_lpg_*	manage lists of large-page allocation records
 * lpg_*	operate on a single large page
 * mpool_*	move small pages into and out of pmem_mpool
 * lpp_*	create, split and free lists of large pages
 * tlist_*	attach/detach pool pages to/from a cookie's vnode
 */
static int lpg_isfree(pmem_lpg_t *);
static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
static pmem_lpg_t *pmem_lpg_alloc(uint_t);
static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
static void lpg_free(page_t *spp);
static pgcnt_t mpool_break(page_t **, pgcnt_t);
static void mpool_append(page_t **, pgcnt_t);
static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
    vnode_t *, u_offset_t *, uint_t);
static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
static void tlist_out(page_t *, pgcnt_t);
static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
static int pmem_lock(pgcnt_t, kproject_t **);

/*
 * Called by a driver's devmap routine to pass physical memory mapping info to
 * the seg_dev framework.  Used only for physical memory allocated by
 * devmap_pmem_alloc().
 *
 *	dhc		devmap handle to set up
 *	dip		unused
 *	callbackops	optional callbacks; copied into the handle if non-NULL
 *	cookie		pmem cookie returned by devmap_pmem_alloc()
 *	off, len	byte range inside the cookie to be exported
 *	maxprot		maximum protection; may not exclude any bit of dh_prot
 *	flags		DEVMAP_* setup flags plus a cache attribute, which
 *			must be uncached or write-combining
 *	accattrp	unused
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE when the cookie is NULL, the range does
 * not lie inside the cookie, the handle has already been set up, the
 * protections or cache attributes are unacceptable, or
 * DEVMAP_MAPPING_INVALID is requested without DEVMAP_ALLOW_REMAP.
 */
/* ARGSUSED */
int
devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
    struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
    offset_t off, size_t len, uint_t maxprot, uint_t flags,
    ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
	size_t cookie_len;

	/*
	 * off is signed: a negative value must be rejected explicitly,
	 * otherwise it is converted to a huge unsigned number and off + len
	 * can wrap around to something that looks valid.
	 */
	if (pcp == NULL || off < 0)
		return (DDI_FAILURE);

	/* [off, off + len) must lie inside the cookie; checked w/o overflow. */
	cookie_len = ptob(pcp->dp_npages);
	if (len > cookie_len || (u_offset_t)off > cookie_len - len)
		return (DDI_FAILURE);

	/*
	 * First to check if this function has been called for this dhp.
	 */
	if (dhp->dh_flags & DEVMAP_SETUP_DONE)
		return (DDI_FAILURE);

	/* maxprot may not take away any protection the mapping already has. */
	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported. Need to pay
	 * attention that only uncachable or write-combining is
	 * permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	if (flags & DEVMAP_MAPPING_INVALID) {
		/*
		 * If DEVMAP_MAPPING_INVALID is specified, we have to grant
		 * remap permission.
		 */
		if (!(flags & DEVMAP_ALLOW_REMAP))
			return (DDI_FAILURE);
	} else {
		dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
		/* dh_roff is the offset inside the dh_pcookie. */
		dhp->dh_roff = ptob(btop(off));
		/* Set the cache attributes correctly */
		i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
	}

	dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
	dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
	dhp->dh_len = ptob(btopr(len));

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);

	if (callbackops != NULL) {
		bcopy(callbackops, &dhp->dh_callbackops,
		    sizeof (struct devmap_callback_ctl));
	}

	/*
	 * Initialize dh_lock if we want to do remap.
	 */
	if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
		mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
		dhp->dh_flags |= DEVMAP_LOCK_INITED;
	}

	dhp->dh_flags |= DEVMAP_SETUP_DONE;

	return (DDI_SUCCESS);
}

/*
 * Replace the existing mapping of a handle with a new cookie; mainly called
 * when doing fork().  Should be called from the associated devmap_dup(9E).
 *
 *	dhc		devmap handle; must have been set up with
 *			DEVMAP_ALLOW_REMAP
 *	dip		unused
 *	cookie		new pmem cookie to map
 *	off, len	byte range inside the new cookie
 *	maxprot		maximum protection; may not exclude any bit of dh_prot
 *	flags		cache attribute only (uncached or write-combining);
 *			no other flag is supported for remap
 *	accattrp	unused
 *
 * Returns DDI_SUCCESS once the old translations have been unloaded and the
 * handle points at the new cookie, DDI_FAILURE otherwise.
 */
/* ARGSUSED */
int
devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
    devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
    uint_t flags, ddi_device_acc_attr_t *accattrp)
{
	devmap_handle_t *dhp = (devmap_handle_t *)dhc;
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
	size_t cookie_len;

	/*
	 * Return failure if setup has not been done or no remap permission
	 * has been granted during the setup.
	 */
	if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
	    (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
		return (DDI_FAILURE);

	/*
	 * No flags other than the cache attribute are supported for remap
	 * yet.  The cache attribute bits have to be let through: the check
	 * further down insists on an uncached or write-combining attribute,
	 * so demanding flags == 0 here would make every remap fail.
	 * NOTE(review): assumes IOMEM_CACHE_ATTR() yields a subset of the
	 * bits of its argument -- confirm against its definition.
	 */
	if ((flags & ~cache_attr) != 0)
		return (DDI_FAILURE);

	/* maxprot may not take away any protection the mapping already has. */
	if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
		return (DDI_FAILURE);

	/*
	 * off is signed: reject negative values explicitly and check that
	 * [off, off + len) lies inside the cookie without letting the sum
	 * wrap around.
	 */
	if (pcp == NULL || off < 0)
		return (DDI_FAILURE);
	cookie_len = ptob(pcp->dp_npages);
	if (len > cookie_len || (u_offset_t)off > cookie_len - len)
		return (DDI_FAILURE);

	/*
	 * Check if the cache attributes are supported. Need to pay
	 * attention that only uncachable or write-combining is
	 * permitted for pmem.
	 */
	if (i_ddi_check_cache_attr(flags) == B_FALSE ||
	    (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
		return (DDI_FAILURE);

	HOLD_DHP_LOCK(dhp);
	/*
	 * Unload the old mapping of pages related to this dhp, so the next
	 * fault will set up the new mappings.  It is segdev_faultpage() that
	 * calls hat_devload() to establish the mapping.  Do this while
	 * holding the dhp lock so other faults don't reestablish the
	 * mappings.
	 */
	hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
	    dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);

	/* Set the cache attributes correctly */
	i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);

	dhp->dh_pcookie = cookie;
	dhp->dh_roff = ptob(btop(off));
	dhp->dh_len = ptob(btopr(len));

	/* Clear the large page size flag. */
	dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;

	dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
	ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
	RELE_DHP_LOCK(dhp);
	return (DDI_SUCCESS);
}

/*
 * Directly (i.e., without occupying kernel virtual address space) allocate
 * enough physical memory pages to cover 'size' bytes for exporting to user
 * land.  The page_t pointers of the allocated pages are recorded in a newly
 * allocated cookie, which is returned through *cookiep.
 *
 *	size	number of bytes wanted; rounded up to whole pages
 *	flags	must contain PMEM_SLEEP or PMEM_NOSLEEP
 *	cookiep	set to the new cookie on success, to NULL on failure
 *
 * Pages are taken from three sources, in this order: the pool of residual
 * small pages (pmem_mpool), newly created large pages, and finally newly
 * created small pages.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.  On failure everything acquired so far
 * (pages, page reservation, locked-memory charge, cookie) is released again.
 *
 * NOTE(review): when large pages leave a residue, lpp_break() puts the
 * residual pages into pmem_mpool and drops pmem_mutex before the records in
 * 'plp' are added to pmem_occ_lpgs at 'done:'.  A concurrent allocation that
 * picks up those pages in that window would look up their record in
 * tlist_in(), get NULL from pmem_lpg_get() and dereference it.  This looks
 * like a race -- confirm.
 */
int
devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
{
	u_offset_t	pmem_off = 0;
	page_t		*pp = NULL;
	page_t		*lpp = NULL;
	page_t		*tlist = NULL;
	pgcnt_t		i = 0;
	pgcnt_t		rpages = 0;
	pgcnt_t		lpages = 0;
	pgcnt_t		tpages = 0;
	pgcnt_t		npages = btopr(size);
	pmem_lpg_t	*plp = NULL;
	struct devmap_pmem_cookie	*pcp;
	uint_t		reserved = 0;
	uint_t		locked = 0;
	uint_t		pflags, kflags;

	*cookiep = NULL;

	/*
	 * Number larger than this will cause page_create_va() to loop
	 * infinitely.
	 */
	if (npages == 0 || npages >= total_pages / 2)
		return (DDI_FAILURE);
	if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
		return (DDI_FAILURE);
	/* Translate the pmem flag into page-creation and kmem flags. */
	pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
	kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;

	/* Allocate pmem cookie. */
	if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
		return (DDI_FAILURE);
	pcp->dp_npages = npages;

	/*
	 * See if the requested memory can be locked. Currently we do resource
	 * controls on the project level only.
	 */
	if (pmem_lock(npages, &(pcp->dp_projp)) == DDI_FAILURE)
		goto alloc_fail;
	locked = 1;

	/*
	 * First, grab as many as possible from pmem_mpool. If pages in
	 * pmem_mpool are enough for this request, we are done.
	 */
	mutex_enter(&pmem_mutex);
	tpages = mpool_break(&tlist, npages);
	/* IOlock and hashin them into the new offset. */
	if (tpages)
		tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
	mutex_exit(&pmem_mutex);

	if (tpages == npages)
		goto done;

	/* rpages is what still has to be newly created (and reserved). */
	rpages = npages - tpages;
	/* Quit now if memory cannot be reserved. */
	if (!page_resv(rpages, kflags))
		goto alloc_fail;
	reserved = 1;

	/* Try to allocate large pages first to decrease fragmentation. */
	i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
	if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
	    kflags) == DDI_FAILURE)
		goto alloc_fail;
	ASSERT(lpages == 0 ? lpp == NULL : 1);

	/*
	 * Pages in large pages is more than the request, put the residual
	 * pages into pmem_mpool.
	 */
	if (lpages >= rpages) {
		lpp_break(&lpp, lpages, lpages - rpages, plp);
		goto done;
	}

	/* Allocate small pages if lpp+tlist cannot satisfy the request. */
	i =  rpages - lpages;
	if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
	    pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
		goto alloc_fail;

done:
	/* Gather pool pages, large-page pages and small pages in tlist. */
	page_list_concat(&tlist, &lpp);
	page_list_concat(&tlist, &pp);
	/* Set those small pages from large pages as allocated. */
	mutex_enter(&pmem_mutex);
	pmem_lpg_concat(&pmem_occ_lpgs, &plp);
	mutex_exit(&pmem_mutex);

	/*
	 * Now tlist holds all the pages for this cookie. Record these pages in
	 * pmem cookie.
	 */
	for (pp = tlist, i = 0; i < npages; i++) {
		pcp->dp_pparray[i] = pp;
		page_io_unlock(pp);
		/* Step to the next page, then unlink the one just recorded. */
		pp = pp->p_next;
		page_sub(&tlist, pp->p_prev);
	}
	ASSERT(tlist == NULL);
	*cookiep = (devmap_pmem_cookie_t)pcp;

	return (DDI_SUCCESS);

alloc_fail:
	DTRACE_PROBE(pmem__alloc__fail);
	/* Free large pages and the associated allocation records. */
	if (lpp)
		lpp_free(lpp, lpages / pmem_pgcnt, &plp);
	if (reserved == 1)
		page_unresv(rpages);
	/* Put those pages in tlist back into pmem_mpool. */
	if (tpages != 0) {
		mutex_enter(&pmem_mutex);
		/* IOunlock, hashout and update the allocation records. */
		tlist_out(tlist, tpages);
		mpool_append(&tlist, tpages);
		mutex_exit(&pmem_mutex);
	}
	/* Give back the locked-memory charge taken by pmem_lock(). */
	if (locked == 1)
		i_ddi_decr_locked_memory(NULL, NULL, pcp->dp_projp, NULL,
		    ptob(pcp->dp_npages));
	/* Freeing pmem_cookie. */
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
	return (DDI_FAILURE);
}

/*
 * Release everything held by a cookie from devmap_pmem_alloc().
 *
 * Ordinary small pages are freed immediately.  Small pages that belong to a
 * large page are returned to pmem_mpool; if that leaves every page of a large
 * page in the pool, the large page is freed as a whole together with its
 * allocation record.  Finally the locked-memory charge is given back and the
 * cookie itself is freed.
 *
 * A NULL cookie trips the ASSERT on DEBUG kernels and is ignored otherwise.
 */
void
devmap_pmem_free(devmap_pmem_cookie_t cookie)
{
	struct	devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t		i;
	page_t		*pp;
	pmem_lpg_t	*pl1, *plp;
	pmem_lpg_t	*pf_lpgs = NULL;	/* large pages that may be free */
	uint_t		npls = 0;		/* number of records in pf_lpgs */
	pmem_lpg_t	*last_pl = NULL;	/* lookup hint for pmem_occ_lpgs */
	pmem_lpg_t	*plast_pl = NULL;	/* lookup hint for pf_lpgs */

	ASSERT(pcp);
	if (pcp == NULL)
		return;

	mutex_enter(&pmem_mutex);
	/* Free small pages and return them to memory pool. */
	for (i = pcp->dp_npages; i > 0; i--) {
		pp = pcp->dp_pparray[i - 1];
		page_hashout(pp, NULL);
		/*
		 * Remove the mapping of this single page, this mapping is
		 * created using hat_devload() in segdev_faultpage().
		 */
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
		if (!FROM_LPG(pp)) {
			/* Normal small page. */
			page_free(pp, 1);
			page_unresv(1);
		} else {
			/* Small page from large pages. */
			plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
			if (plp && !(plp->pl_pfree)) {
				/*
				 * Move this record to pf_lpgs list, this large
				 * page may be able to be freed as a whole.
				 */
				pmem_lpg_sub(&pmem_occ_lpgs, plp);
				pmem_lpg_concat(&pf_lpgs, &plp);
				plp->pl_pfree = 1;
				npls++;
				/* The hint no longer points into the list. */
				last_pl = NULL;
			} else {
				/* Search in pf_lpgs list. */
				plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
			}
			ASSERT(plp);
			/* Mark this page as free. */
			BT_SET(plp->pl_bitmap, PFIND(pp));
			/* Record this page in pmem_mpool. */
			mpool_append(&pp, 1);
		}
	}

	/*
	 * Find out the large pages whose pages have been freed, remove them
	 * from plp list, free them and the associated pmem_lpg struct.
	 */
	for (plp = pf_lpgs; npls != 0; npls--) {
		/* Advance first: pl1 may be freed below. */
		pl1 = plp;
		plp = plp->pl_next;
		if (lpg_isfree(pl1)) {
			/*
			 * Get one free large page.  Find all pages in this
			 * large page and remove them from pmem_mpool.
			 */
			lpg_free(pl1->pl_pp);
			/*
			 * Remove the associated allocation record;
			 * pmem_lpg_free() unlinks it from pf_lpgs itself.
			 */
			pmem_lpg_free(&pf_lpgs, pl1);
		} else
			pl1->pl_pfree = 0;
	}
	/* Whatever is left is still in use: put it back on the main list. */
	pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
	mutex_exit(&pmem_mutex);

	i_ddi_decr_locked_memory(NULL, NULL, (kproject_t *)pcp->dp_projp, NULL,
	    ptob(pcp->dp_npages));
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
	kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
}

/*
 * Extract the page frame numbers of 'npages' pages of a cookie, beginning
 * with page index 'start', into pfnarray[0 .. npages - 1].
 *
 * Returns DDI_FAILURE if the cookie is NULL or the requested range is not
 * completely inside the cookie, DDI_SUCCESS otherwise.
 */
int
devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
    pfn_t *pfnarray)
{
	struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
	pgcnt_t i;

	/*
	 * Written as two comparisons so that a huge npages cannot make
	 * start + npages wrap around and slip past the bounds check.
	 */
	if (pcp == NULL || start > pcp->dp_npages ||
	    npages > pcp->dp_npages - start)
		return (DDI_FAILURE);

	for (i = 0; i < npages; i++)
		pfnarray[i] = pcp->dp_pparray[start + i]->p_pagenum;
	return (DDI_SUCCESS);
}

/*
 * One-time initialization of the pmem state: set up pmem_mutex, choose the
 * large page size used for carving (page size code 1 when the platform has
 * more than one page size, otherwise 0) and derive its size in bytes and in
 * small pages, and prepare the segment that all pmem pages are associated
 * with, which belongs to the kernel address space.
 */
void
pmem_init(void)
{
	mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
	pmem_lszc = MIN(1, page_num_pagesizes() - 1);
	pmem_lpgsize = page_get_pagesize(pmem_lszc);
	pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
	bzero(&pmem_seg, sizeof (struct seg));
	pmem_seg.s_as = &kas;
}

/*
 * Allocate kernel memory for one pmem cookie with room for n pages: the
 * cookie itself, its private vnode and its array of n page pointers, all
 * zero-filled.
 *
 * On success the cookie is returned through *pcpp and DDI_SUCCESS is
 * returned.  On failure nothing stays allocated, *pcpp is NULL (never a
 * pointer to freed memory) and DDI_FAILURE is returned.
 */
static int
pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
{
	struct devmap_pmem_cookie *pcp;

	*pcpp = NULL;

	pcp = kmem_zalloc(sizeof (struct devmap_pmem_cookie), kflags);
	if (pcp == NULL)
		return (DDI_FAILURE);

	pcp->dp_vnp = kmem_zalloc(sizeof (vnode_t), kflags);
	if (pcp->dp_vnp == NULL)
		goto fail_cookie;

	pcp->dp_pparray = kmem_zalloc(n * sizeof (page_t *), kflags);
	if (pcp->dp_pparray == NULL)
		goto fail_vnode;

	/* Publish the cookie only once it is complete. */
	*pcpp = pcp;
	return (DDI_SUCCESS);

fail_vnode:
	kmem_free(pcp->dp_vnp, sizeof (vnode_t));
fail_cookie:
	kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
	return (DDI_FAILURE);
}

/*
 * Charge n pages of locked memory to the current process.  On success the
 * project of the current task is stored in *prjpp, so that the charge can be
 * undone later, and DDI_SUCCESS is returned.  On failure *prjpp is left
 * untouched and DDI_FAILURE is returned.
 */
static int
pmem_lock(pgcnt_t n, kproject_t **prjpp)
{
	int rv = DDI_FAILURE;

	mutex_enter(&curproc->p_lock);
	if (i_ddi_incr_locked_memory(curproc, NULL, NULL, NULL,
	    ptob(n)) == 0) {
		/* Remember the project for the later unlock. */
		*prjpp = curproc->p_task->tk_proj;
		rv = DDI_SUCCESS;
	}
	mutex_exit(&curproc->p_lock);

	return (rv);
}

/*
 * Return 1 if every small page of the large page described by plp is marked
 * free, i.e. every word of its bitmap has all bits set; return 0 otherwise.
 */
static int
lpg_isfree(pmem_lpg_t *plp)
{
	ulong_t *wordp = plp->pl_bitmap;
	ulong_t *endp = wordp + BT_BITOUL(pmem_pgcnt);

	while (wordp < endp) {
		/* Any clear bit is a page that is still allocated. */
		if (*wordp != BT_ULMAXMASK)
			return (0);
		wordp++;
	}
	return (1);
}

/*
 * Find the allocation record of the large page that contains pp on the
 * circular list headed by splp.
 *
 * *last is a lookup hint: it is tried before the list is walked, and it is
 * updated with the outcome of every walk (NULL when nothing was found).  An
 * empty list returns NULL without touching the hint.
 */
static pmem_lpg_t *
pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
{
	pmem_lpg_t *cur;
	pgcnt_t base_pfn;

	ASSERT(pp);
	if (splp == NULL)
		return (NULL);

	/* PFN of the first small page of the enclosing large page. */
	base_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);

	/* The previous winner is often the right one again. */
	if (*last != NULL && page_pptonum((*last)->pl_pp) == base_pfn)
		return (*last);

	/* Otherwise walk the ring until we hit it or come back around. */
	cur = splp;
	while (page_pptonum(cur->pl_pp) != base_pfn) {
		cur = cur->pl_next;
		if (cur == splp) {
			*last = NULL;
			return (NULL);
		}
		ASSERT(cur->pl_pp);
	}

	*last = cur;
	return (cur);
}

/*
 * Unlink the record plp from the circular list headed by *oplpp and leave
 * plp as a list of one.  The head moves on to the next record if plp was the
 * head, and becomes NULL if plp was the only record.
 */
static void
pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
{
	pmem_lpg_t *succ = plp->pl_next;
	pmem_lpg_t *pred = plp->pl_prev;

	if (*oplpp == plp) {
		/* A record that is its own successor is the last one. */
		*oplpp = (succ == plp) ? NULL : succ;
	}

	if (succ != plp) {
		/* Close the ring around the departing record. */
		pred->pl_next = succ;
		succ->pl_prev = pred;
	}

	/* make plp a list of one */
	plp->pl_next = plp;
	plp->pl_prev = plp;
}

/*
 * Append the circular record list *nplpp to the end of the circular record
 * list *plpp.  If *plpp is empty it simply takes over *nplpp; an empty
 * *nplpp is a no-op.  *nplpp itself is not modified.
 */
static void
pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
{
	pmem_lpg_t *head, *tail, *add_head, *add_tail;

	if (*nplpp == NULL)
		return;

	if (*plpp == NULL) {
		*plpp = *nplpp;
		return;
	}

	head = *plpp;
	tail = head->pl_prev;
	add_head = *nplpp;
	add_tail = add_head->pl_prev;

	/* Splice the added ring in between the old tail and the head. */
	tail->pl_next = add_head;
	add_head->pl_prev = tail;
	add_tail->pl_next = head;
	head->pl_prev = add_tail;
}

/*
 * Allocate the allocation record for one large page.  The bitmap comes back
 * zeroed, which means that every small page starts out as 'allocated'.  The
 * new record is a list of one.  Returns NULL if either allocation fails.
 */
static pmem_lpg_t *
pmem_lpg_alloc(uint_t kflags)
{
	pmem_lpg_t *rec;

	ASSERT(pmem_pgcnt % BT_NBIPUL == 0);

	if ((rec = kmem_zalloc(sizeof (pmem_lpg_t), kflags)) == NULL)
		return (NULL);

	rec->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
	if (rec->pl_bitmap != NULL) {
		rec->pl_prev = rec;
		rec->pl_next = rec;
		return (rec);
	}

	/* No bitmap: the record is of no use on its own. */
	kmem_free(rec, sizeof (*rec));
	return (NULL);
}

/*
 * Unlink the allocation record plp from the circular list headed by *headp
 * and free it together with its bitmap.  The head moves on to the next
 * record if plp was the head, and becomes NULL if plp was the only record.
 */
static void
pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
{
	pmem_lpg_t *succ = plp->pl_next;
	pmem_lpg_t *pred = plp->pl_prev;

	if (*headp == plp) {
		/* A record that is its own successor is the last one. */
		*headp = (succ == plp) ? NULL : succ;
	}

	if (succ != plp) {
		/* Close the ring around the departing record. */
		pred->pl_next = succ;
		succ->pl_prev = pred;
	}

	kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
	kmem_free(plp, sizeof (*plp));
}

/*
 * Free one large page whose small pages all sit in pmem_mpool.  spp is the
 * first small page; the remaining pmem_pgcnt - 1 page_t's follow it
 * directly.  The pages are taken out of the pool, freed as one large page
 * and unreserved.  Caller must hold pmem_mutex.
 */
static void
lpg_free(page_t *spp)
{
	page_t *cur;
	page_t *end = spp + pmem_pgcnt;

	ASSERT(MUTEX_HELD(&pmem_mutex));

	/* Pull every constituent page out of the pool. */
	for (cur = spp; cur != end; cur++)
		page_sub(&pmem_mpool, cur);

	/* Give the large page back to the system. */
	page_free_pages(spp);
	page_unresv(pmem_pgcnt);

	pmem_nmpages -= pmem_pgcnt;
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/*
 * Return the n pages on the list *ppp to pmem_mpool and account for them in
 * pmem_nmpages.  Caller must hold pmem_mutex.
 */
static void
mpool_append(page_t **ppp, pgcnt_t n)
{
	ASSERT(MUTEX_HELD(&pmem_mutex));

	pmem_nmpages += n;
	page_list_concat(&pmem_mpool, ppp);

	/* The counter and the list must agree on whether the pool is empty. */
	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
}

/*
 * Take up to n pages out of pmem_mpool -- fewer if the pool does not hold
 * that many -- and hand them back as the list *ppp.  Returns the number of
 * pages actually taken.  Caller must hold pmem_mutex.
 */
static pgcnt_t
mpool_break(page_t **ppp, pgcnt_t n)
{
	pgcnt_t taken;

	ASSERT(MUTEX_HELD(&pmem_mutex));

	taken = (pmem_nmpages < n) ? pmem_nmpages : n;

	/* Split the first 'taken' pages off the front of the pool. */
	*ppp = pmem_mpool;
	page_list_break(ppp, &pmem_mpool, taken);
	pmem_nmpages -= taken;

	ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
	return (taken);
}

/*
 * Try to create n large pages for vnode vnp, at consecutive offsets starting
 * with *offp, which is advanced past every page created.  The pages are
 * appended to *lppp, one allocation record per page is appended to *plpp,
 * and *lpages is set to the number of small pages obtained.
 *
 * Running out of large pages is not an error: the loop just stops early and
 * DDI_SUCCESS is returned with a smaller *lpages.  DDI_FAILURE is returned
 * only when an allocation record cannot be allocated; the large page created
 * last is then already on *lppp and counted in *lpages but has no record.
 */
static int
lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
    vnode_t *vnp, u_offset_t *offp, uint_t kflags)
{
	pgcnt_t made = 0;
	pmem_lpg_t *rec;
	page_t *lpg;

	*lpages = 0;
	while (made < n) {
		/* One large page per iteration. */
		lpg = page_create_va_large(vnp, *offp, pmem_lpgsize,
		    PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
		if (lpg == NULL)
			break;

		*offp += pmem_lpgsize;
		page_list_concat(lppp, &lpg);
		*lpages += pmem_pgcnt;

		/* Every large page gets its own allocation record. */
		rec = pmem_lpg_alloc(kflags);
		if (rec == NULL)
			return (DDI_FAILURE);
		rec->pl_pp = lpg;
		pmem_lpg_concat(plpp, &rec);

		made++;
	}
	return (DDI_SUCCESS);
}

/*
 * Split the last r small pages off the large page list *lppp, which holds n
 * small pages in total, and put them into pmem_mpool.  The residual pages
 * are IO-unlocked, hashed out and marked free in the record of the last
 * large page, which is the record preceding oplp on its list.  Nothing
 * happens when r is 0.
 */
static void
lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
{
	page_t *resid, *cur;
	pmem_lpg_t *lastrec;
	pgcnt_t left;

	if (r == 0)
		return;
	ASSERT(*lppp != NULL && r < pmem_pgcnt);

	/* Keep the first n - r pages in *lppp, the rest goes to resid. */
	page_list_break(lppp, &resid, n - r);

	/* The residual should reside in the last large page.  */
	lastrec = oplp->pl_prev;

	cur = resid;
	for (left = r; left != 0; left--) {
		page_io_unlock(cur);
		page_hashout(cur, NULL);
		/* Mark this page as free. */
		BT_SET(lastrec->pl_bitmap, PFIND(cur));
		cur = cur->p_next;
	}
	/* We must have gone exactly once around the residual list. */
	ASSERT(cur == resid);

	/* Put these residual pages into memory pool. */
	mutex_enter(&pmem_mutex);
	mpool_append(&resid, r);
	mutex_exit(&pmem_mutex);
}

/*
 * Free the lpgs large pages on the list lpp and every allocation record on
 * the list *plpp, which is empty (NULL) afterwards.  Each small page is
 * IO-unlocked, hashed out and turned into a list of one before its large
 * page is freed.
 */
static void
lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
{
	page_t *first = lpp;	/* first small page of the current large page */
	page_t *cur = lpp;
	page_t *next;
	pgcnt_t lg, sm;

	for (lg = lpgs; lg != 0; lg--) {
		for (sm = pmem_pgcnt; sm != 0; sm--) {
			/* IO unlock and hashout this small page. */
			page_io_unlock(cur);
			page_hashout(cur, NULL);
			next = cur->p_next;
			cur->p_next = cur;
			cur->p_prev = cur;
			cur = next;
		}
		/* Free one large page at one time. */
		page_free_pages(first);
		first = cur;
	}

	/* pmem_lpg_free() advances the head, so keep freeing the head. */
	while (*plpp != NULL)
		pmem_lpg_free(plpp, *plpp);
}

/*
 * Take the tpages pages on tlist, which came out of pmem_mpool, into use:
 * IO-lock each page, hash it in on vnode pvnp at offset *poffp (advanced by
 * PAGESIZE per page) and mark it allocated in the record of its large page.
 * Caller must hold pmem_mutex.
 */
static void
tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
{
	page_t *cur = tlist;
	pgcnt_t left;
	pmem_lpg_t *rec, *hint = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (left = tpages; left != 0; left--) {
		ASSERT(FROM_LPG(cur));
		page_io_lock(cur);
		(void) page_hashin(cur, pvnp, *poffp, NULL);
		rec = pmem_lpg_get(pmem_occ_lpgs, cur, &hint);
		/* Mark this page as allocated. */
		BT_CLEAR(rec->pl_bitmap, PFIND(cur));
		*poffp += PAGESIZE;
		cur = cur->p_next;
	}
	/* We must have gone exactly once around the list. */
	ASSERT(cur == tlist);
}

/*
 * Undo tlist_in() for the tpages pages on tlist: IO-unlock each page, hash
 * it out and mark it free again in the record of its large page.  Caller
 * must hold pmem_mutex.
 */
static void
tlist_out(page_t *tlist, pgcnt_t tpages)
{
	page_t *cur = tlist;
	pgcnt_t left;
	pmem_lpg_t *rec, *hint = NULL;

	ASSERT(MUTEX_HELD(&pmem_mutex));
	for (left = tpages; left != 0; left--) {
		ASSERT(FROM_LPG(cur));
		page_io_unlock(cur);
		page_hashout(cur, NULL);
		rec = pmem_lpg_get(pmem_occ_lpgs, cur, &hint);
		/* Mark this page as free. */
		BT_SET(rec->pl_bitmap, PFIND(cur));
		cur = cur->p_next;
	}
	/* We must have gone exactly once around the list. */
	ASSERT(cur == tlist);
}