xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_reservoir.c (revision fdad6fbf87b201fdb96a704fc41fa8be1e4efbc8)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2023 Oxide Computer Company
15  */
16 
17 /*
18  * VMM Memory Reservoir
19  *
20  *
21  * In order to make the allocation of large (multi-GiB) chunks of memory
22  * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
23  * operators can set aside a substantial portion of system memory exclusively
24  * for VMs.  This memory is unavailable for general use by the rest of the
25  * system.  Rather than having to scour the freelist, reap kmem caches, or put
26  * pressure on the ARC, bhyve guest memory allocations can quickly determine if
27  * there is adequate reservoir memory available.  Since the pages stored in the
28  * reservoir are pre-zeroed, it can be immediately used when allocated to a
29  * guest.  When the memory is returned to the reservoir, it is zeroed once more
30  * to avoid leaking any sensitive data from that guest.
31  *
32  *
33  * Transient Allocations
34  *
35  * While the explicit reservoir model may work well for some applications,
36  * others may want a more traditional model, where pages for guest memory
37  * objects are allocated on demand, rather than from a pool set aside from the
38  * system.  In this case, the allocation can be made in "transient" mode, where
39  * the memory is allocated normally, even if there is free capacity in the
40  * reservoir.  When use of the transient allocation is complete (the guest is
41  * halted and destroyed), the pages will be freed back to the system, rather
42  * than added back to the reservoir.
43  *
44  * From an implementation standpoint, transient allocations follow the same
45  * code paths as ones using the reservoir normally.  Those allocations have a
46  * tag which marks them as transient, and used/free size tallies are maintained
47  * separately for normal and transient operations.  When performing a transient
48  * allocation, that amount of memory is immediately added to the reservoir,
49  * from which the allocation can be made.  When freeing a transient allocation,
50  * a matching amount of memory is removed from the reservoir as part of the
51  * operation.  This allows both allocation types to coexist without too much
52  * additional machinery.
53  *
54  *
55  * Administration
56  *
57  * Operators may attempt to alter the amount of memory allocated to the
58  * reservoir via an ioctl against the vmmctl device (sketched below).  The
59  * total amount of memory in the reservoir (free, or allocated to VMs) is
60  * limited by `vmmr_total_limit` (see its definition for how it is calculated).
61  *
62  * The limit is in place to prevent the reservoir from inadvertently growing
63  * to a size where the system has inadequate memory to make forward progress.
64  * Shrinking the reservoir is only possible when it contains free (not
65  * allocated by any guest VMs) memory.
66  *
67  *
68  * Page Tracking
69  *
70  * The reservoir currently uses vnode association to keep track of pages under
71  * its control (either designated to the reservoir and free, or allocated to a
72  * guest VM object).  This means using the existing VM system primitives for
73  * page_t instances being associated with a given (vnode, offset) tuple.  It
74  * means that spans of pages, either free or allocated, need only to store a
75  * length (of the span) and an offset (into the vnode) in order to gain access
76  * to all of the underlying pages associated with that span.  Associating the
77  * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
78  * properly tracked as KAS pages, but be excluded from normal dumps (unless the
79  * operator has chosen to dump all of RAM).
80  */
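/*
 * Illustrative userland sketch (not compiled here) of the administration
 * interface described above.  It assumes the conventional /dev/vmmctl device
 * path and the ioctl definitions provided by sys/vmm_dev.h:
 *
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *	struct vmm_resv_target tgt = {
 *		.vrt_target_sz = 4UL << 30,	(grow the reservoir to 4 GiB)
 *		.vrt_chunk_sz = 128UL << 20,	(work in 128 MiB increments)
 *	};
 *	if (ioctl(fd, VMM_RESV_SET_TARGET, &tgt) == 0)
 *		(tgt.vrt_result_sz now holds the resulting reservoir size)
 *
 * A VMM_RESV_QUERY ioctl with a struct vmm_resv_query reports the current
 * free, allocated, transient, and limit sizes, and requires no privilege
 * beyond access to the device.
 */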
81 
82 #include <sys/types.h>
83 #include <sys/mutex.h>
84 #include <sys/avl.h>
85 #include <sys/list.h>
86 #include <sys/machparam.h>
87 #include <sys/kmem.h>
88 #include <sys/stddef.h>
89 #include <sys/null.h>
90 #include <sys/errno.h>
91 #include <sys/systm.h>
92 #include <sys/sunddi.h>
93 #include <sys/policy.h>
94 #include <vm/seg_kmem.h>
95 #include <vm/hat_i86.h>
96 #include <sys/kstat.h>
97 
98 #include <sys/vmm_reservoir.h>
99 #include <sys/vmm_dev.h>
100 #include <sys/vmm_impl.h>
101 
102 #define	VMMR_TARGET_INACTIVE	SIZE_MAX
103 
104 static kmutex_t vmmr_lock;
105 
106 static size_t vmmr_free_sz;
107 static size_t vmmr_free_transient_sz;
108 static size_t vmmr_adding_sz;
109 static size_t vmmr_alloc_sz;
110 static size_t vmmr_alloc_transient_sz;
111 static size_t vmmr_empty_sz;
112 
113 /*
114  * Target size of the reservoir during an active vmmr_set_target() operation.
115  * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active.
116  */
117 static size_t vmmr_target_sz;
118 
119 static uintptr_t vmmr_empty_last;
120 /* Upper limit for the size (free + allocated) of the reservoir */
121 static size_t vmmr_total_limit;
122 
123 /* VA range allocated from the VMM arena for the mappings */
124 static uintptr_t vmmr_va;
125 static uintptr_t vmmr_va_sz;
126 
127 static kstat_t *vmmr_kstat;
128 
129 /* Pair of AVL trees to store set of spans ordered by addr and size */
130 typedef struct vmmr_treepair {
131 	avl_tree_t by_addr;
132 	avl_tree_t by_size;
133 } vmmr_treepair_t;
134 
135 /* Spans of free memory in the reservoir */
136 static vmmr_treepair_t vmmr_free_tp;
137 
138 /* Spans of empty (not backed by memory) space in the reservoir */
139 static vmmr_treepair_t vmmr_empty_tp;
140 
141 /* Regions of memory allocated from the reservoir */
142 static list_t vmmr_alloc_regions;
143 
144 struct vmmr_span {
145 	uintptr_t	vs_addr;
146 	size_t		vs_size;
147 	avl_node_t	vs_by_addr;
148 	avl_node_t	vs_by_size;
149 	uintptr_t	vs_region_addr;
150 };
151 typedef struct vmmr_span vmmr_span_t;
152 
153 struct vmmr_region {
154 	size_t		vr_size;
155 	avl_tree_t	vr_spans;
156 	list_node_t	vr_node;
157 	bool		vr_transient;
158 };
159 
160 typedef struct vmmr_kstats {
161 	kstat_named_t	vmrks_bytes_free;
162 	kstat_named_t	vmrks_bytes_alloc;
163 	kstat_named_t	vmrks_bytes_transient;
164 	kstat_named_t	vmrks_bytes_limit;
165 } vmmr_kstats_t;
166 
167 
168 static int vmmr_add(size_t, bool);
169 static int vmmr_remove(size_t, bool);
170 
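/* AVL comparator: order spans by their base (reservoir) address. */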
171 static int
172 vmmr_cmp_addr(const void *a, const void *b)
173 {
174 	const vmmr_span_t *sa = a;
175 	const vmmr_span_t *sb = b;
176 
177 	if (sa->vs_addr == sb->vs_addr) {
178 		return (0);
179 	} else if (sa->vs_addr < sb->vs_addr) {
180 		return (-1);
181 	} else {
182 		return (1);
183 	}
184 }
185 
186 static int
187 vmmr_cmp_size(const void *a, const void *b)
188 {
189 	const vmmr_span_t *sa = a;
190 	const vmmr_span_t *sb = b;
191 
192 	if (sa->vs_size == sb->vs_size) {
193 		/*
194 		 * Since discontiguous spans could have the same size in a
195 		 * by-size tree, differentiate them (as required by AVL) by
196 		 * address so they can safely coexist while remaining sorted.
197 		 */
198 		return (vmmr_cmp_addr(a, b));
199 	} else if (sa->vs_size < sb->vs_size) {
200 		return (-1);
201 	} else {
202 		return (1);
203 	}
204 }
205 
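/* AVL comparator: order a region's spans by offset within the region. */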
206 static int
207 vmmr_cmp_region_addr(const void *a, const void *b)
208 {
209 	const vmmr_span_t *sa = a;
210 	const vmmr_span_t *sb = b;
211 
212 	if (sa->vs_region_addr == sb->vs_region_addr) {
213 		return (0);
214 	} else if (sa->vs_region_addr < sb->vs_region_addr) {
215 		return (-1);
216 	} else {
217 		return (1);
218 	}
219 }
220 
221 static void
222 vmmr_tp_init(vmmr_treepair_t *tree)
223 {
224 	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
225 	    offsetof(vmmr_span_t, vs_by_addr));
226 	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
227 	    offsetof(vmmr_span_t, vs_by_size));
228 }
229 
230 static void
231 vmmr_tp_destroy(vmmr_treepair_t *tree)
232 {
233 	void *vcp = NULL;
234 	vmmr_span_t *span;
235 
236 	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
237 		/* Freeing spans will be done when tearing down by-size tree */
238 	}
239 	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
240 		kmem_free(span, sizeof (*span));
241 	}
242 	avl_destroy(&tree->by_addr);
243 	avl_destroy(&tree->by_size);
244 }
245 
246 /*
247  * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
248  * span(s).  Such concatenation could result in the `to_add` span being freed,
249  * so the caller cannot use it after this returns.
250  */
251 static void
252 vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
253 {
254 	avl_tree_t *by_addr = &tree->by_addr;
255 	avl_tree_t *by_size = &tree->by_size;
256 	vmmr_span_t *node;
257 	avl_index_t where;
258 
259 	/* This addr should not already exist in the treepair */
260 	node = avl_find(by_addr, to_add, &where);
261 	ASSERT3P(node, ==, NULL);
262 
263 	node = avl_nearest(by_addr, where, AVL_BEFORE);
264 	if (node != NULL &&
265 	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
266 		/* concat with preceding item */
267 		avl_remove(by_addr, node);
268 		avl_remove(by_size, node);
269 		node->vs_size += to_add->vs_size;
270 		kmem_free(to_add, sizeof (*to_add));
271 
272 		/*
273 		 * Since this now-concatenated span could be adjacent to one
274 		 * trailing it, fall through to perform that check.
275 		 */
276 		to_add = node;
277 	}
278 
279 	node = avl_nearest(by_addr, where, AVL_AFTER);
280 	if (node != NULL &&
281 	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
282 		/* concat with trailing item */
283 		avl_remove(by_addr, node);
284 		avl_remove(by_size, node);
285 		node->vs_addr = to_add->vs_addr;
286 		node->vs_size += to_add->vs_size;
287 		avl_add(by_addr, node);
288 		avl_add(by_size, node);
289 
290 		kmem_free(to_add, sizeof (*to_add));
291 		return;
292 	}
293 
294 	/* simply insert */
295 	avl_add(by_addr, to_add);
296 	avl_add(by_size, to_add);
297 }
298 
299 /*
300  * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
301  * the exact target size is not present, but a larger one is.  May return a span
302  * with a size smaller than the target if splitting is not an option.
303  */
304 static vmmr_span_t *
305 vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
306 {
307 	avl_tree_t *by_addr = &tree->by_addr;
308 	avl_tree_t *by_size = &tree->by_size;
309 	vmmr_span_t *span;
310 	avl_index_t where;
311 
312 	ASSERT3U(target_sz, !=, 0);
313 	ASSERT(!avl_is_empty(by_addr));
314 	ASSERT(!avl_is_empty(by_size));
315 
316 	vmmr_span_t search = { .vs_size = target_sz };
317 	span = avl_find(by_size, &search, &where);
318 	if (span == NULL) {
319 		/* Try for a larger span (instead of exact match) */
320 		span = avl_nearest(by_size, where, AVL_AFTER);
321 		if (span == NULL) {
322 			/*
323 			 * Caller will need to collect several smaller spans in
324 			 * order to fulfill their request.
325 			 */
326 			span = avl_nearest(by_size, where, AVL_BEFORE);
327 			ASSERT3P(span, !=, NULL);
328 		}
329 	}
330 
331 	if (span->vs_size <= target_sz) {
332 		avl_remove(by_size, span);
333 		avl_remove(by_addr, span);
334 
335 		return (span);
336 	} else {
337 		/* Split off adequate chunk from larger span */
338 		uintptr_t start = span->vs_addr + span->vs_size - target_sz;
339 
340 		avl_remove(by_size, span);
341 		span->vs_size -= target_sz;
342 		avl_add(by_size, span);
343 
344 		vmmr_span_t *split_span =
345 		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
346 		split_span->vs_addr = start;
347 		split_span->vs_size = target_sz;
348 
349 		return (split_span);
350 	}
351 }
352 
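/* kstat update callback: snapshot the reservoir counters under vmmr_lock. */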
353 static int
354 vmmr_kstat_update(struct kstat *ksp, int rw)
355 {
356 	vmmr_kstats_t *vkp = ksp->ks_data;
357 
358 	mutex_enter(&vmmr_lock);
359 	vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
360 	vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
361 	/*
362 	 * In addition to the memory which is actually allocated to transient
363 	 * consumers, memory which is considered free-for-transient is also
364 	 * included in the sizing.
365 	 */
366 	vkp->vmrks_bytes_transient.value.ui64 =
367 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
368 	vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
369 	mutex_exit(&vmmr_lock);
370 
371 	return (0);
372 }
373 
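/*
 * One-time reservoir setup: compute `vmmr_total_limit`, create the kstats and
 * span treepairs, and reserve a physmem-sized VA range from the kvmm arena.
 */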
374 int
375 vmmr_init()
376 {
377 	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
378 
379 	/*
380 	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
381 	 * memory reservoir.  It is meant to provide some measure of protection
382 	 * against an operator pushing the system into unrecoverable memory
383 	 * starvation through explicit or transient additions to the reservoir.
384 	 *
385 	 * There will be many situations where this limit would be inadequate to
386 	 * prevent kernel memory starvation in the face of certain operator
387 	 * actions.  It is a balance to be struck between safety and allowing
388 	 * large systems to reach high utilization.
389 	 *
390 	 * The value is based off of pages_pp_maximum: "Number of currently
391 	 * available pages that cannot be 'locked'".  It is sized as all of
392 	 * `physmem` less 120% of `pages_pp_maximum`.
393 	 */
394 	vmmr_total_limit =
395 	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
396 
397 	vmmr_empty_last = 0;
398 	vmmr_free_sz = 0;
399 	vmmr_alloc_sz = 0;
400 	vmmr_empty_sz = 0;
401 	vmmr_adding_sz = 0;
402 	vmmr_free_transient_sz = 0;
403 	vmmr_alloc_transient_sz = 0;
404 	vmmr_target_sz = VMMR_TARGET_INACTIVE;
405 
406 	/*
407 	 * Attempt kstat allocation early, since it is the only part of
408 	 * reservoir initialization which is fallible.
409 	 */
410 	kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
411 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
412 	    sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
413 	if (ksp == NULL) {
414 		mutex_destroy(&vmmr_lock);
415 		return (ENOMEM);
416 	}
417 
418 	vmmr_kstats_t *vkp = ksp->ks_data;
419 
420 	kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
421 	    KSTAT_DATA_UINT64);
422 	kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
423 	    KSTAT_DATA_UINT64);
424 	kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
425 	    KSTAT_DATA_UINT64);
426 	kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
427 	    KSTAT_DATA_UINT64);
428 	ksp->ks_private = NULL;
429 	ksp->ks_update = vmmr_kstat_update;
430 	vmmr_kstat = ksp;
431 
432 	vmmr_tp_init(&vmmr_free_tp);
433 	vmmr_tp_init(&vmmr_empty_tp);
434 
435 	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
436 	    offsetof(vmmr_region_t, vr_node));
437 
438 	/* Grab a chunk of VA for the reservoir */
439 	vmmr_va_sz = physmem * PAGESIZE;
440 	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
441 
442 	kstat_install(vmmr_kstat);
443 
444 	return (0);
445 }
446 
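/*
 * Tear down reservoir state.  The reservoir must be entirely empty (no free,
 * allocated, or in-flight memory) when this is called.
 */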
447 void
448 vmmr_fini()
449 {
450 	mutex_enter(&vmmr_lock);
451 	VERIFY3U(vmmr_alloc_sz, ==, 0);
452 	VERIFY3U(vmmr_free_sz, ==, 0);
453 	VERIFY3U(vmmr_adding_sz, ==, 0);
454 	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
455 	VERIFY3U(vmmr_free_transient_sz, ==, 0);
456 	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
457 	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
458 	VERIFY(list_is_empty(&vmmr_alloc_regions));
459 
460 	kstat_delete(vmmr_kstat);
461 	vmmr_kstat = NULL;
462 
463 	vmmr_tp_destroy(&vmmr_free_tp);
464 	vmmr_tp_destroy(&vmmr_empty_tp);
465 	list_destroy(&vmmr_alloc_regions);
466 
467 	/* Release reservoir VA chunk */
468 	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
469 	vmmr_va = 0;
470 	vmmr_va_sz = 0;
471 	vmmr_total_limit = 0;
472 	vmmr_empty_last = 0;
473 
474 	mutex_exit(&vmmr_lock);
475 	mutex_destroy(&vmmr_lock);
476 }
477 
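/* Does the reservoir currently hold no memory at all, free or allocated? */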
478 bool
479 vmmr_is_empty()
480 {
481 	mutex_enter(&vmmr_lock);
482 	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
483 	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
484 	mutex_exit(&vmmr_lock);
485 	return (res);
486 }
487 
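/*
 * Allocate a region of `sz` bytes from the reservoir.  Transient requests
 * first grow the reservoir by `sz` (via vmmr_add()) and are torn back down
 * when the region is freed.  Returns ENOSPC if adequate free reservoir space
 * is not available for a non-transient request.
 */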
488 int
489 vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
490 {
491 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
492 
493 	if (!transient) {
494 		mutex_enter(&vmmr_lock);
495 		if (sz > vmmr_free_sz) {
496 			mutex_exit(&vmmr_lock);
497 			return (ENOSPC);
498 		}
499 	} else {
500 		int err;
501 
502 		mutex_enter(&vmmr_lock);
503 		err = vmmr_add(sz, true);
504 		if (err != 0) {
505 			mutex_exit(&vmmr_lock);
506 			return (err);
507 		}
508 		VERIFY3U(vmmr_free_transient_sz, >=, sz);
509 	}
510 
511 	vmmr_region_t *region;
512 	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
513 	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
514 	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
515 	region->vr_size = sz;
516 
517 	size_t remain = sz;
518 	uintptr_t map_at = 0;
519 	while (remain > 0) {
520 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
521 
522 		/*
523 		 * We have already ensured that adequate free memory is present
524 		 * in the reservoir for this allocation.
525 		 */
526 		VERIFY3P(span, !=, NULL);
527 		ASSERT3U(span->vs_size, <=, remain);
528 
529 		span->vs_region_addr = map_at;
530 		avl_add(&region->vr_spans, span);
531 		map_at += span->vs_size;
532 		remain -= span->vs_size;
533 	}
534 
535 	if (!transient) {
536 		vmmr_free_sz -= sz;
537 		vmmr_alloc_sz += sz;
538 	} else {
539 		vmmr_free_transient_sz -= sz;
540 		vmmr_alloc_transient_sz += sz;
541 		region->vr_transient = true;
542 	}
543 	list_insert_tail(&vmmr_alloc_regions, region);
544 	mutex_exit(&vmmr_lock);
545 
546 	*resp = region;
547 	return (0);
548 }
549 
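/* Return a kernel-accessible (KPM) mapping of the page backing `off`. */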
550 void *
551 vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
552 {
553 	/* just use KPM region for now */
554 	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
555 }
556 
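/*
 * Translate an offset within a region to the PFN backing it by locating the
 * containing span and looking up the page hashed against the VMM vnode.
 */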
557 pfn_t
558 vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
559 {
560 	VERIFY3U(off & PAGEOFFSET, ==, 0);
561 	VERIFY3U(off, <, region->vr_size);
562 
563 	vmmr_span_t search = {
564 		.vs_region_addr = off
565 	};
566 	avl_index_t where;
567 	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
568 
569 	if (span == NULL) {
570 		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
571 		ASSERT3P(span, !=, NULL);
572 	}
573 	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
574 	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
575 	VERIFY(pp != NULL);
576 	return (pp->p_pagenum);
577 }
578 
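/*
 * Release a region back to the reservoir.  The contents are re-zeroed before
 * the spans rejoin the free pool; transient regions additionally have their
 * capacity removed from the reservoir entirely.
 */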
579 void
580 vmmr_free(vmmr_region_t *region)
581 {
582 	mutex_enter(&vmmr_lock);
583 	if (!region->vr_transient) {
584 		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
585 	} else {
586 		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
587 	}
588 	list_remove(&vmmr_alloc_regions, region);
589 	mutex_exit(&vmmr_lock);
590 
591 	/* Zero the contents (while not monopolizing vmmr_lock) */
592 	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
593 		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
594 	}
595 
596 	mutex_enter(&vmmr_lock);
597 
598 	/* Put the contained span(s) back in the free pool */
599 	void *cookie = NULL;
600 	vmmr_span_t *span;
601 	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
602 		span->vs_region_addr = 0;
603 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
604 	}
605 	avl_destroy(&region->vr_spans);
606 	if (!region->vr_transient) {
607 		vmmr_free_sz += region->vr_size;
608 		vmmr_alloc_sz -= region->vr_size;
609 	} else {
610 		vmmr_free_transient_sz += region->vr_size;
611 		vmmr_alloc_transient_sz -= region->vr_size;
612 	}
613 
614 	if (region->vr_transient) {
615 		/*
616 		 * Since the transient capacity was previously allocated for
617 		 * this region, its removal should not fail.
618 		 */
619 		VERIFY0(vmmr_remove(region->vr_size, true));
620 	}
621 	kmem_free(region, sizeof (*region));
622 	mutex_exit(&vmmr_lock);
623 }
624 
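/*
 * Destroy the pages backing a span, returning them to the system.  The
 * matching page_unresv() is left to the caller.
 */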
625 static void
626 vmmr_destroy_pages(vmmr_span_t *span)
627 {
628 	const uintptr_t end = span->vs_addr + span->vs_size;
629 	struct vnode *vp = &kvps[KV_VVP];
630 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
631 		page_t *pp;
632 
633 		/* Page-free logic cribbed from segkmem_xfree(): */
634 		pp = page_find(vp, (u_offset_t)pos);
635 		VERIFY(pp != NULL);
636 		if (!page_tryupgrade(pp)) {
637 			/*
638 			 * Some other thread has a sharelock. Wait for
639 			 * it to drop the lock so we can free this page.
640 			 */
641 			page_unlock(pp);
642 			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
643 		}
644 
645 		/*
646 		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
647 		 * That will be taken care of later via page_unresv().
648 		 */
649 		pp->p_lckcnt = 0;
650 		page_destroy(pp, 0);
651 	}
652 }
653 
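/*
 * Create, configure, and pre-zero pages backing a span, hashed against the
 * VMM vnode at the span's addresses.  On failure, any pages created so far
 * are destroyed and ENOMEM is returned.
 */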
654 static int
655 vmmr_alloc_pages(const vmmr_span_t *span)
656 {
657 	struct seg kseg = {
658 		.s_as = &kas
659 	};
660 	struct vnode *vp = &kvps[KV_VVP];
661 
662 	const uintptr_t end = span->vs_addr + span->vs_size;
663 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
664 		page_t *pp;
665 
666 		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
667 		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
668 
669 		if (pp == NULL) {
670 			/* Destroy any already-created pages */
671 			if (pos != span->vs_addr) {
672 				vmmr_span_t destroy_span = {
673 					.vs_addr = span->vs_addr,
674 					.vs_size = pos - span->vs_addr,
675 				};
676 
677 				vmmr_destroy_pages(&destroy_span);
678 			}
679 			return (ENOMEM);
680 		}
681 
682 		/* mimic page state from segkmem */
683 		ASSERT(PAGE_EXCL(pp));
684 		page_io_unlock(pp);
685 		pp->p_lckcnt = 1;
686 		page_downgrade(pp);
687 
688 		/* pre-zero the page */
689 		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
690 	}
691 
692 	return (0);
693 }
694 
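/*
 * page_xresv() wait callback: pause briefly between reservation attempts,
 * returning 0 (give up) if interrupted by a signal.
 */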
695 static int
696 vmmr_resv_wait()
697 {
698 	if (delay_sig(hz >> 2) != 0) {
699 		/* bail due to interruption */
700 		return (0);
701 	}
702 	return (1);
703 }
704 
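/*
 * Strip `sz` bytes of page-backed spans from the free treepair, destroying
 * their pages and moving the emptied spans to the empty treepair.  The caller
 * must hold vmmr_lock, guarantee `sz` is available, and adjust the free-size
 * accounting and page reservation itself.
 */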
705 static void
706 vmmr_remove_raw(size_t sz)
707 {
708 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
709 	VERIFY(MUTEX_HELD(&vmmr_lock));
710 
711 	size_t remain = sz;
712 	while (remain > 0) {
713 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
714 
715 		/*
716 		 * The caller must ensure that at least `sz` amount is present
717 		 * in the free treepair.
718 		 */
719 		VERIFY3P(span, !=, NULL);
720 		ASSERT3U(span->vs_size, <=, remain);
721 
722 		/* TODO: perhaps arrange to destroy pages outside the lock? */
723 		vmmr_destroy_pages(span);
724 
725 		remain -= span->vs_size;
726 		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
727 	}
728 
729 	vmmr_empty_sz += sz;
730 }
731 
732 /*
733  * Add memory to vmm reservoir.  Memory may be marked for transient use, where
734  * the addition is part of a transient allocation from the reservoir.  Otherwise
735  * it is placed in the reservoir to be available for non-transient allocations.
736  *
737  * Expects vmmr_lock to be held when called, and will return with it held, but
738  * will drop it during portions of the addition.
739  */
740 static int
741 vmmr_add(size_t sz, bool transient)
742 {
743 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
744 	VERIFY3U(sz, >, 0);
745 	VERIFY(MUTEX_HELD(&vmmr_lock));
746 
747 	/*
748 	 * Make sure that the amount added is not going to breach the limits
749 	 * we've chosen
750 	 */
751 	const size_t current_total =
752 	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
753 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
754 	if ((current_total + sz) < current_total) {
755 		return (EOVERFLOW);
756 	}
757 	if ((current_total + sz) > vmmr_total_limit) {
758 		return (ENOSPC);
759 	}
760 	vmmr_adding_sz += sz;
761 	mutex_exit(&vmmr_lock);
762 
763 	/* Wait for enough pages to become available */
764 	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
765 		mutex_enter(&vmmr_lock);
766 		vmmr_adding_sz -= sz;
767 		return (EINTR);
768 	}
769 
770 	mutex_enter(&vmmr_lock);
771 	size_t added = 0;
772 	size_t remain = sz;
773 	while (added < sz) {
774 		vmmr_span_t *span = NULL;
775 
776 		if (vmmr_empty_sz > 0) {
777 			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
778 
779 			vmmr_empty_sz -= span->vs_size;
780 		} else {
781 			/*
782 			 * No empty space to fill with new pages, so just tack
783 			 * it on at the end instead.
784 			 */
785 			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
786 			span->vs_addr = vmmr_empty_last;
787 			span->vs_size = remain;
788 			vmmr_empty_last += remain;
789 		}
790 		VERIFY3P(span, !=, NULL);
791 
792 
793 		/* Allocate the actual pages to back this span */
794 		mutex_exit(&vmmr_lock);
795 		int err = vmmr_alloc_pages(span);
796 		mutex_enter(&vmmr_lock);
797 
798 		/*
799 		 * If an error is encountered during page allocation for the
800 		 * span, unwind any progress made by the addition request.
801 		 */
802 		if (err != 0) {
803 			/*
804 			 * Without pages allocated to this span, it is now
805 			 * tracked as empty.
806 			 */
807 			vmmr_empty_sz += span->vs_size;
808 			vmmr_tp_insert_concat(span, &vmmr_empty_tp);
809 
810 			if (added != 0) {
811 				vmmr_remove_raw(added);
812 			}
813 
814 			vmmr_adding_sz -= sz;
815 
816 			page_unresv(sz >> PAGESHIFT);
817 			return (err);
818 		}
819 
820 		/*
821 		 * The allocated-page-bearing span is placed in the "free"
822 		 * treepair now, but is not officially exposed for consumption
823 		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
824 		 *
825 		 * This allows us to unwind the allocation in case of a failure
826 		 * without the risk of the freshly added span(s) being snapped
827 		 * up by a consumer already.
828 		 */
829 		added += span->vs_size;
830 		remain -= span->vs_size;
831 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
832 	}
833 
834 	/* Make the added memory usable by exposing it to the size accounting */
835 	if (!transient) {
836 		vmmr_free_sz += added;
837 	} else {
838 		vmmr_free_transient_sz += added;
839 	}
840 	ASSERT3U(added, ==, sz);
841 	vmmr_adding_sz -= added;
842 
843 	return (0);
844 }
845 
846 /*
847  * Remove memory from vmm reservoir.  Normally this will remove memory from the
848  * reservoir which was available for non-transient allocations.  If the removal
849  * is part of a vmmr_free() of a transient allocation, it will act on only that
850  * transient region being freed, not the available memory in the reservoir.
851  *
852  * Expects vmmr_lock to be held when called, and will return with it held, but
853  * may drop it during portions of the removal.
854  */
855 static int
856 vmmr_remove(size_t sz, bool transient)
857 {
858 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
859 	VERIFY(sz);
860 	VERIFY(MUTEX_HELD(&vmmr_lock));
861 
862 	if ((!transient && sz > vmmr_free_sz) ||
863 	    (transient && sz > vmmr_free_transient_sz)) {
864 		return (ENOSPC);
865 	}
866 
867 	vmmr_remove_raw(sz);
868 
869 	if (!transient) {
870 		vmmr_free_sz -= sz;
871 	} else {
872 		vmmr_free_transient_sz -= sz;
873 	}
874 	page_unresv(sz >> PAGESHIFT);
875 	return (0);
876 }
877 
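/*
 * Grow or shrink the reservoir toward `target_sz`, working in steps of at
 * most `chunk_sz` bytes (0 means unchunked) so a large resize remains
 * responsive to signals.  The resulting size is reported via `resp` even on
 * partial progress.
 */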
878 static int
879 vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
880 {
881 	VERIFY(resp != NULL);
882 
883 	mutex_enter(&vmmr_lock);
884 
885 	size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;
886 
887 	/* Be sure to communicate current size in case of an early bail-out */
888 	*resp = current_sz;
889 
890 	if ((target_sz & PAGEOFFSET) != 0 ||
891 	    (chunk_sz & PAGEOFFSET) != 0) {
892 		mutex_exit(&vmmr_lock);
893 		return (EINVAL);
894 	}
895 	/* Reject sentinel value */
896 	if (target_sz == VMMR_TARGET_INACTIVE) {
897 		mutex_exit(&vmmr_lock);
898 		return (EINVAL);
899 	}
900 
901 	/* Already at target size */
902 	if (target_sz == current_sz) {
903 		mutex_exit(&vmmr_lock);
904 		return (0);
905 	}
906 
907 	/* Reject racing requests */
908 	if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
909 		mutex_exit(&vmmr_lock);
910 		return (EALREADY);
911 	}
912 	/* Record the target now to exclude racing requests */
913 	vmmr_target_sz = target_sz;
914 
915 	int err = 0;
916 	do {
917 		/* Be sensitive to signal interruption */
918 		if (issig(JUSTLOOKING) != 0) {
919 			mutex_exit(&vmmr_lock);
920 			const bool sig_bail = issig(FORREAL) != 0;
921 			mutex_enter(&vmmr_lock);
922 			if (sig_bail) {
923 				err = EINTR;
924 				break;
925 			}
926 		}
927 
928 		if (current_sz > target_sz) {
929 			/* Shrinking reservoir */
930 
931 			size_t req_sz = current_sz - target_sz;
932 			if (chunk_sz != 0) {
933 				req_sz = MIN(req_sz, chunk_sz);
934 			}
935 			err = vmmr_remove(req_sz, false);
936 		} else {
937 			/* Growing reservoir */
938 			ASSERT(current_sz < target_sz);
939 
940 			size_t req_sz = target_sz - current_sz;
941 			if (chunk_sz != 0) {
942 				req_sz = MIN(req_sz, chunk_sz);
943 			}
944 			err = vmmr_add(req_sz, false);
945 		}
946 
947 		current_sz = vmmr_alloc_sz + vmmr_free_sz;
948 	} while (err == 0 && current_sz != target_sz);
949 
950 	/* Clear the target now that we are done (success or not) */
951 	vmmr_target_sz = VMMR_TARGET_INACTIVE;
952 	mutex_exit(&vmmr_lock);
953 	*resp = current_sz;
954 	return (err);
955 }
956 
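/*
 * Entry point for reservoir ioctls issued against the vmmctl device:
 * VMM_RESV_QUERY (open to any holder of the device) and VMM_RESV_SET_TARGET
 * (requires sys_config privilege).
 */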
957 int
958 vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
959 {
960 	/*
961 	 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
962 	 * do not need to duplicate such checks here.
963 	 */
964 
965 	switch (cmd) {
966 	case VMM_RESV_QUERY: {
967 		struct vmm_resv_query res;
968 		void *datap = (void *)(uintptr_t)arg;
969 
970 		/* For now, anyone with access to vmmctl device can query */
971 		mutex_enter(&vmmr_lock);
972 		res.vrq_free_sz = vmmr_free_sz;
973 		res.vrq_alloc_sz = vmmr_alloc_sz;
974 		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
975 		res.vrq_limit = vmmr_total_limit;
976 		mutex_exit(&vmmr_lock);
977 		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
978 			return (EFAULT);
979 		}
980 		break;
981 	}
982 	case VMM_RESV_SET_TARGET: {
983 		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
984 			return (EPERM);
985 		}
986 
987 		struct vmm_resv_target tgt;
988 		void *datap = (void *)(uintptr_t)arg;
989 
990 		if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
991 			return (EFAULT);
992 		}
993 
994 		int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
995 		    &tgt.vrt_result_sz);
996 
997 		/*
998 		 * Attempt to communicate the resultant size of the reservoir if
999 		 * setting it to the target was a success, or if we were
1000 		 * interrupted (by a signal) while doing so.
1001 		 */
1002 		if (err == 0 || err == EINTR) {
1003 			if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
1004 				err = EFAULT;
1005 			}
1006 		}
1007 
1008 		return (err);
1009 	}
1010 	default:
1011 		return (ENOTTY);
1012 	}
1013 	return (0);
1014 }
1015