xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_reservoir.c (revision e0994bd28f025d3d74315f7479562b6be19773c3)
17c8c0b82SPatrick Mooney /*
27c8c0b82SPatrick Mooney  * This file and its contents are supplied under the terms of the
37c8c0b82SPatrick Mooney  * Common Development and Distribution License ("CDDL"), version 1.0.
47c8c0b82SPatrick Mooney  * You may only use this file in accordance with the terms of version
57c8c0b82SPatrick Mooney  * 1.0 of the CDDL.
67c8c0b82SPatrick Mooney  *
77c8c0b82SPatrick Mooney  * A full copy of the text of the CDDL should have accompanied this
87c8c0b82SPatrick Mooney  * source.  A copy of the CDDL is also available via the Internet at
97c8c0b82SPatrick Mooney  * http://www.illumos.org/license/CDDL.
107c8c0b82SPatrick Mooney  */
117c8c0b82SPatrick Mooney 
127c8c0b82SPatrick Mooney /*
137c8c0b82SPatrick Mooney  * Copyright 2021 Oxide Computer Company
147c8c0b82SPatrick Mooney  */
157c8c0b82SPatrick Mooney 
167c8c0b82SPatrick Mooney /*
177c8c0b82SPatrick Mooney  * VMM Memory Reservoir
187c8c0b82SPatrick Mooney  *
197c8c0b82SPatrick Mooney  *
207c8c0b82SPatrick Mooney  * In order to make the allocation of large (multi-GiB) chunks of memory
217c8c0b82SPatrick Mooney  * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
227c8c0b82SPatrick Mooney  * operators can set aside a substantial portion of system memory exclusively
237c8c0b82SPatrick Mooney  * for VMs.  This memory is unavailable for general use by the rest of the
247c8c0b82SPatrick Mooney  * system.  Rather than having to scour the freelist, reap kmem caches, or put
257c8c0b82SPatrick Mooney  * pressure on the ARC, bhyve guest memory allocations can quickly determine if
267c8c0b82SPatrick Mooney  * there is adequate reservoir memory available.  Since the pages stored in the
277c8c0b82SPatrick Mooney  * reservoir are pre-zeroed, it can be immediately used when allocated to a
287c8c0b82SPatrick Mooney  * guest.  When the memory is returned to the reservoir, it is zeroed once more
297c8c0b82SPatrick Mooney  * to avoid leaking any sensitive data from that guest.
307c8c0b82SPatrick Mooney  *
317c8c0b82SPatrick Mooney  *
327c8c0b82SPatrick Mooney  * Transient Allocations
337c8c0b82SPatrick Mooney  *
347c8c0b82SPatrick Mooney  * While the explicit reservoir model may work well for some applications,
357c8c0b82SPatrick Mooney  * others may want a more traditional model, where pages for guest memory
367c8c0b82SPatrick Mooney  * objects are allocated on demand, rather than from a pool set aside from the
377c8c0b82SPatrick Mooney  * system.  In this case, the allocation can be made in "transient" mode, where
387c8c0b82SPatrick Mooney  * the memory is allocated normally, even if there is free capacity in the
397c8c0b82SPatrick Mooney  * reservoir.  When use of the transient allocation is complete (the guest is
407c8c0b82SPatrick Mooney  * halted and destroyed), the pages will be freed back to the system, rather
417c8c0b82SPatrick Mooney  * than added back to the reservoir.
427c8c0b82SPatrick Mooney  *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally.  Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations.  When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made.  When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation.  This allows both allocation types to coexist without too much
 * additional machinery.
527c8c0b82SPatrick Mooney  *
537c8c0b82SPatrick Mooney  *
547c8c0b82SPatrick Mooney  * Administration
557c8c0b82SPatrick Mooney  *
 * Operators may increase, decrease, and query the amount of memory allocated
 * to the reservoir and from it to VMs via ioctls against the vmmctl device.
 * The total amount added to the reservoir is arbitrarily limited at this time
 * by `vmmr_total_limit`, which defaults to all of physmem less 120% of
 * `pages_pp_maximum`.  This is done to prevent the reservoir from
 * inadvertently growing to a size where the system has inadequate memory to
 * make forward progress.  Memory may only be removed from the reservoir when
 * it is free (not allocated by any guest VMs).
637c8c0b82SPatrick Mooney  *
647c8c0b82SPatrick Mooney  *
657c8c0b82SPatrick Mooney  * Page Tracking
667c8c0b82SPatrick Mooney  *
677c8c0b82SPatrick Mooney  * The reservoir currently uses vnode association to keep track of pages under
687c8c0b82SPatrick Mooney  * its control (either designated to the reservoir and free, or allocated to a
697c8c0b82SPatrick Mooney  * guest VM object).  This means using the existing VM system primitives for
707c8c0b82SPatrick Mooney  * page_t instances being associated with a given (vnode, offset) tuple.  It
717c8c0b82SPatrick Mooney  * means that spans of pages, either free or allocated, need only to store a
727c8c0b82SPatrick Mooney  * length (of the span) and an offset (into the vnode) in order to gain access
737c8c0b82SPatrick Mooney  * to all of the underlying pages associated with that span.  Associating the
747c8c0b82SPatrick Mooney  * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
757c8c0b82SPatrick Mooney  * properly tracked as KAS pages, but be excluded from normal dumps (unless the
767c8c0b82SPatrick Mooney  * operator has chosen to dump all of RAM).
777c8c0b82SPatrick Mooney  */
787c8c0b82SPatrick Mooney 
797c8c0b82SPatrick Mooney #include <sys/types.h>
807c8c0b82SPatrick Mooney #include <sys/mutex.h>
817c8c0b82SPatrick Mooney #include <sys/avl.h>
827c8c0b82SPatrick Mooney #include <sys/list.h>
837c8c0b82SPatrick Mooney #include <sys/machparam.h>
847c8c0b82SPatrick Mooney #include <sys/kmem.h>
857c8c0b82SPatrick Mooney #include <sys/stddef.h>
867c8c0b82SPatrick Mooney #include <sys/null.h>
877c8c0b82SPatrick Mooney #include <sys/errno.h>
887c8c0b82SPatrick Mooney #include <sys/systm.h>
897c8c0b82SPatrick Mooney #include <sys/sunddi.h>
907c8c0b82SPatrick Mooney #include <sys/policy.h>
917c8c0b82SPatrick Mooney #include <vm/seg_kmem.h>
927c8c0b82SPatrick Mooney #include <vm/hat_i86.h>
937c8c0b82SPatrick Mooney 
947c8c0b82SPatrick Mooney #include <sys/vmm_reservoir.h>
957c8c0b82SPatrick Mooney #include <sys/vmm_dev.h>
967c8c0b82SPatrick Mooney 
977c8c0b82SPatrick Mooney static kmutex_t vmmr_lock;
987c8c0b82SPatrick Mooney 
997c8c0b82SPatrick Mooney static size_t vmmr_free_sz;
1007c8c0b82SPatrick Mooney static size_t vmmr_free_transient_sz;
1017c8c0b82SPatrick Mooney static size_t vmmr_adding_sz;
1027c8c0b82SPatrick Mooney static size_t vmmr_alloc_sz;
1037c8c0b82SPatrick Mooney static size_t vmmr_alloc_transient_sz;
1047c8c0b82SPatrick Mooney static size_t vmmr_empty_sz;
1057c8c0b82SPatrick Mooney 
1067c8c0b82SPatrick Mooney static uintptr_t vmmr_empty_last;
1077c8c0b82SPatrick Mooney /* Upper limit for the size (free + allocated) of the reservoir */
1087c8c0b82SPatrick Mooney static size_t vmmr_total_limit;
1097c8c0b82SPatrick Mooney 
1107c8c0b82SPatrick Mooney /* VA range allocated from the VMM arena for the mappings */
1117c8c0b82SPatrick Mooney static uintptr_t vmmr_va;
1127c8c0b82SPatrick Mooney static uintptr_t vmmr_va_sz;
1137c8c0b82SPatrick Mooney 
1147c8c0b82SPatrick Mooney /* Pair of AVL trees to store set of spans ordered by addr and size */
1157c8c0b82SPatrick Mooney typedef struct vmmr_treepair {
1167c8c0b82SPatrick Mooney 	avl_tree_t by_addr;
1177c8c0b82SPatrick Mooney 	avl_tree_t by_size;
1187c8c0b82SPatrick Mooney } vmmr_treepair_t;
1197c8c0b82SPatrick Mooney 
1207c8c0b82SPatrick Mooney /* Spans of free memory in the reservoir */
1217c8c0b82SPatrick Mooney static vmmr_treepair_t vmmr_free_tp;
1227c8c0b82SPatrick Mooney 
1237c8c0b82SPatrick Mooney /* Spans of empty (not backed by memory) space in the reservoir */
1247c8c0b82SPatrick Mooney static vmmr_treepair_t vmmr_empty_tp;
1257c8c0b82SPatrick Mooney 
1267c8c0b82SPatrick Mooney /* Regions of memory allocated from the reservoir */
1277c8c0b82SPatrick Mooney static list_t vmmr_alloc_regions;
1287c8c0b82SPatrick Mooney 
1297c8c0b82SPatrick Mooney struct vmmr_span {
1307c8c0b82SPatrick Mooney 	uintptr_t	vs_addr;
1317c8c0b82SPatrick Mooney 	size_t		vs_size;
1327c8c0b82SPatrick Mooney 	avl_node_t	vs_by_addr;
1337c8c0b82SPatrick Mooney 	avl_node_t	vs_by_size;
1347c8c0b82SPatrick Mooney 	uintptr_t	vs_region_addr;
1357c8c0b82SPatrick Mooney };
1367c8c0b82SPatrick Mooney typedef struct vmmr_span vmmr_span_t;
1377c8c0b82SPatrick Mooney 
1387c8c0b82SPatrick Mooney struct vmmr_region {
1397c8c0b82SPatrick Mooney 	size_t		vr_size;
1407c8c0b82SPatrick Mooney 	avl_tree_t	vr_spans;
1417c8c0b82SPatrick Mooney 	list_node_t	vr_node;
1427c8c0b82SPatrick Mooney 	bool		vr_transient;
1437c8c0b82SPatrick Mooney };
1447c8c0b82SPatrick Mooney 
1457c8c0b82SPatrick Mooney static int
1467c8c0b82SPatrick Mooney vmmr_cmp_addr(const void *a, const void *b)
1477c8c0b82SPatrick Mooney {
1487c8c0b82SPatrick Mooney 	const vmmr_span_t *sa = a;
1497c8c0b82SPatrick Mooney 	const vmmr_span_t *sb = b;
1507c8c0b82SPatrick Mooney 
1517c8c0b82SPatrick Mooney 	if (sa->vs_addr == sb->vs_addr) {
1527c8c0b82SPatrick Mooney 		return (0);
1537c8c0b82SPatrick Mooney 	} else if (sa->vs_addr < sb->vs_addr) {
1547c8c0b82SPatrick Mooney 		return (-1);
1557c8c0b82SPatrick Mooney 	} else {
1567c8c0b82SPatrick Mooney 		return (1);
1577c8c0b82SPatrick Mooney 	}
1587c8c0b82SPatrick Mooney }
1597c8c0b82SPatrick Mooney 
1607c8c0b82SPatrick Mooney static int
1617c8c0b82SPatrick Mooney vmmr_cmp_size(const void *a, const void *b)
1627c8c0b82SPatrick Mooney {
1637c8c0b82SPatrick Mooney 	const vmmr_span_t *sa = a;
1647c8c0b82SPatrick Mooney 	const vmmr_span_t *sb = b;
1657c8c0b82SPatrick Mooney 
1667c8c0b82SPatrick Mooney 	if (sa->vs_size == sb->vs_size) {
1677c8c0b82SPatrick Mooney 		/*
1687c8c0b82SPatrick Mooney 		 * Since discontiguous spans could have the same size in a
1697c8c0b82SPatrick Mooney 		 * by-size tree, differentiate them (as required by AVL) by
1707c8c0b82SPatrick Mooney 		 * address so they can safely coexist while remaining sorted.
1717c8c0b82SPatrick Mooney 		 */
1727c8c0b82SPatrick Mooney 		return (vmmr_cmp_addr(a, b));
1737c8c0b82SPatrick Mooney 	} else if (sa->vs_size < sb->vs_size) {
1747c8c0b82SPatrick Mooney 		return (-1);
1757c8c0b82SPatrick Mooney 	} else {
1767c8c0b82SPatrick Mooney 		return (1);
1777c8c0b82SPatrick Mooney 	}
1787c8c0b82SPatrick Mooney }
1797c8c0b82SPatrick Mooney 
1807c8c0b82SPatrick Mooney static int
1817c8c0b82SPatrick Mooney vmmr_cmp_region_addr(const void *a, const void *b)
1827c8c0b82SPatrick Mooney {
1837c8c0b82SPatrick Mooney 	const vmmr_span_t *sa = a;
1847c8c0b82SPatrick Mooney 	const vmmr_span_t *sb = b;
1857c8c0b82SPatrick Mooney 
1867c8c0b82SPatrick Mooney 	if (sa->vs_region_addr == sb->vs_region_addr) {
1877c8c0b82SPatrick Mooney 		return (0);
1887c8c0b82SPatrick Mooney 	} else if (sa->vs_region_addr < sb->vs_region_addr) {
1897c8c0b82SPatrick Mooney 		return (-1);
1907c8c0b82SPatrick Mooney 	} else {
1917c8c0b82SPatrick Mooney 		return (1);
1927c8c0b82SPatrick Mooney 	}
1937c8c0b82SPatrick Mooney }
1947c8c0b82SPatrick Mooney 
1957c8c0b82SPatrick Mooney static void
1967c8c0b82SPatrick Mooney vmmr_tp_init(vmmr_treepair_t *tree)
1977c8c0b82SPatrick Mooney {
1987c8c0b82SPatrick Mooney 	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
1997c8c0b82SPatrick Mooney 	    offsetof(vmmr_span_t, vs_by_addr));
2007c8c0b82SPatrick Mooney 	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
2017c8c0b82SPatrick Mooney 	    offsetof(vmmr_span_t, vs_by_size));
2027c8c0b82SPatrick Mooney }
2037c8c0b82SPatrick Mooney 
2047c8c0b82SPatrick Mooney static void
2057c8c0b82SPatrick Mooney vmmr_tp_destroy(vmmr_treepair_t *tree)
2067c8c0b82SPatrick Mooney {
2077c8c0b82SPatrick Mooney 	void *vcp = NULL;
2087c8c0b82SPatrick Mooney 	vmmr_span_t *span;
2097c8c0b82SPatrick Mooney 
2107c8c0b82SPatrick Mooney 	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
2117c8c0b82SPatrick Mooney 		/* Freeing spans will be done when tearing down by-size tree */
2127c8c0b82SPatrick Mooney 	}
2137c8c0b82SPatrick Mooney 	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
2147c8c0b82SPatrick Mooney 		kmem_free(span, sizeof (*span));
2157c8c0b82SPatrick Mooney 	}
2167c8c0b82SPatrick Mooney 	avl_destroy(&tree->by_addr);
2177c8c0b82SPatrick Mooney 	avl_destroy(&tree->by_size);
2187c8c0b82SPatrick Mooney }
2197c8c0b82SPatrick Mooney 
2207c8c0b82SPatrick Mooney /*
2217c8c0b82SPatrick Mooney  * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
2227c8c0b82SPatrick Mooney  * span(s).  Such concatenation could result in the `to_add` span being freed,
2237c8c0b82SPatrick Mooney  * so the caller cannot use it after this returns.
2247c8c0b82SPatrick Mooney  */
2257c8c0b82SPatrick Mooney static void
2267c8c0b82SPatrick Mooney vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
2277c8c0b82SPatrick Mooney {
2287c8c0b82SPatrick Mooney 	avl_tree_t *by_addr = &tree->by_addr;
2297c8c0b82SPatrick Mooney 	avl_tree_t *by_size = &tree->by_size;
2307c8c0b82SPatrick Mooney 	vmmr_span_t *node;
2317c8c0b82SPatrick Mooney 	avl_index_t where;
2327c8c0b82SPatrick Mooney 
2337c8c0b82SPatrick Mooney 	/* This addr should not already exist in the treepair */
2347c8c0b82SPatrick Mooney 	node = avl_find(by_addr, to_add, &where);
2357c8c0b82SPatrick Mooney 	ASSERT3P(node, ==, NULL);
2367c8c0b82SPatrick Mooney 
2377c8c0b82SPatrick Mooney 	node = avl_nearest(by_addr, where, AVL_BEFORE);
2387c8c0b82SPatrick Mooney 	if (node != NULL &&
2397c8c0b82SPatrick Mooney 	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
2407c8c0b82SPatrick Mooney 		/* concat with preceeding item */
2417c8c0b82SPatrick Mooney 		avl_remove(by_addr, node);
2427c8c0b82SPatrick Mooney 		avl_remove(by_size, node);
2437c8c0b82SPatrick Mooney 		node->vs_size += to_add->vs_size;
2447c8c0b82SPatrick Mooney 		kmem_free(to_add, sizeof (*to_add));
2457c8c0b82SPatrick Mooney 
2467c8c0b82SPatrick Mooney 		/*
2477c8c0b82SPatrick Mooney 		 * Since this now-concatenated span could be adjacent one
2487c8c0b82SPatrick Mooney 		 * trailing it, fall through to perform that check.
2497c8c0b82SPatrick Mooney 		 */
2507c8c0b82SPatrick Mooney 		to_add = node;
2517c8c0b82SPatrick Mooney 	}
2527c8c0b82SPatrick Mooney 
2537c8c0b82SPatrick Mooney 	node = avl_nearest(by_addr, where, AVL_AFTER);
2547c8c0b82SPatrick Mooney 	if (node != NULL &&
2557c8c0b82SPatrick Mooney 	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
2567c8c0b82SPatrick Mooney 		/* concat with trailing item */
2577c8c0b82SPatrick Mooney 		avl_remove(by_addr, node);
2587c8c0b82SPatrick Mooney 		avl_remove(by_size, node);
2597c8c0b82SPatrick Mooney 		node->vs_addr = to_add->vs_addr;
2607c8c0b82SPatrick Mooney 		node->vs_size += to_add->vs_size;
2617c8c0b82SPatrick Mooney 		avl_add(by_addr, node);
2627c8c0b82SPatrick Mooney 		avl_add(by_size, node);
2637c8c0b82SPatrick Mooney 
2647c8c0b82SPatrick Mooney 		kmem_free(to_add, sizeof (*to_add));
2657c8c0b82SPatrick Mooney 		return;
2667c8c0b82SPatrick Mooney 	}
2677c8c0b82SPatrick Mooney 
2687c8c0b82SPatrick Mooney 	/* simply insert */
2697c8c0b82SPatrick Mooney 	avl_add(by_addr, to_add);
2707c8c0b82SPatrick Mooney 	avl_add(by_size, to_add);
2717c8c0b82SPatrick Mooney }
2727c8c0b82SPatrick Mooney 
2737c8c0b82SPatrick Mooney /*
2747c8c0b82SPatrick Mooney  * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
2757c8c0b82SPatrick Mooney  * the exact target size is not present, but a larger one is.  May return a span
2767c8c0b82SPatrick Mooney  * with a size smaller than the target if splitting is not an option.
2777c8c0b82SPatrick Mooney  */
2787c8c0b82SPatrick Mooney static vmmr_span_t *
2797c8c0b82SPatrick Mooney vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
2807c8c0b82SPatrick Mooney {
2817c8c0b82SPatrick Mooney 	avl_tree_t *by_addr = &tree->by_addr;
2827c8c0b82SPatrick Mooney 	avl_tree_t *by_size = &tree->by_size;
2837c8c0b82SPatrick Mooney 	vmmr_span_t *span;
2847c8c0b82SPatrick Mooney 	avl_index_t where;
2857c8c0b82SPatrick Mooney 
2867c8c0b82SPatrick Mooney 	ASSERT3U(target_sz, !=, 0);
2877c8c0b82SPatrick Mooney 	ASSERT(!avl_is_empty(by_addr));
2887c8c0b82SPatrick Mooney 	ASSERT(!avl_is_empty(by_size));
2897c8c0b82SPatrick Mooney 
2907c8c0b82SPatrick Mooney 	vmmr_span_t search = { .vs_size = target_sz };
2917c8c0b82SPatrick Mooney 	span = avl_find(by_size, &search, &where);
2927c8c0b82SPatrick Mooney 	if (span == NULL) {
2937c8c0b82SPatrick Mooney 		/* Try for a larger span (instead of exact match) */
2947c8c0b82SPatrick Mooney 		span = avl_nearest(by_size, where, AVL_AFTER);
2957c8c0b82SPatrick Mooney 		if (span == NULL) {
2967c8c0b82SPatrick Mooney 			/*
2977c8c0b82SPatrick Mooney 			 * Caller will need to collect several smaller spans in
2987c8c0b82SPatrick Mooney 			 * order to fulfill their request.
2997c8c0b82SPatrick Mooney 			 */
3007c8c0b82SPatrick Mooney 			span = avl_nearest(by_size, where, AVL_BEFORE);
3017c8c0b82SPatrick Mooney 			ASSERT3P(span, !=, NULL);
3027c8c0b82SPatrick Mooney 		}
3037c8c0b82SPatrick Mooney 	}
3047c8c0b82SPatrick Mooney 
3057c8c0b82SPatrick Mooney 	if (span->vs_size <= target_sz) {
3067c8c0b82SPatrick Mooney 		avl_remove(by_size, span);
3077c8c0b82SPatrick Mooney 		avl_remove(by_addr, span);
3087c8c0b82SPatrick Mooney 
3097c8c0b82SPatrick Mooney 		return (span);
3107c8c0b82SPatrick Mooney 	} else {
3117c8c0b82SPatrick Mooney 		/* Split off adequate chunk from larger span */
3127c8c0b82SPatrick Mooney 		uintptr_t start = span->vs_addr + span->vs_size - target_sz;
3137c8c0b82SPatrick Mooney 
3147c8c0b82SPatrick Mooney 		avl_remove(by_size, span);
3157c8c0b82SPatrick Mooney 		span->vs_size -= target_sz;
3167c8c0b82SPatrick Mooney 		avl_add(by_size, span);
3177c8c0b82SPatrick Mooney 
3187c8c0b82SPatrick Mooney 		vmmr_span_t *split_span =
3197c8c0b82SPatrick Mooney 		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
3207c8c0b82SPatrick Mooney 		split_span->vs_addr = start;
3217c8c0b82SPatrick Mooney 		split_span->vs_size = target_sz;
3227c8c0b82SPatrick Mooney 
3237c8c0b82SPatrick Mooney 		return (split_span);
3247c8c0b82SPatrick Mooney 	}
3257c8c0b82SPatrick Mooney }
3267c8c0b82SPatrick Mooney 
3277c8c0b82SPatrick Mooney void
3287c8c0b82SPatrick Mooney vmmr_init()
3297c8c0b82SPatrick Mooney {
3307c8c0b82SPatrick Mooney 	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
3317c8c0b82SPatrick Mooney 
3327c8c0b82SPatrick Mooney 	/*
3337c8c0b82SPatrick Mooney 	 * `vmm_total_limit` represents the absolute maximum size of the VMM
3347c8c0b82SPatrick Mooney 	 * memory reservoir.  It is meant to provide some measure of protection
3357c8c0b82SPatrick Mooney 	 * against an operator pushing the system into unrecoverable memory
3367c8c0b82SPatrick Mooney 	 * starvation through explicit or transient additions to the reservoir.
3377c8c0b82SPatrick Mooney 	 *
3387c8c0b82SPatrick Mooney 	 * There will be many situations where this limit would be inadequate to
3397c8c0b82SPatrick Mooney 	 * prevent kernel memory starvation in the face of certain operator
3407c8c0b82SPatrick Mooney 	 * actions.  It is a balance to be struck between safety and allowing
3417c8c0b82SPatrick Mooney 	 * large systems to reach high utilization.
3427c8c0b82SPatrick Mooney 	 *
3437c8c0b82SPatrick Mooney 	 * The value is based off of pages_pp_maximum: "Number of currently
3447c8c0b82SPatrick Mooney 	 * available pages that cannot be 'locked'".  It is sized as all of
3457c8c0b82SPatrick Mooney 	 * `physmem` less 120% of `pages_pp_maximum`.
3467c8c0b82SPatrick Mooney 	 */
3477c8c0b82SPatrick Mooney 	vmmr_total_limit =
3487c8c0b82SPatrick Mooney 	    (((physmem * 10)  - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
3497c8c0b82SPatrick Mooney 
3507c8c0b82SPatrick Mooney 	vmmr_empty_last = 0;
3517c8c0b82SPatrick Mooney 	vmmr_free_sz = 0;
3527c8c0b82SPatrick Mooney 	vmmr_alloc_sz = 0;
3537c8c0b82SPatrick Mooney 	vmmr_empty_sz = 0;
3547c8c0b82SPatrick Mooney 	vmmr_adding_sz = 0;
3557c8c0b82SPatrick Mooney 	vmmr_free_transient_sz = 0;
3567c8c0b82SPatrick Mooney 	vmmr_alloc_transient_sz = 0;
3577c8c0b82SPatrick Mooney 
3587c8c0b82SPatrick Mooney 	vmmr_tp_init(&vmmr_free_tp);
3597c8c0b82SPatrick Mooney 	vmmr_tp_init(&vmmr_empty_tp);
3607c8c0b82SPatrick Mooney 
3617c8c0b82SPatrick Mooney 	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
3627c8c0b82SPatrick Mooney 	    offsetof(vmmr_region_t, vr_node));
3637c8c0b82SPatrick Mooney 
3647c8c0b82SPatrick Mooney 	/* Grab a chunk of VA for the reservoir */
3657c8c0b82SPatrick Mooney 	vmmr_va_sz = physmem * PAGESIZE;
3667c8c0b82SPatrick Mooney 	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
3677c8c0b82SPatrick Mooney }
3687c8c0b82SPatrick Mooney 
3697c8c0b82SPatrick Mooney void
3707c8c0b82SPatrick Mooney vmmr_fini()
3717c8c0b82SPatrick Mooney {
3727c8c0b82SPatrick Mooney 	mutex_enter(&vmmr_lock);
3737c8c0b82SPatrick Mooney 	VERIFY3U(vmmr_alloc_sz, ==, 0);
3747c8c0b82SPatrick Mooney 	VERIFY3U(vmmr_free_sz, ==, 0);
3757c8c0b82SPatrick Mooney 	VERIFY3U(vmmr_adding_sz, ==, 0);
3767c8c0b82SPatrick Mooney 	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
3777c8c0b82SPatrick Mooney 	VERIFY3U(vmmr_free_transient_sz, ==, 0);
3787c8c0b82SPatrick Mooney 	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
3797c8c0b82SPatrick Mooney 	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
3807c8c0b82SPatrick Mooney 	VERIFY(list_is_empty(&vmmr_alloc_regions));
3817c8c0b82SPatrick Mooney 
3827c8c0b82SPatrick Mooney 	vmmr_tp_destroy(&vmmr_free_tp);
3837c8c0b82SPatrick Mooney 	vmmr_tp_destroy(&vmmr_empty_tp);
3847c8c0b82SPatrick Mooney 	list_destroy(&vmmr_alloc_regions);
3857c8c0b82SPatrick Mooney 
3867c8c0b82SPatrick Mooney 	/* Release reservoir VA chunk */
3877c8c0b82SPatrick Mooney 	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
3887c8c0b82SPatrick Mooney 	vmmr_va = 0;
3897c8c0b82SPatrick Mooney 	vmmr_va_sz = 0;
3907c8c0b82SPatrick Mooney 	vmmr_total_limit = 0;
3917c8c0b82SPatrick Mooney 	vmmr_empty_last = 0;
3927c8c0b82SPatrick Mooney 
3937c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
3947c8c0b82SPatrick Mooney 	mutex_destroy(&vmmr_lock);
3957c8c0b82SPatrick Mooney }
3967c8c0b82SPatrick Mooney 
3977c8c0b82SPatrick Mooney bool
3987c8c0b82SPatrick Mooney vmmr_is_empty()
3997c8c0b82SPatrick Mooney {
4007c8c0b82SPatrick Mooney 	mutex_enter(&vmmr_lock);
4017c8c0b82SPatrick Mooney 	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
4027c8c0b82SPatrick Mooney 	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
4037c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
4047c8c0b82SPatrick Mooney 	return (res);
4057c8c0b82SPatrick Mooney }
4067c8c0b82SPatrick Mooney 
4077c8c0b82SPatrick Mooney int
4087c8c0b82SPatrick Mooney vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
4097c8c0b82SPatrick Mooney {
4107c8c0b82SPatrick Mooney 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
4117c8c0b82SPatrick Mooney 
4127c8c0b82SPatrick Mooney 	if (!transient) {
4137c8c0b82SPatrick Mooney 		mutex_enter(&vmmr_lock);
4147c8c0b82SPatrick Mooney 		if (sz > vmmr_free_sz) {
4157c8c0b82SPatrick Mooney 			mutex_exit(&vmmr_lock);
4167c8c0b82SPatrick Mooney 			return (ENOSPC);
4177c8c0b82SPatrick Mooney 		}
4187c8c0b82SPatrick Mooney 	} else {
4197c8c0b82SPatrick Mooney 		int err;
4207c8c0b82SPatrick Mooney 
4217c8c0b82SPatrick Mooney 		err = vmmr_add(sz, true);
4227c8c0b82SPatrick Mooney 		if (err != 0) {
4237c8c0b82SPatrick Mooney 			return (err);
4247c8c0b82SPatrick Mooney 		}
4257c8c0b82SPatrick Mooney 		mutex_enter(&vmmr_lock);
4267c8c0b82SPatrick Mooney 		VERIFY3U(vmmr_free_transient_sz, >=, sz);
4277c8c0b82SPatrick Mooney 	}
4287c8c0b82SPatrick Mooney 
4297c8c0b82SPatrick Mooney 	vmmr_region_t *region;
4307c8c0b82SPatrick Mooney 	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
4317c8c0b82SPatrick Mooney 	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
4327c8c0b82SPatrick Mooney 	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
4337c8c0b82SPatrick Mooney 	region->vr_size = sz;
4347c8c0b82SPatrick Mooney 
4357c8c0b82SPatrick Mooney 	size_t remain = sz;
4367c8c0b82SPatrick Mooney 	uintptr_t map_at = 0;
4377c8c0b82SPatrick Mooney 	while (remain > 0) {
4387c8c0b82SPatrick Mooney 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
4397c8c0b82SPatrick Mooney 
4407c8c0b82SPatrick Mooney 		/*
4417c8c0b82SPatrick Mooney 		 * We have already ensured that adequate free memory is present
4427c8c0b82SPatrick Mooney 		 * in the reservoir for this allocation.
4437c8c0b82SPatrick Mooney 		 */
4447c8c0b82SPatrick Mooney 		VERIFY3P(span, !=, NULL);
4457c8c0b82SPatrick Mooney 		ASSERT3U(span->vs_size, <=, remain);
4467c8c0b82SPatrick Mooney 
4477c8c0b82SPatrick Mooney 		span->vs_region_addr = map_at;
4487c8c0b82SPatrick Mooney 		avl_add(&region->vr_spans, span);
4497c8c0b82SPatrick Mooney 		map_at += span->vs_size;
4507c8c0b82SPatrick Mooney 		remain -= span->vs_size;
4517c8c0b82SPatrick Mooney 	}
4527c8c0b82SPatrick Mooney 
4537c8c0b82SPatrick Mooney 	if (!transient) {
4547c8c0b82SPatrick Mooney 		vmmr_free_sz -= sz;
4557c8c0b82SPatrick Mooney 		vmmr_alloc_sz += sz;
4567c8c0b82SPatrick Mooney 	} else {
4577c8c0b82SPatrick Mooney 		vmmr_free_transient_sz -= sz;
4587c8c0b82SPatrick Mooney 		vmmr_alloc_transient_sz += sz;
4597c8c0b82SPatrick Mooney 		region->vr_transient = true;
4607c8c0b82SPatrick Mooney 	}
4617c8c0b82SPatrick Mooney 	list_insert_tail(&vmmr_alloc_regions, region);
4627c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
4637c8c0b82SPatrick Mooney 
4647c8c0b82SPatrick Mooney 	*resp = region;
4657c8c0b82SPatrick Mooney 	return (0);
4667c8c0b82SPatrick Mooney }
4677c8c0b82SPatrick Mooney 
4687c8c0b82SPatrick Mooney void *
4697c8c0b82SPatrick Mooney vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
4707c8c0b82SPatrick Mooney {
4717c8c0b82SPatrick Mooney 	/* just use KPM region for now */
4727c8c0b82SPatrick Mooney 	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
4737c8c0b82SPatrick Mooney }
4747c8c0b82SPatrick Mooney 
4757c8c0b82SPatrick Mooney pfn_t
4767c8c0b82SPatrick Mooney vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
4777c8c0b82SPatrick Mooney {
4787c8c0b82SPatrick Mooney 	VERIFY3U(off & PAGEOFFSET, ==, 0);
4797c8c0b82SPatrick Mooney 	VERIFY3U(off, <, region->vr_size);
4807c8c0b82SPatrick Mooney 
4817c8c0b82SPatrick Mooney 	vmmr_span_t search = {
4827c8c0b82SPatrick Mooney 		.vs_region_addr = off
4837c8c0b82SPatrick Mooney 	};
4847c8c0b82SPatrick Mooney 	avl_index_t where;
4857c8c0b82SPatrick Mooney 	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
4867c8c0b82SPatrick Mooney 
4877c8c0b82SPatrick Mooney 	if (span == NULL) {
4887c8c0b82SPatrick Mooney 		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
4897c8c0b82SPatrick Mooney 		ASSERT3P(span, !=, NULL);
4907c8c0b82SPatrick Mooney 	}
4917c8c0b82SPatrick Mooney 	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
4927c8c0b82SPatrick Mooney 	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
4937c8c0b82SPatrick Mooney 	VERIFY(pp != NULL);
4947c8c0b82SPatrick Mooney 	return (pp->p_pagenum);
4957c8c0b82SPatrick Mooney }
4967c8c0b82SPatrick Mooney 
4977c8c0b82SPatrick Mooney void
4987c8c0b82SPatrick Mooney vmmr_free(vmmr_region_t *region)
4997c8c0b82SPatrick Mooney {
5007c8c0b82SPatrick Mooney 	mutex_enter(&vmmr_lock);
5017c8c0b82SPatrick Mooney 	if (!region->vr_transient) {
5027c8c0b82SPatrick Mooney 		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
5037c8c0b82SPatrick Mooney 	} else {
5047c8c0b82SPatrick Mooney 		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
5057c8c0b82SPatrick Mooney 	}
5067c8c0b82SPatrick Mooney 	list_remove(&vmmr_alloc_regions, region);
5077c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
5087c8c0b82SPatrick Mooney 
5097c8c0b82SPatrick Mooney 	/* Zero the contents */
5107c8c0b82SPatrick Mooney 	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
5117c8c0b82SPatrick Mooney 		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
5127c8c0b82SPatrick Mooney 	}
5137c8c0b82SPatrick Mooney 
5147c8c0b82SPatrick Mooney 	mutex_enter(&vmmr_lock);
5157c8c0b82SPatrick Mooney 
5167c8c0b82SPatrick Mooney 	/* Put the contained span(s) back in the free pool */
5177c8c0b82SPatrick Mooney 	void *cookie = NULL;
5187c8c0b82SPatrick Mooney 	vmmr_span_t *span;
5197c8c0b82SPatrick Mooney 	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
5207c8c0b82SPatrick Mooney 		span->vs_region_addr = 0;
5217c8c0b82SPatrick Mooney 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
5227c8c0b82SPatrick Mooney 	}
5237c8c0b82SPatrick Mooney 	avl_destroy(&region->vr_spans);
5247c8c0b82SPatrick Mooney 	if (!region->vr_transient) {
5257c8c0b82SPatrick Mooney 		vmmr_free_sz += region->vr_size;
5267c8c0b82SPatrick Mooney 		vmmr_alloc_sz -= region->vr_size;
5277c8c0b82SPatrick Mooney 	} else {
5287c8c0b82SPatrick Mooney 		vmmr_free_transient_sz += region->vr_size;
5297c8c0b82SPatrick Mooney 		vmmr_alloc_transient_sz -= region->vr_size;
5307c8c0b82SPatrick Mooney 	}
5317c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
5327c8c0b82SPatrick Mooney 
5337c8c0b82SPatrick Mooney 	if (region->vr_transient) {
534*e0994bd2SPatrick Mooney 		/*
535*e0994bd2SPatrick Mooney 		 * Since the transient capacity was previously allocated for
536*e0994bd2SPatrick Mooney 		 * this region, its removal should not fail.
537*e0994bd2SPatrick Mooney 		 */
538*e0994bd2SPatrick Mooney 		VERIFY0(vmmr_remove(region->vr_size, true));
5397c8c0b82SPatrick Mooney 	}
5407c8c0b82SPatrick Mooney 	kmem_free(region, sizeof (*region));
5417c8c0b82SPatrick Mooney }
5427c8c0b82SPatrick Mooney 
5437c8c0b82SPatrick Mooney static void
5447c8c0b82SPatrick Mooney vmmr_destroy_pages(vmmr_span_t *span)
5457c8c0b82SPatrick Mooney {
5467c8c0b82SPatrick Mooney 	const uintptr_t end = span->vs_addr + span->vs_size;
5477c8c0b82SPatrick Mooney 	struct vnode *vp = &kvps[KV_VVP];
5487c8c0b82SPatrick Mooney 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
5497c8c0b82SPatrick Mooney 		page_t *pp;
5507c8c0b82SPatrick Mooney 
5517c8c0b82SPatrick Mooney 		/* Page-free logic cribbed from segkmem_xfree(): */
5527c8c0b82SPatrick Mooney 		pp = page_find(vp, (u_offset_t)pos);
5537c8c0b82SPatrick Mooney 		VERIFY(pp != NULL);
5547c8c0b82SPatrick Mooney 		if (!page_tryupgrade(pp)) {
5557c8c0b82SPatrick Mooney 			/*
5567c8c0b82SPatrick Mooney 			 * Some other thread has a sharelock. Wait for
5577c8c0b82SPatrick Mooney 			 * it to drop the lock so we can free this page.
5587c8c0b82SPatrick Mooney 			 */
5597c8c0b82SPatrick Mooney 			page_unlock(pp);
5607c8c0b82SPatrick Mooney 			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
5617c8c0b82SPatrick Mooney 		}
5627c8c0b82SPatrick Mooney 
5637c8c0b82SPatrick Mooney 		/*
5647c8c0b82SPatrick Mooney 		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
5657c8c0b82SPatrick Mooney 		 * That will be taken care of later via page_unresv().
5667c8c0b82SPatrick Mooney 		 */
5677c8c0b82SPatrick Mooney 		pp->p_lckcnt = 0;
5687c8c0b82SPatrick Mooney 		page_destroy(pp, 0);
5697c8c0b82SPatrick Mooney 	}
5707c8c0b82SPatrick Mooney }
5717c8c0b82SPatrick Mooney 
5727c8c0b82SPatrick Mooney static int
5737c8c0b82SPatrick Mooney vmmr_alloc_pages(const vmmr_span_t *span)
5747c8c0b82SPatrick Mooney {
5757c8c0b82SPatrick Mooney 	struct seg kseg = {
5767c8c0b82SPatrick Mooney 		.s_as = &kas
5777c8c0b82SPatrick Mooney 	};
5787c8c0b82SPatrick Mooney 	struct vnode *vp = &kvps[KV_VVP];
5797c8c0b82SPatrick Mooney 
5807c8c0b82SPatrick Mooney 	const uintptr_t end = span->vs_addr + span->vs_size;
5817c8c0b82SPatrick Mooney 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
5827c8c0b82SPatrick Mooney 		page_t *pp;
5837c8c0b82SPatrick Mooney 
5847c8c0b82SPatrick Mooney 		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
5857c8c0b82SPatrick Mooney 		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
5867c8c0b82SPatrick Mooney 
5877c8c0b82SPatrick Mooney 		if (pp == NULL) {
5887c8c0b82SPatrick Mooney 			/* Destroy any already-created pages */
5897c8c0b82SPatrick Mooney 			if (pos != span->vs_addr) {
5907c8c0b82SPatrick Mooney 				vmmr_span_t destroy_span = {
5917c8c0b82SPatrick Mooney 					.vs_addr = span->vs_addr,
5927c8c0b82SPatrick Mooney 					.vs_size = pos - span->vs_addr,
5937c8c0b82SPatrick Mooney 				};
5947c8c0b82SPatrick Mooney 
5957c8c0b82SPatrick Mooney 				vmmr_destroy_pages(&destroy_span);
5967c8c0b82SPatrick Mooney 			}
5977c8c0b82SPatrick Mooney 			return (ENOMEM);
5987c8c0b82SPatrick Mooney 		}
5997c8c0b82SPatrick Mooney 
6007c8c0b82SPatrick Mooney 		/* mimic page state from segkmem */
6017c8c0b82SPatrick Mooney 		ASSERT(PAGE_EXCL(pp));
6027c8c0b82SPatrick Mooney 		page_io_unlock(pp);
6037c8c0b82SPatrick Mooney 		pp->p_lckcnt = 1;
6047c8c0b82SPatrick Mooney 		page_downgrade(pp);
6057c8c0b82SPatrick Mooney 
6067c8c0b82SPatrick Mooney 		/* pre-zero the page */
6077c8c0b82SPatrick Mooney 		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
6087c8c0b82SPatrick Mooney 	}
6097c8c0b82SPatrick Mooney 
6107c8c0b82SPatrick Mooney 	return (0);
6117c8c0b82SPatrick Mooney }
6127c8c0b82SPatrick Mooney 
6137c8c0b82SPatrick Mooney static int
6147c8c0b82SPatrick Mooney vmmr_resv_wait()
6157c8c0b82SPatrick Mooney {
6167c8c0b82SPatrick Mooney 	if (delay_sig(hz >> 2) != 0) {
6177c8c0b82SPatrick Mooney 		/* bail due to interruption */
6187c8c0b82SPatrick Mooney 		return (0);
6197c8c0b82SPatrick Mooney 	}
6207c8c0b82SPatrick Mooney 	return (1);
6217c8c0b82SPatrick Mooney }
6227c8c0b82SPatrick Mooney 
6237c8c0b82SPatrick Mooney static void
6247c8c0b82SPatrick Mooney vmmr_remove_raw(size_t sz)
6257c8c0b82SPatrick Mooney {
6267c8c0b82SPatrick Mooney 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
6277c8c0b82SPatrick Mooney 	VERIFY(MUTEX_HELD(&vmmr_lock));
6287c8c0b82SPatrick Mooney 
6297c8c0b82SPatrick Mooney 	size_t remain = sz;
6307c8c0b82SPatrick Mooney 	while (remain > 0) {
6317c8c0b82SPatrick Mooney 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
6327c8c0b82SPatrick Mooney 
6337c8c0b82SPatrick Mooney 		/*
6347c8c0b82SPatrick Mooney 		 * The caller must ensure that at least `sz` amount is present
6357c8c0b82SPatrick Mooney 		 * in the free treepair.
6367c8c0b82SPatrick Mooney 		 */
6377c8c0b82SPatrick Mooney 		VERIFY3P(span, !=, NULL);
6387c8c0b82SPatrick Mooney 		ASSERT3U(span->vs_size, <=, remain);
6397c8c0b82SPatrick Mooney 
6407c8c0b82SPatrick Mooney 		/* TODO: perhaps arrange to destroy pages outside the lock? */
6417c8c0b82SPatrick Mooney 		vmmr_destroy_pages(span);
6427c8c0b82SPatrick Mooney 
6437c8c0b82SPatrick Mooney 		remain -= span->vs_size;
6447c8c0b82SPatrick Mooney 		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
6457c8c0b82SPatrick Mooney 	}
6467c8c0b82SPatrick Mooney 
6477c8c0b82SPatrick Mooney 	vmmr_empty_sz += sz;
6487c8c0b82SPatrick Mooney }
6497c8c0b82SPatrick Mooney 
6507c8c0b82SPatrick Mooney int
6517c8c0b82SPatrick Mooney vmmr_add(size_t sz, bool transient)
6527c8c0b82SPatrick Mooney {
6537c8c0b82SPatrick Mooney 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
6547c8c0b82SPatrick Mooney 
6557c8c0b82SPatrick Mooney 	mutex_enter(&vmmr_lock);
6567c8c0b82SPatrick Mooney 	/*
6577c8c0b82SPatrick Mooney 	 * Make sure that the amount added is not going to breach the limits
6587c8c0b82SPatrick Mooney 	 * we've chosen
6597c8c0b82SPatrick Mooney 	 */
6607c8c0b82SPatrick Mooney 	const size_t current_total =
6617c8c0b82SPatrick Mooney 	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
6627c8c0b82SPatrick Mooney 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
6637c8c0b82SPatrick Mooney 	if ((current_total + sz) < current_total) {
6647c8c0b82SPatrick Mooney 		mutex_exit(&vmmr_lock);
6657c8c0b82SPatrick Mooney 		return (EOVERFLOW);
6667c8c0b82SPatrick Mooney 	}
6677c8c0b82SPatrick Mooney 	if ((current_total + sz) > vmmr_total_limit) {
6687c8c0b82SPatrick Mooney 		mutex_exit(&vmmr_lock);
6697c8c0b82SPatrick Mooney 		return (ENOSPC);
6707c8c0b82SPatrick Mooney 	}
6717c8c0b82SPatrick Mooney 	vmmr_adding_sz += sz;
6727c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
6737c8c0b82SPatrick Mooney 
6747c8c0b82SPatrick Mooney 	/* Wait for enough pages to become available */
6757c8c0b82SPatrick Mooney 	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
6767c8c0b82SPatrick Mooney 		mutex_enter(&vmmr_lock);
6777c8c0b82SPatrick Mooney 		vmmr_adding_sz -= sz;
6787c8c0b82SPatrick Mooney 		mutex_exit(&vmmr_lock);
6797c8c0b82SPatrick Mooney 
6807c8c0b82SPatrick Mooney 		return (EINTR);
6817c8c0b82SPatrick Mooney 	}
6827c8c0b82SPatrick Mooney 
6837c8c0b82SPatrick Mooney 	mutex_enter(&vmmr_lock);
6847c8c0b82SPatrick Mooney 	size_t added = 0;
6857c8c0b82SPatrick Mooney 	size_t remain = sz;
6867c8c0b82SPatrick Mooney 	while (added < sz) {
6877c8c0b82SPatrick Mooney 		vmmr_span_t *span = NULL;
6887c8c0b82SPatrick Mooney 
6897c8c0b82SPatrick Mooney 		if (vmmr_empty_sz > 0) {
6907c8c0b82SPatrick Mooney 			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
6917c8c0b82SPatrick Mooney 
6927c8c0b82SPatrick Mooney 			vmmr_empty_sz -= span->vs_size;
6937c8c0b82SPatrick Mooney 		} else {
6947c8c0b82SPatrick Mooney 			/*
6957c8c0b82SPatrick Mooney 			 * No empty space to fill with new pages, so just tack
6967c8c0b82SPatrick Mooney 			 * it on at the end instead.
6977c8c0b82SPatrick Mooney 			 */
6987c8c0b82SPatrick Mooney 			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
6997c8c0b82SPatrick Mooney 			span->vs_addr = vmmr_empty_last;
7007c8c0b82SPatrick Mooney 			span->vs_size = remain;
7017c8c0b82SPatrick Mooney 			vmmr_empty_last += remain;
7027c8c0b82SPatrick Mooney 		}
7037c8c0b82SPatrick Mooney 		VERIFY3P(span, !=, NULL);
7047c8c0b82SPatrick Mooney 
7057c8c0b82SPatrick Mooney 
7067c8c0b82SPatrick Mooney 		/* Allocate the actual pages to back this span */
7077c8c0b82SPatrick Mooney 		mutex_exit(&vmmr_lock);
7087c8c0b82SPatrick Mooney 		int err = vmmr_alloc_pages(span);
7097c8c0b82SPatrick Mooney 		mutex_enter(&vmmr_lock);
7107c8c0b82SPatrick Mooney 
7117c8c0b82SPatrick Mooney 		/*
7127c8c0b82SPatrick Mooney 		 * If an error is encountered during page allocation for the
7137c8c0b82SPatrick Mooney 		 * span, unwind any progress made by the addition request.
7147c8c0b82SPatrick Mooney 		 */
7157c8c0b82SPatrick Mooney 		if (err != 0) {
7167c8c0b82SPatrick Mooney 			/*
7177c8c0b82SPatrick Mooney 			 * Without pages allocated to this span, it is now
7187c8c0b82SPatrick Mooney 			 * tracked as empty.
7197c8c0b82SPatrick Mooney 			 */
7207c8c0b82SPatrick Mooney 			vmmr_empty_sz += span->vs_size;
7217c8c0b82SPatrick Mooney 			vmmr_tp_insert_concat(span, &vmmr_empty_tp);
7227c8c0b82SPatrick Mooney 
7237c8c0b82SPatrick Mooney 			if (added != 0) {
7247c8c0b82SPatrick Mooney 				vmmr_remove_raw(added);
7257c8c0b82SPatrick Mooney 			}
7267c8c0b82SPatrick Mooney 
7277c8c0b82SPatrick Mooney 			vmmr_adding_sz -= sz;
7287c8c0b82SPatrick Mooney 			mutex_exit(&vmmr_lock);
7297c8c0b82SPatrick Mooney 
7307c8c0b82SPatrick Mooney 			page_unresv(sz >> PAGESHIFT);
7317c8c0b82SPatrick Mooney 			return (err);
7327c8c0b82SPatrick Mooney 		}
7337c8c0b82SPatrick Mooney 
7347c8c0b82SPatrick Mooney 		/*
7357c8c0b82SPatrick Mooney 		 * The allocated-page-bearing span is placed in the "free"
7367c8c0b82SPatrick Mooney 		 * treepair now, but is not officially exposed for consumption
7377c8c0b82SPatrick Mooney 		 * until `vmm_free_sz` or `vmm_free_transient_sz` are updated.
7387c8c0b82SPatrick Mooney 		 *
7397c8c0b82SPatrick Mooney 		 * This allows us to unwind the allocation in case of a failure
7407c8c0b82SPatrick Mooney 		 * without the risk of the freshly added span(s) being snapped
7417c8c0b82SPatrick Mooney 		 * up by a consumer already.
7427c8c0b82SPatrick Mooney 		 */
7437c8c0b82SPatrick Mooney 		added += span->vs_size;
7447c8c0b82SPatrick Mooney 		remain -= span->vs_size;
7457c8c0b82SPatrick Mooney 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
7467c8c0b82SPatrick Mooney 	}
7477c8c0b82SPatrick Mooney 
7487c8c0b82SPatrick Mooney 	/* Make the added memory usable by exposing it to the size accounting */
7497c8c0b82SPatrick Mooney 	if (!transient) {
7507c8c0b82SPatrick Mooney 		vmmr_free_sz += added;
7517c8c0b82SPatrick Mooney 	} else {
7527c8c0b82SPatrick Mooney 		vmmr_free_transient_sz += added;
7537c8c0b82SPatrick Mooney 	}
7547c8c0b82SPatrick Mooney 	ASSERT3U(added, ==, sz);
7557c8c0b82SPatrick Mooney 	vmmr_adding_sz -= added;
7567c8c0b82SPatrick Mooney 
7577c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
7587c8c0b82SPatrick Mooney 	return (0);
7597c8c0b82SPatrick Mooney }
7607c8c0b82SPatrick Mooney 
7617c8c0b82SPatrick Mooney int
7627c8c0b82SPatrick Mooney vmmr_remove(size_t sz, bool transient)
7637c8c0b82SPatrick Mooney {
7647c8c0b82SPatrick Mooney 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
7657c8c0b82SPatrick Mooney 
7667c8c0b82SPatrick Mooney 	mutex_enter(&vmmr_lock);
7677c8c0b82SPatrick Mooney 	if ((!transient && sz > vmmr_free_sz) ||
7687c8c0b82SPatrick Mooney 	    (transient && sz > vmmr_free_transient_sz)) {
7697c8c0b82SPatrick Mooney 		mutex_exit(&vmmr_lock);
7707c8c0b82SPatrick Mooney 		return (ENOSPC);
7717c8c0b82SPatrick Mooney 	}
7727c8c0b82SPatrick Mooney 
7737c8c0b82SPatrick Mooney 	vmmr_remove_raw(sz);
7747c8c0b82SPatrick Mooney 
7757c8c0b82SPatrick Mooney 	if (!transient) {
7767c8c0b82SPatrick Mooney 		vmmr_free_sz -= sz;
7777c8c0b82SPatrick Mooney 	} else {
7787c8c0b82SPatrick Mooney 		vmmr_free_transient_sz -= sz;
7797c8c0b82SPatrick Mooney 	}
7807c8c0b82SPatrick Mooney 	mutex_exit(&vmmr_lock);
7817c8c0b82SPatrick Mooney 	page_unresv(sz >> PAGESHIFT);
7827c8c0b82SPatrick Mooney 	return (0);
7837c8c0b82SPatrick Mooney }
7847c8c0b82SPatrick Mooney 
7857c8c0b82SPatrick Mooney int
7867c8c0b82SPatrick Mooney vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
7877c8c0b82SPatrick Mooney {
7887c8c0b82SPatrick Mooney 	switch (cmd) {
7897c8c0b82SPatrick Mooney 	case VMM_RESV_QUERY: {
7907c8c0b82SPatrick Mooney 		struct vmm_resv_query res;
7917c8c0b82SPatrick Mooney 		void *datap = (void *)(uintptr_t)arg;
7927c8c0b82SPatrick Mooney 
7937c8c0b82SPatrick Mooney 		/* For now, anyone in GZ can query */
7947c8c0b82SPatrick Mooney 		if (crgetzoneid(cr) != GLOBAL_ZONEID) {
7957c8c0b82SPatrick Mooney 			return (EPERM);
7967c8c0b82SPatrick Mooney 		}
7977c8c0b82SPatrick Mooney 		mutex_enter(&vmmr_lock);
7987c8c0b82SPatrick Mooney 		res.vrq_free_sz = vmmr_free_sz;
7997c8c0b82SPatrick Mooney 		res.vrq_alloc_sz = vmmr_alloc_sz;
8007c8c0b82SPatrick Mooney 		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
8017c8c0b82SPatrick Mooney 		res.vrq_limit = vmmr_total_limit;
8027c8c0b82SPatrick Mooney 		mutex_exit(&vmmr_lock);
8037c8c0b82SPatrick Mooney 		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
8047c8c0b82SPatrick Mooney 			return (EFAULT);
8057c8c0b82SPatrick Mooney 		}
8067c8c0b82SPatrick Mooney 		break;
8077c8c0b82SPatrick Mooney 	}
8087c8c0b82SPatrick Mooney 	case VMM_RESV_ADD: {
8097c8c0b82SPatrick Mooney 		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
8107c8c0b82SPatrick Mooney 			return (EPERM);
8117c8c0b82SPatrick Mooney 		}
8127c8c0b82SPatrick Mooney 		return (vmmr_add((size_t)arg, false));
8137c8c0b82SPatrick Mooney 	}
8147c8c0b82SPatrick Mooney 	case VMM_RESV_REMOVE: {
8157c8c0b82SPatrick Mooney 		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
8167c8c0b82SPatrick Mooney 			return (EPERM);
8177c8c0b82SPatrick Mooney 		}
8187c8c0b82SPatrick Mooney 		return (vmmr_remove((size_t)arg, false));
8197c8c0b82SPatrick Mooney 	}
8207c8c0b82SPatrick Mooney 	default:
8217c8c0b82SPatrick Mooney 		return (ENOTTY);
8227c8c0b82SPatrick Mooney 	}
8237c8c0b82SPatrick Mooney 	return (0);
8247c8c0b82SPatrick Mooney }
825