/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2021 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs.  This memory is unavailable for general use by the rest of the
 * system.  Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available.  Since the pages stored in the
 * reservoir are pre-zeroed, they can be used immediately when allocated to a
 * guest.  When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system.  In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir.  When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally.  Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations.  When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made.  When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation.  This allows both allocation types to coexist without too much
 * additional machinery.
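 *
 * As a rough sketch of that accounting, in terms of the routines defined
 * later in this file: a transient vmmr_alloc(sz, true, &region) first
 * performs vmmr_add(sz, true) and then carves `sz` out of the free spans,
 * while the matching vmmr_free(region) returns those spans to the free pool
 * and then performs vmmr_remove(sz, true).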
 *
 *
 * Administration
 *
 * Operators may increase, decrease, and query the amount of memory allocated
 * to the reservoir, and from it to VMs, via ioctls against the vmmctl device.
 * The total amount added to the reservoir is arbitrarily limited at this time
 * by `vmmr_total_limit` which defaults to 80% of physmem.  This is done to
 * prevent the reservoir from inadvertently growing to a size where the system
 * has inadequate memory to make forward progress.  Memory may only be removed
 * from the reservoir when it is free (not allocated by any guest VMs).
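 *
 * As a hypothetical userland sketch (assuming the control device is exposed
 * as /dev/vmmctl, with error handling elided), growing, querying, and then
 * shrinking the reservoir by 1 GiB might look like:
 *
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *	ioctl(fd, VMM_RESV_ADD, (size_t)(1024 * 1024 * 1024));
 *	struct vmm_resv_query q;
 *	ioctl(fd, VMM_RESV_QUERY, &q);
 *	ioctl(fd, VMM_RESV_REMOVE, (size_t)(1024 * 1024 * 1024));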
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object).  This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple.  It
 * means that spans of pages, either free or allocated, need only to store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span.  Associating the
 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
 * properly tracked as KAS pages, but be excluded from normal dumps (unless the
 * operator has chosen to dump all of RAM).
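 *
 * For example, a free span { .vs_addr = off, .vs_size = len } refers to the
 * pages found at page_find(&kvps[KV_VVP], off) through
 * page_find(&kvps[KV_VVP], off + len - PAGESIZE), without any need to track
 * the individual page_t instances.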
 */

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>

#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>

static kmutex_t vmmr_lock;

static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;

static uintptr_t vmmr_empty_last;
/* Upper limit for the size (free + allocated) of the reservoir */
static size_t vmmr_total_limit;

/* VA range allocated from the VMM arena for the mappings */
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;

/* Pair of AVL trees to store set of spans ordered by addr and size */
typedef struct vmmr_treepair {
	avl_tree_t by_addr;
	avl_tree_t by_size;
} vmmr_treepair_t;

/* Spans of free memory in the reservoir */
static vmmr_treepair_t vmmr_free_tp;

/* Spans of empty (not backed by memory) space in the reservoir */
static vmmr_treepair_t vmmr_empty_tp;

/* Regions of memory allocated from the reservoir */
static list_t vmmr_alloc_regions;

struct vmmr_span {
	uintptr_t	vs_addr;
	size_t		vs_size;
	avl_node_t	vs_by_addr;
	avl_node_t	vs_by_size;
	uintptr_t	vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;

struct vmmr_region {
	size_t		vr_size;
	avl_tree_t	vr_spans;
	list_node_t	vr_node;
	bool		vr_transient;
};

static int
vmmr_cmp_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_addr == sb->vs_addr) {
		return (0);
	} else if (sa->vs_addr < sb->vs_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_size(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_size == sb->vs_size) {
		/*
		 * Since discontiguous spans could have the same size in a
		 * by-size tree, differentiate them (as required by AVL) by
		 * address so they can safely coexist while remaining sorted.
		 */
		return (vmmr_cmp_addr(a, b));
	} else if (sa->vs_size < sb->vs_size) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_region_addr == sb->vs_region_addr) {
		return (0);
	} else if (sa->vs_region_addr < sb->vs_region_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_addr));
	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_size));
}

static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
	void *vcp = NULL;
	vmmr_span_t *span;

	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
		/* Freeing spans will be done when tearing down by-size tree */
	}

	/* The destroy cookie must be reset before walking the second tree */
	vcp = NULL;
	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
		kmem_free(span, sizeof (*span));
	}
	avl_destroy(&tree->by_addr);
	avl_destroy(&tree->by_size);
}

/*
 * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
 * span(s).  Such concatenation could result in the `to_add` span being freed,
 * so the caller cannot use it after this returns.
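 *
 * For example, inserting a span covering [0x3000, 0x4000) into a treepair
 * which already holds [0x1000, 0x3000) and [0x4000, 0x6000) results in a
 * single span covering [0x1000, 0x6000).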
 */
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *node;
	avl_index_t where;

	/* This addr should not already exist in the treepair */
	node = avl_find(by_addr, to_add, &where);
	ASSERT3P(node, ==, NULL);

	node = avl_nearest(by_addr, where, AVL_BEFORE);
	if (node != NULL &&
	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
		/* concat with preceding item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_size += to_add->vs_size;
		kmem_free(to_add, sizeof (*to_add));

		/*
		 * Since this now-concatenated span could be adjacent to one
		 * trailing it, fall through to perform that check.
		 */
		to_add = node;
	}

	node = avl_nearest(by_addr, where, AVL_AFTER);
	if (node != NULL &&
	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
		/* concat with trailing item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_addr = to_add->vs_addr;
		node->vs_size += to_add->vs_size;
		avl_add(by_addr, node);
		avl_add(by_size, node);

		kmem_free(to_add, sizeof (*to_add));
		return;
	}

	/* simply insert */
	avl_add(by_addr, to_add);
	avl_add(by_size, to_add);
}

/*
 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
 * the exact target size is not present, but a larger one is.  May return a span
 * with a size smaller than the target if splitting is not an option.
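 *
 * For example, removing 0x2000 from a treepair whose best candidate is a span
 * covering [0x1000, 0x6000) shrinks that span to [0x1000, 0x4000) and returns
 * a newly allocated span covering [0x4000, 0x6000), carved from its tail.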
 */
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *span;
	avl_index_t where;

	ASSERT3U(target_sz, !=, 0);
	ASSERT(!avl_is_empty(by_addr));
	ASSERT(!avl_is_empty(by_size));

	vmmr_span_t search = { .vs_size = target_sz };
	span = avl_find(by_size, &search, &where);
	if (span == NULL) {
		/* Try for a larger span (instead of exact match) */
		span = avl_nearest(by_size, where, AVL_AFTER);
		if (span == NULL) {
			/*
			 * Caller will need to collect several smaller spans in
			 * order to fulfill their request.
			 */
			span = avl_nearest(by_size, where, AVL_BEFORE);
			ASSERT3P(span, !=, NULL);
		}
	}

	if (span->vs_size <= target_sz) {
		avl_remove(by_size, span);
		avl_remove(by_addr, span);

		return (span);
	} else {
		/* Split off adequate chunk from larger span */
		uintptr_t start = span->vs_addr + span->vs_size - target_sz;

		avl_remove(by_size, span);
		span->vs_size -= target_sz;
		avl_add(by_size, span);

		vmmr_span_t *split_span =
		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
		split_span->vs_addr = start;
		split_span->vs_size = target_sz;

		return (split_span);
	}
}

void
vmmr_init()
{
	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
	 * memory reservoir.  It is meant to provide some measure of protection
	 * against an operator pushing the system into unrecoverable memory
	 * starvation through explicit or transient additions to the reservoir.
	 *
	 * There will be many situations where this limit would be inadequate to
	 * prevent kernel memory starvation in the face of certain operator
	 * actions.  It is a balance to be struck between safety and allowing
	 * large systems to reach high utilization.
	 *
	 * The value is based on pages_pp_maximum: "Number of currently
	 * available pages that cannot be 'locked'".  It is sized as all of
	 * `physmem` less 120% of `pages_pp_maximum`.
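	 *
	 * Expressed without the integer scaling used in the calculation
	 * below, that works out (modulo truncation) to:
	 *
	 *	vmmr_total_limit =
	 *	    (physmem - 1.2 * pages_pp_maximum) * PAGESIZE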
	 */
	vmmr_total_limit =
	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;

	vmmr_empty_last = 0;
	vmmr_free_sz = 0;
	vmmr_alloc_sz = 0;
	vmmr_empty_sz = 0;
	vmmr_adding_sz = 0;
	vmmr_free_transient_sz = 0;
	vmmr_alloc_transient_sz = 0;

	vmmr_tp_init(&vmmr_free_tp);
	vmmr_tp_init(&vmmr_empty_tp);

	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
	    offsetof(vmmr_region_t, vr_node));

	/* Grab a chunk of VA for the reservoir */
	vmmr_va_sz = physmem * PAGESIZE;
	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
}

void
vmmr_fini()
{
	mutex_enter(&vmmr_lock);
	VERIFY3U(vmmr_alloc_sz, ==, 0);
	VERIFY3U(vmmr_free_sz, ==, 0);
	VERIFY3U(vmmr_adding_sz, ==, 0);
	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
	VERIFY3U(vmmr_free_transient_sz, ==, 0);
	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
	VERIFY(list_is_empty(&vmmr_alloc_regions));

	vmmr_tp_destroy(&vmmr_free_tp);
	vmmr_tp_destroy(&vmmr_empty_tp);
	list_destroy(&vmmr_alloc_regions);

	/* Release reservoir VA chunk */
	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
	vmmr_va = 0;
	vmmr_va_sz = 0;
	vmmr_total_limit = 0;
	vmmr_empty_last = 0;

	mutex_exit(&vmmr_lock);
	mutex_destroy(&vmmr_lock);
}

bool
vmmr_is_empty()
{
	mutex_enter(&vmmr_lock);
	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
	mutex_exit(&vmmr_lock);
	return (res);
}

int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	if (!transient) {
		mutex_enter(&vmmr_lock);
		if (sz > vmmr_free_sz) {
			mutex_exit(&vmmr_lock);
			return (ENOSPC);
		}
	} else {
		int err;

		err = vmmr_add(sz, true);
		if (err != 0) {
			return (err);
		}
		mutex_enter(&vmmr_lock);
		VERIFY3U(vmmr_free_transient_sz, >=, sz);
	}

	vmmr_region_t *region;
	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
	region->vr_size = sz;

	size_t remain = sz;
	uintptr_t map_at = 0;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * We have already ensured that adequate free memory is present
		 * in the reservoir for this allocation.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		span->vs_region_addr = map_at;
		avl_add(&region->vr_spans, span);
		map_at += span->vs_size;
		remain -= span->vs_size;
	}

	if (!transient) {
		vmmr_free_sz -= sz;
		vmmr_alloc_sz += sz;
	} else {
		vmmr_free_transient_sz -= sz;
		vmmr_alloc_transient_sz += sz;
		region->vr_transient = true;
	}
	list_insert_tail(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	*resp = region;
	return (0);
}

void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
	/* just use KPM region for now */
	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}

pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
	VERIFY3U(off & PAGEOFFSET, ==, 0);
	VERIFY3U(off, <, region->vr_size);

	vmmr_span_t search = {
		.vs_region_addr = off
	};
	avl_index_t where;
	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);

	if (span == NULL) {
		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
		ASSERT3P(span, !=, NULL);
	}
	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
	VERIFY(pp != NULL);
	return (pp->p_pagenum);
}

void
vmmr_free(vmmr_region_t *region)
{
	mutex_enter(&vmmr_lock);
	if (!region->vr_transient) {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
	} else {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
	}
	list_remove(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	/* Zero the contents */
	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
	}

	mutex_enter(&vmmr_lock);

	/* Put the contained span(s) back in the free pool */
	void *cookie = NULL;
	vmmr_span_t *span;
	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
		span->vs_region_addr = 0;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}
	avl_destroy(&region->vr_spans);
	if (!region->vr_transient) {
		vmmr_free_sz += region->vr_size;
		vmmr_alloc_sz -= region->vr_size;
	} else {
		vmmr_free_transient_sz += region->vr_size;
		vmmr_alloc_transient_sz -= region->vr_size;
	}
	mutex_exit(&vmmr_lock);

	if (region->vr_transient) {
		/*
		 * Since the transient capacity was previously allocated for
		 * this region, its removal should not fail.
		 */
		VERIFY0(vmmr_remove(region->vr_size, true));
	}
	kmem_free(region, sizeof (*region));
}

static void
vmmr_destroy_pages(vmmr_span_t *span)
{
	const uintptr_t end = span->vs_addr + span->vs_size;
	struct vnode *vp = &kvps[KV_VVP];
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		/* Page-free logic cribbed from segkmem_xfree(): */
		pp = page_find(vp, (u_offset_t)pos);
		VERIFY(pp != NULL);
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
		}

		/*
		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
		 * That will be taken care of later via page_unresv().
		 */
		pp->p_lckcnt = 0;
		page_destroy(pp, 0);
	}
}

static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
	struct seg kseg = {
		.s_as = &kas
	};
	struct vnode *vp = &kvps[KV_VVP];

	const uintptr_t end = span->vs_addr + span->vs_size;
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));

		if (pp == NULL) {
			/* Destroy any already-created pages */
			if (pos != span->vs_addr) {
				vmmr_span_t destroy_span = {
					.vs_addr = span->vs_addr,
					.vs_size = pos - span->vs_addr,
				};

				vmmr_destroy_pages(&destroy_span);
			}
			return (ENOMEM);
		}

		/* mimic page state from segkmem */
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		pp->p_lckcnt = 1;
		page_downgrade(pp);

		/* pre-zero the page */
		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
	}

	return (0);
}

static int
vmmr_resv_wait()
{
	if (delay_sig(hz >> 2) != 0) {
		/* bail due to interruption */
		return (0);
	}
	return (1);
}

static void
vmmr_remove_raw(size_t sz)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	size_t remain = sz;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * The caller must ensure that at least `sz` amount is present
		 * in the free treepair.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		/* TODO: perhaps arrange to destroy pages outside the lock? */
		vmmr_destroy_pages(span);

		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
	}

	vmmr_empty_sz += sz;
}

int
vmmr_add(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	mutex_enter(&vmmr_lock);
	/*
	 * Make sure that the amount added is not going to breach the limits
	 * we've chosen
	 */
	const size_t current_total =
	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	if ((current_total + sz) < current_total) {
		mutex_exit(&vmmr_lock);
		return (EOVERFLOW);
	}
	if ((current_total + sz) > vmmr_total_limit) {
		mutex_exit(&vmmr_lock);
		return (ENOSPC);
	}
	vmmr_adding_sz += sz;
	mutex_exit(&vmmr_lock);

	/* Wait for enough pages to become available */
	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
		mutex_enter(&vmmr_lock);
		vmmr_adding_sz -= sz;
		mutex_exit(&vmmr_lock);

		return (EINTR);
	}

	mutex_enter(&vmmr_lock);
	size_t added = 0;
	size_t remain = sz;
	while (added < sz) {
		vmmr_span_t *span = NULL;

		if (vmmr_empty_sz > 0) {
			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);

			vmmr_empty_sz -= span->vs_size;
		} else {
			/*
			 * No empty space to fill with new pages, so just tack
			 * it on at the end instead.
			 */
			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
			span->vs_addr = vmmr_empty_last;
			span->vs_size = remain;
			vmmr_empty_last += remain;
		}
		VERIFY3P(span, !=, NULL);

		/* Allocate the actual pages to back this span */
		mutex_exit(&vmmr_lock);
		int err = vmmr_alloc_pages(span);
		mutex_enter(&vmmr_lock);

		/*
		 * If an error is encountered during page allocation for the
		 * span, unwind any progress made by the addition request.
		 */
		if (err != 0) {
			/*
			 * Without pages allocated to this span, it is now
			 * tracked as empty.
			 */
			vmmr_empty_sz += span->vs_size;
			vmmr_tp_insert_concat(span, &vmmr_empty_tp);

			if (added != 0) {
				vmmr_remove_raw(added);
			}

			vmmr_adding_sz -= sz;
			mutex_exit(&vmmr_lock);

			page_unresv(sz >> PAGESHIFT);
			return (err);
		}

		/*
		 * The allocated-page-bearing span is placed in the "free"
		 * treepair now, but is not officially exposed for consumption
		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` are
		 * updated.
		 *
		 * This allows us to unwind the allocation in case of a failure
		 * without the risk of the freshly added span(s) being snapped
		 * up by a consumer already.
		 */
		added += span->vs_size;
		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}

	/* Make the added memory usable by exposing it to the size accounting */
	if (!transient) {
		vmmr_free_sz += added;
	} else {
		vmmr_free_transient_sz += added;
	}
	ASSERT3U(added, ==, sz);
	vmmr_adding_sz -= added;

	mutex_exit(&vmmr_lock);
	return (0);
}

int
vmmr_remove(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	mutex_enter(&vmmr_lock);
	if ((!transient && sz > vmmr_free_sz) ||
	    (transient && sz > vmmr_free_transient_sz)) {
		mutex_exit(&vmmr_lock);
		return (ENOSPC);
	}

	vmmr_remove_raw(sz);

	if (!transient) {
		vmmr_free_sz -= sz;
	} else {
		vmmr_free_transient_sz -= sz;
	}
	mutex_exit(&vmmr_lock);
	page_unresv(sz >> PAGESHIFT);
	return (0);
}

int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	switch (cmd) {
	case VMM_RESV_QUERY: {
		struct vmm_resv_query res;
		void *datap = (void *)(uintptr_t)arg;

		/* For now, anyone in GZ can query */
		if (crgetzoneid(cr) != GLOBAL_ZONEID) {
			return (EPERM);
		}
		mutex_enter(&vmmr_lock);
		res.vrq_free_sz = vmmr_free_sz;
		res.vrq_alloc_sz = vmmr_alloc_sz;
		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
		res.vrq_limit = vmmr_total_limit;
		mutex_exit(&vmmr_lock);
		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
			return (EFAULT);
		}
		break;
	}
	case VMM_RESV_ADD: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}
		return (vmmr_add((size_t)arg, false));
	}
	case VMM_RESV_REMOVE: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}
		return (vmmr_remove((size_t)arg, false));
	}
	default:
		return (ENOTTY);
	}
	return (0);
}
825