xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_reservoir.c (revision 7c8c0b8227679b4684566e408ccc96d6ef7175e9)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2021 Oxide Computer Company
14  */
15 
16 /*
17  * VMM Memory Reservoir
18  *
19  *
20  * In order to make the allocation of large (multi-GiB) chunks of memory
21  * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
22  * operators can set aside a substantial portion of system memory exclusively
23  * for VMs.  This memory is unavailable for general use by the rest of the
24  * system.  Rather than having to scour the freelist, reap kmem caches, or put
25  * pressure on the ARC, bhyve guest memory allocations can quickly determine if
26  * there is adequate reservoir memory available.  Since the pages stored in the
27  * reservoir are pre-zeroed, it can be immediately used when allocated to a
28  * guest.  When the memory is returned to the reservoir, it is zeroed once more
29  * to avoid leaking any sensitive data from that guest.
30  *
31  *
32  * Transient Allocations
33  *
34  * While the explicit reservoir model may work well for some applications,
35  * others may want a more traditional model, where pages for guest memory
36  * objects are allocated on demand, rather than from a pool set aside from the
37  * system.  In this case, the allocation can be made in "transient" mode, where
38  * the memory is allocated normally, even if there is free capacity in the
39  * reservoir.  When use of the transient allocation is complete (the guest is
40  * halted and destroyed), the pages will be freed back to the system, rather
41  * than added back to the reservoir.
42  *
43  * From an implementation standpoint, transient allocations follow the same
44  * code paths as ones using the reservoir normally.  Those allocations have a
45  * tag which marks them as transient, and used/free size tallies are maintained
46  * separately for normal and transient operations.  When performing a transient
47  * allocation, that amount of memory is immediately added to the reservoir,
48  * from which the allocation can be made.  When freeing a transient allocation,
49  * a matching amount of memory is removed from the reservoir as part of the
50  * operation.  This allows both allocation types to coexist without too much
51  * additional machinery.
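 *
 * As an illustrative sketch only (hypothetical caller code, not lifted
 * from the bhyve consumer), a transient-backed guest memory object would
 * be handled roughly as follows with the interfaces defined later in this
 * file, where `len` stands in for some page-aligned size:
 *
 *     vmmr_region_t *region;
 *
 *     if (vmmr_alloc(len, true, &region) == 0) {
 *             void *first_page = vmmr_region_mem_at(region, 0);
 *
 *             (... use the pre-zeroed pages to back the guest object ...)
 *
 *             vmmr_free(region);
 *     }
 *
 * With `transient` set to true, vmmr_alloc() first grows the reservoir by
 * `len` (via vmmr_add()) and then carves the region out of it, while
 * vmmr_free() shrinks it again by the same amount, leaving the set-aside
 * (non-transient) capacity untouched.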
52  *
53  *
54  * Administration
55  *
56  * Operators may increase, decrease, and query the amount of memory
57  * allocated to the reservoir and from it to VMs via ioctls against the vmmctl
58  * device.  The total amount added to the reservoir is arbitrarily limited at
59  * this time by `vmmr_total_limit` which defaults to 80% of physmem.  This is
60  * done to prevent the reservoir from inadvertently growing to a size where the
61  * system has inadequate memory to make forward progress.  Memory may only be
62  * removed from the reservoir when it is free (not allocated by any guest VMs).
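 *
 * As a hedged illustration (assuming the control node appears as
 * /dev/vmmctl and that the VMM_RESV_* ioctl numbers and struct
 * vmm_resv_query consumed by vmmr_ioctl() below are reachable from
 * userland via <sys/vmm_dev.h>), a suitably privileged utility with the
 * usual libc headers could query and then grow the reservoir like so:
 *
 *     struct vmm_resv_query q;
 *     int fd = open("/dev/vmmctl", O_RDWR);
 *
 *     if (fd >= 0 && ioctl(fd, VMM_RESV_QUERY, &q) == 0) {
 *             (void) printf("free=%zu alloc=%zu limit=%zu\n",
 *                 q.vrq_free_sz, q.vrq_alloc_sz, q.vrq_limit);
 *     }
 *
 * and, since VMM_RESV_ADD takes the (page-aligned) size directly as its
 * argument rather than a pointer to it:
 *
 *     (void) ioctl(fd, VMM_RESV_ADD, (size_t)(1024UL * 1024 * 1024));
 *
 * VMM_RESV_ADD and VMM_RESV_REMOVE require sys_config privilege, while
 * VMM_RESV_QUERY only requires the caller to reside in the global zone.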
63  *
64  *
65  * Page Tracking
66  *
67  * The reservoir currently uses vnode association to keep track of pages under
68  * its control (either designated to the reservoir and free, or allocated to a
69  * guest VM object).  This means using the existing VM system primitives for
70  * page_t instances being associated with a given (vnode, offset) tuple.  It
71  * means that spans of pages, either free or allocated, need only to store a
72  * length (of the span) and an offset (into the vnode) in order to gain access
73  * to all of the underlying pages associated with that span.  Associating the
74  * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
75  * properly tracked as KAS pages, but be excluded from normal dumps (unless the
76  * operator has chosen to dump all of RAM).
77  */
78 
79 #include <sys/types.h>
80 #include <sys/mutex.h>
81 #include <sys/avl.h>
82 #include <sys/list.h>
83 #include <sys/machparam.h>
84 #include <sys/kmem.h>
85 #include <sys/stddef.h>
86 #include <sys/null.h>
87 #include <sys/errno.h>
88 #include <sys/systm.h>
89 #include <sys/sunddi.h>
90 #include <sys/policy.h>
91 #include <vm/seg_kmem.h>
92 #include <vm/hat_i86.h>
93 
94 #include <sys/vmm_reservoir.h>
95 #include <sys/vmm_dev.h>
96 
97 static kmutex_t vmmr_lock;
98 
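/*
 * Tallies of reservoir memory (in bytes), guarded by vmmr_lock: free and
 * allocated sizes are tracked separately for normal and transient usage,
 * vmmr_adding_sz covers additions which are still in flight (pages
 * reserved but not yet exposed for consumption), and vmmr_empty_sz covers
 * portions of the reservoir space which are not currently backed by pages.
 */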
99 static size_t vmmr_free_sz;
100 static size_t vmmr_free_transient_sz;
101 static size_t vmmr_adding_sz;
102 static size_t vmmr_alloc_sz;
103 static size_t vmmr_alloc_transient_sz;
104 static size_t vmmr_empty_sz;
105 
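/* Offset at which to append new spans when no empty span can be reused */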
106 static uintptr_t vmmr_empty_last;
107 /* Upper limit for the size (free + allocated) of the reservoir */
108 static size_t vmmr_total_limit;
109 
110 /* VA range allocated from the VMM arena for the mappings */
111 static uintptr_t vmmr_va;
112 static uintptr_t vmmr_va_sz;
113 
114 /* Pair of AVL trees to store set of spans ordered by addr and size */
115 typedef struct vmmr_treepair {
116 	avl_tree_t by_addr;
117 	avl_tree_t by_size;
118 } vmmr_treepair_t;
119 
120 /* Spans of free memory in the reservoir */
121 static vmmr_treepair_t vmmr_free_tp;
122 
123 /* Spans of empty (not backed by memory) space in the reservoir */
124 static vmmr_treepair_t vmmr_empty_tp;
125 
126 /* Regions of memory allocated from the reservoir */
127 static list_t vmmr_alloc_regions;
128 
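/*
 * A span of reservoir space.  While a span sits in a treepair it is keyed
 * by its reservoir offset (vs_addr) and its size; once it is handed to an
 * allocated region, vs_region_addr records its offset within that region
 * and vs_by_addr is reused to link it into the region's vr_spans tree.
 */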
129 struct vmmr_span {
130 	uintptr_t	vs_addr;
131 	size_t		vs_size;
132 	avl_node_t	vs_by_addr;
133 	avl_node_t	vs_by_size;
134 	uintptr_t	vs_region_addr;
135 };
136 typedef struct vmmr_span vmmr_span_t;
137 
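/*
 * A region allocated from the reservoir: a collection of spans (keyed in
 * vr_spans by their offset within the region) totalling vr_size bytes,
 * linked onto vmmr_alloc_regions, and marked if it was allocated in
 * transient mode.
 */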
138 struct vmmr_region {
139 	size_t		vr_size;
140 	avl_tree_t	vr_spans;
141 	list_node_t	vr_node;
142 	bool		vr_transient;
143 };
144 
145 static int
146 vmmr_cmp_addr(const void *a, const void *b)
147 {
148 	const vmmr_span_t *sa = a;
149 	const vmmr_span_t *sb = b;
150 
151 	if (sa->vs_addr == sb->vs_addr) {
152 		return (0);
153 	} else if (sa->vs_addr < sb->vs_addr) {
154 		return (-1);
155 	} else {
156 		return (1);
157 	}
158 }
159 
160 static int
161 vmmr_cmp_size(const void *a, const void *b)
162 {
163 	const vmmr_span_t *sa = a;
164 	const vmmr_span_t *sb = b;
165 
166 	if (sa->vs_size == sb->vs_size) {
167 		/*
168 		 * Since discontiguous spans could have the same size in a
169 		 * by-size tree, differentiate them (as required by AVL) by
170 		 * address so they can safely coexist while remaining sorted.
171 		 */
172 		return (vmmr_cmp_addr(a, b));
173 	} else if (sa->vs_size < sb->vs_size) {
174 		return (-1);
175 	} else {
176 		return (1);
177 	}
178 }
179 
180 static int
181 vmmr_cmp_region_addr(const void *a, const void *b)
182 {
183 	const vmmr_span_t *sa = a;
184 	const vmmr_span_t *sb = b;
185 
186 	if (sa->vs_region_addr == sb->vs_region_addr) {
187 		return (0);
188 	} else if (sa->vs_region_addr < sb->vs_region_addr) {
189 		return (-1);
190 	} else {
191 		return (1);
192 	}
193 }
194 
195 static void
196 vmmr_tp_init(vmmr_treepair_t *tree)
197 {
198 	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
199 	    offsetof(vmmr_span_t, vs_by_addr));
200 	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
201 	    offsetof(vmmr_span_t, vs_by_size));
202 }
203 
204 static void
205 vmmr_tp_destroy(vmmr_treepair_t *tree)
206 {
207 	void *vcp = NULL;
208 	vmmr_span_t *span;
209 
210 	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
211 		/* Freeing spans will be done when tearing down by-size tree */
212 	}
213 	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
214 		kmem_free(span, sizeof (*span));
215 	}
216 	avl_destroy(&tree->by_addr);
217 	avl_destroy(&tree->by_size);
218 }
219 
220 /*
221  * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
222  * span(s).  Such concatenation could result in the `to_add` span being freed,
223  * so the caller cannot use it after this returns.
224  */
225 static void
226 vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
227 {
228 	avl_tree_t *by_addr = &tree->by_addr;
229 	avl_tree_t *by_size = &tree->by_size;
230 	vmmr_span_t *node;
231 	avl_index_t where;
232 
233 	/* This addr should not already exist in the treepair */
234 	node = avl_find(by_addr, to_add, &where);
235 	ASSERT3P(node, ==, NULL);
236 
237 	node = avl_nearest(by_addr, where, AVL_BEFORE);
238 	if (node != NULL &&
239 	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
240 		/* concat with preceding item */
241 		avl_remove(by_addr, node);
242 		avl_remove(by_size, node);
243 		node->vs_size += to_add->vs_size;
244 		kmem_free(to_add, sizeof (*to_add));
245 
246 		/*
247 		 * Since this now-concatenated span could be adjacent to one
248 		 * trailing it, fall through to perform that check.
249 		 */
250 		to_add = node;
251 	}
252 
253 	node = avl_nearest(by_addr, where, AVL_AFTER);
254 	if (node != NULL &&
255 	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
256 		/* concat with trailing item */
257 		avl_remove(by_addr, node);
258 		avl_remove(by_size, node);
259 		node->vs_addr = to_add->vs_addr;
260 		node->vs_size += to_add->vs_size;
261 		avl_add(by_addr, node);
262 		avl_add(by_size, node);
263 
264 		kmem_free(to_add, sizeof (*to_add));
265 		return;
266 	}
267 
268 	/* simply insert */
269 	avl_add(by_addr, to_add);
270 	avl_add(by_size, to_add);
271 }
272 
273 /*
274  * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
275  * the exact target size is not present, but a larger one is.  May return a span
276  * with a size smaller than the target if splitting is not an option.
277  */
278 static vmmr_span_t *
279 vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
280 {
281 	avl_tree_t *by_addr = &tree->by_addr;
282 	avl_tree_t *by_size = &tree->by_size;
283 	vmmr_span_t *span;
284 	avl_index_t where;
285 
286 	ASSERT3U(target_sz, !=, 0);
287 	ASSERT(!avl_is_empty(by_addr));
288 	ASSERT(!avl_is_empty(by_size));
289 
290 	vmmr_span_t search = { .vs_size = target_sz };
291 	span = avl_find(by_size, &search, &where);
292 	if (span == NULL) {
293 		/* Try for a larger span (instead of exact match) */
294 		span = avl_nearest(by_size, where, AVL_AFTER);
295 		if (span == NULL) {
296 			/*
297 			 * Caller will need to collect several smaller spans in
298 			 * order to fulfill their request.
299 			 */
300 			span = avl_nearest(by_size, where, AVL_BEFORE);
301 			ASSERT3P(span, !=, NULL);
302 		}
303 	}
304 
305 	if (span->vs_size <= target_sz) {
306 		avl_remove(by_size, span);
307 		avl_remove(by_addr, span);
308 
309 		return (span);
310 	} else {
311 		/* Split off adequate chunk from larger span */
312 		uintptr_t start = span->vs_addr + span->vs_size - target_sz;
313 
314 		avl_remove(by_size, span);
315 		span->vs_size -= target_sz;
316 		avl_add(by_size, span);
317 
318 		vmmr_span_t *split_span =
319 		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
320 		split_span->vs_addr = start;
321 		split_span->vs_size = target_sz;
322 
323 		return (split_span);
324 	}
325 }
326 
327 void
328 vmmr_init(void)
329 {
330 	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
331 
332 	/*
333 	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
334 	 * memory reservoir.  It is meant to provide some measure of protection
335 	 * against an operator pushing the system into unrecoverable memory
336 	 * starvation through explicit or transient additions to the reservoir.
337 	 *
338 	 * There are many situations where this limit would be inadequate to
339 	 * prevent kernel memory starvation in the face of certain operator
340 	 * actions.  It is a balance to be struck between safety and allowing
341 	 * large systems to reach high utilization.
342 	 *
343 	 * The value is based off of pages_pp_maximum: "Number of currently
344 	 * available pages that cannot be 'locked'".  It is sized as all of
345 	 * `physmem` less 120% of `pages_pp_maximum`.
346 	 */
347 	vmmr_total_limit =
348 	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
349 
350 	vmmr_empty_last = 0;
351 	vmmr_free_sz = 0;
352 	vmmr_alloc_sz = 0;
353 	vmmr_empty_sz = 0;
354 	vmmr_adding_sz = 0;
355 	vmmr_free_transient_sz = 0;
356 	vmmr_alloc_transient_sz = 0;
357 
358 	vmmr_tp_init(&vmmr_free_tp);
359 	vmmr_tp_init(&vmmr_empty_tp);
360 
361 	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
362 	    offsetof(vmmr_region_t, vr_node));
363 
364 	/* Grab a chunk of VA for the reservoir */
365 	vmmr_va_sz = physmem * PAGESIZE;
366 	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
367 }
368 
369 void
370 vmmr_fini(void)
371 {
372 	mutex_enter(&vmmr_lock);
373 	VERIFY3U(vmmr_alloc_sz, ==, 0);
374 	VERIFY3U(vmmr_free_sz, ==, 0);
375 	VERIFY3U(vmmr_adding_sz, ==, 0);
376 	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
377 	VERIFY3U(vmmr_free_transient_sz, ==, 0);
378 	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
379 	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
380 	VERIFY(list_is_empty(&vmmr_alloc_regions));
381 
382 	vmmr_tp_destroy(&vmmr_free_tp);
383 	vmmr_tp_destroy(&vmmr_empty_tp);
384 	list_destroy(&vmmr_alloc_regions);
385 
386 	/* Release reservoir VA chunk */
387 	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
388 	vmmr_va = 0;
389 	vmmr_va_sz = 0;
390 	vmmr_total_limit = 0;
391 	vmmr_empty_last = 0;
392 
393 	mutex_exit(&vmmr_lock);
394 	mutex_destroy(&vmmr_lock);
395 }
396 
397 bool
398 vmmr_is_empty(void)
399 {
400 	mutex_enter(&vmmr_lock);
401 	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
402 	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
403 	mutex_exit(&vmmr_lock);
404 	return (res);
405 }
406 
407 int
408 vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
409 {
410 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
411 
412 	if (!transient) {
413 		mutex_enter(&vmmr_lock);
414 		if (sz > vmmr_free_sz) {
415 			mutex_exit(&vmmr_lock);
416 			return (ENOSPC);
417 		}
418 	} else {
419 		int err;
420 
421 		err = vmmr_add(sz, true);
422 		if (err != 0) {
423 			return (err);
424 		}
425 		mutex_enter(&vmmr_lock);
426 		VERIFY3U(vmmr_free_transient_sz, >=, sz);
427 	}
428 
429 	vmmr_region_t *region;
430 	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
431 	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
432 	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
433 	region->vr_size = sz;
434 
435 	size_t remain = sz;
436 	uintptr_t map_at = 0;
437 	while (remain > 0) {
438 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
439 
440 		/*
441 		 * We have already ensured that adequate free memory is present
442 		 * in the reservoir for this allocation.
443 		 */
444 		VERIFY3P(span, !=, NULL);
445 		ASSERT3U(span->vs_size, <=, remain);
446 
447 		span->vs_region_addr = map_at;
448 		avl_add(&region->vr_spans, span);
449 		map_at += span->vs_size;
450 		remain -= span->vs_size;
451 	}
452 
453 	if (!transient) {
454 		vmmr_free_sz -= sz;
455 		vmmr_alloc_sz += sz;
456 	} else {
457 		vmmr_free_transient_sz -= sz;
458 		vmmr_alloc_transient_sz += sz;
459 		region->vr_transient = true;
460 	}
461 	list_insert_tail(&vmmr_alloc_regions, region);
462 	mutex_exit(&vmmr_lock);
463 
464 	*resp = region;
465 	return (0);
466 }
467 
468 void *
469 vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
470 {
471 	/* just use KPM region for now */
472 	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
473 }
474 
475 pfn_t
476 vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
477 {
478 	VERIFY3U(off & PAGEOFFSET, ==, 0);
479 	VERIFY3U(off, <, region->vr_size);
480 
481 	vmmr_span_t search = {
482 		.vs_region_addr = off
483 	};
484 	avl_index_t where;
485 	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
486 
487 	if (span == NULL) {
488 		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
489 		ASSERT3P(span, !=, NULL);
490 	}
491 	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
492 	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
493 	VERIFY(pp != NULL);
494 	return (pp->p_pagenum);
495 }
496 
497 void
498 vmmr_free(vmmr_region_t *region)
499 {
500 	mutex_enter(&vmmr_lock);
501 	if (!region->vr_transient) {
502 		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
503 	} else {
504 		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
505 	}
506 	list_remove(&vmmr_alloc_regions, region);
507 	mutex_exit(&vmmr_lock);
508 
509 	/* Zero the contents */
510 	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
511 		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
512 	}
513 
514 	mutex_enter(&vmmr_lock);
515 
516 	/* Put the contained span(s) back in the free pool */
517 	void *cookie = NULL;
518 	vmmr_span_t *span;
519 	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
520 		span->vs_region_addr = 0;
521 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
522 	}
523 	avl_destroy(&region->vr_spans);
524 	if (!region->vr_transient) {
525 		vmmr_free_sz += region->vr_size;
526 		vmmr_alloc_sz -= region->vr_size;
527 	} else {
528 		vmmr_free_transient_sz += region->vr_size;
529 		vmmr_alloc_transient_sz -= region->vr_size;
530 	}
531 	mutex_exit(&vmmr_lock);
532 
533 	if (region->vr_transient) {
534 		vmmr_remove(region->vr_size, true);
535 	}
536 	kmem_free(region, sizeof (*region));
537 }
538 
539 static void
540 vmmr_destroy_pages(vmmr_span_t *span)
541 {
542 	const uintptr_t end = span->vs_addr + span->vs_size;
543 	struct vnode *vp = &kvps[KV_VVP];
544 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
545 		page_t *pp;
546 
547 		/* Page-free logic cribbed from segkmem_xfree(): */
548 		pp = page_find(vp, (u_offset_t)pos);
549 		VERIFY(pp != NULL);
550 		if (!page_tryupgrade(pp)) {
551 			/*
552 			 * Some other thread has a sharelock. Wait for
553 			 * it to drop the lock so we can free this page.
554 			 */
555 			page_unlock(pp);
556 			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
557 		}
558 
559 		/*
560 		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
561 		 * That will be taken care of later via page_unresv().
562 		 */
563 		pp->p_lckcnt = 0;
564 		page_destroy(pp, 0);
565 	}
566 }
567 
568 static int
569 vmmr_alloc_pages(const vmmr_span_t *span)
570 {
571 	struct seg kseg = {
572 		.s_as = &kas
573 	};
574 	struct vnode *vp = &kvps[KV_VVP];
575 
576 	const uintptr_t end = span->vs_addr + span->vs_size;
577 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
578 		page_t *pp;
579 
580 		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
581 		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
582 
583 		if (pp == NULL) {
584 			/* Destroy any already-created pages */
585 			if (pos != span->vs_addr) {
586 				vmmr_span_t destroy_span = {
587 					.vs_addr = span->vs_addr,
588 					.vs_size = pos - span->vs_addr,
589 				};
590 
591 				vmmr_destroy_pages(&destroy_span);
592 			}
593 			return (ENOMEM);
594 		}
595 
596 		/* mimic page state from segkmem */
597 		ASSERT(PAGE_EXCL(pp));
598 		page_io_unlock(pp);
599 		pp->p_lckcnt = 1;
600 		page_downgrade(pp);
601 
602 		/* pre-zero the page */
603 		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
604 	}
605 
606 	return (0);
607 }
608 
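/*
 * Wait callback passed to page_xresv() while reserving pages for an
 * addition: nap for roughly a quarter of a second and indicate failure (0)
 * if the nap was interrupted by a signal, which causes the reservation
 * attempt, and thus vmmr_add(), to bail with EINTR.
 */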
609 static int
610 vmmr_resv_wait(void)
611 {
612 	if (delay_sig(hz >> 2) != 0) {
613 		/* bail due to interruption */
614 		return (0);
615 	}
616 	return (1);
617 }
618 
619 static void
620 vmmr_remove_raw(size_t sz)
621 {
622 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
623 	VERIFY(MUTEX_HELD(&vmmr_lock));
624 
625 	size_t remain = sz;
626 	while (remain > 0) {
627 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
628 
629 		/*
630 		 * The caller must ensure that at least `sz` amount is present
631 		 * in the free treepair.
632 		 */
633 		VERIFY3P(span, !=, NULL);
634 		ASSERT3U(span->vs_size, <=, remain);
635 
636 		/* TODO: perhaps arrange to destroy pages outside the lock? */
637 		vmmr_destroy_pages(span);
638 
639 		remain -= span->vs_size;
640 		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
641 	}
642 
643 	vmmr_empty_sz += sz;
644 }
645 
646 int
647 vmmr_add(size_t sz, bool transient)
648 {
649 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
650 
651 	mutex_enter(&vmmr_lock);
652 	/*
653 	 * Make sure that the amount added is not going to breach the limits
654 	 * we've chosen
655 	 */
656 	const size_t current_total =
657 	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
658 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
659 	if ((current_total + sz) < current_total) {
660 		mutex_exit(&vmmr_lock);
661 		return (EOVERFLOW);
662 	}
663 	if ((current_total + sz) > vmmr_total_limit) {
664 		mutex_exit(&vmmr_lock);
665 		return (ENOSPC);
666 	}
667 	vmmr_adding_sz += sz;
668 	mutex_exit(&vmmr_lock);
669 
670 	/* Wait for enough pages to become available */
671 	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
672 		mutex_enter(&vmmr_lock);
673 		vmmr_adding_sz -= sz;
674 		mutex_exit(&vmmr_lock);
675 
676 		return (EINTR);
677 	}
678 
679 	mutex_enter(&vmmr_lock);
680 	size_t added = 0;
681 	size_t remain = sz;
682 	while (added < sz) {
683 		vmmr_span_t *span = NULL;
684 
685 		if (vmmr_empty_sz > 0) {
686 			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
687 
688 			vmmr_empty_sz -= span->vs_size;
689 		} else {
690 			/*
691 			 * No empty space to fill with new pages, so just tack
692 			 * it on at the end instead.
693 			 */
694 			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
695 			span->vs_addr = vmmr_empty_last;
696 			span->vs_size = remain;
697 			vmmr_empty_last += remain;
698 		}
699 		VERIFY3P(span, !=, NULL);
700 
701 
702 		/* Allocate the actual pages to back this span */
703 		mutex_exit(&vmmr_lock);
704 		int err = vmmr_alloc_pages(span);
705 		mutex_enter(&vmmr_lock);
706 
707 		/*
708 		 * If an error is encountered during page allocation for the
709 		 * span, unwind any progress made by the addition request.
710 		 */
711 		if (err != 0) {
712 			/*
713 			 * Without pages allocated to this span, it is now
714 			 * tracked as empty.
715 			 */
716 			vmmr_empty_sz += span->vs_size;
717 			vmmr_tp_insert_concat(span, &vmmr_empty_tp);
718 
719 			if (added != 0) {
720 				vmmr_remove_raw(added);
721 			}
722 
723 			vmmr_adding_sz -= sz;
724 			mutex_exit(&vmmr_lock);
725 
726 			page_unresv(sz >> PAGESHIFT);
727 			return (err);
728 		}
729 
730 		/*
731 		 * The allocated-page-bearing span is placed in the "free"
732 		 * treepair now, but is not officially exposed for consumption
733 		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` is updated.
734 		 *
735 		 * This allows us to unwind the allocation in case of a failure
736 		 * without the risk of the freshly added span(s) being snapped
737 		 * up by a consumer already.
738 		 */
739 		added += span->vs_size;
740 		remain -= span->vs_size;
741 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
742 	}
743 
744 	/* Make the added memory usable by exposing it to the size accounting */
745 	if (!transient) {
746 		vmmr_free_sz += added;
747 	} else {
748 		vmmr_free_transient_sz += added;
749 	}
750 	ASSERT3U(added, ==, sz);
751 	vmmr_adding_sz -= added;
752 
753 	mutex_exit(&vmmr_lock);
754 	return (0);
755 }
756 
757 int
758 vmmr_remove(size_t sz, bool transient)
759 {
760 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
761 
762 	mutex_enter(&vmmr_lock);
763 	if ((!transient && sz > vmmr_free_sz) ||
764 	    (transient && sz > vmmr_free_transient_sz)) {
765 		mutex_exit(&vmmr_lock);
766 		return (ENOSPC);
767 	}
768 
769 	vmmr_remove_raw(sz);
770 
771 	if (!transient) {
772 		vmmr_free_sz -= sz;
773 	} else {
774 		vmmr_free_transient_sz -= sz;
775 	}
776 	mutex_exit(&vmmr_lock);
777 	page_unresv(sz >> PAGESHIFT);
778 	return (0);
779 }
780 
781 int
782 vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
783 {
784 	switch (cmd) {
785 	case VMM_RESV_QUERY: {
786 		struct vmm_resv_query res;
787 		void *datap = (void *)(uintptr_t)arg;
788 
789 		/* For now, anyone in GZ can query */
790 		if (crgetzoneid(cr) != GLOBAL_ZONEID) {
791 			return (EPERM);
792 		}
793 		mutex_enter(&vmmr_lock);
794 		res.vrq_free_sz = vmmr_free_sz;
795 		res.vrq_alloc_sz = vmmr_alloc_sz;
796 		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
797 		res.vrq_limit = vmmr_total_limit;
798 		mutex_exit(&vmmr_lock);
799 		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
800 			return (EFAULT);
801 		}
802 		break;
803 	}
804 	case VMM_RESV_ADD: {
805 		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
806 			return (EPERM);
807 		}
808 		return (vmmr_add((size_t)arg, false));
809 	}
810 	case VMM_RESV_REMOVE: {
811 		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
812 			return (EPERM);
813 		}
814 		return (vmmr_remove((size_t)arg, false));
815 	}
816 	default:
817 		return (ENOTTY);
818 	}
819 	return (0);
820 }
821