xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_reservoir.c (revision fb876f9607a4e506a871e6b539d75c9644a3566f)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2023 Oxide Computer Company
14  */
15 
16 /*
17  * VMM Memory Reservoir
18  *
19  *
20  * In order to make the allocation of large (multi-GiB) chunks of memory
21  * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
22  * operators can set aside a substantial portion of system memory exclusively
23  * for VMs.  This memory is unavailable for general use by the rest of the
24  * system.  Rather than having to scour the freelist, reap kmem caches, or put
25  * pressure on the ARC, bhyve guest memory allocations can quickly determine if
26  * there is adequate reservoir memory available.  Since the pages stored in the
27  * reservoir are pre-zeroed, it can be immediately used when allocated to a
28  * guest.  When the memory is returned to the reservoir, it is zeroed once more
29  * to avoid leaking any sensitive data from that guest.
30  *
31  *
32  * Transient Allocations
33  *
34  * While the explicit reservoir model may work well for some applications,
35  * others may want a more traditional model, where pages for guest memory
36  * objects are allocated on demand, rather than from a pool set aside from the
37  * system.  In this case, the allocation can be made in "transient" mode, where
38  * the memory is allocated normally, even if there is free capacity in the
39  * reservoir.  When use of the transient allocation is complete (the guest is
40  * halted and destroyed), the pages will be freed back to the system, rather
41  * than added back to the reservoir.
42  *
43  * From an implementation standpoint, transient allocations follow the same
44  * code paths as ones using the reservoir normally.  Those allocations have a
45  * tag which marks them as transient, and used/free size tallies are maintained
46  * separately for normal and transient operations.  When performing a transient
47  * allocation, that amount of memory is immediately added to the reservoir,
48  * from which the allocation can be made.  When freeing a transient allocation,
49  * a matching amount of memory is removed from the reservoir as part of the
50  * operation.  This allows both allocation types to coexist without too much
51  * additional machinery.
52  *
53  *
54  * Administration
55  *
56  * Operators may attempt to alter the amount of memory allocated to the
57  * reservoir via an ioctl against the vmmctl device.  The total amount of memory
58  * in the reservoir (free, or allocated to VMs) is limited by
59  * `vmmr_total_limit` (see its definition for how this limit is calculated).
60  *
61  * The limit is in place to prevent the reservoir from inadvertently growing
62  * to a size where the system has inadequate memory to make forward progress.
63  * Shrinking the reservoir is only possible when it contains free (not
64  * allocated by any guest VMs) memory.
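 *
 * As a rough illustration (not a complete program), a privileged userland
 * tool holding an open descriptor for the vmmctl device (the `ctl_fd` below
 * is hypothetical) could request a 4 GiB reservoir, grown in 1 GiB steps:
 *
 *	struct vmm_resv_target tgt = {
 *		.vrt_target_sz = 4UL << 30,
 *		.vrt_chunk_sz = 1UL << 30,
 *	};
 *	int err = ioctl(ctl_fd, VMM_RESV_SET_TARGET, &tgt);
 *
 * On success (or interruption by a signal), tgt.vrt_result_sz is updated
 * with the resulting size of the reservoir.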
65  *
66  *
67  * Page Tracking
68  *
69  * The reservoir currently uses vnode association to keep track of pages under
70  * its control (either designated to the reservoir and free, or allocated to a
71  * guest VM object).  This means using the existing VM system primitives for
72  * page_t instances being associated with a given (vnode, offset) tuple.  It
73  * means that spans of pages, either free or allocated, need only store a
74  * length (of the span) and an offset (into the vnode) in order to gain access
75  * to all of the underlying pages associated with that span.  Associating the
76  * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
77  * properly tracked as KAS pages, but be excluded from normal dumps (unless the
78  * operator has chosen to dump all of RAM).
79  */
80 
81 #include <sys/types.h>
82 #include <sys/mutex.h>
83 #include <sys/avl.h>
84 #include <sys/list.h>
85 #include <sys/machparam.h>
86 #include <sys/kmem.h>
87 #include <sys/stddef.h>
88 #include <sys/null.h>
89 #include <sys/errno.h>
90 #include <sys/systm.h>
91 #include <sys/sunddi.h>
92 #include <sys/policy.h>
93 #include <vm/seg_kmem.h>
94 #include <vm/hat_i86.h>
95 #include <sys/kstat.h>
96 
97 #include <sys/vmm_reservoir.h>
98 #include <sys/vmm_dev.h>
99 #include <sys/vmm_impl.h>
100 
101 #define	VMMR_TARGET_INACTIVE	SIZE_MAX
102 
103 static kmutex_t vmmr_lock;
104 
105 static size_t vmmr_free_sz;
106 static size_t vmmr_free_transient_sz;
107 static size_t vmmr_adding_sz;
108 static size_t vmmr_alloc_sz;
109 static size_t vmmr_alloc_transient_sz;
110 static size_t vmmr_empty_sz;
111 
112 /*
113  * Target size of the reservoir during an active vmmr_set_target() operation.
114  * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active.
115  */
116 static size_t vmmr_target_sz;
117 
118 static uintptr_t vmmr_empty_last;
119 /* Upper limit for the size (free + allocated) of the reservoir */
120 static size_t vmmr_total_limit;
121 
122 /* VA range allocated from the VMM arena for the mappings */
123 static uintptr_t vmmr_va;
124 static uintptr_t vmmr_va_sz;
125 
126 static kstat_t *vmmr_kstat;
127 
128 /* Pair of AVL trees to store set of spans ordered by addr and size */
129 typedef struct vmmr_treepair {
130 	avl_tree_t by_addr;
131 	avl_tree_t by_size;
132 } vmmr_treepair_t;
133 
134 /* Spans of free memory in the reservoir */
135 static vmmr_treepair_t vmmr_free_tp;
136 
137 /* Spans of empty (not backed by memory) space in the reservoir */
138 static vmmr_treepair_t vmmr_empty_tp;
139 
140 /* Regions of memory allocated from the reservoir */
141 static list_t vmmr_alloc_regions;
142 
143 struct vmmr_span {
144 	uintptr_t	vs_addr;
145 	size_t		vs_size;
146 	avl_node_t	vs_by_addr;
147 	avl_node_t	vs_by_size;
148 	uintptr_t	vs_region_addr;
149 };
150 typedef struct vmmr_span vmmr_span_t;
151 
152 struct vmmr_region {
153 	size_t		vr_size;
154 	avl_tree_t	vr_spans;
155 	list_node_t	vr_node;
156 	bool		vr_transient;
157 };
158 
159 typedef struct vmmr_kstats {
160 	kstat_named_t	vmrks_bytes_free;
161 	kstat_named_t	vmrks_bytes_alloc;
162 	kstat_named_t	vmrks_bytes_transient;
163 	kstat_named_t	vmrks_bytes_limit;
164 } vmmr_kstats_t;
165 
166 
167 static int vmmr_add(size_t, bool);
168 static int vmmr_remove(size_t, bool);
169 
170 static int
171 vmmr_cmp_addr(const void *a, const void *b)
172 {
173 	const vmmr_span_t *sa = a;
174 	const vmmr_span_t *sb = b;
175 
176 	if (sa->vs_addr == sb->vs_addr) {
177 		return (0);
178 	} else if (sa->vs_addr < sb->vs_addr) {
179 		return (-1);
180 	} else {
181 		return (1);
182 	}
183 }
184 
185 static int
186 vmmr_cmp_size(const void *a, const void *b)
187 {
188 	const vmmr_span_t *sa = a;
189 	const vmmr_span_t *sb = b;
190 
191 	if (sa->vs_size == sb->vs_size) {
192 		/*
193 		 * Since discontiguous spans could have the same size in a
194 		 * by-size tree, differentiate them (as required by AVL) by
195 		 * address so they can safely coexist while remaining sorted.
196 		 */
197 		return (vmmr_cmp_addr(a, b));
198 	} else if (sa->vs_size < sb->vs_size) {
199 		return (-1);
200 	} else {
201 		return (1);
202 	}
203 }
204 
205 static int
206 vmmr_cmp_region_addr(const void *a, const void *b)
207 {
208 	const vmmr_span_t *sa = a;
209 	const vmmr_span_t *sb = b;
210 
211 	if (sa->vs_region_addr == sb->vs_region_addr) {
212 		return (0);
213 	} else if (sa->vs_region_addr < sb->vs_region_addr) {
214 		return (-1);
215 	} else {
216 		return (1);
217 	}
218 }
219 
220 static void
221 vmmr_tp_init(vmmr_treepair_t *tree)
222 {
223 	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
224 	    offsetof(vmmr_span_t, vs_by_addr));
225 	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
226 	    offsetof(vmmr_span_t, vs_by_size));
227 }
228 
229 static void
230 vmmr_tp_destroy(vmmr_treepair_t *tree)
231 {
232 	void *vcp = NULL;
233 	vmmr_span_t *span;
234 
235 	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
236 		/* Freeing spans will be done when tearing down by-size tree */
237 	}
238 	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
239 		kmem_free(span, sizeof (*span));
240 	}
241 	avl_destroy(&tree->by_addr);
242 	avl_destroy(&tree->by_size);
243 }
244 
245 /*
246  * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
247  * span(s).  Such concatenation could result in the `to_add` span being freed,
248  * so the caller cannot use it after this returns.
249  */
250 static void
251 vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
252 {
253 	avl_tree_t *by_addr = &tree->by_addr;
254 	avl_tree_t *by_size = &tree->by_size;
255 	vmmr_span_t *node;
256 	avl_index_t where;
257 
258 	/* This addr should not already exist in the treepair */
259 	node = avl_find(by_addr, to_add, &where);
260 	ASSERT3P(node, ==, NULL);
261 
262 	node = avl_nearest(by_addr, where, AVL_BEFORE);
263 	if (node != NULL &&
264 	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
265 		/* concat with preceding item */
266 		avl_remove(by_addr, node);
267 		avl_remove(by_size, node);
268 		node->vs_size += to_add->vs_size;
269 		kmem_free(to_add, sizeof (*to_add));
270 
271 		/*
272 		 * Since this now-concatenated span could be adjacent to one
273 		 * trailing it, fall through to perform that check.
274 		 */
275 		to_add = node;
276 	}
277 
278 	node = avl_nearest(by_addr, where, AVL_AFTER);
279 	if (node != NULL &&
280 	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
281 		/* concat with trailing item */
282 		avl_remove(by_addr, node);
283 		avl_remove(by_size, node);
284 		node->vs_addr = to_add->vs_addr;
285 		node->vs_size += to_add->vs_size;
286 		avl_add(by_addr, node);
287 		avl_add(by_size, node);
288 
289 		kmem_free(to_add, sizeof (*to_add));
290 		return;
291 	}
292 
293 	/* simply insert */
294 	avl_add(by_addr, to_add);
295 	avl_add(by_size, to_add);
296 }
297 
298 /*
299  * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
300  * the exact target size is not present, but a larger one is.  May return a span
301  * with a size smaller than the target if splitting is not an option.
302  */
303 static vmmr_span_t *
304 vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
305 {
306 	avl_tree_t *by_addr = &tree->by_addr;
307 	avl_tree_t *by_size = &tree->by_size;
308 	vmmr_span_t *span;
309 	avl_index_t where;
310 
311 	ASSERT3U(target_sz, !=, 0);
312 	ASSERT(!avl_is_empty(by_addr));
313 	ASSERT(!avl_is_empty(by_size));
314 
315 	vmmr_span_t search = { .vs_size = target_sz };
316 	span = avl_find(by_size, &search, &where);
317 	if (span == NULL) {
318 		/* Try for a larger span (instead of exact match) */
319 		span = avl_nearest(by_size, where, AVL_AFTER);
320 		if (span == NULL) {
321 			/*
322 			 * Caller will need to collect several smaller spans in
323 			 * order to fulfill their request.
324 			 */
325 			span = avl_nearest(by_size, where, AVL_BEFORE);
326 			ASSERT3P(span, !=, NULL);
327 		}
328 	}
329 
330 	if (span->vs_size <= target_sz) {
331 		avl_remove(by_size, span);
332 		avl_remove(by_addr, span);
333 
334 		return (span);
335 	} else {
336 		/* Split off adequate chunk from larger span */
337 		uintptr_t start = span->vs_addr + span->vs_size - target_sz;
338 
339 		avl_remove(by_size, span);
340 		span->vs_size -= target_sz;
341 		avl_add(by_size, span);
342 
343 		vmmr_span_t *split_span =
344 		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
345 		split_span->vs_addr = start;
346 		split_span->vs_size = target_sz;
347 
348 		return (split_span);
349 	}
350 }
351 
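/*
 * kstat update handler: publish a consistent snapshot of reservoir sizing
 * (free, allocated, transient, and the configured limit).
 */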
352 static int
353 vmmr_kstat_update(struct kstat *ksp, int rw)
354 {
355 	vmmr_kstats_t *vkp = ksp->ks_data;
356 
357 	mutex_enter(&vmmr_lock);
358 	vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
359 	vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
360 	/*
361 	 * In addition to the memory which is actually allocated to
362 	 * transient consumers, memory which is considered free-for-transient is
363 	 * also included in the sizing.
364 	 */
365 	vkp->vmrks_bytes_transient.value.ui64 =
366 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
367 	vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
368 	mutex_exit(&vmmr_lock);
369 
370 	return (0);
371 }
372 
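/*
 * One-time initialization of the reservoir: compute the total size limit,
 * create the kstats, set up the free/empty treepairs, and carve out a
 * physmem-sized VA range from the kvmm arena for page mappings.
 */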
373 int
374 vmmr_init()
375 {
376 	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
377 
378 	/*
379 	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
380 	 * memory reservoir.  It is meant to provide some measure of protection
381 	 * against an operator pushing the system into unrecoverable memory
382 	 * starvation through explicit or transient additions to the reservoir.
383 	 *
384 	 * There will be many situations where this limit would be inadequate to
385 	 * prevent kernel memory starvation in the face of certain operator
386 	 * actions.  It is a balance to be struck between safety and allowing
387 	 * large systems to reach high utilization.
388 	 *
389 	 * The value is based on pages_pp_maximum: "Number of currently
390 	 * available pages that cannot be 'locked'".  It is sized as all of
391 	 * `physmem` less 120% of `pages_pp_maximum`.
392 	 */
393 	vmmr_total_limit =
394 	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
395 
396 	vmmr_empty_last = 0;
397 	vmmr_free_sz = 0;
398 	vmmr_alloc_sz = 0;
399 	vmmr_empty_sz = 0;
400 	vmmr_adding_sz = 0;
401 	vmmr_free_transient_sz = 0;
402 	vmmr_alloc_transient_sz = 0;
403 	vmmr_target_sz = VMMR_TARGET_INACTIVE;
404 
405 	/*
406 	 * Attempt kstat allocation early, since it is the only part of
407 	 * reservoir initialization which is fallible.
408 	 */
409 	kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
410 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
411 	    sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
412 	if (ksp == NULL) {
413 		mutex_destroy(&vmmr_lock);
414 		return (ENOMEM);
415 	}
416 
417 	vmmr_kstats_t *vkp = ksp->ks_data;
418 
419 	kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
420 	    KSTAT_DATA_UINT64);
421 	kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
422 	    KSTAT_DATA_UINT64);
423 	kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
424 	    KSTAT_DATA_UINT64);
425 	kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
426 	    KSTAT_DATA_UINT64);
427 	ksp->ks_private = NULL;
428 	ksp->ks_update = vmmr_kstat_update;
429 	vmmr_kstat = ksp;
430 
431 	vmmr_tp_init(&vmmr_free_tp);
432 	vmmr_tp_init(&vmmr_empty_tp);
433 
434 	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
435 	    offsetof(vmmr_region_t, vr_node));
436 
437 	/* Grab a chunk of VA for the reservoir */
438 	vmmr_va_sz = physmem * PAGESIZE;
439 	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
440 
441 	kstat_install(vmmr_kstat);
442 
443 	return (0);
444 }
445 
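/*
 * Tear down the reservoir at module unload.  The reservoir must be entirely
 * empty (no memory free or allocated, normal or transient) at this point.
 */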
446 void
447 vmmr_fini()
448 {
449 	mutex_enter(&vmmr_lock);
450 	VERIFY3U(vmmr_alloc_sz, ==, 0);
451 	VERIFY3U(vmmr_free_sz, ==, 0);
452 	VERIFY3U(vmmr_adding_sz, ==, 0);
453 	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
454 	VERIFY3U(vmmr_free_transient_sz, ==, 0);
455 	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
456 	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
457 	VERIFY(list_is_empty(&vmmr_alloc_regions));
458 
459 	kstat_delete(vmmr_kstat);
460 	vmmr_kstat = NULL;
461 
462 	vmmr_tp_destroy(&vmmr_free_tp);
463 	vmmr_tp_destroy(&vmmr_empty_tp);
464 	list_destroy(&vmmr_alloc_regions);
465 
466 	/* Release reservoir VA chunk */
467 	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
468 	vmmr_va = 0;
469 	vmmr_va_sz = 0;
470 	vmmr_total_limit = 0;
471 	vmmr_empty_last = 0;
472 
473 	mutex_exit(&vmmr_lock);
474 	mutex_destroy(&vmmr_lock);
475 }
476 
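/* Does the reservoir currently hold no memory at all, free or allocated? */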
477 bool
478 vmmr_is_empty()
479 {
480 	mutex_enter(&vmmr_lock);
481 	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
482 	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
483 	mutex_exit(&vmmr_lock);
484 	return (res);
485 }
486 
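/*
 * Allocate a region of `sz` bytes from the reservoir.  Normal requests are
 * satisfied from existing free reservoir capacity, while transient requests
 * first grow the reservoir by `sz` to back the allocation.  The resulting
 * region, assembled from one or more free spans, is returned via `resp`.
 */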
487 int
488 vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
489 {
490 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
491 
492 	if (!transient) {
493 		mutex_enter(&vmmr_lock);
494 		if (sz > vmmr_free_sz) {
495 			mutex_exit(&vmmr_lock);
496 			return (ENOSPC);
497 		}
498 	} else {
499 		int err;
500 
501 		mutex_enter(&vmmr_lock);
502 		err = vmmr_add(sz, true);
503 		if (err != 0) {
504 			mutex_exit(&vmmr_lock);
505 			return (err);
506 		}
507 		VERIFY3U(vmmr_free_transient_sz, >=, sz);
508 	}
509 
510 	vmmr_region_t *region;
511 	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
512 	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
513 	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
514 	region->vr_size = sz;
515 
516 	size_t remain = sz;
517 	uintptr_t map_at = 0;
518 	while (remain > 0) {
519 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
520 
521 		/*
522 		 * We have already ensured that adequate free memory is present
523 		 * in the reservoir for this allocation.
524 		 */
525 		VERIFY3P(span, !=, NULL);
526 		ASSERT3U(span->vs_size, <=, remain);
527 
528 		span->vs_region_addr = map_at;
529 		avl_add(&region->vr_spans, span);
530 		map_at += span->vs_size;
531 		remain -= span->vs_size;
532 	}
533 
534 	if (!transient) {
535 		vmmr_free_sz -= sz;
536 		vmmr_alloc_sz += sz;
537 	} else {
538 		vmmr_free_transient_sz -= sz;
539 		vmmr_alloc_transient_sz += sz;
540 		region->vr_transient = true;
541 	}
542 	list_insert_tail(&vmmr_alloc_regions, region);
543 	mutex_exit(&vmmr_lock);
544 
545 	*resp = region;
546 	return (0);
547 }
548 
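/*
 * Get a kernel-accessible mapping (via the KPM segment) of the page backing
 * a page-aligned offset within a reservoir region.
 */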
549 void *
550 vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
551 {
552 	/* just use KPM region for now */
553 	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
554 }
555 
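/*
 * Look up the PFN backing a page-aligned offset within a region: find the
 * span covering that offset and locate the corresponding page hashed against
 * the VMM kernel vnode.
 */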
556 pfn_t
557 vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
558 {
559 	VERIFY3U(off & PAGEOFFSET, ==, 0);
560 	VERIFY3U(off, <, region->vr_size);
561 
562 	vmmr_span_t search = {
563 		.vs_region_addr = off
564 	};
565 	avl_index_t where;
566 	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
567 
568 	if (span == NULL) {
569 		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
570 		ASSERT3P(span, !=, NULL);
571 	}
572 	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
573 	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
574 	VERIFY(pp != NULL);
575 	return (pp->p_pagenum);
576 }
577 
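/*
 * Free a region, returning its memory to the reservoir.  The backing pages
 * are re-zeroed (without vmmr_lock held) before their spans rejoin the free
 * treepair.  For transient regions, a matching amount of memory is then
 * removed from the reservoir entirely.
 */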
578 void
579 vmmr_free(vmmr_region_t *region)
580 {
581 	mutex_enter(&vmmr_lock);
582 	if (!region->vr_transient) {
583 		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
584 	} else {
585 		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
586 	}
587 	list_remove(&vmmr_alloc_regions, region);
588 	mutex_exit(&vmmr_lock);
589 
590 	/* Zero the contents (while not monopolizing vmmr_lock) */
591 	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
592 		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
593 	}
594 
595 	mutex_enter(&vmmr_lock);
596 
597 	/* Put the contained span(s) back in the free pool */
598 	void *cookie = NULL;
599 	vmmr_span_t *span;
600 	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
601 		span->vs_region_addr = 0;
602 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
603 	}
604 	avl_destroy(&region->vr_spans);
605 	if (!region->vr_transient) {
606 		vmmr_free_sz += region->vr_size;
607 		vmmr_alloc_sz -= region->vr_size;
608 	} else {
609 		vmmr_free_transient_sz += region->vr_size;
610 		vmmr_alloc_transient_sz -= region->vr_size;
611 	}
612 
613 	if (region->vr_transient) {
614 		/*
615 		 * Since the transient capacity was previously allocated for
616 		 * this region, its removal should not fail.
617 		 */
618 		VERIFY0(vmmr_remove(region->vr_size, true));
619 	}
620 	kmem_free(region, sizeof (*region));
621 	mutex_exit(&vmmr_lock);
622 }
623 
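/*
 * Destroy the pages backing a span, leaving that portion of the reservoir
 * empty.  The availrmem accounting is deferred to a later page_unresv() by
 * the caller.
 */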
624 static void
625 vmmr_destroy_pages(vmmr_span_t *span)
626 {
627 	const uintptr_t end = span->vs_addr + span->vs_size;
628 	struct vnode *vp = &kvps[KV_VVP];
629 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
630 		page_t *pp;
631 
632 		/* Page-free logic cribbed from segkmem_xfree(): */
633 		pp = page_find(vp, (u_offset_t)pos);
634 		VERIFY(pp != NULL);
635 		if (!page_tryupgrade(pp)) {
636 			/*
637 			 * Some other thread has a sharelock. Wait for
638 			 * it to drop the lock so we can free this page.
639 			 */
640 			page_unlock(pp);
641 			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
642 		}
643 
644 		/*
645 		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
646 		 * That will be taken care of later via page_unresv().
647 		 */
648 		pp->p_lckcnt = 0;
649 		page_destroy(pp, 0);
650 	}
651 }
652 
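/*
 * Create (and pre-zero) pages backing every page-sized chunk of a span,
 * associating them with the VMM kernel vnode at the span's reservoir
 * offsets.  On failure, any pages created so far are destroyed and ENOMEM
 * is returned.
 */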
653 static int
654 vmmr_alloc_pages(const vmmr_span_t *span)
655 {
656 	struct seg kseg = {
657 		.s_as = &kas
658 	};
659 	struct vnode *vp = &kvps[KV_VVP];
660 
661 	const uintptr_t end = span->vs_addr + span->vs_size;
662 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
663 		page_t *pp;
664 
665 		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
666 		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
667 
668 		if (pp == NULL) {
669 			/* Destroy any already-created pages */
670 			if (pos != span->vs_addr) {
671 				vmmr_span_t destroy_span = {
672 					.vs_addr = span->vs_addr,
673 					.vs_size = pos - span->vs_addr,
674 				};
675 
676 				vmmr_destroy_pages(&destroy_span);
677 			}
678 			return (ENOMEM);
679 		}
680 
681 		/* mimic page state from segkmem */
682 		ASSERT(PAGE_EXCL(pp));
683 		page_io_unlock(pp);
684 		pp->p_lckcnt = 1;
685 		page_downgrade(pp);
686 
687 		/* pre-zero the page */
688 		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
689 	}
690 
691 	return (0);
692 }
693 
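/*
 * Wait callback passed to page_xresv(): sleep briefly between reservation
 * attempts, returning 0 to bail out if interrupted by a signal, or 1 to
 * keep waiting.
 */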
694 static int
695 vmmr_resv_wait()
696 {
697 	if (delay_sig(hz >> 2) != 0) {
698 		/* bail due to interruption */
699 		return (0);
700 	}
701 	return (1);
702 }
703 
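/*
 * Pull `sz` bytes of spans out of the free treepair, destroy their backing
 * pages, and move the now-empty spans into the empty treepair.  The caller
 * is responsible for the free-size accounting and eventual page_unresv().
 */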
704 static void
705 vmmr_remove_raw(size_t sz)
706 {
707 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
708 	VERIFY(MUTEX_HELD(&vmmr_lock));
709 
710 	size_t remain = sz;
711 	while (remain > 0) {
712 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
713 
714 		/*
715 		 * The caller must ensure that at least `sz` amount is present
716 		 * in the free treepair.
717 		 */
718 		VERIFY3P(span, !=, NULL);
719 		ASSERT3U(span->vs_size, <=, remain);
720 
721 		/* TODO: perhaps arrange to destroy pages outside the lock? */
722 		vmmr_destroy_pages(span);
723 
724 		remain -= span->vs_size;
725 		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
726 	}
727 
728 	vmmr_empty_sz += sz;
729 }
730 
731 /*
732  * Add memory to vmm reservoir.  Memory may be marked for transient use, where
733  * the addition is part of a transient allocation from the reservoir.  Otherwise
734  * it is placed in the reservoir to be available for non-transient allocations.
735  *
736  * Expects vmmr_lock to be held when called, and will return with it held, but
737  * will drop it during portions of the addition.
738  */
739 static int
740 vmmr_add(size_t sz, bool transient)
741 {
742 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
743 	VERIFY3U(sz, >, 0);
744 	VERIFY(MUTEX_HELD(&vmmr_lock));
745 
746 	/*
747 	 * Make sure that the amount added is not going to breach the limits
748 	 * we've chosen
749 	 */
750 	const size_t current_total =
751 	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
752 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
753 	if ((current_total + sz) < current_total) {
754 		return (EOVERFLOW);
755 	}
756 	if ((current_total + sz) > vmmr_total_limit) {
757 		return (ENOSPC);
758 	}
759 	vmmr_adding_sz += sz;
760 	mutex_exit(&vmmr_lock);
761 
762 	/* Wait for enough pages to become available */
763 	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
764 		mutex_enter(&vmmr_lock);
765 		vmmr_adding_sz -= sz;
766 		return (EINTR);
767 	}
768 
769 	mutex_enter(&vmmr_lock);
770 	size_t added = 0;
771 	size_t remain = sz;
772 	while (added < sz) {
773 		vmmr_span_t *span = NULL;
774 
775 		if (vmmr_empty_sz > 0) {
776 			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
777 
778 			vmmr_empty_sz -= span->vs_size;
779 		} else {
780 			/*
781 			 * No empty space to fill with new pages, so just tack
782 			 * it on at the end instead.
783 			 */
784 			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
785 			span->vs_addr = vmmr_empty_last;
786 			span->vs_size = remain;
787 			vmmr_empty_last += remain;
788 		}
789 		VERIFY3P(span, !=, NULL);
790 
791 
792 		/* Allocate the actual pages to back this span */
793 		mutex_exit(&vmmr_lock);
794 		int err = vmmr_alloc_pages(span);
795 		mutex_enter(&vmmr_lock);
796 
797 		/*
798 		 * If an error is encountered during page allocation for the
799 		 * span, unwind any progress made by the addition request.
800 		 */
801 		if (err != 0) {
802 			/*
803 			 * Without pages allocated to this span, it is now
804 			 * tracked as empty.
805 			 */
806 			vmmr_empty_sz += span->vs_size;
807 			vmmr_tp_insert_concat(span, &vmmr_empty_tp);
808 
809 			if (added != 0) {
810 				vmmr_remove_raw(added);
811 			}
812 
813 			vmmr_adding_sz -= sz;
814 
815 			page_unresv(sz >> PAGESHIFT);
816 			return (err);
817 		}
818 
819 		/*
820 		 * The allocated-page-bearing span is placed in the "free"
821 		 * treepair now, but is not officially exposed for consumption
822 		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
823 		 *
824 		 * This allows us to unwind the allocation in case of a failure
825 		 * without the risk of the freshly added span(s) being snapped
826 		 * up by a consumer in the meantime.
827 		 */
828 		added += span->vs_size;
829 		remain -= span->vs_size;
830 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
831 	}
832 
833 	/* Make the added memory usable by exposing it to the size accounting */
834 	if (!transient) {
835 		vmmr_free_sz += added;
836 	} else {
837 		vmmr_free_transient_sz += added;
838 	}
839 	ASSERT3U(added, ==, sz);
840 	vmmr_adding_sz -= added;
841 
842 	return (0);
843 }
844 
845 /*
846  * Remove memory from vmm reservoir.  Normally this will remove memory from the
847  * reservoir which was available for non-transient allocations.  If the removal
848  * is part of a vmmr_free() of a transient allocation, it will act on only that
849  * transient region being freed, not the available memory in the reservoir.
850  *
851  * Expects vmmr_lock to be held when called, and will return with it held, but
852  * may drop it during portions of the removal.
853  */
854 static int
855 vmmr_remove(size_t sz, bool transient)
856 {
857 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
858 	VERIFY(sz);
859 	VERIFY(MUTEX_HELD(&vmmr_lock));
860 
861 	if ((!transient && sz > vmmr_free_sz) ||
862 	    (transient && sz > vmmr_free_transient_sz)) {
863 		return (ENOSPC);
864 	}
865 
866 	vmmr_remove_raw(sz);
867 
868 	if (!transient) {
869 		vmmr_free_sz -= sz;
870 	} else {
871 		vmmr_free_transient_sz -= sz;
872 	}
873 	page_unresv(sz >> PAGESHIFT);
874 	return (0);
875 }
876 
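/*
 * Grow or shrink the reservoir toward `target_sz`, operating in increments
 * of at most `chunk_sz` bytes (when non-zero) and checking for pending
 * signals between increments.  The resulting size of the reservoir is
 * reported via `resp` whether or not the target was reached.
 */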
877 static int
878 vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
879 {
880 	VERIFY(resp != NULL);
881 
882 	mutex_enter(&vmmr_lock);
883 
884 	size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;
885 
886 	/* Be sure to communicate current size in case of an early bail-out */
887 	*resp = current_sz;
888 
889 	if ((target_sz & PAGEOFFSET) != 0 ||
890 	    (chunk_sz & PAGEOFFSET) != 0) {
891 		mutex_exit(&vmmr_lock);
892 		return (EINVAL);
893 	}
894 	/* Reject sentinel value */
895 	if (target_sz == VMMR_TARGET_INACTIVE) {
896 		mutex_exit(&vmmr_lock);
897 		return (EINVAL);
898 	}
899 
900 	/* Already at target size */
901 	if (target_sz == current_sz) {
902 		mutex_exit(&vmmr_lock);
903 		return (0);
904 	}
905 
906 	/* Reject racing resize requests */
907 	if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
908 		mutex_exit(&vmmr_lock);
909 		return (EALREADY);
910 	}
911 	/* Record the target now to exclude racing requests */
912 	vmmr_target_sz = target_sz;
913 
914 	int err = 0;
915 	do {
916 		/* Be sensitive to signal interruption */
917 		if (issig(JUSTLOOKING) != 0) {
918 			mutex_exit(&vmmr_lock);
919 			const bool sig_bail = issig(FORREAL) != 0;
920 			mutex_enter(&vmmr_lock);
921 			if (sig_bail) {
922 				err = EINTR;
923 				break;
924 			}
925 		}
926 
927 		if (current_sz > target_sz) {
928 			/* Shrinking reservoir */
929 
930 			size_t req_sz = current_sz - target_sz;
931 			if (chunk_sz != 0) {
932 				req_sz = MIN(req_sz, chunk_sz);
933 			}
934 			err = vmmr_remove(req_sz, false);
935 		} else {
936 			/* Growing reservoir */
937 			ASSERT(current_sz < target_sz);
938 
939 			size_t req_sz = target_sz - current_sz;
940 			if (chunk_sz != 0) {
941 				req_sz = MIN(req_sz, chunk_sz);
942 			}
943 			err = vmmr_add(req_sz, false);
944 		}
945 
946 		current_sz = vmmr_alloc_sz + vmmr_free_sz;
947 	} while (err == 0 && current_sz != target_sz);
948 
949 	/* Clear the target now that we are done (success or not) */
950 	vmmr_target_sz = VMMR_TARGET_INACTIVE;
951 	mutex_exit(&vmmr_lock);
952 	*resp = current_sz;
953 	return (err);
954 }
955 
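/*
 * Handle reservoir-related ioctls issued against the vmmctl device:
 * VMM_RESV_QUERY is available to anyone with the device open, while
 * VMM_RESV_SET_TARGET additionally requires sys_config privilege.
 */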
956 int
957 vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
958 {
959 	/*
960 	 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
961 	 * do not need to duplicate such checks here.
962 	 */
963 
964 	switch (cmd) {
965 	case VMM_RESV_QUERY: {
966 		struct vmm_resv_query res;
967 		void *datap = (void *)(uintptr_t)arg;
968 
969 		/* For now, anyone with access to vmmctl device can query */
970 		mutex_enter(&vmmr_lock);
971 		res.vrq_free_sz = vmmr_free_sz;
972 		res.vrq_alloc_sz = vmmr_alloc_sz;
973 		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
974 		res.vrq_limit = vmmr_total_limit;
975 		mutex_exit(&vmmr_lock);
976 		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
977 			return (EFAULT);
978 		}
979 		break;
980 	}
981 	case VMM_RESV_SET_TARGET: {
982 		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
983 			return (EPERM);
984 		}
985 
986 		struct vmm_resv_target tgt;
987 		void *datap = (void *)(uintptr_t)arg;
988 
989 		if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
990 			return (EFAULT);
991 		}
992 
993 		int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
994 		    &tgt.vrt_result_sz);
995 
996 		/*
997 		 * Attempt to communicate the resultant size of the reservoir if
998 		 * setting it to the target was a success, or if we were
999 		 * interrupted (by a signal) while doing so.
1000 		 */
1001 		if (err == 0 || err == EINTR) {
1002 			if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
1003 				err = EFAULT;
1004 			}
1005 		}
1006 
1007 		return (err);
1008 	}
1009 	default:
1010 		return (ENOTTY);
1011 	}
1012 	return (0);
1013 }
1014