/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2023 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs.  This memory is unavailable for general use by the rest of the
 * system.  Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available.  Since the pages stored in the
 * reservoir are pre-zeroed, they can be immediately used when allocated to a
 * guest.  When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system.  In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir.  When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally.  Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations.  When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made.  When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation.  This allows both allocation types to coexist without too much
 * additional machinery.
 *
 *
 * Administration
 *
 * Operators may attempt to alter the amount of memory allocated to the
 * reservoir via an ioctl against the vmmctl device.  The total amount of memory
 * in the reservoir (free, or allocated to VMs) is arbitrarily limited at this
 * time by `vmmr_total_limit`, which defaults to 80% of physmem.  This is done
 * to prevent the reservoir from inadvertently growing to a size where the
 * system has inadequate memory to make forward progress.  Shrinking the
 * reservoir is only possible when it contains free (not allocated by any guest
 * VMs) memory.
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object).  This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple.  It
 * means that spans of pages, either free or allocated, need only to store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span.  Associating the
 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
 * properly tracked as KAS pages, but be excluded from normal dumps (unless the
 * operator has chosen to dump all of RAM).
 */
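
/*
 * Illustrative flow (a simplified sketch; locking and error handling are
 * elided, and `len` stands in for any page-aligned size): once an operator
 * has grown the reservoir, e.g. via the VMM_RESV_SET_TARGET ioctl, a guest
 * memory object can draw from it:
 *
 *	vmmr_region_t *region;
 *	if (vmmr_alloc(len, false, &region) == 0) {
 *		pfn_t pfn = vmmr_region_pfn_at(region, 0);
 *		...map the region's pages into the guest...
 *		vmmr_free(region);
 *	}
 */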

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/kstat.h>

#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>

#define	VMMR_TARGET_INACTIVE	SIZE_MAX

static kmutex_t vmmr_lock;

static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;

/*
 * Target size of the reservoir during active vmmr_set_target() operation.
 * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active.
 */
static size_t vmmr_target_sz;

static uintptr_t vmmr_empty_last;
/* Upper limit for the size (free + allocated) of the reservoir */
static size_t vmmr_total_limit;

/* VA range allocated from the VMM arena for the mappings */
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;

static kstat_t *vmmr_kstat;

/* Pair of AVL trees to store set of spans ordered by addr and size */
typedef struct vmmr_treepair {
	avl_tree_t by_addr;
	avl_tree_t by_size;
} vmmr_treepair_t;

/* Spans of free memory in the reservoir */
static vmmr_treepair_t vmmr_free_tp;

/* Spans of empty (not backed by memory) space in the reservoir */
static vmmr_treepair_t vmmr_empty_tp;

/* Regions of memory allocated from the reservoir */
static list_t vmmr_alloc_regions;

struct vmmr_span {
	uintptr_t	vs_addr;
	size_t		vs_size;
	avl_node_t	vs_by_addr;
	avl_node_t	vs_by_size;
	uintptr_t	vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;

struct vmmr_region {
	size_t		vr_size;
	avl_tree_t	vr_spans;
	list_node_t	vr_node;
	bool		vr_transient;
};

typedef struct vmmr_kstats {
	kstat_named_t	vmrks_bytes_free;
	kstat_named_t	vmrks_bytes_alloc;
	kstat_named_t	vmrks_bytes_transient;
	kstat_named_t	vmrks_bytes_limit;
} vmmr_kstats_t;


static int vmmr_add(size_t, bool);
static int vmmr_remove(size_t, bool);

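/* Order spans by their starting address within the reservoir. */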
static int
vmmr_cmp_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_addr == sb->vs_addr) {
		return (0);
	} else if (sa->vs_addr < sb->vs_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_size(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_size == sb->vs_size) {
		/*
		 * Since discontiguous spans could have the same size in a
		 * by-size tree, differentiate them (as required by AVL) by
		 * address so they can safely coexist while remaining sorted.
		 */
		return (vmmr_cmp_addr(a, b));
	} else if (sa->vs_size < sb->vs_size) {
		return (-1);
	} else {
		return (1);
	}
}

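/* Order a region's spans by their starting offset within that region. */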
static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_region_addr == sb->vs_region_addr) {
		return (0);
	} else if (sa->vs_region_addr < sb->vs_region_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_addr));
	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_size));
}

static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
	void *vcp = NULL;
	vmmr_span_t *span;

	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
		/* Freeing spans will be done when tearing down by-size tree */
	}
	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
		kmem_free(span, sizeof (*span));
	}
	avl_destroy(&tree->by_addr);
	avl_destroy(&tree->by_size);
}

/*
 * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
 * span(s).  Such concatenation could result in the `to_add` span being freed,
 * so the caller cannot use it after this returns.
 */
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *node;
	avl_index_t where;

	/* This addr should not already exist in the treepair */
	node = avl_find(by_addr, to_add, &where);
	ASSERT3P(node, ==, NULL);

	node = avl_nearest(by_addr, where, AVL_BEFORE);
	if (node != NULL &&
	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
		/* concat with preceding item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_size += to_add->vs_size;
		kmem_free(to_add, sizeof (*to_add));

		/*
		 * Since this now-concatenated span could be adjacent to one
		 * trailing it, fall through to perform that check.
		 */
		to_add = node;
	}

	node = avl_nearest(by_addr, where, AVL_AFTER);
	if (node != NULL &&
	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
		/* concat with trailing item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_addr = to_add->vs_addr;
		node->vs_size += to_add->vs_size;
		avl_add(by_addr, node);
		avl_add(by_size, node);

		kmem_free(to_add, sizeof (*to_add));
		return;
	}

	/* simply insert */
	avl_add(by_addr, to_add);
	avl_add(by_size, to_add);
}

/*
 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
 * the exact target size is not present, but a larger one is.  May return a span
 * with a size smaller than the target if splitting is not an option.
 */
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *span;
	avl_index_t where;

	ASSERT3U(target_sz, !=, 0);
	ASSERT(!avl_is_empty(by_addr));
	ASSERT(!avl_is_empty(by_size));

	vmmr_span_t search = { .vs_size = target_sz };
	span = avl_find(by_size, &search, &where);
	if (span == NULL) {
		/* Try for a larger span (instead of exact match) */
		span = avl_nearest(by_size, where, AVL_AFTER);
		if (span == NULL) {
			/*
			 * Caller will need to collect several smaller spans in
			 * order to fulfill their request.
			 */
			span = avl_nearest(by_size, where, AVL_BEFORE);
			ASSERT3P(span, !=, NULL);
		}
	}

	if (span->vs_size <= target_sz) {
		avl_remove(by_size, span);
		avl_remove(by_addr, span);

		return (span);
	} else {
		/* Split off adequate chunk from larger span */
		uintptr_t start = span->vs_addr + span->vs_size - target_sz;

		avl_remove(by_size, span);
		span->vs_size -= target_sz;
		avl_add(by_size, span);

		vmmr_span_t *split_span =
		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
		split_span->vs_addr = start;
		split_span->vs_size = target_sz;

		return (split_span);
	}
}

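/*
 * kstat update handler: refresh the reported reservoir sizes from the
 * current accounting under vmmr_lock.
 */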
static int
vmmr_kstat_update(struct kstat *ksp, int rw)
{
	vmmr_kstats_t *vkp = ksp->ks_data;

	mutex_enter(&vmmr_lock);
	vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
	vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
	/*
	 * In addition to the memory which is actually allocated to
	 * transient consumers, memory which is considered free-for-transient is
	 * also included in the sizing.
	 */
	vkp->vmrks_bytes_transient.value.ui64 =
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
	mutex_exit(&vmmr_lock);

	return (0);
}

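/*
 * Initialize the reservoir at module load: set the overall size limit,
 * create the kstats, the free/empty treepairs, and the allocated-region
 * list, and carve out a VA range from the kvmm arena for reservoir pages.
 */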
int
vmmr_init()
{
	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
	 * memory reservoir.  It is meant to provide some measure of protection
	 * against an operator pushing the system into unrecoverable memory
	 * starvation through explicit or transient additions to the reservoir.
	 *
	 * There will be many situations where this limit would be inadequate to
	 * prevent kernel memory starvation in the face of certain operator
	 * actions.  It is a balance to be struck between safety and allowing
	 * large systems to reach high utilization.
	 *
	 * The value is based on pages_pp_maximum: "Number of currently
	 * available pages that cannot be 'locked'".  It is sized as all of
	 * `physmem` less 120% of `pages_pp_maximum`.
	 */
	vmmr_total_limit =
	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;

	vmmr_empty_last = 0;
	vmmr_free_sz = 0;
	vmmr_alloc_sz = 0;
	vmmr_empty_sz = 0;
	vmmr_adding_sz = 0;
	vmmr_free_transient_sz = 0;
	vmmr_alloc_transient_sz = 0;
	vmmr_target_sz = VMMR_TARGET_INACTIVE;

	/*
	 * Attempt kstat allocation early, since it is the only part of
	 * reservoir initialization which is fallible.
	 */
	kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
	if (ksp == NULL) {
		mutex_destroy(&vmmr_lock);
		return (ENOMEM);
	}

	vmmr_kstats_t *vkp = ksp->ks_data;

	kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
	    KSTAT_DATA_UINT64);
	ksp->ks_private = NULL;
	ksp->ks_update = vmmr_kstat_update;
	vmmr_kstat = ksp;

	vmmr_tp_init(&vmmr_free_tp);
	vmmr_tp_init(&vmmr_empty_tp);

	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
	    offsetof(vmmr_region_t, vr_node));

	/* Grab a chunk of VA for the reservoir */
	vmmr_va_sz = physmem * PAGESIZE;
	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);

	kstat_install(vmmr_kstat);

	return (0);
}

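/*
 * Tear down the reservoir at module unload.  The reservoir must be entirely
 * empty (nothing free, allocated, or mid-addition) at this point.
 */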
void
vmmr_fini()
{
	mutex_enter(&vmmr_lock);
	VERIFY3U(vmmr_alloc_sz, ==, 0);
	VERIFY3U(vmmr_free_sz, ==, 0);
	VERIFY3U(vmmr_adding_sz, ==, 0);
	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
	VERIFY3U(vmmr_free_transient_sz, ==, 0);
	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
	VERIFY(list_is_empty(&vmmr_alloc_regions));

	kstat_delete(vmmr_kstat);
	vmmr_kstat = NULL;

	vmmr_tp_destroy(&vmmr_free_tp);
	vmmr_tp_destroy(&vmmr_empty_tp);
	list_destroy(&vmmr_alloc_regions);

	/* Release reservoir VA chunk */
	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
	vmmr_va = 0;
	vmmr_va_sz = 0;
	vmmr_total_limit = 0;
	vmmr_empty_last = 0;

	mutex_exit(&vmmr_lock);
	mutex_destroy(&vmmr_lock);
}

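/* Report whether the reservoir holds no memory, free or allocated. */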
bool
vmmr_is_empty()
{
	mutex_enter(&vmmr_lock);
	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
	mutex_exit(&vmmr_lock);
	return (res);
}

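/*
 * Allocate a page-aligned region of `sz` bytes from the reservoir.  For a
 * transient allocation, the backing memory is first added to the reservoir
 * on the caller's behalf.  On success, the new region is stored in `resp`.
 */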
int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	if (!transient) {
		mutex_enter(&vmmr_lock);
		if (sz > vmmr_free_sz) {
			mutex_exit(&vmmr_lock);
			return (ENOSPC);
		}
	} else {
		int err;

		mutex_enter(&vmmr_lock);
		err = vmmr_add(sz, true);
		if (err != 0) {
			mutex_exit(&vmmr_lock);
			return (err);
		}
		VERIFY3U(vmmr_free_transient_sz, >=, sz);
	}

	vmmr_region_t *region;
	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
	region->vr_size = sz;

	size_t remain = sz;
	uintptr_t map_at = 0;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * We have already ensured that adequate free memory is present
		 * in the reservoir for this allocation.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		span->vs_region_addr = map_at;
		avl_add(&region->vr_spans, span);
		map_at += span->vs_size;
		remain -= span->vs_size;
	}

	if (!transient) {
		vmmr_free_sz -= sz;
		vmmr_alloc_sz += sz;
	} else {
		vmmr_free_transient_sz -= sz;
		vmmr_alloc_transient_sz += sz;
		region->vr_transient = true;
	}
	list_insert_tail(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	*resp = region;
	return (0);
}

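/* Return a kernel-accessible VA for the page at `off` within a region. */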
void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
	/* just use KPM region for now */
	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}

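/* Look up the PFN backing page-aligned offset `off` within a region. */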
pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
	VERIFY3U(off & PAGEOFFSET, ==, 0);
	VERIFY3U(off, <, region->vr_size);

	vmmr_span_t search = {
		.vs_region_addr = off
	};
	avl_index_t where;
	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);

	if (span == NULL) {
		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
		ASSERT3P(span, !=, NULL);
	}
	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
	VERIFY(pp != NULL);
	return (pp->p_pagenum);
}

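/*
 * Free a region back to the reservoir, zeroing its contents before the
 * spans are returned to the free pool.  For a transient region, the backing
 * memory is then removed from the reservoir and its reservation released.
 */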
void
vmmr_free(vmmr_region_t *region)
{
	mutex_enter(&vmmr_lock);
	if (!region->vr_transient) {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
	} else {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
	}
	list_remove(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	/* Zero the contents (while not monopolizing vmmr_lock) */
	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
	}

	mutex_enter(&vmmr_lock);

	/* Put the contained span(s) back in the free pool */
	void *cookie = NULL;
	vmmr_span_t *span;
	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
		span->vs_region_addr = 0;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}
	avl_destroy(&region->vr_spans);
	if (!region->vr_transient) {
		vmmr_free_sz += region->vr_size;
		vmmr_alloc_sz -= region->vr_size;
	} else {
		vmmr_free_transient_sz += region->vr_size;
		vmmr_alloc_transient_sz -= region->vr_size;
	}

	if (region->vr_transient) {
		/*
		 * Since the transient capacity was previously allocated for
		 * this region, its removal should not fail.
		 */
		VERIFY0(vmmr_remove(region->vr_size, true));
	}
	kmem_free(region, sizeof (*region));
	mutex_exit(&vmmr_lock);
}

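/*
 * Destroy the pages backing a span, returning them to the system.  The page
 * reservation is not released here; callers handle that via page_unresv().
 */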
static void
vmmr_destroy_pages(vmmr_span_t *span)
{
	const uintptr_t end = span->vs_addr + span->vs_size;
	struct vnode *vp = &kvps[KV_VVP];
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		/* Page-free logic cribbed from segkmem_xfree(): */
		pp = page_find(vp, (u_offset_t)pos);
		VERIFY(pp != NULL);
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
		}

		/*
		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
		 * That will be taken care of later via page_unresv().
		 */
		pp->p_lckcnt = 0;
		page_destroy(pp, 0);
	}
}

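/*
 * Create and pre-zero pages to back a span, associating them with the VMM
 * kernel vnode.  On failure, any pages already created for the span are
 * destroyed and ENOMEM is returned.
 */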
static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
	struct seg kseg = {
		.s_as = &kas
	};
	struct vnode *vp = &kvps[KV_VVP];

	const uintptr_t end = span->vs_addr + span->vs_size;
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));

		if (pp == NULL) {
			/* Destroy any already-created pages */
			if (pos != span->vs_addr) {
				vmmr_span_t destroy_span = {
					.vs_addr = span->vs_addr,
					.vs_size = pos - span->vs_addr,
				};

				vmmr_destroy_pages(&destroy_span);
			}
			return (ENOMEM);
		}

		/* mimic page state from segkmem */
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		pp->p_lckcnt = 1;
		page_downgrade(pp);

		/* pre-zero the page */
		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
	}

	return (0);
}

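/*
 * Passed to page_xresv() as its wait callback: pause briefly, returning 0 if
 * interrupted by a signal and non-zero otherwise.
 */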
static int
vmmr_resv_wait()
{
	if (delay_sig(hz >> 2) != 0) {
		/* bail due to interruption */
		return (0);
	}
	return (1);
}

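/*
 * Pull `sz` bytes of spans out of the free treepair, destroy their backing
 * pages, and track the now-unbacked space in the empty treepair.  Callers
 * are responsible for the related size accounting and page_unresv() call.
 */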
static void
vmmr_remove_raw(size_t sz)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	size_t remain = sz;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * The caller must ensure that at least `sz` amount is present
		 * in the free treepair.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		/* TODO: perhaps arrange to destroy pages outside the lock? */
		vmmr_destroy_pages(span);

		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
	}

	vmmr_empty_sz += sz;
}

/*
 * Add memory to vmm reservoir.  Memory may be marked for transient use, where
 * the addition is part of a transient allocation from the reservoir.  Otherwise
 * it is placed in the reservoir to be available for non-transient allocations.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * will drop it during portions of the addition.
 */
static int
vmmr_add(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY3U(sz, >, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	/*
	 * Make sure that the amount added is not going to breach the limits
	 * we've chosen
	 */
	const size_t current_total =
	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	if ((current_total + sz) < current_total) {
		return (EOVERFLOW);
	}
	if ((current_total + sz) > vmmr_total_limit) {
		return (ENOSPC);
	}
	vmmr_adding_sz += sz;
	mutex_exit(&vmmr_lock);

	/* Wait for enough pages to become available */
	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
		mutex_enter(&vmmr_lock);
		vmmr_adding_sz -= sz;
		return (EINTR);
	}

	mutex_enter(&vmmr_lock);
	size_t added = 0;
	size_t remain = sz;
	while (added < sz) {
		vmmr_span_t *span = NULL;

		if (vmmr_empty_sz > 0) {
			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);

			vmmr_empty_sz -= span->vs_size;
		} else {
			/*
			 * No empty space to fill with new pages, so just tack
			 * it on at the end instead.
			 */
			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
			span->vs_addr = vmmr_empty_last;
			span->vs_size = remain;
			vmmr_empty_last += remain;
		}
		VERIFY3P(span, !=, NULL);


		/* Allocate the actual pages to back this span */
		mutex_exit(&vmmr_lock);
		int err = vmmr_alloc_pages(span);
		mutex_enter(&vmmr_lock);

		/*
		 * If an error is encountered during page allocation for the
		 * span, unwind any progress made by the addition request.
		 */
		if (err != 0) {
			/*
			 * Without pages allocated to this span, it is now
			 * tracked as empty.
			 */
			vmmr_empty_sz += span->vs_size;
			vmmr_tp_insert_concat(span, &vmmr_empty_tp);

			if (added != 0) {
				vmmr_remove_raw(added);
			}

			vmmr_adding_sz -= sz;

			page_unresv(sz >> PAGESHIFT);
			return (err);
		}

		/*
		 * The allocated-page-bearing span is placed in the "free"
		 * treepair now, but is not officially exposed for consumption
		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` is updated.
		 *
		 * This allows us to unwind the allocation in case of a failure
		 * without the risk of the freshly added span(s) being snapped
		 * up by a consumer already.
		 */
		added += span->vs_size;
		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}

	/* Make the added memory usable by exposing it to the size accounting */
	if (!transient) {
		vmmr_free_sz += added;
	} else {
		vmmr_free_transient_sz += added;
	}
	ASSERT3U(added, ==, sz);
	vmmr_adding_sz -= added;

	return (0);
}

/*
 * Remove memory from vmm reservoir.  Normally this will remove memory from the
 * reservoir which was available for non-transient allocations.  If the removal
 * is part of a vmmr_free() of a transient allocation, it will act on only that
 * transient region being freed, not the available memory in the reservoir.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * may drop it during portions of the removal.
 */
static int
vmmr_remove(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(sz);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	if ((!transient && sz > vmmr_free_sz) ||
	    (transient && sz > vmmr_free_transient_sz)) {
		return (ENOSPC);
	}

	vmmr_remove_raw(sz);

	if (!transient) {
		vmmr_free_sz -= sz;
	} else {
		vmmr_free_transient_sz -= sz;
	}
	page_unresv(sz >> PAGESHIFT);
	return (0);
}

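/*
 * Grow or shrink the reservoir toward `target_sz`, optionally in increments
 * of `chunk_sz`, storing the resulting size in `resp`.  Only one resize may
 * be in flight at a time; a concurrent request fails with EALREADY.
 */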
static int
vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
{
	VERIFY(resp != NULL);

	mutex_enter(&vmmr_lock);

	size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;

	/* Be sure to communicate current size in case of an early bail-out */
	*resp = current_sz;

	if ((target_sz & PAGEOFFSET) != 0 ||
	    (chunk_sz & PAGEOFFSET) != 0) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}
	/* Reject sentinel value */
	if (target_sz == VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}

	/* Already at target size */
	if (target_sz == current_sz) {
		mutex_exit(&vmmr_lock);
		return (0);
	}

	/* Reject racing requests */
	if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EALREADY);
	}
	/* Record the target now to exclude a racing request */
	vmmr_target_sz = target_sz;

	int err = 0;
	do {
		/* Be sensitive to signal interruption */
		if (issig(JUSTLOOKING) != 0) {
			mutex_exit(&vmmr_lock);
			const bool sig_bail = issig(FORREAL) != 0;
			mutex_enter(&vmmr_lock);
			if (sig_bail) {
				err = EINTR;
				break;
			}
		}

		if (current_sz > target_sz) {
			/* Shrinking reservoir */

			size_t req_sz = current_sz - target_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_remove(req_sz, false);
		} else {
			/* Growing reservoir */
			ASSERT(current_sz < target_sz);

			size_t req_sz = target_sz - current_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_add(req_sz, false);
		}

		current_sz = vmmr_alloc_sz + vmmr_free_sz;
	} while (err == 0 && current_sz != target_sz);

	/* Clear the target now that we are done (success or not) */
	vmmr_target_sz = VMMR_TARGET_INACTIVE;
	mutex_exit(&vmmr_lock);
	*resp = current_sz;
	return (err);
}

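/*
 * Handle reservoir-related ioctls issued through the vmmctl device:
 * VMM_RESV_QUERY reports the current sizes and VMM_RESV_SET_TARGET resizes
 * the reservoir.
 */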
int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	/*
	 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
	 * do not need to duplicate such checks here.
	 */

	switch (cmd) {
	case VMM_RESV_QUERY: {
		struct vmm_resv_query res;
		void *datap = (void *)(uintptr_t)arg;

		/* For now, anyone in GZ can query */
		if (crgetzoneid(cr) != GLOBAL_ZONEID) {
			return (EPERM);
		}
		mutex_enter(&vmmr_lock);
		res.vrq_free_sz = vmmr_free_sz;
		res.vrq_alloc_sz = vmmr_alloc_sz;
		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
		res.vrq_limit = vmmr_total_limit;
		mutex_exit(&vmmr_lock);
		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
			return (EFAULT);
		}
		break;
	}
	case VMM_RESV_SET_TARGET: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}

		struct vmm_resv_target tgt;
		void *datap = (void *)(uintptr_t)arg;

		if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
			return (EFAULT);
		}

		int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
		    &tgt.vrt_result_sz);

		/*
		 * Attempt to communicate the resultant size of the reservoir if
		 * setting it to the target was a success, or if we were
		 * interrupted (by a signal) while doing so.
		 */
		if (err == 0 || err == EINTR) {
			if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
				err = EFAULT;
			}
		}

		return (err);
	}
	default:
		return (ENOTTY);
	}
	return (0);
}
1016