xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_reservoir.c (revision fdad6fbf87b201fdb96a704fc41fa8be1e4efbc8)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2023 Oxide Computer Company
15  */
16 
17 /*
18  * VMM Memory Reservoir
19  *
20  *
21  * In order to make the allocation of large (multi-GiB) chunks of memory
22  * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
23  * operators can set aside a substantial portion of system memory exclusively
24  * for VMs.  This memory is unavailable for general use by the rest of the
25  * system.  Rather than having to scour the freelist, reap kmem caches, or put
26  * pressure on the ARC, bhyve guest memory allocations can quickly determine if
27  * there is adequate reservoir memory available.  Since the pages stored in the
28  * reservoir are pre-zeroed, it can be immediately used when allocated to a
29  * guest.  When the memory is returned to the reservoir, it is zeroed once more
30  * to avoid leaking any sensitive data from that guest.
31  *
32  *
33  * Transient Allocations
34  *
35  * While the explicit reservoir model may work well for some applications,
36  * others may want a more traditional model, where pages for guest memory
37  * objects are allocated on demand, rather than from a pool set aside from the
38  * system.  In this case, the allocation can be made in "transient" mode, where
39  * the memory is allocated normally, even if there is free capacity in the
40  * reservoir.  When use of the transient allocation is complete (the guest is
41  * halted and destroyed), the pages will be freed back to the system, rather
42  * than added back to the reservoir.
43  *
44  * From an implementation standpoint, transient allocations follow the same
45  * code paths as ones using the reservoir normally.  Those allocations have a
46  * tag which marks them as transient, and used/free size tallies are maintained
47  * separately for normal and transient operations.  When performing a transient
48  * allocation, that amount of memory is immediately added to the reservoir,
49  * from which the allocation can be made.  When freeing a transient allocation,
50  * a matching amount of memory is removed from the reservoir as part of the
51  * operation.  This allows both allocation types to coexist without too much
52  * additional machinery.
53  *
54  *
55  * Administration
56  *
57  * Operators may attempt to alter the amount of memory allocated to the
58  * reservoir via an ioctl against the vmmctl device (sketched below).  The
59  * total amount of memory in the reservoir (free, or allocated to VMs) is
60  * limited by `vmmr_total_limit` (see its definition for how it is calculated).
61  *
62  * The limit is in place to prevent the reservoir from inadvertently growing
63  * to a size where the system has inadequate memory to make forward progress.
64  * Shrinking the reservoir is only possible when it contains free (not
65  * allocated by any guest VMs) memory.
66  *
67  *
68  * Page Tracking
69  *
70  * The reservoir currently uses vnode association to keep track of pages under
71  * its control (either designated to the reservoir and free, or allocated to a
72  * guest VM object).  This means using the existing VM system primitives for
73  * page_t instances being associated with a given (vnode, offset) tuple.  It
74  * means that spans of pages, either free or allocated, need only to store a
75  * length (of the span) and an offset (into the vnode) in order to gain access
76  * to all of the underlying pages associated with that span.  Associating the
77  * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
78  * properly tracked as KAS pages, but be excluded from normal dumps (unless the
79  * operator has chosen to dump all of RAM).
80  */
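/*
 * Illustrative userland sketch (not compiled here) of the administration
 * interface described above.  It assumes the conventional /dev/vmmctl device
 * path and the ioctl definitions provided by sys/vmm_dev.h:
 *
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *	struct vmm_resv_target tgt = {
 *		.vrt_target_sz = 4UL << 30,	(grow the reservoir to 4 GiB)
 *		.vrt_chunk_sz = 128UL << 20,	(work in 128 MiB increments)
 *	};
 *	if (ioctl(fd, VMM_RESV_SET_TARGET, &tgt) == 0)
 *		(tgt.vrt_result_sz now holds the resulting reservoir size)
 *
 * A VMM_RESV_QUERY ioctl with a struct vmm_resv_query reports the current
 * free, allocated, transient, and limit sizes, and requires no privilege
 * beyond access to the device.
 */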
81 
82 #include <sys/types.h>
83 #include <sys/mutex.h>
84 #include <sys/avl.h>
85 #include <sys/list.h>
86 #include <sys/machparam.h>
87 #include <sys/kmem.h>
88 #include <sys/stddef.h>
89 #include <sys/null.h>
90 #include <sys/errno.h>
91 #include <sys/systm.h>
92 #include <sys/sunddi.h>
93 #include <sys/policy.h>
94 #include <vm/seg_kmem.h>
95 #include <vm/hat_i86.h>
96 #include <sys/kstat.h>
97 
98 #include <sys/vmm_reservoir.h>
99 #include <sys/vmm_dev.h>
100 #include <sys/vmm_impl.h>
101 
102 #define	VMMR_TARGET_INACTIVE	SIZE_MAX
103 
104 static kmutex_t vmmr_lock;
105 
106 static size_t vmmr_free_sz;
107 static size_t vmmr_free_transient_sz;
108 static size_t vmmr_adding_sz;
109 static size_t vmmr_alloc_sz;
110 static size_t vmmr_alloc_transient_sz;
111 static size_t vmmr_empty_sz;
112 
113 /*
114  * Target size of the reservoir during an active vmmr_set_target() operation.
115  * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active.
116  */
117 static size_t vmmr_target_sz;
118 
119 static uintptr_t vmmr_empty_last;
120 /* Upper limit for the size (free + allocated) of the reservoir */
121 static size_t vmmr_total_limit;
122 
123 /* VA range allocated from the VMM arena for the mappings */
124 static uintptr_t vmmr_va;
125 static uintptr_t vmmr_va_sz;
126 
127 static kstat_t *vmmr_kstat;
128 
129 /* Pair of AVL trees to store set of spans ordered by addr and size */
130 typedef struct vmmr_treepair {
131 	avl_tree_t by_addr;
132 	avl_tree_t by_size;
133 } vmmr_treepair_t;
134 
135 /* Spans of free memory in the reservoir */
136 static vmmr_treepair_t vmmr_free_tp;
137 
138 /* Spans of empty (not backed by memory) space in the reservoir */
139 static vmmr_treepair_t vmmr_empty_tp;
140 
141 /* Regions of memory allocated from the reservoir */
142 static list_t vmmr_alloc_regions;
143 
144 struct vmmr_span {
145 	uintptr_t	vs_addr;
146 	size_t		vs_size;
147 	avl_node_t	vs_by_addr;
148 	avl_node_t	vs_by_size;
149 	uintptr_t	vs_region_addr;
150 };
151 typedef struct vmmr_span vmmr_span_t;
152 
153 struct vmmr_region {
154 	size_t		vr_size;
155 	avl_tree_t	vr_spans;
156 	list_node_t	vr_node;
157 	bool		vr_transient;
158 };
159 
160 typedef struct vmmr_kstats {
161 	kstat_named_t	vmrks_bytes_free;
162 	kstat_named_t	vmrks_bytes_alloc;
163 	kstat_named_t	vmrks_bytes_transient;
164 	kstat_named_t	vmrks_bytes_limit;
165 } vmmr_kstats_t;
166 
167 
168 static int vmmr_add(size_t, bool);
169 static int vmmr_remove(size_t, bool);
170 
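/* AVL comparator: order spans by their base (reservoir) address. */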
171 static int
172 vmmr_cmp_addr(const void *a, const void *b)
173 {
174 	const vmmr_span_t *sa = a;
175 	const vmmr_span_t *sb = b;
176 
177 	if (sa->vs_addr == sb->vs_addr) {
178 		return (0);
179 	} else if (sa->vs_addr < sb->vs_addr) {
180 		return (-1);
181 	} else {
182 		return (1);
183 	}
184 }
185 
186 static int
187 vmmr_cmp_size(const void *a, const void *b)
188 {
189 	const vmmr_span_t *sa = a;
190 	const vmmr_span_t *sb = b;
191 
192 	if (sa->vs_size == sb->vs_size) {
193 		/*
194 		 * Since discontiguous spans could have the same size in a
195 		 * by-size tree, differentiate them (as required by AVL) by
196 		 * address so they can safely coexist while remaining sorted.
197 		 */
198 		return (vmmr_cmp_addr(a, b));
199 	} else if (sa->vs_size < sb->vs_size) {
200 		return (-1);
201 	} else {
202 		return (1);
203 	}
204 }
205 
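/* AVL comparator: order a region's spans by offset within the region. */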
206 static int
207 vmmr_cmp_region_addr(const void *a, const void *b)
208 {
209 	const vmmr_span_t *sa = a;
210 	const vmmr_span_t *sb = b;
211 
212 	if (sa->vs_region_addr == sb->vs_region_addr) {
213 		return (0);
214 	} else if (sa->vs_region_addr < sb->vs_region_addr) {
215 		return (-1);
216 	} else {
217 		return (1);
218 	}
219 }
220 
221 static void
222 vmmr_tp_init(vmmr_treepair_t *tree)
223 {
224 	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
225 	    offsetof(vmmr_span_t, vs_by_addr));
226 	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
227 	    offsetof(vmmr_span_t, vs_by_size));
228 }
229 
230 static void
231 vmmr_tp_destroy(vmmr_treepair_t *tree)
232 {
233 	void *vcp = NULL;
234 	vmmr_span_t *span;
235 
236 	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
237 		/* Freeing spans will be done when tearing down by-size tree */
238 	}
239 	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
240 		kmem_free(span, sizeof (*span));
241 	}
242 	avl_destroy(&tree->by_addr);
243 	avl_destroy(&tree->by_size);
244 }
245 
246 /*
247  * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
248  * span(s).  Such concatenation could result in the `to_add` span being freed,
249  * so the caller cannot use it after this returns.
250  */
251 static void
252 vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
253 {
254 	avl_tree_t *by_addr = &tree->by_addr;
255 	avl_tree_t *by_size = &tree->by_size;
256 	vmmr_span_t *node;
257 	avl_index_t where;
258 
259 	/* This addr should not already exist in the treepair */
260 	node = avl_find(by_addr, to_add, &where);
261 	ASSERT3P(node, ==, NULL);
262 
263 	node = avl_nearest(by_addr, where, AVL_BEFORE);
264 	if (node != NULL &&
265 	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
266 		/* concat with preceding item */
267 		avl_remove(by_addr, node);
268 		avl_remove(by_size, node);
269 		node->vs_size += to_add->vs_size;
270 		kmem_free(to_add, sizeof (*to_add));
271 
272 		/*
273 		 * Since this now-concatenated span could be adjacent to one
274 		 * trailing it, fall through to perform that check.
275 		 */
276 		to_add = node;
277 	}
278 
279 	node = avl_nearest(by_addr, where, AVL_AFTER);
280 	if (node != NULL &&
281 	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
282 		/* concat with trailing item */
283 		avl_remove(by_addr, node);
284 		avl_remove(by_size, node);
285 		node->vs_addr = to_add->vs_addr;
286 		node->vs_size += to_add->vs_size;
287 		avl_add(by_addr, node);
288 		avl_add(by_size, node);
289 
290 		kmem_free(to_add, sizeof (*to_add));
291 		return;
292 	}
293 
294 	/* simply insert */
295 	avl_add(by_addr, to_add);
296 	avl_add(by_size, to_add);
297 }
298 
299 /*
300  * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
301  * the exact target size is not present, but a larger one is.  May return a span
302  * with a size smaller than the target if splitting is not an option.
303  */
304 static vmmr_span_t *
305 vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
306 {
307 	avl_tree_t *by_addr = &tree->by_addr;
308 	avl_tree_t *by_size = &tree->by_size;
309 	vmmr_span_t *span;
310 	avl_index_t where;
311 
312 	ASSERT3U(target_sz, !=, 0);
313 	ASSERT(!avl_is_empty(by_addr));
314 	ASSERT(!avl_is_empty(by_size));
315 
316 	vmmr_span_t search = { .vs_size = target_sz };
317 	span = avl_find(by_size, &search, &where);
318 	if (span == NULL) {
319 		/* Try for a larger span (instead of exact match) */
320 		span = avl_nearest(by_size, where, AVL_AFTER);
321 		if (span == NULL) {
322 			/*
323 			 * Caller will need to collect several smaller spans in
324 			 * order to fulfill their request.
325 			 */
326 			span = avl_nearest(by_size, where, AVL_BEFORE);
327 			ASSERT3P(span, !=, NULL);
328 		}
329 	}
330 
331 	if (span->vs_size <= target_sz) {
332 		avl_remove(by_size, span);
333 		avl_remove(by_addr, span);
334 
335 		return (span);
336 	} else {
337 		/* Split off adequate chunk from larger span */
338 		uintptr_t start = span->vs_addr + span->vs_size - target_sz;
339 
340 		avl_remove(by_size, span);
341 		span->vs_size -= target_sz;
342 		avl_add(by_size, span);
343 
344 		vmmr_span_t *split_span =
345 		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
346 		split_span->vs_addr = start;
347 		split_span->vs_size = target_sz;
348 
349 		return (split_span);
350 	}
351 }
352 
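/* kstat update callback: snapshot the reservoir counters under vmmr_lock. */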
353 static int
354 vmmr_kstat_update(struct kstat *ksp, int rw)
355 {
356 	vmmr_kstats_t *vkp = ksp->ks_data;
357 
358 	mutex_enter(&vmmr_lock);
359 	vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
360 	vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
361 	/*
362 	 * In addition to the memory which is actually allocated to transient
363 	 * consumers, memory which is considered free-for-transient is also
364 	 * included in the sizing.
365 	 */
366 	vkp->vmrks_bytes_transient.value.ui64 =
367 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
368 	vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
369 	mutex_exit(&vmmr_lock);
370 
371 	return (0);
372 }
373 
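/*
 * One-time reservoir setup: compute `vmmr_total_limit`, create the kstats and
 * span treepairs, and reserve a physmem-sized VA range from the kvmm arena.
 */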
374 int
375 vmmr_init()
376 {
377 	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
378 
379 	/*
380 	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
381 	 * memory reservoir.  It is meant to provide some measure of protection
382 	 * against an operator pushing the system into unrecoverable memory
383 	 * starvation through explicit or transient additions to the reservoir.
384 	 *
385 	 * There will be many situations where this limit would be inadequate to
386 	 * prevent kernel memory starvation in the face of certain operator
387 	 * actions.  It is a balance to be struck between safety and allowing
388 	 * large systems to reach high utilization.
389 	 *
390 	 * The value is based off of pages_pp_maximum: "Number of currently
391 	 * available pages that cannot be 'locked'".  It is sized as all of
392 	 * `physmem` less 120% of `pages_pp_maximum`.
393 	 */
394 	vmmr_total_limit =
395 	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
396 
397 	vmmr_empty_last = 0;
398 	vmmr_free_sz = 0;
399 	vmmr_alloc_sz = 0;
400 	vmmr_empty_sz = 0;
401 	vmmr_adding_sz = 0;
402 	vmmr_free_transient_sz = 0;
403 	vmmr_alloc_transient_sz = 0;
404 	vmmr_target_sz = VMMR_TARGET_INACTIVE;
405 
406 	/*
407 	 * Attempt kstat allocation early, since it is the only part of
408 	 * reservoir initialization which is fallible.
409 	 */
410 	kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
411 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
412 	    sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
413 	if (ksp == NULL) {
414 		mutex_destroy(&vmmr_lock);
415 		return (ENOMEM);
416 	}
417 
418 	vmmr_kstats_t *vkp = ksp->ks_data;
419 
420 	kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
421 	    KSTAT_DATA_UINT64);
422 	kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
423 	    KSTAT_DATA_UINT64);
424 	kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
425 	    KSTAT_DATA_UINT64);
426 	kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
427 	    KSTAT_DATA_UINT64);
428 	ksp->ks_private = NULL;
429 	ksp->ks_update = vmmr_kstat_update;
430 	vmmr_kstat = ksp;
431 
432 	vmmr_tp_init(&vmmr_free_tp);
433 	vmmr_tp_init(&vmmr_empty_tp);
434 
435 	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
436 	    offsetof(vmmr_region_t, vr_node));
437 
438 	/* Grab a chunk of VA for the reservoir */
439 	vmmr_va_sz = physmem * PAGESIZE;
440 	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
441 
442 	kstat_install(vmmr_kstat);
443 
444 	return (0);
445 }
446 
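/*
 * Tear down reservoir state.  The reservoir must be entirely empty (no free,
 * allocated, or in-flight memory) when this is called.
 */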
447 void
448 vmmr_fini()
449 {
450 	mutex_enter(&vmmr_lock);
451 	VERIFY3U(vmmr_alloc_sz, ==, 0);
452 	VERIFY3U(vmmr_free_sz, ==, 0);
453 	VERIFY3U(vmmr_adding_sz, ==, 0);
454 	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
455 	VERIFY3U(vmmr_free_transient_sz, ==, 0);
456 	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
457 	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
458 	VERIFY(list_is_empty(&vmmr_alloc_regions));
459 
460 	kstat_delete(vmmr_kstat);
461 	vmmr_kstat = NULL;
462 
463 	vmmr_tp_destroy(&vmmr_free_tp);
464 	vmmr_tp_destroy(&vmmr_empty_tp);
465 	list_destroy(&vmmr_alloc_regions);
466 
467 	/* Release reservoir VA chunk */
468 	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
469 	vmmr_va = 0;
470 	vmmr_va_sz = 0;
471 	vmmr_total_limit = 0;
472 	vmmr_empty_last = 0;
473 
474 	mutex_exit(&vmmr_lock);
475 	mutex_destroy(&vmmr_lock);
476 }
477 
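/* Does the reservoir currently hold no memory at all, free or allocated? */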
478 bool
479 vmmr_is_empty()
480 {
481 	mutex_enter(&vmmr_lock);
482 	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
483 	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
484 	mutex_exit(&vmmr_lock);
485 	return (res);
486 }
487 
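/*
 * Allocate a region of `sz` bytes from the reservoir.  Transient requests
 * first grow the reservoir by `sz` (via vmmr_add()) and are torn back down
 * when the region is freed.  Returns ENOSPC if adequate free reservoir space
 * is not available for a non-transient request.
 */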
488 int
489 vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
490 {
491 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
492 
493 	if (!transient) {
494 		mutex_enter(&vmmr_lock);
495 		if (sz > vmmr_free_sz) {
496 			mutex_exit(&vmmr_lock);
497 			return (ENOSPC);
498 		}
499 	} else {
500 		int err;
501 
502 		mutex_enter(&vmmr_lock);
503 		err = vmmr_add(sz, true);
504 		if (err != 0) {
505 			mutex_exit(&vmmr_lock);
506 			return (err);
507 		}
508 		VERIFY3U(vmmr_free_transient_sz, >=, sz);
509 	}
510 
511 	vmmr_region_t *region;
512 	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
513 	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
514 	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
515 	region->vr_size = sz;
516 
517 	size_t remain = sz;
518 	uintptr_t map_at = 0;
519 	while (remain > 0) {
520 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
521 
522 		/*
523 		 * We have already ensured that adequate free memory is present
524 		 * in the reservoir for this allocation.
525 		 */
526 		VERIFY3P(span, !=, NULL);
527 		ASSERT3U(span->vs_size, <=, remain);
528 
529 		span->vs_region_addr = map_at;
530 		avl_add(&region->vr_spans, span);
531 		map_at += span->vs_size;
532 		remain -= span->vs_size;
533 	}
534 
535 	if (!transient) {
536 		vmmr_free_sz -= sz;
537 		vmmr_alloc_sz += sz;
538 	} else {
539 		vmmr_free_transient_sz -= sz;
540 		vmmr_alloc_transient_sz += sz;
541 		region->vr_transient = true;
542 	}
543 	list_insert_tail(&vmmr_alloc_regions, region);
544 	mutex_exit(&vmmr_lock);
545 
546 	*resp = region;
547 	return (0);
548 }
549 
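/* Return a kernel-accessible (KPM) mapping of the page backing `off`. */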
550 void *
551 vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
552 {
553 	/* just use KPM region for now */
554 	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
555 }
556 
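/*
 * Translate an offset within a region to the PFN backing it by locating the
 * containing span and looking up the page hashed against the VMM vnode.
 */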
557 pfn_t
558 vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
559 {
560 	VERIFY3U(off & PAGEOFFSET, ==, 0);
561 	VERIFY3U(off, <, region->vr_size);
562 
563 	vmmr_span_t search = {
564 		.vs_region_addr = off
565 	};
566 	avl_index_t where;
567 	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
568 
569 	if (span == NULL) {
570 		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
571 		ASSERT3P(span, !=, NULL);
572 	}
573 	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
574 	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
575 	VERIFY(pp != NULL);
576 	return (pp->p_pagenum);
577 }
578 
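/*
 * Release a region back to the reservoir.  The contents are re-zeroed before
 * the spans rejoin the free pool; transient regions additionally have their
 * capacity removed from the reservoir entirely.
 */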
579 void
580 vmmr_free(vmmr_region_t *region)
581 {
582 	mutex_enter(&vmmr_lock);
583 	if (!region->vr_transient) {
584 		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
585 	} else {
586 		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
587 	}
588 	list_remove(&vmmr_alloc_regions, region);
589 	mutex_exit(&vmmr_lock);
590 
591 	/* Zero the contents (while not monopolizing vmmr_lock) */
592 	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
593 		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
594 	}
595 
596 	mutex_enter(&vmmr_lock);
597 
598 	/* Put the contained span(s) back in the free pool */
599 	void *cookie = NULL;
600 	vmmr_span_t *span;
601 	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
602 		span->vs_region_addr = 0;
603 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
604 	}
605 	avl_destroy(&region->vr_spans);
606 	if (!region->vr_transient) {
607 		vmmr_free_sz += region->vr_size;
608 		vmmr_alloc_sz -= region->vr_size;
609 	} else {
610 		vmmr_free_transient_sz += region->vr_size;
611 		vmmr_alloc_transient_sz -= region->vr_size;
612 	}
613 
614 	if (region->vr_transient) {
615 		/*
616 		 * Since the transient capacity was previously allocated for
617 		 * this region, its removal should not fail.
618 		 */
619 		VERIFY0(vmmr_remove(region->vr_size, true));
620 	}
621 	kmem_free(region, sizeof (*region));
622 	mutex_exit(&vmmr_lock);
623 }
624 
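/*
 * Destroy the pages backing a span, returning them to the system.  The
 * matching page_unresv() is left to the caller.
 */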
625 static void
626 vmmr_destroy_pages(vmmr_span_t *span)
627 {
628 	const uintptr_t end = span->vs_addr + span->vs_size;
629 	struct vnode *vp = &kvps[KV_VVP];
630 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
631 		page_t *pp;
632 
633 		/* Page-free logic cribbed from segkmem_xfree(): */
634 		pp = page_find(vp, (u_offset_t)pos);
635 		VERIFY(pp != NULL);
636 		if (!page_tryupgrade(pp)) {
637 			/*
638 			 * Some other thread has a sharelock. Wait for
639 			 * it to drop the lock so we can free this page.
640 			 */
641 			page_unlock(pp);
642 			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
643 		}
644 
645 		/*
646 		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
647 		 * That will be taken care of later via page_unresv().
648 		 */
649 		pp->p_lckcnt = 0;
650 		page_destroy(pp, 0);
651 	}
652 }
653 
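/*
 * Create, configure, and pre-zero pages backing a span, hashed against the
 * VMM vnode at the span's addresses.  On failure, any pages created so far
 * are destroyed and ENOMEM is returned.
 */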
654 static int
655 vmmr_alloc_pages(const vmmr_span_t *span)
656 {
657 	struct seg kseg = {
658 		.s_as = &kas
659 	};
660 	struct vnode *vp = &kvps[KV_VVP];
661 
662 	const uintptr_t end = span->vs_addr + span->vs_size;
663 	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
664 		page_t *pp;
665 
666 		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
667 		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
668 
669 		if (pp == NULL) {
670 			/* Destroy any already-created pages */
671 			if (pos != span->vs_addr) {
672 				vmmr_span_t destroy_span = {
673 					.vs_addr = span->vs_addr,
674 					.vs_size = pos - span->vs_addr,
675 				};
676 
677 				vmmr_destroy_pages(&destroy_span);
678 			}
679 			return (ENOMEM);
680 		}
681 
682 		/* mimic page state from segkmem */
683 		ASSERT(PAGE_EXCL(pp));
684 		page_io_unlock(pp);
685 		pp->p_lckcnt = 1;
686 		page_downgrade(pp);
687 
688 		/* pre-zero the page */
689 		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
690 	}
691 
692 	return (0);
693 }
694 
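/*
 * page_xresv() wait callback: pause briefly between reservation attempts,
 * returning 0 (give up) if interrupted by a signal.
 */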
695 static int
696 vmmr_resv_wait()
697 {
698 	if (delay_sig(hz >> 2) != 0) {
699 		/* bail due to interruption */
700 		return (0);
701 	}
702 	return (1);
703 }
704 
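/*
 * Strip `sz` bytes of page-backed spans from the free treepair, destroying
 * their pages and moving the emptied spans to the empty treepair.  The caller
 * must hold vmmr_lock, guarantee `sz` is available, and adjust the free-size
 * accounting and page reservation itself.
 */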
705 static void
706 vmmr_remove_raw(size_t sz)
707 {
708 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
709 	VERIFY(MUTEX_HELD(&vmmr_lock));
710 
711 	size_t remain = sz;
712 	while (remain > 0) {
713 		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
714 
715 		/*
716 		 * The caller must ensure that at least `sz` amount is present
717 		 * in the free treepair.
718 		 */
719 		VERIFY3P(span, !=, NULL);
720 		ASSERT3U(span->vs_size, <=, remain);
721 
722 		/* TODO: perhaps arrange to destroy pages outside the lock? */
723 		vmmr_destroy_pages(span);
724 
725 		remain -= span->vs_size;
726 		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
727 	}
728 
729 	vmmr_empty_sz += sz;
730 }
731 
732 /*
733  * Add memory to vmm reservoir.  Memory may be marked for transient use, where
734  * the addition is part of a transient allocation from the reservoir.  Otherwise
735  * it is placed in the reservoir to be available for non-transient allocations.
736  *
737  * Expects vmmr_lock to be held when called, and will return with it held, but
738  * will drop it during portions of the addition.
739  */
740 static int
741 vmmr_add(size_t sz, bool transient)
742 {
743 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
744 	VERIFY3U(sz, >, 0);
745 	VERIFY(MUTEX_HELD(&vmmr_lock));
746 
747 	/*
748 	 * Make sure that the amount added is not going to breach the limits
749 	 * we've chosen
750 	 */
751 	const size_t current_total =
752 	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
753 	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
754 	if ((current_total + sz) < current_total) {
755 		return (EOVERFLOW);
756 	}
757 	if ((current_total + sz) > vmmr_total_limit) {
758 		return (ENOSPC);
759 	}
760 	vmmr_adding_sz += sz;
761 	mutex_exit(&vmmr_lock);
762 
763 	/* Wait for enough pages to become available */
764 	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
765 		mutex_enter(&vmmr_lock);
766 		vmmr_adding_sz -= sz;
767 		return (EINTR);
768 	}
769 
770 	mutex_enter(&vmmr_lock);
771 	size_t added = 0;
772 	size_t remain = sz;
773 	while (added < sz) {
774 		vmmr_span_t *span = NULL;
775 
776 		if (vmmr_empty_sz > 0) {
777 			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
778 
779 			vmmr_empty_sz -= span->vs_size;
780 		} else {
781 			/*
782 			 * No empty space to fill with new pages, so just tack
783 			 * it on at the end instead.
784 			 */
785 			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
786 			span->vs_addr = vmmr_empty_last;
787 			span->vs_size = remain;
788 			vmmr_empty_last += remain;
789 		}
790 		VERIFY3P(span, !=, NULL);
791 
792 
793 		/* Allocate the actual pages to back this span */
794 		mutex_exit(&vmmr_lock);
795 		int err = vmmr_alloc_pages(span);
796 		mutex_enter(&vmmr_lock);
797 
798 		/*
799 		 * If an error is encountered during page allocation for the
800 		 * span, unwind any progress made by the addition request.
801 		 */
802 		if (err != 0) {
803 			/*
804 			 * Without pages allocated to this span, it is now
805 			 * tracked as empty.
806 			 */
807 			vmmr_empty_sz += span->vs_size;
808 			vmmr_tp_insert_concat(span, &vmmr_empty_tp);
809 
810 			if (added != 0) {
811 				vmmr_remove_raw(added);
812 			}
813 
814 			vmmr_adding_sz -= sz;
815 
816 			page_unresv(sz >> PAGESHIFT);
817 			return (err);
818 		}
819 
820 		/*
821 		 * The allocated-page-bearing span is placed in the "free"
822 		 * treepair now, but is not officially exposed for consumption
823 		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
824 		 *
825 		 * This allows us to unwind the allocation in case of a failure
826 		 * without the risk of the freshly added span(s) being snapped
827 		 * up by a consumer already.
828 		 */
829 		added += span->vs_size;
830 		remain -= span->vs_size;
831 		vmmr_tp_insert_concat(span, &vmmr_free_tp);
832 	}
833 
834 	/* Make the added memory usable by exposing it to the size accounting */
835 	if (!transient) {
836 		vmmr_free_sz += added;
837 	} else {
838 		vmmr_free_transient_sz += added;
839 	}
840 	ASSERT3U(added, ==, sz);
841 	vmmr_adding_sz -= added;
842 
843 	return (0);
844 }
845 
846 /*
847  * Remove memory from vmm reservoir.  Normally this will remove memory from the
848  * reservoir which was available for non-transient allocations.  If the removal
849  * is part of a vmmr_free() of a transient allocation, it will act on only that
850  * transient region being freed, not the available memory in the reservoir.
851  *
852  * Expects vmmr_lock to be held when called, and will return with it held, but
853  * may drop it during portions of the removal.
854  */
855 static int
856 vmmr_remove(size_t sz, bool transient)
857 {
858 	VERIFY3U(sz & PAGEOFFSET, ==, 0);
859 	VERIFY(sz);
860 	VERIFY(MUTEX_HELD(&vmmr_lock));
861 
862 	if ((!transient && sz > vmmr_free_sz) ||
863 	    (transient && sz > vmmr_free_transient_sz)) {
864 		return (ENOSPC);
865 	}
866 
867 	vmmr_remove_raw(sz);
868 
869 	if (!transient) {
870 		vmmr_free_sz -= sz;
871 	} else {
872 		vmmr_free_transient_sz -= sz;
873 	}
874 	page_unresv(sz >> PAGESHIFT);
875 	return (0);
876 }
877 
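/*
 * Grow or shrink the reservoir toward `target_sz`, working in steps of at
 * most `chunk_sz` bytes (0 means unchunked) so a large resize remains
 * responsive to signals.  The resulting size is reported via `resp` even on
 * partial progress.
 */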
878 static int
879 vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
880 {
881 	VERIFY(resp != NULL);
882 
883 	mutex_enter(&vmmr_lock);
884 
885 	size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;
886 
887 	/* Be sure to communicate current size in case of an early bail-out */
888 	*resp = current_sz;
889 
890 	if ((target_sz & PAGEOFFSET) != 0 ||
891 	    (chunk_sz & PAGEOFFSET) != 0) {
892 		mutex_exit(&vmmr_lock);
893 		return (EINVAL);
894 	}
895 	/* Reject sentinel value */
896 	if (target_sz == VMMR_TARGET_INACTIVE) {
897 		mutex_exit(&vmmr_lock);
898 		return (EINVAL);
899 	}
900 
901 	/* Already at target size */
902 	if (target_sz == current_sz) {
903 		mutex_exit(&vmmr_lock);
904 		return (0);
905 	}
906 
907 	/* Reject racing requests */
908 	if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
909 		mutex_exit(&vmmr_lock);
910 		return (EALREADY);
911 	}
912 	/* Record the target now to exclude racing requests */
913 	vmmr_target_sz = target_sz;
914 
915 	int err = 0;
916 	do {
917 		/* Be sensitive to signal interruption */
918 		if (issig(JUSTLOOKING) != 0) {
919 			mutex_exit(&vmmr_lock);
920 			const bool sig_bail = issig(FORREAL) != 0;
921 			mutex_enter(&vmmr_lock);
922 			if (sig_bail) {
923 				err = EINTR;
924 				break;
925 			}
926 		}
927 
928 		if (current_sz > target_sz) {
929 			/* Shrinking reservoir */
930 
931 			size_t req_sz = current_sz - target_sz;
932 			if (chunk_sz != 0) {
933 				req_sz = MIN(req_sz, chunk_sz);
934 			}
935 			err = vmmr_remove(req_sz, false);
936 		} else {
937 			/* Growing reservoir */
938 			ASSERT(current_sz < target_sz);
939 
940 			size_t req_sz = target_sz - current_sz;
941 			if (chunk_sz != 0) {
942 				req_sz = MIN(req_sz, chunk_sz);
943 			}
944 			err = vmmr_add(req_sz, false);
945 		}
946 
947 		current_sz = vmmr_alloc_sz + vmmr_free_sz;
948 	} while (err == 0 && current_sz != target_sz);
949 
950 	/* Clear the target now that we are done (success or not) */
951 	vmmr_target_sz = VMMR_TARGET_INACTIVE;
952 	mutex_exit(&vmmr_lock);
953 	*resp = current_sz;
954 	return (err);
955 }
956 
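/*
 * Entry point for reservoir ioctls issued against the vmmctl device:
 * VMM_RESV_QUERY (open to any holder of the device) and VMM_RESV_SET_TARGET
 * (requires sys_config privilege).
 */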
957 int
958 vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
959 {
960 	/*
961 	 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
962 	 * do not need to duplicate such checks here.
963 	 */
964 
965 	switch (cmd) {
966 	case VMM_RESV_QUERY: {
967 		struct vmm_resv_query res;
968 		void *datap = (void *)(uintptr_t)arg;
969 
970 		/* For now, anyone with access to vmmctl device can query */
971 		mutex_enter(&vmmr_lock);
972 		res.vrq_free_sz = vmmr_free_sz;
973 		res.vrq_alloc_sz = vmmr_alloc_sz;
974 		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
975 		res.vrq_limit = vmmr_total_limit;
976 		mutex_exit(&vmmr_lock);
977 		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
978 			return (EFAULT);
979 		}
980 		break;
981 	}
982 	case VMM_RESV_SET_TARGET: {
983 		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
984 			return (EPERM);
985 		}
986 
987 		struct vmm_resv_target tgt;
988 		void *datap = (void *)(uintptr_t)arg;
989 
990 		if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
991 			return (EFAULT);
992 		}
993 
994 		int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
995 		    &tgt.vrt_result_sz);
996 
997 		/*
998 		 * Attempt to communicate the resultant size of the reservoir if
999 		 * setting it to the target was a success, or if we were
1000 		 * interrupted (by a signal) while doing so.
1001 		 */
1002 		if (err == 0 || err == EINTR) {
1003 			if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
1004 				err = EFAULT;
1005 			}
1006 		}
1007 
1008 		return (err);
1009 	}
1010 	default:
1011 		return (ENOTTY);
1012 	}
1013 	return (0);
1014 }
1015