/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2023 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs. This memory is unavailable for general use by the rest of the
 * system. Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available. Since the pages stored in the
 * reservoir are pre-zeroed, they can be used immediately when allocated to a
 * guest. When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system. In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir. When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally. Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations. When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made. When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation. This allows both allocation types to coexist without too much
 * additional machinery.
 *
 *
 * Administration
 *
 * Operators may attempt to alter the amount of memory allocated to the
 * reservoir via an ioctl against the vmmctl device. The total amount of memory
 * in the reservoir (free, or allocated to VMs) is limited by
 * `vmmr_total_limit` (see its definition for how this limit is calculated).
 * An illustrative sketch of such a resize request follows this comment block.
 *
 * The limit is in place to prevent the reservoir from inadvertently growing
 * to a size where the system has inadequate memory to make forward progress.
 * Shrinking the reservoir is only possible when it contains free (not
 * allocated by any guest VMs) memory.
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object). This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple. It
 * means that spans of pages, either free or allocated, need only store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span. Associating the
 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
 * properly tracked as KAS pages, but be excluded from normal dumps (unless the
 * operator has chosen to dump all of RAM).
 */

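/*
 * Illustrative userland sketch (not compiled as part of this module): how an
 * operator tool might grow the reservoir to 8 GiB in 128 MiB chunks. The
 * device path /dev/vmmctl and the exact headers are assumptions here; the
 * ioctl command and structure come from sys/vmm_dev.h, as consumed below in
 * vmmr_ioctl(). Resizing requires the privilege checked by
 * secpolicy_sys_config(), while VMM_RESV_QUERY is available to any caller
 * able to open the device.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/vmm_dev.h>
 *
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *	struct vmm_resv_target tgt = {
 *		.vrt_target_sz = 8UL << 30,	// desired total size
 *		.vrt_chunk_sz = 128UL << 20,	// grow/shrink step size
 *	};
 *	if (fd >= 0 && ioctl(fd, VMM_RESV_SET_TARGET, &tgt) == 0) {
 *		// tgt.vrt_result_sz holds the resulting reservoir size
 *	}
 *	(void) close(fd);
 */
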
#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/kstat.h>

#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>

#define	VMMR_TARGET_INACTIVE	SIZE_MAX

static kmutex_t vmmr_lock;

static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;

/*
 * Target size of the reservoir during active vmmr_set_target() operation.
 * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active.
 */
static size_t vmmr_target_sz;

static uintptr_t vmmr_empty_last;
/* Upper limit for the size (free + allocated) of the reservoir */
static size_t vmmr_total_limit;

/* VA range allocated from the VMM arena for the mappings */
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;

static kstat_t *vmmr_kstat;

/* Pair of AVL trees to store set of spans ordered by addr and size */
typedef struct vmmr_treepair {
	avl_tree_t by_addr;
	avl_tree_t by_size;
} vmmr_treepair_t;

/* Spans of free memory in the reservoir */
static vmmr_treepair_t vmmr_free_tp;

/* Spans of empty (not backed by memory) space in the reservoir */
static vmmr_treepair_t vmmr_empty_tp;

/* Regions of memory allocated from the reservoir */
static list_t vmmr_alloc_regions;

struct vmmr_span {
	uintptr_t	vs_addr;
	size_t		vs_size;
	avl_node_t	vs_by_addr;
	avl_node_t	vs_by_size;
	uintptr_t	vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;

struct vmmr_region {
	size_t		vr_size;
	avl_tree_t	vr_spans;
	list_node_t	vr_node;
	bool		vr_transient;
};

typedef struct vmmr_kstats {
	kstat_named_t	vmrks_bytes_free;
	kstat_named_t	vmrks_bytes_alloc;
	kstat_named_t	vmrks_bytes_transient;
	kstat_named_t	vmrks_bytes_limit;
} vmmr_kstats_t;


static int vmmr_add(size_t, bool);
static int vmmr_remove(size_t, bool);

static int
vmmr_cmp_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_addr == sb->vs_addr) {
		return (0);
	} else if (sa->vs_addr < sb->vs_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_size(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_size == sb->vs_size) {
		/*
		 * Since discontiguous spans could have the same size in a
		 * by-size tree, differentiate them (as required by AVL) by
		 * address so they can safely coexist while remaining sorted.
		 */
		return (vmmr_cmp_addr(a, b));
	} else if (sa->vs_size < sb->vs_size) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_region_addr == sb->vs_region_addr) {
		return (0);
	} else if (sa->vs_region_addr < sb->vs_region_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_addr));
	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_size));
}

static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
	void *vcp = NULL;
	vmmr_span_t *span;

	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
		/* Freeing spans will be done when tearing down by-size tree */
	}
	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
		kmem_free(span, sizeof (*span));
	}
	avl_destroy(&tree->by_addr);
	avl_destroy(&tree->by_size);
}

/*
 * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
 * span(s). Such concatenation could result in the `to_add` span being freed,
 * so the caller cannot use it after this returns.
 */
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *node;
	avl_index_t where;

	/* This addr should not already exist in the treepair */
	node = avl_find(by_addr, to_add, &where);
	ASSERT3P(node, ==, NULL);

	node = avl_nearest(by_addr, where, AVL_BEFORE);
	if (node != NULL &&
	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
		/* concat with preceding item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_size += to_add->vs_size;
		kmem_free(to_add, sizeof (*to_add));

		/*
		 * Since this now-concatenated span could be adjacent to one
		 * trailing it, fall through to perform that check.
		 */
		to_add = node;
	}

	node = avl_nearest(by_addr, where, AVL_AFTER);
	if (node != NULL &&
	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
		/* concat with trailing item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_addr = to_add->vs_addr;
		node->vs_size += to_add->vs_size;
		avl_add(by_addr, node);
		avl_add(by_size, node);

		kmem_free(to_add, sizeof (*to_add));
		return;
	}

	/* simply insert */
	avl_add(by_addr, to_add);
	avl_add(by_size, to_add);
}

/*
 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
 * the exact target size is not present, but a larger one is. May return a span
 * with a size smaller than the target if splitting is not an option.
 */
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *span;
	avl_index_t where;

	ASSERT3U(target_sz, !=, 0);
	ASSERT(!avl_is_empty(by_addr));
	ASSERT(!avl_is_empty(by_size));

	vmmr_span_t search = { .vs_size = target_sz };
	span = avl_find(by_size, &search, &where);
	if (span == NULL) {
		/* Try for a larger span (instead of exact match) */
		span = avl_nearest(by_size, where, AVL_AFTER);
		if (span == NULL) {
			/*
			 * Caller will need to collect several smaller spans in
			 * order to fulfill their request.
			 */
			span = avl_nearest(by_size, where, AVL_BEFORE);
			ASSERT3P(span, !=, NULL);
		}
	}

	if (span->vs_size <= target_sz) {
		avl_remove(by_size, span);
		avl_remove(by_addr, span);

		return (span);
	} else {
		/* Split off adequate chunk from larger span */
		uintptr_t start = span->vs_addr + span->vs_size - target_sz;

		avl_remove(by_size, span);
		span->vs_size -= target_sz;
		avl_add(by_size, span);

		vmmr_span_t *split_span =
		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
		split_span->vs_addr = start;
		split_span->vs_size = target_sz;

		return (split_span);
	}
}

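/*
 * A worked example of the treepair routines above, using hypothetical span
 * addresses (illustrative only, not compiled):
 *
 *	Insert-concat: a treepair holding [0x0000, 0x4000) and [0x8000, 0xc000)
 *	which has [0x4000, 0x8000) inserted via vmmr_tp_insert_concat() ends up
 *	with a single span [0x0000, 0xc000); the inserted vmmr_span_t is
 *	consumed by the merge.
 *
 *	Remove-split: calling vmmr_tp_remove_split(0x2000, ...) against that
 *	treepair splits the tail off the larger span, returning a new span
 *	[0xa000, 0xc000) and leaving [0x0000, 0xa000) behind.
 */
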
static int
vmmr_kstat_update(struct kstat *ksp, int rw)
{
	vmmr_kstats_t *vkp = ksp->ks_data;

	mutex_enter(&vmmr_lock);
	vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
	vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
	/*
	 * In addition to the memory which is actually allocated to transient
	 * consumers, memory which is considered free-for-transient is also
	 * included in the sizing.
	 */
	vkp->vmrks_bytes_transient.value.ui64 =
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
	mutex_exit(&vmmr_lock);

	return (0);
}

int
vmmr_init()
{
	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
	 * memory reservoir. It is meant to provide some measure of protection
	 * against an operator pushing the system into unrecoverable memory
	 * starvation through explicit or transient additions to the reservoir.
	 *
	 * There will be many situations where this limit would be inadequate
	 * to prevent kernel memory starvation in the face of certain operator
	 * actions. It is a balance to be struck between safety and allowing
	 * large systems to reach high utilization.
	 *
	 * The value is based off of pages_pp_maximum: "Number of currently
	 * available pages that cannot be 'locked'". It is sized as all of
	 * `physmem` less 120% of `pages_pp_maximum`.
	 */
	vmmr_total_limit =
	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
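	/*
	 * A worked example with hypothetical values: on a machine where
	 * physmem covers 16 GiB and pages_pp_maximum covers 1 GiB, the limit
	 * works out to 16 GiB - (1.2 * 1 GiB) = 14.8 GiB of total reservoir
	 * capacity.
	 */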

	vmmr_empty_last = 0;
	vmmr_free_sz = 0;
	vmmr_alloc_sz = 0;
	vmmr_empty_sz = 0;
	vmmr_adding_sz = 0;
	vmmr_free_transient_sz = 0;
	vmmr_alloc_transient_sz = 0;
	vmmr_target_sz = VMMR_TARGET_INACTIVE;

	/*
	 * Attempt kstat allocation early, since it is the only part of
	 * reservoir initialization which is fallible.
	 */
	kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
	if (ksp == NULL) {
		mutex_destroy(&vmmr_lock);
		return (ENOMEM);
	}

	vmmr_kstats_t *vkp = ksp->ks_data;

	kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
	    KSTAT_DATA_UINT64);
	ksp->ks_private = NULL;
	ksp->ks_update = vmmr_kstat_update;
	vmmr_kstat = ksp;

	vmmr_tp_init(&vmmr_free_tp);
	vmmr_tp_init(&vmmr_empty_tp);

	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
	    offsetof(vmmr_region_t, vr_node));

	/* Grab a chunk of VA for the reservoir */
	vmmr_va_sz = physmem * PAGESIZE;
	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);

	kstat_install(vmmr_kstat);

	return (0);
}

void
vmmr_fini()
{
	mutex_enter(&vmmr_lock);
	VERIFY3U(vmmr_alloc_sz, ==, 0);
	VERIFY3U(vmmr_free_sz, ==, 0);
	VERIFY3U(vmmr_adding_sz, ==, 0);
	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
	VERIFY3U(vmmr_free_transient_sz, ==, 0);
	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
	VERIFY(list_is_empty(&vmmr_alloc_regions));

	kstat_delete(vmmr_kstat);
	vmmr_kstat = NULL;

	vmmr_tp_destroy(&vmmr_free_tp);
	vmmr_tp_destroy(&vmmr_empty_tp);
	list_destroy(&vmmr_alloc_regions);

	/* Release reservoir VA chunk */
	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
	vmmr_va = 0;
	vmmr_va_sz = 0;
	vmmr_total_limit = 0;
	vmmr_empty_last = 0;

	mutex_exit(&vmmr_lock);
	mutex_destroy(&vmmr_lock);
}

bool
vmmr_is_empty()
{
	mutex_enter(&vmmr_lock);
	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
	mutex_exit(&vmmr_lock);
	return (res);
}

int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	if (!transient) {
		mutex_enter(&vmmr_lock);
		if (sz > vmmr_free_sz) {
			mutex_exit(&vmmr_lock);
			return (ENOSPC);
		}
	} else {
		int err;

		mutex_enter(&vmmr_lock);
		err = vmmr_add(sz, true);
		if (err != 0) {
			mutex_exit(&vmmr_lock);
			return (err);
		}
		VERIFY3U(vmmr_free_transient_sz, >=, sz);
	}

	vmmr_region_t *region;
	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
	region->vr_size = sz;

	size_t remain = sz;
	uintptr_t map_at = 0;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * We have already ensured that adequate free memory is present
		 * in the reservoir for this allocation.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		span->vs_region_addr = map_at;
		avl_add(&region->vr_spans, span);
		map_at += span->vs_size;
		remain -= span->vs_size;
	}

	if (!transient) {
		vmmr_free_sz -= sz;
		vmmr_alloc_sz += sz;
	} else {
		vmmr_free_transient_sz -= sz;
		vmmr_alloc_transient_sz += sz;
		region->vr_transient = true;
	}
	list_insert_tail(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	*resp = region;
	return (0);
}

void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
	/* just use KPM region for now */
	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}

pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
	VERIFY3U(off & PAGEOFFSET, ==, 0);
	VERIFY3U(off, <, region->vr_size);

	vmmr_span_t search = {
		.vs_region_addr = off
	};
	avl_index_t where;
	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);

	if (span == NULL) {
		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
		ASSERT3P(span, !=, NULL);
	}
	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
	VERIFY(pp != NULL);
	return (pp->p_pagenum);
}

void
vmmr_free(vmmr_region_t *region)
{
	mutex_enter(&vmmr_lock);
	if (!region->vr_transient) {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
	} else {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
	}
	list_remove(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	/* Zero the contents (while not monopolizing vmmr_lock) */
	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
	}

	mutex_enter(&vmmr_lock);

	/* Put the contained span(s) back in the free pool */
	void *cookie = NULL;
	vmmr_span_t *span;
	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
		span->vs_region_addr = 0;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}
	avl_destroy(&region->vr_spans);
	if (!region->vr_transient) {
		vmmr_free_sz += region->vr_size;
		vmmr_alloc_sz -= region->vr_size;
	} else {
		vmmr_free_transient_sz += region->vr_size;
		vmmr_alloc_transient_sz -= region->vr_size;
	}

	if (region->vr_transient) {
		/*
		 * Since the transient capacity was previously allocated for
		 * this region, its removal should not fail.
		 */
		VERIFY0(vmmr_remove(region->vr_size, true));
	}
	kmem_free(region, sizeof (*region));
	mutex_exit(&vmmr_lock);
}

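/*
 * Illustrative in-kernel consumer sketch (not compiled here), tying together
 * the interfaces above; the 1 MiB size and the error handling are arbitrary:
 *
 *	vmmr_region_t *region;
 *	int err;
 *
 *	// Carve 1 MiB out of the reservoir, or allocate it transiently
 *	// (capacity added on demand) if 'transient' is set.
 *	err = vmmr_alloc(1024 * 1024, transient, &region);
 *	if (err != 0)
 *		return (err);
 *
 *	// Look up backing pages by offset into the region.
 *	pfn_t pfn = vmmr_region_pfn_at(region, 0);
 *	void *va = vmmr_region_mem_at(region, PAGESIZE);
 *
 *	// On release, the pages are re-zeroed and either returned to the
 *	// reservoir free pool or, for a transient region, handed back to
 *	// the system.
 *	vmmr_free(region);
 */
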
static void
vmmr_destroy_pages(vmmr_span_t *span)
{
	const uintptr_t end = span->vs_addr + span->vs_size;
	struct vnode *vp = &kvps[KV_VVP];
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		/* Page-free logic cribbed from segkmem_xfree(): */
		pp = page_find(vp, (u_offset_t)pos);
		VERIFY(pp != NULL);
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
		}

		/*
		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
		 * That will be taken care of later via page_unresv().
		 */
		pp->p_lckcnt = 0;
		page_destroy(pp, 0);
	}
}

static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
	struct seg kseg = {
		.s_as = &kas
	};
	struct vnode *vp = &kvps[KV_VVP];

	const uintptr_t end = span->vs_addr + span->vs_size;
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));

		if (pp == NULL) {
			/* Destroy any already-created pages */
			if (pos != span->vs_addr) {
				vmmr_span_t destroy_span = {
					.vs_addr = span->vs_addr,
					.vs_size = pos - span->vs_addr,
				};

				vmmr_destroy_pages(&destroy_span);
			}
			return (ENOMEM);
		}

		/* mimic page state from segkmem */
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		pp->p_lckcnt = 1;
		page_downgrade(pp);

		/* pre-zero the page */
		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
	}

	return (0);
}

static int
vmmr_resv_wait()
{
	if (delay_sig(hz >> 2) != 0) {
		/* bail due to interruption */
		return (0);
	}
	return (1);
}

static void
vmmr_remove_raw(size_t sz)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	size_t remain = sz;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * The caller must ensure that at least `sz` amount is present
		 * in the free treepair.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		/* TODO: perhaps arrange to destroy pages outside the lock? */
		vmmr_destroy_pages(span);

		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
	}

	vmmr_empty_sz += sz;
}

/*
 * Add memory to the vmm reservoir. Memory may be marked for transient use,
 * where the addition is part of a transient allocation from the reservoir.
 * Otherwise it is placed in the reservoir to be available for non-transient
 * allocations.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * will drop it during portions of the addition.
 */
static int
vmmr_add(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY3U(sz, >, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	/*
	 * Make sure that the amount added is not going to breach the limits
	 * we've chosen.
	 */
	const size_t current_total =
	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	if ((current_total + sz) < current_total) {
		return (EOVERFLOW);
	}
	if ((current_total + sz) > vmmr_total_limit) {
		return (ENOSPC);
	}
	vmmr_adding_sz += sz;
	mutex_exit(&vmmr_lock);

	/* Wait for enough pages to become available */
	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
		mutex_enter(&vmmr_lock);
		vmmr_adding_sz -= sz;
		return (EINTR);
	}

	mutex_enter(&vmmr_lock);
	size_t added = 0;
	size_t remain = sz;
	while (added < sz) {
		vmmr_span_t *span = NULL;

		if (vmmr_empty_sz > 0) {
			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);

			vmmr_empty_sz -= span->vs_size;
		} else {
			/*
			 * No empty space to fill with new pages, so just tack
			 * it on at the end instead.
			 */
			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
			span->vs_addr = vmmr_empty_last;
			span->vs_size = remain;
			vmmr_empty_last += remain;
		}
		VERIFY3P(span, !=, NULL);

		/* Allocate the actual pages to back this span */
		mutex_exit(&vmmr_lock);
		int err = vmmr_alloc_pages(span);
		mutex_enter(&vmmr_lock);

		/*
		 * If an error is encountered during page allocation for the
		 * span, unwind any progress made by the addition request.
		 */
		if (err != 0) {
			/*
			 * Without pages allocated to this span, it is now
			 * tracked as empty.
			 */
			vmmr_empty_sz += span->vs_size;
			vmmr_tp_insert_concat(span, &vmmr_empty_tp);

			if (added != 0) {
				vmmr_remove_raw(added);
			}

			vmmr_adding_sz -= sz;

			page_unresv(sz >> PAGESHIFT);
			return (err);
		}

		/*
		 * The allocated-page-bearing span is placed in the "free"
		 * treepair now, but is not officially exposed for consumption
		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` is updated.
		 *
		 * This allows us to unwind the allocation in case of a failure
		 * without the risk of the freshly added span(s) being snapped
		 * up by a consumer in the meantime.
		 */
		added += span->vs_size;
		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}

	/* Make the added memory usable by exposing it to the size accounting */
	if (!transient) {
		vmmr_free_sz += added;
	} else {
		vmmr_free_transient_sz += added;
	}
	ASSERT3U(added, ==, sz);
	vmmr_adding_sz -= added;

	return (0);
}

/*
 * Remove memory from the vmm reservoir. Normally this will remove memory from
 * the reservoir which was available for non-transient allocations. If the
 * removal is part of a vmmr_free() of a transient allocation, it will act only
 * on that transient region being freed, not the available memory in the
 * reservoir.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * may drop it during portions of the removal.
 */
static int
vmmr_remove(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(sz);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	if ((!transient && sz > vmmr_free_sz) ||
	    (transient && sz > vmmr_free_transient_sz)) {
		return (ENOSPC);
	}

	vmmr_remove_raw(sz);

	if (!transient) {
		vmmr_free_sz -= sz;
	} else {
		vmmr_free_transient_sz -= sz;
	}
	page_unresv(sz >> PAGESHIFT);
	return (0);
}

static int
vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
{
	VERIFY(resp != NULL);

	mutex_enter(&vmmr_lock);

	size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;

	/* Be sure to communicate current size in case of an early bail-out */
	*resp = current_sz;

	if ((target_sz & PAGEOFFSET) != 0 ||
	    (chunk_sz & PAGEOFFSET) != 0) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}
	/* Reject the sentinel value */
	if (target_sz == VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}

	/* Already at target size */
	if (target_sz == current_sz) {
		mutex_exit(&vmmr_lock);
		return (0);
	}

	/* Reject racing resize requests */
	if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EALREADY);
	}
	/* Record the target now to exclude racing requests */
	vmmr_target_sz = target_sz;

	int err = 0;
	do {
		/* Be sensitive to signal interruption */
		if (issig(JUSTLOOKING) != 0) {
			mutex_exit(&vmmr_lock);
			const bool sig_bail = issig(FORREAL) != 0;
			mutex_enter(&vmmr_lock);
			if (sig_bail) {
				err = EINTR;
				break;
			}
		}

		if (current_sz > target_sz) {
			/* Shrinking reservoir */

			size_t req_sz = current_sz - target_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_remove(req_sz, false);
		} else {
			/* Growing reservoir */
			ASSERT(current_sz < target_sz);

			size_t req_sz = target_sz - current_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_add(req_sz, false);
		}

		current_sz = vmmr_alloc_sz + vmmr_free_sz;
	} while (err == 0 && current_sz != target_sz);

	/* Clear the target now that we are done (success or not) */
	vmmr_target_sz = VMMR_TARGET_INACTIVE;
	mutex_exit(&vmmr_lock);
	*resp = current_sz;
	return (err);
}

int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	/*
	 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
	 * do not need to duplicate such checks here.
	 */

	switch (cmd) {
	case VMM_RESV_QUERY: {
		struct vmm_resv_query res;
		void *datap = (void *)(uintptr_t)arg;

		/* For now, anyone with access to the vmmctl device can query */
		mutex_enter(&vmmr_lock);
		res.vrq_free_sz = vmmr_free_sz;
		res.vrq_alloc_sz = vmmr_alloc_sz;
		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
		res.vrq_limit = vmmr_total_limit;
		mutex_exit(&vmmr_lock);
		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
			return (EFAULT);
		}
		break;
	}
	case VMM_RESV_SET_TARGET: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}

		struct vmm_resv_target tgt;
		void *datap = (void *)(uintptr_t)arg;

		if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
			return (EFAULT);
		}

		int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
		    &tgt.vrt_result_sz);

		/*
		 * Attempt to communicate the resultant size of the reservoir
		 * if setting it to the target was a success, or if we were
		 * interrupted (by a signal) while doing so.
		 */
		if (err == 0 || err == EINTR) {
			if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
				err = EFAULT;
			}
		}

		return (err);
	}
	default:
		return (ENOTTY);
	}
	return (0);
}