1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2023 Oxide Computer Company
14 */
15
16 /*
17 * VMM Memory Reservoir
18 *
19 *
20 * In order to make the allocation of large (multi-GiB) chunks of memory
21 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
22 * operators can set aside a substantial portion of system memory exclusively
23 * for VMs. This memory is unavailable for general use by the rest of the
24 * system. Rather than having to scour the freelist, reap kmem caches, or put
25 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
26 * there is adequate reservoir memory available. Since the pages stored in the
27 * reservoir are pre-zeroed, it can be immediately used when allocated to a
28 * guest. When the memory is returned to the reservoir, it is zeroed once more
29 * to avoid leaking any sensitive data from that guest.
30 *
31 *
32 * Transient Allocations
33 *
34 * While the explicit reservoir model may work well for some applications,
35 * others may want a more traditional model, where pages for guest memory
36 * objects are allocated on demand, rather than from a pool set aside from the
37 * system. In this case, the allocation can be made in "transient" mode, where
38 * the memory is allocated normally, even if there is free capacity in the
39 * reservoir. When use of the transient allocation is complete (the guest is
40 * halted and destroyed), the pages will be freed back to the system, rather
41 * than added back to the reservoir.
42 *
43 * From an implementation standpoint, transient allocations follow the same
44 * code paths as ones using the reservoir normally. Those allocations have a
45 * tag which marks them as transient, and used/free size tallies are maintained
46 * separately for normal and transient operations. When performing a transient
47 * allocation, that amount of memory is immediately added to the reservoir,
48 * from which the allocation can be made. When freeing a transient allocation,
49 * a matching amount of memory is removed from the reservoir as part of the
50 * operation. This allows both allocation types to coexist without too much
51 * additional machinery.
52 *
53 *
54 * Administration
55 *
56 * Operators may attempt to alter the amount of memory allocated to the
57 * reservoir via an ioctl against the vmmctl device. The total amount of memory
58 * in the reservoir (free, or allocated to VMs) is limited by
59 * `vmmr_total_limit` (see its definition for how this limit is calculated).
* A sketch of how these ioctls might be exercised from userspace follows this
* comment block.
60 *
61 * The limit is in place to prevent the reservoir from inadvertently growing
62 * to a size where the system has inadequate memory to make forward progress.
63 * Shrinking the reservoir is only possible when it contains free (not
64 * allocated by any guest VMs) memory.
65 *
66 *
67 * Page Tracking
68 *
69 * The reservoir currently uses vnode association to keep track of pages under
70 * its control (either designated to the reservoir and free, or allocated to a
71 * guest VM object). This means using the existing VM system primitives for
72 * page_t instances being associated with a given (vnode, offset) tuple. It
73 * means that spans of pages, either free or allocated, need only to store a
74 * length (of the span) and an offset (into the vnode) in order to gain access
75 * to all of the underlying pages associated with that span. Associating the
76 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
77 * properly tracked as KAS pages, but be excluded from normal dumps (unless the
78 * operator has chosen to dump all of RAM).
79 */
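/*
 * Illustrative sketch only (not compiled as part of this file): one way a
 * userspace utility might drive the Administration interface described above.
 * The device path (/dev/vmmctl), header choices, and the resv_resize() helper
 * are assumptions made for the example; the ioctl commands and structures are
 * the ones consumed by vmmr_ioctl() below, assumed visible to userland via
 * <sys/vmm_dev.h>.
 *
 *	#include <sys/types.h>
 *	#include <sys/vmm_dev.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stropts.h>
 *	#include <stdio.h>
 *
 *	int
 *	resv_resize(size_t target_bytes, size_t chunk_bytes)
 *	{
 *		int fd = open("/dev/vmmctl", O_RDWR);
 *		if (fd < 0)
 *			return (-1);
 *
 *		struct vmm_resv_target tgt = {
 *			.vrt_target_sz = target_bytes,
 *			.vrt_chunk_sz = chunk_bytes,
 *		};
 *		int res = ioctl(fd, VMM_RESV_SET_TARGET, &tgt);
 *		if (res == 0)
 *			(void) printf("reservoir now %zu bytes\n",
 *			    (size_t)tgt.vrt_result_sz);
 *
 *		(void) close(fd);
 *		return (res);
 *	}
 */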
80
81 #include <sys/types.h>
82 #include <sys/mutex.h>
83 #include <sys/avl.h>
84 #include <sys/list.h>
85 #include <sys/machparam.h>
86 #include <sys/kmem.h>
87 #include <sys/stddef.h>
88 #include <sys/null.h>
89 #include <sys/errno.h>
90 #include <sys/systm.h>
91 #include <sys/sunddi.h>
92 #include <sys/policy.h>
93 #include <vm/seg_kmem.h>
94 #include <vm/hat_i86.h>
95 #include <sys/kstat.h>
96
97 #include <sys/vmm_reservoir.h>
98 #include <sys/vmm_dev.h>
99 #include <sys/vmm_impl.h>
100
101 #define VMMR_TARGET_INACTIVE SIZE_MAX
102
103 static kmutex_t vmmr_lock;
104
105 static size_t vmmr_free_sz;
106 static size_t vmmr_free_transient_sz;
107 static size_t vmmr_adding_sz;
108 static size_t vmmr_alloc_sz;
109 static size_t vmmr_alloc_transient_sz;
110 static size_t vmmr_empty_sz;
111
112 /*
113 * Target size of the reservoir during active vmmr_set_target() operation.
114 * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active.
115 */
116 static size_t vmmr_target_sz;
117
118 static uintptr_t vmmr_empty_last;
119 /* Upper limit for the size (free + allocated) of the reservoir */
120 static size_t vmmr_total_limit;
121
122 /* VA range allocated from the VMM arena for the mappings */
123 static uintptr_t vmmr_va;
124 static uintptr_t vmmr_va_sz;
125
126 static kstat_t *vmmr_kstat;
127
128 /* Pair of AVL trees to store set of spans ordered by addr and size */
129 typedef struct vmmr_treepair {
130 avl_tree_t by_addr;
131 avl_tree_t by_size;
132 } vmmr_treepair_t;
133
134 /* Spans of free memory in the reservoir */
135 static vmmr_treepair_t vmmr_free_tp;
136
137 /* Spans of empty (not backed by memory) space in the reservoir */
138 static vmmr_treepair_t vmmr_empty_tp;
139
140 /* Regions of memory allocated from the reservoir */
141 static list_t vmmr_alloc_regions;
142
143 struct vmmr_span {
144 uintptr_t vs_addr;
145 size_t vs_size;
146 avl_node_t vs_by_addr;
147 avl_node_t vs_by_size;
148 uintptr_t vs_region_addr;
149 };
150 typedef struct vmmr_span vmmr_span_t;
151
152 struct vmmr_region {
153 size_t vr_size;
154 avl_tree_t vr_spans;
155 list_node_t vr_node;
156 bool vr_transient;
157 };
158
159 typedef struct vmmr_kstats {
160 kstat_named_t vmrks_bytes_free;
161 kstat_named_t vmrks_bytes_alloc;
162 kstat_named_t vmrks_bytes_transient;
163 kstat_named_t vmrks_bytes_limit;
164 } vmmr_kstats_t;
165
166
167 static int vmmr_add(size_t, bool);
168 static int vmmr_remove(size_t, bool);
169
170 static int
171 vmmr_cmp_addr(const void *a, const void *b)
172 {
173 const vmmr_span_t *sa = a;
174 const vmmr_span_t *sb = b;
175
176 if (sa->vs_addr == sb->vs_addr) {
177 return (0);
178 } else if (sa->vs_addr < sb->vs_addr) {
179 return (-1);
180 } else {
181 return (1);
182 }
183 }
184
185 static int
186 vmmr_cmp_size(const void *a, const void *b)
187 {
188 const vmmr_span_t *sa = a;
189 const vmmr_span_t *sb = b;
190
191 if (sa->vs_size == sb->vs_size) {
192 /*
193 * Since discontiguous spans could have the same size in a
194 * by-size tree, differentiate them (as required by AVL) by
195 * address so they can safely coexist while remaining sorted.
196 */
197 return (vmmr_cmp_addr(a, b));
198 } else if (sa->vs_size < sb->vs_size) {
199 return (-1);
200 } else {
201 return (1);
202 }
203 }
204
205 static int
206 vmmr_cmp_region_addr(const void *a, const void *b)
207 {
208 const vmmr_span_t *sa = a;
209 const vmmr_span_t *sb = b;
210
211 if (sa->vs_region_addr == sb->vs_region_addr) {
212 return (0);
213 } else if (sa->vs_region_addr < sb->vs_region_addr) {
214 return (-1);
215 } else {
216 return (1);
217 }
218 }
219
220 static void
221 vmmr_tp_init(vmmr_treepair_t *tree)
222 {
223 avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
224 offsetof(vmmr_span_t, vs_by_addr));
225 avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
226 offsetof(vmmr_span_t, vs_by_size));
227 }
228
229 static void
230 vmmr_tp_destroy(vmmr_treepair_t *tree)
231 {
232 void *vcp = NULL;
233 vmmr_span_t *span;
234
235 while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
236 /* Freeing spans will be done when tearing down by-size tree */
237 }
238 while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
239 kmem_free(span, sizeof (*span));
240 }
241 avl_destroy(&tree->by_addr);
242 avl_destroy(&tree->by_size);
243 }
244
245 /*
246 * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
247 * span(s). Such concatenation could result in the `to_add` span being freed,
248 * so the caller cannot use it after this returns.
249 */
250 static void
251 vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
252 {
253 avl_tree_t *by_addr = &tree->by_addr;
254 avl_tree_t *by_size = &tree->by_size;
255 vmmr_span_t *node;
256 avl_index_t where;
257
258 /* This addr should not already exist in the treepair */
259 node = avl_find(by_addr, to_add, &where);
260 ASSERT3P(node, ==, NULL);
261
262 node = avl_nearest(by_addr, where, AVL_BEFORE);
263 if (node != NULL &&
264 (node->vs_addr + node->vs_size) == to_add->vs_addr) {
265 /* concat with preceding item */
266 avl_remove(by_addr, node);
267 avl_remove(by_size, node);
268 node->vs_size += to_add->vs_size;
269 kmem_free(to_add, sizeof (*to_add));
270
271 /*
272 * Since this now-concatenated span could be adjacent to one
273 * trailing it, fall through to perform that check.
274 */
275 to_add = node;
276 }
277
278 node = avl_nearest(by_addr, where, AVL_AFTER);
279 if (node != NULL &&
280 (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
281 /* concat with trailing item */
282 avl_remove(by_addr, node);
283 avl_remove(by_size, node);
284 node->vs_addr = to_add->vs_addr;
285 node->vs_size += to_add->vs_size;
286 avl_add(by_addr, node);
287 avl_add(by_size, node);
288
289 kmem_free(to_add, sizeof (*to_add));
290 return;
291 }
292
293 /* simply insert */
294 avl_add(by_addr, to_add);
295 avl_add(by_size, to_add);
296 }
297
298 /*
299 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
300 * the exact target size is not present, but a larger one is. May return a span
301 * with a size smaller than the target if splitting is not an option.
302 */
303 static vmmr_span_t *
304 vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
305 {
306 avl_tree_t *by_addr = &tree->by_addr;
307 avl_tree_t *by_size = &tree->by_size;
308 vmmr_span_t *span;
309 avl_index_t where;
310
311 ASSERT3U(target_sz, !=, 0);
312 ASSERT(!avl_is_empty(by_addr));
313 ASSERT(!avl_is_empty(by_size));
314
315 vmmr_span_t search = { .vs_size = target_sz };
316 span = avl_find(by_size, &search, &where);
317 if (span == NULL) {
318 /* Try for a larger span (instead of exact match) */
319 span = avl_nearest(by_size, where, AVL_AFTER);
320 if (span == NULL) {
321 /*
322 * Caller will need to collect several smaller spans in
323 * order to fulfill their request.
324 */
325 span = avl_nearest(by_size, where, AVL_BEFORE);
326 ASSERT3P(span, !=, NULL);
327 }
328 }
329
330 if (span->vs_size <= target_sz) {
331 avl_remove(by_size, span);
332 avl_remove(by_addr, span);
333
334 return (span);
335 } else {
336 /* Split off adequate chunk from larger span */
337 uintptr_t start = span->vs_addr + span->vs_size - target_sz;
338
339 avl_remove(by_size, span);
340 span->vs_size -= target_sz;
341 avl_add(by_size, span);
342
343 vmmr_span_t *split_span =
344 kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
345 split_span->vs_addr = start;
346 split_span->vs_size = target_sz;
347
348 return (split_span);
349 }
350 }
351
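/*
 * kstat update handler: snapshot the reservoir byte counters under vmmr_lock
 * so consumers see a consistent view of the free, allocated, transient, and
 * limit sizes.
 */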
352 static int
353 vmmr_kstat_update(struct kstat *ksp, int rw)
354 {
355 vmmr_kstats_t *vkp = ksp->ks_data;
356
357 mutex_enter(&vmmr_lock);
358 vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
359 vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
360 /*
361 * In addition to the memory which is actually allocated to
362 * transient consumers, memory which is considered free-for-transient is
363 * also included in the sizing.
364 */
365 vkp->vmrks_bytes_transient.value.ui64 =
366 vmmr_alloc_transient_sz + vmmr_free_transient_sz;
367 vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
368 mutex_exit(&vmmr_lock);
369
370 return (0);
371 }
372
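/*
 * Initialize the reservoir at module load: compute vmmr_total_limit, set up
 * the kstats, treepairs, and allocated-region list, and reserve a VA range
 * from the kvmm arena sized to cover all of physical memory.
 */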
373 int
374 vmmr_init()
375 {
376 mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
377
378 /*
379 * `vmmr_total_limit` represents the absolute maximum size of the VMM
380 * memory reservoir. It is meant to provide some measure of protection
381 * against an operator pushing the system into unrecoverable memory
382 * starvation through explicit or transient additions to the reservoir.
383 *
384 * There will be many situations where this limit would be inadequate to
385 * prevent kernel memory starvation in the face of certain operator
386 * actions. It is a balance to be struck between safety and allowing
387 * large systems to reach high utilization.
388 *
389 * The value is based off of pages_pp_maximum: "Number of currently
390 * available pages that cannot be 'locked'". It is sized as all of
391 * `physmem` less 120% of `pages_pp_maximum`.
392 */
393 vmmr_total_limit =
394 (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
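/*
 * Purely illustrative arithmetic: on a host where `physmem` corresponds to
 * 256 GiB of memory and `pages_pp_maximum` to 10 GiB, the limit computed
 * above works out to 256 GiB - (1.2 * 10 GiB) = 244 GiB.
 */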
395
396 vmmr_empty_last = 0;
397 vmmr_free_sz = 0;
398 vmmr_alloc_sz = 0;
399 vmmr_empty_sz = 0;
400 vmmr_adding_sz = 0;
401 vmmr_free_transient_sz = 0;
402 vmmr_alloc_transient_sz = 0;
403 vmmr_target_sz = VMMR_TARGET_INACTIVE;
404
405 /*
406 * Attempt kstat allocation early, since it is the only part of
407 * reservoir initialization which is fallible.
408 */
409 kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
410 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
411 sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
412 if (ksp == NULL) {
413 mutex_destroy(&vmmr_lock);
414 return (ENOMEM);
415 }
416
417 vmmr_kstats_t *vkp = ksp->ks_data;
418
419 kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
420 KSTAT_DATA_UINT64);
421 kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
422 KSTAT_DATA_UINT64);
423 kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
424 KSTAT_DATA_UINT64);
425 kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
426 KSTAT_DATA_UINT64);
427 ksp->ks_private = NULL;
428 ksp->ks_update = vmmr_kstat_update;
429 vmmr_kstat = ksp;
430
431 vmmr_tp_init(&vmmr_free_tp);
432 vmmr_tp_init(&vmmr_empty_tp);
433
434 list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
435 offsetof(vmmr_region_t, vr_node));
436
437 /* Grab a chunk of VA for the reservoir */
438 vmmr_va_sz = physmem * PAGESIZE;
439 vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
440
441 kstat_install(vmmr_kstat);
442
443 return (0);
444 }
445
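/*
 * Tear down the reservoir at module unload.  The reservoir must be completely
 * empty (no free, allocated, transient, or in-flight memory) before this is
 * called; the VERIFYs below enforce that expectation.
 */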
446 void
447 vmmr_fini()
448 {
449 mutex_enter(&vmmr_lock);
450 VERIFY3U(vmmr_alloc_sz, ==, 0);
451 VERIFY3U(vmmr_free_sz, ==, 0);
452 VERIFY3U(vmmr_adding_sz, ==, 0);
453 VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
454 VERIFY3U(vmmr_free_transient_sz, ==, 0);
455 VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
456 VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
457 VERIFY(list_is_empty(&vmmr_alloc_regions));
458
459 kstat_delete(vmmr_kstat);
460 vmmr_kstat = NULL;
461
462 vmmr_tp_destroy(&vmmr_free_tp);
463 vmmr_tp_destroy(&vmmr_empty_tp);
464 list_destroy(&vmmr_alloc_regions);
465
466 /* Release reservoir VA chunk */
467 vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
468 vmmr_va = 0;
469 vmmr_va_sz = 0;
470 vmmr_total_limit = 0;
471 vmmr_empty_last = 0;
472
473 mutex_exit(&vmmr_lock);
474 mutex_destroy(&vmmr_lock);
475 }
476
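/* Report whether the reservoir holds no memory at all, free or allocated. */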
477 bool
478 vmmr_is_empty()
479 {
480 mutex_enter(&vmmr_lock);
481 bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
482 vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
483 mutex_exit(&vmmr_lock);
484 return (res);
485 }
486
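/*
 * Allocate a region of `sz` bytes from the reservoir.  Non-transient requests
 * are satisfied only from existing free reservoir memory (ENOSPC otherwise),
 * while transient requests first grow the reservoir by `sz` via vmmr_add().
 * On success the new region is recorded in vmmr_alloc_regions and returned
 * through `resp`.
 */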
487 int
488 vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
489 {
490 VERIFY3U(sz & PAGEOFFSET, ==, 0);
491
492 if (!transient) {
493 mutex_enter(&vmmr_lock);
494 if (sz > vmmr_free_sz) {
495 mutex_exit(&vmmr_lock);
496 return (ENOSPC);
497 }
498 } else {
499 int err;
500
501 mutex_enter(&vmmr_lock);
502 err = vmmr_add(sz, true);
503 if (err != 0) {
504 mutex_exit(&vmmr_lock);
505 return (err);
506 }
507 VERIFY3U(vmmr_free_transient_sz, >=, sz);
508 }
509
510 vmmr_region_t *region;
511 region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
512 avl_create(®ion->vr_spans, vmmr_cmp_region_addr,
513 sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
514 region->vr_size = sz;
515
516 size_t remain = sz;
517 uintptr_t map_at = 0;
518 while (remain > 0) {
519 vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
520
521 /*
522 * We have already ensured that adequate free memory is present
523 * in the reservoir for this allocation.
524 */
525 VERIFY3P(span, !=, NULL);
526 ASSERT3U(span->vs_size, <=, remain);
527
528 span->vs_region_addr = map_at;
529 avl_add(®ion->vr_spans, span);
530 map_at += span->vs_size;
531 remain -= span->vs_size;
532 }
533
534 if (!transient) {
535 vmmr_free_sz -= sz;
536 vmmr_alloc_sz += sz;
537 } else {
538 vmmr_free_transient_sz -= sz;
539 vmmr_alloc_transient_sz += sz;
540 region->vr_transient = true;
541 }
542 list_insert_tail(&vmmr_alloc_regions, region);
543 mutex_exit(&vmmr_lock);
544
545 *resp = region;
546 return (0);
547 }
548
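/*
 * Return a kernel-accessible pointer (via the KPM segment) to the page backing
 * page-aligned offset `off` within an allocated region.
 */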
549 void *
550 vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
551 {
552 /* just use KPM region for now */
553 return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
554 }
555
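/*
 * Look up the PFN backing page-aligned offset `off` within an allocated
 * region by translating the region offset to its reservoir span and finding
 * the corresponding page on the VMM kernel vnode.
 */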
556 pfn_t
557 vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
558 {
559 VERIFY3U(off & PAGEOFFSET, ==, 0);
560 VERIFY3U(off, <, region->vr_size);
561
562 vmmr_span_t search = {
563 .vs_region_addr = off
564 };
565 avl_index_t where;
566 vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
567
568 if (span == NULL) {
569 span = avl_nearest(®ion->vr_spans, where, AVL_BEFORE);
570 ASSERT3P(span, !=, NULL);
571 }
572 uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
573 page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
574 VERIFY(pp != NULL);
575 return (pp->p_pagenum);
576 }
577
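/*
 * Return an allocated region to the reservoir.  The backing pages are
 * re-zeroed (outside vmmr_lock) before the spans rejoin the free treepair.
 * For transient regions, the matching amount of memory is then removed from
 * the reservoir entirely.
 */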
578 void
579 vmmr_free(vmmr_region_t *region)
580 {
581 mutex_enter(&vmmr_lock);
582 if (!region->vr_transient) {
583 VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
584 } else {
585 VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
586 }
587 list_remove(&vmmr_alloc_regions, region);
588 mutex_exit(&vmmr_lock);
589
590 /* Zero the contents (while not monopolizing vmmr_lock) */
591 for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
592 bzero(vmmr_region_mem_at(region, off), PAGESIZE);
593 }
594
595 mutex_enter(&vmmr_lock);
596
597 /* Put the contained span(s) back in the free pool */
598 void *cookie = NULL;
599 vmmr_span_t *span;
600 while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
601 span->vs_region_addr = 0;
602 vmmr_tp_insert_concat(span, &vmmr_free_tp);
603 }
604 avl_destroy(®ion->vr_spans);
605 if (!region->vr_transient) {
606 vmmr_free_sz += region->vr_size;
607 vmmr_alloc_sz -= region->vr_size;
608 } else {
609 vmmr_free_transient_sz += region->vr_size;
610 vmmr_alloc_transient_sz -= region->vr_size;
611 }
612
613 if (region->vr_transient) {
614 /*
615 * Since the transient capacity was previously allocated for
616 * this region, its removal should not fail.
617 */
618 VERIFY0(vmmr_remove(region->vr_size, true));
619 }
620 kmem_free(region, sizeof (*region));
621 mutex_exit(&vmmr_lock);
622 }
623
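/*
 * Destroy the pages backing a span, returning them to the system.  The
 * page-free logic mirrors segkmem_xfree(); page_unresv() accounting is left
 * to the caller.
 */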
624 static void
625 vmmr_destroy_pages(vmmr_span_t *span)
626 {
627 const uintptr_t end = span->vs_addr + span->vs_size;
628 struct vnode *vp = &kvps[KV_VVP];
629 for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
630 page_t *pp;
631
632 /* Page-free logic cribbed from segkmem_xfree(): */
633 pp = page_find(vp, (u_offset_t)pos);
634 VERIFY(pp != NULL);
635 if (!page_tryupgrade(pp)) {
636 /*
637 * Some other thread has a sharelock. Wait for
638 * it to drop the lock so we can free this page.
639 */
640 page_unlock(pp);
641 pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
642 }
643
644 /*
645 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
646 * That will be taken care of later via page_unresv().
647 */
648 pp->p_lckcnt = 0;
649 page_destroy(pp, 0);
650 }
651 }
652
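/*
 * Create and pre-zero pages to back a span, associating each page with the
 * VMM kernel vnode at the offset matching its position in the reservoir.
 * On failure, any pages already created for the span are destroyed and
 * ENOMEM is returned.
 */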
653 static int
654 vmmr_alloc_pages(const vmmr_span_t *span)
655 {
656 struct seg kseg = {
657 .s_as = &kas
658 };
659 struct vnode *vp = &kvps[KV_VVP];
660
661 const uintptr_t end = span->vs_addr + span->vs_size;
662 for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
663 page_t *pp;
664
665 pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
666 PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
667
668 if (pp == NULL) {
669 /* Destroy any already-created pages */
670 if (pos != span->vs_addr) {
671 vmmr_span_t destroy_span = {
672 .vs_addr = span->vs_addr,
673 .vs_size = pos - span->vs_addr,
674 };
675
676 vmmr_destroy_pages(&destroy_span);
677 }
678 return (ENOMEM);
679 }
680
681 /* mimic page state from segkmem */
682 ASSERT(PAGE_EXCL(pp));
683 page_io_unlock(pp);
684 pp->p_lckcnt = 1;
685 page_downgrade(pp);
686
687 /* pre-zero the page */
688 bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
689 }
690
691 return (0);
692 }
693
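/*
 * Callback used with page_xresv() while waiting for page reservations: pause
 * briefly between attempts, returning 0 to abandon the wait if a signal is
 * pending.
 */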
694 static int
695 vmmr_resv_wait()
696 {
697 if (delay_sig(hz >> 2) != 0) {
698 /* bail due to interruption */
699 return (0);
700 }
701 return (1);
702 }
703
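/*
 * Strip `sz` bytes of page-backed spans out of the free treepair, destroying
 * their pages and moving the now-empty spans to the empty treepair.  The
 * caller must hold vmmr_lock, guarantee that `sz` is available in the free
 * treepair, and handle the size accounting and page_unresv() itself.
 */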
704 static void
705 vmmr_remove_raw(size_t sz)
706 {
707 VERIFY3U(sz & PAGEOFFSET, ==, 0);
708 VERIFY(MUTEX_HELD(&vmmr_lock));
709
710 size_t remain = sz;
711 while (remain > 0) {
712 vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
713
714 /*
715 * The caller must ensure that at least `sz` amount is present
716 * in the free treepair.
717 */
718 VERIFY3P(span, !=, NULL);
719 ASSERT3U(span->vs_size, <=, remain);
720
721 /* TODO: perhaps arrange to destroy pages outside the lock? */
722 vmmr_destroy_pages(span);
723
724 remain -= span->vs_size;
725 vmmr_tp_insert_concat(span, &vmmr_empty_tp);
726 }
727
728 vmmr_empty_sz += sz;
729 }
730
731 /*
732 * Add memory to vmm reservoir. Memory may be marked for transient use, where
733 * the addition is part of a transient allocation from the reservoir. Otherwise
734 * it is placed in the reservoir to be available for non-transient allocations.
735 *
736 * Expects vmmr_lock to be held when called, and will return with it held, but
737 * will drop it during portions of the addition.
738 */
739 static int
740 vmmr_add(size_t sz, bool transient)
741 {
742 VERIFY3U(sz & PAGEOFFSET, ==, 0);
743 VERIFY3U(sz, >, 0);
744 VERIFY(MUTEX_HELD(&vmmr_lock));
745
746 /*
747 * Make sure that the amount added is not going to breach the limits
748 * we've chosen
749 */
750 const size_t current_total =
751 vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
752 vmmr_alloc_transient_sz + vmmr_free_transient_sz;
753 if ((current_total + sz) < current_total) {
754 return (EOVERFLOW);
755 }
756 if ((current_total + sz) > vmmr_total_limit) {
757 return (ENOSPC);
758 }
759 vmmr_adding_sz += sz;
760 mutex_exit(&vmmr_lock);
761
762 /* Wait for enough pages to become available */
763 if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
764 mutex_enter(&vmmr_lock);
765 vmmr_adding_sz -= sz;
766 return (EINTR);
767 }
768
769 mutex_enter(&vmmr_lock);
770 size_t added = 0;
771 size_t remain = sz;
772 while (added < sz) {
773 vmmr_span_t *span = NULL;
774
775 if (vmmr_empty_sz > 0) {
776 span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
777
778 vmmr_empty_sz -= span->vs_size;
779 } else {
780 /*
781 * No empty space to fill with new pages, so just tack
782 * it on at the end instead.
783 */
784 span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
785 span->vs_addr = vmmr_empty_last;
786 span->vs_size = remain;
787 vmmr_empty_last += remain;
788 }
789 VERIFY3P(span, !=, NULL);
790
791
792 /* Allocate the actual pages to back this span */
793 mutex_exit(&vmmr_lock);
794 int err = vmmr_alloc_pages(span);
795 mutex_enter(&vmmr_lock);
796
797 /*
798 * If an error is encountered during page allocation for the
799 * span, unwind any progress made by the addition request.
800 */
801 if (err != 0) {
802 /*
803 * Without pages allocated to this span, it is now
804 * tracked as empty.
805 */
806 vmmr_empty_sz += span->vs_size;
807 vmmr_tp_insert_concat(span, &vmmr_empty_tp);
808
809 if (added != 0) {
810 vmmr_remove_raw(added);
811 }
812
813 vmmr_adding_sz -= sz;
814
815 page_unresv(sz >> PAGESHIFT);
816 return (err);
817 }
818
819 /*
820 * The allocated-page-bearing span is placed in the "free"
821 * treepair now, but is not officially exposed for consumption
822 * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
823 *
824 * This allows us to unwind the allocation in case of a failure
825 * without the risk of the freshly added span(s) being snapped
826 * up by a consumer already.
827 */
828 added += span->vs_size;
829 remain -= span->vs_size;
830 vmmr_tp_insert_concat(span, &vmmr_free_tp);
831 }
832
833 /* Make the added memory usable by exposing it to the size accounting */
834 if (!transient) {
835 vmmr_free_sz += added;
836 } else {
837 vmmr_free_transient_sz += added;
838 }
839 ASSERT3U(added, ==, sz);
840 vmmr_adding_sz -= added;
841
842 return (0);
843 }
844
845 /*
846 * Remove memory from vmm reservoir. Normally this will remove memory from the
847 * reservoir which was available for non-transient allocations. If the removal
848 * is part of a vmmr_free() of a transient allocation, it will act on only that
849 * transient region being freed, not the available memory in the reservoir.
850 *
851 * Expects vmmr_lock to be held when called, and will return with it held, but
852 * may drop it during portions of the removal.
853 */
854 static int
855 vmmr_remove(size_t sz, bool transient)
856 {
857 VERIFY3U(sz & PAGEOFFSET, ==, 0);
858 VERIFY(sz);
859 VERIFY(MUTEX_HELD(&vmmr_lock));
860
861 if ((!transient && sz > vmmr_free_sz) ||
862 (transient && sz > vmmr_free_transient_sz)) {
863 return (ENOSPC);
864 }
865
866 vmmr_remove_raw(sz);
867
868 if (!transient) {
869 vmmr_free_sz -= sz;
870 } else {
871 vmmr_free_transient_sz -= sz;
872 }
873 page_unresv(sz >> PAGESHIFT);
874 return (0);
875 }
876
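/*
 * Grow or shrink the reservoir to `target_sz` bytes, moving at most `chunk_sz`
 * bytes per iteration (0 means a single step) so the operation can be
 * interrupted by a signal.  The resulting reservoir size is written to `resp`
 * whether or not the resize ran to completion.
 */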
877 static int
878 vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
879 {
880 VERIFY(resp != NULL);
881
882 mutex_enter(&vmmr_lock);
883
884 size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;
885
886 /* Be sure to communicate current size in case of an early bail-out */
887 *resp = current_sz;
888
889 if ((target_sz & PAGEOFFSET) != 0 ||
890 (chunk_sz & PAGEOFFSET) != 0) {
891 mutex_exit(&vmmr_lock);
892 return (EINVAL);
893 }
894 /* Reject sentinel value */
895 if (target_sz == VMMR_TARGET_INACTIVE) {
896 mutex_exit(&vmmr_lock);
897 return (EINVAL);
898 }
899
900 /* Already at target size */
901 if (target_sz == current_sz) {
902 mutex_exit(&vmmr_lock);
903 return (0);
904 }
905
906 /* Reject racing resize requests */
907 if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
908 mutex_exit(&vmmr_lock);
909 return (EALREADY);
910 }
911 /* Record the target now to exclude a racing request */
912 vmmr_target_sz = target_sz;
913
914 int err = 0;
915 do {
916 /* Be sensitive to signal interruption */
917 if (issig(JUSTLOOKING) != 0) {
918 mutex_exit(&vmmr_lock);
919 const bool sig_bail = issig(FORREAL) != 0;
920 mutex_enter(&vmmr_lock);
921 if (sig_bail) {
922 err = EINTR;
923 break;
924 }
925 }
926
927 if (current_sz > target_sz) {
928 /* Shrinking reservoir */
929
930 size_t req_sz = current_sz - target_sz;
931 if (chunk_sz != 0) {
932 req_sz = MIN(req_sz, chunk_sz);
933 }
934 err = vmmr_remove(req_sz, false);
935 } else {
936 /* Growing reservoir */
937 ASSERT(current_sz < target_sz);
938
939 size_t req_sz = target_sz - current_sz;
940 if (chunk_sz != 0) {
941 req_sz = MIN(req_sz, chunk_sz);
942 }
943 err = vmmr_add(req_sz, false);
944 }
945
946 current_sz = vmmr_alloc_sz + vmmr_free_sz;
947 } while (err == 0 && current_sz != target_sz);
948
949 /* Clear the target now that we are done (success or not) */
950 vmmr_target_sz = VMMR_TARGET_INACTIVE;
951 mutex_exit(&vmmr_lock);
952 *resp = current_sz;
953 return (err);
954 }
955
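/*
 * Entry point for reservoir ioctls issued against the vmmctl device: querying
 * the current sizes (VMM_RESV_QUERY) or resizing the reservoir
 * (VMM_RESV_SET_TARGET, gated by secpolicy_sys_config()).
 */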
956 int
957 vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
958 {
959 /*
960 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
961 * do not need to duplicate such checks here.
962 */
963
964 switch (cmd) {
965 case VMM_RESV_QUERY: {
966 struct vmm_resv_query res;
967 void *datap = (void *)(uintptr_t)arg;
968
969 /* For now, anyone with access to vmmctl device can query */
970 mutex_enter(&vmmr_lock);
971 res.vrq_free_sz = vmmr_free_sz;
972 res.vrq_alloc_sz = vmmr_alloc_sz;
973 res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
974 res.vrq_limit = vmmr_total_limit;
975 mutex_exit(&vmmr_lock);
976 if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
977 return (EFAULT);
978 }
979 break;
980 }
981 case VMM_RESV_SET_TARGET: {
982 if (secpolicy_sys_config(cr, B_FALSE) != 0) {
983 return (EPERM);
984 }
985
986 struct vmm_resv_target tgt;
987 void *datap = (void *)(uintptr_t)arg;
988
989 if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
990 return (EFAULT);
991 }
992
993 int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
994 &tgt.vrt_result_sz);
995
996 /*
997 * Attempt to communicate the resultant size of the reservoir if
998 * setting it to the target was a success, or if we were
999 * interrupted (by a signal) while doing so.
1000 */
1001 if (err == 0 || err == EINTR) {
1002 if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
1003 err = EFAULT;
1004 }
1005 }
1006
1007 return (err);
1008 }
1009 default:
1010 return (ENOTTY);
1011 }
1012 return (0);
1013 }
1014