17c8c0b82SPatrick Mooney /*
27c8c0b82SPatrick Mooney * This file and its contents are supplied under the terms of the
37c8c0b82SPatrick Mooney * Common Development and Distribution License ("CDDL"), version 1.0.
47c8c0b82SPatrick Mooney * You may only use this file in accordance with the terms of version
57c8c0b82SPatrick Mooney * 1.0 of the CDDL.
67c8c0b82SPatrick Mooney *
77c8c0b82SPatrick Mooney * A full copy of the text of the CDDL should have accompanied this
87c8c0b82SPatrick Mooney * source. A copy of the CDDL is also available via the Internet at
97c8c0b82SPatrick Mooney * http://www.illumos.org/license/CDDL.
107c8c0b82SPatrick Mooney */
117c8c0b82SPatrick Mooney
127c8c0b82SPatrick Mooney /*
136bba8b59SPatrick Mooney * Copyright 2023 Oxide Computer Company
147c8c0b82SPatrick Mooney */
157c8c0b82SPatrick Mooney
167c8c0b82SPatrick Mooney /*
177c8c0b82SPatrick Mooney * VMM Memory Reservoir
187c8c0b82SPatrick Mooney *
197c8c0b82SPatrick Mooney *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs. This memory is unavailable for general use by the rest of the
 * system. Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available. Since the pages stored in the
 * reservoir are pre-zeroed, they can be used immediately when allocated to a
 * guest. When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
307c8c0b82SPatrick Mooney *
317c8c0b82SPatrick Mooney *
327c8c0b82SPatrick Mooney * Transient Allocations
337c8c0b82SPatrick Mooney *
347c8c0b82SPatrick Mooney * While the explicit reservoir model may work well for some applications,
357c8c0b82SPatrick Mooney * others may want a more traditional model, where pages for guest memory
367c8c0b82SPatrick Mooney * objects are allocated on demand, rather than from a pool set aside from the
377c8c0b82SPatrick Mooney * system. In this case, the allocation can be made in "transient" mode, where
387c8c0b82SPatrick Mooney * the memory is allocated normally, even if there is free capacity in the
397c8c0b82SPatrick Mooney * reservoir. When use of the transient allocation is complete (the guest is
407c8c0b82SPatrick Mooney * halted and destroyed), the pages will be freed back to the system, rather
417c8c0b82SPatrick Mooney * than added back to the reservoir.
427c8c0b82SPatrick Mooney *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally. Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations. When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made. When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation. This allows both allocation types to coexist without too much
 * additional machinery.
527c8c0b82SPatrick Mooney *
537c8c0b82SPatrick Mooney *
547c8c0b82SPatrick Mooney * Administration
557c8c0b82SPatrick Mooney *
 * Operators may attempt to alter the amount of memory allocated to the
 * reservoir via an ioctl against the vmmctl device. The total amount of memory
 * in the reservoir (free, or allocated to VMs) is limited by
 * `vmmr_total_limit` (see its definition for how this limit is calculated).
 *
 * The limit is in place to prevent the reservoir from inadvertently growing
 * to a size where the system has inadequate memory to make forward progress.
 * Shrinking the reservoir is only possible when it contains free (not
 * allocated by any guest VMs) memory.
657c8c0b82SPatrick Mooney *
667c8c0b82SPatrick Mooney *
677c8c0b82SPatrick Mooney * Page Tracking
687c8c0b82SPatrick Mooney *
697c8c0b82SPatrick Mooney * The reservoir currently uses vnode association to keep track of pages under
707c8c0b82SPatrick Mooney * its control (either designated to the reservoir and free, or allocated to a
717c8c0b82SPatrick Mooney * guest VM object). This means using the existing VM system primitives for
727c8c0b82SPatrick Mooney * page_t instances being associated with a given (vnode, offset) tuple. It
737c8c0b82SPatrick Mooney * means that spans of pages, either free or allocated, need only to store a
747c8c0b82SPatrick Mooney * length (of the span) and an offset (into the vnode) in order to gain access
757c8c0b82SPatrick Mooney * to all of the underlying pages associated with that span. Associating the
767c8c0b82SPatrick Mooney * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
777c8c0b82SPatrick Mooney * properly tracked as KAS pages, but be excluded from normal dumps (unless the
787c8c0b82SPatrick Mooney * operator has chosen to dump all of RAM).
797c8c0b82SPatrick Mooney */
807c8c0b82SPatrick Mooney
817c8c0b82SPatrick Mooney #include <sys/types.h>
827c8c0b82SPatrick Mooney #include <sys/mutex.h>
837c8c0b82SPatrick Mooney #include <sys/avl.h>
847c8c0b82SPatrick Mooney #include <sys/list.h>
857c8c0b82SPatrick Mooney #include <sys/machparam.h>
867c8c0b82SPatrick Mooney #include <sys/kmem.h>
877c8c0b82SPatrick Mooney #include <sys/stddef.h>
887c8c0b82SPatrick Mooney #include <sys/null.h>
897c8c0b82SPatrick Mooney #include <sys/errno.h>
907c8c0b82SPatrick Mooney #include <sys/systm.h>
917c8c0b82SPatrick Mooney #include <sys/sunddi.h>
927c8c0b82SPatrick Mooney #include <sys/policy.h>
937c8c0b82SPatrick Mooney #include <vm/seg_kmem.h>
947c8c0b82SPatrick Mooney #include <vm/hat_i86.h>
956bba8b59SPatrick Mooney #include <sys/kstat.h>
967c8c0b82SPatrick Mooney
977c8c0b82SPatrick Mooney #include <sys/vmm_reservoir.h>
987c8c0b82SPatrick Mooney #include <sys/vmm_dev.h>
996bba8b59SPatrick Mooney #include <sys/vmm_impl.h>
1006bba8b59SPatrick Mooney
1016bba8b59SPatrick Mooney #define VMMR_TARGET_INACTIVE SIZE_MAX
1027c8c0b82SPatrick Mooney
1037c8c0b82SPatrick Mooney static kmutex_t vmmr_lock;
1047c8c0b82SPatrick Mooney
1057c8c0b82SPatrick Mooney static size_t vmmr_free_sz;
1067c8c0b82SPatrick Mooney static size_t vmmr_free_transient_sz;
1077c8c0b82SPatrick Mooney static size_t vmmr_adding_sz;
1087c8c0b82SPatrick Mooney static size_t vmmr_alloc_sz;
1097c8c0b82SPatrick Mooney static size_t vmmr_alloc_transient_sz;
1107c8c0b82SPatrick Mooney static size_t vmmr_empty_sz;
1117c8c0b82SPatrick Mooney
1126bba8b59SPatrick Mooney /*
1136bba8b59SPatrick Mooney * Target size of the reservoir during active vmmr_set_target() operation.
1146bba8b59SPatrick Mooney * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active.
1156bba8b59SPatrick Mooney */
1166bba8b59SPatrick Mooney static size_t vmmr_target_sz;
1176bba8b59SPatrick Mooney
1187c8c0b82SPatrick Mooney static uintptr_t vmmr_empty_last;
1197c8c0b82SPatrick Mooney /* Upper limit for the size (free + allocated) of the reservoir */
1207c8c0b82SPatrick Mooney static size_t vmmr_total_limit;
1217c8c0b82SPatrick Mooney
1227c8c0b82SPatrick Mooney /* VA range allocated from the VMM arena for the mappings */
1237c8c0b82SPatrick Mooney static uintptr_t vmmr_va;
1247c8c0b82SPatrick Mooney static uintptr_t vmmr_va_sz;
1257c8c0b82SPatrick Mooney
1266bba8b59SPatrick Mooney static kstat_t *vmmr_kstat;
1276bba8b59SPatrick Mooney
1287c8c0b82SPatrick Mooney /* Pair of AVL trees to store set of spans ordered by addr and size */
1297c8c0b82SPatrick Mooney typedef struct vmmr_treepair {
1307c8c0b82SPatrick Mooney avl_tree_t by_addr;
1317c8c0b82SPatrick Mooney avl_tree_t by_size;
1327c8c0b82SPatrick Mooney } vmmr_treepair_t;
1337c8c0b82SPatrick Mooney
1347c8c0b82SPatrick Mooney /* Spans of free memory in the reservoir */
1357c8c0b82SPatrick Mooney static vmmr_treepair_t vmmr_free_tp;
1367c8c0b82SPatrick Mooney
1377c8c0b82SPatrick Mooney /* Spans of empty (not backed by memory) space in the reservoir */
1387c8c0b82SPatrick Mooney static vmmr_treepair_t vmmr_empty_tp;
1397c8c0b82SPatrick Mooney
1407c8c0b82SPatrick Mooney /* Regions of memory allocated from the reservoir */
1417c8c0b82SPatrick Mooney static list_t vmmr_alloc_regions;
1427c8c0b82SPatrick Mooney
1437c8c0b82SPatrick Mooney struct vmmr_span {
1447c8c0b82SPatrick Mooney uintptr_t vs_addr;
1457c8c0b82SPatrick Mooney size_t vs_size;
1467c8c0b82SPatrick Mooney avl_node_t vs_by_addr;
1477c8c0b82SPatrick Mooney avl_node_t vs_by_size;
1487c8c0b82SPatrick Mooney uintptr_t vs_region_addr;
1497c8c0b82SPatrick Mooney };
1507c8c0b82SPatrick Mooney typedef struct vmmr_span vmmr_span_t;
1517c8c0b82SPatrick Mooney
1527c8c0b82SPatrick Mooney struct vmmr_region {
1537c8c0b82SPatrick Mooney size_t vr_size;
1547c8c0b82SPatrick Mooney avl_tree_t vr_spans;
1557c8c0b82SPatrick Mooney list_node_t vr_node;
1567c8c0b82SPatrick Mooney bool vr_transient;
1577c8c0b82SPatrick Mooney };
1587c8c0b82SPatrick Mooney
1596bba8b59SPatrick Mooney typedef struct vmmr_kstats {
1606bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_free;
1616bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_alloc;
1626bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_transient;
1636bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_limit;
1646bba8b59SPatrick Mooney } vmmr_kstats_t;
1656bba8b59SPatrick Mooney
1666bba8b59SPatrick Mooney
1676bba8b59SPatrick Mooney static int vmmr_add(size_t, bool);
1686bba8b59SPatrick Mooney static int vmmr_remove(size_t, bool);
1696bba8b59SPatrick Mooney
1707c8c0b82SPatrick Mooney static int
vmmr_cmp_addr(const void * a,const void * b)1717c8c0b82SPatrick Mooney vmmr_cmp_addr(const void *a, const void *b)
1727c8c0b82SPatrick Mooney {
1737c8c0b82SPatrick Mooney const vmmr_span_t *sa = a;
1747c8c0b82SPatrick Mooney const vmmr_span_t *sb = b;
1757c8c0b82SPatrick Mooney
1767c8c0b82SPatrick Mooney if (sa->vs_addr == sb->vs_addr) {
1777c8c0b82SPatrick Mooney return (0);
1787c8c0b82SPatrick Mooney } else if (sa->vs_addr < sb->vs_addr) {
1797c8c0b82SPatrick Mooney return (-1);
1807c8c0b82SPatrick Mooney } else {
1817c8c0b82SPatrick Mooney return (1);
1827c8c0b82SPatrick Mooney }
1837c8c0b82SPatrick Mooney }
1847c8c0b82SPatrick Mooney
1857c8c0b82SPatrick Mooney static int
vmmr_cmp_size(const void * a,const void * b)1867c8c0b82SPatrick Mooney vmmr_cmp_size(const void *a, const void *b)
1877c8c0b82SPatrick Mooney {
1887c8c0b82SPatrick Mooney const vmmr_span_t *sa = a;
1897c8c0b82SPatrick Mooney const vmmr_span_t *sb = b;
1907c8c0b82SPatrick Mooney
1917c8c0b82SPatrick Mooney if (sa->vs_size == sb->vs_size) {
1927c8c0b82SPatrick Mooney /*
1937c8c0b82SPatrick Mooney * Since discontiguous spans could have the same size in a
1947c8c0b82SPatrick Mooney * by-size tree, differentiate them (as required by AVL) by
1957c8c0b82SPatrick Mooney * address so they can safely coexist while remaining sorted.
1967c8c0b82SPatrick Mooney */
1977c8c0b82SPatrick Mooney return (vmmr_cmp_addr(a, b));
1987c8c0b82SPatrick Mooney } else if (sa->vs_size < sb->vs_size) {
1997c8c0b82SPatrick Mooney return (-1);
2007c8c0b82SPatrick Mooney } else {
2017c8c0b82SPatrick Mooney return (1);
2027c8c0b82SPatrick Mooney }
2037c8c0b82SPatrick Mooney }
2047c8c0b82SPatrick Mooney
2057c8c0b82SPatrick Mooney static int
vmmr_cmp_region_addr(const void * a,const void * b)2067c8c0b82SPatrick Mooney vmmr_cmp_region_addr(const void *a, const void *b)
2077c8c0b82SPatrick Mooney {
2087c8c0b82SPatrick Mooney const vmmr_span_t *sa = a;
2097c8c0b82SPatrick Mooney const vmmr_span_t *sb = b;
2107c8c0b82SPatrick Mooney
2117c8c0b82SPatrick Mooney if (sa->vs_region_addr == sb->vs_region_addr) {
2127c8c0b82SPatrick Mooney return (0);
2137c8c0b82SPatrick Mooney } else if (sa->vs_region_addr < sb->vs_region_addr) {
2147c8c0b82SPatrick Mooney return (-1);
2157c8c0b82SPatrick Mooney } else {
2167c8c0b82SPatrick Mooney return (1);
2177c8c0b82SPatrick Mooney }
2187c8c0b82SPatrick Mooney }
2197c8c0b82SPatrick Mooney
2207c8c0b82SPatrick Mooney static void
vmmr_tp_init(vmmr_treepair_t * tree)2217c8c0b82SPatrick Mooney vmmr_tp_init(vmmr_treepair_t *tree)
2227c8c0b82SPatrick Mooney {
2237c8c0b82SPatrick Mooney avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
2247c8c0b82SPatrick Mooney offsetof(vmmr_span_t, vs_by_addr));
2257c8c0b82SPatrick Mooney avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
2267c8c0b82SPatrick Mooney offsetof(vmmr_span_t, vs_by_size));
2277c8c0b82SPatrick Mooney }
2287c8c0b82SPatrick Mooney
2297c8c0b82SPatrick Mooney static void
vmmr_tp_destroy(vmmr_treepair_t * tree)2307c8c0b82SPatrick Mooney vmmr_tp_destroy(vmmr_treepair_t *tree)
2317c8c0b82SPatrick Mooney {
2327c8c0b82SPatrick Mooney void *vcp = NULL;
2337c8c0b82SPatrick Mooney vmmr_span_t *span;
2347c8c0b82SPatrick Mooney
2357c8c0b82SPatrick Mooney while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
2367c8c0b82SPatrick Mooney /* Freeing spans will be done when tearing down by-size tree */
2377c8c0b82SPatrick Mooney }
2387c8c0b82SPatrick Mooney while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
2397c8c0b82SPatrick Mooney kmem_free(span, sizeof (*span));
2407c8c0b82SPatrick Mooney }
2417c8c0b82SPatrick Mooney avl_destroy(&tree->by_addr);
2427c8c0b82SPatrick Mooney avl_destroy(&tree->by_size);
2437c8c0b82SPatrick Mooney }
2447c8c0b82SPatrick Mooney
2457c8c0b82SPatrick Mooney /*
2467c8c0b82SPatrick Mooney * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
2477c8c0b82SPatrick Mooney * span(s). Such concatenation could result in the `to_add` span being freed,
2487c8c0b82SPatrick Mooney * so the caller cannot use it after this returns.
2497c8c0b82SPatrick Mooney */
2507c8c0b82SPatrick Mooney static void
vmmr_tp_insert_concat(vmmr_span_t * to_add,vmmr_treepair_t * tree)2517c8c0b82SPatrick Mooney vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
2527c8c0b82SPatrick Mooney {
2537c8c0b82SPatrick Mooney avl_tree_t *by_addr = &tree->by_addr;
2547c8c0b82SPatrick Mooney avl_tree_t *by_size = &tree->by_size;
2557c8c0b82SPatrick Mooney vmmr_span_t *node;
2567c8c0b82SPatrick Mooney avl_index_t where;
2577c8c0b82SPatrick Mooney
2587c8c0b82SPatrick Mooney /* This addr should not already exist in the treepair */
2597c8c0b82SPatrick Mooney node = avl_find(by_addr, to_add, &where);
2607c8c0b82SPatrick Mooney ASSERT3P(node, ==, NULL);
2617c8c0b82SPatrick Mooney
2627c8c0b82SPatrick Mooney node = avl_nearest(by_addr, where, AVL_BEFORE);
2637c8c0b82SPatrick Mooney if (node != NULL &&
2647c8c0b82SPatrick Mooney (node->vs_addr + node->vs_size) == to_add->vs_addr) {
2657c8c0b82SPatrick Mooney /* concat with preceeding item */
2667c8c0b82SPatrick Mooney avl_remove(by_addr, node);
2677c8c0b82SPatrick Mooney avl_remove(by_size, node);
2687c8c0b82SPatrick Mooney node->vs_size += to_add->vs_size;
2697c8c0b82SPatrick Mooney kmem_free(to_add, sizeof (*to_add));
2707c8c0b82SPatrick Mooney
2717c8c0b82SPatrick Mooney /*
2727c8c0b82SPatrick Mooney * Since this now-concatenated span could be adjacent one
2737c8c0b82SPatrick Mooney * trailing it, fall through to perform that check.
2747c8c0b82SPatrick Mooney */
2757c8c0b82SPatrick Mooney to_add = node;
2767c8c0b82SPatrick Mooney }
2777c8c0b82SPatrick Mooney
2787c8c0b82SPatrick Mooney node = avl_nearest(by_addr, where, AVL_AFTER);
2797c8c0b82SPatrick Mooney if (node != NULL &&
2807c8c0b82SPatrick Mooney (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
2817c8c0b82SPatrick Mooney /* concat with trailing item */
2827c8c0b82SPatrick Mooney avl_remove(by_addr, node);
2837c8c0b82SPatrick Mooney avl_remove(by_size, node);
2847c8c0b82SPatrick Mooney node->vs_addr = to_add->vs_addr;
2857c8c0b82SPatrick Mooney node->vs_size += to_add->vs_size;
2867c8c0b82SPatrick Mooney avl_add(by_addr, node);
2877c8c0b82SPatrick Mooney avl_add(by_size, node);
2887c8c0b82SPatrick Mooney
2897c8c0b82SPatrick Mooney kmem_free(to_add, sizeof (*to_add));
2907c8c0b82SPatrick Mooney return;
2917c8c0b82SPatrick Mooney }
2927c8c0b82SPatrick Mooney
2937c8c0b82SPatrick Mooney /* simply insert */
2947c8c0b82SPatrick Mooney avl_add(by_addr, to_add);
2957c8c0b82SPatrick Mooney avl_add(by_size, to_add);
2967c8c0b82SPatrick Mooney }
2977c8c0b82SPatrick Mooney
2987c8c0b82SPatrick Mooney /*
2997c8c0b82SPatrick Mooney * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
3007c8c0b82SPatrick Mooney * the exact target size is not present, but a larger one is. May return a span
3017c8c0b82SPatrick Mooney * with a size smaller than the target if splitting is not an option.
3027c8c0b82SPatrick Mooney */
3037c8c0b82SPatrick Mooney static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz,vmmr_treepair_t * tree)3047c8c0b82SPatrick Mooney vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
3057c8c0b82SPatrick Mooney {
3067c8c0b82SPatrick Mooney avl_tree_t *by_addr = &tree->by_addr;
3077c8c0b82SPatrick Mooney avl_tree_t *by_size = &tree->by_size;
3087c8c0b82SPatrick Mooney vmmr_span_t *span;
3097c8c0b82SPatrick Mooney avl_index_t where;
3107c8c0b82SPatrick Mooney
3117c8c0b82SPatrick Mooney ASSERT3U(target_sz, !=, 0);
3127c8c0b82SPatrick Mooney ASSERT(!avl_is_empty(by_addr));
3137c8c0b82SPatrick Mooney ASSERT(!avl_is_empty(by_size));
3147c8c0b82SPatrick Mooney
3157c8c0b82SPatrick Mooney vmmr_span_t search = { .vs_size = target_sz };
3167c8c0b82SPatrick Mooney span = avl_find(by_size, &search, &where);
3177c8c0b82SPatrick Mooney if (span == NULL) {
3187c8c0b82SPatrick Mooney /* Try for a larger span (instead of exact match) */
3197c8c0b82SPatrick Mooney span = avl_nearest(by_size, where, AVL_AFTER);
3207c8c0b82SPatrick Mooney if (span == NULL) {
3217c8c0b82SPatrick Mooney /*
3227c8c0b82SPatrick Mooney * Caller will need to collect several smaller spans in
3237c8c0b82SPatrick Mooney * order to fulfill their request.
3247c8c0b82SPatrick Mooney */
3257c8c0b82SPatrick Mooney span = avl_nearest(by_size, where, AVL_BEFORE);
3267c8c0b82SPatrick Mooney ASSERT3P(span, !=, NULL);
3277c8c0b82SPatrick Mooney }
3287c8c0b82SPatrick Mooney }
3297c8c0b82SPatrick Mooney
3307c8c0b82SPatrick Mooney if (span->vs_size <= target_sz) {
3317c8c0b82SPatrick Mooney avl_remove(by_size, span);
3327c8c0b82SPatrick Mooney avl_remove(by_addr, span);
3337c8c0b82SPatrick Mooney
3347c8c0b82SPatrick Mooney return (span);
3357c8c0b82SPatrick Mooney } else {
3367c8c0b82SPatrick Mooney /* Split off adequate chunk from larger span */
3377c8c0b82SPatrick Mooney uintptr_t start = span->vs_addr + span->vs_size - target_sz;
3387c8c0b82SPatrick Mooney
3397c8c0b82SPatrick Mooney avl_remove(by_size, span);
3407c8c0b82SPatrick Mooney span->vs_size -= target_sz;
3417c8c0b82SPatrick Mooney avl_add(by_size, span);
3427c8c0b82SPatrick Mooney
3437c8c0b82SPatrick Mooney vmmr_span_t *split_span =
3447c8c0b82SPatrick Mooney kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
3457c8c0b82SPatrick Mooney split_span->vs_addr = start;
3467c8c0b82SPatrick Mooney split_span->vs_size = target_sz;
3477c8c0b82SPatrick Mooney
3487c8c0b82SPatrick Mooney return (split_span);
3497c8c0b82SPatrick Mooney }
3507c8c0b82SPatrick Mooney }
3517c8c0b82SPatrick Mooney
3526bba8b59SPatrick Mooney static int
vmmr_kstat_update(struct kstat * ksp,int rw)3536bba8b59SPatrick Mooney vmmr_kstat_update(struct kstat *ksp, int rw)
3546bba8b59SPatrick Mooney {
3556bba8b59SPatrick Mooney vmmr_kstats_t *vkp = ksp->ks_data;
3566bba8b59SPatrick Mooney
3576bba8b59SPatrick Mooney mutex_enter(&vmmr_lock);
3586bba8b59SPatrick Mooney vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
3596bba8b59SPatrick Mooney vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
3606bba8b59SPatrick Mooney /*
3616bba8b59SPatrick Mooney * In addition to the memory which is actually actually allocated to
3626bba8b59SPatrick Mooney * transient consumers, memory which is considered free-for-transient is
3636bba8b59SPatrick Mooney * also included in the sizing.
3646bba8b59SPatrick Mooney */
3656bba8b59SPatrick Mooney vkp->vmrks_bytes_transient.value.ui64 =
3666bba8b59SPatrick Mooney vmmr_alloc_transient_sz + vmmr_free_transient_sz;
3676bba8b59SPatrick Mooney vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
3686bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
3696bba8b59SPatrick Mooney
3706bba8b59SPatrick Mooney return (0);
3716bba8b59SPatrick Mooney }
3726bba8b59SPatrick Mooney
3736bba8b59SPatrick Mooney int
vmmr_init()3747c8c0b82SPatrick Mooney vmmr_init()
3757c8c0b82SPatrick Mooney {
3767c8c0b82SPatrick Mooney mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
3777c8c0b82SPatrick Mooney
3787c8c0b82SPatrick Mooney /*
3797c8c0b82SPatrick Mooney * `vmm_total_limit` represents the absolute maximum size of the VMM
3807c8c0b82SPatrick Mooney * memory reservoir. It is meant to provide some measure of protection
3817c8c0b82SPatrick Mooney * against an operator pushing the system into unrecoverable memory
3827c8c0b82SPatrick Mooney * starvation through explicit or transient additions to the reservoir.
3837c8c0b82SPatrick Mooney *
3847c8c0b82SPatrick Mooney * There will be many situations where this limit would be inadequate to
3857c8c0b82SPatrick Mooney * prevent kernel memory starvation in the face of certain operator
3867c8c0b82SPatrick Mooney * actions. It is a balance to be struck between safety and allowing
3877c8c0b82SPatrick Mooney * large systems to reach high utilization.
3887c8c0b82SPatrick Mooney *
3897c8c0b82SPatrick Mooney * The value is based off of pages_pp_maximum: "Number of currently
3907c8c0b82SPatrick Mooney * available pages that cannot be 'locked'". It is sized as all of
3917c8c0b82SPatrick Mooney * `physmem` less 120% of `pages_pp_maximum`.
3927c8c0b82SPatrick Mooney */
3937c8c0b82SPatrick Mooney vmmr_total_limit =
3947c8c0b82SPatrick Mooney (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
3957c8c0b82SPatrick Mooney
3967c8c0b82SPatrick Mooney vmmr_empty_last = 0;
3977c8c0b82SPatrick Mooney vmmr_free_sz = 0;
3987c8c0b82SPatrick Mooney vmmr_alloc_sz = 0;
3997c8c0b82SPatrick Mooney vmmr_empty_sz = 0;
4007c8c0b82SPatrick Mooney vmmr_adding_sz = 0;
4017c8c0b82SPatrick Mooney vmmr_free_transient_sz = 0;
4027c8c0b82SPatrick Mooney vmmr_alloc_transient_sz = 0;
4036bba8b59SPatrick Mooney vmmr_target_sz = VMMR_TARGET_INACTIVE;
4046bba8b59SPatrick Mooney
4056bba8b59SPatrick Mooney /*
4066bba8b59SPatrick Mooney * Attempt kstat allocation early, since it is the only part of
4076bba8b59SPatrick Mooney * reservoir initialization which is fallible.
4086bba8b59SPatrick Mooney */
4096bba8b59SPatrick Mooney kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
4106bba8b59SPatrick Mooney VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
4116bba8b59SPatrick Mooney sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
4126bba8b59SPatrick Mooney if (ksp == NULL) {
4136bba8b59SPatrick Mooney mutex_destroy(&vmmr_lock);
4146bba8b59SPatrick Mooney return (ENOMEM);
4156bba8b59SPatrick Mooney }
4166bba8b59SPatrick Mooney
4176bba8b59SPatrick Mooney vmmr_kstats_t *vkp = ksp->ks_data;
4186bba8b59SPatrick Mooney
4196bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
4206bba8b59SPatrick Mooney KSTAT_DATA_UINT64);
4216bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
4226bba8b59SPatrick Mooney KSTAT_DATA_UINT64);
4236bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
4246bba8b59SPatrick Mooney KSTAT_DATA_UINT64);
4256bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
4266bba8b59SPatrick Mooney KSTAT_DATA_UINT64);
4276bba8b59SPatrick Mooney ksp->ks_private = NULL;
4286bba8b59SPatrick Mooney ksp->ks_update = vmmr_kstat_update;
4296bba8b59SPatrick Mooney vmmr_kstat = ksp;
4307c8c0b82SPatrick Mooney
4317c8c0b82SPatrick Mooney vmmr_tp_init(&vmmr_free_tp);
4327c8c0b82SPatrick Mooney vmmr_tp_init(&vmmr_empty_tp);
4337c8c0b82SPatrick Mooney
4347c8c0b82SPatrick Mooney list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
4357c8c0b82SPatrick Mooney offsetof(vmmr_region_t, vr_node));
4367c8c0b82SPatrick Mooney
4377c8c0b82SPatrick Mooney /* Grab a chunk of VA for the reservoir */
4387c8c0b82SPatrick Mooney vmmr_va_sz = physmem * PAGESIZE;
4397c8c0b82SPatrick Mooney vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
4406bba8b59SPatrick Mooney
4416bba8b59SPatrick Mooney kstat_install(vmmr_kstat);
4426bba8b59SPatrick Mooney
4436bba8b59SPatrick Mooney return (0);
4447c8c0b82SPatrick Mooney }
4457c8c0b82SPatrick Mooney
4467c8c0b82SPatrick Mooney void
vmmr_fini()4477c8c0b82SPatrick Mooney vmmr_fini()
4487c8c0b82SPatrick Mooney {
4497c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
4507c8c0b82SPatrick Mooney VERIFY3U(vmmr_alloc_sz, ==, 0);
4517c8c0b82SPatrick Mooney VERIFY3U(vmmr_free_sz, ==, 0);
4527c8c0b82SPatrick Mooney VERIFY3U(vmmr_adding_sz, ==, 0);
4537c8c0b82SPatrick Mooney VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
4547c8c0b82SPatrick Mooney VERIFY3U(vmmr_free_transient_sz, ==, 0);
4557c8c0b82SPatrick Mooney VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
4567c8c0b82SPatrick Mooney VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
4577c8c0b82SPatrick Mooney VERIFY(list_is_empty(&vmmr_alloc_regions));
4587c8c0b82SPatrick Mooney
4596bba8b59SPatrick Mooney kstat_delete(vmmr_kstat);
4606bba8b59SPatrick Mooney vmmr_kstat = NULL;
4616bba8b59SPatrick Mooney
4627c8c0b82SPatrick Mooney vmmr_tp_destroy(&vmmr_free_tp);
4637c8c0b82SPatrick Mooney vmmr_tp_destroy(&vmmr_empty_tp);
4647c8c0b82SPatrick Mooney list_destroy(&vmmr_alloc_regions);
4657c8c0b82SPatrick Mooney
4667c8c0b82SPatrick Mooney /* Release reservoir VA chunk */
4677c8c0b82SPatrick Mooney vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
4687c8c0b82SPatrick Mooney vmmr_va = 0;
4697c8c0b82SPatrick Mooney vmmr_va_sz = 0;
4707c8c0b82SPatrick Mooney vmmr_total_limit = 0;
4717c8c0b82SPatrick Mooney vmmr_empty_last = 0;
4727c8c0b82SPatrick Mooney
4737c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
4747c8c0b82SPatrick Mooney mutex_destroy(&vmmr_lock);
4757c8c0b82SPatrick Mooney }
4767c8c0b82SPatrick Mooney
4777c8c0b82SPatrick Mooney bool
vmmr_is_empty()4787c8c0b82SPatrick Mooney vmmr_is_empty()
4797c8c0b82SPatrick Mooney {
4807c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
4817c8c0b82SPatrick Mooney bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
4827c8c0b82SPatrick Mooney vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
4837c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
4847c8c0b82SPatrick Mooney return (res);
4857c8c0b82SPatrick Mooney }
4867c8c0b82SPatrick Mooney
4877c8c0b82SPatrick Mooney int
vmmr_alloc(size_t sz,bool transient,vmmr_region_t ** resp)4887c8c0b82SPatrick Mooney vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
4897c8c0b82SPatrick Mooney {
4907c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0);
4917c8c0b82SPatrick Mooney
4927c8c0b82SPatrick Mooney if (!transient) {
4937c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
4947c8c0b82SPatrick Mooney if (sz > vmmr_free_sz) {
4957c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
4967c8c0b82SPatrick Mooney return (ENOSPC);
4977c8c0b82SPatrick Mooney }
4987c8c0b82SPatrick Mooney } else {
4997c8c0b82SPatrick Mooney int err;
5007c8c0b82SPatrick Mooney
5016bba8b59SPatrick Mooney mutex_enter(&vmmr_lock);
5027c8c0b82SPatrick Mooney err = vmmr_add(sz, true);
5037c8c0b82SPatrick Mooney if (err != 0) {
5046bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
5057c8c0b82SPatrick Mooney return (err);
5067c8c0b82SPatrick Mooney }
5077c8c0b82SPatrick Mooney VERIFY3U(vmmr_free_transient_sz, >=, sz);
5087c8c0b82SPatrick Mooney }
5097c8c0b82SPatrick Mooney
5107c8c0b82SPatrick Mooney vmmr_region_t *region;
5117c8c0b82SPatrick Mooney region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
5127c8c0b82SPatrick Mooney avl_create(®ion->vr_spans, vmmr_cmp_region_addr,
5137c8c0b82SPatrick Mooney sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
5147c8c0b82SPatrick Mooney region->vr_size = sz;
5157c8c0b82SPatrick Mooney
5167c8c0b82SPatrick Mooney size_t remain = sz;
5177c8c0b82SPatrick Mooney uintptr_t map_at = 0;
5187c8c0b82SPatrick Mooney while (remain > 0) {
5197c8c0b82SPatrick Mooney vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
5207c8c0b82SPatrick Mooney
5217c8c0b82SPatrick Mooney /*
5227c8c0b82SPatrick Mooney * We have already ensured that adequate free memory is present
5237c8c0b82SPatrick Mooney * in the reservoir for this allocation.
5247c8c0b82SPatrick Mooney */
5257c8c0b82SPatrick Mooney VERIFY3P(span, !=, NULL);
5267c8c0b82SPatrick Mooney ASSERT3U(span->vs_size, <=, remain);
5277c8c0b82SPatrick Mooney
5287c8c0b82SPatrick Mooney span->vs_region_addr = map_at;
5297c8c0b82SPatrick Mooney avl_add(®ion->vr_spans, span);
5307c8c0b82SPatrick Mooney map_at += span->vs_size;
5317c8c0b82SPatrick Mooney remain -= span->vs_size;
5327c8c0b82SPatrick Mooney }
5337c8c0b82SPatrick Mooney
5347c8c0b82SPatrick Mooney if (!transient) {
5357c8c0b82SPatrick Mooney vmmr_free_sz -= sz;
5367c8c0b82SPatrick Mooney vmmr_alloc_sz += sz;
5377c8c0b82SPatrick Mooney } else {
5387c8c0b82SPatrick Mooney vmmr_free_transient_sz -= sz;
5397c8c0b82SPatrick Mooney vmmr_alloc_transient_sz += sz;
5407c8c0b82SPatrick Mooney region->vr_transient = true;
5417c8c0b82SPatrick Mooney }
5427c8c0b82SPatrick Mooney list_insert_tail(&vmmr_alloc_regions, region);
5437c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
5447c8c0b82SPatrick Mooney
5457c8c0b82SPatrick Mooney *resp = region;
5467c8c0b82SPatrick Mooney return (0);
5477c8c0b82SPatrick Mooney }
5487c8c0b82SPatrick Mooney
5497c8c0b82SPatrick Mooney void *
vmmr_region_mem_at(vmmr_region_t * region,uintptr_t off)5507c8c0b82SPatrick Mooney vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
5517c8c0b82SPatrick Mooney {
5527c8c0b82SPatrick Mooney /* just use KPM region for now */
5537c8c0b82SPatrick Mooney return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
5547c8c0b82SPatrick Mooney }
5557c8c0b82SPatrick Mooney
5567c8c0b82SPatrick Mooney pfn_t
vmmr_region_pfn_at(vmmr_region_t * region,uintptr_t off)5577c8c0b82SPatrick Mooney vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
5587c8c0b82SPatrick Mooney {
5597c8c0b82SPatrick Mooney VERIFY3U(off & PAGEOFFSET, ==, 0);
5607c8c0b82SPatrick Mooney VERIFY3U(off, <, region->vr_size);
5617c8c0b82SPatrick Mooney
5627c8c0b82SPatrick Mooney vmmr_span_t search = {
5637c8c0b82SPatrick Mooney .vs_region_addr = off
5647c8c0b82SPatrick Mooney };
5657c8c0b82SPatrick Mooney avl_index_t where;
5667c8c0b82SPatrick Mooney vmmr_span_t *span = avl_find(®ion->vr_spans, &search, &where);
5677c8c0b82SPatrick Mooney
5687c8c0b82SPatrick Mooney if (span == NULL) {
5697c8c0b82SPatrick Mooney span = avl_nearest(®ion->vr_spans, where, AVL_BEFORE);
5707c8c0b82SPatrick Mooney ASSERT3P(span, !=, NULL);
5717c8c0b82SPatrick Mooney }
5727c8c0b82SPatrick Mooney uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
5737c8c0b82SPatrick Mooney page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
5747c8c0b82SPatrick Mooney VERIFY(pp != NULL);
5757c8c0b82SPatrick Mooney return (pp->p_pagenum);
5767c8c0b82SPatrick Mooney }
5777c8c0b82SPatrick Mooney
5787c8c0b82SPatrick Mooney void
vmmr_free(vmmr_region_t * region)5797c8c0b82SPatrick Mooney vmmr_free(vmmr_region_t *region)
5807c8c0b82SPatrick Mooney {
5817c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
5827c8c0b82SPatrick Mooney if (!region->vr_transient) {
5837c8c0b82SPatrick Mooney VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
5847c8c0b82SPatrick Mooney } else {
5857c8c0b82SPatrick Mooney VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
5867c8c0b82SPatrick Mooney }
5877c8c0b82SPatrick Mooney list_remove(&vmmr_alloc_regions, region);
5887c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
5897c8c0b82SPatrick Mooney
5906bba8b59SPatrick Mooney /* Zero the contents (while not monopolizing vmmr_lock) */
5917c8c0b82SPatrick Mooney for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
5927c8c0b82SPatrick Mooney bzero(vmmr_region_mem_at(region, off), PAGESIZE);
5937c8c0b82SPatrick Mooney }
5947c8c0b82SPatrick Mooney
5957c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
5967c8c0b82SPatrick Mooney
5977c8c0b82SPatrick Mooney /* Put the contained span(s) back in the free pool */
5987c8c0b82SPatrick Mooney void *cookie = NULL;
5997c8c0b82SPatrick Mooney vmmr_span_t *span;
6007c8c0b82SPatrick Mooney while ((span = avl_destroy_nodes(®ion->vr_spans, &cookie)) != NULL) {
6017c8c0b82SPatrick Mooney span->vs_region_addr = 0;
6027c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_free_tp);
6037c8c0b82SPatrick Mooney }
6047c8c0b82SPatrick Mooney avl_destroy(®ion->vr_spans);
6057c8c0b82SPatrick Mooney if (!region->vr_transient) {
6067c8c0b82SPatrick Mooney vmmr_free_sz += region->vr_size;
6077c8c0b82SPatrick Mooney vmmr_alloc_sz -= region->vr_size;
6087c8c0b82SPatrick Mooney } else {
6097c8c0b82SPatrick Mooney vmmr_free_transient_sz += region->vr_size;
6107c8c0b82SPatrick Mooney vmmr_alloc_transient_sz -= region->vr_size;
6117c8c0b82SPatrick Mooney }
6127c8c0b82SPatrick Mooney
6137c8c0b82SPatrick Mooney if (region->vr_transient) {
614e0994bd2SPatrick Mooney /*
615e0994bd2SPatrick Mooney * Since the transient capacity was previously allocated for
616e0994bd2SPatrick Mooney * this region, its removal should not fail.
617e0994bd2SPatrick Mooney */
618e0994bd2SPatrick Mooney VERIFY0(vmmr_remove(region->vr_size, true));
6197c8c0b82SPatrick Mooney }
6207c8c0b82SPatrick Mooney kmem_free(region, sizeof (*region));
6216bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
6227c8c0b82SPatrick Mooney }
6237c8c0b82SPatrick Mooney
6247c8c0b82SPatrick Mooney static void
vmmr_destroy_pages(vmmr_span_t * span)6257c8c0b82SPatrick Mooney vmmr_destroy_pages(vmmr_span_t *span)
6267c8c0b82SPatrick Mooney {
6277c8c0b82SPatrick Mooney const uintptr_t end = span->vs_addr + span->vs_size;
6287c8c0b82SPatrick Mooney struct vnode *vp = &kvps[KV_VVP];
6297c8c0b82SPatrick Mooney for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
6307c8c0b82SPatrick Mooney page_t *pp;
6317c8c0b82SPatrick Mooney
6327c8c0b82SPatrick Mooney /* Page-free logic cribbed from segkmem_xfree(): */
6337c8c0b82SPatrick Mooney pp = page_find(vp, (u_offset_t)pos);
6347c8c0b82SPatrick Mooney VERIFY(pp != NULL);
6357c8c0b82SPatrick Mooney if (!page_tryupgrade(pp)) {
6367c8c0b82SPatrick Mooney /*
6377c8c0b82SPatrick Mooney * Some other thread has a sharelock. Wait for
6387c8c0b82SPatrick Mooney * it to drop the lock so we can free this page.
6397c8c0b82SPatrick Mooney */
6407c8c0b82SPatrick Mooney page_unlock(pp);
6417c8c0b82SPatrick Mooney pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
6427c8c0b82SPatrick Mooney }
6437c8c0b82SPatrick Mooney
6447c8c0b82SPatrick Mooney /*
6457c8c0b82SPatrick Mooney * Clear p_lckcnt so page_destroy() doesn't update availrmem.
6467c8c0b82SPatrick Mooney * That will be taken care of later via page_unresv().
6477c8c0b82SPatrick Mooney */
6487c8c0b82SPatrick Mooney pp->p_lckcnt = 0;
6497c8c0b82SPatrick Mooney page_destroy(pp, 0);
6507c8c0b82SPatrick Mooney }
6517c8c0b82SPatrick Mooney }
6527c8c0b82SPatrick Mooney
6537c8c0b82SPatrick Mooney static int
vmmr_alloc_pages(const vmmr_span_t * span)6547c8c0b82SPatrick Mooney vmmr_alloc_pages(const vmmr_span_t *span)
6557c8c0b82SPatrick Mooney {
6567c8c0b82SPatrick Mooney struct seg kseg = {
6577c8c0b82SPatrick Mooney .s_as = &kas
6587c8c0b82SPatrick Mooney };
6597c8c0b82SPatrick Mooney struct vnode *vp = &kvps[KV_VVP];
6607c8c0b82SPatrick Mooney
6617c8c0b82SPatrick Mooney const uintptr_t end = span->vs_addr + span->vs_size;
6627c8c0b82SPatrick Mooney for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
6637c8c0b82SPatrick Mooney page_t *pp;
6647c8c0b82SPatrick Mooney
6657c8c0b82SPatrick Mooney pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
6667c8c0b82SPatrick Mooney PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
6677c8c0b82SPatrick Mooney
6687c8c0b82SPatrick Mooney if (pp == NULL) {
6697c8c0b82SPatrick Mooney /* Destroy any already-created pages */
6707c8c0b82SPatrick Mooney if (pos != span->vs_addr) {
6717c8c0b82SPatrick Mooney vmmr_span_t destroy_span = {
6727c8c0b82SPatrick Mooney .vs_addr = span->vs_addr,
6737c8c0b82SPatrick Mooney .vs_size = pos - span->vs_addr,
6747c8c0b82SPatrick Mooney };
6757c8c0b82SPatrick Mooney
6767c8c0b82SPatrick Mooney vmmr_destroy_pages(&destroy_span);
6777c8c0b82SPatrick Mooney }
6787c8c0b82SPatrick Mooney return (ENOMEM);
6797c8c0b82SPatrick Mooney }
6807c8c0b82SPatrick Mooney
6817c8c0b82SPatrick Mooney /* mimic page state from segkmem */
6827c8c0b82SPatrick Mooney ASSERT(PAGE_EXCL(pp));
6837c8c0b82SPatrick Mooney page_io_unlock(pp);
6847c8c0b82SPatrick Mooney pp->p_lckcnt = 1;
6857c8c0b82SPatrick Mooney page_downgrade(pp);
6867c8c0b82SPatrick Mooney
6877c8c0b82SPatrick Mooney /* pre-zero the page */
6887c8c0b82SPatrick Mooney bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
6897c8c0b82SPatrick Mooney }
6907c8c0b82SPatrick Mooney
6917c8c0b82SPatrick Mooney return (0);
6927c8c0b82SPatrick Mooney }
6937c8c0b82SPatrick Mooney
6947c8c0b82SPatrick Mooney static int
vmmr_resv_wait()6957c8c0b82SPatrick Mooney vmmr_resv_wait()
6967c8c0b82SPatrick Mooney {
6977c8c0b82SPatrick Mooney if (delay_sig(hz >> 2) != 0) {
6987c8c0b82SPatrick Mooney /* bail due to interruption */
6997c8c0b82SPatrick Mooney return (0);
7007c8c0b82SPatrick Mooney }
7017c8c0b82SPatrick Mooney return (1);
7027c8c0b82SPatrick Mooney }
7037c8c0b82SPatrick Mooney
7047c8c0b82SPatrick Mooney static void
vmmr_remove_raw(size_t sz)7057c8c0b82SPatrick Mooney vmmr_remove_raw(size_t sz)
7067c8c0b82SPatrick Mooney {
7077c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0);
7087c8c0b82SPatrick Mooney VERIFY(MUTEX_HELD(&vmmr_lock));
7097c8c0b82SPatrick Mooney
7107c8c0b82SPatrick Mooney size_t remain = sz;
7117c8c0b82SPatrick Mooney while (remain > 0) {
7127c8c0b82SPatrick Mooney vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
7137c8c0b82SPatrick Mooney
7147c8c0b82SPatrick Mooney /*
7157c8c0b82SPatrick Mooney * The caller must ensure that at least `sz` amount is present
7167c8c0b82SPatrick Mooney * in the free treepair.
7177c8c0b82SPatrick Mooney */
7187c8c0b82SPatrick Mooney VERIFY3P(span, !=, NULL);
7197c8c0b82SPatrick Mooney ASSERT3U(span->vs_size, <=, remain);
7207c8c0b82SPatrick Mooney
7217c8c0b82SPatrick Mooney /* TODO: perhaps arrange to destroy pages outside the lock? */
7227c8c0b82SPatrick Mooney vmmr_destroy_pages(span);
7237c8c0b82SPatrick Mooney
7247c8c0b82SPatrick Mooney remain -= span->vs_size;
7257c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_empty_tp);
7267c8c0b82SPatrick Mooney }
7277c8c0b82SPatrick Mooney
7287c8c0b82SPatrick Mooney vmmr_empty_sz += sz;
7297c8c0b82SPatrick Mooney }
7307c8c0b82SPatrick Mooney
7316bba8b59SPatrick Mooney /*
7326bba8b59SPatrick Mooney * Add memory to vmm reservoir. Memory may be marked for transient use, where
7336bba8b59SPatrick Mooney * the addition is part of a transient allocation from the reservoir. Otherwise
7346bba8b59SPatrick Mooney * it is placed in the reservoir to be available for non-transient allocations.
7356bba8b59SPatrick Mooney *
7366bba8b59SPatrick Mooney * Expects vmmr_lock to be held when called, and will return with it held, but
7376bba8b59SPatrick Mooney * will drop it during portions of the addition.
7386bba8b59SPatrick Mooney */
7396bba8b59SPatrick Mooney static int
vmmr_add(size_t sz,bool transient)7407c8c0b82SPatrick Mooney vmmr_add(size_t sz, bool transient)
7417c8c0b82SPatrick Mooney {
7427c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0);
7436bba8b59SPatrick Mooney VERIFY3U(sz, >, 0);
7446bba8b59SPatrick Mooney VERIFY(MUTEX_HELD(&vmmr_lock));
7457c8c0b82SPatrick Mooney
7467c8c0b82SPatrick Mooney /*
7477c8c0b82SPatrick Mooney * Make sure that the amount added is not going to breach the limits
7487c8c0b82SPatrick Mooney * we've chosen
7497c8c0b82SPatrick Mooney */
7507c8c0b82SPatrick Mooney const size_t current_total =
7517c8c0b82SPatrick Mooney vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
7527c8c0b82SPatrick Mooney vmmr_alloc_transient_sz + vmmr_free_transient_sz;
7537c8c0b82SPatrick Mooney if ((current_total + sz) < current_total) {
7547c8c0b82SPatrick Mooney return (EOVERFLOW);
7557c8c0b82SPatrick Mooney }
7567c8c0b82SPatrick Mooney if ((current_total + sz) > vmmr_total_limit) {
7577c8c0b82SPatrick Mooney return (ENOSPC);
7587c8c0b82SPatrick Mooney }
7597c8c0b82SPatrick Mooney vmmr_adding_sz += sz;
7607c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
7617c8c0b82SPatrick Mooney
7627c8c0b82SPatrick Mooney /* Wait for enough pages to become available */
7637c8c0b82SPatrick Mooney if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
7647c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
7657c8c0b82SPatrick Mooney vmmr_adding_sz -= sz;
7667c8c0b82SPatrick Mooney return (EINTR);
7677c8c0b82SPatrick Mooney }
7687c8c0b82SPatrick Mooney
7697c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
7707c8c0b82SPatrick Mooney size_t added = 0;
7717c8c0b82SPatrick Mooney size_t remain = sz;
7727c8c0b82SPatrick Mooney while (added < sz) {
7737c8c0b82SPatrick Mooney vmmr_span_t *span = NULL;
7747c8c0b82SPatrick Mooney
7757c8c0b82SPatrick Mooney if (vmmr_empty_sz > 0) {
7767c8c0b82SPatrick Mooney span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
7777c8c0b82SPatrick Mooney
7787c8c0b82SPatrick Mooney vmmr_empty_sz -= span->vs_size;
7797c8c0b82SPatrick Mooney } else {
7807c8c0b82SPatrick Mooney /*
7817c8c0b82SPatrick Mooney * No empty space to fill with new pages, so just tack
7827c8c0b82SPatrick Mooney * it on at the end instead.
7837c8c0b82SPatrick Mooney */
7847c8c0b82SPatrick Mooney span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
7857c8c0b82SPatrick Mooney span->vs_addr = vmmr_empty_last;
7867c8c0b82SPatrick Mooney span->vs_size = remain;
7877c8c0b82SPatrick Mooney vmmr_empty_last += remain;
7887c8c0b82SPatrick Mooney }
7897c8c0b82SPatrick Mooney VERIFY3P(span, !=, NULL);
7907c8c0b82SPatrick Mooney
7917c8c0b82SPatrick Mooney
7927c8c0b82SPatrick Mooney /* Allocate the actual pages to back this span */
7937c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
7947c8c0b82SPatrick Mooney int err = vmmr_alloc_pages(span);
7957c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
7967c8c0b82SPatrick Mooney
7977c8c0b82SPatrick Mooney /*
7987c8c0b82SPatrick Mooney * If an error is encountered during page allocation for the
7997c8c0b82SPatrick Mooney * span, unwind any progress made by the addition request.
8007c8c0b82SPatrick Mooney */
8017c8c0b82SPatrick Mooney if (err != 0) {
8027c8c0b82SPatrick Mooney /*
8037c8c0b82SPatrick Mooney * Without pages allocated to this span, it is now
8047c8c0b82SPatrick Mooney * tracked as empty.
8057c8c0b82SPatrick Mooney */
8067c8c0b82SPatrick Mooney vmmr_empty_sz += span->vs_size;
8077c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_empty_tp);
8087c8c0b82SPatrick Mooney
8097c8c0b82SPatrick Mooney if (added != 0) {
8107c8c0b82SPatrick Mooney vmmr_remove_raw(added);
8117c8c0b82SPatrick Mooney }
8127c8c0b82SPatrick Mooney
8137c8c0b82SPatrick Mooney vmmr_adding_sz -= sz;
8147c8c0b82SPatrick Mooney
8157c8c0b82SPatrick Mooney page_unresv(sz >> PAGESHIFT);
8167c8c0b82SPatrick Mooney return (err);
8177c8c0b82SPatrick Mooney }
8187c8c0b82SPatrick Mooney
8197c8c0b82SPatrick Mooney /*
8207c8c0b82SPatrick Mooney * The allocated-page-bearing span is placed in the "free"
8217c8c0b82SPatrick Mooney * treepair now, but is not officially exposed for consumption
8227c8c0b82SPatrick Mooney * until `vmm_free_sz` or `vmm_free_transient_sz` are updated.
8237c8c0b82SPatrick Mooney *
8247c8c0b82SPatrick Mooney * This allows us to unwind the allocation in case of a failure
8257c8c0b82SPatrick Mooney * without the risk of the freshly added span(s) being snapped
8267c8c0b82SPatrick Mooney * up by a consumer already.
8277c8c0b82SPatrick Mooney */
8287c8c0b82SPatrick Mooney added += span->vs_size;
8297c8c0b82SPatrick Mooney remain -= span->vs_size;
8307c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_free_tp);
8317c8c0b82SPatrick Mooney }
8327c8c0b82SPatrick Mooney
8337c8c0b82SPatrick Mooney /* Make the added memory usable by exposing it to the size accounting */
8347c8c0b82SPatrick Mooney if (!transient) {
8357c8c0b82SPatrick Mooney vmmr_free_sz += added;
8367c8c0b82SPatrick Mooney } else {
8377c8c0b82SPatrick Mooney vmmr_free_transient_sz += added;
8387c8c0b82SPatrick Mooney }
8397c8c0b82SPatrick Mooney ASSERT3U(added, ==, sz);
8407c8c0b82SPatrick Mooney vmmr_adding_sz -= added;
8417c8c0b82SPatrick Mooney
8427c8c0b82SPatrick Mooney return (0);
8437c8c0b82SPatrick Mooney }
8447c8c0b82SPatrick Mooney
8456bba8b59SPatrick Mooney /*
8466bba8b59SPatrick Mooney * Remove memory from vmm reservoir. Normally this will remove memory from the
8476bba8b59SPatrick Mooney * reservoir which was available for non-transient allocations. If the removal
8486bba8b59SPatrick Mooney * is part of a vmmr_free() of a transient allocation, it will act on only that
8496bba8b59SPatrick Mooney * transient region being freed, not the available memory in the reservoir.
8506bba8b59SPatrick Mooney *
8516bba8b59SPatrick Mooney * Expects vmmr_lock to be held when called, and will return with it held, but
8526bba8b59SPatrick Mooney * may drop it during portions of the removal.
8536bba8b59SPatrick Mooney */
8546bba8b59SPatrick Mooney static int
vmmr_remove(size_t sz,bool transient)8557c8c0b82SPatrick Mooney vmmr_remove(size_t sz, bool transient)
8567c8c0b82SPatrick Mooney {
8577c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0);
8586bba8b59SPatrick Mooney VERIFY(sz);
8596bba8b59SPatrick Mooney VERIFY(MUTEX_HELD(&vmmr_lock));
8607c8c0b82SPatrick Mooney
8617c8c0b82SPatrick Mooney if ((!transient && sz > vmmr_free_sz) ||
8627c8c0b82SPatrick Mooney (transient && sz > vmmr_free_transient_sz)) {
8637c8c0b82SPatrick Mooney return (ENOSPC);
8647c8c0b82SPatrick Mooney }
8657c8c0b82SPatrick Mooney
8667c8c0b82SPatrick Mooney vmmr_remove_raw(sz);
8677c8c0b82SPatrick Mooney
8687c8c0b82SPatrick Mooney if (!transient) {
8697c8c0b82SPatrick Mooney vmmr_free_sz -= sz;
8707c8c0b82SPatrick Mooney } else {
8717c8c0b82SPatrick Mooney vmmr_free_transient_sz -= sz;
8727c8c0b82SPatrick Mooney }
8737c8c0b82SPatrick Mooney page_unresv(sz >> PAGESHIFT);
8747c8c0b82SPatrick Mooney return (0);
8757c8c0b82SPatrick Mooney }
8767c8c0b82SPatrick Mooney
8776bba8b59SPatrick Mooney static int
vmmr_set_target(size_t target_sz,size_t chunk_sz,size_t * resp)8786bba8b59SPatrick Mooney vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
8796bba8b59SPatrick Mooney {
8806bba8b59SPatrick Mooney VERIFY(resp != NULL);
8816bba8b59SPatrick Mooney
8826bba8b59SPatrick Mooney mutex_enter(&vmmr_lock);
8836bba8b59SPatrick Mooney
8846bba8b59SPatrick Mooney size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;
8856bba8b59SPatrick Mooney
8866bba8b59SPatrick Mooney /* Be sure to communicate current size in case of an early bail-out */
8876bba8b59SPatrick Mooney *resp = current_sz;
8886bba8b59SPatrick Mooney
8896bba8b59SPatrick Mooney if ((target_sz & PAGEOFFSET) != 0 ||
8906bba8b59SPatrick Mooney (chunk_sz & PAGEOFFSET) != 0) {
8916bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
8926bba8b59SPatrick Mooney return (EINVAL);
8936bba8b59SPatrick Mooney }
8946bba8b59SPatrick Mooney /* Reject sentinel value */
8956bba8b59SPatrick Mooney if (target_sz == VMMR_TARGET_INACTIVE) {
8966bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
8976bba8b59SPatrick Mooney return (EINVAL);
8986bba8b59SPatrick Mooney }
8996bba8b59SPatrick Mooney
9006bba8b59SPatrick Mooney /* Already at target size */
9016bba8b59SPatrick Mooney if (target_sz == current_sz) {
9026bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
9036bba8b59SPatrick Mooney return (0);
9046bba8b59SPatrick Mooney }
9056bba8b59SPatrick Mooney
9066bba8b59SPatrick Mooney /* Reject racing requests size */
9076bba8b59SPatrick Mooney if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
9086bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
9096bba8b59SPatrick Mooney return (EALREADY);
9106bba8b59SPatrick Mooney }
9116bba8b59SPatrick Mooney /* Record the target now to excluding a racing request */
9126bba8b59SPatrick Mooney vmmr_target_sz = target_sz;
9136bba8b59SPatrick Mooney
9146bba8b59SPatrick Mooney int err = 0;
9156bba8b59SPatrick Mooney do {
9166bba8b59SPatrick Mooney /* Be sensitive to signal interruption */
9176bba8b59SPatrick Mooney if (issig(JUSTLOOKING) != 0) {
9186bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
9196bba8b59SPatrick Mooney const bool sig_bail = issig(FORREAL) != 0;
9206bba8b59SPatrick Mooney mutex_enter(&vmmr_lock);
9216bba8b59SPatrick Mooney if (sig_bail) {
9226bba8b59SPatrick Mooney err = EINTR;
9236bba8b59SPatrick Mooney break;
9246bba8b59SPatrick Mooney }
9256bba8b59SPatrick Mooney }
9266bba8b59SPatrick Mooney
9276bba8b59SPatrick Mooney if (current_sz > target_sz) {
9286bba8b59SPatrick Mooney /* Shrinking reservoir */
9296bba8b59SPatrick Mooney
9306bba8b59SPatrick Mooney size_t req_sz = current_sz - target_sz;
9316bba8b59SPatrick Mooney if (chunk_sz != 0) {
9326bba8b59SPatrick Mooney req_sz = MIN(req_sz, chunk_sz);
9336bba8b59SPatrick Mooney }
9346bba8b59SPatrick Mooney err = vmmr_remove(req_sz, false);
9356bba8b59SPatrick Mooney } else {
9366bba8b59SPatrick Mooney /* Growing reservoir */
9376bba8b59SPatrick Mooney ASSERT(current_sz < target_sz);
9386bba8b59SPatrick Mooney
9396bba8b59SPatrick Mooney size_t req_sz = target_sz - current_sz;
9406bba8b59SPatrick Mooney if (chunk_sz != 0) {
9416bba8b59SPatrick Mooney req_sz = MIN(req_sz, chunk_sz);
9426bba8b59SPatrick Mooney }
9436bba8b59SPatrick Mooney err = vmmr_add(req_sz, false);
9446bba8b59SPatrick Mooney }
9456bba8b59SPatrick Mooney
9466bba8b59SPatrick Mooney current_sz = vmmr_alloc_sz + vmmr_free_sz;
9476bba8b59SPatrick Mooney } while (err == 0 && current_sz != target_sz);
9486bba8b59SPatrick Mooney
9496bba8b59SPatrick Mooney /* Clear the target now that we are done (success or not) */
9506bba8b59SPatrick Mooney vmmr_target_sz = VMMR_TARGET_INACTIVE;
9516bba8b59SPatrick Mooney mutex_exit(&vmmr_lock);
9526bba8b59SPatrick Mooney *resp = current_sz;
9536bba8b59SPatrick Mooney return (err);
9546bba8b59SPatrick Mooney }
9556bba8b59SPatrick Mooney
9567c8c0b82SPatrick Mooney int
vmmr_ioctl(int cmd,intptr_t arg,int md,cred_t * cr,int * rvalp)9577c8c0b82SPatrick Mooney vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
9587c8c0b82SPatrick Mooney {
9596bba8b59SPatrick Mooney /*
9606bba8b59SPatrick Mooney * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
9616bba8b59SPatrick Mooney * do not need to duplicate such checks here.
9626bba8b59SPatrick Mooney */
9636bba8b59SPatrick Mooney
9647c8c0b82SPatrick Mooney switch (cmd) {
9657c8c0b82SPatrick Mooney case VMM_RESV_QUERY: {
9667c8c0b82SPatrick Mooney struct vmm_resv_query res;
9677c8c0b82SPatrick Mooney void *datap = (void *)(uintptr_t)arg;
9687c8c0b82SPatrick Mooney
969f4659490SPatrick Mooney /* For now, anyone with access to vmmctl device can query */
9707c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock);
9717c8c0b82SPatrick Mooney res.vrq_free_sz = vmmr_free_sz;
9727c8c0b82SPatrick Mooney res.vrq_alloc_sz = vmmr_alloc_sz;
9737c8c0b82SPatrick Mooney res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
9747c8c0b82SPatrick Mooney res.vrq_limit = vmmr_total_limit;
9757c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock);
9767c8c0b82SPatrick Mooney if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
9777c8c0b82SPatrick Mooney return (EFAULT);
9787c8c0b82SPatrick Mooney }
9797c8c0b82SPatrick Mooney break;
9807c8c0b82SPatrick Mooney }
9816bba8b59SPatrick Mooney case VMM_RESV_SET_TARGET: {
9827c8c0b82SPatrick Mooney if (secpolicy_sys_config(cr, B_FALSE) != 0) {
9837c8c0b82SPatrick Mooney return (EPERM);
9847c8c0b82SPatrick Mooney }
9856bba8b59SPatrick Mooney
9866bba8b59SPatrick Mooney struct vmm_resv_target tgt;
9876bba8b59SPatrick Mooney void *datap = (void *)(uintptr_t)arg;
9886bba8b59SPatrick Mooney
9896bba8b59SPatrick Mooney if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
9906bba8b59SPatrick Mooney return (EFAULT);
9917c8c0b82SPatrick Mooney }
9926bba8b59SPatrick Mooney
9936bba8b59SPatrick Mooney int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
9946bba8b59SPatrick Mooney &tgt.vrt_result_sz);
9956bba8b59SPatrick Mooney
9966bba8b59SPatrick Mooney /*
9976bba8b59SPatrick Mooney * Attempt to communicate the resultant size of the reservoir if
9986bba8b59SPatrick Mooney * setting it to the target was a success, or if we were
9996bba8b59SPatrick Mooney * interrupted (by a signal) while doing so.
10006bba8b59SPatrick Mooney */
10016bba8b59SPatrick Mooney if (err == 0 || err == EINTR) {
10026bba8b59SPatrick Mooney if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
10036bba8b59SPatrick Mooney err = EFAULT;
10047c8c0b82SPatrick Mooney }
10056bba8b59SPatrick Mooney }
10066bba8b59SPatrick Mooney
10076bba8b59SPatrick Mooney return (err);
10087c8c0b82SPatrick Mooney }
10097c8c0b82SPatrick Mooney default:
10107c8c0b82SPatrick Mooney return (ENOTTY);
10117c8c0b82SPatrick Mooney }
10127c8c0b82SPatrick Mooney return (0);
10137c8c0b82SPatrick Mooney }
1014