/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2023 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs. This memory is unavailable for general use by the rest of the
 * system. Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available. Since the pages stored in the
 * reservoir are pre-zeroed, it can be immediately used when allocated to a
 * guest. When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system. In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir. When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally. Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations. When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made. When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation. This allows both allocation types to coexist without too much
 * additional machinery.
 *
 *
 * Administration
 *
 * Operators may attempt to alter the amount of memory allocated to the
 * reservoir via an ioctl against the vmmctl device. The total amount of memory
 * in the reservoir (free, or allocated to VMs) is arbitrarily limited at this
 * time by `vmmr_total_limit`, which is computed in vmmr_init() as all of
 * physmem less 120% of `pages_pp_maximum`. This is done to prevent the
 * reservoir from inadvertently growing to a size where the system has
 * inadequate memory to make forward progress. Shrinking the reservoir is only
 * possible when it contains free (not allocated by any guest VMs) memory.
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object). This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple. It
 * means that spans of pages, either free or allocated, need only to store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span.
Associating the 757c8c0b82SPatrick Mooney * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be 767c8c0b82SPatrick Mooney * properly tracked as KAS pages, but be excluded from normal dumps (unless the 777c8c0b82SPatrick Mooney * operator has chosen to dump all of RAM). 787c8c0b82SPatrick Mooney */ 797c8c0b82SPatrick Mooney 807c8c0b82SPatrick Mooney #include <sys/types.h> 817c8c0b82SPatrick Mooney #include <sys/mutex.h> 827c8c0b82SPatrick Mooney #include <sys/avl.h> 837c8c0b82SPatrick Mooney #include <sys/list.h> 847c8c0b82SPatrick Mooney #include <sys/machparam.h> 857c8c0b82SPatrick Mooney #include <sys/kmem.h> 867c8c0b82SPatrick Mooney #include <sys/stddef.h> 877c8c0b82SPatrick Mooney #include <sys/null.h> 887c8c0b82SPatrick Mooney #include <sys/errno.h> 897c8c0b82SPatrick Mooney #include <sys/systm.h> 907c8c0b82SPatrick Mooney #include <sys/sunddi.h> 917c8c0b82SPatrick Mooney #include <sys/policy.h> 927c8c0b82SPatrick Mooney #include <vm/seg_kmem.h> 937c8c0b82SPatrick Mooney #include <vm/hat_i86.h> 94*6bba8b59SPatrick Mooney #include <sys/kstat.h> 957c8c0b82SPatrick Mooney 967c8c0b82SPatrick Mooney #include <sys/vmm_reservoir.h> 977c8c0b82SPatrick Mooney #include <sys/vmm_dev.h> 98*6bba8b59SPatrick Mooney #include <sys/vmm_impl.h> 99*6bba8b59SPatrick Mooney 100*6bba8b59SPatrick Mooney #define VMMR_TARGET_INACTIVE SIZE_MAX 1017c8c0b82SPatrick Mooney 1027c8c0b82SPatrick Mooney static kmutex_t vmmr_lock; 1037c8c0b82SPatrick Mooney 1047c8c0b82SPatrick Mooney static size_t vmmr_free_sz; 1057c8c0b82SPatrick Mooney static size_t vmmr_free_transient_sz; 1067c8c0b82SPatrick Mooney static size_t vmmr_adding_sz; 1077c8c0b82SPatrick Mooney static size_t vmmr_alloc_sz; 1087c8c0b82SPatrick Mooney static size_t vmmr_alloc_transient_sz; 1097c8c0b82SPatrick Mooney static size_t vmmr_empty_sz; 1107c8c0b82SPatrick Mooney 111*6bba8b59SPatrick Mooney /* 112*6bba8b59SPatrick Mooney * Target size of the reservoir during active vmmr_set_target() operation. 
113*6bba8b59SPatrick Mooney * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is active. 114*6bba8b59SPatrick Mooney */ 115*6bba8b59SPatrick Mooney static size_t vmmr_target_sz; 116*6bba8b59SPatrick Mooney 1177c8c0b82SPatrick Mooney static uintptr_t vmmr_empty_last; 1187c8c0b82SPatrick Mooney /* Upper limit for the size (free + allocated) of the reservoir */ 1197c8c0b82SPatrick Mooney static size_t vmmr_total_limit; 1207c8c0b82SPatrick Mooney 1217c8c0b82SPatrick Mooney /* VA range allocated from the VMM arena for the mappings */ 1227c8c0b82SPatrick Mooney static uintptr_t vmmr_va; 1237c8c0b82SPatrick Mooney static uintptr_t vmmr_va_sz; 1247c8c0b82SPatrick Mooney 125*6bba8b59SPatrick Mooney static kstat_t *vmmr_kstat; 126*6bba8b59SPatrick Mooney 1277c8c0b82SPatrick Mooney /* Pair of AVL trees to store set of spans ordered by addr and size */ 1287c8c0b82SPatrick Mooney typedef struct vmmr_treepair { 1297c8c0b82SPatrick Mooney avl_tree_t by_addr; 1307c8c0b82SPatrick Mooney avl_tree_t by_size; 1317c8c0b82SPatrick Mooney } vmmr_treepair_t; 1327c8c0b82SPatrick Mooney 1337c8c0b82SPatrick Mooney /* Spans of free memory in the reservoir */ 1347c8c0b82SPatrick Mooney static vmmr_treepair_t vmmr_free_tp; 1357c8c0b82SPatrick Mooney 1367c8c0b82SPatrick Mooney /* Spans of empty (not backed by memory) space in the reservoir */ 1377c8c0b82SPatrick Mooney static vmmr_treepair_t vmmr_empty_tp; 1387c8c0b82SPatrick Mooney 1397c8c0b82SPatrick Mooney /* Regions of memory allocated from the reservoir */ 1407c8c0b82SPatrick Mooney static list_t vmmr_alloc_regions; 1417c8c0b82SPatrick Mooney 1427c8c0b82SPatrick Mooney struct vmmr_span { 1437c8c0b82SPatrick Mooney uintptr_t vs_addr; 1447c8c0b82SPatrick Mooney size_t vs_size; 1457c8c0b82SPatrick Mooney avl_node_t vs_by_addr; 1467c8c0b82SPatrick Mooney avl_node_t vs_by_size; 1477c8c0b82SPatrick Mooney uintptr_t vs_region_addr; 1487c8c0b82SPatrick Mooney }; 1497c8c0b82SPatrick Mooney typedef struct vmmr_span vmmr_span_t; 
1507c8c0b82SPatrick Mooney 1517c8c0b82SPatrick Mooney struct vmmr_region { 1527c8c0b82SPatrick Mooney size_t vr_size; 1537c8c0b82SPatrick Mooney avl_tree_t vr_spans; 1547c8c0b82SPatrick Mooney list_node_t vr_node; 1557c8c0b82SPatrick Mooney bool vr_transient; 1567c8c0b82SPatrick Mooney }; 1577c8c0b82SPatrick Mooney 158*6bba8b59SPatrick Mooney typedef struct vmmr_kstats { 159*6bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_free; 160*6bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_alloc; 161*6bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_transient; 162*6bba8b59SPatrick Mooney kstat_named_t vmrks_bytes_limit; 163*6bba8b59SPatrick Mooney } vmmr_kstats_t; 164*6bba8b59SPatrick Mooney 165*6bba8b59SPatrick Mooney 166*6bba8b59SPatrick Mooney static int vmmr_add(size_t, bool); 167*6bba8b59SPatrick Mooney static int vmmr_remove(size_t, bool); 168*6bba8b59SPatrick Mooney 1697c8c0b82SPatrick Mooney static int 1707c8c0b82SPatrick Mooney vmmr_cmp_addr(const void *a, const void *b) 1717c8c0b82SPatrick Mooney { 1727c8c0b82SPatrick Mooney const vmmr_span_t *sa = a; 1737c8c0b82SPatrick Mooney const vmmr_span_t *sb = b; 1747c8c0b82SPatrick Mooney 1757c8c0b82SPatrick Mooney if (sa->vs_addr == sb->vs_addr) { 1767c8c0b82SPatrick Mooney return (0); 1777c8c0b82SPatrick Mooney } else if (sa->vs_addr < sb->vs_addr) { 1787c8c0b82SPatrick Mooney return (-1); 1797c8c0b82SPatrick Mooney } else { 1807c8c0b82SPatrick Mooney return (1); 1817c8c0b82SPatrick Mooney } 1827c8c0b82SPatrick Mooney } 1837c8c0b82SPatrick Mooney 1847c8c0b82SPatrick Mooney static int 1857c8c0b82SPatrick Mooney vmmr_cmp_size(const void *a, const void *b) 1867c8c0b82SPatrick Mooney { 1877c8c0b82SPatrick Mooney const vmmr_span_t *sa = a; 1887c8c0b82SPatrick Mooney const vmmr_span_t *sb = b; 1897c8c0b82SPatrick Mooney 1907c8c0b82SPatrick Mooney if (sa->vs_size == sb->vs_size) { 1917c8c0b82SPatrick Mooney /* 1927c8c0b82SPatrick Mooney * Since discontiguous spans could have the same size in a 1937c8c0b82SPatrick Mooney * 
by-size tree, differentiate them (as required by AVL) by 1947c8c0b82SPatrick Mooney * address so they can safely coexist while remaining sorted. 1957c8c0b82SPatrick Mooney */ 1967c8c0b82SPatrick Mooney return (vmmr_cmp_addr(a, b)); 1977c8c0b82SPatrick Mooney } else if (sa->vs_size < sb->vs_size) { 1987c8c0b82SPatrick Mooney return (-1); 1997c8c0b82SPatrick Mooney } else { 2007c8c0b82SPatrick Mooney return (1); 2017c8c0b82SPatrick Mooney } 2027c8c0b82SPatrick Mooney } 2037c8c0b82SPatrick Mooney 2047c8c0b82SPatrick Mooney static int 2057c8c0b82SPatrick Mooney vmmr_cmp_region_addr(const void *a, const void *b) 2067c8c0b82SPatrick Mooney { 2077c8c0b82SPatrick Mooney const vmmr_span_t *sa = a; 2087c8c0b82SPatrick Mooney const vmmr_span_t *sb = b; 2097c8c0b82SPatrick Mooney 2107c8c0b82SPatrick Mooney if (sa->vs_region_addr == sb->vs_region_addr) { 2117c8c0b82SPatrick Mooney return (0); 2127c8c0b82SPatrick Mooney } else if (sa->vs_region_addr < sb->vs_region_addr) { 2137c8c0b82SPatrick Mooney return (-1); 2147c8c0b82SPatrick Mooney } else { 2157c8c0b82SPatrick Mooney return (1); 2167c8c0b82SPatrick Mooney } 2177c8c0b82SPatrick Mooney } 2187c8c0b82SPatrick Mooney 2197c8c0b82SPatrick Mooney static void 2207c8c0b82SPatrick Mooney vmmr_tp_init(vmmr_treepair_t *tree) 2217c8c0b82SPatrick Mooney { 2227c8c0b82SPatrick Mooney avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t), 2237c8c0b82SPatrick Mooney offsetof(vmmr_span_t, vs_by_addr)); 2247c8c0b82SPatrick Mooney avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t), 2257c8c0b82SPatrick Mooney offsetof(vmmr_span_t, vs_by_size)); 2267c8c0b82SPatrick Mooney } 2277c8c0b82SPatrick Mooney 2287c8c0b82SPatrick Mooney static void 2297c8c0b82SPatrick Mooney vmmr_tp_destroy(vmmr_treepair_t *tree) 2307c8c0b82SPatrick Mooney { 2317c8c0b82SPatrick Mooney void *vcp = NULL; 2327c8c0b82SPatrick Mooney vmmr_span_t *span; 2337c8c0b82SPatrick Mooney 2347c8c0b82SPatrick Mooney while (avl_destroy_nodes(&tree->by_addr, &vcp) != 
NULL) { 2357c8c0b82SPatrick Mooney /* Freeing spans will be done when tearing down by-size tree */ 2367c8c0b82SPatrick Mooney } 2377c8c0b82SPatrick Mooney while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) { 2387c8c0b82SPatrick Mooney kmem_free(span, sizeof (*span)); 2397c8c0b82SPatrick Mooney } 2407c8c0b82SPatrick Mooney avl_destroy(&tree->by_addr); 2417c8c0b82SPatrick Mooney avl_destroy(&tree->by_size); 2427c8c0b82SPatrick Mooney } 2437c8c0b82SPatrick Mooney 2447c8c0b82SPatrick Mooney /* 2457c8c0b82SPatrick Mooney * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent 2467c8c0b82SPatrick Mooney * span(s). Such concatenation could result in the `to_add` span being freed, 2477c8c0b82SPatrick Mooney * so the caller cannot use it after this returns. 2487c8c0b82SPatrick Mooney */ 2497c8c0b82SPatrick Mooney static void 2507c8c0b82SPatrick Mooney vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree) 2517c8c0b82SPatrick Mooney { 2527c8c0b82SPatrick Mooney avl_tree_t *by_addr = &tree->by_addr; 2537c8c0b82SPatrick Mooney avl_tree_t *by_size = &tree->by_size; 2547c8c0b82SPatrick Mooney vmmr_span_t *node; 2557c8c0b82SPatrick Mooney avl_index_t where; 2567c8c0b82SPatrick Mooney 2577c8c0b82SPatrick Mooney /* This addr should not already exist in the treepair */ 2587c8c0b82SPatrick Mooney node = avl_find(by_addr, to_add, &where); 2597c8c0b82SPatrick Mooney ASSERT3P(node, ==, NULL); 2607c8c0b82SPatrick Mooney 2617c8c0b82SPatrick Mooney node = avl_nearest(by_addr, where, AVL_BEFORE); 2627c8c0b82SPatrick Mooney if (node != NULL && 2637c8c0b82SPatrick Mooney (node->vs_addr + node->vs_size) == to_add->vs_addr) { 2647c8c0b82SPatrick Mooney /* concat with preceeding item */ 2657c8c0b82SPatrick Mooney avl_remove(by_addr, node); 2667c8c0b82SPatrick Mooney avl_remove(by_size, node); 2677c8c0b82SPatrick Mooney node->vs_size += to_add->vs_size; 2687c8c0b82SPatrick Mooney kmem_free(to_add, sizeof (*to_add)); 2697c8c0b82SPatrick Mooney 
2707c8c0b82SPatrick Mooney /* 2717c8c0b82SPatrick Mooney * Since this now-concatenated span could be adjacent one 2727c8c0b82SPatrick Mooney * trailing it, fall through to perform that check. 2737c8c0b82SPatrick Mooney */ 2747c8c0b82SPatrick Mooney to_add = node; 2757c8c0b82SPatrick Mooney } 2767c8c0b82SPatrick Mooney 2777c8c0b82SPatrick Mooney node = avl_nearest(by_addr, where, AVL_AFTER); 2787c8c0b82SPatrick Mooney if (node != NULL && 2797c8c0b82SPatrick Mooney (to_add->vs_addr + to_add->vs_size) == node->vs_addr) { 2807c8c0b82SPatrick Mooney /* concat with trailing item */ 2817c8c0b82SPatrick Mooney avl_remove(by_addr, node); 2827c8c0b82SPatrick Mooney avl_remove(by_size, node); 2837c8c0b82SPatrick Mooney node->vs_addr = to_add->vs_addr; 2847c8c0b82SPatrick Mooney node->vs_size += to_add->vs_size; 2857c8c0b82SPatrick Mooney avl_add(by_addr, node); 2867c8c0b82SPatrick Mooney avl_add(by_size, node); 2877c8c0b82SPatrick Mooney 2887c8c0b82SPatrick Mooney kmem_free(to_add, sizeof (*to_add)); 2897c8c0b82SPatrick Mooney return; 2907c8c0b82SPatrick Mooney } 2917c8c0b82SPatrick Mooney 2927c8c0b82SPatrick Mooney /* simply insert */ 2937c8c0b82SPatrick Mooney avl_add(by_addr, to_add); 2947c8c0b82SPatrick Mooney avl_add(by_size, to_add); 2957c8c0b82SPatrick Mooney } 2967c8c0b82SPatrick Mooney 2977c8c0b82SPatrick Mooney /* 2987c8c0b82SPatrick Mooney * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of 2997c8c0b82SPatrick Mooney * the exact target size is not present, but a larger one is. May return a span 3007c8c0b82SPatrick Mooney * with a size smaller than the target if splitting is not an option. 
3017c8c0b82SPatrick Mooney */ 3027c8c0b82SPatrick Mooney static vmmr_span_t * 3037c8c0b82SPatrick Mooney vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree) 3047c8c0b82SPatrick Mooney { 3057c8c0b82SPatrick Mooney avl_tree_t *by_addr = &tree->by_addr; 3067c8c0b82SPatrick Mooney avl_tree_t *by_size = &tree->by_size; 3077c8c0b82SPatrick Mooney vmmr_span_t *span; 3087c8c0b82SPatrick Mooney avl_index_t where; 3097c8c0b82SPatrick Mooney 3107c8c0b82SPatrick Mooney ASSERT3U(target_sz, !=, 0); 3117c8c0b82SPatrick Mooney ASSERT(!avl_is_empty(by_addr)); 3127c8c0b82SPatrick Mooney ASSERT(!avl_is_empty(by_size)); 3137c8c0b82SPatrick Mooney 3147c8c0b82SPatrick Mooney vmmr_span_t search = { .vs_size = target_sz }; 3157c8c0b82SPatrick Mooney span = avl_find(by_size, &search, &where); 3167c8c0b82SPatrick Mooney if (span == NULL) { 3177c8c0b82SPatrick Mooney /* Try for a larger span (instead of exact match) */ 3187c8c0b82SPatrick Mooney span = avl_nearest(by_size, where, AVL_AFTER); 3197c8c0b82SPatrick Mooney if (span == NULL) { 3207c8c0b82SPatrick Mooney /* 3217c8c0b82SPatrick Mooney * Caller will need to collect several smaller spans in 3227c8c0b82SPatrick Mooney * order to fulfill their request. 
3237c8c0b82SPatrick Mooney */ 3247c8c0b82SPatrick Mooney span = avl_nearest(by_size, where, AVL_BEFORE); 3257c8c0b82SPatrick Mooney ASSERT3P(span, !=, NULL); 3267c8c0b82SPatrick Mooney } 3277c8c0b82SPatrick Mooney } 3287c8c0b82SPatrick Mooney 3297c8c0b82SPatrick Mooney if (span->vs_size <= target_sz) { 3307c8c0b82SPatrick Mooney avl_remove(by_size, span); 3317c8c0b82SPatrick Mooney avl_remove(by_addr, span); 3327c8c0b82SPatrick Mooney 3337c8c0b82SPatrick Mooney return (span); 3347c8c0b82SPatrick Mooney } else { 3357c8c0b82SPatrick Mooney /* Split off adequate chunk from larger span */ 3367c8c0b82SPatrick Mooney uintptr_t start = span->vs_addr + span->vs_size - target_sz; 3377c8c0b82SPatrick Mooney 3387c8c0b82SPatrick Mooney avl_remove(by_size, span); 3397c8c0b82SPatrick Mooney span->vs_size -= target_sz; 3407c8c0b82SPatrick Mooney avl_add(by_size, span); 3417c8c0b82SPatrick Mooney 3427c8c0b82SPatrick Mooney vmmr_span_t *split_span = 3437c8c0b82SPatrick Mooney kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP); 3447c8c0b82SPatrick Mooney split_span->vs_addr = start; 3457c8c0b82SPatrick Mooney split_span->vs_size = target_sz; 3467c8c0b82SPatrick Mooney 3477c8c0b82SPatrick Mooney return (split_span); 3487c8c0b82SPatrick Mooney } 3497c8c0b82SPatrick Mooney } 3507c8c0b82SPatrick Mooney 351*6bba8b59SPatrick Mooney static int 352*6bba8b59SPatrick Mooney vmmr_kstat_update(struct kstat *ksp, int rw) 353*6bba8b59SPatrick Mooney { 354*6bba8b59SPatrick Mooney vmmr_kstats_t *vkp = ksp->ks_data; 355*6bba8b59SPatrick Mooney 356*6bba8b59SPatrick Mooney mutex_enter(&vmmr_lock); 357*6bba8b59SPatrick Mooney vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz; 358*6bba8b59SPatrick Mooney vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz; 359*6bba8b59SPatrick Mooney /* 360*6bba8b59SPatrick Mooney * In addition to the memory which is actually actually allocated to 361*6bba8b59SPatrick Mooney * transient consumers, memory which is considered free-for-transient is 362*6bba8b59SPatrick Mooney * also 
included in the sizing. 363*6bba8b59SPatrick Mooney */ 364*6bba8b59SPatrick Mooney vkp->vmrks_bytes_transient.value.ui64 = 365*6bba8b59SPatrick Mooney vmmr_alloc_transient_sz + vmmr_free_transient_sz; 366*6bba8b59SPatrick Mooney vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit; 367*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 368*6bba8b59SPatrick Mooney 369*6bba8b59SPatrick Mooney return (0); 370*6bba8b59SPatrick Mooney } 371*6bba8b59SPatrick Mooney 372*6bba8b59SPatrick Mooney int 3737c8c0b82SPatrick Mooney vmmr_init() 3747c8c0b82SPatrick Mooney { 3757c8c0b82SPatrick Mooney mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL); 3767c8c0b82SPatrick Mooney 3777c8c0b82SPatrick Mooney /* 3787c8c0b82SPatrick Mooney * `vmm_total_limit` represents the absolute maximum size of the VMM 3797c8c0b82SPatrick Mooney * memory reservoir. It is meant to provide some measure of protection 3807c8c0b82SPatrick Mooney * against an operator pushing the system into unrecoverable memory 3817c8c0b82SPatrick Mooney * starvation through explicit or transient additions to the reservoir. 3827c8c0b82SPatrick Mooney * 3837c8c0b82SPatrick Mooney * There will be many situations where this limit would be inadequate to 3847c8c0b82SPatrick Mooney * prevent kernel memory starvation in the face of certain operator 3857c8c0b82SPatrick Mooney * actions. It is a balance to be struck between safety and allowing 3867c8c0b82SPatrick Mooney * large systems to reach high utilization. 3877c8c0b82SPatrick Mooney * 3887c8c0b82SPatrick Mooney * The value is based off of pages_pp_maximum: "Number of currently 3897c8c0b82SPatrick Mooney * available pages that cannot be 'locked'". It is sized as all of 3907c8c0b82SPatrick Mooney * `physmem` less 120% of `pages_pp_maximum`. 
3917c8c0b82SPatrick Mooney */ 3927c8c0b82SPatrick Mooney vmmr_total_limit = 3937c8c0b82SPatrick Mooney (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10; 3947c8c0b82SPatrick Mooney 3957c8c0b82SPatrick Mooney vmmr_empty_last = 0; 3967c8c0b82SPatrick Mooney vmmr_free_sz = 0; 3977c8c0b82SPatrick Mooney vmmr_alloc_sz = 0; 3987c8c0b82SPatrick Mooney vmmr_empty_sz = 0; 3997c8c0b82SPatrick Mooney vmmr_adding_sz = 0; 4007c8c0b82SPatrick Mooney vmmr_free_transient_sz = 0; 4017c8c0b82SPatrick Mooney vmmr_alloc_transient_sz = 0; 402*6bba8b59SPatrick Mooney vmmr_target_sz = VMMR_TARGET_INACTIVE; 403*6bba8b59SPatrick Mooney 404*6bba8b59SPatrick Mooney /* 405*6bba8b59SPatrick Mooney * Attempt kstat allocation early, since it is the only part of 406*6bba8b59SPatrick Mooney * reservoir initialization which is fallible. 407*6bba8b59SPatrick Mooney */ 408*6bba8b59SPatrick Mooney kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir", 409*6bba8b59SPatrick Mooney VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 410*6bba8b59SPatrick Mooney sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID); 411*6bba8b59SPatrick Mooney if (ksp == NULL) { 412*6bba8b59SPatrick Mooney mutex_destroy(&vmmr_lock); 413*6bba8b59SPatrick Mooney return (ENOMEM); 414*6bba8b59SPatrick Mooney } 415*6bba8b59SPatrick Mooney 416*6bba8b59SPatrick Mooney vmmr_kstats_t *vkp = ksp->ks_data; 417*6bba8b59SPatrick Mooney 418*6bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free", 419*6bba8b59SPatrick Mooney KSTAT_DATA_UINT64); 420*6bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc", 421*6bba8b59SPatrick Mooney KSTAT_DATA_UINT64); 422*6bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc", 423*6bba8b59SPatrick Mooney KSTAT_DATA_UINT64); 424*6bba8b59SPatrick Mooney kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit", 425*6bba8b59SPatrick Mooney KSTAT_DATA_UINT64); 426*6bba8b59SPatrick Mooney ksp->ks_private = 
NULL; 427*6bba8b59SPatrick Mooney ksp->ks_update = vmmr_kstat_update; 428*6bba8b59SPatrick Mooney vmmr_kstat = ksp; 4297c8c0b82SPatrick Mooney 4307c8c0b82SPatrick Mooney vmmr_tp_init(&vmmr_free_tp); 4317c8c0b82SPatrick Mooney vmmr_tp_init(&vmmr_empty_tp); 4327c8c0b82SPatrick Mooney 4337c8c0b82SPatrick Mooney list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t), 4347c8c0b82SPatrick Mooney offsetof(vmmr_region_t, vr_node)); 4357c8c0b82SPatrick Mooney 4367c8c0b82SPatrick Mooney /* Grab a chunk of VA for the reservoir */ 4377c8c0b82SPatrick Mooney vmmr_va_sz = physmem * PAGESIZE; 4387c8c0b82SPatrick Mooney vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP); 439*6bba8b59SPatrick Mooney 440*6bba8b59SPatrick Mooney kstat_install(vmmr_kstat); 441*6bba8b59SPatrick Mooney 442*6bba8b59SPatrick Mooney return (0); 4437c8c0b82SPatrick Mooney } 4447c8c0b82SPatrick Mooney 4457c8c0b82SPatrick Mooney void 4467c8c0b82SPatrick Mooney vmmr_fini() 4477c8c0b82SPatrick Mooney { 4487c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 4497c8c0b82SPatrick Mooney VERIFY3U(vmmr_alloc_sz, ==, 0); 4507c8c0b82SPatrick Mooney VERIFY3U(vmmr_free_sz, ==, 0); 4517c8c0b82SPatrick Mooney VERIFY3U(vmmr_adding_sz, ==, 0); 4527c8c0b82SPatrick Mooney VERIFY3U(vmmr_alloc_transient_sz, ==, 0); 4537c8c0b82SPatrick Mooney VERIFY3U(vmmr_free_transient_sz, ==, 0); 4547c8c0b82SPatrick Mooney VERIFY(avl_is_empty(&vmmr_free_tp.by_addr)); 4557c8c0b82SPatrick Mooney VERIFY(avl_is_empty(&vmmr_free_tp.by_size)); 4567c8c0b82SPatrick Mooney VERIFY(list_is_empty(&vmmr_alloc_regions)); 4577c8c0b82SPatrick Mooney 458*6bba8b59SPatrick Mooney kstat_delete(vmmr_kstat); 459*6bba8b59SPatrick Mooney vmmr_kstat = NULL; 460*6bba8b59SPatrick Mooney 4617c8c0b82SPatrick Mooney vmmr_tp_destroy(&vmmr_free_tp); 4627c8c0b82SPatrick Mooney vmmr_tp_destroy(&vmmr_empty_tp); 4637c8c0b82SPatrick Mooney list_destroy(&vmmr_alloc_regions); 4647c8c0b82SPatrick Mooney 4657c8c0b82SPatrick Mooney /* Release reservoir VA chunk */ 
4667c8c0b82SPatrick Mooney vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz); 4677c8c0b82SPatrick Mooney vmmr_va = 0; 4687c8c0b82SPatrick Mooney vmmr_va_sz = 0; 4697c8c0b82SPatrick Mooney vmmr_total_limit = 0; 4707c8c0b82SPatrick Mooney vmmr_empty_last = 0; 4717c8c0b82SPatrick Mooney 4727c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 4737c8c0b82SPatrick Mooney mutex_destroy(&vmmr_lock); 4747c8c0b82SPatrick Mooney } 4757c8c0b82SPatrick Mooney 4767c8c0b82SPatrick Mooney bool 4777c8c0b82SPatrick Mooney vmmr_is_empty() 4787c8c0b82SPatrick Mooney { 4797c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 4807c8c0b82SPatrick Mooney bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 && 4817c8c0b82SPatrick Mooney vmmr_free_sz == 0 && vmmr_free_transient_sz == 0); 4827c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 4837c8c0b82SPatrick Mooney return (res); 4847c8c0b82SPatrick Mooney } 4857c8c0b82SPatrick Mooney 4867c8c0b82SPatrick Mooney int 4877c8c0b82SPatrick Mooney vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp) 4887c8c0b82SPatrick Mooney { 4897c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0); 4907c8c0b82SPatrick Mooney 4917c8c0b82SPatrick Mooney if (!transient) { 4927c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 4937c8c0b82SPatrick Mooney if (sz > vmmr_free_sz) { 4947c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 4957c8c0b82SPatrick Mooney return (ENOSPC); 4967c8c0b82SPatrick Mooney } 4977c8c0b82SPatrick Mooney } else { 4987c8c0b82SPatrick Mooney int err; 4997c8c0b82SPatrick Mooney 500*6bba8b59SPatrick Mooney mutex_enter(&vmmr_lock); 5017c8c0b82SPatrick Mooney err = vmmr_add(sz, true); 5027c8c0b82SPatrick Mooney if (err != 0) { 503*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 5047c8c0b82SPatrick Mooney return (err); 5057c8c0b82SPatrick Mooney } 5067c8c0b82SPatrick Mooney VERIFY3U(vmmr_free_transient_sz, >=, sz); 5077c8c0b82SPatrick Mooney } 5087c8c0b82SPatrick Mooney 5097c8c0b82SPatrick Mooney vmmr_region_t *region; 5107c8c0b82SPatrick 
Mooney region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP); 5117c8c0b82SPatrick Mooney avl_create(®ion->vr_spans, vmmr_cmp_region_addr, 5127c8c0b82SPatrick Mooney sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr)); 5137c8c0b82SPatrick Mooney region->vr_size = sz; 5147c8c0b82SPatrick Mooney 5157c8c0b82SPatrick Mooney size_t remain = sz; 5167c8c0b82SPatrick Mooney uintptr_t map_at = 0; 5177c8c0b82SPatrick Mooney while (remain > 0) { 5187c8c0b82SPatrick Mooney vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp); 5197c8c0b82SPatrick Mooney 5207c8c0b82SPatrick Mooney /* 5217c8c0b82SPatrick Mooney * We have already ensured that adequate free memory is present 5227c8c0b82SPatrick Mooney * in the reservoir for this allocation. 5237c8c0b82SPatrick Mooney */ 5247c8c0b82SPatrick Mooney VERIFY3P(span, !=, NULL); 5257c8c0b82SPatrick Mooney ASSERT3U(span->vs_size, <=, remain); 5267c8c0b82SPatrick Mooney 5277c8c0b82SPatrick Mooney span->vs_region_addr = map_at; 5287c8c0b82SPatrick Mooney avl_add(®ion->vr_spans, span); 5297c8c0b82SPatrick Mooney map_at += span->vs_size; 5307c8c0b82SPatrick Mooney remain -= span->vs_size; 5317c8c0b82SPatrick Mooney } 5327c8c0b82SPatrick Mooney 5337c8c0b82SPatrick Mooney if (!transient) { 5347c8c0b82SPatrick Mooney vmmr_free_sz -= sz; 5357c8c0b82SPatrick Mooney vmmr_alloc_sz += sz; 5367c8c0b82SPatrick Mooney } else { 5377c8c0b82SPatrick Mooney vmmr_free_transient_sz -= sz; 5387c8c0b82SPatrick Mooney vmmr_alloc_transient_sz += sz; 5397c8c0b82SPatrick Mooney region->vr_transient = true; 5407c8c0b82SPatrick Mooney } 5417c8c0b82SPatrick Mooney list_insert_tail(&vmmr_alloc_regions, region); 5427c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 5437c8c0b82SPatrick Mooney 5447c8c0b82SPatrick Mooney *resp = region; 5457c8c0b82SPatrick Mooney return (0); 5467c8c0b82SPatrick Mooney } 5477c8c0b82SPatrick Mooney 5487c8c0b82SPatrick Mooney void * 5497c8c0b82SPatrick Mooney vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off) 
5507c8c0b82SPatrick Mooney { 5517c8c0b82SPatrick Mooney /* just use KPM region for now */ 5527c8c0b82SPatrick Mooney return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off))); 5537c8c0b82SPatrick Mooney } 5547c8c0b82SPatrick Mooney 5557c8c0b82SPatrick Mooney pfn_t 5567c8c0b82SPatrick Mooney vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off) 5577c8c0b82SPatrick Mooney { 5587c8c0b82SPatrick Mooney VERIFY3U(off & PAGEOFFSET, ==, 0); 5597c8c0b82SPatrick Mooney VERIFY3U(off, <, region->vr_size); 5607c8c0b82SPatrick Mooney 5617c8c0b82SPatrick Mooney vmmr_span_t search = { 5627c8c0b82SPatrick Mooney .vs_region_addr = off 5637c8c0b82SPatrick Mooney }; 5647c8c0b82SPatrick Mooney avl_index_t where; 5657c8c0b82SPatrick Mooney vmmr_span_t *span = avl_find(®ion->vr_spans, &search, &where); 5667c8c0b82SPatrick Mooney 5677c8c0b82SPatrick Mooney if (span == NULL) { 5687c8c0b82SPatrick Mooney span = avl_nearest(®ion->vr_spans, where, AVL_BEFORE); 5697c8c0b82SPatrick Mooney ASSERT3P(span, !=, NULL); 5707c8c0b82SPatrick Mooney } 5717c8c0b82SPatrick Mooney uintptr_t span_off = off - span->vs_region_addr + span->vs_addr; 5727c8c0b82SPatrick Mooney page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off); 5737c8c0b82SPatrick Mooney VERIFY(pp != NULL); 5747c8c0b82SPatrick Mooney return (pp->p_pagenum); 5757c8c0b82SPatrick Mooney } 5767c8c0b82SPatrick Mooney 5777c8c0b82SPatrick Mooney void 5787c8c0b82SPatrick Mooney vmmr_free(vmmr_region_t *region) 5797c8c0b82SPatrick Mooney { 5807c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 5817c8c0b82SPatrick Mooney if (!region->vr_transient) { 5827c8c0b82SPatrick Mooney VERIFY3U(region->vr_size, <=, vmmr_alloc_sz); 5837c8c0b82SPatrick Mooney } else { 5847c8c0b82SPatrick Mooney VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz); 5857c8c0b82SPatrick Mooney } 5867c8c0b82SPatrick Mooney list_remove(&vmmr_alloc_regions, region); 5877c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 5887c8c0b82SPatrick Mooney 589*6bba8b59SPatrick Mooney /* Zero 
the contents (while not monopolizing vmmr_lock) */ 5907c8c0b82SPatrick Mooney for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) { 5917c8c0b82SPatrick Mooney bzero(vmmr_region_mem_at(region, off), PAGESIZE); 5927c8c0b82SPatrick Mooney } 5937c8c0b82SPatrick Mooney 5947c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 5957c8c0b82SPatrick Mooney 5967c8c0b82SPatrick Mooney /* Put the contained span(s) back in the free pool */ 5977c8c0b82SPatrick Mooney void *cookie = NULL; 5987c8c0b82SPatrick Mooney vmmr_span_t *span; 5997c8c0b82SPatrick Mooney while ((span = avl_destroy_nodes(®ion->vr_spans, &cookie)) != NULL) { 6007c8c0b82SPatrick Mooney span->vs_region_addr = 0; 6017c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_free_tp); 6027c8c0b82SPatrick Mooney } 6037c8c0b82SPatrick Mooney avl_destroy(®ion->vr_spans); 6047c8c0b82SPatrick Mooney if (!region->vr_transient) { 6057c8c0b82SPatrick Mooney vmmr_free_sz += region->vr_size; 6067c8c0b82SPatrick Mooney vmmr_alloc_sz -= region->vr_size; 6077c8c0b82SPatrick Mooney } else { 6087c8c0b82SPatrick Mooney vmmr_free_transient_sz += region->vr_size; 6097c8c0b82SPatrick Mooney vmmr_alloc_transient_sz -= region->vr_size; 6107c8c0b82SPatrick Mooney } 6117c8c0b82SPatrick Mooney 6127c8c0b82SPatrick Mooney if (region->vr_transient) { 613e0994bd2SPatrick Mooney /* 614e0994bd2SPatrick Mooney * Since the transient capacity was previously allocated for 615e0994bd2SPatrick Mooney * this region, its removal should not fail. 
616e0994bd2SPatrick Mooney */ 617e0994bd2SPatrick Mooney VERIFY0(vmmr_remove(region->vr_size, true)); 6187c8c0b82SPatrick Mooney } 6197c8c0b82SPatrick Mooney kmem_free(region, sizeof (*region)); 620*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 6217c8c0b82SPatrick Mooney } 6227c8c0b82SPatrick Mooney 6237c8c0b82SPatrick Mooney static void 6247c8c0b82SPatrick Mooney vmmr_destroy_pages(vmmr_span_t *span) 6257c8c0b82SPatrick Mooney { 6267c8c0b82SPatrick Mooney const uintptr_t end = span->vs_addr + span->vs_size; 6277c8c0b82SPatrick Mooney struct vnode *vp = &kvps[KV_VVP]; 6287c8c0b82SPatrick Mooney for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) { 6297c8c0b82SPatrick Mooney page_t *pp; 6307c8c0b82SPatrick Mooney 6317c8c0b82SPatrick Mooney /* Page-free logic cribbed from segkmem_xfree(): */ 6327c8c0b82SPatrick Mooney pp = page_find(vp, (u_offset_t)pos); 6337c8c0b82SPatrick Mooney VERIFY(pp != NULL); 6347c8c0b82SPatrick Mooney if (!page_tryupgrade(pp)) { 6357c8c0b82SPatrick Mooney /* 6367c8c0b82SPatrick Mooney * Some other thread has a sharelock. Wait for 6377c8c0b82SPatrick Mooney * it to drop the lock so we can free this page. 6387c8c0b82SPatrick Mooney */ 6397c8c0b82SPatrick Mooney page_unlock(pp); 6407c8c0b82SPatrick Mooney pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL); 6417c8c0b82SPatrick Mooney } 6427c8c0b82SPatrick Mooney 6437c8c0b82SPatrick Mooney /* 6447c8c0b82SPatrick Mooney * Clear p_lckcnt so page_destroy() doesn't update availrmem. 6457c8c0b82SPatrick Mooney * That will be taken care of later via page_unresv(). 
6467c8c0b82SPatrick Mooney */ 6477c8c0b82SPatrick Mooney pp->p_lckcnt = 0; 6487c8c0b82SPatrick Mooney page_destroy(pp, 0); 6497c8c0b82SPatrick Mooney } 6507c8c0b82SPatrick Mooney } 6517c8c0b82SPatrick Mooney 6527c8c0b82SPatrick Mooney static int 6537c8c0b82SPatrick Mooney vmmr_alloc_pages(const vmmr_span_t *span) 6547c8c0b82SPatrick Mooney { 6557c8c0b82SPatrick Mooney struct seg kseg = { 6567c8c0b82SPatrick Mooney .s_as = &kas 6577c8c0b82SPatrick Mooney }; 6587c8c0b82SPatrick Mooney struct vnode *vp = &kvps[KV_VVP]; 6597c8c0b82SPatrick Mooney 6607c8c0b82SPatrick Mooney const uintptr_t end = span->vs_addr + span->vs_size; 6617c8c0b82SPatrick Mooney for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) { 6627c8c0b82SPatrick Mooney page_t *pp; 6637c8c0b82SPatrick Mooney 6647c8c0b82SPatrick Mooney pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE, 6657c8c0b82SPatrick Mooney PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos)); 6667c8c0b82SPatrick Mooney 6677c8c0b82SPatrick Mooney if (pp == NULL) { 6687c8c0b82SPatrick Mooney /* Destroy any already-created pages */ 6697c8c0b82SPatrick Mooney if (pos != span->vs_addr) { 6707c8c0b82SPatrick Mooney vmmr_span_t destroy_span = { 6717c8c0b82SPatrick Mooney .vs_addr = span->vs_addr, 6727c8c0b82SPatrick Mooney .vs_size = pos - span->vs_addr, 6737c8c0b82SPatrick Mooney }; 6747c8c0b82SPatrick Mooney 6757c8c0b82SPatrick Mooney vmmr_destroy_pages(&destroy_span); 6767c8c0b82SPatrick Mooney } 6777c8c0b82SPatrick Mooney return (ENOMEM); 6787c8c0b82SPatrick Mooney } 6797c8c0b82SPatrick Mooney 6807c8c0b82SPatrick Mooney /* mimic page state from segkmem */ 6817c8c0b82SPatrick Mooney ASSERT(PAGE_EXCL(pp)); 6827c8c0b82SPatrick Mooney page_io_unlock(pp); 6837c8c0b82SPatrick Mooney pp->p_lckcnt = 1; 6847c8c0b82SPatrick Mooney page_downgrade(pp); 6857c8c0b82SPatrick Mooney 6867c8c0b82SPatrick Mooney /* pre-zero the page */ 6877c8c0b82SPatrick Mooney bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE); 6887c8c0b82SPatrick Mooney } 
6897c8c0b82SPatrick Mooney 6907c8c0b82SPatrick Mooney return (0); 6917c8c0b82SPatrick Mooney } 6927c8c0b82SPatrick Mooney 6937c8c0b82SPatrick Mooney static int 6947c8c0b82SPatrick Mooney vmmr_resv_wait() 6957c8c0b82SPatrick Mooney { 6967c8c0b82SPatrick Mooney if (delay_sig(hz >> 2) != 0) { 6977c8c0b82SPatrick Mooney /* bail due to interruption */ 6987c8c0b82SPatrick Mooney return (0); 6997c8c0b82SPatrick Mooney } 7007c8c0b82SPatrick Mooney return (1); 7017c8c0b82SPatrick Mooney } 7027c8c0b82SPatrick Mooney 7037c8c0b82SPatrick Mooney static void 7047c8c0b82SPatrick Mooney vmmr_remove_raw(size_t sz) 7057c8c0b82SPatrick Mooney { 7067c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0); 7077c8c0b82SPatrick Mooney VERIFY(MUTEX_HELD(&vmmr_lock)); 7087c8c0b82SPatrick Mooney 7097c8c0b82SPatrick Mooney size_t remain = sz; 7107c8c0b82SPatrick Mooney while (remain > 0) { 7117c8c0b82SPatrick Mooney vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp); 7127c8c0b82SPatrick Mooney 7137c8c0b82SPatrick Mooney /* 7147c8c0b82SPatrick Mooney * The caller must ensure that at least `sz` amount is present 7157c8c0b82SPatrick Mooney * in the free treepair. 7167c8c0b82SPatrick Mooney */ 7177c8c0b82SPatrick Mooney VERIFY3P(span, !=, NULL); 7187c8c0b82SPatrick Mooney ASSERT3U(span->vs_size, <=, remain); 7197c8c0b82SPatrick Mooney 7207c8c0b82SPatrick Mooney /* TODO: perhaps arrange to destroy pages outside the lock? */ 7217c8c0b82SPatrick Mooney vmmr_destroy_pages(span); 7227c8c0b82SPatrick Mooney 7237c8c0b82SPatrick Mooney remain -= span->vs_size; 7247c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_empty_tp); 7257c8c0b82SPatrick Mooney } 7267c8c0b82SPatrick Mooney 7277c8c0b82SPatrick Mooney vmmr_empty_sz += sz; 7287c8c0b82SPatrick Mooney } 7297c8c0b82SPatrick Mooney 730*6bba8b59SPatrick Mooney /* 731*6bba8b59SPatrick Mooney * Add memory to vmm reservoir. 
Memory may be marked for transient use, where 732*6bba8b59SPatrick Mooney * the addition is part of a transient allocation from the reservoir. Otherwise 733*6bba8b59SPatrick Mooney * it is placed in the reservoir to be available for non-transient allocations. 734*6bba8b59SPatrick Mooney * 735*6bba8b59SPatrick Mooney * Expects vmmr_lock to be held when called, and will return with it held, but 736*6bba8b59SPatrick Mooney * will drop it during portions of the addition. 737*6bba8b59SPatrick Mooney */ 738*6bba8b59SPatrick Mooney static int 7397c8c0b82SPatrick Mooney vmmr_add(size_t sz, bool transient) 7407c8c0b82SPatrick Mooney { 7417c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0); 742*6bba8b59SPatrick Mooney VERIFY3U(sz, >, 0); 743*6bba8b59SPatrick Mooney VERIFY(MUTEX_HELD(&vmmr_lock)); 7447c8c0b82SPatrick Mooney 7457c8c0b82SPatrick Mooney /* 7467c8c0b82SPatrick Mooney * Make sure that the amount added is not going to breach the limits 7477c8c0b82SPatrick Mooney * we've chosen 7487c8c0b82SPatrick Mooney */ 7497c8c0b82SPatrick Mooney const size_t current_total = 7507c8c0b82SPatrick Mooney vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz + 7517c8c0b82SPatrick Mooney vmmr_alloc_transient_sz + vmmr_free_transient_sz; 7527c8c0b82SPatrick Mooney if ((current_total + sz) < current_total) { 7537c8c0b82SPatrick Mooney return (EOVERFLOW); 7547c8c0b82SPatrick Mooney } 7557c8c0b82SPatrick Mooney if ((current_total + sz) > vmmr_total_limit) { 7567c8c0b82SPatrick Mooney return (ENOSPC); 7577c8c0b82SPatrick Mooney } 7587c8c0b82SPatrick Mooney vmmr_adding_sz += sz; 7597c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 7607c8c0b82SPatrick Mooney 7617c8c0b82SPatrick Mooney /* Wait for enough pages to become available */ 7627c8c0b82SPatrick Mooney if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) { 7637c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 7647c8c0b82SPatrick Mooney vmmr_adding_sz -= sz; 7657c8c0b82SPatrick Mooney return (EINTR); 7667c8c0b82SPatrick Mooney } 
7677c8c0b82SPatrick Mooney 7687c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 7697c8c0b82SPatrick Mooney size_t added = 0; 7707c8c0b82SPatrick Mooney size_t remain = sz; 7717c8c0b82SPatrick Mooney while (added < sz) { 7727c8c0b82SPatrick Mooney vmmr_span_t *span = NULL; 7737c8c0b82SPatrick Mooney 7747c8c0b82SPatrick Mooney if (vmmr_empty_sz > 0) { 7757c8c0b82SPatrick Mooney span = vmmr_tp_remove_split(remain, &vmmr_empty_tp); 7767c8c0b82SPatrick Mooney 7777c8c0b82SPatrick Mooney vmmr_empty_sz -= span->vs_size; 7787c8c0b82SPatrick Mooney } else { 7797c8c0b82SPatrick Mooney /* 7807c8c0b82SPatrick Mooney * No empty space to fill with new pages, so just tack 7817c8c0b82SPatrick Mooney * it on at the end instead. 7827c8c0b82SPatrick Mooney */ 7837c8c0b82SPatrick Mooney span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP); 7847c8c0b82SPatrick Mooney span->vs_addr = vmmr_empty_last; 7857c8c0b82SPatrick Mooney span->vs_size = remain; 7867c8c0b82SPatrick Mooney vmmr_empty_last += remain; 7877c8c0b82SPatrick Mooney } 7887c8c0b82SPatrick Mooney VERIFY3P(span, !=, NULL); 7897c8c0b82SPatrick Mooney 7907c8c0b82SPatrick Mooney 7917c8c0b82SPatrick Mooney /* Allocate the actual pages to back this span */ 7927c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 7937c8c0b82SPatrick Mooney int err = vmmr_alloc_pages(span); 7947c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 7957c8c0b82SPatrick Mooney 7967c8c0b82SPatrick Mooney /* 7977c8c0b82SPatrick Mooney * If an error is encountered during page allocation for the 7987c8c0b82SPatrick Mooney * span, unwind any progress made by the addition request. 7997c8c0b82SPatrick Mooney */ 8007c8c0b82SPatrick Mooney if (err != 0) { 8017c8c0b82SPatrick Mooney /* 8027c8c0b82SPatrick Mooney * Without pages allocated to this span, it is now 8037c8c0b82SPatrick Mooney * tracked as empty. 
8047c8c0b82SPatrick Mooney */ 8057c8c0b82SPatrick Mooney vmmr_empty_sz += span->vs_size; 8067c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_empty_tp); 8077c8c0b82SPatrick Mooney 8087c8c0b82SPatrick Mooney if (added != 0) { 8097c8c0b82SPatrick Mooney vmmr_remove_raw(added); 8107c8c0b82SPatrick Mooney } 8117c8c0b82SPatrick Mooney 8127c8c0b82SPatrick Mooney vmmr_adding_sz -= sz; 8137c8c0b82SPatrick Mooney 8147c8c0b82SPatrick Mooney page_unresv(sz >> PAGESHIFT); 8157c8c0b82SPatrick Mooney return (err); 8167c8c0b82SPatrick Mooney } 8177c8c0b82SPatrick Mooney 8187c8c0b82SPatrick Mooney /* 8197c8c0b82SPatrick Mooney * The allocated-page-bearing span is placed in the "free" 8207c8c0b82SPatrick Mooney * treepair now, but is not officially exposed for consumption 8217c8c0b82SPatrick Mooney * until `vmm_free_sz` or `vmm_free_transient_sz` are updated. 8227c8c0b82SPatrick Mooney * 8237c8c0b82SPatrick Mooney * This allows us to unwind the allocation in case of a failure 8247c8c0b82SPatrick Mooney * without the risk of the freshly added span(s) being snapped 8257c8c0b82SPatrick Mooney * up by a consumer already. 
8267c8c0b82SPatrick Mooney */ 8277c8c0b82SPatrick Mooney added += span->vs_size; 8287c8c0b82SPatrick Mooney remain -= span->vs_size; 8297c8c0b82SPatrick Mooney vmmr_tp_insert_concat(span, &vmmr_free_tp); 8307c8c0b82SPatrick Mooney } 8317c8c0b82SPatrick Mooney 8327c8c0b82SPatrick Mooney /* Make the added memory usable by exposing it to the size accounting */ 8337c8c0b82SPatrick Mooney if (!transient) { 8347c8c0b82SPatrick Mooney vmmr_free_sz += added; 8357c8c0b82SPatrick Mooney } else { 8367c8c0b82SPatrick Mooney vmmr_free_transient_sz += added; 8377c8c0b82SPatrick Mooney } 8387c8c0b82SPatrick Mooney ASSERT3U(added, ==, sz); 8397c8c0b82SPatrick Mooney vmmr_adding_sz -= added; 8407c8c0b82SPatrick Mooney 8417c8c0b82SPatrick Mooney return (0); 8427c8c0b82SPatrick Mooney } 8437c8c0b82SPatrick Mooney 844*6bba8b59SPatrick Mooney /* 845*6bba8b59SPatrick Mooney * Remove memory from vmm reservoir. Normally this will remove memory from the 846*6bba8b59SPatrick Mooney * reservoir which was available for non-transient allocations. If the removal 847*6bba8b59SPatrick Mooney * is part of a vmmr_free() of a transient allocation, it will act on only that 848*6bba8b59SPatrick Mooney * transient region being freed, not the available memory in the reservoir. 849*6bba8b59SPatrick Mooney * 850*6bba8b59SPatrick Mooney * Expects vmmr_lock to be held when called, and will return with it held, but 851*6bba8b59SPatrick Mooney * may drop it during portions of the removal. 
852*6bba8b59SPatrick Mooney */ 853*6bba8b59SPatrick Mooney static int 8547c8c0b82SPatrick Mooney vmmr_remove(size_t sz, bool transient) 8557c8c0b82SPatrick Mooney { 8567c8c0b82SPatrick Mooney VERIFY3U(sz & PAGEOFFSET, ==, 0); 857*6bba8b59SPatrick Mooney VERIFY(sz); 858*6bba8b59SPatrick Mooney VERIFY(MUTEX_HELD(&vmmr_lock)); 8597c8c0b82SPatrick Mooney 8607c8c0b82SPatrick Mooney if ((!transient && sz > vmmr_free_sz) || 8617c8c0b82SPatrick Mooney (transient && sz > vmmr_free_transient_sz)) { 8627c8c0b82SPatrick Mooney return (ENOSPC); 8637c8c0b82SPatrick Mooney } 8647c8c0b82SPatrick Mooney 8657c8c0b82SPatrick Mooney vmmr_remove_raw(sz); 8667c8c0b82SPatrick Mooney 8677c8c0b82SPatrick Mooney if (!transient) { 8687c8c0b82SPatrick Mooney vmmr_free_sz -= sz; 8697c8c0b82SPatrick Mooney } else { 8707c8c0b82SPatrick Mooney vmmr_free_transient_sz -= sz; 8717c8c0b82SPatrick Mooney } 8727c8c0b82SPatrick Mooney page_unresv(sz >> PAGESHIFT); 8737c8c0b82SPatrick Mooney return (0); 8747c8c0b82SPatrick Mooney } 8757c8c0b82SPatrick Mooney 876*6bba8b59SPatrick Mooney static int 877*6bba8b59SPatrick Mooney vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp) 878*6bba8b59SPatrick Mooney { 879*6bba8b59SPatrick Mooney VERIFY(resp != NULL); 880*6bba8b59SPatrick Mooney 881*6bba8b59SPatrick Mooney mutex_enter(&vmmr_lock); 882*6bba8b59SPatrick Mooney 883*6bba8b59SPatrick Mooney size_t current_sz = vmmr_alloc_sz + vmmr_free_sz; 884*6bba8b59SPatrick Mooney 885*6bba8b59SPatrick Mooney /* Be sure to communicate current size in case of an early bail-out */ 886*6bba8b59SPatrick Mooney *resp = current_sz; 887*6bba8b59SPatrick Mooney 888*6bba8b59SPatrick Mooney if ((target_sz & PAGEOFFSET) != 0 || 889*6bba8b59SPatrick Mooney (chunk_sz & PAGEOFFSET) != 0) { 890*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 891*6bba8b59SPatrick Mooney return (EINVAL); 892*6bba8b59SPatrick Mooney } 893*6bba8b59SPatrick Mooney /* Reject sentinel value */ 894*6bba8b59SPatrick Mooney if (target_sz == 
VMMR_TARGET_INACTIVE) { 895*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 896*6bba8b59SPatrick Mooney return (EINVAL); 897*6bba8b59SPatrick Mooney } 898*6bba8b59SPatrick Mooney 899*6bba8b59SPatrick Mooney /* Already at target size */ 900*6bba8b59SPatrick Mooney if (target_sz == current_sz) { 901*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 902*6bba8b59SPatrick Mooney return (0); 903*6bba8b59SPatrick Mooney } 904*6bba8b59SPatrick Mooney 905*6bba8b59SPatrick Mooney /* Reject racing requests size */ 906*6bba8b59SPatrick Mooney if (vmmr_target_sz != VMMR_TARGET_INACTIVE) { 907*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 908*6bba8b59SPatrick Mooney return (EALREADY); 909*6bba8b59SPatrick Mooney } 910*6bba8b59SPatrick Mooney /* Record the target now to excluding a racing request */ 911*6bba8b59SPatrick Mooney vmmr_target_sz = target_sz; 912*6bba8b59SPatrick Mooney 913*6bba8b59SPatrick Mooney int err = 0; 914*6bba8b59SPatrick Mooney do { 915*6bba8b59SPatrick Mooney /* Be sensitive to signal interruption */ 916*6bba8b59SPatrick Mooney if (issig(JUSTLOOKING) != 0) { 917*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 918*6bba8b59SPatrick Mooney const bool sig_bail = issig(FORREAL) != 0; 919*6bba8b59SPatrick Mooney mutex_enter(&vmmr_lock); 920*6bba8b59SPatrick Mooney if (sig_bail) { 921*6bba8b59SPatrick Mooney err = EINTR; 922*6bba8b59SPatrick Mooney break; 923*6bba8b59SPatrick Mooney } 924*6bba8b59SPatrick Mooney } 925*6bba8b59SPatrick Mooney 926*6bba8b59SPatrick Mooney if (current_sz > target_sz) { 927*6bba8b59SPatrick Mooney /* Shrinking reservoir */ 928*6bba8b59SPatrick Mooney 929*6bba8b59SPatrick Mooney size_t req_sz = current_sz - target_sz; 930*6bba8b59SPatrick Mooney if (chunk_sz != 0) { 931*6bba8b59SPatrick Mooney req_sz = MIN(req_sz, chunk_sz); 932*6bba8b59SPatrick Mooney } 933*6bba8b59SPatrick Mooney err = vmmr_remove(req_sz, false); 934*6bba8b59SPatrick Mooney } else { 935*6bba8b59SPatrick Mooney /* Growing reservoir */ 936*6bba8b59SPatrick Mooney 
ASSERT(current_sz < target_sz); 937*6bba8b59SPatrick Mooney 938*6bba8b59SPatrick Mooney size_t req_sz = target_sz - current_sz; 939*6bba8b59SPatrick Mooney if (chunk_sz != 0) { 940*6bba8b59SPatrick Mooney req_sz = MIN(req_sz, chunk_sz); 941*6bba8b59SPatrick Mooney } 942*6bba8b59SPatrick Mooney err = vmmr_add(req_sz, false); 943*6bba8b59SPatrick Mooney } 944*6bba8b59SPatrick Mooney 945*6bba8b59SPatrick Mooney current_sz = vmmr_alloc_sz + vmmr_free_sz; 946*6bba8b59SPatrick Mooney } while (err == 0 && current_sz != target_sz); 947*6bba8b59SPatrick Mooney 948*6bba8b59SPatrick Mooney /* Clear the target now that we are done (success or not) */ 949*6bba8b59SPatrick Mooney vmmr_target_sz = VMMR_TARGET_INACTIVE; 950*6bba8b59SPatrick Mooney mutex_exit(&vmmr_lock); 951*6bba8b59SPatrick Mooney *resp = current_sz; 952*6bba8b59SPatrick Mooney return (err); 953*6bba8b59SPatrick Mooney } 954*6bba8b59SPatrick Mooney 9557c8c0b82SPatrick Mooney int 9567c8c0b82SPatrick Mooney vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 9577c8c0b82SPatrick Mooney { 958*6bba8b59SPatrick Mooney /* 959*6bba8b59SPatrick Mooney * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we 960*6bba8b59SPatrick Mooney * do not need to duplicate such checks here. 
961*6bba8b59SPatrick Mooney */ 962*6bba8b59SPatrick Mooney 9637c8c0b82SPatrick Mooney switch (cmd) { 9647c8c0b82SPatrick Mooney case VMM_RESV_QUERY: { 9657c8c0b82SPatrick Mooney struct vmm_resv_query res; 9667c8c0b82SPatrick Mooney void *datap = (void *)(uintptr_t)arg; 9677c8c0b82SPatrick Mooney 9687c8c0b82SPatrick Mooney /* For now, anyone in GZ can query */ 9697c8c0b82SPatrick Mooney if (crgetzoneid(cr) != GLOBAL_ZONEID) { 9707c8c0b82SPatrick Mooney return (EPERM); 9717c8c0b82SPatrick Mooney } 9727c8c0b82SPatrick Mooney mutex_enter(&vmmr_lock); 9737c8c0b82SPatrick Mooney res.vrq_free_sz = vmmr_free_sz; 9747c8c0b82SPatrick Mooney res.vrq_alloc_sz = vmmr_alloc_sz; 9757c8c0b82SPatrick Mooney res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz; 9767c8c0b82SPatrick Mooney res.vrq_limit = vmmr_total_limit; 9777c8c0b82SPatrick Mooney mutex_exit(&vmmr_lock); 9787c8c0b82SPatrick Mooney if (ddi_copyout(&res, datap, sizeof (res), md) != 0) { 9797c8c0b82SPatrick Mooney return (EFAULT); 9807c8c0b82SPatrick Mooney } 9817c8c0b82SPatrick Mooney break; 9827c8c0b82SPatrick Mooney } 983*6bba8b59SPatrick Mooney case VMM_RESV_SET_TARGET: { 9847c8c0b82SPatrick Mooney if (secpolicy_sys_config(cr, B_FALSE) != 0) { 9857c8c0b82SPatrick Mooney return (EPERM); 9867c8c0b82SPatrick Mooney } 987*6bba8b59SPatrick Mooney 988*6bba8b59SPatrick Mooney struct vmm_resv_target tgt; 989*6bba8b59SPatrick Mooney void *datap = (void *)(uintptr_t)arg; 990*6bba8b59SPatrick Mooney 991*6bba8b59SPatrick Mooney if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) { 992*6bba8b59SPatrick Mooney return (EFAULT); 9937c8c0b82SPatrick Mooney } 994*6bba8b59SPatrick Mooney 995*6bba8b59SPatrick Mooney int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz, 996*6bba8b59SPatrick Mooney &tgt.vrt_result_sz); 997*6bba8b59SPatrick Mooney 998*6bba8b59SPatrick Mooney /* 999*6bba8b59SPatrick Mooney * Attempt to communicate the resultant size of the reservoir if 1000*6bba8b59SPatrick Mooney * setting it to the target was a 
success, or if we were 1001*6bba8b59SPatrick Mooney * interrupted (by a signal) while doing so. 1002*6bba8b59SPatrick Mooney */ 1003*6bba8b59SPatrick Mooney if (err == 0 || err == EINTR) { 1004*6bba8b59SPatrick Mooney if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) { 1005*6bba8b59SPatrick Mooney err = EFAULT; 10067c8c0b82SPatrick Mooney } 1007*6bba8b59SPatrick Mooney } 1008*6bba8b59SPatrick Mooney 1009*6bba8b59SPatrick Mooney return (err); 10107c8c0b82SPatrick Mooney } 10117c8c0b82SPatrick Mooney default: 10127c8c0b82SPatrick Mooney return (ENOTTY); 10137c8c0b82SPatrick Mooney } 10147c8c0b82SPatrick Mooney return (0); 10157c8c0b82SPatrick Mooney } 1016