1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2019 Joyent, Inc.
15 * Copyright 2025 Oxide Computer Company
16 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
17 */
18
19 #include <sys/param.h>
20 #include <sys/kmem.h>
21 #include <sys/thread.h>
22 #include <sys/list.h>
23 #include <sys/mman.h>
24 #include <sys/types.h>
25 #include <sys/ddi.h>
26 #include <sys/sysmacros.h>
27 #include <sys/machsystm.h>
28 #include <sys/vmsystm.h>
29 #include <sys/x86_archext.h>
30 #include <vm/as.h>
31 #include <vm/hat_i86.h>
32 #include <vm/seg_vn.h>
33 #include <vm/seg_kmem.h>
34
35 #include <sys/vmm_vm.h>
36 #include <sys/seg_vmm.h>
37 #include <sys/vmm_kernel.h>
38 #include <sys/vmm_reservoir.h>
39 #include <sys/vmm_gpt.h>
40 #include "vmm_util.h"
41
42
43 /*
44 * VMM Virtual Memory
45 *
46 * History
47 *
48 * When bhyve was ported to illumos, one significant hole was handling guest
49 * memory and memory accesses. In the original Pluribus port, bhyve itself
50 * manually handled the EPT structures for guest memory. The updated sources
51 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
52 * system for memory allocations and management of the EPT structures. Keeping
53 * source differences to a minimum was a priority, so illumos-bhyve implemented
54 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
55 * boot and run guests.
56 *
57 * While the VM shim was successful in getting illumos-bhyve to a functional
58 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
59 * compatibility interfaces made it awkward to use. As source differences with
60 * the upstream kernel code became less of a concern, and upcoming features
61 * (such as live migration) would demand more of those VM interfaces, it became
62 * clear that an overhaul was prudent.
63 *
64 * Design
65 *
66 * The new VM system for bhyve retains a number of the same concepts as what it
67 * replaces:
68 *
69 * - `vmspace_t` is the top-level entity for a guest memory space
70 * - `vm_object_t` represents a memory object which can be mapped into a vmspace
71 * - `vm_page_t` represents a page hold within a given vmspace, providing access
72 * to the underlying memory page
73 *
74 * Unlike the old code, where most of the involved structures were exposed via
75 * public definitions, this replacement VM interface keeps all involved
76 * structures opaque to consumers. Furthermore, there is a clear delineation
77 * between infrequent administrative operations (such as mapping/unmapping
78 * regions) and common data-path operations (attempting a page hold at a given
79 * guest-physical address). Those administrative operations are performed
80 * directly against the vmspace, whereas the data-path operations are performed
81 * through a `vm_client_t` handle. That VM client abstraction is meant to
82 * reduce contention and overhead for frequent access operations and provide
83 * debugging insight into how different subcomponents are accessing the vmspace.
84 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
85 * interface) and each VMM userspace segment mapping.
86 *
87 * Exclusion
88 *
89 * Making changes to the vmspace (such as mapping or unmapping regions) requires
90 * other accessors be excluded while the change is underway to prevent them from
91 * observing invalid intermediate states. A simple approach could use a mutex
92 * or rwlock to achieve this, but that risks contention when the rate of access
93 * to the vmspace is high.
94 *
95 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
96 * on a per-vm_client_t basis. While this raises the cost for vmspace changes,
97 * it means that the much more common page accesses through the vm_client can
98 * normally proceed unimpeded and independently.
99 *
100 * When a change to the vmspace is required, the caller will put the vmspace in
101 * a 'hold' state, iterating over all associated vm_client instances, waiting
102 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
103 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on
104 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
105 * will block until the hold condition is cleared. Once the hold is asserted
106 * for all clients, the vmspace change can proceed with confidence. Upon
107 * completion of that operation, VCS_HOLD is cleared from the clients, and they
108 * are released to resume vmspace accesses.
109 *
110 * vCPU Consumers
111 *
112 * Access to the vmspace for vCPUs running in guest context is different from
113 * emulation-related vm_client activity: they solely rely on the contents of the
114 * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude
115 * client access is not feasible when entering guest context, since interrupts
116 * are disabled, making it impossible to block entry. This is not a concern as
117 * long as vmspace modifications never place the page tables in invalid states
118 * (either intermediate, or final). The vm_client hold mechanism does provide
119 * the means to IPI vCPU consumers which will trigger a notification once they
120 * report their exit from guest context. This can be used to ensure that page
121 * table modifications are made visible to those vCPUs within a certain
122 * time frame.
123 */
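/*
 * To make the data path concrete, a typical consumer might look roughly like
 * the following illustrative sketch (error handling elided; `gpa' is assumed
 * to be a page-aligned guest-physical address):
 *
 *	vm_client_t *vmc = vmspace_client_alloc(vms);
 *	vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		void *va = vmp_get_writable(vmp);
 *		...perform the emulated access through `va'...
 *		(void) vmp_release(vmp);
 *	}
 *	vmc_destroy(vmc);
 */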
124
125 typedef struct vmspace_mapping {
126 list_node_t vmsm_node;
127 vm_object_t *vmsm_object; /* object backing this mapping */
128 uintptr_t vmsm_addr; /* start addr in vmspace for mapping */
129 size_t vmsm_len; /* length (in bytes) of mapping */
130 off_t vmsm_offset; /* byte offset into object */
131 uint_t vmsm_prot;
132 } vmspace_mapping_t;
133
134 #define VMSM_OFFSET(vmsm, addr) ( \
135 (vmsm)->vmsm_offset + \
136 ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
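/*
 * For example (hypothetical numbers): a mapping with vmsm_addr = 0x100000000
 * and vmsm_offset = 0x1000, queried at addr = 0x100002000, yields an object
 * offset of 0x3000.
 */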
137
138 typedef enum vm_client_state {
139 VCS_IDLE = 0,
140 /* currently accessing vmspace for client operation (hold or fault) */
141 VCS_ACTIVE = (1 << 0),
142 /* client hold requested/asserted */
143 VCS_HOLD = (1 << 1),
144 /* vCPU is accessing page tables in guest context */
145 VCS_ON_CPU = (1 << 2),
146 /* client has been orphaned (no more access to vmspace) */
147 VCS_ORPHANED = (1 << 3),
148 /* client undergoing destroy operation */
149 VCS_DESTROY = (1 << 4),
150 } vm_client_state_t;
151
152 struct vmspace {
153 kmutex_t vms_lock;
154 kcondvar_t vms_cv;
155 bool vms_held;
156 uintptr_t vms_size; /* immutable after creation */
157
158 /* (nested) page table state */
159 vmm_gpt_t *vms_gpt;
160 uint64_t vms_pt_gen;
161 uint64_t vms_pages_mapped;
162 bool vms_track_dirty;
163
164 list_t vms_maplist;
165 list_t vms_clients;
166 };
167
168 struct vm_client {
169 vmspace_t *vmc_space;
170 list_node_t vmc_node;
171
172 kmutex_t vmc_lock;
173 kcondvar_t vmc_cv;
174 vm_client_state_t vmc_state;
175 int vmc_cpu_active;
176 uint64_t vmc_cpu_gen;
177 bool vmc_track_dirty;
178 vmc_inval_cb_t vmc_inval_func;
179 void *vmc_inval_data;
180
181 list_t vmc_held_pages;
182 };
183
184 typedef enum vm_object_type {
185 VMOT_NONE,
186 VMOT_MEM,
187 VMOT_MMIO,
188 } vm_object_type_t;
189
190 struct vm_object {
191 uint_t vmo_refcnt; /* manipulated with atomic ops */
192
193 /* Fields below are fixed at creation time */
194 vm_object_type_t vmo_type;
195 size_t vmo_size;
196 void *vmo_data;
197 uint8_t vmo_attr;
198 };
199
200 /* Convenience consolidation of all flag(s) for validity checking */
201 #define VPF_ALL (VPF_DEFER_DIRTY)
202
203 struct vm_page {
204 vm_client_t *vmp_client;
205 list_node_t vmp_node;
206 vm_page_t *vmp_chain;
207 uintptr_t vmp_gpa;
208 pfn_t vmp_pfn;
209 uint64_t *vmp_ptep;
210 vm_object_t *vmp_obj_ref;
211 uint8_t vmp_prot;
212 uint8_t vmp_flags;
213 };
214
215 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
216 static void vmspace_hold_enter(vmspace_t *);
217 static void vmspace_hold_exit(vmspace_t *, bool);
218 static void vmspace_clients_invalidate(vmspace_t *, uintptr_t, size_t);
219 static int vmspace_ensure_mapped(vmspace_t *, uintptr_t, int, pfn_t *,
220 uint64_t *);
221 static void vmc_space_hold(vm_client_t *);
222 static void vmc_space_release(vm_client_t *, bool);
223 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
224 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
225 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
226
227 bool
228 vmm_vm_init(void)
229 {
230 if (vmm_is_intel()) {
231 extern struct vmm_pte_impl ept_pte_impl;
232 return (vmm_gpt_init(&ept_pte_impl));
233 } else if (vmm_is_svm()) {
234 extern struct vmm_pte_impl rvi_pte_impl;
235 return (vmm_gpt_init(&rvi_pte_impl));
236 } else {
237 /* Caller should have already rejected other vendors */
238 panic("Unexpected hypervisor hardware vendor");
239 }
240 }
241
242 void
243 vmm_vm_fini(void)
244 {
245 vmm_gpt_fini();
246 }
247
248 /*
249 * Create a new vmspace with a maximum address of `end`.
250 */
251 vmspace_t *
252 vmspace_alloc(size_t end)
253 {
254 vmspace_t *vms;
255 const uintptr_t size = end + 1;
256
257 /*
258 * This whole mess is built on the assumption that a 64-bit address
259 * space is available to work with for the various pagetable tricks.
260 */
261 VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
262 size <= (uintptr_t)USERLIMIT);
263
264 vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
265 vms->vms_size = size;
266 list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
267 offsetof(vmspace_mapping_t, vmsm_node));
268 list_create(&vms->vms_clients, sizeof (vm_client_t),
269 offsetof(vm_client_t, vmc_node));
270
271 vms->vms_gpt = vmm_gpt_alloc();
272 vms->vms_pt_gen = 1;
273 vms->vms_track_dirty = false;
274
275 return (vms);
276 }
277
278 /*
279 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining
280 * clients will be orphaned.
281 */
282 void
283 vmspace_destroy(vmspace_t *vms)
284 {
285 mutex_enter(&vms->vms_lock);
286 VERIFY(list_is_empty(&vms->vms_maplist));
287
288 if (!list_is_empty(&vms->vms_clients)) {
289 vm_client_t *vmc = list_head(&vms->vms_clients);
290 while (vmc != NULL) {
291 vmc = vmc_space_orphan(vmc, vms);
292 }
293 /*
294 * Wait for any clients which were in the process of destroying
295 * themselves to disappear.
296 */
297 while (!list_is_empty(&vms->vms_clients)) {
298 cv_wait(&vms->vms_cv, &vms->vms_lock);
299 }
300 }
301 VERIFY(list_is_empty(&vms->vms_clients));
302
303 vmm_gpt_free(vms->vms_gpt);
304 mutex_exit(&vms->vms_lock);
305
306 mutex_destroy(&vms->vms_lock);
307 cv_destroy(&vms->vms_cv);
308 list_destroy(&vms->vms_maplist);
309 list_destroy(&vms->vms_clients);
310
311 kmem_free(vms, sizeof (*vms));
312 }
313
314 /*
315 * Retrieve the count of resident (mapped into the page tables) pages.
316 */
317 uint64_t
318 vmspace_resident_count(vmspace_t *vms)
319 {
320 return (vms->vms_pages_mapped);
321 }
322
323 /*
324 * Perform an operation on the status (accessed/dirty) bits held in the page
325 * tables of this vmspace.
326 *
327 * Such manipulations race against both hardware writes (from running vCPUs) and
328 * emulated accesses reflected from userspace. Safe functionality depends on
329 * the VM instance being read-locked to prevent vmspace_map/vmspace_unmap
330 * operations from changing the page tables during the walk.
331 */
332 void
333 vmspace_bits_operate(vmspace_t *vms, const uint64_t gpa, size_t len,
334 vmspace_bit_oper_t oper, uint8_t *bitmap)
335 {
336 const bool bit_input = (oper & VBO_FLAG_BITMAP_IN) != 0;
337 const bool bit_output = (oper & VBO_FLAG_BITMAP_OUT) != 0;
338 const vmspace_bit_oper_t oper_only =
339 oper & ~(VBO_FLAG_BITMAP_IN | VBO_FLAG_BITMAP_OUT);
340 vmm_gpt_t *gpt = vms->vms_gpt;
341
342 /*
343 * The bitmap cannot be NULL if the requested operation involves reading
344 * or writing from it.
345 */
346 ASSERT(bitmap != NULL || (!bit_input && !bit_output));
347
348 vmm_gpt_iter_t iter;
349 vmm_gpt_iter_entry_t entry;
350 vmm_gpt_iter_init(&iter, gpt, gpa, len);
351
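/*
 * The bitmap carries one bit per page of the [gpa, gpa + len) range, packed
 * LSB-first: page N of the range maps to bit (N % 8) of byte (N / 8).
 */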
352 while (vmm_gpt_iter_next(&iter, &entry)) {
353 const size_t offset = (entry.vgie_gpa - gpa);
354 const uint64_t pfn_offset = offset >> PAGESHIFT;
355 const size_t bit_offset = pfn_offset / 8;
356 const uint8_t bit_mask = 1 << (pfn_offset % 8);
357
358 if (bit_input && (bitmap[bit_offset] & bit_mask) == 0) {
359 continue;
360 }
361
362 bool value = false;
363 uint64_t *ptep = entry.vgie_ptep;
364 if (ptep == NULL) {
365 if (bit_output) {
366 bitmap[bit_offset] &= ~bit_mask;
367 }
368 continue;
369 }
370
371 switch (oper_only) {
372 case VBO_GET_DIRTY:
373 value = vmm_gpte_query_dirty(ptep);
374 break;
375 case VBO_SET_DIRTY: {
376 uint_t prot = 0;
377 bool present_writable = false;
378 pfn_t pfn;
379
380 /*
381 * To avoid blindly setting the dirty bit on otherwise
382 * empty PTEs, we must first check if the entry for the
383 * address in question has been populated.
384 *
385 * Only if the page is marked both Present and Writable
386 * will we permit the dirty bit to be set.
387 */
388 if (!vmm_gpte_is_mapped(ptep, &pfn, &prot)) {
389 int err = vmspace_ensure_mapped(vms,
390 entry.vgie_gpa, PROT_WRITE, &pfn, ptep);
391 if (err == 0) {
392 present_writable = true;
393 }
394 } else if ((prot & PROT_WRITE) != 0) {
395 present_writable = true;
396 }
397
398 if (present_writable) {
399 value = !vmm_gpte_reset_dirty(ptep, true);
400 }
401 break;
402 }
403 case VBO_RESET_DIRTY:
404 /*
405 * Although at first glance, it may seem like the act of
406 * resetting the dirty bit may require the same care as
407 * setting it, the constraints make for a simpler task.
408 *
409 * Any PTEs with the dirty bit set will have already
410 * been properly populated.
411 */
412 value = vmm_gpte_reset_dirty(ptep, false);
413 break;
414 default:
415 panic("unrecognized operator: %d", oper_only);
416 break;
417 }
418 if (bit_output) {
419 if (value) {
420 bitmap[bit_offset] |= bit_mask;
421 } else {
422 bitmap[bit_offset] &= ~bit_mask;
423 }
424 }
425 }
426
427 /*
428 * Invalidate the address range potentially affected by the changes to
429 * page table bits, issuing shoot-downs for those who might have it in
430 * cache.
431 */
432 vmspace_hold_enter(vms);
433 vms->vms_pt_gen++;
434 vmspace_clients_invalidate(vms, gpa, len);
435 vmspace_hold_exit(vms, true);
436 }
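/*
 * As an illustrative sketch (not a definitive consumer), harvesting and
 * clearing the dirty state of a region, with dirty tracking enabled via
 * vmspace_set_tracking() and the flags used above, might look like:
 *
 *	uint8_t *bitmap = kmem_zalloc(howmany(len / PAGESIZE, 8), KM_SLEEP);
 *	vmspace_bits_operate(vms, gpa, len,
 *	    VBO_RESET_DIRTY | VBO_FLAG_BITMAP_OUT, bitmap);
 *
 * after which each set bit identifies a page dirtied since the previous
 * reset.  Per the locking note above, the caller is expected to hold the VM
 * instance read-locked across the call.
 */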
437
438 /*
439 * Is dirty-page-tracking enabled for the vmspace?
440 */
441 bool
442 vmspace_get_tracking(vmspace_t *vms)
443 {
444 mutex_enter(&vms->vms_lock);
445 const bool val = vms->vms_track_dirty;
446 mutex_exit(&vms->vms_lock);
447 return (val);
448 }
449
450 /*
451 * Set the state (enabled/disabled) of dirty-page-tracking for the vmspace.
452 */
453 int
454 vmspace_set_tracking(vmspace_t *vms, bool enable_dirty_tracking)
455 {
456 if (enable_dirty_tracking && !vmm_gpt_can_track_dirty(vms->vms_gpt)) {
457 /* Do not allow this to be set if it is not supported */
458 return (ENOTSUP);
459 }
460
461 vmspace_hold_enter(vms);
462 if (vms->vms_track_dirty == enable_dirty_tracking) {
463 /* No further effort required if state already matches */
464 vmspace_hold_exit(vms, false);
465 return (0);
466 }
467
468 vms->vms_track_dirty = enable_dirty_tracking;
469
470 /* Configure all existing clients for new tracking behavior */
471 for (vm_client_t *vmc = list_head(&vms->vms_clients);
472 vmc != NULL;
473 vmc = list_next(&vms->vms_clients, vmc)) {
474 mutex_enter(&vmc->vmc_lock);
475 vmc->vmc_track_dirty = enable_dirty_tracking;
476 mutex_exit(&vmc->vmc_lock);
477 }
478
479 /*
480 * Notify all clients of what is considered an invalidation of the
481 * entire vmspace.
482 */
483 vms->vms_pt_gen++;
484 vmspace_clients_invalidate(vms, 0, vms->vms_size);
485
486 vmspace_hold_exit(vms, true);
487 return (0);
488 }
489
490 static pfn_t
491 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
492 {
493 vmmr_region_t *region;
494 pfn_t pfn;
495
496 ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
497
498 region = vmo->vmo_data;
499 pfn = vmmr_region_pfn_at(region, off);
500
501 return (pfn);
502 }
503
504 static pfn_t
505 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
506 {
507 pfn_t pfn;
508
509 ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
510 ASSERT3P(vmo->vmo_data, !=, NULL);
511 ASSERT3U(off, <, vmo->vmo_size);
512
513 pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
514
515 return (pfn);
516 }
517
518 /*
519 * Allocate a VM object backed by VMM reservoir memory.
520 */
521 vm_object_t *
522 vm_object_mem_allocate(size_t size, bool transient)
523 {
524 int err;
525 vmmr_region_t *region = NULL;
526 vm_object_t *vmo;
527
528 ASSERT3U(size, !=, 0);
529 ASSERT3U(size & PAGEOFFSET, ==, 0);
530
531 err = vmmr_alloc(size, transient, &region);
532 if (err != 0) {
533 return (NULL);
534 }
535
536 vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
537
538 /* For now, these are to stay fixed after allocation */
539 vmo->vmo_type = VMOT_MEM;
540 vmo->vmo_size = size;
541 vmo->vmo_attr = MTRR_TYPE_WB;
542 vmo->vmo_data = region;
543 vmo->vmo_refcnt = 1;
544
545 return (vmo);
546 }
547
548 static vm_object_t *
549 vm_object_mmio_allocate(size_t size, uintptr_t hpa)
550 {
551 vm_object_t *vmo;
552
553 ASSERT3U(size, !=, 0);
554 ASSERT3U(size & PAGEOFFSET, ==, 0);
555 ASSERT3U(hpa & PAGEOFFSET, ==, 0);
556
557 vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
558
559 /* For now, these are to stay fixed after allocation */
560 vmo->vmo_type = VMOT_MMIO;
561 vmo->vmo_size = size;
562 vmo->vmo_attr = MTRR_TYPE_UC;
563 vmo->vmo_data = (void *)hpa;
564 vmo->vmo_refcnt = 1;
565
566 return (vmo);
567 }
568
569 /*
570 * Allocate a VM object backed by an existing range of physical memory.
571 */
572 vm_object_t *
573 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
574 {
575 int error;
576 vm_object_t *obj;
577
578 obj = vm_object_mmio_allocate(len, hpa);
579 if (obj != NULL) {
580 error = vmspace_map(vmspace, obj, 0, gpa, len,
581 PROT_READ | PROT_WRITE);
582 if (error != 0) {
583 vm_object_release(obj);
584 obj = NULL;
585 }
586 }
587
588 return (obj);
589 }
590
591 /*
592 * Release a vm_object reference
593 */
594 void
595 vm_object_release(vm_object_t *vmo)
596 {
597 ASSERT(vmo != NULL);
598
599 uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
600 /* underflow would be a deadly serious mistake */
601 VERIFY3U(ref, !=, UINT_MAX);
602 if (ref != 0) {
603 return;
604 }
605
606 switch (vmo->vmo_type) {
607 case VMOT_MEM:
608 vmmr_free((vmmr_region_t *)vmo->vmo_data);
609 break;
610 case VMOT_MMIO:
611 break;
612 default:
613 panic("unexpected object type %u", vmo->vmo_type);
614 break;
615 }
616
617 vmo->vmo_data = NULL;
618 vmo->vmo_size = 0;
619 kmem_free(vmo, sizeof (*vmo));
620 }
621
622 /*
623 * Increase refcount for vm_object reference
624 */
625 void
626 vm_object_reference(vm_object_t *vmo)
627 {
628 ASSERT(vmo != NULL);
629
630 uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
631 /* overflow would be a deadly serious mistake */
632 VERIFY3U(ref, !=, 0);
633 }
634
635 /*
636 * Get the host-physical PFN for a given offset into a vm_object.
637 *
638 * The provided `off` must be within the allocated size of the vm_object.
639 */
640 pfn_t
641 vm_object_pfn(vm_object_t *vmo, uintptr_t off)
642 {
643 const uintptr_t aligned_off = off & PAGEMASK;
644
645 switch (vmo->vmo_type) {
646 case VMOT_MEM:
647 return (vm_object_pager_reservoir(vmo, aligned_off));
648 case VMOT_MMIO:
649 return (vm_object_pager_mmio(vmo, aligned_off));
650 case VMOT_NONE:
651 break;
652 }
653 panic("unexpected object type %u", vmo->vmo_type);
654 }
655
656 static vmspace_mapping_t *
657 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
658 {
659 vmspace_mapping_t *vmsm;
660 list_t *ml = &vms->vms_maplist;
661 const uintptr_t range_end = addr + size;
662
663 ASSERT3U(addr, <=, range_end);
664
665 if (addr >= vms->vms_size) {
666 return (NULL);
667 }
668 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
669 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
670
671 if (addr >= vmsm->vmsm_addr && addr < seg_end) {
672 if (range_end <= seg_end) {
673 return (vmsm);
674 } else {
675 return (NULL);
676 }
677 }
678 }
679 return (NULL);
680 }
681
682 /*
683 * Check to see if any mappings reside within the [addr, addr + size) span in the
684 * vmspace, returning true if that span is indeed empty.
685 */
686 static bool
687 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
688 {
689 vmspace_mapping_t *vmsm;
690 list_t *ml = &vms->vms_maplist;
691 const uintptr_t range_end = addr + size - 1;
692
693 ASSERT(MUTEX_HELD(&vms->vms_lock));
694 ASSERT(size > 0);
695
696 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
697 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
698
699 /*
700 * The two ranges do not overlap if the start of either of
701 * them is after the end of the other.
702 */
703 if (vmsm->vmsm_addr > range_end || addr > seg_end)
704 continue;
705 return (false);
706 }
707 return (true);
708 }
709
710 static void
711 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
712 {
713 list_t *ml = &vms->vms_maplist;
714
715 ASSERT(MUTEX_HELD(&vms->vms_lock));
716 ASSERT(vms->vms_held);
717
718 list_remove(ml, vmsm);
719 vm_object_release(vmsm->vmsm_object);
720 kmem_free(vmsm, sizeof (*vmsm));
721 }
722
723 /*
724 * Enter a hold state on the vmspace. This ensures that all VM clients
725 * associated with the vmspace are excluded from establishing new page holds,
726 * or any other actions which would require accessing vmspace state subject to
727 * potential change.
728 *
729 * Returns with vmspace_t`vms_lock held.
730 */
731 static void
732 vmspace_hold_enter(vmspace_t *vms)
733 {
734 mutex_enter(&vms->vms_lock);
735 VERIFY(!vms->vms_held);
736
737 vm_client_t *vmc = list_head(&vms->vms_clients);
738 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
739 vmc_space_hold(vmc);
740 }
741 vms->vms_held = true;
742 }
743
744 /*
745 * Exit a hold state on the vmspace. This releases all VM clients associated
746 * with the vmspace to be able to establish new page holds, and partake in other
747 * actions which require accessing changed vmspace state. If `kick_on_cpu` is
748 * true, then any CPUs actively using the page tables will be IPIed, and the
749 * call will block until they have acknowledged being ready to use the latest
750 * state of the tables.
751 *
752 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
753 */
754 static void
755 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
756 {
757 ASSERT(MUTEX_HELD(&vms->vms_lock));
758 VERIFY(vms->vms_held);
759
760 vm_client_t *vmc = list_head(&vms->vms_clients);
761 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
762 vmc_space_release(vmc, kick_on_cpu);
763 }
764 vms->vms_held = false;
765 mutex_exit(&vms->vms_lock);
766 }
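/*
 * Taken together, modifications to the vmspace follow a common pattern, shown
 * here as a rough sketch mirroring the callers elsewhere in this file:
 *
 *	vmspace_hold_enter(vms);
 *	...change mappings and/or page table entries...
 *	vms->vms_pt_gen++;
 *	vmspace_clients_invalidate(vms, gpa, len);
 *	vmspace_hold_exit(vms, true);
 */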
767
768 static void
769 vmspace_clients_invalidate(vmspace_t *vms, uintptr_t gpa, size_t len)
770 {
771 ASSERT(MUTEX_HELD(&vms->vms_lock));
772 VERIFY(vms->vms_held);
773
774 for (vm_client_t *vmc = list_head(&vms->vms_clients);
775 vmc != NULL;
776 vmc = list_next(&vms->vms_clients, vmc)) {
777 vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
778 }
779 }
780
781 /*
782 * Attempt to map a vm_object span into the vmspace.
783 *
784 * Requirements:
785 * - `obj_off`, `addr`, and `len` must be page-aligned
786 * - `obj_off` cannot be greater than the allocated size of the object
787 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
788 * size of the object
789 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
790 * of the vmspace
791 */
792 int
793 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
794 size_t len, uint8_t prot)
795 {
796 vmspace_mapping_t *vmsm;
797 int res = 0;
798
799 if (len == 0 || (addr + len) < addr ||
800 obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
801 return (EINVAL);
802 }
803 if ((addr + len) >= vms->vms_size) {
804 return (ENOMEM);
805 }
806
807 vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
808
809 vmspace_hold_enter(vms);
810 if (!vm_mapping_gap(vms, addr, len)) {
811 kmem_free(vmsm, sizeof (*vmsm));
812 res = ENOMEM;
813 } else {
814 vmsm->vmsm_object = vmo;
815 vmsm->vmsm_addr = addr;
816 vmsm->vmsm_len = len;
817 vmsm->vmsm_offset = (off_t)obj_off;
818 vmsm->vmsm_prot = prot;
819 list_insert_tail(&vms->vms_maplist, vmsm);
820
821 /*
822 * Make sure the GPT has tables ready for leaf entries across
823 * the entire new mapping.
824 */
825 vmm_gpt_populate_region(vms->vms_gpt, addr, len);
826 }
827 vmspace_hold_exit(vms, false);
828 return (res);
829 }
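/*
 * A minimal usage sketch (error handling simplified), mirroring what
 * vmm_mmio_alloc() does above for MMIO: allocate a reservoir-backed object
 * and map it at a guest-physical address.
 *
 *	vm_object_t *vmo = vm_object_mem_allocate(len, false);
 *	if (vmo != NULL &&
 *	    vmspace_map(vms, vmo, 0, gpa, len, PROT_READ | PROT_WRITE) != 0) {
 *		vm_object_release(vmo);
 *		vmo = NULL;
 *	}
 */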
830
831 /*
832 * Unmap a region of the vmspace.
833 *
834 * Presently the [start, end) span must equal a region previously mapped by a
835 * call to vmspace_map().
836 */
837 int
838 vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
839 {
840 const uintptr_t end = addr + len;
841 vmspace_mapping_t *vmsm;
842 vm_client_t *vmc;
843 uint64_t gen = 0;
844
845 ASSERT3U(addr, <, end);
846
847 vmspace_hold_enter(vms);
848 /* expect to match existing mapping exactly */
849 if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
850 vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
851 vmspace_hold_exit(vms, false);
852 return (ENOENT);
853 }
854
855 /* Prepare clients (and their held pages) for the unmap. */
856 for (vmc = list_head(&vms->vms_clients); vmc != NULL;
857 vmc = list_next(&vms->vms_clients, vmc)) {
858 vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
859 }
860
861 /* Clear all PTEs for region */
862 if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
863 vms->vms_pt_gen++;
864 gen = vms->vms_pt_gen;
865 }
866 /* ... and the intermediate (directory) PTEs as well */
867 vmm_gpt_vacate_region(vms->vms_gpt, addr, len);
868
869 /*
870 * If pages were actually unmapped from the GPT, provide clients with
871 * an invalidation notice.
872 */
873 if (gen != 0) {
874 vmspace_clients_invalidate(vms, addr, len);
875 }
876
877 vm_mapping_remove(vms, vmsm);
878 vmspace_hold_exit(vms, true);
879 return (0);
880 }
881
882 /*
883 * For a given GPA in the vmspace, ensure that the backing page (if any) is
884 * properly mapped as present in the provided PTE.
885 */
886 static int
887 vmspace_ensure_mapped(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
888 uint64_t *leaf_pte)
889 {
890 vmspace_mapping_t *vmsm;
891 vm_object_t *vmo;
892 pfn_t pfn;
893
894 ASSERT(pfnp != NULL);
895 ASSERT(leaf_pte != NULL);
896
897 vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
898 if (vmsm == NULL) {
899 return (FC_NOMAP);
900 }
901 if ((req_prot & vmsm->vmsm_prot) != req_prot) {
902 return (FC_PROT);
903 }
904
905 vmo = vmsm->vmsm_object;
906 pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
907 VERIFY(pfn != PFN_INVALID);
908
909 if (vmm_gpt_map_at(vms->vms_gpt, leaf_pte, pfn, vmsm->vmsm_prot,
910 vmo->vmo_attr)) {
911 atomic_inc_64(&vms->vms_pages_mapped);
912 }
913
914 *pfnp = pfn;
915 return (0);
916 }
917
918 /*
919 * Look up the PTE for a given GPA in the vmspace, populating it with
920 * appropriate contents (pfn, protection, etc) if it is empty, but backed by a
921 * valid mapping.
922 */
923 static int
924 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
925 uint64_t **ptepp)
926 {
927 vmm_gpt_t *gpt = vms->vms_gpt;
928 uint64_t *entries[MAX_GPT_LEVEL], *leaf;
929 pfn_t pfn = PFN_INVALID;
930 uint_t prot;
931
932 ASSERT0(gpa & PAGEOFFSET);
933 ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
934
935 (void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
936 leaf = entries[LEVEL1];
937 if (leaf == NULL) {
938 /*
939 * Since we populated the intermediate tables for any regions
940 * mapped in the GPT, an empty leaf entry indicates there is no
941 * mapping, populated or not, at this GPA.
942 */
943 return (FC_NOMAP);
944 }
945
946 if (vmm_gpte_is_mapped(leaf, &pfn, &prot)) {
947 if ((req_prot & prot) != req_prot) {
948 return (FC_PROT);
949 }
950 } else {
951 int err = vmspace_ensure_mapped(vms, gpa, req_prot, &pfn, leaf);
952 if (err != 0) {
953 return (err);
954 }
955 }
956
957 ASSERT(pfn != PFN_INVALID && leaf != NULL);
958 if (pfnp != NULL) {
959 *pfnp = pfn;
960 }
961 if (ptepp != NULL) {
962 *ptepp = leaf;
963 }
964 return (0);
965 }
966
967 /*
968 * Populate (make resident in the page tables) a region of the vmspace.
969 *
970 * Presently the [start, end) span must equal a region previously mapped by a
971 * call to vmspace_map().
972 */
973 int
974 vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
975 {
976 ASSERT0(addr & PAGEOFFSET);
977 ASSERT0(len & PAGEOFFSET);
978
979 vmspace_mapping_t *vmsm;
980 mutex_enter(&vms->vms_lock);
981
982 /* For the time being, only exact-match mappings are expected */
983 if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
984 mutex_exit(&vms->vms_lock);
985 return (FC_NOMAP);
986 }
987
988 vm_object_t *vmo = vmsm->vmsm_object;
989 const int prot = vmsm->vmsm_prot;
990 const uint8_t attr = vmo->vmo_attr;
991 vmm_gpt_t *gpt = vms->vms_gpt;
992 size_t populated = 0;
993
994 vmm_gpt_iter_t iter;
995 vmm_gpt_iter_entry_t entry;
996 vmm_gpt_iter_init(&iter, gpt, addr, len);
997 while (vmm_gpt_iter_next(&iter, &entry)) {
998 const pfn_t pfn =
999 vm_object_pfn(vmo, VMSM_OFFSET(vmsm, entry.vgie_gpa));
1000 VERIFY(pfn != PFN_INVALID);
1001
1002 if (vmm_gpt_map_at(gpt, entry.vgie_ptep, pfn, prot, attr)) {
1003 populated++;
1004 }
1005 }
1006 atomic_add_64(&vms->vms_pages_mapped, populated);
1007
1008 mutex_exit(&vms->vms_lock);
1009 return (0);
1010 }
1011
1012 /*
1013 * Allocate a client from a given vmspace.
1014 */
1015 vm_client_t *
1016 vmspace_client_alloc(vmspace_t *vms)
1017 {
1018 vm_client_t *vmc;
1019
1020 vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
1021 vmc->vmc_space = vms;
1022 mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
1023 cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
1024 vmc->vmc_state = VCS_IDLE;
1025 vmc->vmc_cpu_active = -1;
1026 list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
1027 offsetof(vm_page_t, vmp_node));
1028 vmc->vmc_track_dirty = vms->vms_track_dirty;
1029
1030 mutex_enter(&vms->vms_lock);
1031 list_insert_tail(&vms->vms_clients, vmc);
1032 mutex_exit(&vms->vms_lock);
1033
1034 return (vmc);
1035 }
1036
1037 /*
1038 * Get the nested page table root pointer (EPTP/NCR3) value.
1039 */
1040 uint64_t
1041 vmspace_table_root(vmspace_t *vms)
1042 {
1043 return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
1044 }
1045
1046 /*
1047 * Get the current generation number of the nested page table.
1048 */
1049 uint64_t
1050 vmspace_table_gen(vmspace_t *vms)
1051 {
1052 return (vms->vms_pt_gen);
1053 }
1054
1055 /*
1056 * Mark a vm_client as active. This will block if/while the client is held by
1057 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
1058 * fail if the vm_client has been orphaned.
1059 */
1060 static int
1061 vmc_activate(vm_client_t *vmc)
1062 {
1063 mutex_enter(&vmc->vmc_lock);
1064 VERIFY0(vmc->vmc_state & VCS_ACTIVE);
1065 if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
1066 mutex_exit(&vmc->vmc_lock);
1067 return (ENXIO);
1068 }
1069 while ((vmc->vmc_state & VCS_HOLD) != 0) {
1070 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1071 }
1072 vmc->vmc_state |= VCS_ACTIVE;
1073 return (0);
1074 }
1075
1076 /*
1077 * Mark a vm_client as no longer active. It must be called with
1078 * vm_client_t`vmc_lock already held, and will return with it released.
1079 */
1080 static void
1081 vmc_deactivate(vm_client_t *vmc)
1082 {
1083 ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1084 VERIFY(vmc->vmc_state & VCS_ACTIVE);
1085
1086 vmc->vmc_state ^= VCS_ACTIVE;
1087 if ((vmc->vmc_state & VCS_HOLD) != 0) {
1088 cv_broadcast(&vmc->vmc_cv);
1089 }
1090 mutex_exit(&vmc->vmc_lock);
1091 }
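/*
 * The activate/deactivate pair brackets every vmspace access made on behalf
 * of a client, roughly as sketched below (and as used by vmc_hold_ext() and
 * vmc_fault() later in this file):
 *
 *	if (vmc_activate(vmc) != 0)
 *		return (...);		(client has been orphaned)
 *	...access vmspace state (page tables, mappings)...
 *	vmc_deactivate(vmc);		(also drops vmc_lock)
 */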
1092
1093 /*
1094 * Indicate that a CPU will be utilizing the nested page tables through this VM
1095 * client. Interrupts (and/or the GIF) are expected to be disabled when calling
1096 * this function. Returns the generation number of the nested page table (to be
1097 * used for TLB invalidations).
1098 */
1099 uint64_t
1100 vmc_table_enter(vm_client_t *vmc)
1101 {
1102 vmspace_t *vms = vmc->vmc_space;
1103 uint64_t gen;
1104
1105 ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1106 ASSERT3S(vmc->vmc_cpu_active, ==, -1);
1107
1108 /*
1109 * Since the NPT activation occurs with interrupts disabled, this must
1110 * be done without taking vmc_lock like normal.
1111 */
1112 gen = vms->vms_pt_gen;
1113 vmc->vmc_cpu_active = CPU->cpu_id;
1114 vmc->vmc_cpu_gen = gen;
1115 atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
1116
1117 return (gen);
1118 }
1119
1120 /*
1121 * Indicate that this VM client is no longer (directly) using the underlying
1122 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
1123 * this function.
1124 */
1125 void
1126 vmc_table_exit(vm_client_t *vmc)
1127 {
1128 mutex_enter(&vmc->vmc_lock);
1129
1130 ASSERT(vmc->vmc_state & VCS_ON_CPU);
1131 vmc->vmc_state ^= VCS_ON_CPU;
1132 vmc->vmc_cpu_active = -1;
1133 if ((vmc->vmc_state & VCS_HOLD) != 0) {
1134 cv_broadcast(&vmc->vmc_cv);
1135 }
1136
1137 mutex_exit(&vmc->vmc_lock);
1138 }
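/*
 * A vCPU loop is expected to pair these calls around guest entry, roughly as
 * sketched here (the actual VMX/SVM entry logic lives elsewhere):
 *
 *	uint64_t gen = vmc_table_enter(vmc);	(interrupts disabled)
 *	...flush the guest TLB if `gen' differs from the generation seen at the
 *	   previous entry, then run the guest using the root pointer obtained
 *	   from vmspace_table_root()...
 *	vmc_table_exit(vmc);			(after interrupts are re-enabled)
 */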
1139
1140 static void
1141 vmc_space_hold(vm_client_t *vmc)
1142 {
1143 mutex_enter(&vmc->vmc_lock);
1144 VERIFY0(vmc->vmc_state & VCS_HOLD);
1145
1146 /*
1147 * Because vmc_table_enter() alters vmc_state from a context where
1148 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
1149 * VCS_HOLD must be done atomically here.
1150 */
1151 atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
1152
1153 /* Wait for client to go inactive */
1154 while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
1155 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1156 }
1157 mutex_exit(&vmc->vmc_lock);
1158 }
1159
1160 static void
1161 vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
1162 {
1163 mutex_enter(&vmc->vmc_lock);
1164 VERIFY(vmc->vmc_state & VCS_HOLD);
1165
1166 if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
1167 poke_cpu(vmc->vmc_cpu_active);
1168
1169 while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1170 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1171 }
1172 }
1173
1174 /*
1175 * Because vmc_table_enter() alters vmc_state from a context where
1176 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
1177 * VCS_HOLD must be done atomically here.
1178 */
1179 atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
1180 cv_broadcast(&vmc->vmc_cv);
1181 mutex_exit(&vmc->vmc_lock);
1182 }
1183
1184 static void
1185 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
1186 uint64_t gen)
1187 {
1188 mutex_enter(&vmc->vmc_lock);
1189 VERIFY(vmc->vmc_state & VCS_HOLD);
1190 if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1191 /*
1192 * Wait for clients using an old generation of the page tables
1193 * to exit guest context, where they subsequently flush the TLB
1194 * for the new generation.
1195 */
1196 if (vmc->vmc_cpu_gen < gen) {
1197 poke_cpu(vmc->vmc_cpu_active);
1198
1199 while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1200 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1201 }
1202 }
1203 }
1204 if (vmc->vmc_inval_func != NULL) {
1205 vmc_inval_cb_t func = vmc->vmc_inval_func;
1206 void *data = vmc->vmc_inval_data;
1207
1208 /*
1209 * Perform the actual invalidation call outside vmc_lock to
1210 * avoid lock ordering issues in the consumer. Since the client
1211 * is under VCS_HOLD, this is safe.
1212 */
1213 mutex_exit(&vmc->vmc_lock);
1214 func(data, addr, size);
1215 mutex_enter(&vmc->vmc_lock);
1216 }
1217 mutex_exit(&vmc->vmc_lock);
1218 }
1219
1220 static void
1221 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
1222 vm_object_t *vmo)
1223 {
1224 mutex_enter(&vmc->vmc_lock);
1225 VERIFY(vmc->vmc_state & VCS_HOLD);
1226
1227 /*
1228 * With the current vCPU exclusion invariants in place, we do not expect
1229 * a vCPU to be in guest context during an unmap.
1230 */
1231 VERIFY0(vmc->vmc_state & VCS_ON_CPU);
1232
1233 /*
1234 * Any holds against the unmapped region need to establish their own
1235 * reference to the underlying object to avoid a potential
1236 * use-after-free.
1237 */
1238 for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1239 vmp != NULL;
1240 vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1241 if (vmp->vmp_gpa < addr ||
1242 vmp->vmp_gpa >= (addr + size)) {
1243 /* Hold outside region in question */
1244 continue;
1245 }
1246 if (vmp->vmp_obj_ref == NULL) {
1247 vm_object_reference(vmo);
1248 vmp->vmp_obj_ref = vmo;
1249 /* For an unmapped region, PTE is now meaningless */
1250 vmp->vmp_ptep = NULL;
1251 } else {
1252 /*
1253 * Object could have gone through cycle of
1254 * unmap-map-unmap before the hold was released.
1255 */
1256 VERIFY3P(vmp->vmp_ptep, ==, NULL);
1257 }
1258 }
1259 mutex_exit(&vmc->vmc_lock);
1260 }
1261
1262 static vm_client_t *
1263 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
1264 {
1265 vm_client_t *next;
1266
1267 ASSERT(MUTEX_HELD(&vms->vms_lock));
1268
1269 mutex_enter(&vmc->vmc_lock);
1270 VERIFY3P(vmc->vmc_space, ==, vms);
1271 VERIFY0(vmc->vmc_state & VCS_ORPHANED);
1272 if (vmc->vmc_state & VCS_DESTROY) {
1273 /*
1274 * This vm_client is currently undergoing destruction, so it
1275 * does not need to be orphaned. Let it proceed with its own
1276 * clean-up task.
1277 */
1278 next = list_next(&vms->vms_clients, vmc);
1279 } else {
1280 /*
1281 * Clients are only orphaned when the containing vmspace is
1282 * being torn down. All mappings from the vmspace should
1283 * already be gone, meaning any remaining held pages should have
1284 * direct references to the object.
1285 */
1286 for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1287 vmp != NULL;
1288 vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1289 ASSERT3P(vmp->vmp_ptep, ==, NULL);
1290 ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
1291 }
1292
1293 /*
1294 * After this point, the client will be orphaned, unable to
1295 * establish new page holds (or access any vmspace-related
1296 * resources) and is in charge of cleaning up after itself.
1297 */
1298 vmc->vmc_state |= VCS_ORPHANED;
1299 next = list_next(&vms->vms_clients, vmc);
1300 list_remove(&vms->vms_clients, vmc);
1301 vmc->vmc_space = NULL;
1302 }
1303 mutex_exit(&vmc->vmc_lock);
1304 return (next);
1305 }
1306
1307 /*
1308 * Attempt to hold a page at `gpa` inside the referenced vmspace.
1309 */
1310 vm_page_t *
1311 vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
1312 {
1313 vmspace_t *vms = vmc->vmc_space;
1314 vm_page_t *vmp;
1315 pfn_t pfn = PFN_INVALID;
1316 uint64_t *ptep = NULL;
1317
1318 ASSERT0(gpa & PAGEOFFSET);
1319 ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
1320 ASSERT0(prot & ~PROT_ALL);
1321 ASSERT0(flags & ~VPF_ALL);
1322
1323 vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
1324 if (vmc_activate(vmc) != 0) {
1325 kmem_free(vmp, sizeof (*vmp));
1326 return (NULL);
1327 }
1328
1329 if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
1330 vmc_deactivate(vmc);
1331 kmem_free(vmp, sizeof (*vmp));
1332 return (NULL);
1333 }
1334 ASSERT(pfn != PFN_INVALID && ptep != NULL);
1335
1336 vmp->vmp_client = vmc;
1337 vmp->vmp_chain = NULL;
1338 vmp->vmp_gpa = gpa;
1339 vmp->vmp_pfn = pfn;
1340 vmp->vmp_ptep = ptep;
1341 vmp->vmp_obj_ref = NULL;
1342 vmp->vmp_prot = (uint8_t)prot;
1343 vmp->vmp_flags = (uint8_t)flags;
1344 list_insert_tail(&vmc->vmc_held_pages, vmp);
1345 vmc_deactivate(vmc);
1346
1347 return (vmp);
1348 }
1349
1350 /*
1351 * Attempt to hold a page at `gpa` inside the referenced vmspace.
1352 */
1353 vm_page_t *
1354 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
1355 {
1356 return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
1357 }
1358
1359 int
1360 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
1361 {
1362 vmspace_t *vms = vmc->vmc_space;
1363 int err;
1364
1365 err = vmc_activate(vmc);
1366 if (err == 0) {
1367 err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
1368 vmc_deactivate(vmc);
1369 }
1370
1371 return (err);
1372 }
1373
1374 /*
1375 * Allocate an additional vm_client_t, based on an existing one. Only the
1376 * association with the vmspace is cloned, not existing holds or any
1377 * configured invalidation function.
1378 */
1379 vm_client_t *
1380 vmc_clone(vm_client_t *vmc)
1381 {
1382 vmspace_t *vms = vmc->vmc_space;
1383
1384 return (vmspace_client_alloc(vms));
1385 }
1386
1387 /*
1388 * Register a function (and associated data pointer) to be called when an
1389 * address range in the vmspace is invalidated.
1390 */
1391 int
1392 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
1393 {
1394 int err;
1395
1396 err = vmc_activate(vmc);
1397 if (err == 0) {
1398 vmc->vmc_inval_func = func;
1399 vmc->vmc_inval_data = data;
1400 vmc_deactivate(vmc);
1401 }
1402
1403 return (err);
1404 }
1405
1406 /*
1407 * Destroy a vm_client_t instance.
1408 *
1409 * No pages held through this vm_client_t may be outstanding when performing a
1410 * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to
1411 * vmc_table_exit() has been made).
1412 */
1413 void
1414 vmc_destroy(vm_client_t *vmc)
1415 {
1416 mutex_enter(&vmc->vmc_lock);
1417
1418 VERIFY(list_is_empty(&vmc->vmc_held_pages));
1419 VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1420
1421 if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
1422 vmspace_t *vms;
1423
1424 /*
1425 * Deassociation with the parent vmspace must be done carefully:
1426 * The vmspace could attempt to orphan this vm_client while we
1427 * release vmc_lock in order to take vms_lock (the required
1428 * order). The client is marked to indicate that destruction is
1429 * under way. Doing so prevents any racing orphan operation
1430 * from applying to this client, allowing us to deassociate from
1431 * the vmspace safely.
1432 */
1433 vmc->vmc_state |= VCS_DESTROY;
1434 vms = vmc->vmc_space;
1435 mutex_exit(&vmc->vmc_lock);
1436
1437 mutex_enter(&vms->vms_lock);
1438 mutex_enter(&vmc->vmc_lock);
1439 list_remove(&vms->vms_clients, vmc);
1440 /*
1441 * If the vmspace began its own destruction operation while we
1442 * were navigating the locks, be sure to notify it about this
1443 * vm_client being deassociated.
1444 */
1445 cv_signal(&vms->vms_cv);
1446 mutex_exit(&vmc->vmc_lock);
1447 mutex_exit(&vms->vms_lock);
1448 } else {
1449 VERIFY3P(vmc->vmc_space, ==, NULL);
1450 mutex_exit(&vmc->vmc_lock);
1451 }
1452
1453 mutex_destroy(&vmc->vmc_lock);
1454 cv_destroy(&vmc->vmc_cv);
1455 list_destroy(&vmc->vmc_held_pages);
1456
1457 kmem_free(vmc, sizeof (*vmc));
1458 }
1459
1460 static __inline void *
1461 vmp_ptr(const vm_page_t *vmp)
1462 {
1463 ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
1464
1465 const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
1466 return ((void *)((uintptr_t)kpm_vbase + paddr));
1467 }
1468
1469 /*
1470 * Get a readable kernel-virtual pointer for a held page.
1471 *
1472 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
1473 * call to acquire this page reference.
1474 */
1475 const void *
1476 vmp_get_readable(const vm_page_t *vmp)
1477 {
1478 ASSERT(vmp->vmp_prot & PROT_READ);
1479
1480 return (vmp_ptr(vmp));
1481 }
1482
1483 /*
1484 * Get a writable kernel-virtual pointer for a held page.
1485 *
1486 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
1487 * call to acquire this page reference.
1488 */
1489 void *
1490 vmp_get_writable(const vm_page_t *vmp)
1491 {
1492 ASSERT(vmp->vmp_prot & PROT_WRITE);
1493
1494 return (vmp_ptr(vmp));
1495 }
1496
1497 /*
1498 * Get the host-physical PFN for a held page.
1499 */
1500 pfn_t
1501 vmp_get_pfn(const vm_page_t *vmp)
1502 {
1503 return (vmp->vmp_pfn);
1504 }
1505
1506 /*
1507 * If this page was deferring dirty-marking in the corresponding vmspace page
1508 * tables, clear such a state so it is considered dirty from now on.
1509 */
1510 void
1511 vmp_mark_dirty(vm_page_t *vmp)
1512 {
1513 ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);
1514
1515 atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
1516 }
1517
1518 /*
1519 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
1520 */
1521 void
1522 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
1523 {
1524 ASSERT3P(vmp->vmp_chain, ==, NULL);
1525
1526 vmp->vmp_chain = to_chain;
1527 }
1528
1529 /*
1530 * Retrieve the pointer from the page-chaining in `vmp`.
1531 */
1532 vm_page_t *
1533 vmp_next(const vm_page_t *vmp)
1534 {
1535 return (vmp->vmp_chain);
1536 }
1537
1538 static __inline bool
1539 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
1540 {
1541 ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1542
1543 bool was_unmapped = false;
1544
1545 list_remove(&vmc->vmc_held_pages, vmp);
1546 if (vmp->vmp_obj_ref != NULL) {
1547 ASSERT3P(vmp->vmp_ptep, ==, NULL);
1548
1549 vm_object_release(vmp->vmp_obj_ref);
1550 was_unmapped = true;
1551 } else {
1552 ASSERT3P(vmp->vmp_ptep, !=, NULL);
1553
1554 /*
1555 * Track appropriate (accessed/dirty) bits for the guest-virtual
1556 * address corresponding to this page, if it is from the vmspace
1557 * rather than a direct reference to an underlying object.
1558 *
1559 * The protection and/or configured flags may obviate the need
1560 * for such an update.
1561 */
1562 if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
1563 (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
1564 vmc->vmc_track_dirty) {
1565 (void) vmm_gpte_reset_dirty(vmp->vmp_ptep, true);
1566 }
1567 }
1568 kmem_free(vmp, sizeof (*vmp));
1569 return (was_unmapped);
1570 }
1571
1572 /*
1573 * Release held page. Returns true if page resided on region which was
1574 * subsequently unmapped.
1575 */
1576 bool
1577 vmp_release(vm_page_t *vmp)
1578 {
1579 vm_client_t *vmc = vmp->vmp_client;
1580
1581 VERIFY(vmc != NULL);
1582
1583 mutex_enter(&vmc->vmc_lock);
1584 const bool was_unmapped = vmp_release_inner(vmp, vmc);
1585 mutex_exit(&vmc->vmc_lock);
1586 return (was_unmapped);
1587 }
1588
1589 /*
1590 * Release a chain of pages which were associated via vmp_chain() (setting
1591 * page-chaining pointer). Returns true if any pages resided upon a region
1592 * which was subsequently unmapped.
1593 *
1594 * All of those pages must have been held through the same vm_client_t.
1595 */
1596 bool
1597 vmp_release_chain(vm_page_t *vmp)
1598 {
1599 vm_client_t *vmc = vmp->vmp_client;
1600 bool any_unmapped = false;
1601
1602 ASSERT(vmp != NULL);
1603
1604 mutex_enter(&vmc->vmc_lock);
1605 while (vmp != NULL) {
1606 vm_page_t *next = vmp->vmp_chain;
1607
1608 /* We expect all pages in chain to be from same client */
1609 ASSERT3P(vmp->vmp_client, ==, vmc);
1610
1611 if (vmp_release_inner(vmp, vmc)) {
1612 any_unmapped = true;
1613 }
1614 vmp = next;
1615 }
1616 mutex_exit(&vmc->vmc_lock);
1617 return (any_unmapped);
1618 }
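/*
 * Chained holds are intended for multi-page operations, e.g. (illustrative
 * sketch only, NULL checks elided; both holds must come from the same
 * client):
 *
 *	vm_page_t *first = vmc_hold(vmc, gpa, PROT_READ);
 *	vm_page_t *second = vmc_hold(vmc, gpa + PAGESIZE, PROT_READ);
 *	vmp_chain(first, second);
 *	...read through vmp_get_readable() on each page...
 *	(void) vmp_release_chain(first);
 */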
1619
1620
1621 int
1622 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
1623 struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
1624 {
1625 vm_object_t *vmo;
1626 int err;
1627
1628 if (segoff < 0 || len <= 0 ||
1629 (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
1630 return (EINVAL);
1631 }
1632 if ((prot & PROT_USER) == 0) {
1633 return (ENOTSUP);
1634 }
1635 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
1636 if (err != 0) {
1637 return (err);
1638 }
1639
1640 VERIFY(segoff >= 0);
1641 VERIFY(len <= vmo->vmo_size);
1642 VERIFY((len + segoff) <= vmo->vmo_size);
1643
1644 if (vmo->vmo_type != VMOT_MEM) {
1645 /* Only support memory objects for now */
1646 return (ENOTSUP);
1647 }
1648
1649 as_rangelock(as);
1650
1651 err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
1652 if (err == 0) {
1653 segvmm_crargs_t svma;
1654
1655 svma.prot = prot;
1656 svma.offset = segoff;
1657 svma.vmo = vmo;
1658 svma.vmc = NULL;
1659
1660 err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
1661 }
1662
1663 as_rangeunlock(as);
1664 return (err);
1665 }
1666
1667 int
1668 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
1669 off_t len, uint_t prot, uint_t maxprot, uint_t flags)
1670 {
1671
1672 const uintptr_t gpa = (uintptr_t)off;
1673 const size_t size = (uintptr_t)len;
1674 int err;
1675
1676 if (off < 0 || len <= 0 ||
1677 (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
1678 return (EINVAL);
1679 }
1680 if ((prot & PROT_USER) == 0) {
1681 return (ENOTSUP);
1682 }
1683
1684 as_rangelock(as);
1685
1686 err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
1687 if (err == 0) {
1688 segvmm_crargs_t svma;
1689
1690 svma.prot = prot;
1691 svma.offset = gpa;
1692 svma.vmo = NULL;
1693 svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
1694
1695 err = as_map(as, *addrp, len, segvmm_create, &svma);
1696 }
1697
1698 as_rangeunlock(as);
1699 return (err);
1700 }
1701