1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2019 Joyent, Inc.
15 * Copyright 2023 Oxide Computer Company
16 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
17 */
18
19 #include <sys/param.h>
20 #include <sys/kmem.h>
21 #include <sys/thread.h>
22 #include <sys/list.h>
23 #include <sys/mman.h>
24 #include <sys/types.h>
25 #include <sys/ddi.h>
26 #include <sys/sysmacros.h>
27 #include <sys/machsystm.h>
28 #include <sys/vmsystm.h>
29 #include <sys/x86_archext.h>
30 #include <vm/as.h>
31 #include <vm/hat_i86.h>
32 #include <vm/seg_vn.h>
33 #include <vm/seg_kmem.h>
34
35 #include <sys/vmm_vm.h>
36 #include <sys/seg_vmm.h>
37 #include <sys/vmm_kernel.h>
38 #include <sys/vmm_reservoir.h>
39 #include <sys/vmm_gpt.h>
40
41
42 /*
43 * VMM Virtual Memory
44 *
45 * History
46 *
47 * When bhyve was ported to illumos, one significant hole was handling guest
48 * memory and memory accesses. In the original Pluribus port, bhyve itself
49 * manually handled the EPT structures for guest memory. The updated sources
50 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
51 * system for memory allocations and management of the EPT structures. Keeping
52 * source differences to a minimum was a priority, so illumos-bhyve implemented
53 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
54 * boot and run guests.
55 *
56 * While the VM shim was successful in getting illumos-bhyve to a functional
57 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
58 * compatibility interfaces made it awkward to use. As source differences with
59 * the upstream kernel code became less of a concern, and upcoming features
60 * (such as live migration) would demand more of those VM interfaces, it became
61 * clear that an overhaul was prudent.
62 *
63 * Design
64 *
65 * The new VM system for bhyve retains a number of the same concepts as what it
66 * replaces:
67 *
68 * - `vmspace_t` is the top-level entity for a guest memory space
69 * - `vm_object_t` represents a memory object which can be mapped into a vmspace
70 * - `vm_page_t` represents a page hold within a given vmspace, providing access
71 * to the underlying memory page
72 *
73 * Unlike the old code, where most of the involved structures were exposed via
74 * public definitions, this replacement VM interface keeps all involved
75 * structures opaque to consumers. Furthermore, there is a clear delineation
76 * between infrequent administrative operations (such as mapping/unmapping
77 * regions) and common data-path operations (attempting a page hold at a given
78 * guest-physical address). Those administrative operations are performed
79 * directly against the vmspace, whereas the data-path operations are performed
80 * through a `vm_client_t` handle. That VM client abstraction is meant to
81 * reduce contention and overhead for frequent access operations and provide
82 * debugging insight into how different subcomponents are accessing the vmspace.
83 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
84 * interface) and each VMM userspace segment mapping.
85 *
86 * Exclusion
87 *
88 * Making changes to the vmspace (such as mapping or unmapping regions) requires
89 * other accessors be excluded while the change is underway to prevent them from
90 * observing invalid intermediate states. A simple approach could use a mutex
91 * or rwlock to achieve this, but that risks contention when the rate of access
92 * to the vmspace is high.
93 *
94 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
95 * at a per-vm_client_t basis. While this raises the cost for vmspace changes,
96 * it means that the much more common page accesses through the vm_client can
97 * normally proceed unimpeded and independently.
98 *
99 * When a change to the vmspace is required, the caller will put the vmspace in
100 * a 'hold' state, iterating over all associated vm_client instances, waiting
101 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
102 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on
103 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
104 * will block until the hold condition is cleared. Once the hold is asserted
105 * for all clients, the vmspace change can proceed with confidence. Upon
106 * completion of that operation, VCS_HOLD is cleared from the clients, and they
107 * are released to resume vmspace accesses.
108 *
109 * vCPU Consumers
110 *
111 * Access to the vmspace for vCPUs running in guest context is different from
112 * emulation-related vm_client activity: they solely rely on the contents of the
113 * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude
114 * client access is not feasible when entering guest context, since interrupts
115 * are disabled, making it impossible to block entry. This is not a concern as
116 * long as vmspace modifications never place the page tables in invalid states
117 * (either intermediate, or final). The vm_client hold mechanism does provide
118 * the means to IPI vCPU consumers which will trigger a notification once they
119 * report their exit from guest context. This can be used to ensure that page
120 * table modifications are made visible to those vCPUs within a certain
121 * time frame.
122 */
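/*
 * As an illustrative sketch (not part of the build), a consumer might string
 * these interfaces together roughly as follows; `max_gpa`, `gpa`, `len`, and
 * `pte_ops` are hypothetical stand-ins for values a real caller derives from
 * the guest configuration:
 *
 *	vmspace_t *vms = vmspace_alloc(max_gpa, pte_ops, false);
 *	vm_object_t *vmo = vm_object_mem_allocate(len, false);
 *	VERIFY0(vmspace_map(vms, vmo, 0, gpa, len, PROT_READ | PROT_WRITE));
 *
 *	vm_client_t *vmc = vmspace_client_alloc(vms);
 *	vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_READ);
 *	if (vmp != NULL) {
 *		const void *datap = vmp_get_readable(vmp);
 *		... consume the page contents ...
 *		(void) vmp_release(vmp);
 *	}
 *	vmc_destroy(vmc);
 *
 *	VERIFY0(vmspace_unmap(vms, gpa, len));
 *	vmspace_destroy(vms);
 *
 * Note that the object reference created by vm_object_mem_allocate() is owned
 * by the mapping once vmspace_map() succeeds, and is dropped when the region
 * is unmapped (see vm_mapping_remove() below).
 */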
123
124 typedef struct vmspace_mapping {
125 list_node_t vmsm_node;
126 vm_object_t *vmsm_object; /* object backing this mapping */
127 uintptr_t vmsm_addr; /* start addr in vmspace for mapping */
128 size_t vmsm_len; /* length (in bytes) of mapping */
129 off_t vmsm_offset; /* byte offset into object */
130 uint_t vmsm_prot;
131 } vmspace_mapping_t;
132
133 #define VMSM_OFFSET(vmsm, addr) ( \
134 (vmsm)->vmsm_offset + \
135 ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
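/*
 * For example, a mapping with vmsm_addr = 0x100000000 and vmsm_offset = 0x2000
 * translates the address 0x100003000 to object offset 0x5000 via VMSM_OFFSET.
 */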
136
137 typedef enum vm_client_state {
138 VCS_IDLE = 0,
139 /* currently accessing vmspace for client operation (hold or fault) */
140 VCS_ACTIVE = (1 << 0),
141 /* client hold requested/asserted */
142 VCS_HOLD = (1 << 1),
143 /* vCPU is accessing page tables in guest context */
144 VCS_ON_CPU = (1 << 2),
145 /* client has been orphaned (no more access to vmspace) */
146 VCS_ORPHANED = (1 << 3),
147 /* client undergoing destroy operation */
148 VCS_DESTROY = (1 << 4),
149 } vm_client_state_t;
150
151 struct vmspace {
152 kmutex_t vms_lock;
153 kcondvar_t vms_cv;
154 bool vms_held;
155 uintptr_t vms_size; /* immutable after creation */
156
157 /* (nested) page table state */
158 vmm_gpt_t *vms_gpt;
159 uint64_t vms_pt_gen;
160 uint64_t vms_pages_mapped;
161 bool vms_track_dirty;
162
163 list_t vms_maplist;
164 list_t vms_clients;
165 };
166
167 struct vm_client {
168 vmspace_t *vmc_space;
169 list_node_t vmc_node;
170
171 kmutex_t vmc_lock;
172 kcondvar_t vmc_cv;
173 vm_client_state_t vmc_state;
174 int vmc_cpu_active;
175 uint64_t vmc_cpu_gen;
176 bool vmc_track_dirty;
177 vmc_inval_cb_t vmc_inval_func;
178 void *vmc_inval_data;
179
180 list_t vmc_held_pages;
181 };
182
183 typedef enum vm_object_type {
184 VMOT_NONE,
185 VMOT_MEM,
186 VMOT_MMIO,
187 } vm_object_type_t;
188
189 struct vm_object {
190 uint_t vmo_refcnt; /* manipulated with atomic ops */
191
192 /* Fields below are fixed at creation time */
193 vm_object_type_t vmo_type;
194 size_t vmo_size;
195 void *vmo_data;
196 uint8_t vmo_attr;
197 };
198
199 /* Convenience consolidation of all flag(s) for validity checking */
200 #define VPF_ALL (VPF_DEFER_DIRTY)
201
202 struct vm_page {
203 vm_client_t *vmp_client;
204 list_node_t vmp_node;
205 vm_page_t *vmp_chain;
206 uintptr_t vmp_gpa;
207 pfn_t vmp_pfn;
208 uint64_t *vmp_ptep;
209 vm_object_t *vmp_obj_ref;
210 uint8_t vmp_prot;
211 uint8_t vmp_flags;
212 };
213
214 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
215 static void vmspace_hold_enter(vmspace_t *);
216 static void vmspace_hold_exit(vmspace_t *, bool);
217 static void vmspace_clients_invalidate(vmspace_t *, uintptr_t, size_t);
218 static int vmspace_ensure_mapped(vmspace_t *, uintptr_t, int, pfn_t *,
219 uint64_t *);
220 static void vmc_space_hold(vm_client_t *);
221 static void vmc_space_release(vm_client_t *, bool);
222 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
223 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
224 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
225
226
227 /*
228 * Create a new vmspace with a maximum address of `end`.
229 */
230 vmspace_t *
231 vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
232 {
233 vmspace_t *vms;
234 const uintptr_t size = end + 1;
235
236 /*
237 * This whole mess is built on the assumption that a 64-bit address
238 * space is available to work with for the various pagetable tricks.
239 */
240 VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
241 size <= (uintptr_t)USERLIMIT);
242
243 vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
244 vms->vms_size = size;
245 list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
246 offsetof(vmspace_mapping_t, vmsm_node));
247 list_create(&vms->vms_clients, sizeof (vm_client_t),
248 offsetof(vm_client_t, vmc_node));
249
250 vms->vms_gpt = vmm_gpt_alloc(pte_ops);
251 vms->vms_pt_gen = 1;
252 vms->vms_track_dirty = track_dirty;
253
254 return (vms);
255 }
256
257 /*
258 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining
259 * clients will be orphaned.
260 */
261 void
262 vmspace_destroy(vmspace_t *vms)
263 {
264 mutex_enter(&vms->vms_lock);
265 VERIFY(list_is_empty(&vms->vms_maplist));
266
267 if (!list_is_empty(&vms->vms_clients)) {
268 vm_client_t *vmc = list_head(&vms->vms_clients);
269 while (vmc != NULL) {
270 vmc = vmc_space_orphan(vmc, vms);
271 }
272 /*
273 * Wait for any clients which were in the process of destroying
274 * themselves to disappear.
275 */
276 while (!list_is_empty(&vms->vms_clients)) {
277 cv_wait(&vms->vms_cv, &vms->vms_lock);
278 }
279 }
280 VERIFY(list_is_empty(&vms->vms_clients));
281
282 vmm_gpt_free(vms->vms_gpt);
283 mutex_exit(&vms->vms_lock);
284
285 mutex_destroy(&vms->vms_lock);
286 cv_destroy(&vms->vms_cv);
287 list_destroy(&vms->vms_maplist);
288 list_destroy(&vms->vms_clients);
289
290 kmem_free(vms, sizeof (*vms));
291 }
292
293 /*
294 * Retrieve the count of resident (mapped into the page tables) pages.
295 */
296 uint64_t
297 vmspace_resident_count(vmspace_t *vms)
298 {
299 return (vms->vms_pages_mapped);
300 }
301
302 /*
303 * Perform an operation on the status (accessed/dirty) bits held in the page
304 * tables of this vmspace.
305 *
306 * Such manipulations race against both hardware writes (from running vCPUs) and
307 * emulated accesses reflected from userspace. Safe functionality depends on
308 * the VM instance being read-locked to prevent vmspace_map/vmspace_unmap
309 * operations from changing the page tables during the walk.
310 */
311 void
312 vmspace_bits_operate(vmspace_t *vms, uint64_t gpa, size_t len,
313 vmspace_bit_oper_t oper, uint8_t *bitmap)
314 {
315 const bool bit_input = (oper & VBO_FLAG_BITMAP_IN) != 0;
316 const bool bit_output = (oper & VBO_FLAG_BITMAP_OUT) != 0;
317 const vmspace_bit_oper_t oper_only =
318 oper & ~(VBO_FLAG_BITMAP_IN | VBO_FLAG_BITMAP_OUT);
319 vmm_gpt_t *gpt = vms->vms_gpt;
320
321 /*
322 * The bitmap cannot be NULL if the requested operation involves reading
323 * from or writing to it.
324 */
325 ASSERT(bitmap != NULL || (!bit_input && !bit_output));
326
327 for (size_t offset = 0; offset < len; offset += PAGESIZE) {
328 const uint64_t pfn_offset = offset >> PAGESHIFT;
329 const size_t bit_offset = pfn_offset / 8;
330 const uint8_t bit_mask = 1 << (pfn_offset % 8);
331
332 if (bit_input && (bitmap[bit_offset] & bit_mask) == 0) {
333 continue;
334 }
335
336 bool value = false;
337 uint64_t *entry = vmm_gpt_lookup(gpt, gpa + offset);
338 if (entry == NULL) {
339 if (bit_output) {
340 bitmap[bit_offset] &= ~bit_mask;
341 }
342 continue;
343 }
344
345 switch (oper_only) {
346 case VBO_GET_DIRTY:
347 value = vmm_gpt_query(gpt, entry, VGQ_DIRTY);
348 break;
349 case VBO_SET_DIRTY: {
350 uint_t prot = 0;
351 bool present_writable = false;
352 pfn_t pfn;
353
354 /*
355 * To avoid blindly setting the dirty bit on otherwise
356 * empty PTEs, we must first check if the entry for the
357 * address in question has been populated.
358 *
359 * Only if the page is marked both Present and Writable
360 * will we permit the dirty bit to be set.
361 */
362 if (!vmm_gpt_is_mapped(gpt, entry, &pfn, &prot)) {
363 int err = vmspace_ensure_mapped(vms, gpa,
364 PROT_WRITE, &pfn, entry);
365 if (err == 0) {
366 present_writable = true;
367 }
368 } else if ((prot & PROT_WRITE) != 0) {
369 present_writable = true;
370 }
371
372 if (present_writable) {
373 value = !vmm_gpt_reset_dirty(gpt, entry, true);
374 }
375 break;
376 }
377 case VBO_RESET_DIRTY:
378 /*
379 * Although it may seem, at first glance, that the act of
380 * resetting the dirty bit requires the same care as
381 * setting it, the constraints make for a simpler task.
382 *
383 * Any PTEs with the dirty bit set will have already
384 * been properly populated.
385 */
386 value = vmm_gpt_reset_dirty(gpt, entry, false);
387 break;
388 default:
389 panic("unrecognized operator: %d", oper_only);
390 break;
391 }
392 if (bit_output) {
393 if (value) {
394 bitmap[bit_offset] |= bit_mask;
395 } else {
396 bitmap[bit_offset] &= ~bit_mask;
397 }
398 }
399 }
400
401 /*
402 * Invalidate the address range potentially affected by the changes to
403 * page table bits, issuing shoot-downs for those who might have it in
404 * cache.
405 */
406 vmspace_hold_enter(vms);
407 vms->vms_pt_gen++;
408 vmspace_clients_invalidate(vms, gpa, len);
409 vmspace_hold_exit(vms, true);
410 }
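/*
 * As an illustration of the bitmap convention used above (one bit per page,
 * least-significant bit first within each byte), a caller which wants to
 * harvest and clear the dirty state of a region (with the VM instance
 * read-locked, per the note above) might do something along these lines;
 * the local names are hypothetical:
 *
 *	uint8_t *bitmap = kmem_zalloc(howmany(len, PAGESIZE * 8), KM_SLEEP);
 *	vmspace_bits_operate(vms, gpa, len,
 *	    VBO_RESET_DIRTY | VBO_FLAG_BITMAP_OUT, bitmap);
 *	... bit (i % 8) of bitmap[i / 8] is now set iff page i was dirty ...
 *	kmem_free(bitmap, howmany(len, PAGESIZE * 8));
 */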
411
412 /*
413 * Is dirty-page-tracking enabled for the vmspace?
414 */
415 bool
416 vmspace_get_tracking(vmspace_t *vms)
417 {
418 mutex_enter(&vms->vms_lock);
419 const bool val = vms->vms_track_dirty;
420 mutex_exit(&vms->vms_lock);
421 return (val);
422 }
423
424 /*
425 * Set the state (enabled/disabled) of dirty-page-tracking for the vmspace.
426 */
427 int
428 vmspace_set_tracking(vmspace_t *vms, bool enable_dirty_tracking)
429 {
430 if (enable_dirty_tracking && !vmm_gpt_can_track_dirty(vms->vms_gpt)) {
431 /* Do not allow this to be set if it is not supported */
432 return (ENOTSUP);
433 }
434
435 vmspace_hold_enter(vms);
436 if (vms->vms_track_dirty == enable_dirty_tracking) {
437 /* No further effort required if state already matches */
438 vmspace_hold_exit(vms, false);
439 return (0);
440 }
441
442 vms->vms_track_dirty = enable_dirty_tracking;
443
444 /* Configure all existing clients for new tracking behavior */
445 for (vm_client_t *vmc = list_head(&vms->vms_clients);
446 vmc != NULL;
447 vmc = list_next(&vms->vms_clients, vmc)) {
448 mutex_enter(&vmc->vmc_lock);
449 vmc->vmc_track_dirty = enable_dirty_tracking;
450 mutex_exit(&vmc->vmc_lock);
451 }
452
453 /*
454 * Notify all clients of what is considered an invalidation of the
455 * entire vmspace.
456 */
457 vms->vms_pt_gen++;
458 vmspace_clients_invalidate(vms, 0, vms->vms_size);
459
460 vmspace_hold_exit(vms, true);
461 return (0);
462 }
463
464 static pfn_t
465 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
466 {
467 vmmr_region_t *region;
468 pfn_t pfn;
469
470 ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
471
472 region = vmo->vmo_data;
473 pfn = vmmr_region_pfn_at(region, off);
474
475 return (pfn);
476 }
477
478 static pfn_t
479 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
480 {
481 pfn_t pfn;
482
483 ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
484 ASSERT3P(vmo->vmo_data, !=, NULL);
485 ASSERT3U(off, <, vmo->vmo_size);
486
487 pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
488
489 return (pfn);
490 }
491
492 /*
493 * Allocate a VM object backed by VMM reservoir memory.
494 */
495 vm_object_t *
496 vm_object_mem_allocate(size_t size, bool transient)
497 {
498 int err;
499 vmmr_region_t *region = NULL;
500 vm_object_t *vmo;
501
502 ASSERT3U(size, !=, 0);
503 ASSERT3U(size & PAGEOFFSET, ==, 0);
504
505 err = vmmr_alloc(size, transient, &region);
506 if (err != 0) {
507 return (NULL);
508 }
509
510 vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
511
512 /* For now, these are to stay fixed after allocation */
513 vmo->vmo_type = VMOT_MEM;
514 vmo->vmo_size = size;
515 vmo->vmo_attr = MTRR_TYPE_WB;
516 vmo->vmo_data = region;
517 vmo->vmo_refcnt = 1;
518
519 return (vmo);
520 }
521
522 static vm_object_t *
523 vm_object_mmio_allocate(size_t size, uintptr_t hpa)
524 {
525 vm_object_t *vmo;
526
527 ASSERT3U(size, !=, 0);
528 ASSERT3U(size & PAGEOFFSET, ==, 0);
529 ASSERT3U(hpa & PAGEOFFSET, ==, 0);
530
531 vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
532
533 /* For now, these are to stay fixed after allocation */
534 vmo->vmo_type = VMOT_MMIO;
535 vmo->vmo_size = size;
536 vmo->vmo_attr = MTRR_TYPE_UC;
537 vmo->vmo_data = (void *)hpa;
538 vmo->vmo_refcnt = 1;
539
540 return (vmo);
541 }
542
543 /*
544 * Allocate a VM object backed by an existing range of physical memory.
545 */
546 vm_object_t *
547 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
548 {
549 int error;
550 vm_object_t *obj;
551
552 obj = vm_object_mmio_allocate(len, hpa);
553 if (obj != NULL) {
554 error = vmspace_map(vmspace, obj, 0, gpa, len,
555 PROT_READ | PROT_WRITE);
556 if (error != 0) {
557 vm_object_release(obj);
558 obj = NULL;
559 }
560 }
561
562 return (obj);
563 }
564
565 /*
566 * Release a vm_object reference
567 */
568 void
569 vm_object_release(vm_object_t *vmo)
570 {
571 ASSERT(vmo != NULL);
572
573 uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
574 /* underflow would be a deadly serious mistake */
575 VERIFY3U(ref, !=, UINT_MAX);
576 if (ref != 0) {
577 return;
578 }
579
580 switch (vmo->vmo_type) {
581 case VMOT_MEM:
582 vmmr_free((vmmr_region_t *)vmo->vmo_data);
583 break;
584 case VMOT_MMIO:
585 break;
586 default:
587 panic("unexpected object type %u", vmo->vmo_type);
588 break;
589 }
590
591 vmo->vmo_data = NULL;
592 vmo->vmo_size = 0;
593 kmem_free(vmo, sizeof (*vmo));
594 }
595
596 /*
597 * Increase refcount for vm_object reference
598 */
599 void
600 vm_object_reference(vm_object_t *vmo)
601 {
602 ASSERT(vmo != NULL);
603
604 uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
605 /* overflow would be a deadly serious mistake */
606 VERIFY3U(ref, !=, 0);
607 }
608
609 /*
610 * Get the host-physical PFN for a given offset into a vm_object.
611 *
612 * The provided `off` must be within the allocated size of the vm_object.
613 */
614 pfn_t
615 vm_object_pfn(vm_object_t *vmo, uintptr_t off)
616 {
617 const uintptr_t aligned_off = off & PAGEMASK;
618
619 switch (vmo->vmo_type) {
620 case VMOT_MEM:
621 return (vm_object_pager_reservoir(vmo, aligned_off));
622 case VMOT_MMIO:
623 return (vm_object_pager_mmio(vmo, aligned_off));
624 case VMOT_NONE:
625 break;
626 }
627 panic("unexpected object type %u", vmo->vmo_type);
628 }
629
630 static vmspace_mapping_t *
631 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
632 {
633 vmspace_mapping_t *vmsm;
634 list_t *ml = &vms->vms_maplist;
635 const uintptr_t range_end = addr + size;
636
637 ASSERT3U(addr, <=, range_end);
638
639 if (addr >= vms->vms_size) {
640 return (NULL);
641 }
642 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
643 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
644
645 if (addr >= vmsm->vmsm_addr && addr < seg_end) {
646 if (range_end <= seg_end) {
647 return (vmsm);
648 } else {
649 return (NULL);
650 }
651 }
652 }
653 return (NULL);
654 }
655
656 /*
657 * Check to see if any mappings reside within the [addr, addr + size) span in
658 * the vmspace, returning true if that span is indeed empty.
659 */
660 static bool
661 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
662 {
663 vmspace_mapping_t *vmsm;
664 list_t *ml = &vms->vms_maplist;
665 const uintptr_t range_end = addr + size - 1;
666
667 ASSERT(MUTEX_HELD(&vms->vms_lock));
668 ASSERT(size > 0);
669
670 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
671 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
672
673 /*
674 * The two ranges do not overlap if the start of either of
675 * them is after the end of the other.
676 */
677 if (vmsm->vmsm_addr > range_end || addr > seg_end)
678 continue;
679 return (false);
680 }
681 return (true);
682 }
683
684 static void
685 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
686 {
687 list_t *ml = &vms->vms_maplist;
688
689 ASSERT(MUTEX_HELD(&vms->vms_lock));
690 ASSERT(vms->vms_held);
691
692 list_remove(ml, vmsm);
693 vm_object_release(vmsm->vmsm_object);
694 kmem_free(vmsm, sizeof (*vmsm));
695 }
696
697 /*
698 * Enter a hold state on the vmspace. This ensures that all VM clients
699 * associated with the vmspace are excluded from establishing new page holds,
700 * or from performing any other actions which would require accessing vmspace
701 * state subject to potential change.
702 *
703 * Returns with vmspace_t`vms_lock held.
704 */
705 static void
706 vmspace_hold_enter(vmspace_t *vms)
707 {
708 mutex_enter(&vms->vms_lock);
709 VERIFY(!vms->vms_held);
710
711 vm_client_t *vmc = list_head(&vms->vms_clients);
712 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
713 vmc_space_hold(vmc);
714 }
715 vms->vms_held = true;
716 }
717
718 /*
719 * Exit a hold state on the vmspace. This releases all VM clients associated
720 * with the vmspace so they can once again establish new page holds and partake
721 * in other actions which access the changed vmspace state. If `kick_on_cpu` is
722 * true, then any CPUs actively using the page tables will be IPIed, and the
723 * call will block until they have acknowledged being ready to use the latest
724 * state of the tables.
725 *
726 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
727 */
728 static void
729 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
730 {
731 ASSERT(MUTEX_HELD(&vms->vms_lock));
732 VERIFY(vms->vms_held);
733
734 vm_client_t *vmc = list_head(&vms->vms_clients);
735 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
736 vmc_space_release(vmc, kick_on_cpu);
737 }
738 vms->vms_held = false;
739 mutex_exit(&vms->vms_lock);
740 }
741
742 static void
743 vmspace_clients_invalidate(vmspace_t *vms, uintptr_t gpa, size_t len)
744 {
745 ASSERT(MUTEX_HELD(&vms->vms_lock));
746 VERIFY(vms->vms_held);
747
748 for (vm_client_t *vmc = list_head(&vms->vms_clients);
749 vmc != NULL;
750 vmc = list_next(&vms->vms_clients, vmc)) {
751 vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
752 }
753 }
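/*
 * Taken together, the helpers above give vmspace modifications a common shape,
 * sketched here for a hypothetical operation which rewrites existing PTEs:
 *
 *	vmspace_hold_enter(vms);
 *	... modify mappings and/or page table entries ...
 *	vms->vms_pt_gen++;
 *	vmspace_clients_invalidate(vms, gpa, len);
 *	vmspace_hold_exit(vms, true);
 *
 * The generation bump, client invalidation, and `kick_on_cpu` are only needed
 * when existing (potentially cached) PTEs are changed; purely additive changes
 * such as vmspace_map() below skip the invalidation and pass `false` to
 * vmspace_hold_exit().
 */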
754
755 /*
756 * Attempt to map a vm_object span into the vmspace.
757 *
758 * Requirements:
759 * - `obj_off`, `addr`, and `len` must be page-aligned
760 * - `obj_off` cannot be greater than the allocated size of the object
761 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
762 * size of the object
763 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
764 * of the vmspace
765 */
766 int
767 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
768 size_t len, uint8_t prot)
769 {
770 vmspace_mapping_t *vmsm;
771 int res = 0;
772
773 if (len == 0 || (addr + len) < addr ||
774 obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
775 return (EINVAL);
776 }
777 if ((addr + len) >= vms->vms_size) {
778 return (ENOMEM);
779 }
780
781 vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
782
783 vmspace_hold_enter(vms);
784 if (!vm_mapping_gap(vms, addr, len)) {
785 kmem_free(vmsm, sizeof (*vmsm));
786 res = ENOMEM;
787 } else {
788 vmsm->vmsm_object = vmo;
789 vmsm->vmsm_addr = addr;
790 vmsm->vmsm_len = len;
791 vmsm->vmsm_offset = (off_t)obj_off;
792 vmsm->vmsm_prot = prot;
793 list_insert_tail(&vms->vms_maplist, vmsm);
794
795 /*
796 * Make sure the GPT has tables ready for leaf entries across
797 * the entire new mapping.
798 */
799 vmm_gpt_populate_region(vms->vms_gpt, addr, len);
800 }
801 vmspace_hold_exit(vms, false);
802 return (res);
803 }
804
805 /*
806 * Unmap a region of the vmspace.
807 *
808 * Presently the [start, end) span must equal a region previously mapped by a
809 * call to vmspace_map().
810 */
811 int
812 vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
813 {
814 const uintptr_t end = addr + len;
815 vmspace_mapping_t *vmsm;
816 vm_client_t *vmc;
817 uint64_t gen = 0;
818
819 ASSERT3U(addr, <, end);
820
821 vmspace_hold_enter(vms);
822 /* expect to match existing mapping exactly */
823 if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
824 vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
825 vmspace_hold_exit(vms, false);
826 return (ENOENT);
827 }
828
829 /* Prepare clients (and their held pages) for the unmap. */
830 for (vmc = list_head(&vms->vms_clients); vmc != NULL;
831 vmc = list_next(&vms->vms_clients, vmc)) {
832 vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
833 }
834
835 /* Clear all PTEs for region */
836 if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
837 vms->vms_pt_gen++;
838 gen = vms->vms_pt_gen;
839 }
840 /* ... and the intermediate (directory) PTEs as well */
841 vmm_gpt_vacate_region(vms->vms_gpt, addr, len);
842
843 /*
844 * If pages were actually unmapped from the GPT, provide clients with
845 * an invalidation notice.
846 */
847 if (gen != 0) {
848 vmspace_clients_invalidate(vms, addr, len);
849 }
850
851 vm_mapping_remove(vms, vmsm);
852 vmspace_hold_exit(vms, true);
853 return (0);
854 }
855
856 /*
857 * For a given GPA in the vmspace, ensure that the backing page (if any) is
858 * properly mapped as present in the provided PTE.
859 */
860 static int
861 vmspace_ensure_mapped(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
862 uint64_t *leaf_pte)
863 {
864 vmspace_mapping_t *vmsm;
865 vm_object_t *vmo;
866 pfn_t pfn;
867
868 ASSERT(pfnp != NULL);
869 ASSERT(leaf_pte != NULL);
870
871 vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
872 if (vmsm == NULL) {
873 return (FC_NOMAP);
874 }
875 if ((req_prot & vmsm->vmsm_prot) != req_prot) {
876 return (FC_PROT);
877 }
878
879 vmo = vmsm->vmsm_object;
880 pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
881 VERIFY(pfn != PFN_INVALID);
882
883 if (vmm_gpt_map_at(vms->vms_gpt, leaf_pte, pfn, vmsm->vmsm_prot,
884 vmo->vmo_attr)) {
885 atomic_inc_64(&vms->vms_pages_mapped);
886 }
887
888 *pfnp = pfn;
889 return (0);
890 }
891
892 /*
893 * Look up the PTE for a given GPA in the vmspace, populating it with
894 * appropriate contents (pfn, protection, etc.) if it is empty but backed by a
895 * valid mapping.
896 */
897 static int
898 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
899 uint64_t **ptepp)
900 {
901 vmm_gpt_t *gpt = vms->vms_gpt;
902 uint64_t *entries[MAX_GPT_LEVEL], *leaf;
903 pfn_t pfn = PFN_INVALID;
904 uint_t prot;
905
906 ASSERT0(gpa & PAGEOFFSET);
907 ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
908
909 vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
910 leaf = entries[LEVEL1];
911 if (leaf == NULL) {
912 /*
913 * Since we populated the intermediate tables for any regions
914 * mapped in the GPT, an empty leaf entry indicates there is no
915 * mapping, populated or not, at this GPA.
916 */
917 return (FC_NOMAP);
918 }
919
920 if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
921 if ((req_prot & prot) != req_prot) {
922 return (FC_PROT);
923 }
924 } else {
925 int err = vmspace_ensure_mapped(vms, gpa, req_prot, &pfn, leaf);
926 if (err != 0) {
927 return (err);
928 }
929 }
930
931 ASSERT(pfn != PFN_INVALID && leaf != NULL);
932 if (pfnp != NULL) {
933 *pfnp = pfn;
934 }
935 if (ptepp != NULL) {
936 *ptepp = leaf;
937 }
938 return (0);
939 }
940
941 /*
942 * Populate (make resident in the page tables) a region of the vmspace.
943 *
944 * Presently the [start, end) span must equal a region previously mapped by a
945 * call to vmspace_map().
946 */
947 int
948 vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
949 {
950 vmspace_mapping_t *vmsm;
951 mutex_enter(&vms->vms_lock);
952
953 /* For the time being, only exact-match mappings are expected */
954 if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
955 mutex_exit(&vms->vms_lock);
956 return (FC_NOMAP);
957 }
958
959 vm_object_t *vmo = vmsm->vmsm_object;
960 const int prot = vmsm->vmsm_prot;
961 const uint8_t attr = vmo->vmo_attr;
962 size_t populated = 0;
963 const size_t end = addr + len;
964 for (uintptr_t gpa = addr & PAGEMASK; gpa < end; gpa += PAGESIZE) {
965 const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
966 VERIFY(pfn != PFN_INVALID);
967
968 if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
969 populated++;
970 }
971 }
972 atomic_add_64(&vms->vms_pages_mapped, populated);
973
974 mutex_exit(&vms->vms_lock);
975 return (0);
976 }
977
978 /*
979 * Allocate a client from a given vmspace.
980 */
981 vm_client_t *
982 vmspace_client_alloc(vmspace_t *vms)
983 {
984 vm_client_t *vmc;
985
986 vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
987 vmc->vmc_space = vms;
988 mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
989 cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
990 vmc->vmc_state = VCS_IDLE;
991 vmc->vmc_cpu_active = -1;
992 list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
993 offsetof(vm_page_t, vmp_node));
994 vmc->vmc_track_dirty = vms->vms_track_dirty;
995
996 mutex_enter(&vms->vms_lock);
997 list_insert_tail(&vms->vms_clients, vmc);
998 mutex_exit(&vms->vms_lock);
999
1000 return (vmc);
1001 }
1002
1003 /*
1004 * Get the nested page table root pointer (EPTP/NCR3) value.
1005 */
1006 uint64_t
1007 vmspace_table_root(vmspace_t *vms)
1008 {
1009 return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
1010 }
1011
1012 /*
1013 * Get the current generation number of the nested page table.
1014 */
1015 uint64_t
1016 vmspace_table_gen(vmspace_t *vms)
1017 {
1018 return (vms->vms_pt_gen);
1019 }
1020
1021 /*
1022 * Mark a vm_client as active. This will block if/while the client is held by
1023 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
1024 * fail if the vm_client has been orphaned.
1025 */
1026 static int
1027 vmc_activate(vm_client_t *vmc)
1028 {
1029 mutex_enter(&vmc->vmc_lock);
1030 VERIFY0(vmc->vmc_state & VCS_ACTIVE);
1031 if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
1032 mutex_exit(&vmc->vmc_lock);
1033 return (ENXIO);
1034 }
1035 while ((vmc->vmc_state & VCS_HOLD) != 0) {
1036 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1037 }
1038 vmc->vmc_state |= VCS_ACTIVE;
1039 return (0);
1040 }
1041
1042 /*
1043 * Mark a vm_client as no longer active. It must be called with
1044 * vm_client_t`vmc_lock already held, and will return with it released.
1045 */
1046 static void
1047 vmc_deactivate(vm_client_t *vmc)
1048 {
1049 ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1050 VERIFY(vmc->vmc_state & VCS_ACTIVE);
1051
1052 vmc->vmc_state ^= VCS_ACTIVE;
1053 if ((vmc->vmc_state & VCS_HOLD) != 0) {
1054 cv_broadcast(&vmc->vmc_cv);
1055 }
1056 mutex_exit(&vmc->vmc_lock);
1057 }
1058
1059 /*
1060 * Indicate that a CPU will be utilizing the nested page tables through this VM
1061 * client. Interrupts (and/or the GIF) are expected to be disabled when calling
1062 * this function. Returns the generation number of the nested page table (to be
1063 * used for TLB invalidations).
1064 */
1065 uint64_t
1066 vmc_table_enter(vm_client_t *vmc)
1067 {
1068 vmspace_t *vms = vmc->vmc_space;
1069 uint64_t gen;
1070
1071 ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1072 ASSERT3S(vmc->vmc_cpu_active, ==, -1);
1073
1074 /*
1075 * Since the NPT activation occurs with interrupts disabled, this must
1076 * be done without taking vmc_lock like normal.
1077 */
1078 gen = vms->vms_pt_gen;
1079 vmc->vmc_cpu_active = CPU->cpu_id;
1080 vmc->vmc_cpu_gen = gen;
1081 atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
1082
1083 return (gen);
1084 }
1085
1086 /*
1087 * Indicate that this VM client is no longer (directly) using the underlying
1088 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
1089 * this function.
1090 */
1091 void
1092 vmc_table_exit(vm_client_t *vmc)
1093 {
1094 mutex_enter(&vmc->vmc_lock);
1095
1096 ASSERT(vmc->vmc_state & VCS_ON_CPU);
1097 vmc->vmc_state ^= VCS_ON_CPU;
1098 vmc->vmc_cpu_active = -1;
1099 if ((vmc->vmc_state & VCS_HOLD) != 0) {
1100 cv_broadcast(&vmc->vmc_cv);
1101 }
1102
1103 mutex_exit(&vmc->vmc_lock);
1104 }
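/*
 * A rough sketch of how a vCPU loop is expected to bracket guest entry with
 * these calls; the TLB/EPT flush itself is the responsibility of the VMX/SVM
 * backend, and the `last_seen_gen` bookkeeping shown here is illustrative:
 *
 *	uint64_t gen = vmc_table_enter(vmc);
 *	if (gen != last_seen_gen) {
 *		... flush guest TLB context for the new table generation ...
 *		last_seen_gen = gen;
 *	}
 *	... enter guest context ...
 *	vmc_table_exit(vmc);
 */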
1105
1106 static void
1107 vmc_space_hold(vm_client_t *vmc)
1108 {
1109 mutex_enter(&vmc->vmc_lock);
1110 VERIFY0(vmc->vmc_state & VCS_HOLD);
1111
1112 /*
1113 * Because vmc_table_enter() alters vmc_state from a context where
1114 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
1115 * VCS_HOLD must be done atomically here.
1116 */
1117 atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
1118
1119 /* Wait for client to go inactive */
1120 while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
1121 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1122 }
1123 mutex_exit(&vmc->vmc_lock);
1124 }
1125
1126 static void
1127 vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
1128 {
1129 mutex_enter(&vmc->vmc_lock);
1130 VERIFY(vmc->vmc_state & VCS_HOLD);
1131
1132 if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
1133 poke_cpu(vmc->vmc_cpu_active);
1134
1135 while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1136 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1137 }
1138 }
1139
1140 /*
1141 * Because vmc_table_enter() alters vmc_state from a context where
1142 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
1143 * VCS_HOLD must be done atomically here.
1144 */
1145 atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
1146 cv_broadcast(&vmc->vmc_cv);
1147 mutex_exit(&vmc->vmc_lock);
1148 }
1149
1150 static void
1151 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
1152 uint64_t gen)
1153 {
1154 mutex_enter(&vmc->vmc_lock);
1155 VERIFY(vmc->vmc_state & VCS_HOLD);
1156 if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1157 /*
1158 * Wait for clients using an old generation of the page tables
1159 * to exit guest context, where they subsequently flush the TLB
1160 * for the new generation.
1161 */
1162 if (vmc->vmc_cpu_gen < gen) {
1163 poke_cpu(vmc->vmc_cpu_active);
1164
1165 while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1166 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1167 }
1168 }
1169 }
1170 if (vmc->vmc_inval_func != NULL) {
1171 vmc_inval_cb_t func = vmc->vmc_inval_func;
1172 void *data = vmc->vmc_inval_data;
1173
1174 /*
1175 * Perform the actual invalidation call outside vmc_lock to
1176 * avoid lock ordering issues in the consumer. Since the client
1177 * is under VCS_HOLD, this is safe.
1178 */
1179 mutex_exit(&vmc->vmc_lock);
1180 func(data, addr, size);
1181 mutex_enter(&vmc->vmc_lock);
1182 }
1183 mutex_exit(&vmc->vmc_lock);
1184 }
1185
1186 static void
1187 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
1188 vm_object_t *vmo)
1189 {
1190 mutex_enter(&vmc->vmc_lock);
1191 VERIFY(vmc->vmc_state & VCS_HOLD);
1192
1193 /*
1194 * With the current vCPU exclusion invariants in place, we do not expect
1195 * a vCPU to be in guest context during an unmap.
1196 */
1197 VERIFY0(vmc->vmc_state & VCS_ON_CPU);
1198
1199 /*
1200 * Any holds against the unmapped region need to establish their own
1201 * reference to the underlying object to avoid a potential
1202 * use-after-free.
1203 */
1204 for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1205 vmp != NULL;
1206 vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1207 if (vmp->vmp_gpa < addr ||
1208 vmp->vmp_gpa >= (addr + size)) {
1209 /* Hold outside region in question */
1210 continue;
1211 }
1212 if (vmp->vmp_obj_ref == NULL) {
1213 vm_object_reference(vmo);
1214 vmp->vmp_obj_ref = vmo;
1215 /* For an unmapped region, PTE is now meaningless */
1216 vmp->vmp_ptep = NULL;
1217 } else {
1218 /*
1219 * Object could have gone through cycle of
1220 * unmap-map-unmap before the hold was released.
1221 */
1222 VERIFY3P(vmp->vmp_ptep, ==, NULL);
1223 }
1224 }
1225 mutex_exit(&vmc->vmc_lock);
1226 }
1227
1228 static vm_client_t *
1229 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
1230 {
1231 vm_client_t *next;
1232
1233 ASSERT(MUTEX_HELD(&vms->vms_lock));
1234
1235 mutex_enter(&vmc->vmc_lock);
1236 VERIFY3P(vmc->vmc_space, ==, vms);
1237 VERIFY0(vmc->vmc_state & VCS_ORPHANED);
1238 if (vmc->vmc_state & VCS_DESTROY) {
1239 /*
1240 * This vm_client is currently undergoing destruction, so it
1241 * does not need to be orphaned. Let it proceed with its own
1242 * clean-up task.
1243 */
1244 next = list_next(&vms->vms_clients, vmc);
1245 } else {
1246 /*
1247 * Clients are only orphaned when the containing vmspace is
1248 * being torn down. All mappings from the vmspace should
1249 * already be gone, meaning any remaining held pages should have
1250 * direct references to the object.
1251 */
1252 for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1253 vmp != NULL;
1254 vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1255 ASSERT3P(vmp->vmp_ptep, ==, NULL);
1256 ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
1257 }
1258
1259 /*
1260 * After this point, the client will be orphaned, unable to
1261 * establish new page holds (or access any vmspace-related
1262 * resources) and is in charge of cleaning up after itself.
1263 */
1264 vmc->vmc_state |= VCS_ORPHANED;
1265 next = list_next(&vms->vms_clients, vmc);
1266 list_remove(&vms->vms_clients, vmc);
1267 vmc->vmc_space = NULL;
1268 }
1269 mutex_exit(&vmc->vmc_lock);
1270 return (next);
1271 }
1272
1273 /*
1274 * Attempt to hold a page at `gpa` inside the referenced vmspace.
1275 */
1276 vm_page_t *
1277 vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
1278 {
1279 vmspace_t *vms = vmc->vmc_space;
1280 vm_page_t *vmp;
1281 pfn_t pfn = PFN_INVALID;
1282 uint64_t *ptep = NULL;
1283
1284 ASSERT0(gpa & PAGEOFFSET);
1285 ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
1286 ASSERT0(prot & ~PROT_ALL);
1287 ASSERT0(flags & ~VPF_ALL);
1288
1289 vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
1290 if (vmc_activate(vmc) != 0) {
1291 kmem_free(vmp, sizeof (*vmp));
1292 return (NULL);
1293 }
1294
1295 if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
1296 vmc_deactivate(vmc);
1297 kmem_free(vmp, sizeof (*vmp));
1298 return (NULL);
1299 }
1300 ASSERT(pfn != PFN_INVALID && ptep != NULL);
1301
1302 vmp->vmp_client = vmc;
1303 vmp->vmp_chain = NULL;
1304 vmp->vmp_gpa = gpa;
1305 vmp->vmp_pfn = pfn;
1306 vmp->vmp_ptep = ptep;
1307 vmp->vmp_obj_ref = NULL;
1308 vmp->vmp_prot = (uint8_t)prot;
1309 vmp->vmp_flags = (uint8_t)flags;
1310 list_insert_tail(&vmc->vmc_held_pages, vmp);
1311 vmc_deactivate(vmc);
1312
1313 return (vmp);
1314 }
1315
1316 /*
1317 * Attempt to hold a page at `gpa` inside the referenced vmspace.
1318 */
1319 vm_page_t *
1320 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
1321 {
1322 return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
1323 }
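/*
 * A minimal sketch of an emulated write using a page hold, assuming `vmc` is
 * the caller's client and `gpa`/`val` are hypothetical:
 *
 *	vm_page_t *vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_WRITE);
 *	if (vmp == NULL) {
 *		return (EFAULT);
 *	}
 *	uint64_t *datap = vmp_get_writable(vmp);
 *	datap[(gpa & PAGEOFFSET) / sizeof (uint64_t)] = val;
 *	(void) vmp_release(vmp);
 */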
1324
1325 int
1326 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
1327 {
1328 vmspace_t *vms = vmc->vmc_space;
1329 int err;
1330
1331 err = vmc_activate(vmc);
1332 if (err == 0) {
1333 err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
1334 vmc_deactivate(vmc);
1335 }
1336
1337 return (err);
1338 }
1339
1340 /*
1341 * Allocate an additional vm_client_t, based on an existing one. Only the
1342 * association with the vmspace is cloned, not existing holds or any
1343 * configured invalidation function.
1344 */
1345 vm_client_t *
1346 vmc_clone(vm_client_t *vmc)
1347 {
1348 vmspace_t *vms = vmc->vmc_space;
1349
1350 return (vmspace_client_alloc(vms));
1351 }
1352
1353 /*
1354 * Register a function (and associated data pointer) to be called when an
1355 * address range in the vmspace is invalidated.
1356 */
1357 int
1358 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
1359 {
1360 int err;
1361
1362 err = vmc_activate(vmc);
1363 if (err == 0) {
1364 vmc->vmc_inval_func = func;
1365 vmc->vmc_inval_data = data;
1366 vmc_deactivate(vmc);
1367 }
1368
1369 return (err);
1370 }
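/*
 * As an example, a consumer which caches translations (in the style of a viona
 * ring) might register a callback roughly like this, assuming the
 * (void *, uintptr_t, size_t) signature implied by the call site in
 * vmc_space_invalidate() above; the names are hypothetical:
 *
 *	static void
 *	my_inval_cb(void *arg, uintptr_t gpa, size_t len)
 *	{
 *		... drop cached translations overlapping [gpa, gpa + len) ...
 *	}
 *
 *	(void) vmc_set_inval_cb(vmc, my_inval_cb, my_state);
 */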
1371
1372 /*
1373 * Destroy a vm_client_t instance.
1374 *
1375 * No pages held through this vm_client_t may be outstanding when performing a
1376 * vmc_destroy(). For vCPU clients, the client must not be on-CPU (that is, a
1377 * call to vmc_table_exit() must already have been made).
1378 */
1379 void
1380 vmc_destroy(vm_client_t *vmc)
1381 {
1382 mutex_enter(&vmc->vmc_lock);
1383
1384 VERIFY(list_is_empty(&vmc->vmc_held_pages));
1385 VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1386
1387 if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
1388 vmspace_t *vms;
1389
1390 /*
1391 * Deassociation with the parent vmspace must be done carefully:
1392 * The vmspace could attempt to orphan this vm_client while we
1393 * release vmc_lock in order to take vms_lock (the required
1394 * order). The client is marked to indicate that destruction is
1395 * under way. Doing so prevents any racing orphan operation
1396 * from applying to this client, allowing us to deassociate from
1397 * the vmspace safely.
1398 */
1399 vmc->vmc_state |= VCS_DESTROY;
1400 vms = vmc->vmc_space;
1401 mutex_exit(&vmc->vmc_lock);
1402
1403 mutex_enter(&vms->vms_lock);
1404 mutex_enter(&vmc->vmc_lock);
1405 list_remove(&vms->vms_clients, vmc);
1406 /*
1407 * If the vmspace began its own destruction operation while we
1408 * were navigating the locks, be sure to notify it about this
1409 * vm_client being deassociated.
1410 */
1411 cv_signal(&vms->vms_cv);
1412 mutex_exit(&vmc->vmc_lock);
1413 mutex_exit(&vms->vms_lock);
1414 } else {
1415 VERIFY3P(vmc->vmc_space, ==, NULL);
1416 mutex_exit(&vmc->vmc_lock);
1417 }
1418
1419 mutex_destroy(&vmc->vmc_lock);
1420 cv_destroy(&vmc->vmc_cv);
1421 list_destroy(&vmc->vmc_held_pages);
1422
1423 kmem_free(vmc, sizeof (*vmc));
1424 }
1425
1426 static __inline void *
1427 vmp_ptr(const vm_page_t *vmp)
1428 {
1429 ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
1430
1431 const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
1432 return ((void *)((uintptr_t)kpm_vbase + paddr));
1433 }
1434
1435 /*
1436 * Get a readable kernel-virtual pointer for a held page.
1437 *
1438 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
1439 * call to acquire this page reference.
1440 */
1441 const void *
1442 vmp_get_readable(const vm_page_t *vmp)
1443 {
1444 ASSERT(vmp->vmp_prot & PROT_READ);
1445
1446 return (vmp_ptr(vmp));
1447 }
1448
1449 /*
1450 * Get a writable kernel-virtual pointer for a held page.
1451 *
1452 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
1453 * call to acquire this page reference.
1454 */
1455 void *
1456 vmp_get_writable(const vm_page_t *vmp)
1457 {
1458 ASSERT(vmp->vmp_prot & PROT_WRITE);
1459
1460 return (vmp_ptr(vmp));
1461 }
1462
1463 /*
1464 * Get the host-physical PFN for a held page.
1465 */
1466 pfn_t
1467 vmp_get_pfn(const vm_page_t *vmp)
1468 {
1469 return (vmp->vmp_pfn);
1470 }
1471
1472 /*
1473 * If this page was deferring dirty-marking in the corresponding vmspace page
1474 * tables, clear such a state so it is considered dirty from now on.
1475 */
1476 void
1477 vmp_mark_dirty(vm_page_t *vmp)
1478 {
1479 ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);
1480
1481 atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
1482 }
1483
1484 /*
1485 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
1486 */
1487 void
1488 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
1489 {
1490 ASSERT3P(vmp->vmp_chain, ==, NULL);
1491
1492 vmp->vmp_chain = to_chain;
1493 }
1494
1495 /*
1496 * Retrieve the pointer from the page-chaining in `vmp`.
1497 */
1498 vm_page_t *
1499 vmp_next(const vm_page_t *vmp)
1500 {
1501 return (vmp->vmp_chain);
1502 }
1503
1504 static __inline bool
1505 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
1506 {
1507 ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1508
1509 bool was_unmapped = false;
1510
1511 list_remove(&vmc->vmc_held_pages, vmp);
1512 if (vmp->vmp_obj_ref != NULL) {
1513 ASSERT3P(vmp->vmp_ptep, ==, NULL);
1514
1515 vm_object_release(vmp->vmp_obj_ref);
1516 was_unmapped = true;
1517 } else {
1518 ASSERT3P(vmp->vmp_ptep, !=, NULL);
1519
1520 /*
1521 * Track appropriate (accessed/dirty) bits for the guest-physical
1522 * address corresponding to this page, if it is from the vmspace
1523 * rather than a direct reference to an underlying object.
1524 *
1525 * The protection and/or configured flags may obviate the need
1526 * for such an update.
1527 */
1528 if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
1529 (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
1530 vmc->vmc_track_dirty) {
1531 vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
1532 (void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
1533 }
1534 }
1535 kmem_free(vmp, sizeof (*vmp));
1536 return (was_unmapped);
1537 }
1538
1539 /*
1540 * Release a held page. Returns true if the page resided on a region which was
1541 * subsequently unmapped.
1542 */
1543 bool
1544 vmp_release(vm_page_t *vmp)
1545 {
1546 vm_client_t *vmc = vmp->vmp_client;
1547
1548 VERIFY(vmc != NULL);
1549
1550 mutex_enter(&vmc->vmc_lock);
1551 const bool was_unmapped = vmp_release_inner(vmp, vmc);
1552 mutex_exit(&vmc->vmc_lock);
1553 return (was_unmapped);
1554 }
1555
1556 /*
1557 * Release a chain of pages which were associated via vmp_chain() (setting
1558 * page-chaining pointer). Returns true if any pages resided upon a region
1559 * which was subsequently unmapped.
1560 *
1561 * All of those pages must have been held through the same vm_client_t.
1562 */
1563 bool
1564 vmp_release_chain(vm_page_t *vmp)
1565 {
1566 vm_client_t *vmc = vmp->vmp_client;
1567 bool any_unmapped = false;
1568
1569 ASSERT(vmp != NULL);
1570
1571 mutex_enter(&vmc->vmc_lock);
1572 while (vmp != NULL) {
1573 vm_page_t *next = vmp->vmp_chain;
1574
1575 /* We expect all pages in chain to be from same client */
1576 ASSERT3P(vmp->vmp_client, ==, vmc);
1577
1578 if (vmp_release_inner(vmp, vmc)) {
1579 any_unmapped = true;
1580 }
1581 vmp = next;
1582 }
1583 mutex_exit(&vmc->vmc_lock);
1584 return (any_unmapped);
1585 }
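/*
 * A sketch of a multi-page access using the chaining helpers, holding `npages`
 * consecutive pages and releasing them as a group (hypothetical names, error
 * handling elided):
 *
 *	vm_page_t *head = NULL;
 *	for (uint_t i = 0; i < npages; i++) {
 *		vm_page_t *vmp = vmc_hold(vmc, gpa + ptob(i), PROT_READ);
 *		vmp_chain(vmp, head);
 *		head = vmp;
 *	}
 *	... access the pages via vmp_next()/vmp_get_readable() ...
 *	(void) vmp_release_chain(head);
 */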
1586
1587
1588 int
1589 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
1590 struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
1591 {
1592 vm_object_t *vmo;
1593 int err;
1594
1595 if (segoff < 0 || len <= 0 ||
1596 (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
1597 return (EINVAL);
1598 }
1599 if ((prot & PROT_USER) == 0) {
1600 return (ENOTSUP);
1601 }
1602 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
1603 if (err != 0) {
1604 return (err);
1605 }
1606
1607 VERIFY(segoff >= 0);
1608 VERIFY(len <= vmo->vmo_size);
1609 VERIFY((len + segoff) <= vmo->vmo_size);
1610
1611 if (vmo->vmo_type != VMOT_MEM) {
1612 /* Only support memory objects for now */
1613 return (ENOTSUP);
1614 }
1615
1616 as_rangelock(as);
1617
1618 err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
1619 if (err == 0) {
1620 segvmm_crargs_t svma;
1621
1622 svma.prot = prot;
1623 svma.offset = segoff;
1624 svma.vmo = vmo;
1625 svma.vmc = NULL;
1626
1627 err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
1628 }
1629
1630 as_rangeunlock(as);
1631 return (err);
1632 }
1633
1634 int
1635 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
1636 off_t len, uint_t prot, uint_t maxprot, uint_t flags)
1637 {
1638
1639 const uintptr_t gpa = (uintptr_t)off;
1640 const size_t size = (uintptr_t)len;
1641 int err;
1642
1643 if (off < 0 || len <= 0 ||
1644 (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
1645 return (EINVAL);
1646 }
1647 if ((prot & PROT_USER) == 0) {
1648 return (ENOTSUP);
1649 }
1650
1651 as_rangelock(as);
1652
1653 err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
1654 if (err == 0) {
1655 segvmm_crargs_t svma;
1656
1657 svma.prot = prot;
1658 svma.offset = gpa;
1659 svma.vmo = NULL;
1660 svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
1661
1662 err = as_map(as, *addrp, len, segvmm_create, &svma);
1663 }
1664
1665 as_rangeunlock(as);
1666 return (err);
1667 }
1668