xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_vm.c (revision 50fe091cff3f2dccec5f588584a3ccb4f9933570)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2025 Oxide Computer Company
16  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/kmem.h>
21 #include <sys/thread.h>
22 #include <sys/list.h>
23 #include <sys/mman.h>
24 #include <sys/types.h>
25 #include <sys/ddi.h>
26 #include <sys/sysmacros.h>
27 #include <sys/machsystm.h>
28 #include <sys/vmsystm.h>
29 #include <sys/x86_archext.h>
30 #include <vm/as.h>
31 #include <vm/hat_i86.h>
32 #include <vm/seg_vn.h>
33 #include <vm/seg_kmem.h>
34 
35 #include <sys/vmm_vm.h>
36 #include <sys/seg_vmm.h>
37 #include <sys/vmm_kernel.h>
38 #include <sys/vmm_reservoir.h>
39 #include <sys/vmm_gpt.h>
40 #include "vmm_util.h"
41 
42 
43 /*
44  * VMM Virtual Memory
45  *
46  * History
47  *
48  * When bhyve was ported to illumos, one significant hole was handling guest
49  * memory and memory accesses.  In the original Pluribus port, bhyve itself
50  * manually handled the EPT structures for guest memory.  The updated sources
51  * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
52  * system for memory allocations and management of the EPT structures.  Keeping
53  * source differences to a minimum was a priority, so illumos-bhyve implemented
54  * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
55  * boot and run guests.
56  *
57  * While the VM shim was successful in getting illumos-bhyve to a functional
58  * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
59  * compatibility interfaces made it awkward to use.  As source differences with
60  * the upstream kernel code became less of a concern, and upcoming features
61  * (such as live migration) would demand more of those VM interfaces, it became
62  * clear that an overhaul was prudent.
63  *
64  * Design
65  *
66  * The new VM system for bhyve retains a number of the same concepts as what it
67  * replaces:
68  *
69  * - `vmspace_t` is the top-level entity for a guest memory space
70  * - `vm_object_t` represents a memory object which can be mapped into a vmspace
71  * - `vm_page_t` represents a page hold within a given vmspace, providing access
72  *   to the underlying memory page
73  *
74  * Unlike the old code, where most of the involved structures were exposed via
75  * public definitions, this replacement VM interface keeps all involved
76  * structures opaque to consumers.  Furthermore, there is a clear delineation
77  * between infrequent administrative operations (such as mapping/unmapping
78  * regions) and common data-path operations (attempting a page hold at a given
79  * guest-physical address).  Those administrative operations are performed
80  * directly against the vmspace, whereas the data-path operations are performed
81  * through a `vm_client_t` handle.  That VM client abstraction is meant to
82  * reduce contention and overhead for frequent access operations and provide
83  * debugging insight into how different subcomponents are accessing the vmspace.
84  * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
85  * interface) and each VMM userspace segment mapping.
86  *
87  * Exclusion
88  *
89  * Making changes to the vmspace (such as mapping or unmapping regions) requires
90  * other accessors be excluded while the change is underway to prevent them from
91  * observing invalid intermediate states.  A simple approach could use a mutex
92  * or rwlock to achieve this, but that risks contention when the rate of access
93  * to the vmspace is high.
94  *
95  * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
96  * on a per-vm_client_t basis.  While this raises the cost for vmspace changes,
97  * it means that the much more common page accesses through the vm_client can
98  * normally proceed unimpeded and independently.
99  *
100  * When a change to the vmspace is required, the caller will put the vmspace in
101  * a 'hold' state, iterating over all associated vm_client instances, waiting
102  * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
103  * setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any call on
104  * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
105  * will block until the hold condition is cleared.  Once the hold is asserted
106  * for all clients, the vmspace change can proceed with confidence.  Upon
107  * completion of that operation, VCS_HOLD is cleared from the clients, and they
108  * are released to resume vmspace accesses.
109  *
110  * vCPU Consumers
111  *
112  * Access to the vmspace for vCPUs running in guest context is different from
113  * emulation-related vm_client activity: they solely rely on the contents of the
114  * page tables.  Furthermore, the existing VCS_HOLD mechanism used to exclude
115  * client access is not feasible when entering guest context, since interrupts
116  * are disabled, making it impossible to block entry.  This is not a concern as
117  * long as vmspace modifications never place the page tables in invalid states
118  * (either intermediate, or final).  The vm_client hold mechanism does provide
119  * the means to IPI vCPU consumers which will trigger a notification once they
120  * report their exit from guest context.  This can be used to ensure that page
121  * table modifications are made visible to those vCPUs within a certain
122  * time frame.
123  */
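
/*
 * Example (illustrative sketch only, not code reachable from this file): the
 * common data path for a hypothetical emulation consumer is to take a page
 * hold through its vm_client_t, access the backing memory via the returned
 * kernel-virtual pointer, and then drop the hold.  Error handling and the
 * means by which `vmc` and `gpa` are obtained are elided; those names are
 * placeholders.
 *
 *	vm_page_t *vmp;
 *
 *	vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		uint8_t *datap = vmp_get_writable(vmp);
 *
 *		datap[gpa & PAGEOFFSET] = 0xff;
 *		(void) vmp_release(vmp);
 *	}
 */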
124 
125 typedef struct vmspace_mapping {
126 	list_node_t	vmsm_node;
127 	vm_object_t	*vmsm_object;	/* object backing this mapping */
128 	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
129 	size_t		vmsm_len;	/* length (in bytes) of mapping */
130 	off_t		vmsm_offset;	/* byte offset into object */
131 	uint_t		vmsm_prot;
132 } vmspace_mapping_t;
133 
134 #define	VMSM_OFFSET(vmsm, addr)	(			\
135 	    (vmsm)->vmsm_offset +			\
136 	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
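
/*
 * Worked example (values hypothetical): for a mapping with
 * vmsm_addr=0x80000000 and vmsm_offset=0x2000, a guest-physical address of
 * 0x80003000 translates to object offset 0x2000 + (0x80003000 - 0x80000000)
 * = 0x5000.
 */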
137 
138 typedef enum vm_client_state {
139 	VCS_IDLE	= 0,
140 	/* currently accessing vmspace for client operation (hold or fault) */
141 	VCS_ACTIVE	= (1 << 0),
142 	/* client hold requested/asserted */
143 	VCS_HOLD	= (1 << 1),
144 	/* vCPU is accessing page tables in guest context */
145 	VCS_ON_CPU	= (1 << 2),
146 	/* client has been orphaned (no more access to vmspace) */
147 	VCS_ORPHANED	= (1 << 3),
148 	/* client undergoing destroy operation */
149 	VCS_DESTROY	= (1 << 4),
150 } vm_client_state_t;
151 
152 struct vmspace {
153 	kmutex_t	vms_lock;
154 	kcondvar_t	vms_cv;
155 	bool		vms_held;
156 	uintptr_t	vms_size;	/* immutable after creation */
157 
158 	/* (nested) page table state */
159 	vmm_gpt_t	*vms_gpt;
160 	uint64_t	vms_pt_gen;
161 	uint64_t	vms_pages_mapped;
162 	bool		vms_track_dirty;
163 
164 	list_t		vms_maplist;
165 	list_t		vms_clients;
166 };
167 
168 struct vm_client {
169 	vmspace_t	*vmc_space;
170 	list_node_t	vmc_node;
171 
172 	kmutex_t	vmc_lock;
173 	kcondvar_t	vmc_cv;
174 	vm_client_state_t vmc_state;
175 	int		vmc_cpu_active;
176 	uint64_t	vmc_cpu_gen;
177 	bool		vmc_track_dirty;
178 	vmc_inval_cb_t	vmc_inval_func;
179 	void		*vmc_inval_data;
180 
181 	list_t		vmc_held_pages;
182 };
183 
184 typedef enum vm_object_type {
185 	VMOT_NONE,
186 	VMOT_MEM,
187 	VMOT_MMIO,
188 } vm_object_type_t;
189 
190 struct vm_object {
191 	uint_t		vmo_refcnt;	/* manipulated with atomic ops */
192 
193 	/* Fields below are fixed at creation time */
194 	vm_object_type_t vmo_type;
195 	size_t		vmo_size;
196 	void		*vmo_data;
197 	uint8_t		vmo_attr;
198 };
199 
200 /* Convenience consolidation of all flag(s) for validity checking */
201 #define	VPF_ALL		(VPF_DEFER_DIRTY)
202 
203 struct vm_page {
204 	vm_client_t	*vmp_client;
205 	list_node_t	vmp_node;
206 	vm_page_t	*vmp_chain;
207 	uintptr_t	vmp_gpa;
208 	pfn_t		vmp_pfn;
209 	uint64_t	*vmp_ptep;
210 	vm_object_t	*vmp_obj_ref;
211 	uint8_t		vmp_prot;
212 	uint8_t		vmp_flags;
213 };
214 
215 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
216 static void vmspace_hold_enter(vmspace_t *);
217 static void vmspace_hold_exit(vmspace_t *, bool);
218 static void vmspace_clients_invalidate(vmspace_t *, uintptr_t, size_t);
219 static int vmspace_ensure_mapped(vmspace_t *, uintptr_t, int, pfn_t *,
220     uint64_t *);
221 static void vmc_space_hold(vm_client_t *);
222 static void vmc_space_release(vm_client_t *, bool);
223 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
224 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
225 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
226 
227 bool
228 vmm_vm_init(void)
229 {
230 	if (vmm_is_intel()) {
231 		extern struct vmm_pte_impl ept_pte_impl;
232 		return (vmm_gpt_init(&ept_pte_impl));
233 	} else if (vmm_is_svm()) {
234 		extern struct vmm_pte_impl rvi_pte_impl;
235 		return (vmm_gpt_init(&rvi_pte_impl));
236 	} else {
237 		/* Caller should have already rejected other vendors */
238 		panic("Unexpected hypervisor hardware vendor");
239 	}
240 }
241 
242 void
243 vmm_vm_fini(void)
244 {
245 	vmm_gpt_fini();
246 }
247 
248 /*
249  * Create a new vmspace with a maximum address of `end`.
250  */
251 vmspace_t *
252 vmspace_alloc(size_t end)
253 {
254 	vmspace_t *vms;
255 	const uintptr_t size = end + 1;
256 
257 	/*
258 	 * This whole mess is built on the assumption that a 64-bit address
259 	 * space is available to work with for the various pagetable tricks.
260 	 */
261 	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
262 	    size <= (uintptr_t)USERLIMIT);
263 
264 	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
265 	vms->vms_size = size;
266 	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
267 	    offsetof(vmspace_mapping_t, vmsm_node));
268 	list_create(&vms->vms_clients, sizeof (vm_client_t),
269 	    offsetof(vm_client_t, vmc_node));
270 
271 	vms->vms_gpt = vmm_gpt_alloc();
272 	vms->vms_pt_gen = 1;
273 	vms->vms_track_dirty = false;
274 
275 	return (vms);
276 }
277 
278 /*
279  * Destroy a vmspace.  All regions in the space must be unmapped.  Any remaining
280  * clients will be orphaned.
281  */
282 void
283 vmspace_destroy(vmspace_t *vms)
284 {
285 	mutex_enter(&vms->vms_lock);
286 	VERIFY(list_is_empty(&vms->vms_maplist));
287 
288 	if (!list_is_empty(&vms->vms_clients)) {
289 		vm_client_t *vmc = list_head(&vms->vms_clients);
290 		while (vmc != NULL) {
291 			vmc = vmc_space_orphan(vmc, vms);
292 		}
293 		/*
294 		 * Wait for any clients which were in the process of destroying
295 		 * themselves to disappear.
296 		 */
297 		while (!list_is_empty(&vms->vms_clients)) {
298 			cv_wait(&vms->vms_cv, &vms->vms_lock);
299 		}
300 	}
301 	VERIFY(list_is_empty(&vms->vms_clients));
302 
303 	vmm_gpt_free(vms->vms_gpt);
304 	mutex_exit(&vms->vms_lock);
305 
306 	mutex_destroy(&vms->vms_lock);
307 	cv_destroy(&vms->vms_cv);
308 	list_destroy(&vms->vms_maplist);
309 	list_destroy(&vms->vms_clients);
310 
311 	kmem_free(vms, sizeof (*vms));
312 }
313 
314 /*
315  * Retrieve the count of resident (mapped into the page tables) pages.
316  */
317 uint64_t
318 vmspace_resident_count(vmspace_t *vms)
319 {
320 	return (vms->vms_pages_mapped);
321 }
322 
323 /*
324  * Perform an operation on the status (accessed/dirty) bits held in the page
325  * tables of this vmspace.
326  *
327  * Such manipulations race against both hardware writes (from running vCPUs) and
328  * emulated accesses reflected from userspace.  Safe functionality depends on
329  * the VM instance being read-locked to prevent vmspace_map/vmspace_unmap
330  * operations from changing the page tables during the walk.
331  */
332 void
333 vmspace_bits_operate(vmspace_t *vms, const uint64_t gpa, size_t len,
334     vmspace_bit_oper_t oper, uint8_t *bitmap)
335 {
336 	const bool bit_input = (oper & VBO_FLAG_BITMAP_IN) != 0;
337 	const bool bit_output = (oper & VBO_FLAG_BITMAP_OUT) != 0;
338 	const vmspace_bit_oper_t oper_only =
339 	    oper & ~(VBO_FLAG_BITMAP_IN | VBO_FLAG_BITMAP_OUT);
340 	vmm_gpt_t *gpt = vms->vms_gpt;
341 
342 	/*
343 	 * The bitmap cannot be NULL if the requested operation involves reading
344 	 * or writing from it.
345 	 */
346 	ASSERT(bitmap != NULL || (!bit_input && !bit_output));
347 
348 	vmm_gpt_iter_t iter;
349 	vmm_gpt_iter_entry_t entry;
350 	vmm_gpt_iter_init(&iter, gpt, gpa, len);
351 
352 	while (vmm_gpt_iter_next(&iter, &entry)) {
353 		const size_t offset = (entry.vgie_gpa - gpa);
354 		const uint64_t pfn_offset = offset >> PAGESHIFT;
355 		const size_t bit_offset = pfn_offset / 8;
356 		const uint8_t bit_mask = 1 << (pfn_offset % 8);
357 
358 		if (bit_input && (bitmap[bit_offset] & bit_mask) == 0) {
359 			continue;
360 		}
361 
362 		bool value = false;
363 		uint64_t *ptep = entry.vgie_ptep;
364 		if (ptep == NULL) {
365 			if (bit_output) {
366 				bitmap[bit_offset] &= ~bit_mask;
367 			}
368 			continue;
369 		}
370 
371 		switch (oper_only) {
372 		case VBO_GET_DIRTY:
373 			value = vmm_gpte_query_dirty(ptep);
374 			break;
375 		case VBO_SET_DIRTY: {
376 			uint_t prot = 0;
377 			bool present_writable = false;
378 			pfn_t pfn;
379 
380 			/*
381 			 * To avoid blindly setting the dirty bit on otherwise
382 			 * empty PTEs, we must first check if the entry for the
383 			 * address in question has been populated.
384 			 *
385 			 * Only if the page is marked both Present and Writable
386 			 * will we permit the dirty bit to be set.
387 			 */
388 			if (!vmm_gpte_is_mapped(ptep, &pfn, &prot)) {
389 				int err = vmspace_ensure_mapped(vms,
390 				    entry.vgie_gpa, PROT_WRITE, &pfn, ptep);
391 				if (err == 0) {
392 					present_writable = true;
393 				}
394 			} else if ((prot & PROT_WRITE) != 0) {
395 				present_writable = true;
396 			}
397 
398 			if (present_writable) {
399 				value = !vmm_gpte_reset_dirty(ptep, true);
400 			}
401 			break;
402 		}
403 		case VBO_RESET_DIRTY:
404 			/*
405 			 * Although at first glance, it may seem like the act of
406 			 * resetting the dirty bit may require the same care as
407 			 * setting it, the constraints make for a simpler task.
408 			 *
409 			 * Any PTEs with the dirty bit set will have already
410 			 * been properly populated.
411 			 */
412 			value = vmm_gpte_reset_dirty(ptep, false);
413 			break;
414 		default:
415 			panic("unrecognized operator: %d", oper_only);
416 			break;
417 		}
418 		if (bit_output) {
419 			if (value) {
420 				bitmap[bit_offset] |= bit_mask;
421 			} else {
422 				bitmap[bit_offset] &= ~bit_mask;
423 			}
424 		}
425 	}
426 
427 	/*
428 	 * Invalidate the address range potentially affected by the changes to
429 	 * page table bits, issuing shoot-downs for those who might have it in
430 	 * cache.
431 	 */
432 	vmspace_hold_enter(vms);
433 	vms->vms_pt_gen++;
434 	vmspace_clients_invalidate(vms, gpa, len);
435 	vmspace_hold_exit(vms, true);
436 }
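
/*
 * Illustrative sketch (not called from this file): harvesting and clearing
 * the dirty bits for a page-aligned region, assuming dirty tracking has been
 * enabled via vmspace_set_tracking() and the VM instance is read-locked as
 * described above.  The caller supplies a bitmap with one bit per page in the
 * region; `vms`, `gpa`, and `len` are placeholders.
 *
 *	const size_t npages = len / PAGESIZE;
 *	uint8_t *bitmap = kmem_zalloc(howmany(npages, 8), KM_SLEEP);
 *
 *	vmspace_bits_operate(vms, gpa, len,
 *	    VBO_RESET_DIRTY | VBO_FLAG_BITMAP_OUT, bitmap);
 */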
437 
438 /*
439  * Is dirty-page-tracking enabled for the vmspace?
440  */
441 bool
442 vmspace_get_tracking(vmspace_t *vms)
443 {
444 	mutex_enter(&vms->vms_lock);
445 	const bool val = vms->vms_track_dirty;
446 	mutex_exit(&vms->vms_lock);
447 	return (val);
448 }
449 
450 /*
451  * Set the state (enabled/disabled) of dirty-page-tracking for the vmspace.
452  */
453 int
454 vmspace_set_tracking(vmspace_t *vms, bool enable_dirty_tracking)
455 {
456 	if (enable_dirty_tracking && !vmm_gpt_can_track_dirty(vms->vms_gpt)) {
457 		/* Do not allow this to be set if it is not supported */
458 		return (ENOTSUP);
459 	}
460 
461 	vmspace_hold_enter(vms);
462 	if (vms->vms_track_dirty == enable_dirty_tracking) {
463 		/* No further effort required if state already matches */
464 		vmspace_hold_exit(vms, false);
465 		return (0);
466 	}
467 
468 	vms->vms_track_dirty = enable_dirty_tracking;
469 
470 	/* Configure all existing clients for new tracking behavior */
471 	for (vm_client_t *vmc = list_head(&vms->vms_clients);
472 	    vmc != NULL;
473 	    vmc = list_next(&vms->vms_clients, vmc)) {
474 		mutex_enter(&vmc->vmc_lock);
475 		vmc->vmc_track_dirty = enable_dirty_tracking;
476 		mutex_exit(&vmc->vmc_lock);
477 	}
478 
479 	/*
480 	 * Notify all clients of what is considered an invalidation of the
481 	 * entire vmspace.
482 	 */
483 	vms->vms_pt_gen++;
484 	vmspace_clients_invalidate(vms, 0, vms->vms_size);
485 
486 	vmspace_hold_exit(vms, true);
487 	return (0);
488 }
489 
490 static pfn_t
491 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
492 {
493 	vmmr_region_t *region;
494 	pfn_t pfn;
495 
496 	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
497 
498 	region = vmo->vmo_data;
499 	pfn = vmmr_region_pfn_at(region, off);
500 
501 	return (pfn);
502 }
503 
504 static pfn_t
505 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
506 {
507 	pfn_t pfn;
508 
509 	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
510 	ASSERT3P(vmo->vmo_data, !=, NULL);
511 	ASSERT3U(off, <, vmo->vmo_size);
512 
513 	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
514 
515 	return (pfn);
516 }
517 
518 /*
519  * Allocate a VM object backed by VMM reservoir memory.
520  */
521 vm_object_t *
522 vm_object_mem_allocate(size_t size, bool transient)
523 {
524 	int err;
525 	vmmr_region_t *region = NULL;
526 	vm_object_t *vmo;
527 
528 	ASSERT3U(size, !=, 0);
529 	ASSERT3U(size & PAGEOFFSET, ==, 0);
530 
531 	err = vmmr_alloc(size, transient, &region);
532 	if (err != 0) {
533 		return (NULL);
534 	}
535 
536 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
537 
538 	/* For now, these are to stay fixed after allocation */
539 	vmo->vmo_type = VMOT_MEM;
540 	vmo->vmo_size = size;
541 	vmo->vmo_attr = MTRR_TYPE_WB;
542 	vmo->vmo_data = region;
543 	vmo->vmo_refcnt = 1;
544 
545 	return (vmo);
546 }
547 
548 static vm_object_t *
549 vm_object_mmio_allocate(size_t size, uintptr_t hpa)
550 {
551 	vm_object_t *vmo;
552 
553 	ASSERT3U(size, !=, 0);
554 	ASSERT3U(size & PAGEOFFSET, ==, 0);
555 	ASSERT3U(hpa & PAGEOFFSET, ==, 0);
556 
557 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
558 
559 	/* For now, these are to stay fixed after allocation */
560 	vmo->vmo_type = VMOT_MMIO;
561 	vmo->vmo_size = size;
562 	vmo->vmo_attr = MTRR_TYPE_UC;
563 	vmo->vmo_data = (void *)hpa;
564 	vmo->vmo_refcnt = 1;
565 
566 	return (vmo);
567 }
568 
569 /*
570  * Allocate a VM object backed by an existing range of physical memory.
571  */
572 vm_object_t *
573 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
574 {
575 	int error;
576 	vm_object_t *obj;
577 
578 	obj = vm_object_mmio_allocate(len, hpa);
579 	if (obj != NULL) {
580 		error = vmspace_map(vmspace, obj, 0, gpa, len,
581 		    PROT_READ | PROT_WRITE);
582 		if (error != 0) {
583 			vm_object_release(obj);
584 			obj = NULL;
585 		}
586 	}
587 
588 	return (obj);
589 }
590 
591 /*
592  * Release a vm_object reference
593  */
594 void
595 vm_object_release(vm_object_t *vmo)
596 {
597 	ASSERT(vmo != NULL);
598 
599 	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
600 	/* underflow would be a deadly serious mistake */
601 	VERIFY3U(ref, !=, UINT_MAX);
602 	if (ref != 0) {
603 		return;
604 	}
605 
606 	switch (vmo->vmo_type) {
607 	case VMOT_MEM:
608 		vmmr_free((vmmr_region_t *)vmo->vmo_data);
609 		break;
610 	case VMOT_MMIO:
611 		break;
612 	default:
613 		panic("unexpected object type %u", vmo->vmo_type);
614 		break;
615 	}
616 
617 	vmo->vmo_data = NULL;
618 	vmo->vmo_size = 0;
619 	kmem_free(vmo, sizeof (*vmo));
620 }
621 
622 /*
623  * Increase refcount for vm_object reference
624  */
625 void
626 vm_object_reference(vm_object_t *vmo)
627 {
628 	ASSERT(vmo != NULL);
629 
630 	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
631 	/* overflow would be a deadly serious mistake */
632 	VERIFY3U(ref, !=, 0);
633 }
634 
635 /*
636  * Get the host-physical PFN for a given offset into a vm_object.
637  *
638  * The provided `off` must be within the allocated size of the vm_object.
639  */
640 pfn_t
641 vm_object_pfn(vm_object_t *vmo, uintptr_t off)
642 {
643 	const uintptr_t aligned_off = off & PAGEMASK;
644 
645 	switch (vmo->vmo_type) {
646 	case VMOT_MEM:
647 		return (vm_object_pager_reservoir(vmo, aligned_off));
648 	case VMOT_MMIO:
649 		return (vm_object_pager_mmio(vmo, aligned_off));
650 	case VMOT_NONE:
651 		break;
652 	}
653 	panic("unexpected object type %u", vmo->vmo_type);
654 }
655 
656 static vmspace_mapping_t *
657 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
658 {
659 	vmspace_mapping_t *vmsm;
660 	list_t *ml = &vms->vms_maplist;
661 	const uintptr_t range_end = addr + size;
662 
663 	ASSERT3U(addr, <=, range_end);
664 
665 	if (addr >= vms->vms_size) {
666 		return (NULL);
667 	}
668 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
669 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
670 
671 		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
672 			if (range_end <= seg_end) {
673 				return (vmsm);
674 			} else {
675 				return (NULL);
676 			}
677 		}
678 	}
679 	return (NULL);
680 }
681 
682 /*
683  * Check to see if any mappings reside within [addr, addr + size) span in the
684  * vmspace, returning true if that span is indeed empty.
685  */
686 static bool
687 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
688 {
689 	vmspace_mapping_t *vmsm;
690 	list_t *ml = &vms->vms_maplist;
691 	const uintptr_t range_end = addr + size - 1;
692 
693 	ASSERT(MUTEX_HELD(&vms->vms_lock));
694 	ASSERT(size > 0);
695 
696 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
697 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
698 
699 		/*
700 		 * The two ranges do not overlap if the start of either of
701 		 * them is after the end of the other.
702 		 */
703 		if (vmsm->vmsm_addr > range_end || addr > seg_end)
704 			continue;
705 		return (false);
706 	}
707 	return (true);
708 }
709 
710 static void
711 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
712 {
713 	list_t *ml = &vms->vms_maplist;
714 
715 	ASSERT(MUTEX_HELD(&vms->vms_lock));
716 	ASSERT(vms->vms_held);
717 
718 	list_remove(ml, vmsm);
719 	vm_object_release(vmsm->vmsm_object);
720 	kmem_free(vmsm, sizeof (*vmsm));
721 }
722 
723 /*
724  * Enter a hold state on the vmspace.  This ensures that all VM clients
725  * associated with the vmspace are excluded from establishing new page holds,
726  * or any other actions which would require accessing vmspace state subject to
727  * potential change.
728  *
729  * Returns with vmspace_t`vms_lock held.
730  */
731 static void
732 vmspace_hold_enter(vmspace_t *vms)
733 {
734 	mutex_enter(&vms->vms_lock);
735 	VERIFY(!vms->vms_held);
736 
737 	vm_client_t *vmc = list_head(&vms->vms_clients);
738 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
739 		vmc_space_hold(vmc);
740 	}
741 	vms->vms_held = true;
742 }
743 
744 /*
745  * Exit a hold state on the vmspace.  This releases all VM clients associated
746  * with the vmspace to be able to establish new page holds, and partake in other
747  * actions which require accessing changed vmspace state.  If `kick_on_cpu` is
748  * true, then any CPUs actively using the page tables will be IPIed, and the
749  * call will block until they have acknowledged being ready to use the latest
750  * state of the tables.
751  *
752  * Requires vmspace_t`vms_lock be held, which is released as part of the call.
753  */
754 static void
755 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
756 {
757 	ASSERT(MUTEX_HELD(&vms->vms_lock));
758 	VERIFY(vms->vms_held);
759 
760 	vm_client_t *vmc = list_head(&vms->vms_clients);
761 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
762 		vmc_space_release(vmc, kick_on_cpu);
763 	}
764 	vms->vms_held = false;
765 	mutex_exit(&vms->vms_lock);
766 }
767 
768 static void
769 vmspace_clients_invalidate(vmspace_t *vms, uintptr_t gpa, size_t len)
770 {
771 	ASSERT(MUTEX_HELD(&vms->vms_lock));
772 	VERIFY(vms->vms_held);
773 
774 	for (vm_client_t *vmc = list_head(&vms->vms_clients);
775 	    vmc != NULL;
776 	    vmc = list_next(&vms->vms_clients, vmc)) {
777 		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
778 	}
779 }
780 
781 /*
782  * Attempt to map a vm_object span into the vmspace.
783  *
784  * Requirements:
785  * - `obj_off`, `addr`, and `len` must be page-aligned
786  * - `obj_off` cannot be greater than the allocated size of the object
787  * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
788  *   size of the object
789  * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
790  *   of the vmspace
791  */
792 int
793 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
794     size_t len, uint8_t prot)
795 {
796 	vmspace_mapping_t *vmsm;
797 	int res = 0;
798 
799 	if (len == 0 || (addr + len) < addr ||
800 	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
801 		return (EINVAL);
802 	}
803 	if ((addr + len) >= vms->vms_size) {
804 		return (ENOMEM);
805 	}
806 
807 	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
808 
809 	vmspace_hold_enter(vms);
810 	if (!vm_mapping_gap(vms, addr, len)) {
811 		kmem_free(vmsm, sizeof (*vmsm));
812 		res = ENOMEM;
813 	} else {
814 		vmsm->vmsm_object = vmo;
815 		vmsm->vmsm_addr = addr;
816 		vmsm->vmsm_len = len;
817 		vmsm->vmsm_offset = (off_t)obj_off;
818 		vmsm->vmsm_prot = prot;
819 		list_insert_tail(&vms->vms_maplist, vmsm);
820 
821 		/*
822 		 * Make sure the GPT has tables ready for leaf entries across
823 		 * the entire new mapping.
824 		 */
825 		vmm_gpt_populate_region(vms->vms_gpt, addr, len);
826 	}
827 	vmspace_hold_exit(vms, false);
828 	return (res);
829 }
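
/*
 * Illustrative sketch (mirroring vmm_mmio_alloc() above, but for reservoir
 * memory): back a page-aligned guest-physical range with a freshly allocated
 * memory object.  `vms`, `gpa`, and `len` are placeholders; as in
 * vmm_mmio_alloc(), the object reference is released if the mapping attempt
 * fails.
 *
 *	vm_object_t *vmo = vm_object_mem_allocate(len, false);
 *
 *	if (vmo != NULL &&
 *	    vmspace_map(vms, vmo, 0, gpa, len, PROT_READ | PROT_WRITE) != 0) {
 *		vm_object_release(vmo);
 *		vmo = NULL;
 *	}
 */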
830 
831 /*
832  * Unmap a region of the vmspace.
833  *
834  * Presently the [start, end) span must equal a region previously mapped by a
835  * call to vmspace_map().
836  */
837 int
838 vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
839 {
840 	const uintptr_t end = addr + len;
841 	vmspace_mapping_t *vmsm;
842 	vm_client_t *vmc;
843 	uint64_t gen = 0;
844 
845 	ASSERT3U(addr, <, end);
846 
847 	vmspace_hold_enter(vms);
848 	/* expect to match existing mapping exactly */
849 	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
850 	    vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
851 		vmspace_hold_exit(vms, false);
852 		return (ENOENT);
853 	}
854 
855 	/* Prepare clients (and their held pages) for the unmap. */
856 	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
857 	    vmc = list_next(&vms->vms_clients, vmc)) {
858 		vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
859 	}
860 
861 	/* Clear all PTEs for region */
862 	if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
863 		vms->vms_pt_gen++;
864 		gen = vms->vms_pt_gen;
865 	}
866 	/* ... and the intermediate (directory) PTEs as well */
867 	vmm_gpt_vacate_region(vms->vms_gpt, addr, len);
868 
869 	/*
870 	 * If pages were actually unmapped from the GPT, provide clients with
871 	 * an invalidation notice.
872 	 */
873 	if (gen != 0) {
874 		vmspace_clients_invalidate(vms, addr, len);
875 	}
876 
877 	vm_mapping_remove(vms, vmsm);
878 	vmspace_hold_exit(vms, true);
879 	return (0);
880 }
881 
882 /*
883  * For a given GPA in the vmspace, ensure that the backing page (if any) is
884  * properly mapped as present in the provided PTE.
885  */
886 static int
887 vmspace_ensure_mapped(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
888     uint64_t *leaf_pte)
889 {
890 	vmspace_mapping_t *vmsm;
891 	vm_object_t *vmo;
892 	pfn_t pfn;
893 
894 	ASSERT(pfnp != NULL);
895 	ASSERT(leaf_pte != NULL);
896 
897 	vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
898 	if (vmsm == NULL) {
899 		return (FC_NOMAP);
900 	}
901 	if ((req_prot & vmsm->vmsm_prot) != req_prot) {
902 		return (FC_PROT);
903 	}
904 
905 	vmo = vmsm->vmsm_object;
906 	pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
907 	VERIFY(pfn != PFN_INVALID);
908 
909 	if (vmm_gpt_map_at(vms->vms_gpt, leaf_pte, pfn, vmsm->vmsm_prot,
910 	    vmo->vmo_attr)) {
911 		atomic_inc_64(&vms->vms_pages_mapped);
912 	}
913 
914 	*pfnp = pfn;
915 	return (0);
916 }
917 
918 /*
919  * Look up the PTE for a given GPA in the vmspace, populating it with
920  * appropriate contents (pfn, protection, etc) if it is empty, but backed by a
921  * valid mapping.
922  */
923 static int
924 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
925     uint64_t **ptepp)
926 {
927 	vmm_gpt_t *gpt = vms->vms_gpt;
928 	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
929 	pfn_t pfn = PFN_INVALID;
930 	uint_t prot;
931 
932 	ASSERT0(gpa & PAGEOFFSET);
933 	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
934 
935 	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
936 	leaf = entries[LEVEL1];
937 	if (leaf == NULL) {
938 		/*
939 		 * Since we populated the intermediate tables for any regions
940 		 * mapped in the GPT, an empty leaf entry indicates there is no
941 		 * mapping, populated or not, at this GPA.
942 		 */
943 		return (FC_NOMAP);
944 	}
945 
946 	if (vmm_gpte_is_mapped(leaf, &pfn, &prot)) {
947 		if ((req_prot & prot) != req_prot) {
948 			return (FC_PROT);
949 		}
950 	} else {
951 		int err = vmspace_ensure_mapped(vms, gpa, req_prot, &pfn, leaf);
952 		if (err != 0) {
953 			return (err);
954 		}
955 	}
956 
957 	ASSERT(pfn != PFN_INVALID && leaf != NULL);
958 	if (pfnp != NULL) {
959 		*pfnp = pfn;
960 	}
961 	if (ptepp != NULL) {
962 		*ptepp = leaf;
963 	}
964 	return (0);
965 }
966 
967 /*
968  * Populate (make resident in the page tables) a region of the vmspace.
969  *
970  * Presently the [start, end) span must equal a region previously mapped by a
971  * call to vmspace_map().
972  */
973 int
974 vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
975 {
976 	ASSERT0(addr & PAGEOFFSET);
977 	ASSERT0(len & PAGEOFFSET);
978 
979 	vmspace_mapping_t *vmsm;
980 	mutex_enter(&vms->vms_lock);
981 
982 	/* For the time being, only exact-match mappings are expected */
983 	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
984 		mutex_exit(&vms->vms_lock);
985 		return (FC_NOMAP);
986 	}
987 
988 	vm_object_t *vmo = vmsm->vmsm_object;
989 	const int prot = vmsm->vmsm_prot;
990 	const uint8_t attr = vmo->vmo_attr;
991 	vmm_gpt_t *gpt = vms->vms_gpt;
992 	size_t populated = 0;
993 
994 	vmm_gpt_iter_t iter;
995 	vmm_gpt_iter_entry_t entry;
996 	vmm_gpt_iter_init(&iter, gpt, addr, len);
997 	while (vmm_gpt_iter_next(&iter, &entry)) {
998 		const pfn_t pfn =
999 		    vm_object_pfn(vmo, VMSM_OFFSET(vmsm, entry.vgie_gpa));
1000 		VERIFY(pfn != PFN_INVALID);
1001 
1002 		if (vmm_gpt_map_at(gpt, entry.vgie_ptep, pfn, prot, attr)) {
1003 			populated++;
1004 		}
1005 	}
1006 	atomic_add_64(&vms->vms_pages_mapped, populated);
1007 
1008 	mutex_exit(&vms->vms_lock);
1009 	return (0);
1010 }
1011 
1012 /*
1013  * Allocate a client from a given vmspace.
1014  */
1015 vm_client_t *
1016 vmspace_client_alloc(vmspace_t *vms)
1017 {
1018 	vm_client_t *vmc;
1019 
1020 	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
1021 	vmc->vmc_space = vms;
1022 	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
1023 	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
1024 	vmc->vmc_state = VCS_IDLE;
1025 	vmc->vmc_cpu_active = -1;
1026 	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
1027 	    offsetof(vm_page_t, vmp_node));
1028 	vmc->vmc_track_dirty = vms->vms_track_dirty;
1029 
1030 	mutex_enter(&vms->vms_lock);
1031 	list_insert_tail(&vms->vms_clients, vmc);
1032 	mutex_exit(&vms->vms_lock);
1033 
1034 	return (vmc);
1035 }
1036 
1037 /*
1038  * Get the nested page table root pointer (EPTP/NCR3) value.
1039  */
1040 uint64_t
1041 vmspace_table_root(vmspace_t *vms)
1042 {
1043 	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
1044 }
1045 
1046 /*
1047  * Get the current generation number of the nested page table.
1048  */
1049 uint64_t
1050 vmspace_table_gen(vmspace_t *vms)
1051 {
1052 	return (vms->vms_pt_gen);
1053 }
1054 
1055 /*
1056  * Mark a vm_client as active.  This will block if/while the client is held by
1057  * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It will
1058  * fail if the vm_client has been orphaned.
1059  */
1060 static int
1061 vmc_activate(vm_client_t *vmc)
1062 {
1063 	mutex_enter(&vmc->vmc_lock);
1064 	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
1065 	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
1066 		mutex_exit(&vmc->vmc_lock);
1067 		return (ENXIO);
1068 	}
1069 	while ((vmc->vmc_state & VCS_HOLD) != 0) {
1070 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1071 	}
1072 	vmc->vmc_state |= VCS_ACTIVE;
1073 	return (0);
1074 }
1075 
1076 /*
1077  * Mark a vm_client as no longer active.  It must be called with
1078  * vm_client_t`vmc_lock already held, and will return with it released.
1079  */
1080 static void
1081 vmc_deactivate(vm_client_t *vmc)
1082 {
1083 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1084 	VERIFY(vmc->vmc_state & VCS_ACTIVE);
1085 
1086 	vmc->vmc_state ^= VCS_ACTIVE;
1087 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
1088 		cv_broadcast(&vmc->vmc_cv);
1089 	}
1090 	mutex_exit(&vmc->vmc_lock);
1091 }
1092 
1093 /*
1094  * Indicate that a CPU will be utilizing the nested page tables through this VM
1095  * client.  Interrupts (and/or the GIF) are expected to be disabled when calling
1096  * this function.  Returns the generation number of the nested page table (to be
1097  * used for TLB invalidations).
1098  */
1099 uint64_t
1100 vmc_table_enter(vm_client_t *vmc)
1101 {
1102 	vmspace_t *vms = vmc->vmc_space;
1103 	uint64_t gen;
1104 
1105 	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1106 	ASSERT3S(vmc->vmc_cpu_active, ==, -1);
1107 
1108 	/*
1109 	 * Since the NPT activation occurs with interrupts disabled, this must
1110 	 * be done without taking vmc_lock like normal.
1111 	 */
1112 	gen = vms->vms_pt_gen;
1113 	vmc->vmc_cpu_active = CPU->cpu_id;
1114 	vmc->vmc_cpu_gen = gen;
1115 	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
1116 
1117 	return (gen);
1118 }
1119 
1120 /*
1121  * Indicate that this VM client is no longer (directly) using the underlying
1122  * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
1123  * this function.
1124  */
1125 void
1126 vmc_table_exit(vm_client_t *vmc)
1127 {
1128 	mutex_enter(&vmc->vmc_lock);
1129 
1130 	ASSERT(vmc->vmc_state & VCS_ON_CPU);
1131 	vmc->vmc_state ^= VCS_ON_CPU;
1132 	vmc->vmc_cpu_active = -1;
1133 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
1134 		cv_broadcast(&vmc->vmc_cv);
1135 	}
1136 
1137 	mutex_exit(&vmc->vmc_lock);
1138 }
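
/*
 * Illustrative sketch of how a vCPU consumer brackets guest entry with the
 * table enter/exit calls.  Interrupt masking, the TLB flush mechanism, and
 * the actual VM-entry are backend (VMX/SVM) specific and are elided here;
 * `last_gen` is a hypothetical per-vCPU variable.
 *
 *	const uint64_t gen = vmc_table_enter(vmc);
 *
 *	if (gen != last_gen) {
 *		(backend-specific flush of guest TLB context)
 *		last_gen = gen;
 *	}
 *	(enter guest context)
 *	vmc_table_exit(vmc);
 */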
1139 
1140 static void
1141 vmc_space_hold(vm_client_t *vmc)
1142 {
1143 	mutex_enter(&vmc->vmc_lock);
1144 	VERIFY0(vmc->vmc_state & VCS_HOLD);
1145 
1146 	/*
1147 	 * Because vmc_table_enter() alters vmc_state from a context where
1148 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
1149 	 * VCS_HOLD must be done atomically here.
1150 	 */
1151 	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
1152 
1153 	/* Wait for client to go inactive */
1154 	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
1155 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1156 	}
1157 	mutex_exit(&vmc->vmc_lock);
1158 }
1159 
1160 static void
1161 vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
1162 {
1163 	mutex_enter(&vmc->vmc_lock);
1164 	VERIFY(vmc->vmc_state & VCS_HOLD);
1165 
1166 	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
1167 		poke_cpu(vmc->vmc_cpu_active);
1168 
1169 		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1170 			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1171 		}
1172 	}
1173 
1174 	/*
1175 	 * Because vmc_table_enter() alters vmc_state from a context where
1176 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
1177 	 * VCS_HOLD must be done atomically here.
1178 	 */
1179 	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
1180 	cv_broadcast(&vmc->vmc_cv);
1181 	mutex_exit(&vmc->vmc_lock);
1182 }
1183 
1184 static void
1185 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
1186     uint64_t gen)
1187 {
1188 	mutex_enter(&vmc->vmc_lock);
1189 	VERIFY(vmc->vmc_state & VCS_HOLD);
1190 	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1191 		/*
1192 		 * Wait for clients using an old generation of the page tables
1193 		 * to exit guest context, where they subsequently flush the TLB
1194 		 * for the new generation.
1195 		 */
1196 		if (vmc->vmc_cpu_gen < gen) {
1197 			poke_cpu(vmc->vmc_cpu_active);
1198 
1199 			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1200 				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1201 			}
1202 		}
1203 	}
1204 	if (vmc->vmc_inval_func != NULL) {
1205 		vmc_inval_cb_t func = vmc->vmc_inval_func;
1206 		void *data = vmc->vmc_inval_data;
1207 
1208 		/*
1209 		 * Perform the actual invalidation call outside vmc_lock to
1210 		 * avoid lock ordering issues in the consumer.  Since the client
1211 		 * is under VCS_HOLD, this is safe.
1212 		 */
1213 		mutex_exit(&vmc->vmc_lock);
1214 		func(data, addr, size);
1215 		mutex_enter(&vmc->vmc_lock);
1216 	}
1217 	mutex_exit(&vmc->vmc_lock);
1218 }
1219 
1220 static void
1221 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
1222     vm_object_t *vmo)
1223 {
1224 	mutex_enter(&vmc->vmc_lock);
1225 	VERIFY(vmc->vmc_state & VCS_HOLD);
1226 
1227 	/*
1228 	 * With the current vCPU exclusion invariants in place, we do not expect
1229 	 * a vCPU to be in guest context during an unmap.
1230 	 */
1231 	VERIFY0(vmc->vmc_state & VCS_ON_CPU);
1232 
1233 	/*
1234 	 * Any holds against the unmapped region need to establish their own
1235 	 * reference to the underlying object to avoid a potential
1236 	 * use-after-free.
1237 	 */
1238 	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1239 	    vmp != NULL;
1240 	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1241 		if (vmp->vmp_gpa < addr ||
1242 		    vmp->vmp_gpa >= (addr + size)) {
1243 			/* Hold outside region in question */
1244 			continue;
1245 		}
1246 		if (vmp->vmp_obj_ref == NULL) {
1247 			vm_object_reference(vmo);
1248 			vmp->vmp_obj_ref = vmo;
1249 			/* For an unmapped region, PTE is now meaningless */
1250 			vmp->vmp_ptep = NULL;
1251 		} else {
1252 			/*
1253 			 * Object could have gone through cycle of
1254 			 * unmap-map-unmap before the hold was released.
1255 			 */
1256 			VERIFY3P(vmp->vmp_ptep, ==, NULL);
1257 		}
1258 	}
1259 	mutex_exit(&vmc->vmc_lock);
1260 }
1261 
1262 static vm_client_t *
1263 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
1264 {
1265 	vm_client_t *next;
1266 
1267 	ASSERT(MUTEX_HELD(&vms->vms_lock));
1268 
1269 	mutex_enter(&vmc->vmc_lock);
1270 	VERIFY3P(vmc->vmc_space, ==, vms);
1271 	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
1272 	if (vmc->vmc_state & VCS_DESTROY) {
1273 		/*
1274 		 * This vm_client is currently undergoing destruction, so it
1275 		 * does not need to be orphaned.  Let it proceed with its own
1276 		 * clean-up task.
1277 		 */
1278 		next = list_next(&vms->vms_clients, vmc);
1279 	} else {
1280 		/*
1281 		 * Clients are only orphaned when the containing vmspace is
1282 		 * being torn down.  All mappings from the vmspace should
1283 		 * already be gone, meaning any remaining held pages should have
1284 		 * direct references to the object.
1285 		 */
1286 		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1287 		    vmp != NULL;
1288 		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1289 			ASSERT3P(vmp->vmp_ptep, ==, NULL);
1290 			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
1291 		}
1292 
1293 		/*
1294 		 * After this point, the client will be orphaned, unable to
1295 		 * establish new page holds (or access any vmspace-related
1296 		 * resources) and is in charge of cleaning up after itself.
1297 		 */
1298 		vmc->vmc_state |= VCS_ORPHANED;
1299 		next = list_next(&vms->vms_clients, vmc);
1300 		list_remove(&vms->vms_clients, vmc);
1301 		vmc->vmc_space = NULL;
1302 	}
1303 	mutex_exit(&vmc->vmc_lock);
1304 	return (next);
1305 }
1306 
1307 /*
1308  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1309  */
1310 vm_page_t *
1311 vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
1312 {
1313 	vmspace_t *vms = vmc->vmc_space;
1314 	vm_page_t *vmp;
1315 	pfn_t pfn = PFN_INVALID;
1316 	uint64_t *ptep = NULL;
1317 
1318 	ASSERT0(gpa & PAGEOFFSET);
1319 	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
1320 	ASSERT0(prot & ~PROT_ALL);
1321 	ASSERT0(flags & ~VPF_ALL);
1322 
1323 	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
1324 	if (vmc_activate(vmc) != 0) {
1325 		kmem_free(vmp, sizeof (*vmp));
1326 		return (NULL);
1327 	}
1328 
1329 	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
1330 		vmc_deactivate(vmc);
1331 		kmem_free(vmp, sizeof (*vmp));
1332 		return (NULL);
1333 	}
1334 	ASSERT(pfn != PFN_INVALID && ptep != NULL);
1335 
1336 	vmp->vmp_client = vmc;
1337 	vmp->vmp_chain = NULL;
1338 	vmp->vmp_gpa = gpa;
1339 	vmp->vmp_pfn = pfn;
1340 	vmp->vmp_ptep = ptep;
1341 	vmp->vmp_obj_ref = NULL;
1342 	vmp->vmp_prot = (uint8_t)prot;
1343 	vmp->vmp_flags = (uint8_t)flags;
1344 	list_insert_tail(&vmc->vmc_held_pages, vmp);
1345 	vmc_deactivate(vmc);
1346 
1347 	return (vmp);
1348 }
1349 
1350 /*
1351  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1352  */
1353 vm_page_t *
1354 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
1355 {
1356 	return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
1357 }
1358 
1359 int
1360 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
1361 {
1362 	vmspace_t *vms = vmc->vmc_space;
1363 	int err;
1364 
1365 	err = vmc_activate(vmc);
1366 	if (err == 0) {
1367 		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
1368 		vmc_deactivate(vmc);
1369 	}
1370 
1371 	return (err);
1372 }
1373 
1374 /*
1375  * Allocate an additional vm_client_t, based on an existing one.  Only the
1376  * association with the vmspace is cloned, not existing holds or any
1377  * configured invalidation function.
1378  */
1379 vm_client_t *
1380 vmc_clone(vm_client_t *vmc)
1381 {
1382 	vmspace_t *vms = vmc->vmc_space;
1383 
1384 	return (vmspace_client_alloc(vms));
1385 }
1386 
1387 /*
1388  * Register a function (and associated data pointer) to be called when an
1389  * address range in the vmspace is invalidated.
1390  */
1391 int
1392 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
1393 {
1394 	int err;
1395 
1396 	err = vmc_activate(vmc);
1397 	if (err == 0) {
1398 		vmc->vmc_inval_func = func;
1399 		vmc->vmc_inval_data = data;
1400 		vmc_deactivate(vmc);
1401 	}
1402 
1403 	return (err);
1404 }
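
/*
 * Illustrative sketch: a consumer such as a viona ring (all names here are
 * hypothetical) typically clones an existing client and registers an
 * invalidation callback so that any cached GPA translations can be discarded
 * when the vmspace changes underneath it.
 *
 *	static void
 *	ring_inval(void *arg, uintptr_t gpa, size_t len)
 *	{
 *		(discard cached translations overlapping [gpa, gpa + len))
 *	}
 *
 *	vm_client_t *ring_vmc = vmc_clone(existing_vmc);
 *	(void) vmc_set_inval_cb(ring_vmc, ring_inval, ring_arg);
 */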
1405 
1406 /*
1407  * Destroy a vm_client_t instance.
1408  *
1409  * No pages held through this vm_client_t may be outstanding when performing a
1410  * vmc_destroy().  For vCPU clients, the client cannot be on-CPU (that is, a
1411  * call to vmc_table_exit() must already have been made).
1412  */
1413 void
1414 vmc_destroy(vm_client_t *vmc)
1415 {
1416 	mutex_enter(&vmc->vmc_lock);
1417 
1418 	VERIFY(list_is_empty(&vmc->vmc_held_pages));
1419 	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1420 
1421 	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
1422 		vmspace_t *vms;
1423 
1424 		/*
1425 		 * Deassociation with the parent vmspace must be done carefully:
1426 		 * The vmspace could attempt to orphan this vm_client while we
1427 		 * release vmc_lock in order to take vms_lock (the required
1428 		 * order).  The client is marked to indicate that destruction is
1429 		 * under way.  Doing so prevents any racing orphan operation
1430 		 * from applying to this client, allowing us to deassociate from
1431 		 * the vmspace safely.
1432 		 */
1433 		vmc->vmc_state |= VCS_DESTROY;
1434 		vms = vmc->vmc_space;
1435 		mutex_exit(&vmc->vmc_lock);
1436 
1437 		mutex_enter(&vms->vms_lock);
1438 		mutex_enter(&vmc->vmc_lock);
1439 		list_remove(&vms->vms_clients, vmc);
1440 		/*
1441 		 * If the vmspace began its own destruction operation while we
1442 		 * were navigating the locks, be sure to notify it about this
1443 		 * vm_client being deassociated.
1444 		 */
1445 		cv_signal(&vms->vms_cv);
1446 		mutex_exit(&vmc->vmc_lock);
1447 		mutex_exit(&vms->vms_lock);
1448 	} else {
1449 		VERIFY3P(vmc->vmc_space, ==, NULL);
1450 		mutex_exit(&vmc->vmc_lock);
1451 	}
1452 
1453 	mutex_destroy(&vmc->vmc_lock);
1454 	cv_destroy(&vmc->vmc_cv);
1455 	list_destroy(&vmc->vmc_held_pages);
1456 
1457 	kmem_free(vmc, sizeof (*vmc));
1458 }
1459 
1460 static __inline void *
1461 vmp_ptr(const vm_page_t *vmp)
1462 {
1463 	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
1464 
1465 	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
1466 	return ((void *)((uintptr_t)kpm_vbase + paddr));
1467 }
1468 
1469 /*
1470  * Get a readable kernel-virtual pointer for a held page.
1471  *
1472  * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
1473  * call to acquire this page reference.
1474  */
1475 const void *
1476 vmp_get_readable(const vm_page_t *vmp)
1477 {
1478 	ASSERT(vmp->vmp_prot & PROT_READ);
1479 
1480 	return (vmp_ptr(vmp));
1481 }
1482 
1483 /*
1484  * Get a writable kernel-virtual pointer for a held page.
1485  *
1486  * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
1487  * call to acquire this page reference.
1488  */
1489 void *
1490 vmp_get_writable(const vm_page_t *vmp)
1491 {
1492 	ASSERT(vmp->vmp_prot & PROT_WRITE);
1493 
1494 	return (vmp_ptr(vmp));
1495 }
1496 
1497 /*
1498  * Get the host-physical PFN for a held page.
1499  */
1500 pfn_t
1501 vmp_get_pfn(const vm_page_t *vmp)
1502 {
1503 	return (vmp->vmp_pfn);
1504 }
1505 
1506 /*
1507  * If this page was deferring dirty-marking in the corresponding vmspace page
1508  * tables, clear such a state so it is considered dirty from now on.
1509  */
1510 void
1511 vmp_mark_dirty(vm_page_t *vmp)
1512 {
1513 	ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);
1514 
1515 	atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
1516 }
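
/*
 * Illustrative sketch: a consumer which might abandon a write can defer the
 * dirty-marking at hold time and apply it only once the write is certain to
 * occur.  `vmc` and `gpa` (page-aligned) are placeholders and error handling
 * is elided.
 *
 *	vm_page_t *vmp = vmc_hold_ext(vmc, gpa, PROT_WRITE, VPF_DEFER_DIRTY);
 *
 *	(... decide whether the write will actually occur ...)
 *	vmp_mark_dirty(vmp);
 *	*(uint64_t *)vmp_get_writable(vmp) = value;
 *	(void) vmp_release(vmp);
 */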
1517 
1518 /*
1519  * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
1520  */
1521 void
1522 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
1523 {
1524 	ASSERT3P(vmp->vmp_chain, ==, NULL);
1525 
1526 	vmp->vmp_chain = to_chain;
1527 }
1528 
1529 /*
1530  * Retrieve the pointer from the page-chaining in `vmp`.
1531  */
1532 vm_page_t *
1533 vmp_next(const vm_page_t *vmp)
1534 {
1535 	return (vmp->vmp_chain);
1536 }
1537 
1538 static __inline bool
1539 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
1540 {
1541 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1542 
1543 	bool was_unmapped = false;
1544 
1545 	list_remove(&vmc->vmc_held_pages, vmp);
1546 	if (vmp->vmp_obj_ref != NULL) {
1547 		ASSERT3P(vmp->vmp_ptep, ==, NULL);
1548 
1549 		vm_object_release(vmp->vmp_obj_ref);
1550 		was_unmapped = true;
1551 	} else {
1552 		ASSERT3P(vmp->vmp_ptep, !=, NULL);
1553 
1554 		/*
1555 		 * Track appropriate (accessed/dirty) bits for the guest-physical
1556 		 * address corresponding to this page, if it is from the vmspace
1557 		 * rather than a direct reference to an underlying object.
1558 		 *
1559 		 * The protection and/or configured flags may obviate the need
1560 		 * for such an update.
1561 		 */
1562 		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
1563 		    (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
1564 		    vmc->vmc_track_dirty) {
1565 			(void) vmm_gpte_reset_dirty(vmp->vmp_ptep, true);
1566 		}
1567 	}
1568 	kmem_free(vmp, sizeof (*vmp));
1569 	return (was_unmapped);
1570 }
1571 
1572 /*
1573  * Release held page.  Returns true if page resided on region which was
1574  * subsequently unmapped.
1575  */
1576 bool
1577 vmp_release(vm_page_t *vmp)
1578 {
1579 	vm_client_t *vmc = vmp->vmp_client;
1580 
1581 	VERIFY(vmc != NULL);
1582 
1583 	mutex_enter(&vmc->vmc_lock);
1584 	const bool was_unmapped = vmp_release_inner(vmp, vmc);
1585 	mutex_exit(&vmc->vmc_lock);
1586 	return (was_unmapped);
1587 }
1588 
1589 /*
1590  * Release a chain of pages which were associated via vmp_chain() (setting
1591  * page-chaining pointer).  Returns true if any pages resided upon a region
1592  * which was subsequently unmapped.
1593  *
1594  * All of those pages must have been held through the same vm_client_t.
1595  */
1596 bool
1597 vmp_release_chain(vm_page_t *vmp)
1598 {
1599 	vm_client_t *vmc = vmp->vmp_client;
1600 	bool any_unmapped = false;
1601 
1602 	ASSERT(vmp != NULL);
1603 
1604 	mutex_enter(&vmc->vmc_lock);
1605 	while (vmp != NULL) {
1606 		vm_page_t *next = vmp->vmp_chain;
1607 
1608 		/* We expect all pages in chain to be from same client */
1609 		ASSERT3P(vmp->vmp_client, ==, vmc);
1610 
1611 		if (vmp_release_inner(vmp, vmc)) {
1612 			any_unmapped = true;
1613 		}
1614 		vmp = next;
1615 	}
1616 	mutex_exit(&vmc->vmc_lock);
1617 	return (any_unmapped);
1618 }
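
/*
 * Illustrative sketch: holding a multi-page span by chaining the individual
 * page holds together, then releasing them all with one call.  All names are
 * placeholders and error handling is elided; every hold must come from the
 * same vm_client_t.
 *
 *	vm_page_t *head = NULL;
 *
 *	for (uintptr_t off = 0; off < len; off += PAGESIZE) {
 *		vm_page_t *vmp = vmc_hold(vmc, gpa + off, PROT_READ);
 *
 *		vmp_chain(vmp, head);
 *		head = vmp;
 *	}
 *	(... access the held pages ...)
 *	(void) vmp_release_chain(head);
 */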
1619 
1620 
1621 int
1622 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
1623     struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
1624 {
1625 	vm_object_t *vmo;
1626 	int err;
1627 
1628 	if (segoff < 0 || len <= 0 ||
1629 	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
1630 		return (EINVAL);
1631 	}
1632 	if ((prot & PROT_USER) == 0) {
1633 		return (ENOTSUP);
1634 	}
1635 	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
1636 	if (err != 0) {
1637 		return (err);
1638 	}
1639 
1640 	VERIFY(segoff >= 0);
1641 	VERIFY(len <= vmo->vmo_size);
1642 	VERIFY((len + segoff) <= vmo->vmo_size);
1643 
1644 	if (vmo->vmo_type != VMOT_MEM) {
1645 		/* Only support memory objects for now */
1646 		return (ENOTSUP);
1647 	}
1648 
1649 	as_rangelock(as);
1650 
1651 	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
1652 	if (err == 0) {
1653 		segvmm_crargs_t svma;
1654 
1655 		svma.prot = prot;
1656 		svma.offset = segoff;
1657 		svma.vmo = vmo;
1658 		svma.vmc = NULL;
1659 
1660 		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
1661 	}
1662 
1663 	as_rangeunlock(as);
1664 	return (err);
1665 }
1666 
1667 int
1668 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
1669     off_t len, uint_t prot, uint_t maxprot, uint_t flags)
1670 {
1671 
1672 	const uintptr_t gpa = (uintptr_t)off;
1673 	const size_t size = (uintptr_t)len;
1674 	int err;
1675 
1676 	if (off < 0 || len <= 0 ||
1677 	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
1678 		return (EINVAL);
1679 	}
1680 	if ((prot & PROT_USER) == 0) {
1681 		return (ENOTSUP);
1682 	}
1683 
1684 	as_rangelock(as);
1685 
1686 	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
1687 	if (err == 0) {
1688 		segvmm_crargs_t svma;
1689 
1690 		svma.prot = prot;
1691 		svma.offset = gpa;
1692 		svma.vmo = NULL;
1693 		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
1694 
1695 		err = as_map(as, *addrp, len, segvmm_create, &svma);
1696 	}
1697 
1698 	as_rangeunlock(as);
1699 	return (err);
1700 }
1701