xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_vm.c (revision 4b9db4f6425b1a08fca4390f446072c4a6aae8d5)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2023 Oxide Computer Company
16  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/kmem.h>
21 #include <sys/thread.h>
22 #include <sys/list.h>
23 #include <sys/mman.h>
24 #include <sys/types.h>
25 #include <sys/ddi.h>
26 #include <sys/sysmacros.h>
27 #include <sys/machsystm.h>
28 #include <sys/vmsystm.h>
29 #include <sys/x86_archext.h>
30 #include <vm/as.h>
31 #include <vm/hat_i86.h>
32 #include <vm/seg_vn.h>
33 #include <vm/seg_kmem.h>
34 
35 #include <sys/vmm_vm.h>
36 #include <sys/seg_vmm.h>
37 #include <sys/vmm_kernel.h>
38 #include <sys/vmm_reservoir.h>
39 #include <sys/vmm_gpt.h>
40 
41 
42 /*
43  * VMM Virtual Memory
44  *
45  * History
46  *
47  * When bhyve was ported to illumos, one significant hole was handling guest
48  * memory and memory accesses.  In the original Pluribus port, bhyve itself
49  * manually handled the EPT structures for guest memory.  The updated sources
50  * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
51  * system for memory allocations and management of the EPT structures.  Keeping
52  * source differences to a minimum was a priority, so illumos-bhyve implemented
53  * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
54  * boot and run guests.
55  *
56  * While the VM shim was successful in getting illumos-bhyve to a functional
57  * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
58  * compatibility interfaces made it awkward to use.  As source differences with
59  * the upstream kernel code became less of a concern, and upcoming features
60  * (such as live migration) would demand more of those VM interfaces, it became
61  * clear that an overhaul was prudent.
62  *
63  * Design
64  *
65  * The new VM system for bhyve retains a number of the same concepts as what it
66  * replaces:
67  *
68  * - `vmspace_t` is the top-level entity for a guest memory space
69  * - `vm_object_t` represents a memory object which can be mapped into a vmspace
70  * - `vm_page_t` represents a page hold within a given vmspace, providing access
71  *   to the underlying memory page
72  *
73  * Unlike the old code, where most of the involved structures were exposed via
74  * public definitions, this replacement VM interface keeps all involved
75  * structures opaque to consumers.  Furthermore, there is a clear delineation
76  * between infrequent administrative operations (such as mapping/unmapping
77  * regions) and common data-path operations (attempting a page hold at a given
78  * guest-physical address).  Those administrative operations are performed
79  * directly against the vmspace, whereas the data-path operations are performed
80  * through a `vm_client_t` handle.  That VM client abstraction is meant to
81  * reduce contention and overhead for frequent access operations and provide
82  * debugging insight into how different subcomponents are accessing the vmspace.
83  * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
84  * interface) and each VMM userspace segment mapping.
85  *
86  * Exclusion
87  *
88  * Making changes to the vmspace (such as mapping or unmapping regions) requires
89  * other accessors be excluded while the change is underway to prevent them from
90  * observing invalid intermediate states.  A simple approach could use a mutex
91  * or rwlock to achieve this, but that risks contention when the rate of access
92  * to the vmspace is high.
93  *
94  * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
95  * at a per-vm_client_t basis.  While this raises the cost for vmspace changes,
96  * it means that the much more common page accesses through the vm_client can
97  * normally proceed unimpeded and independently.
98  *
99  * When a change to the vmspace is required, the caller will put the vmspace in
100  * a 'hold' state, iterating over all associated vm_client instances, waiting
101  * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
102  * setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any call on
103  * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
104  * will block until the hold condition is cleared.  Once the hold is asserted
105  * for all clients, the vmspace change can proceed with confidence.  Upon
106  * completion of that operation, VCS_HOLD is cleared from the clients, and they
107  * are released to resume vmspace accesses.
108  *
109  * vCPU Consumers
110  *
111  * Access to the vmspace for vCPUs running in guest context is different from
112  * emulation-related vm_client activity: they solely rely on the contents of the
113  * page tables.  Furthermore, the existing VCS_HOLD mechanism used to exclude
114  * client access is not feasible when entering guest context, since interrupts
115  * are disabled, making it impossible to block entry.  This is not a concern as
116  * long as vmspace modifications never place the page tables in invalid states
117  * (either intermediate, or final).  The vm_client hold mechanism does provide
118  * the means to IPI vCPU consumers which will trigger a notification once they
119  * report their exit from guest context.  This can be used to ensure that page
120  * table modifications are made visible to those vCPUs within a certain
121  * time frame.
122  */
123 
124 typedef struct vmspace_mapping {
125 	list_node_t	vmsm_node;
126 	vm_object_t	*vmsm_object;	/* object backing this mapping */
127 	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
128 	size_t		vmsm_len;	/* length (in bytes) of mapping */
129 	off_t		vmsm_offset;	/* byte offset into object */
130 	uint_t		vmsm_prot;
131 } vmspace_mapping_t;
132 
/*
 * Translate an address within a mapping (`addr`) into the corresponding byte
 * offset inside the vm_object which backs that mapping.
 */
#define	VMSM_OFFSET(vmsm, addr)						\
	((vmsm)->vmsm_offset + ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
136 
/*
 * State flags for a vm_client_t.  Aside from VCS_IDLE (no flags set), each
 * value is a distinct bit so that several may be asserted at once in
 * vm_client_t`vmc_state.
 */
typedef enum vm_client_state {
	/* no flags asserted */
	VCS_IDLE	= 0x00,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= 0x01,
	/* client hold requested/asserted */
	VCS_HOLD	= 0x02,
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= 0x04,
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= 0x08,
	/* client undergoing destroy operation */
	VCS_DESTROY	= 0x10,
} vm_client_state_t;
150 
151 struct vmspace {
152 	kmutex_t	vms_lock;
153 	kcondvar_t	vms_cv;
154 	bool		vms_held;
155 	uintptr_t	vms_size;	/* immutable after creation */
156 
157 	/* (nested) page table state */
158 	vmm_gpt_t	*vms_gpt;
159 	uint64_t	vms_pt_gen;
160 	uint64_t	vms_pages_mapped;
161 	bool		vms_track_dirty;
162 
163 	list_t		vms_maplist;
164 	list_t		vms_clients;
165 };
166 
167 struct vm_client {
168 	vmspace_t	*vmc_space;
169 	list_node_t	vmc_node;
170 
171 	kmutex_t	vmc_lock;
172 	kcondvar_t	vmc_cv;
173 	vm_client_state_t vmc_state;
174 	int		vmc_cpu_active;
175 	uint64_t	vmc_cpu_gen;
176 	bool		vmc_track_dirty;
177 	vmc_inval_cb_t	vmc_inval_func;
178 	void		*vmc_inval_data;
179 
180 	list_t		vmc_held_pages;
181 };
182 
/*
 * Kinds of backing for a vm_object_t.
 */
typedef enum vm_object_type {
	/* no valid type; vm_object_pfn() panics on such an object */
	VMOT_NONE,
	/* backed by a VMM reservoir region */
	VMOT_MEM,
	/* backed by a fixed range of host-physical addresses */
	VMOT_MMIO,
} vm_object_type_t;
188 
189 struct vm_object {
190 	uint_t		vmo_refcnt;	/* manipulated with atomic ops */
191 
192 	/* Fields below are fixed at creation time */
193 	vm_object_type_t vmo_type;
194 	size_t		vmo_size;
195 	void		*vmo_data;
196 	uint8_t		vmo_attr;
197 };
198 
/*
 * Union of every valid vm_page flag, used when checking flag arguments for
 * validity.
 */
#define	VPF_ALL		(VPF_DEFER_DIRTY)
201 
202 struct vm_page {
203 	vm_client_t	*vmp_client;
204 	list_node_t	vmp_node;
205 	vm_page_t	*vmp_chain;
206 	uintptr_t	vmp_gpa;
207 	pfn_t		vmp_pfn;
208 	uint64_t	*vmp_ptep;
209 	vm_object_t	*vmp_obj_ref;
210 	uint8_t		vmp_prot;
211 	uint8_t		vmp_flags;
212 };
213 
214 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
215 static void vmspace_hold_enter(vmspace_t *);
216 static void vmspace_hold_exit(vmspace_t *, bool);
217 static void vmspace_clients_invalidate(vmspace_t *, uintptr_t, size_t);
218 static int vmspace_ensure_mapped(vmspace_t *, uintptr_t, int, pfn_t *,
219     uint64_t *);
220 static void vmc_space_hold(vm_client_t *);
221 static void vmc_space_release(vm_client_t *, bool);
222 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
223 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
224 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
225 
226 
227 /*
228  * Create a new vmspace with a maximum address of `end`.
229  */
230 vmspace_t *
231 vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
232 {
233 	vmspace_t *vms;
234 	const uintptr_t size = end + 1;
235 
236 	/*
237 	 * This whole mess is built on the assumption that a 64-bit address
238 	 * space is available to work with for the various pagetable tricks.
239 	 */
240 	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
241 	    size <= (uintptr_t)USERLIMIT);
242 
243 	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
244 	vms->vms_size = size;
245 	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
246 	    offsetof(vmspace_mapping_t, vmsm_node));
247 	list_create(&vms->vms_clients, sizeof (vm_client_t),
248 	    offsetof(vm_client_t, vmc_node));
249 
250 	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
251 	vms->vms_pt_gen = 1;
252 	vms->vms_track_dirty = track_dirty;
253 
254 	return (vms);
255 }
256 
257 /*
258  * Destroy a vmspace.  All regions in the space must be unmapped.  Any remaining
259  * clients will be orphaned.
260  */
261 void
262 vmspace_destroy(vmspace_t *vms)
263 {
264 	mutex_enter(&vms->vms_lock);
265 	VERIFY(list_is_empty(&vms->vms_maplist));
266 
267 	if (!list_is_empty(&vms->vms_clients)) {
268 		vm_client_t *vmc = list_head(&vms->vms_clients);
269 		while (vmc != NULL) {
270 			vmc = vmc_space_orphan(vmc, vms);
271 		}
272 		/*
273 		 * Wait for any clients which were in the process of destroying
274 		 * themselves to disappear.
275 		 */
276 		while (!list_is_empty(&vms->vms_clients)) {
277 			cv_wait(&vms->vms_cv, &vms->vms_lock);
278 		}
279 	}
280 	VERIFY(list_is_empty(&vms->vms_clients));
281 
282 	vmm_gpt_free(vms->vms_gpt);
283 	mutex_exit(&vms->vms_lock);
284 
285 	mutex_destroy(&vms->vms_lock);
286 	cv_destroy(&vms->vms_cv);
287 	list_destroy(&vms->vms_maplist);
288 	list_destroy(&vms->vms_clients);
289 
290 	kmem_free(vms, sizeof (*vms));
291 }
292 
293 /*
294  * Retrieve the count of resident (mapped into the page tables) pages.
295  */
296 uint64_t
297 vmspace_resident_count(vmspace_t *vms)
298 {
299 	return (vms->vms_pages_mapped);
300 }
301 
302 /*
303  * Perform an operation on the status (accessed/dirty) bits held in the page
304  * tables of this vmspace.
305  *
306  * Such manipulations race against both hardware writes (from running vCPUs) and
307  * emulated accesses reflected from userspace.  Safe functionality depends on
308  * the VM instance being read-locked to prevent vmspace_map/vmspace_unmap
309  * operations from changing the page tables during the walk.
310  */
311 void
312 vmspace_bits_operate(vmspace_t *vms, uint64_t gpa, size_t len,
313     vmspace_bit_oper_t oper, uint8_t *bitmap)
314 {
315 	const bool bit_input = (oper & VBO_FLAG_BITMAP_IN) != 0;
316 	const bool bit_output = (oper & VBO_FLAG_BITMAP_OUT) != 0;
317 	const vmspace_bit_oper_t oper_only =
318 	    oper & ~(VBO_FLAG_BITMAP_IN | VBO_FLAG_BITMAP_OUT);
319 	vmm_gpt_t *gpt = vms->vms_gpt;
320 
321 	/*
322 	 * The bitmap cannot be NULL if the requested operation involves reading
323 	 * or writing from it.
324 	 */
325 	ASSERT(bitmap != NULL || (!bit_input && !bit_output));
326 
327 	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
328 		const uint64_t pfn_offset = offset >> PAGESHIFT;
329 		const size_t bit_offset = pfn_offset / 8;
330 		const uint8_t bit_mask = 1 << (pfn_offset % 8);
331 
332 		if (bit_input && (bitmap[bit_offset] & bit_mask) == 0) {
333 			continue;
334 		}
335 
336 		bool value = false;
337 		uint64_t *entry = vmm_gpt_lookup(gpt, gpa + offset);
338 		if (entry == NULL) {
339 			if (bit_output) {
340 				bitmap[bit_offset] &= ~bit_mask;
341 			}
342 			continue;
343 		}
344 
345 		switch (oper_only) {
346 		case VBO_GET_DIRTY:
347 			value = vmm_gpt_query(gpt, entry, VGQ_DIRTY);
348 			break;
349 		case VBO_SET_DIRTY: {
350 			uint_t prot = 0;
351 			bool present_writable = false;
352 			pfn_t pfn;
353 
354 			/*
355 			 * To avoid blindly setting the dirty bit on otherwise
356 			 * empty PTEs, we must first check if the entry for the
357 			 * address in question has been populated.
358 			 *
359 			 * Only if the page is marked both Present and Writable
360 			 * will we permit the dirty bit to be set.
361 			 */
362 			if (!vmm_gpt_is_mapped(gpt, entry, &pfn, &prot)) {
363 				int err = vmspace_ensure_mapped(vms, gpa,
364 				    PROT_WRITE, &pfn, entry);
365 				if (err == 0) {
366 					present_writable = true;
367 				}
368 			} else if ((prot & PROT_WRITE) != 0) {
369 				present_writable = true;
370 			}
371 
372 			if (present_writable) {
373 				value = !vmm_gpt_reset_dirty(gpt, entry, true);
374 			}
375 			break;
376 		}
377 		case VBO_RESET_DIRTY:
378 			/*
379 			 * Although at first glance, it may seem like the act of
380 			 * resetting the dirty bit may require the same care as
381 			 * setting it, the constraints make for a simpler task.
382 			 *
383 			 * Any PTEs with the dirty bit set will have already
384 			 * been properly populated.
385 			 */
386 			value = vmm_gpt_reset_dirty(gpt, entry, false);
387 			break;
388 		default:
389 			panic("unrecognized operator: %d", oper_only);
390 			break;
391 		}
392 		if (bit_output) {
393 			if (value) {
394 				bitmap[bit_offset] |= bit_mask;
395 			} else {
396 				bitmap[bit_offset] &= ~bit_mask;
397 			}
398 		}
399 	}
400 
401 	/*
402 	 * Invalidate the address range potentially effected by the changes to
403 	 * page table bits, issuing shoot-downs for those who might have it in
404 	 * cache.
405 	 */
406 	vmspace_hold_enter(vms);
407 	vms->vms_pt_gen++;
408 	vmspace_clients_invalidate(vms, gpa, len);
409 	vmspace_hold_exit(vms, true);
410 }
411 
412 /*
413  * Is dirty-page-tracking enabled for the vmspace?
414  */
415 bool
416 vmspace_get_tracking(vmspace_t *vms)
417 {
418 	mutex_enter(&vms->vms_lock);
419 	const bool val = vms->vms_track_dirty;
420 	mutex_exit(&vms->vms_lock);
421 	return (val);
422 }
423 
424 /*
425  * Set the state (enabled/disabled) of dirty-page-tracking for the vmspace.
426  */
427 int
428 vmspace_set_tracking(vmspace_t *vms, bool enable_dirty_tracking)
429 {
430 	if (enable_dirty_tracking && !vmm_gpt_can_track_dirty(vms->vms_gpt)) {
431 		/* Do not allow this to be set if it is not supported */
432 		return (ENOTSUP);
433 	}
434 
435 	vmspace_hold_enter(vms);
436 	if (vms->vms_track_dirty == enable_dirty_tracking) {
437 		/* No further effort required if state already matches */
438 		vmspace_hold_exit(vms, false);
439 		return (0);
440 	}
441 
442 	vms->vms_track_dirty = enable_dirty_tracking;
443 
444 	/* Configure all existing clients for new tracking behavior */
445 	for (vm_client_t *vmc = list_head(&vms->vms_clients);
446 	    vmc != NULL;
447 	    vmc = list_next(&vms->vms_clients, vmc)) {
448 		mutex_enter(&vmc->vmc_lock);
449 		vmc->vmc_track_dirty = enable_dirty_tracking;
450 		mutex_exit(&vmc->vmc_lock);
451 	}
452 
453 	/*
454 	 * Notify all clients of what is considered an invalidation of the
455 	 * entire vmspace.
456 	 */
457 	vms->vms_pt_gen++;
458 	vmspace_clients_invalidate(vms, 0, vms->vms_size);
459 
460 	vmspace_hold_exit(vms, true);
461 	return (0);
462 }
463 
464 static pfn_t
465 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
466 {
467 	vmmr_region_t *region;
468 	pfn_t pfn;
469 
470 	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
471 
472 	region = vmo->vmo_data;
473 	pfn = vmmr_region_pfn_at(region, off);
474 
475 	return (pfn);
476 }
477 
478 static pfn_t
479 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
480 {
481 	pfn_t pfn;
482 
483 	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
484 	ASSERT3P(vmo->vmo_data, !=, NULL);
485 	ASSERT3U(off, <, vmo->vmo_size);
486 
487 	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
488 
489 	return (pfn);
490 }
491 
492 /*
493  * Allocate a VM object backed by VMM reservoir memory.
494  */
495 vm_object_t *
496 vm_object_mem_allocate(size_t size, bool transient)
497 {
498 	int err;
499 	vmmr_region_t *region = NULL;
500 	vm_object_t *vmo;
501 
502 	ASSERT3U(size, !=, 0);
503 	ASSERT3U(size & PAGEOFFSET, ==, 0);
504 
505 	err = vmmr_alloc(size, transient, &region);
506 	if (err != 0) {
507 		return (NULL);
508 	}
509 
510 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
511 
512 	/* For now, these are to stay fixed after allocation */
513 	vmo->vmo_type = VMOT_MEM;
514 	vmo->vmo_size = size;
515 	vmo->vmo_attr = MTRR_TYPE_WB;
516 	vmo->vmo_data = region;
517 	vmo->vmo_refcnt = 1;
518 
519 	return (vmo);
520 }
521 
522 static vm_object_t *
523 vm_object_mmio_allocate(size_t size, uintptr_t hpa)
524 {
525 	vm_object_t *vmo;
526 
527 	ASSERT3U(size, !=, 0);
528 	ASSERT3U(size & PAGEOFFSET, ==, 0);
529 	ASSERT3U(hpa & PAGEOFFSET, ==, 0);
530 
531 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
532 
533 	/* For now, these are to stay fixed after allocation */
534 	vmo->vmo_type = VMOT_MMIO;
535 	vmo->vmo_size = size;
536 	vmo->vmo_attr = MTRR_TYPE_UC;
537 	vmo->vmo_data = (void *)hpa;
538 	vmo->vmo_refcnt = 1;
539 
540 	return (vmo);
541 }
542 
543 /*
544  * Allocate a VM object backed by an existing range of physical memory.
545  */
546 vm_object_t *
547 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
548 {
549 	int error;
550 	vm_object_t *obj;
551 
552 	obj = vm_object_mmio_allocate(len, hpa);
553 	if (obj != NULL) {
554 		error = vmspace_map(vmspace, obj, 0, gpa, len,
555 		    PROT_READ | PROT_WRITE);
556 		if (error != 0) {
557 			vm_object_release(obj);
558 			obj = NULL;
559 		}
560 	}
561 
562 	return (obj);
563 }
564 
565 /*
566  * Release a vm_object reference
567  */
568 void
569 vm_object_release(vm_object_t *vmo)
570 {
571 	ASSERT(vmo != NULL);
572 
573 	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
574 	/* underflow would be a deadly serious mistake */
575 	VERIFY3U(ref, !=, UINT_MAX);
576 	if (ref != 0) {
577 		return;
578 	}
579 
580 	switch (vmo->vmo_type) {
581 	case VMOT_MEM:
582 		vmmr_free((vmmr_region_t *)vmo->vmo_data);
583 		break;
584 	case VMOT_MMIO:
585 		break;
586 	default:
587 		panic("unexpected object type %u", vmo->vmo_type);
588 		break;
589 	}
590 
591 	vmo->vmo_data = NULL;
592 	vmo->vmo_size = 0;
593 	kmem_free(vmo, sizeof (*vmo));
594 }
595 
596 /*
597  * Increase refcount for vm_object reference
598  */
599 void
600 vm_object_reference(vm_object_t *vmo)
601 {
602 	ASSERT(vmo != NULL);
603 
604 	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
605 	/* overflow would be a deadly serious mistake */
606 	VERIFY3U(ref, !=, 0);
607 }
608 
609 /*
610  * Get the host-physical PFN for a given offset into a vm_object.
611  *
612  * The provided `off` must be within the allocated size of the vm_object.
613  */
614 pfn_t
615 vm_object_pfn(vm_object_t *vmo, uintptr_t off)
616 {
617 	const uintptr_t aligned_off = off & PAGEMASK;
618 
619 	switch (vmo->vmo_type) {
620 	case VMOT_MEM:
621 		return (vm_object_pager_reservoir(vmo, aligned_off));
622 	case VMOT_MMIO:
623 		return (vm_object_pager_mmio(vmo, aligned_off));
624 	case VMOT_NONE:
625 		break;
626 	}
627 	panic("unexpected object type %u", vmo->vmo_type);
628 }
629 
630 static vmspace_mapping_t *
631 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
632 {
633 	vmspace_mapping_t *vmsm;
634 	list_t *ml = &vms->vms_maplist;
635 	const uintptr_t range_end = addr + size;
636 
637 	ASSERT3U(addr, <=, range_end);
638 
639 	if (addr >= vms->vms_size) {
640 		return (NULL);
641 	}
642 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
643 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
644 
645 		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
646 			if (range_end <= seg_end) {
647 				return (vmsm);
648 			} else {
649 				return (NULL);
650 			}
651 		}
652 	}
653 	return (NULL);
654 }
655 
656 /*
657  * Check to see if any mappings reside within [addr, addr + size) span in the
658  * vmspace, returning true if that span is indeed empty.
659  */
660 static bool
661 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
662 {
663 	vmspace_mapping_t *vmsm;
664 	list_t *ml = &vms->vms_maplist;
665 	const uintptr_t range_end = addr + size - 1;
666 
667 	ASSERT(MUTEX_HELD(&vms->vms_lock));
668 	ASSERT(size > 0);
669 
670 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
671 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
672 
673 		/*
674 		 * The two ranges do not overlap if the start of either of
675 		 * them is after the end of the other.
676 		 */
677 		if (vmsm->vmsm_addr > range_end || addr > seg_end)
678 			continue;
679 		return (false);
680 	}
681 	return (true);
682 }
683 
684 static void
685 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
686 {
687 	list_t *ml = &vms->vms_maplist;
688 
689 	ASSERT(MUTEX_HELD(&vms->vms_lock));
690 	ASSERT(vms->vms_held);
691 
692 	list_remove(ml, vmsm);
693 	vm_object_release(vmsm->vmsm_object);
694 	kmem_free(vmsm, sizeof (*vmsm));
695 }
696 
697 /*
698  * Enter a hold state on the vmspace.  This ensures that all VM clients
699  * associated with the vmspace are excluded from establishing new page holds,
700  * or any other actions which would require accessing vmspace state subject to
701  * potential change.
702  *
703  * Returns with vmspace_t`vms_lock held.
704  */
705 static void
706 vmspace_hold_enter(vmspace_t *vms)
707 {
708 	mutex_enter(&vms->vms_lock);
709 	VERIFY(!vms->vms_held);
710 
711 	vm_client_t *vmc = list_head(&vms->vms_clients);
712 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
713 		vmc_space_hold(vmc);
714 	}
715 	vms->vms_held = true;
716 }
717 
718 /*
719  * Exit a hold state on the vmspace.  This releases all VM clients associated
720  * with the vmspace to be able to establish new page holds, and partake in other
721  * actions which require accessing changed vmspace state.  If `kick_on_cpu` is
722  * true, then any CPUs actively using the page tables will be IPIed, and the
723  * call will block until they have acknowledged being ready to use the latest
724  * state of the tables.
725  *
726  * Requires vmspace_t`vms_lock be held, which is released as part of the call.
727  */
728 static void
729 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
730 {
731 	ASSERT(MUTEX_HELD(&vms->vms_lock));
732 	VERIFY(vms->vms_held);
733 
734 	vm_client_t *vmc = list_head(&vms->vms_clients);
735 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
736 		vmc_space_release(vmc, kick_on_cpu);
737 	}
738 	vms->vms_held = false;
739 	mutex_exit(&vms->vms_lock);
740 }
741 
742 static void
743 vmspace_clients_invalidate(vmspace_t *vms, uintptr_t gpa, size_t len)
744 {
745 	ASSERT(MUTEX_HELD(&vms->vms_lock));
746 	VERIFY(vms->vms_held);
747 
748 	for (vm_client_t *vmc = list_head(&vms->vms_clients);
749 	    vmc != NULL;
750 	    vmc = list_next(&vms->vms_clients, vmc)) {
751 		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
752 	}
753 }
754 
755 /*
756  * Attempt to map a vm_object span into the vmspace.
757  *
758  * Requirements:
759  * - `obj_off`, `addr`, and `len` must be page-aligned
760  * - `obj_off` cannot be greater than the allocated size of the object
761  * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
762  *   size of the object
763  * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
764  *   of the vmspace
765  */
766 int
767 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
768     size_t len, uint8_t prot)
769 {
770 	vmspace_mapping_t *vmsm;
771 	int res = 0;
772 
773 	if (len == 0 || (addr + len) < addr ||
774 	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
775 		return (EINVAL);
776 	}
777 	if ((addr + len) >= vms->vms_size) {
778 		return (ENOMEM);
779 	}
780 
781 	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
782 
783 	vmspace_hold_enter(vms);
784 	if (!vm_mapping_gap(vms, addr, len)) {
785 		kmem_free(vmsm, sizeof (*vmsm));
786 		res = ENOMEM;
787 	} else {
788 		vmsm->vmsm_object = vmo;
789 		vmsm->vmsm_addr = addr;
790 		vmsm->vmsm_len = len;
791 		vmsm->vmsm_offset = (off_t)obj_off;
792 		vmsm->vmsm_prot = prot;
793 		list_insert_tail(&vms->vms_maplist, vmsm);
794 
795 		/*
796 		 * Make sure the GPT has tables ready for leaf entries across
797 		 * the entire new mapping.
798 		 */
799 		vmm_gpt_populate_region(vms->vms_gpt, addr, len);
800 	}
801 	vmspace_hold_exit(vms, false);
802 	return (res);
803 }
804 
805 /*
806  * Unmap a region of the vmspace.
807  *
808  * Presently the [start, end) span must equal a region previously mapped by a
809  * call to vmspace_map().
810  */
811 int
812 vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
813 {
814 	const uintptr_t end = addr + len;
815 	vmspace_mapping_t *vmsm;
816 	vm_client_t *vmc;
817 	uint64_t gen = 0;
818 
819 	ASSERT3U(addr, <, end);
820 
821 	vmspace_hold_enter(vms);
822 	/* expect to match existing mapping exactly */
823 	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
824 	    vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
825 		vmspace_hold_exit(vms, false);
826 		return (ENOENT);
827 	}
828 
829 	/* Prepare clients (and their held pages) for the unmap. */
830 	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
831 	    vmc = list_next(&vms->vms_clients, vmc)) {
832 		vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
833 	}
834 
835 	/* Clear all PTEs for region */
836 	if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
837 		vms->vms_pt_gen++;
838 		gen = vms->vms_pt_gen;
839 	}
840 	/* ... and the intermediate (directory) PTEs as well */
841 	vmm_gpt_vacate_region(vms->vms_gpt, addr, len);
842 
843 	/*
844 	 * If pages were actually unmapped from the GPT, provide clients with
845 	 * an invalidation notice.
846 	 */
847 	if (gen != 0) {
848 		vmspace_clients_invalidate(vms, addr, len);
849 	}
850 
851 	vm_mapping_remove(vms, vmsm);
852 	vmspace_hold_exit(vms, true);
853 	return (0);
854 }
855 
856 /*
857  * For a given GPA in the vmspace, ensure that the backing page (if any) is
858  * properly mapped as present in the provided PTE.
859  */
860 static int
861 vmspace_ensure_mapped(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
862     uint64_t *leaf_pte)
863 {
864 	vmspace_mapping_t *vmsm;
865 	vm_object_t *vmo;
866 	pfn_t pfn;
867 
868 	ASSERT(pfnp != NULL);
869 	ASSERT(leaf_pte != NULL);
870 
871 	vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
872 	if (vmsm == NULL) {
873 		return (FC_NOMAP);
874 	}
875 	if ((req_prot & vmsm->vmsm_prot) != req_prot) {
876 		return (FC_PROT);
877 	}
878 
879 	vmo = vmsm->vmsm_object;
880 	pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
881 	VERIFY(pfn != PFN_INVALID);
882 
883 	if (vmm_gpt_map_at(vms->vms_gpt, leaf_pte, pfn, vmsm->vmsm_prot,
884 	    vmo->vmo_attr)) {
885 		atomic_inc_64(&vms->vms_pages_mapped);
886 	}
887 
888 	*pfnp = pfn;
889 	return (0);
890 }
891 
892 /*
893  * Look up the PTE for a given GPA in the vmspace, populating it with
894  * appropriate contents (pfn, protection, etc) if it is empty, but backed by a
895  * valid mapping.
896  */
897 static int
898 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
899     uint64_t **ptepp)
900 {
901 	vmm_gpt_t *gpt = vms->vms_gpt;
902 	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
903 	pfn_t pfn = PFN_INVALID;
904 	uint_t prot;
905 
906 	ASSERT0(gpa & PAGEOFFSET);
907 	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
908 
909 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
910 	leaf = entries[LEVEL1];
911 	if (leaf == NULL) {
912 		/*
913 		 * Since we populated the intermediate tables for any regions
914 		 * mapped in the GPT, an empty leaf entry indicates there is no
915 		 * mapping, populated or not, at this GPT.
916 		 */
917 		return (FC_NOMAP);
918 	}
919 
920 	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
921 		if ((req_prot & prot) != req_prot) {
922 			return (FC_PROT);
923 		}
924 	} else {
925 		int err = vmspace_ensure_mapped(vms, gpa, req_prot, &pfn, leaf);
926 		if (err != 0) {
927 			return (err);
928 		}
929 	}
930 
931 	ASSERT(pfn != PFN_INVALID && leaf != NULL);
932 	if (pfnp != NULL) {
933 		*pfnp = pfn;
934 	}
935 	if (ptepp != NULL) {
936 		*ptepp = leaf;
937 	}
938 	return (0);
939 }
940 
941 /*
942  * Populate (make resident in the page tables) a region of the vmspace.
943  *
944  * Presently the [start, end) span must equal a region previously mapped by a
945  * call to vmspace_map().
946  */
947 int
948 vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
949 {
950 	vmspace_mapping_t *vmsm;
951 	mutex_enter(&vms->vms_lock);
952 
953 	/* For the time being, only exact-match mappings are expected */
954 	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
955 		mutex_exit(&vms->vms_lock);
956 		return (FC_NOMAP);
957 	}
958 
959 	vm_object_t *vmo = vmsm->vmsm_object;
960 	const int prot = vmsm->vmsm_prot;
961 	const uint8_t attr = vmo->vmo_attr;
962 	size_t populated = 0;
963 	const size_t end = addr + len;
964 	for (uintptr_t gpa = addr & PAGEMASK; gpa < end; gpa += PAGESIZE) {
965 		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
966 		VERIFY(pfn != PFN_INVALID);
967 
968 		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
969 			populated++;
970 		}
971 	}
972 	atomic_add_64(&vms->vms_pages_mapped, populated);
973 
974 	mutex_exit(&vms->vms_lock);
975 	return (0);
976 }
977 
978 /*
979  * Allocate a client from a given vmspace.
980  */
981 vm_client_t *
982 vmspace_client_alloc(vmspace_t *vms)
983 {
984 	vm_client_t *vmc;
985 
986 	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
987 	vmc->vmc_space = vms;
988 	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
989 	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
990 	vmc->vmc_state = VCS_IDLE;
991 	vmc->vmc_cpu_active = -1;
992 	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
993 	    offsetof(vm_page_t, vmp_node));
994 	vmc->vmc_track_dirty = vms->vms_track_dirty;
995 
996 	mutex_enter(&vms->vms_lock);
997 	list_insert_tail(&vms->vms_clients, vmc);
998 	mutex_exit(&vms->vms_lock);
999 
1000 	return (vmc);
1001 }
1002 
1003 /*
1004  * Get the nested page table root pointer (EPTP/NCR3) value.
1005  */
1006 uint64_t
1007 vmspace_table_root(vmspace_t *vms)
1008 {
1009 	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
1010 }
1011 
1012 /*
1013  * Get the current generation number of the nested page table.
1014  */
1015 uint64_t
1016 vmspace_table_gen(vmspace_t *vms)
1017 {
1018 	return (vms->vms_pt_gen);
1019 }
1020 
1021 /*
1022  * Mark a vm_client as active.  This will block if/while the client is held by
1023  * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It will
1024  * fail if the vm_client has been orphaned.
1025  */
1026 static int
1027 vmc_activate(vm_client_t *vmc)
1028 {
1029 	mutex_enter(&vmc->vmc_lock);
1030 	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
1031 	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
1032 		mutex_exit(&vmc->vmc_lock);
1033 		return (ENXIO);
1034 	}
1035 	while ((vmc->vmc_state & VCS_HOLD) != 0) {
1036 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1037 	}
1038 	vmc->vmc_state |= VCS_ACTIVE;
1039 	return (0);
1040 }
1041 
1042 /*
1043  * Mark a vm_client as no longer active.  It must be called with
1044  * vm_client_t`vmc_lock already held, and will return with it released.
1045  */
1046 static void
1047 vmc_deactivate(vm_client_t *vmc)
1048 {
1049 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1050 	VERIFY(vmc->vmc_state & VCS_ACTIVE);
1051 
1052 	vmc->vmc_state ^= VCS_ACTIVE;
1053 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
1054 		cv_broadcast(&vmc->vmc_cv);
1055 	}
1056 	mutex_exit(&vmc->vmc_lock);
1057 }
1058 
1059 /*
1060  * Indicate that a CPU will be utilizing the nested page tables through this VM
1061  * client.  Interrupts (and/or the GIF) are expected to be disabled when calling
1062  * this function.  Returns the generation number of the nested page table (to be
1063  * used for TLB invalidations).
1064  */
1065 uint64_t
1066 vmc_table_enter(vm_client_t *vmc)
1067 {
1068 	vmspace_t *vms = vmc->vmc_space;
1069 	uint64_t gen;
1070 
1071 	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1072 	ASSERT3S(vmc->vmc_cpu_active, ==, -1);
1073 
1074 	/*
1075 	 * Since the NPT activation occurs with interrupts disabled, this must
1076 	 * be done without taking vmc_lock like normal.
1077 	 */
1078 	gen = vms->vms_pt_gen;
1079 	vmc->vmc_cpu_active = CPU->cpu_id;
1080 	vmc->vmc_cpu_gen = gen;
1081 	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
1082 
1083 	return (gen);
1084 }
1085 
1086 /*
1087  * Indicate that this VM client is not longer (directly) using the underlying
1088  * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
1089  * this function.
1090  */
1091 void
1092 vmc_table_exit(vm_client_t *vmc)
1093 {
1094 	mutex_enter(&vmc->vmc_lock);
1095 
1096 	ASSERT(vmc->vmc_state & VCS_ON_CPU);
1097 	vmc->vmc_state ^= VCS_ON_CPU;
1098 	vmc->vmc_cpu_active = -1;
1099 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
1100 		cv_broadcast(&vmc->vmc_cv);
1101 	}
1102 
1103 	mutex_exit(&vmc->vmc_lock);
1104 }
1105 
1106 static void
1107 vmc_space_hold(vm_client_t *vmc)
1108 {
1109 	mutex_enter(&vmc->vmc_lock);
1110 	VERIFY0(vmc->vmc_state & VCS_HOLD);
1111 
1112 	/*
1113 	 * Because vmc_table_enter() alters vmc_state from a context where
1114 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
1115 	 * VMC_HOLD must be done atomically here.
1116 	 */
1117 	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
1118 
1119 	/* Wait for client to go inactive */
1120 	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
1121 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1122 	}
1123 	mutex_exit(&vmc->vmc_lock);
1124 }
1125 
1126 static void
1127 vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
1128 {
1129 	mutex_enter(&vmc->vmc_lock);
1130 	VERIFY(vmc->vmc_state & VCS_HOLD);
1131 
1132 	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
1133 		poke_cpu(vmc->vmc_cpu_active);
1134 
1135 		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1136 			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1137 		}
1138 	}
1139 
1140 	/*
1141 	 * Because vmc_table_enter() alters vmc_state from a context where
1142 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
1143 	 * VMC_HOLD must be done atomically here.
1144 	 */
1145 	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
1146 	cv_broadcast(&vmc->vmc_cv);
1147 	mutex_exit(&vmc->vmc_lock);
1148 }
1149 
1150 static void
1151 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
1152     uint64_t gen)
1153 {
1154 	mutex_enter(&vmc->vmc_lock);
1155 	VERIFY(vmc->vmc_state & VCS_HOLD);
1156 	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1157 		/*
1158 		 * Wait for clients using an old generation of the page tables
1159 		 * to exit guest context, where they subsequently flush the TLB
1160 		 * for the new generation.
1161 		 */
1162 		if (vmc->vmc_cpu_gen < gen) {
1163 			poke_cpu(vmc->vmc_cpu_active);
1164 
1165 			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1166 				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1167 			}
1168 		}
1169 	}
1170 	if (vmc->vmc_inval_func != NULL) {
1171 		vmc_inval_cb_t func = vmc->vmc_inval_func;
1172 		void *data = vmc->vmc_inval_data;
1173 
1174 		/*
1175 		 * Perform the actual invalidation call outside vmc_lock to
1176 		 * avoid lock ordering issues in the consumer.  Since the client
1177 		 * is under VCS_HOLD, this is safe.
1178 		 */
1179 		mutex_exit(&vmc->vmc_lock);
1180 		func(data, addr, size);
1181 		mutex_enter(&vmc->vmc_lock);
1182 	}
1183 	mutex_exit(&vmc->vmc_lock);
1184 }
1185 
1186 static void
1187 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
1188     vm_object_t *vmo)
1189 {
1190 	mutex_enter(&vmc->vmc_lock);
1191 	VERIFY(vmc->vmc_state & VCS_HOLD);
1192 
1193 	/*
1194 	 * With the current vCPU exclusion invariants in place, we do not expect
1195 	 * a vCPU to be in guest context during an unmap.
1196 	 */
1197 	VERIFY0(vmc->vmc_state & VCS_ON_CPU);
1198 
1199 	/*
1200 	 * Any holds against the unmapped region need to establish their own
1201 	 * reference to the underlying object to avoid a potential
1202 	 * use-after-free.
1203 	 */
1204 	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1205 	    vmp != NULL;
1206 	    vmp = list_next(&vmc->vmc_held_pages, vmc)) {
1207 		if (vmp->vmp_gpa < addr ||
1208 		    vmp->vmp_gpa >= (addr + size)) {
1209 			/* Hold outside region in question */
1210 			continue;
1211 		}
1212 		if (vmp->vmp_obj_ref == NULL) {
1213 			vm_object_reference(vmo);
1214 			vmp->vmp_obj_ref = vmo;
1215 			/* For an unmapped region, PTE is now meaningless */
1216 			vmp->vmp_ptep = NULL;
1217 		} else {
1218 			/*
1219 			 * Object could have gone through cycle of
1220 			 * unmap-map-unmap before the hold was released.
1221 			 */
1222 			VERIFY3P(vmp->vmp_ptep, ==, NULL);
1223 		}
1224 	}
1225 	mutex_exit(&vmc->vmc_lock);
1226 }
1227 
1228 static vm_client_t *
1229 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
1230 {
1231 	vm_client_t *next;
1232 
1233 	ASSERT(MUTEX_HELD(&vms->vms_lock));
1234 
1235 	mutex_enter(&vmc->vmc_lock);
1236 	VERIFY3P(vmc->vmc_space, ==, vms);
1237 	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
1238 	if (vmc->vmc_state & VCS_DESTROY) {
1239 		/*
1240 		 * This vm_client is currently undergoing destruction, so it
1241 		 * does not need to be orphaned.  Let it proceed with its own
1242 		 * clean-up task.
1243 		 */
1244 		next = list_next(&vms->vms_clients, vmc);
1245 	} else {
1246 		/*
1247 		 * Clients are only orphaned when the containing vmspace is
1248 		 * being torn down.  All mappings from the vmspace should
1249 		 * already be gone, meaning any remaining held pages should have
1250 		 * direct references to the object.
1251 		 */
1252 		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1253 		    vmp != NULL;
1254 		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1255 			ASSERT3P(vmp->vmp_ptep, ==, NULL);
1256 			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
1257 		}
1258 
1259 		/*
1260 		 * After this point, the client will be orphaned, unable to
1261 		 * establish new page holds (or access any vmspace-related
1262 		 * resources) and is in charge of cleaning up after itself.
1263 		 */
1264 		vmc->vmc_state |= VCS_ORPHANED;
1265 		next = list_next(&vms->vms_clients, vmc);
1266 		list_remove(&vms->vms_clients, vmc);
1267 		vmc->vmc_space = NULL;
1268 	}
1269 	mutex_exit(&vmc->vmc_lock);
1270 	return (next);
1271 }
1272 
1273 /*
1274  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1275  */
1276 vm_page_t *
1277 vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
1278 {
1279 	vmspace_t *vms = vmc->vmc_space;
1280 	vm_page_t *vmp;
1281 	pfn_t pfn = PFN_INVALID;
1282 	uint64_t *ptep = NULL;
1283 
1284 	ASSERT0(gpa & PAGEOFFSET);
1285 	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
1286 	ASSERT0(prot & ~PROT_ALL);
1287 	ASSERT0(flags & ~VPF_ALL);
1288 
1289 	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
1290 	if (vmc_activate(vmc) != 0) {
1291 		kmem_free(vmp, sizeof (*vmp));
1292 		return (NULL);
1293 	}
1294 
1295 	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
1296 		vmc_deactivate(vmc);
1297 		kmem_free(vmp, sizeof (*vmp));
1298 		return (NULL);
1299 	}
1300 	ASSERT(pfn != PFN_INVALID && ptep != NULL);
1301 
1302 	vmp->vmp_client = vmc;
1303 	vmp->vmp_chain = NULL;
1304 	vmp->vmp_gpa = gpa;
1305 	vmp->vmp_pfn = pfn;
1306 	vmp->vmp_ptep = ptep;
1307 	vmp->vmp_obj_ref = NULL;
1308 	vmp->vmp_prot = (uint8_t)prot;
1309 	vmp->vmp_flags = (uint8_t)flags;
1310 	list_insert_tail(&vmc->vmc_held_pages, vmp);
1311 	vmc_deactivate(vmc);
1312 
1313 	return (vmp);
1314 }
1315 
1316 /*
1317  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1318  */
1319 vm_page_t *
1320 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
1321 {
1322 	return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
1323 }
1324 
1325 int
1326 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
1327 {
1328 	vmspace_t *vms = vmc->vmc_space;
1329 	int err;
1330 
1331 	err = vmc_activate(vmc);
1332 	if (err == 0) {
1333 		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
1334 		vmc_deactivate(vmc);
1335 	}
1336 
1337 	return (err);
1338 }
1339 
1340 /*
1341  * Allocate an additional vm_client_t, based on an existing one.  Only the
1342  * associatation with the vmspace is cloned, not existing holds or any
1343  * configured invalidation function.
1344  */
1345 vm_client_t *
1346 vmc_clone(vm_client_t *vmc)
1347 {
1348 	vmspace_t *vms = vmc->vmc_space;
1349 
1350 	return (vmspace_client_alloc(vms));
1351 }
1352 
1353 /*
1354  * Register a function (and associated data pointer) to be called when an
1355  * address range in the vmspace is invalidated.
1356  */
1357 int
1358 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
1359 {
1360 	int err;
1361 
1362 	err = vmc_activate(vmc);
1363 	if (err == 0) {
1364 		vmc->vmc_inval_func = func;
1365 		vmc->vmc_inval_data = data;
1366 		vmc_deactivate(vmc);
1367 	}
1368 
1369 	return (err);
1370 }
1371 
1372 /*
1373  * Destroy a vm_client_t instance.
1374  *
1375  * No pages held through this vm_client_t may be outstanding when performing a
1376  * vmc_destroy().  For vCPU clients, the client cannot be on-CPU (a call to
1377  * vmc_table_exit() has been made).
1378  */
1379 void
1380 vmc_destroy(vm_client_t *vmc)
1381 {
1382 	mutex_enter(&vmc->vmc_lock);
1383 
1384 	VERIFY(list_is_empty(&vmc->vmc_held_pages));
1385 	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1386 
1387 	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
1388 		vmspace_t *vms;
1389 
1390 		/*
1391 		 * Deassociation with the parent vmspace must be done carefully:
1392 		 * The vmspace could attempt to orphan this vm_client while we
1393 		 * release vmc_lock in order to take vms_lock (the required
1394 		 * order).  The client is marked to indicate that destruction is
1395 		 * under way.  Doing so prevents any racing orphan operation
1396 		 * from applying to this client, allowing us to deassociate from
1397 		 * the vmspace safely.
1398 		 */
1399 		vmc->vmc_state |= VCS_DESTROY;
1400 		vms = vmc->vmc_space;
1401 		mutex_exit(&vmc->vmc_lock);
1402 
1403 		mutex_enter(&vms->vms_lock);
1404 		mutex_enter(&vmc->vmc_lock);
1405 		list_remove(&vms->vms_clients, vmc);
1406 		/*
1407 		 * If the vmspace began its own destruction operation while we
1408 		 * were navigating the locks, be sure to notify it about this
1409 		 * vm_client being deassociated.
1410 		 */
1411 		cv_signal(&vms->vms_cv);
1412 		mutex_exit(&vmc->vmc_lock);
1413 		mutex_exit(&vms->vms_lock);
1414 	} else {
1415 		VERIFY3P(vmc->vmc_space, ==, NULL);
1416 		mutex_exit(&vmc->vmc_lock);
1417 	}
1418 
1419 	mutex_destroy(&vmc->vmc_lock);
1420 	cv_destroy(&vmc->vmc_cv);
1421 	list_destroy(&vmc->vmc_held_pages);
1422 
1423 	kmem_free(vmc, sizeof (*vmc));
1424 }
1425 
1426 static __inline void *
1427 vmp_ptr(const vm_page_t *vmp)
1428 {
1429 	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
1430 
1431 	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
1432 	return ((void *)((uintptr_t)kpm_vbase + paddr));
1433 }
1434 
1435 /*
1436  * Get a readable kernel-virtual pointer for a held page.
1437  *
1438  * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
1439  * call to acquire this page reference.
1440  */
1441 const void *
1442 vmp_get_readable(const vm_page_t *vmp)
1443 {
1444 	ASSERT(vmp->vmp_prot & PROT_READ);
1445 
1446 	return (vmp_ptr(vmp));
1447 }
1448 
1449 /*
1450  * Get a writable kernel-virtual pointer for a held page.
1451  *
1452  * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
1453  * call to acquire this page reference.
1454  */
1455 void *
1456 vmp_get_writable(const vm_page_t *vmp)
1457 {
1458 	ASSERT(vmp->vmp_prot & PROT_WRITE);
1459 
1460 	return (vmp_ptr(vmp));
1461 }
1462 
1463 /*
1464  * Get the host-physical PFN for a held page.
1465  */
1466 pfn_t
1467 vmp_get_pfn(const vm_page_t *vmp)
1468 {
1469 	return (vmp->vmp_pfn);
1470 }
1471 
1472 /*
1473  * If this page was deferring dirty-marking in the corresponding vmspace page
1474  * tables, clear such a state so it is considered dirty from now on.
1475  */
1476 void
1477 vmp_mark_dirty(vm_page_t *vmp)
1478 {
1479 	ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);
1480 
1481 	atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
1482 }
1483 
1484 /*
1485  * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
1486  */
1487 void
1488 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
1489 {
1490 	ASSERT3P(vmp->vmp_chain, ==, NULL);
1491 
1492 	vmp->vmp_chain = to_chain;
1493 }
1494 
1495 /*
1496  * Retrieve the pointer from the page-chaining in `vmp`.
1497  */
1498 vm_page_t *
1499 vmp_next(const vm_page_t *vmp)
1500 {
1501 	return (vmp->vmp_chain);
1502 }
1503 
1504 static __inline bool
1505 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
1506 {
1507 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1508 
1509 	bool was_unmapped = false;
1510 
1511 	list_remove(&vmc->vmc_held_pages, vmp);
1512 	if (vmp->vmp_obj_ref != NULL) {
1513 		ASSERT3P(vmp->vmp_ptep, ==, NULL);
1514 
1515 		vm_object_release(vmp->vmp_obj_ref);
1516 		was_unmapped = true;
1517 	} else {
1518 		ASSERT3P(vmp->vmp_ptep, !=, NULL);
1519 
1520 		/*
1521 		 * Track appropriate (accessed/dirty) bits for the guest-virtual
1522 		 * address corresponding to this page, if it is from the vmspace
1523 		 * rather than a direct reference to an underlying object.
1524 		 *
1525 		 * The protection and/or configured flags may obviate the need
1526 		 * for such an update.
1527 		 */
1528 		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
1529 		    (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
1530 		    vmc->vmc_track_dirty) {
1531 			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
1532 			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
1533 		}
1534 	}
1535 	kmem_free(vmp, sizeof (*vmp));
1536 	return (was_unmapped);
1537 }
1538 
1539 /*
1540  * Release held page.  Returns true if page resided on region which was
1541  * subsequently unmapped.
1542  */
1543 bool
1544 vmp_release(vm_page_t *vmp)
1545 {
1546 	vm_client_t *vmc = vmp->vmp_client;
1547 
1548 	VERIFY(vmc != NULL);
1549 
1550 	mutex_enter(&vmc->vmc_lock);
1551 	const bool was_unmapped = vmp_release_inner(vmp, vmc);
1552 	mutex_exit(&vmc->vmc_lock);
1553 	return (was_unmapped);
1554 }
1555 
1556 /*
1557  * Release a chain of pages which were associated via vmp_chain() (setting
1558  * page-chaining pointer).  Returns true if any pages resided upon a region
1559  * which was subsequently unmapped.
1560  *
1561  * All of those pages must have been held through the same vm_client_t.
1562  */
1563 bool
1564 vmp_release_chain(vm_page_t *vmp)
1565 {
1566 	vm_client_t *vmc = vmp->vmp_client;
1567 	bool any_unmapped = false;
1568 
1569 	ASSERT(vmp != NULL);
1570 
1571 	mutex_enter(&vmc->vmc_lock);
1572 	while (vmp != NULL) {
1573 		vm_page_t *next = vmp->vmp_chain;
1574 
1575 		/* We expect all pages in chain to be from same client */
1576 		ASSERT3P(vmp->vmp_client, ==, vmc);
1577 
1578 		if (vmp_release_inner(vmp, vmc)) {
1579 			any_unmapped = true;
1580 		}
1581 		vmp = next;
1582 	}
1583 	mutex_exit(&vmc->vmc_lock);
1584 	return (any_unmapped);
1585 }
1586 
1587 
1588 int
1589 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
1590     struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
1591 {
1592 	vm_object_t *vmo;
1593 	int err;
1594 
1595 	if (segoff < 0 || len <= 0 ||
1596 	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
1597 		return (EINVAL);
1598 	}
1599 	if ((prot & PROT_USER) == 0) {
1600 		return (ENOTSUP);
1601 	}
1602 	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
1603 	if (err != 0) {
1604 		return (err);
1605 	}
1606 
1607 	VERIFY(segoff >= 0);
1608 	VERIFY(len <= vmo->vmo_size);
1609 	VERIFY((len + segoff) <= vmo->vmo_size);
1610 
1611 	if (vmo->vmo_type != VMOT_MEM) {
1612 		/* Only support memory objects for now */
1613 		return (ENOTSUP);
1614 	}
1615 
1616 	as_rangelock(as);
1617 
1618 	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
1619 	if (err == 0) {
1620 		segvmm_crargs_t svma;
1621 
1622 		svma.prot = prot;
1623 		svma.offset = segoff;
1624 		svma.vmo = vmo;
1625 		svma.vmc = NULL;
1626 
1627 		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
1628 	}
1629 
1630 	as_rangeunlock(as);
1631 	return (err);
1632 }
1633 
1634 int
1635 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
1636     off_t len, uint_t prot, uint_t maxprot, uint_t flags)
1637 {
1638 
1639 	const uintptr_t gpa = (uintptr_t)off;
1640 	const size_t size = (uintptr_t)len;
1641 	int err;
1642 
1643 	if (off < 0 || len <= 0 ||
1644 	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
1645 		return (EINVAL);
1646 	}
1647 	if ((prot & PROT_USER) == 0) {
1648 		return (ENOTSUP);
1649 	}
1650 
1651 	as_rangelock(as);
1652 
1653 	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
1654 	if (err == 0) {
1655 		segvmm_crargs_t svma;
1656 
1657 		svma.prot = prot;
1658 		svma.offset = gpa;
1659 		svma.vmo = NULL;
1660 		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
1661 
1662 		err = as_map(as, *addrp, len, segvmm_create, &svma);
1663 	}
1664 
1665 	as_rangeunlock(as);
1666 	return (err);
1667 }
1668