xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_vm.c (revision 067afcb3a52b5ebe76a00b3f366353cad54c649e)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2022 Oxide Computer Company
16  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/kmem.h>
21 #include <sys/thread.h>
22 #include <sys/list.h>
23 #include <sys/mman.h>
24 #include <sys/types.h>
25 #include <sys/ddi.h>
26 #include <sys/sysmacros.h>
27 #include <sys/machsystm.h>
28 #include <sys/vmsystm.h>
29 #include <sys/x86_archext.h>
30 #include <vm/as.h>
31 #include <vm/hat_i86.h>
32 #include <vm/seg_vn.h>
33 #include <vm/seg_kmem.h>
34 
35 #include <sys/vmm_vm.h>
36 #include <sys/seg_vmm.h>
37 #include <sys/vmm_kernel.h>
38 #include <sys/vmm_reservoir.h>
39 #include <sys/vmm_gpt.h>
40 
41 
42 /*
43  * VMM Virtual Memory
44  *
45  * History
46  *
47  * When bhyve was ported to illumos, one significant hole was handling guest
48  * memory and memory accesses.  In the original Pluribus port, bhyve itself
49  * manually handled the EPT structures for guest memory.  The updated sources
50  * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
51  * system for memory allocations and management of the EPT structures.  Keeping
52  * source differences to a minimum was a priority, so illumos-bhyve implemented
53  * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
54  * boot and run guests.
55  *
56  * While the VM shim was successful in getting illumos-bhyve to a functional
57  * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
58  * compatibility interfaces made it awkward to use.  As source differences with
59  * the upstream kernel code became less of a concern, and upcoming features
60  * (such as live migration) would demand more of those VM interfaces, it became
61  * clear that an overhaul was prudent.
62  *
63  * Design
64  *
65  * The new VM system for bhyve retains a number of the same concepts as what it
66  * replaces:
67  *
68  * - `vmspace_t` is the top-level entity for a guest memory space
69  * - `vm_object_t` represents a memory object which can be mapped into a vmspace
70  * - `vm_page_t` represents a page hold within a given vmspace, providing access
71  *   to the underlying memory page
72  *
73  * Unlike the old code, where most of the involved structures were exposed via
74  * public definitions, this replacement VM interface keeps all involved
75  * structures opaque to consumers.  Furthermore, there is a clear delineation
76  * between infrequent administrative operations (such as mapping/unmapping
77  * regions) and common data-path operations (attempting a page hold at a given
78  * guest-physical address).  Those administrative operations are performed
79  * directly against the vmspace, whereas the data-path operations are performed
80  * through a `vm_client_t` handle.  That VM client abstraction is meant to
81  * reduce contention and overhead for frequent access operations and provide
82  * debugging insight into how different subcomponents are accessing the vmspace.
83  * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
84  * interface) and each VMM userspace segment mapping.
85  *
86  * Exclusion
87  *
88  * Making changes to the vmspace (such as mapping or unmapping regions) requires
89  * other accessors be excluded while the change is underway to prevent them from
90  * observing invalid intermediate states.  A simple approach could use a mutex
91  * or rwlock to achieve this, but that risks contention when the rate of access
92  * to the vmspace is high.
93  *
94  * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
95  * on a per-vm_client_t basis.  While this raises the cost for vmspace changes,
96  * it means that the much more common page accesses through the vm_client can
97  * normally proceed unimpeded and independently.
98  *
99  * When a change to the vmspace is required, the caller will put the vmspace in
100  * a 'hold' state, iterating over all associated vm_client instances, waiting
101  * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
102  * setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any call on
103  * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
104  * will block until the hold condition is cleared.  Once the hold is asserted
105  * for all clients, the vmspace change can proceed with confidence.  Upon
106  * completion of that operation, VCS_HOLD is cleared from the clients, and they
107  * are released to resume vmspace accesses.
108  *
109  * vCPU Consumers
110  *
111  * Access to the vmspace for vCPUs running in guest context is different from
112  * emulation-related vm_client activity: they solely rely on the contents of the
113  * page tables.  Furthermore, the existing VCS_HOLD mechanism used to exclude
114  * client access is not feasible when entering guest context, since interrupts
115  * are disabled, making it impossible to block entry.  This is not a concern as
116  * long as vmspace modifications never place the page tables in invalid states
117  * (either intermediate, or final).  The vm_client hold mechanism does provide
118  * the means to IPI vCPU consumers which will trigger a notification once they
119  * report their exit from guest context.  This can be used to ensure that page
120  * table modifications are made visible to those vCPUs within a certain
121  * time frame.
122  */
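
/*
 * Illustrative sketch (not part of the original source): a minimal data-path
 * consumer of the interfaces described above.  It assumes a vm_client_t
 * previously allocated by its consumer via vmspace_client_alloc() or
 * vmc_clone(); the function name and parameters are hypothetical.
 */
static int
example_client_write(vm_client_t *vmc, uintptr_t gpa, uint8_t val)
{
	vm_page_t *vmp;

	/*
	 * If an administrative change (vmspace_map()/vmspace_unmap()) is in
	 * flight, this hold blocks until VCS_HOLD is lifted.  A NULL result
	 * means the client was orphaned, or that no mapping covers the
	 * address with the requested protection.
	 */
	vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_WRITE);
	if (vmp == NULL) {
		return (EFAULT);
	}

	((uint8_t *)vmp_get_writable(vmp))[gpa & PAGEOFFSET] = val;
	(void) vmp_release(vmp);
	return (0);
}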
123 
124 typedef struct vmspace_mapping {
125 	list_node_t	vmsm_node;
126 	vm_object_t	*vmsm_object;	/* object backing this mapping */
127 	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
128 	size_t		vmsm_len;	/* length (in bytes) of mapping */
129 	off_t		vmsm_offset;	/* byte offset into object */
130 	uint_t		vmsm_prot;
131 } vmspace_mapping_t;
132 
133 #define	VMSM_OFFSET(vmsm, addr)	(			\
134 	    (vmsm)->vmsm_offset +			\
135 	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
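
/*
 * For example (hypothetical values): given a mapping with vmsm_addr of
 * 0x10000000 and vmsm_offset of 0x2000, a guest-physical address of
 * 0x10003000 resolves to object offset
 * VMSM_OFFSET(vmsm, 0x10003000) = 0x2000 + (0x10003000 - 0x10000000) = 0x5000.
 */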
136 
137 typedef enum vm_client_state {
138 	VCS_IDLE	= 0,
139 	/* currently accessing vmspace for client operation (hold or fault) */
140 	VCS_ACTIVE	= (1 << 0),
141 	/* client hold requested/asserted */
142 	VCS_HOLD	= (1 << 1),
143 	/* vCPU is accessing page tables in guest context */
144 	VCS_ON_CPU	= (1 << 2),
145 	/* client has been orphaned (no more access to vmspace) */
146 	VCS_ORPHANED	= (1 << 3),
147 	/* client undergoing destroy operation */
148 	VCS_DESTROY	= (1 << 4),
149 } vm_client_state_t;
150 
151 struct vmspace {
152 	kmutex_t	vms_lock;
153 	kcondvar_t	vms_cv;
154 	bool		vms_held;
155 	uintptr_t	vms_size;	/* immutable after creation */
156 
157 	/* (nested) page table state */
158 	vmm_gpt_t	*vms_gpt;
159 	uint64_t	vms_pt_gen;
160 	uint64_t	vms_pages_mapped;
161 	bool		vms_track_dirty;
162 
163 	list_t		vms_maplist;
164 	list_t		vms_clients;
165 };
166 
167 struct vm_client {
168 	vmspace_t	*vmc_space;
169 	list_node_t	vmc_node;
170 
171 	kmutex_t	vmc_lock;
172 	kcondvar_t	vmc_cv;
173 	vm_client_state_t vmc_state;
174 	int		vmc_cpu_active;
175 	uint64_t	vmc_cpu_gen;
176 	bool		vmc_track_dirty;
177 	vmc_inval_cb_t	vmc_inval_func;
178 	void		*vmc_inval_data;
179 
180 	list_t		vmc_held_pages;
181 };
182 
183 typedef enum vm_object_type {
184 	VMOT_NONE,
185 	VMOT_MEM,
186 	VMOT_MMIO,
187 } vm_object_type_t;
188 
189 struct vm_object {
190 	uint_t		vmo_refcnt;	/* manipulated with atomic ops */
191 
192 	/* Fields below are fixed at creation time */
193 	vm_object_type_t vmo_type;
194 	size_t		vmo_size;
195 	void		*vmo_data;
196 	uint8_t		vmo_attr;
197 };
198 
199 struct vm_page {
200 	vm_client_t	*vmp_client;
201 	list_node_t	vmp_node;
202 	vm_page_t	*vmp_chain;
203 	uintptr_t	vmp_gpa;
204 	pfn_t		vmp_pfn;
205 	uint64_t	*vmp_ptep;
206 	vm_object_t	*vmp_obj_ref;
207 	int		vmp_prot;
208 };
209 
210 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
211 static void vmspace_hold_enter(vmspace_t *);
212 static void vmspace_hold_exit(vmspace_t *, bool);
213 static void vmc_space_hold(vm_client_t *);
214 static void vmc_space_release(vm_client_t *, bool);
215 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
216 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
217 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
218 
219 
220 /*
221  * Create a new vmspace with a maximum address of `end`.
222  */
223 vmspace_t *
224 vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
225 {
226 	vmspace_t *vms;
227 	const uintptr_t size = end + 1;
228 
229 	/*
230 	 * This whole mess is built on the assumption that a 64-bit address
231 	 * space is available to work with for the various pagetable tricks.
232 	 */
233 	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
234 	    size <= (uintptr_t)USERLIMIT);
235 
236 	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
237 	vms->vms_size = size;
238 	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
239 	    offsetof(vmspace_mapping_t, vmsm_node));
240 	list_create(&vms->vms_clients, sizeof (vm_client_t),
241 	    offsetof(vm_client_t, vmc_node));
242 
243 	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
244 	vms->vms_pt_gen = 1;
245 	vms->vms_track_dirty = track_dirty;
246 
247 	return (vms);
248 }
249 
250 /*
251  * Destroy a vmspace.  All regions in the space must be unmapped.  Any remaining
252  * clients will be orphaned.
253  */
254 void
255 vmspace_destroy(vmspace_t *vms)
256 {
257 	mutex_enter(&vms->vms_lock);
258 	VERIFY(list_is_empty(&vms->vms_maplist));
259 
260 	if (!list_is_empty(&vms->vms_clients)) {
261 		vm_client_t *vmc = list_head(&vms->vms_clients);
262 		while (vmc != NULL) {
263 			vmc = vmc_space_orphan(vmc, vms);
264 		}
265 		/*
266 		 * Wait for any clients which were in the process of destroying
267 		 * themselves to disappear.
268 		 */
269 		while (!list_is_empty(&vms->vms_clients)) {
270 			cv_wait(&vms->vms_cv, &vms->vms_lock);
271 		}
272 	}
273 	VERIFY(list_is_empty(&vms->vms_clients));
274 
275 	vmm_gpt_free(vms->vms_gpt);
276 	mutex_exit(&vms->vms_lock);
277 
278 	mutex_destroy(&vms->vms_lock);
279 	cv_destroy(&vms->vms_cv);
280 	list_destroy(&vms->vms_maplist);
281 	list_destroy(&vms->vms_clients);
282 
283 	kmem_free(vms, sizeof (*vms));
284 }
285 
286 /*
287  * Retrieve the count of resident (mapped into the page tables) pages.
288  */
289 uint64_t
290 vmspace_resident_count(vmspace_t *vms)
291 {
292 	return (vms->vms_pages_mapped);
293 }
294 
295 int
296 vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap)
297 {
298 	if (!vms->vms_track_dirty)
299 		return (EPERM);
300 
301 	/*
302 	 * Accumulate dirty bits into the given bit vector.  Note that this
303 	 * races both against hardware writes from running vCPUs and
304 	 * reflections from userspace.
305 	 *
306 	 * Called from a userspace-visible ioctl, this depends on the VM
307 	 * instance being read-locked to prevent vmspace_map/vmspace_unmap
308 	 * operations from changing the page tables during the walk.
309 	 */
310 	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
311 		bool bit = false;
312 		uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset);
313 		if (entry != NULL)
314 			bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false);
315 		uint64_t pfn_offset = offset >> PAGESHIFT;
316 		size_t bit_offset = pfn_offset / 8;
317 		size_t bit_index = pfn_offset % 8;
318 		bitmap[bit_offset] |= (bit << bit_index);
319 	}
320 
321 	/*
322 	 * Now invalidate those bits and shoot down address spaces that
323 	 * may have them cached.
324 	 */
325 	vmspace_hold_enter(vms);
326 	vms->vms_pt_gen++;
327 	for (vm_client_t *vmc = list_head(&vms->vms_clients);
328 	    vmc != NULL;
329 	    vmc = list_next(&vms->vms_clients, vmc)) {
330 		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
331 	}
332 	vmspace_hold_exit(vms, true);
333 
334 	return (0);
335 }
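
/*
 * A worked example of the bitmap packing above (hypothetical values): with
 * 4K pages, an offset of 0x2b000 yields pfn_offset = 0x2b000 >> PAGESHIFT =
 * 43, so a dirty page there sets bit (43 % 8) = 3 in bitmap[43 / 8] =
 * bitmap[5].  Bits are packed LSB-first, so for a page-aligned `len` the
 * caller must supply at least ((len / PAGESIZE) + 7) / 8 bytes of bitmap.
 */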
336 
337 static pfn_t
338 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
339 {
340 	vmmr_region_t *region;
341 	pfn_t pfn;
342 
343 	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
344 
345 	region = vmo->vmo_data;
346 	pfn = vmmr_region_pfn_at(region, off);
347 
348 	return (pfn);
349 }
350 
351 static pfn_t
352 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
353 {
354 	pfn_t pfn;
355 
356 	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
357 	ASSERT3P(vmo->vmo_data, !=, NULL);
358 	ASSERT3U(off, <, vmo->vmo_size);
359 
360 	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
361 
362 	return (pfn);
363 }
364 
365 /*
366  * Allocate a VM object backed by VMM reservoir memory.
367  */
368 vm_object_t *
369 vm_object_mem_allocate(size_t size, bool transient)
370 {
371 	int err;
372 	vmmr_region_t *region = NULL;
373 	vm_object_t *vmo;
374 
375 	ASSERT3U(size, !=, 0);
376 	ASSERT3U(size & PAGEOFFSET, ==, 0);
377 
378 	err = vmmr_alloc(size, transient, &region);
379 	if (err != 0) {
380 		return (NULL);
381 	}
382 
383 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
384 
385 	/* For now, these are to stay fixed after allocation */
386 	vmo->vmo_type = VMOT_MEM;
387 	vmo->vmo_size = size;
388 	vmo->vmo_attr = MTRR_TYPE_WB;
389 	vmo->vmo_data = region;
390 	vmo->vmo_refcnt = 1;
391 
392 	return (vmo);
393 }
394 
395 static vm_object_t *
396 vm_object_mmio_allocate(size_t size, uintptr_t hpa)
397 {
398 	vm_object_t *vmo;
399 
400 	ASSERT3U(size, !=, 0);
401 	ASSERT3U(size & PAGEOFFSET, ==, 0);
402 	ASSERT3U(hpa & PAGEOFFSET, ==, 0);
403 
404 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
405 
406 	/* For now, these are to stay fixed after allocation */
407 	vmo->vmo_type = VMOT_MMIO;
408 	vmo->vmo_size = size;
409 	vmo->vmo_attr = MTRR_TYPE_UC;
410 	vmo->vmo_data = (void *)hpa;
411 	vmo->vmo_refcnt = 1;
412 
413 	return (vmo);
414 }
415 
416 /*
417  * Allocate a VM object backed by an existing range of physical memory.
418  */
419 vm_object_t *
420 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
421 {
422 	int error;
423 	vm_object_t *obj;
424 
425 	obj = vm_object_mmio_allocate(len, hpa);
426 	if (obj != NULL) {
427 		error = vmspace_map(vmspace, obj, 0, gpa, len,
428 		    PROT_READ | PROT_WRITE);
429 		if (error != 0) {
430 			vm_object_release(obj);
431 			obj = NULL;
432 		}
433 	}
434 
435 	return (obj);
436 }
437 
438 /*
439  * Release a vm_object reference
440  */
441 void
442 vm_object_release(vm_object_t *vmo)
443 {
444 	ASSERT(vmo != NULL);
445 
446 	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
447 	/* underflow would be a deadly serious mistake */
448 	VERIFY3U(ref, !=, UINT_MAX);
449 	if (ref != 0) {
450 		return;
451 	}
452 
453 	switch (vmo->vmo_type) {
454 	case VMOT_MEM:
455 		vmmr_free((vmmr_region_t *)vmo->vmo_data);
456 		break;
457 	case VMOT_MMIO:
458 		break;
459 	default:
460 		panic("unexpected object type %u", vmo->vmo_type);
461 		break;
462 	}
463 
464 	vmo->vmo_data = NULL;
465 	vmo->vmo_size = 0;
466 	kmem_free(vmo, sizeof (*vmo));
467 }
468 
469 /*
470  * Increase refcount for vm_object reference
471  */
472 void
473 vm_object_reference(vm_object_t *vmo)
474 {
475 	ASSERT(vmo != NULL);
476 
477 	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
478 	/* overflow would be a deadly serious mistake */
479 	VERIFY3U(ref, !=, 0);
480 }
481 
482 /*
483  * Get the host-physical PFN for a given offset into a vm_object.
484  *
485  * The provided `off` must be within the allocated size of the vm_object.
486  */
487 pfn_t
488 vm_object_pfn(vm_object_t *vmo, uintptr_t off)
489 {
490 	const uintptr_t aligned_off = off & PAGEMASK;
491 
492 	switch (vmo->vmo_type) {
493 	case VMOT_MEM:
494 		return (vm_object_pager_reservoir(vmo, aligned_off));
495 	case VMOT_MMIO:
496 		return (vm_object_pager_mmio(vmo, aligned_off));
497 	case VMOT_NONE:
498 		break;
499 	}
500 	panic("unexpected object type %u", vmo->vmo_type);
501 }
502 
503 static vmspace_mapping_t *
504 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
505 {
506 	vmspace_mapping_t *vmsm;
507 	list_t *ml = &vms->vms_maplist;
508 	const uintptr_t range_end = addr + size;
509 
510 	ASSERT3U(addr, <=, range_end);
511 
512 	if (addr >= vms->vms_size) {
513 		return (NULL);
514 	}
515 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
516 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
517 
518 		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
519 			if (range_end <= seg_end) {
520 				return (vmsm);
521 			} else {
522 				return (NULL);
523 			}
524 		}
525 	}
526 	return (NULL);
527 }
528 
529 /*
530  * Check to see if any mappings reside within [addr, addr + size) span in the
531  * vmspace, returning true if that span is indeed empty.
532  */
533 static bool
534 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
535 {
536 	vmspace_mapping_t *vmsm;
537 	list_t *ml = &vms->vms_maplist;
538 	const uintptr_t range_end = addr + size - 1;
539 
540 	ASSERT(MUTEX_HELD(&vms->vms_lock));
541 	ASSERT(size > 0);
542 
543 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
544 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
545 
546 		/*
547 		 * The two ranges do not overlap if the start of either of
548 		 * them is after the end of the other.
549 		 */
550 		if (vmsm->vmsm_addr > range_end || addr > seg_end)
551 			continue;
552 		return (false);
553 	}
554 	return (true);
555 }
556 
557 static void
558 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
559 {
560 	list_t *ml = &vms->vms_maplist;
561 
562 	ASSERT(MUTEX_HELD(&vms->vms_lock));
563 	ASSERT(vms->vms_held);
564 
565 	list_remove(ml, vmsm);
566 	vm_object_release(vmsm->vmsm_object);
567 	kmem_free(vmsm, sizeof (*vmsm));
568 }
569 
570 /*
571  * Enter a hold state on the vmspace.  This ensures that all VM clients
572  * associated with the vmspace are excluded from establishing new page holds,
573  * or any other actions which would require accessing vmspace state subject to
574  * potential change.
575  *
576  * Returns with vmspace_t`vms_lock held.
577  */
578 static void
579 vmspace_hold_enter(vmspace_t *vms)
580 {
581 	mutex_enter(&vms->vms_lock);
582 	VERIFY(!vms->vms_held);
583 
584 	vm_client_t *vmc = list_head(&vms->vms_clients);
585 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
586 		vmc_space_hold(vmc);
587 	}
588 	vms->vms_held = true;
589 }
590 
591 /*
592  * Exit a hold state on the vmspace.  This releases all VM clients associated
593  * with the vmspace to be able to establish new page holds, and partake in other
594  * actions which require accessing changed vmspace state.  If `kick_on_cpu` is
595  * true, then any CPUs actively using the page tables will be IPIed, and the
596  * call will block until they have acknowledged being ready to use the latest
597  * state of the tables.
598  *
599  * Requires vmspace_t`vms_lock be held, which is released as part of the call.
600  */
601 static void
602 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
603 {
604 	ASSERT(MUTEX_HELD(&vms->vms_lock));
605 	VERIFY(vms->vms_held);
606 
607 	vm_client_t *vmc = list_head(&vms->vms_clients);
608 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
609 		vmc_space_release(vmc, kick_on_cpu);
610 	}
611 	vms->vms_held = false;
612 	mutex_exit(&vms->vms_lock);
613 }
614 
615 /*
616  * Attempt to map a vm_object span into the vmspace.
617  *
618  * Requirements:
619  * - `obj_off`, `addr`, and `len` must be page-aligned
620  * - `obj_off` cannot be greater than the allocated size of the object
621  * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
622  *   size of the object
623  * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
624  *   of the vmspace
625  */
626 int
627 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
628     size_t len, uint8_t prot)
629 {
630 	vmspace_mapping_t *vmsm;
631 	int res = 0;
632 
633 	if (len == 0 || (addr + len) < addr ||
634 	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
635 		return (EINVAL);
636 	}
637 	if ((addr + len) > vms->vms_size) {
638 		return (ENOMEM);
639 	}
640 
641 	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
642 
643 	vmspace_hold_enter(vms);
644 	if (!vm_mapping_gap(vms, addr, len)) {
645 		kmem_free(vmsm, sizeof (*vmsm));
646 		res = ENOMEM;
647 	} else {
648 		vmsm->vmsm_object = vmo;
649 		vmsm->vmsm_addr = addr;
650 		vmsm->vmsm_len = len;
651 		vmsm->vmsm_offset = (off_t)obj_off;
652 		vmsm->vmsm_prot = prot;
653 		list_insert_tail(&vms->vms_maplist, vmsm);
654 
655 		/*
656 		 * Make sure the GPT has tables ready for leaf entries across
657 		 * the entire new mapping.
658 		 */
659 		vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len);
660 	}
661 	vmspace_hold_exit(vms, false);
662 	return (res);
663 }
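
/*
 * Illustrative sketch (not part of the original source): the administrative
 * path for backing a range of guest-physical address space with reservoir
 * memory.  The function name, `gpa`, and `len` are hypothetical; note that
 * the mapping assumes ownership of the object reference created by
 * vm_object_mem_allocate(), which a later vmspace_unmap() will release.
 */
static int
example_map_guest_ram(vmspace_t *vms, uintptr_t gpa, size_t len)
{
	vm_object_t *vmo;
	int err;

	/* Returns NULL if the reservoir cannot satisfy the request */
	vmo = vm_object_mem_allocate(len, false);
	if (vmo == NULL) {
		return (ENOMEM);
	}

	err = vmspace_map(vms, vmo, 0, gpa, len, PROT_READ | PROT_WRITE);
	if (err != 0) {
		vm_object_release(vmo);
		return (err);
	}

	/* Optionally pre-fault the entire region into the page tables */
	return (vmspace_populate(vms, gpa, gpa + len));
}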
664 
665 /*
666  * Unmap a region of the vmspace.
667  *
668  * Presently the [start, end) span must equal a region previously mapped by a
669  * call to vmspace_map().
670  */
671 int
672 vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
673 {
674 	const size_t size = (size_t)(end - start);
675 	vmspace_mapping_t *vmsm;
676 	vm_client_t *vmc;
677 	uint64_t gen = 0;
678 
679 	ASSERT(start < end);
680 
681 	vmspace_hold_enter(vms);
682 	/* expect to match existing mapping exactly */
683 	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
684 	    vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
685 		vmspace_hold_exit(vms, false);
686 		return (ENOENT);
687 	}
688 
689 	/* Prepare clients (and their held pages) for the unmap. */
690 	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
691 	    vmc = list_next(&vms->vms_clients, vmc)) {
692 		vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
693 	}
694 
695 	/* Clear all PTEs for region */
696 	if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
697 		vms->vms_pt_gen++;
698 		gen = vms->vms_pt_gen;
699 	}
700 	/* ... and the intermediate (directory) PTEs as well */
701 	vmm_gpt_vacate_region(vms->vms_gpt, start, end);
702 
703 	/*
704 	 * If pages were actually unmapped from the GPT, provide clients with
705 	 * an invalidation notice.
706 	 */
707 	if (gen != 0) {
708 		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
709 		    vmc = list_next(&vms->vms_clients, vmc)) {
710 			vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen);
711 		}
712 	}
713 
714 	vm_mapping_remove(vms, vmsm);
715 	vmspace_hold_exit(vms, true);
716 	return (0);
717 }
718 
719 static int
720 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
721     uint64_t **ptepp)
722 {
723 	vmm_gpt_t *gpt = vms->vms_gpt;
724 	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
725 	pfn_t pfn = PFN_INVALID;
726 	uint_t prot;
727 
728 	ASSERT0(gpa & PAGEOFFSET);
729 	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
730 
731 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
732 	leaf = entries[LEVEL1];
733 	if (leaf == NULL) {
734 		/*
735 		 * Since we populated the intermediate tables for any regions
736 		 * mapped in the GPT, an empty leaf entry indicates there is no
737 		 * mapping, populated or not, at this GPA.
738 		 */
739 		return (FC_NOMAP);
740 	}
741 
742 	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
743 		if ((req_prot & prot) != req_prot) {
744 			return (FC_PROT);
745 		}
746 	} else {
747 		vmspace_mapping_t *vmsm;
748 		vm_object_t *vmo;
749 
750 		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
751 		if (vmsm == NULL) {
752 			return (FC_NOMAP);
753 		}
754 
755 		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
756 			return (FC_PROT);
757 		}
758 		vmo = vmsm->vmsm_object;
759 		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
760 		VERIFY(pfn != PFN_INVALID);
761 
762 		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
763 		    vmo->vmo_attr)) {
764 			atomic_inc_64(&vms->vms_pages_mapped);
765 		}
766 	}
767 
768 	ASSERT(pfn != PFN_INVALID && leaf != NULL);
769 	if (pfnp != NULL) {
770 		*pfnp = pfn;
771 	}
772 	if (ptepp != NULL) {
773 		*ptepp = leaf;
774 	}
775 	return (0);
776 }
777 
778 /*
779  * Populate (make resident in the page tables) a region of the vmspace.
780  *
781  * Presently the [start, end) span must equal a region previously mapped by a
782  * call to vmspace_map().
783  */
784 int
785 vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end)
786 {
787 	const size_t size = end - start;
788 	vmspace_mapping_t *vmsm;
789 
790 	mutex_enter(&vms->vms_lock);
791 
792 	/* For the time being, only exact-match mappings are expected */
793 	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) {
794 		mutex_exit(&vms->vms_lock);
795 		return (FC_NOMAP);
796 	}
797 
798 	vm_object_t *vmo = vmsm->vmsm_object;
799 	const int prot = vmsm->vmsm_prot;
800 	const uint8_t attr = vmo->vmo_attr;
801 	size_t populated = 0;
802 	for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) {
803 		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
804 		VERIFY(pfn != PFN_INVALID);
805 
806 		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
807 			populated++;
808 		}
809 	}
810 	atomic_add_64(&vms->vms_pages_mapped, populated);
811 
812 	mutex_exit(&vms->vms_lock);
813 	return (0);
814 }
815 
816 /*
817  * Allocate a client from a given vmspace.
818  */
819 vm_client_t *
820 vmspace_client_alloc(vmspace_t *vms)
821 {
822 	vm_client_t *vmc;
823 
824 	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
825 	vmc->vmc_space = vms;
826 	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
827 	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
828 	vmc->vmc_state = VCS_IDLE;
829 	vmc->vmc_cpu_active = -1;
830 	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
831 	    offsetof(vm_page_t, vmp_node));
832 	vmc->vmc_track_dirty = vms->vms_track_dirty;
833 
834 	mutex_enter(&vms->vms_lock);
835 	list_insert_tail(&vms->vms_clients, vmc);
836 	mutex_exit(&vms->vms_lock);
837 
838 	return (vmc);
839 }
840 
841 /*
842  * Get the nested page table root pointer (EPTP/NCR3) value.
843  */
844 uint64_t
845 vmspace_table_root(vmspace_t *vms)
846 {
847 	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
848 }
849 
850 /*
851  * Get the current generation number of the nested page table.
852  */
853 uint64_t
854 vmspace_table_gen(vmspace_t *vms)
855 {
856 	return (vms->vms_pt_gen);
857 }
858 
859 /*
860  * Mark a vm_client as active.  This will block if/while the client is held by
861  * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It will
862  * fail if the vm_client has been orphaned.
863  */
864 static int
865 vmc_activate(vm_client_t *vmc)
866 {
867 	mutex_enter(&vmc->vmc_lock);
868 	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
869 	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
870 		mutex_exit(&vmc->vmc_lock);
871 		return (ENXIO);
872 	}
873 	while ((vmc->vmc_state & VCS_HOLD) != 0) {
874 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
875 	}
876 	vmc->vmc_state |= VCS_ACTIVE;
877 	return (0);
878 }
879 
880 /*
881  * Mark a vm_client as no longer active.  It must be called with
882  * vm_client_t`vmc_lock already held, and will return with it released.
883  */
884 static void
885 vmc_deactivate(vm_client_t *vmc)
886 {
887 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
888 	VERIFY(vmc->vmc_state & VCS_ACTIVE);
889 
890 	vmc->vmc_state ^= VCS_ACTIVE;
891 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
892 		cv_broadcast(&vmc->vmc_cv);
893 	}
894 	mutex_exit(&vmc->vmc_lock);
895 }
896 
897 /*
898  * Indicate that a CPU will be utilizing the nested page tables through this VM
899  * client.  Interrupts (and/or the GIF) are expected to be disabled when calling
900  * this function.  Returns the generation number of the nested page table (to be
901  * used for TLB invalidations).
902  */
903 uint64_t
904 vmc_table_enter(vm_client_t *vmc)
905 {
906 	vmspace_t *vms = vmc->vmc_space;
907 	uint64_t gen;
908 
909 	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
910 	ASSERT3S(vmc->vmc_cpu_active, ==, -1);
911 
912 	/*
913 	 * Since the NPT activation occurs with interrupts disabled, this must
914 	 * be done without taking vmc_lock like normal.
915 	 */
916 	gen = vms->vms_pt_gen;
917 	vmc->vmc_cpu_active = CPU->cpu_id;
918 	vmc->vmc_cpu_gen = gen;
919 	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
920 
921 	return (gen);
922 }
923 
924 /*
925  * Indicate that this VM client is no longer (directly) using the underlying
926  * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
927  * this function.
928  */
929 void
930 vmc_table_exit(vm_client_t *vmc)
931 {
932 	mutex_enter(&vmc->vmc_lock);
933 
934 	ASSERT(vmc->vmc_state & VCS_ON_CPU);
935 	vmc->vmc_state ^= VCS_ON_CPU;
936 	vmc->vmc_cpu_active = -1;
937 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
938 		cv_broadcast(&vmc->vmc_cv);
939 	}
940 
941 	mutex_exit(&vmc->vmc_lock);
942 }
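
/*
 * Illustrative sketch (not part of the original source): the shape of the
 * guest-entry path expected of vCPU consumers.  The cached `eptgen` value
 * and the elided flush/entry steps are hypothetical stand-ins for the
 * VMX/SVM-specific logic.
 */
static void
example_vcpu_run(vm_client_t *vmc, uint64_t *cached_eptgen)
{
	uint64_t gen;

	/* Interrupts (and/or the GIF) are expected to be disabled here */
	gen = vmc_table_enter(vmc);
	if (*cached_eptgen != gen) {
		/* ... invalidate the hardware TLB context for this vCPU ... */
		*cached_eptgen = gen;
	}

	/* ... enter guest context (VMLAUNCH/VMRUN) ... */

	/* Interrupts must be enabled again before exiting the table */
	vmc_table_exit(vmc);
}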
943 
944 static void
945 vmc_space_hold(vm_client_t *vmc)
946 {
947 	mutex_enter(&vmc->vmc_lock);
948 	VERIFY0(vmc->vmc_state & VCS_HOLD);
949 
950 	/*
951 	 * Because vmc_table_enter() alters vmc_state from a context where
952 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
953 	 * VCS_HOLD must be done atomically here.
954 	 */
955 	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
956 
957 	/* Wait for client to go inactive */
958 	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
959 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
960 	}
961 	mutex_exit(&vmc->vmc_lock);
962 }
963 
964 static void
965 vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
966 {
967 	mutex_enter(&vmc->vmc_lock);
968 	VERIFY(vmc->vmc_state & VCS_HOLD);
969 
970 	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
971 		poke_cpu(vmc->vmc_cpu_active);
972 
973 		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
974 			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
975 		}
976 	}
977 
978 	/*
979 	 * Because vmc_table_enter() alters vmc_state from a context where
980 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
981 	 * VCS_HOLD must be done atomically here.
982 	 */
983 	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
984 	cv_broadcast(&vmc->vmc_cv);
985 	mutex_exit(&vmc->vmc_lock);
986 }
987 
988 static void
989 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
990     uint64_t gen)
991 {
992 	mutex_enter(&vmc->vmc_lock);
993 	VERIFY(vmc->vmc_state & VCS_HOLD);
994 	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
995 		/*
996 		 * Wait for clients using an old generation of the page tables
997 		 * to exit guest context, where they subsequently flush the TLB
998 		 * for the new generation.
999 		 */
1000 		if (vmc->vmc_cpu_gen < gen) {
1001 			poke_cpu(vmc->vmc_cpu_active);
1002 
1003 			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1004 				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1005 			}
1006 		}
1007 	}
1008 	if (vmc->vmc_inval_func != NULL) {
1009 		vmc_inval_cb_t func = vmc->vmc_inval_func;
1010 		void *data = vmc->vmc_inval_data;
1011 
1012 		/*
1013 		 * Perform the actual invalidation call outside vmc_lock to
1014 		 * avoid lock ordering issues in the consumer.  Since the client
1015 		 * is under VCS_HOLD, this is safe.
1016 		 */
1017 		mutex_exit(&vmc->vmc_lock);
1018 		func(data, addr, size);
1019 		mutex_enter(&vmc->vmc_lock);
1020 	}
1021 	mutex_exit(&vmc->vmc_lock);
1022 }
1023 
1024 static void
1025 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
1026     vm_object_t *vmo)
1027 {
1028 	mutex_enter(&vmc->vmc_lock);
1029 	VERIFY(vmc->vmc_state & VCS_HOLD);
1030 
1031 	/*
1032 	 * With the current vCPU exclusion invariants in place, we do not expect
1033 	 * a vCPU to be in guest context during an unmap.
1034 	 */
1035 	VERIFY0(vmc->vmc_state & VCS_ON_CPU);
1036 
1037 	/*
1038 	 * Any holds against the unmapped region need to establish their own
1039 	 * reference to the underlying object to avoid a potential
1040 	 * use-after-free.
1041 	 */
1042 	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1043 	    vmp != NULL;
1044 	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1045 		if (vmp->vmp_gpa < addr ||
1046 		    vmp->vmp_gpa >= (addr + size)) {
1047 			/* Hold outside region in question */
1048 			continue;
1049 		}
1050 		if (vmp->vmp_obj_ref == NULL) {
1051 			vm_object_reference(vmo);
1052 			vmp->vmp_obj_ref = vmo;
1053 			/* For an unmapped region, PTE is now meaningless */
1054 			vmp->vmp_ptep = NULL;
1055 		} else {
1056 			/*
1057 			 * Object could have gone through cycle of
1058 			 * unmap-map-unmap before the hold was released.
1059 			 */
1060 			VERIFY3P(vmp->vmp_ptep, ==, NULL);
1061 		}
1062 	}
1063 	mutex_exit(&vmc->vmc_lock);
1064 }
1065 
1066 static vm_client_t *
1067 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
1068 {
1069 	vm_client_t *next;
1070 
1071 	ASSERT(MUTEX_HELD(&vms->vms_lock));
1072 
1073 	mutex_enter(&vmc->vmc_lock);
1074 	VERIFY3P(vmc->vmc_space, ==, vms);
1075 	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
1076 	if (vmc->vmc_state & VCS_DESTROY) {
1077 		/*
1078 		 * This vm_client is currently undergoing destruction, so it
1079 		 * does not need to be orphaned.  Let it proceed with its own
1080 		 * clean-up task.
1081 		 */
1082 		next = list_next(&vms->vms_clients, vmc);
1083 	} else {
1084 		/*
1085 		 * Clients are only orphaned when the containing vmspace is
1086 		 * being torn down.  All mappings from the vmspace should
1087 		 * already be gone, meaning any remaining held pages should have
1088 		 * direct references to the object.
1089 		 */
1090 		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1091 		    vmp != NULL;
1092 		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1093 			ASSERT3P(vmp->vmp_ptep, ==, NULL);
1094 			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
1095 		}
1096 
1097 		/*
1098 		 * After this point, the client will be orphaned, unable to
1099 		 * establish new page holds (or access any vmspace-related
1100 		 * resources) and is in charge of cleaning up after itself.
1101 		 */
1102 		vmc->vmc_state |= VCS_ORPHANED;
1103 		next = list_next(&vms->vms_clients, vmc);
1104 		list_remove(&vms->vms_clients, vmc);
1105 		vmc->vmc_space = NULL;
1106 	}
1107 	mutex_exit(&vmc->vmc_lock);
1108 	return (next);
1109 }
1110 
1111 /*
1112  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1113  */
1114 vm_page_t *
1115 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
1116 {
1117 	vmspace_t *vms = vmc->vmc_space;
1118 	vm_page_t *vmp;
1119 	pfn_t pfn = PFN_INVALID;
1120 	uint64_t *ptep = NULL;
1121 
1122 	ASSERT0(gpa & PAGEOFFSET);
1123 	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
1124 
1125 	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
1126 	if (vmc_activate(vmc) != 0) {
1127 		kmem_free(vmp, sizeof (*vmp));
1128 		return (NULL);
1129 	}
1130 
1131 	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
1132 		vmc_deactivate(vmc);
1133 		kmem_free(vmp, sizeof (*vmp));
1134 		return (NULL);
1135 	}
1136 	ASSERT(pfn != PFN_INVALID && ptep != NULL);
1137 
1138 	vmp->vmp_client = vmc;
1139 	vmp->vmp_chain = NULL;
1140 	vmp->vmp_gpa = gpa;
1141 	vmp->vmp_pfn = pfn;
1142 	vmp->vmp_ptep = ptep;
1143 	vmp->vmp_obj_ref = NULL;
1144 	vmp->vmp_prot = prot;
1145 	list_insert_tail(&vmc->vmc_held_pages, vmp);
1146 	vmc_deactivate(vmc);
1147 
1148 	return (vmp);
1149 }
1150 
1151 int
1152 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
1153 {
1154 	vmspace_t *vms = vmc->vmc_space;
1155 	int err;
1156 
1157 	err = vmc_activate(vmc);
1158 	if (err == 0) {
1159 		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
1160 		vmc_deactivate(vmc);
1161 	}
1162 
1163 	return (err);
1164 }
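
/*
 * Illustrative sketch (not part of the original source): servicing a nested
 * page fault VM-exit with vmc_fault().  Decoding of the exit information is
 * elided; `gpa` and `prot` are assumed to have been extracted by the
 * (hypothetical) caller.  A return of 0 means the leaf PTE is now valid and
 * the vCPU can simply be resumed; FC_NOMAP indicates no mapping covers the
 * address, FC_PROT indicates insufficient protection, and ENXIO indicates
 * the client has been orphaned.
 */
static int
example_handle_paging_exit(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	return (vmc_fault(vmc, gpa, prot));
}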
1165 
1166 /*
1167  * Allocate an additional vm_client_t, based on an existing one.  Only the
1168  * association with the vmspace is cloned, not existing holds or any
1169  * configured invalidation function.
1170  */
1171 vm_client_t *
1172 vmc_clone(vm_client_t *vmc)
1173 {
1174 	vmspace_t *vms = vmc->vmc_space;
1175 
1176 	return (vmspace_client_alloc(vms));
1177 }
1178 
1179 /*
1180  * Register a function (and associated data pointer) to be called when an
1181  * address range in the vmspace is invalidated.
1182  */
1183 int
1184 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
1185 {
1186 	int err;
1187 
1188 	err = vmc_activate(vmc);
1189 	if (err == 0) {
1190 		vmc->vmc_inval_func = func;
1191 		vmc->vmc_inval_data = data;
1192 		vmc_deactivate(vmc);
1193 	}
1194 
1195 	return (err);
1196 }
1197 
1198 /*
1199  * Destroy a vm_client_t instance.
1200  *
1201  * No pages held through this vm_client_t may be outstanding when performing a
1202  * vmc_destroy().  For vCPU clients, the client cannot be on-CPU (a call to
1203  * vmc_table_exit() has been made).
1204  */
1205 void
1206 vmc_destroy(vm_client_t *vmc)
1207 {
1208 	mutex_enter(&vmc->vmc_lock);
1209 
1210 	VERIFY(list_is_empty(&vmc->vmc_held_pages));
1211 	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1212 
1213 	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
1214 		vmspace_t *vms;
1215 
1216 		/*
1217 		 * Deassociation with the parent vmspace must be done carefully:
1218 		 * The vmspace could attempt to orphan this vm_client while we
1219 		 * release vmc_lock in order to take vms_lock (the required
1220 		 * order).  The client is marked to indicate that destruction is
1221 		 * under way.  Doing so prevents any racing orphan operation
1222 		 * from applying to this client, allowing us to deassociate from
1223 		 * the vmspace safely.
1224 		 */
1225 		vmc->vmc_state |= VCS_DESTROY;
1226 		vms = vmc->vmc_space;
1227 		mutex_exit(&vmc->vmc_lock);
1228 
1229 		mutex_enter(&vms->vms_lock);
1230 		mutex_enter(&vmc->vmc_lock);
1231 		list_remove(&vms->vms_clients, vmc);
1232 		/*
1233 		 * If the vmspace began its own destruction operation while we
1234 		 * were navigating the locks, be sure to notify it about this
1235 		 * vm_client being deassociated.
1236 		 */
1237 		cv_signal(&vms->vms_cv);
1238 		mutex_exit(&vmc->vmc_lock);
1239 		mutex_exit(&vms->vms_lock);
1240 	} else {
1241 		VERIFY3P(vmc->vmc_space, ==, NULL);
1242 		mutex_exit(&vmc->vmc_lock);
1243 	}
1244 
1245 	mutex_destroy(&vmc->vmc_lock);
1246 	cv_destroy(&vmc->vmc_cv);
1247 	list_destroy(&vmc->vmc_held_pages);
1248 
1249 	kmem_free(vmc, sizeof (*vmc));
1250 }
1251 
1252 static __inline void *
1253 vmp_ptr(const vm_page_t *vmp)
1254 {
1255 	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
1256 
1257 	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
1258 	return ((void *)((uintptr_t)kpm_vbase + paddr));
1259 }
1260 
1261 /*
1262  * Get a readable kernel-virtual pointer for a held page.
1263  *
1264  * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
1265  * call to acquire this page reference.
1266  */
1267 const void *
1268 vmp_get_readable(const vm_page_t *vmp)
1269 {
1270 	ASSERT(vmp->vmp_prot & PROT_READ);
1271 
1272 	return (vmp_ptr(vmp));
1273 }
1274 
1275 /*
1276  * Get a writable kernel-virtual pointer for a held page.
1277  *
1278  * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
1279  * call to acquire this page reference.
1280  */
1281 void *
1282 vmp_get_writable(const vm_page_t *vmp)
1283 {
1284 	ASSERT(vmp->vmp_prot & PROT_WRITE);
1285 
1286 	return (vmp_ptr(vmp));
1287 }
1288 
1289 /*
1290  * Get the host-physical PFN for a held page.
1291  */
1292 pfn_t
1293 vmp_get_pfn(const vm_page_t *vmp)
1294 {
1295 	return (vmp->vmp_pfn);
1296 }
1297 
1298 /*
1299  * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
1300  */
1301 void
1302 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
1303 {
1304 	ASSERT3P(vmp->vmp_chain, ==, NULL);
1305 
1306 	vmp->vmp_chain = to_chain;
1307 }
1308 
1309 /*
1310  * Retrieve the pointer from the page-chaining in `vmp`.
1311  */
1312 vm_page_t *
1313 vmp_next(const vm_page_t *vmp)
1314 {
1315 	return (vmp->vmp_chain);
1316 }
1317 
1318 static __inline bool
1319 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
1320 {
1321 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1322 
1323 	bool was_unmapped = false;
1324 
1325 	list_remove(&vmc->vmc_held_pages, vmp);
1326 	if (vmp->vmp_obj_ref != NULL) {
1327 		ASSERT3P(vmp->vmp_ptep, ==, NULL);
1328 
1329 		vm_object_release(vmp->vmp_obj_ref);
1330 		was_unmapped = true;
1331 	} else {
1332 		ASSERT3P(vmp->vmp_ptep, !=, NULL);
1333 
1334 		if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) {
1335 			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
1336 			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
1337 		}
1338 	}
1339 	kmem_free(vmp, sizeof (*vmp));
1340 	return (was_unmapped);
1341 }
1342 
1343 /*
1344  * Release held page.  Returns true if page resided on region which was
1345  * subsequently unmapped.
1346  */
1347 bool
1348 vmp_release(vm_page_t *vmp)
1349 {
1350 	vm_client_t *vmc = vmp->vmp_client;
1351 
1352 	VERIFY(vmc != NULL);
1353 
1354 	mutex_enter(&vmc->vmc_lock);
1355 	const bool was_unmapped = vmp_release_inner(vmp, vmc);
1356 	mutex_exit(&vmc->vmc_lock);
1357 	return (was_unmapped);
1358 }
1359 
1360 /*
1361  * Release a chain of pages which were associated via vmp_chain() (setting
1362  * page-chaining pointer).  Returns true if any pages resided upon a region
1363  * which was subsequently unmapped.
1364  *
1365  * All of those pages must have been held through the same vm_client_t.
1366  */
1367 bool
1368 vmp_release_chain(vm_page_t *vmp)
1369 {
1370 	vm_client_t *vmc = vmp->vmp_client;
1371 	bool any_unmapped = false;
1372 
1373 	ASSERT(vmp != NULL);
1374 
1375 	mutex_enter(&vmc->vmc_lock);
1376 	while (vmp != NULL) {
1377 		vm_page_t *next = vmp->vmp_chain;
1378 
1379 		/* We expect all pages in chain to be from same client */
1380 		ASSERT3P(vmp->vmp_client, ==, vmc);
1381 
1382 		if (vmp_release_inner(vmp, vmc)) {
1383 			any_unmapped = true;
1384 		}
1385 		vmp = next;
1386 	}
1387 	mutex_exit(&vmc->vmc_lock);
1388 	return (any_unmapped);
1389 }
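
/*
 * Illustrative sketch (not part of the original source): holding a guest
 * buffer which spans at most two pages by chaining the holds together so
 * that both are released by a single vmp_release_chain() call.  The function
 * name and parameters are hypothetical.
 */
static int
example_copy_in_span(vm_client_t *vmc, uintptr_t gpa, uint8_t *dst, size_t len)
{
	const uintptr_t first = gpa & PAGEMASK;
	const uintptr_t last = (gpa + len - 1) & PAGEMASK;
	vm_page_t *vmp_first, *vmp_last = NULL;
	size_t chunk;

	ASSERT(len != 0);
	ASSERT3U(last, <=, first + PAGESIZE);

	if ((vmp_first = vmc_hold(vmc, first, PROT_READ)) == NULL) {
		return (EFAULT);
	}
	if (last != first) {
		if ((vmp_last = vmc_hold(vmc, last, PROT_READ)) == NULL) {
			(void) vmp_release(vmp_first);
			return (EFAULT);
		}
		/* Holds from the same vm_client_t may be chained together */
		vmp_chain(vmp_first, vmp_last);
	}

	chunk = MIN(len, PAGESIZE - (gpa & PAGEOFFSET));
	bcopy((const uint8_t *)vmp_get_readable(vmp_first) +
	    (gpa & PAGEOFFSET), dst, chunk);
	if (vmp_last != NULL) {
		bcopy(vmp_get_readable(vmp_last), dst + chunk, len - chunk);
	}

	(void) vmp_release_chain(vmp_first);
	return (0);
}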
1390 
1391 
1392 int
1393 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
1394     struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
1395 {
1396 	vm_object_t *vmo;
1397 	int err;
1398 
1399 	if (segoff < 0 || len <= 0 ||
1400 	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
1401 		return (EINVAL);
1402 	}
1403 	if ((prot & PROT_USER) == 0) {
1404 		return (ENOTSUP);
1405 	}
1406 	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
1407 	if (err != 0) {
1408 		return (err);
1409 	}
1410 
1411 	VERIFY(segoff >= 0);
1412 	VERIFY(len <= vmo->vmo_size);
1413 	VERIFY((len + segoff) <= vmo->vmo_size);
1414 
1415 	if (vmo->vmo_type != VMOT_MEM) {
1416 		/* Only support memory objects for now */
1417 		return (ENOTSUP);
1418 	}
1419 
1420 	as_rangelock(as);
1421 
1422 	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
1423 	if (err == 0) {
1424 		segvmm_crargs_t svma;
1425 
1426 		svma.prot = prot;
1427 		svma.offset = segoff;
1428 		svma.vmo = vmo;
1429 		svma.vmc = NULL;
1430 
1431 		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
1432 	}
1433 
1434 	as_rangeunlock(as);
1435 	return (err);
1436 }
1437 
1438 int
1439 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
1440     off_t len, uint_t prot, uint_t maxprot, uint_t flags)
1441 {
1442 
1443 	const uintptr_t gpa = (uintptr_t)off;
1444 	const size_t size = (uintptr_t)len;
1445 	int err;
1446 
1447 	if (off < 0 || len <= 0 ||
1448 	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
1449 		return (EINVAL);
1450 	}
1451 	if ((prot & PROT_USER) == 0) {
1452 		return (ENOTSUP);
1453 	}
1454 
1455 	as_rangelock(as);
1456 
1457 	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
1458 	if (err == 0) {
1459 		segvmm_crargs_t svma;
1460 
1461 		svma.prot = prot;
1462 		svma.offset = gpa;
1463 		svma.vmo = NULL;
1464 		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
1465 
1466 		err = as_map(as, *addrp, len, segvmm_create, &svma);
1467 	}
1468 
1469 	as_rangeunlock(as);
1470 	return (err);
1471 }
1472