xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_vm.c (revision 2a22bccab8733d6de38ab1e1fbe8c810122a4427)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2022 Oxide Computer Company
16  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/kmem.h>
21 #include <sys/thread.h>
22 #include <sys/list.h>
23 #include <sys/mman.h>
24 #include <sys/types.h>
25 #include <sys/ddi.h>
26 #include <sys/sysmacros.h>
27 #include <sys/machsystm.h>
28 #include <sys/vmsystm.h>
29 #include <sys/malloc.h>
30 #include <sys/x86_archext.h>
31 #include <vm/as.h>
32 #include <vm/hat_i86.h>
33 #include <vm/seg_vn.h>
34 #include <vm/seg_kmem.h>
35 
36 #include <sys/vmm_vm.h>
37 #include <sys/seg_vmm.h>
38 #include <sys/vmm_kernel.h>
39 #include <sys/vmm_reservoir.h>
40 #include <sys/vmm_gpt.h>
41 
42 
43 /*
44  * VMM Virtual Memory
45  *
46  * History
47  *
48  * When bhyve was ported to illumos, one significant hole was handling guest
49  * memory and memory accesses.  In the original Pluribus port, bhyve itself
50  * manually handled the EPT structures for guest memory.  The updated sources
51  * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
52  * system for memory allocations and management of the EPT structures.  Keeping
53  * source differences to a minimum was a priority, so illumos-bhyve implemented
54  * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
55  * boot and run guests.
56  *
57  * While the VM shim was successful in getting illumos-bhyve to a functional
58  * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
59  * compatibility interfaces made it awkward to use.  As source differences with
60  * the upstream kernel code became less of a concern, and upcoming features
61  * (such as live migration) would demand more of those VM interfaces, it became
62  * clear that an overhaul was prudent.
63  *
64  * Design
65  *
66  * The new VM system for bhyve retains a number of the same concepts as what it
67  * replaces:
68  *
69  * - `vmspace_t` is the top-level entity for a guest memory space
70  * - `vm_object_t` represents a memory object which can be mapped into a vmspace
71  * - `vm_page_t` represents a page hold within a given vmspace, providing access
72  *   to the underlying memory page
73  *
74  * Unlike the old code, where most of the involved structures were exposed via
75  * public definitions, this replacement VM interface keeps all involved
76  * structures opaque to consumers.  Furthermore, there is a clear delineation
77  * between infrequent administrative operations (such as mapping/unmapping
78  * regions) and common data-path operations (attempting a page hold at a given
79  * guest-physical address).  Those administrative operations are performed
80  * directly against the vmspace, whereas the data-path operations are performed
81  * through a `vm_client_t` handle.  That VM client abstraction is meant to
82  * reduce contention and overhead for frequent access operations and provide
83  * debugging insight into how different subcomponents are accessing the vmspace.
84  * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
85  * interface) and each VMM userspace segment mapping.
86  *
87  * Exclusion
88  *
89  * Making changes to the vmspace (such as mapping or unmapping regions) requires
90  * other accessors be excluded while the change is underway to prevent them from
91  * observing invalid intermediate states.  A simple approach could use a mutex
92  * or rwlock to achieve this, but that risks contention when the rate of access
93  * to the vmspace is high.
94  *
95  * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
96  * on a per-vm_client_t basis.  While this raises the cost for vmspace changes,
97  * it means that the much more common page accesses through the vm_client can
98  * normally proceed unimpeded and independently.
99  *
100  * When a change to the vmspace is required, the caller will put the vmspace in
101  * a 'hold' state, iterating over all associated vm_client instances, waiting
102  * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
103  * setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any call on
104  * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
105  * will block until the hold condition is cleared.  Once the hold is asserted
106  * for all clients, the vmspace change can proceed with confidence.  Upon
107  * completion of that operation, VCS_HOLD is cleared from the clients, and they
108  * are released to resume vmspace accesses.
109  *
110  * vCPU Consumers
111  *
112  * Access to the vmspace for vCPUs running in guest context is different from
113  * emulation-related vm_client activity: they solely rely on the contents of the
114  * page tables.  Furthermore, the existing VCS_HOLD mechanism used to exclude
115  * client access is not feasible when entering guest context, since interrupts
116  * are disabled, making it impossible to block entry.  This is not a concern as
117  * long as vmspace modifications never place the page tables in invalid states
118  * (either intermediate, or final).  The vm_client hold mechanism does provide
119  * the means to IPI vCPU consumers which will trigger a notification once they
120  * report their exit from guest context.  This can be used to ensure that page
121  * table modifications are made visible to those vCPUs within a certain
122  * time frame.
123  */
124 
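/*
 * Illustrative sketch (not part of this file's logic): a typical data-path
 * consumer holds a page through its vm_client_t, accesses the backing memory
 * via the kernel-virtual mapping, and then drops the hold.  The `vmc`, `gpa`,
 * and `emulated_result` names below are hypothetical context.
 *
 *	vm_page_t *vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ | PROT_WRITE);
 *	if (vmp == NULL)
 *		return (EFAULT);
 *	uint8_t *datap = (uint8_t *)vmp_get_writable(vmp) + (gpa & PAGEOFFSET);
 *	*datap = emulated_result;
 *	(void) vmp_release(vmp);
 */
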
125 typedef struct vmspace_mapping {
126 	list_node_t	vmsm_node;
127 	vm_object_t	*vmsm_object;	/* object backing this mapping */
128 	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
129 	size_t		vmsm_len;	/* length (in bytes) of mapping */
130 	off_t		vmsm_offset;	/* byte offset into object */
131 	uint_t		vmsm_prot;
132 } vmspace_mapping_t;
133 
134 #define	VMSM_OFFSET(vmsm, addr)	(			\
135 	    (vmsm)->vmsm_offset +			\
136 	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
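
/*
 * Example (illustrative values only): for a mapping with vmsm_addr ==
 * 0x100000000 and vmsm_offset == 0, VMSM_OFFSET(vmsm, 0x100002000) yields
 * 0x2000, the byte offset of that GPA within the backing object.
 */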
137 
138 typedef enum vm_client_state {
139 	VCS_IDLE	= 0,
140 	/* currently accessing vmspace for client operation (hold or fault) */
141 	VCS_ACTIVE	= (1 << 0),
142 	/* client hold requested/asserted */
143 	VCS_HOLD	= (1 << 1),
144 	/* vCPU is accessing page tables in guest context */
145 	VCS_ON_CPU	= (1 << 2),
146 	/* client has been orphaned (no more access to vmspace) */
147 	VCS_ORPHANED	= (1 << 3),
148 	/* client undergoing destroy operation */
149 	VCS_DESTROY	= (1 << 4),
150 } vm_client_state_t;
151 
152 struct vmspace {
153 	kmutex_t	vms_lock;
154 	kcondvar_t	vms_cv;
155 	bool		vms_held;
156 	uintptr_t	vms_size;	/* immutable after creation */
157 
158 	/* (nested) page table state */
159 	vmm_gpt_t	*vms_gpt;
160 	uint64_t	vms_pt_gen;
161 	uint64_t	vms_pages_mapped;
162 	bool		vms_track_dirty;
163 
164 	list_t		vms_maplist;
165 	list_t		vms_clients;
166 };
167 
168 struct vm_client {
169 	vmspace_t	*vmc_space;
170 	list_node_t	vmc_node;
171 
172 	kmutex_t	vmc_lock;
173 	kcondvar_t	vmc_cv;
174 	vm_client_state_t vmc_state;
175 	int		vmc_cpu_active;
176 	uint64_t	vmc_cpu_gen;
177 	bool		vmc_track_dirty;
178 	vmc_inval_cb_t	vmc_inval_func;
179 	void		*vmc_inval_data;
180 
181 	list_t		vmc_held_pages;
182 };
183 
184 typedef enum vm_object_type {
185 	VMOT_NONE,
186 	VMOT_MEM,
187 	VMOT_MMIO,
188 } vm_object_type_t;
189 
190 struct vm_object {
191 	uint_t		vmo_refcnt;	/* manipulated with atomic ops */
192 
193 	/* Fields below are fixed at creation time */
194 	vm_object_type_t vmo_type;
195 	size_t		vmo_size;
196 	void		*vmo_data;
197 	uint8_t		vmo_attr;
198 };
199 
200 struct vm_page {
201 	vm_client_t	*vmp_client;
202 	list_node_t	vmp_node;
203 	vm_page_t	*vmp_chain;
204 	uintptr_t	vmp_gpa;
205 	pfn_t		vmp_pfn;
206 	uint64_t	*vmp_ptep;
207 	vm_object_t	*vmp_obj_ref;
208 	int		vmp_prot;
209 };
210 
211 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
212 static void vmspace_hold_enter(vmspace_t *);
213 static void vmspace_hold_exit(vmspace_t *, bool);
214 static void vmc_space_hold(vm_client_t *);
215 static void vmc_space_release(vm_client_t *, bool);
216 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
217 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
218 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
219 
220 
221 /*
222  * Create a new vmspace with a maximum address of `end`.
223  */
224 vmspace_t *
225 vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
226 {
227 	vmspace_t *vms;
228 	const uintptr_t size = end + 1;
229 
230 	/*
231 	 * This whole mess is built on the assumption that a 64-bit address
232 	 * space is available to work with for the various pagetable tricks.
233 	 */
234 	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
235 	    size <= (uintptr_t)USERLIMIT);
236 
237 	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
238 	vms->vms_size = size;
239 	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
240 	    offsetof(vmspace_mapping_t, vmsm_node));
241 	list_create(&vms->vms_clients, sizeof (vm_client_t),
242 	    offsetof(vm_client_t, vmc_node));
243 
244 	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
245 	vms->vms_pt_gen = 1;
246 	vms->vms_track_dirty = track_dirty;
247 
248 	return (vms);
249 }
250 
251 /*
252  * Destroy a vmspace.  All regions in the space must be unmapped.  Any remaining
253  * clients will be orphaned.
254  */
255 void
256 vmspace_destroy(vmspace_t *vms)
257 {
258 	mutex_enter(&vms->vms_lock);
259 	VERIFY(list_is_empty(&vms->vms_maplist));
260 
261 	if (!list_is_empty(&vms->vms_clients)) {
262 		vm_client_t *vmc = list_head(&vms->vms_clients);
263 		while (vmc != NULL) {
264 			vmc = vmc_space_orphan(vmc, vms);
265 		}
266 		/*
267 		 * Wait for any clients which were in the process of destroying
268 		 * themselves to disappear.
269 		 */
270 		while (!list_is_empty(&vms->vms_clients)) {
271 			cv_wait(&vms->vms_cv, &vms->vms_lock);
272 		}
273 	}
274 	VERIFY(list_is_empty(&vms->vms_clients));
275 
276 	vmm_gpt_free(vms->vms_gpt);
277 	mutex_exit(&vms->vms_lock);
278 
279 	mutex_destroy(&vms->vms_lock);
280 	cv_destroy(&vms->vms_cv);
281 	list_destroy(&vms->vms_maplist);
282 	list_destroy(&vms->vms_clients);
283 
284 	kmem_free(vms, sizeof (*vms));
285 }
286 
287 /*
288  * Retrieve the count of resident (mapped into the page tables) pages.
289  */
290 uint64_t
291 vmspace_resident_count(vmspace_t *vms)
292 {
293 	return (vms->vms_pages_mapped);
294 }
295 
296 void
297 vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap)
298 {
299 	/*
300 	 * Accumulate dirty bits into the given bit vector.  Note that this
301 	 * races both against hardware writes from running vCPUs and
302 	 * reflections from userspace.
303 	 *
304 	 * Called from a userspace-visible ioctl, this depends on the VM
305 	 * instance being read-locked to prevent vmspace_map/vmspace_unmap
306 	 * operations from changing the page tables during the walk.
307 	 */
308 	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
309 		bool bit = false;
310 		uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset);
311 		if (entry != NULL)
312 			bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false);
313 		uint64_t pfn_offset = offset >> PAGESHIFT;
314 		size_t bit_offset = pfn_offset / 8;
315 		size_t bit_index = pfn_offset % 8;
316 		bitmap[bit_offset] |= (bit << bit_index);
317 	}
318 
319 	/*
320 	 * Now invalidate those bits and shoot down address spaces that
321 	 * may have them cached.
322 	 */
323 	vmspace_hold_enter(vms);
324 	vms->vms_pt_gen++;
325 	for (vm_client_t *vmc = list_head(&vms->vms_clients);
326 	    vmc != NULL;
327 	    vmc = list_next(&vms->vms_clients, vmc)) {
328 		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
329 	}
330 	vmspace_hold_exit(vms, true);
331 }
332 
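/*
 * For reference, a sketch (not used by the code above) of how a caller could
 * read the bitmap filled in by vmspace_track_dirty(): one bit per page,
 * least-significant bit first within each byte, so page `i` of the tracked
 * range is dirty when:
 *
 *	(bitmap[i / 8] & (1 << (i % 8))) != 0
 */
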
333 static pfn_t
334 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
335 {
336 	vmmr_region_t *region;
337 	pfn_t pfn;
338 
339 	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
340 
341 	region = vmo->vmo_data;
342 	pfn = vmmr_region_pfn_at(region, off);
343 
344 	return (pfn);
345 }
346 
347 static pfn_t
348 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
349 {
350 	pfn_t pfn;
351 
352 	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
353 	ASSERT3P(vmo->vmo_data, !=, NULL);
354 	ASSERT3U(off, <, vmo->vmo_size);
355 
356 	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
357 
358 	return (pfn);
359 }
360 
361 /*
362  * Allocate a VM object backed by VMM reservoir memory.
363  */
364 vm_object_t *
365 vm_object_mem_allocate(size_t size, bool transient)
366 {
367 	int err;
368 	vmmr_region_t *region = NULL;
369 	vm_object_t *vmo;
370 
371 	ASSERT3U(size, !=, 0);
372 	ASSERT3U(size & PAGEOFFSET, ==, 0);
373 
374 	err = vmmr_alloc(size, transient, &region);
375 	if (err != 0) {
376 		return (NULL);
377 	}
378 
379 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
380 
381 	/* For now, these are to stay fixed after allocation */
382 	vmo->vmo_type = VMOT_MEM;
383 	vmo->vmo_size = size;
384 	vmo->vmo_attr = MTRR_TYPE_WB;
385 	vmo->vmo_data = region;
386 	vmo->vmo_refcnt = 1;
387 
388 	return (vmo);
389 }
390 
391 static vm_object_t *
392 vm_object_mmio_allocate(size_t size, uintptr_t hpa)
393 {
394 	vm_object_t *vmo;
395 
396 	ASSERT3U(size, !=, 0);
397 	ASSERT3U(size & PAGEOFFSET, ==, 0);
398 	ASSERT3U(hpa & PAGEOFFSET, ==, 0);
399 
400 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
401 
402 	/* For now, these are to stay fixed after allocation */
403 	vmo->vmo_type = VMOT_MMIO;
404 	vmo->vmo_size = size;
405 	vmo->vmo_attr = MTRR_TYPE_UC;
406 	vmo->vmo_data = (void *)hpa;
407 	vmo->vmo_refcnt = 1;
408 
409 	return (vmo);
410 }
411 
412 /*
413  * Allocate a VM object backed by an existing range of physical memory.
414  */
415 vm_object_t *
416 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
417 {
418 	int error;
419 	vm_object_t *obj;
420 
421 	obj = vm_object_mmio_allocate(len, hpa);
422 	if (obj != NULL) {
423 		error = vmspace_map(vmspace, obj, 0, gpa, len,
424 		    PROT_READ | PROT_WRITE);
425 		if (error != 0) {
426 			vm_object_release(obj);
427 			obj = NULL;
428 		}
429 	}
430 
431 	return (obj);
432 }
433 
434 /*
435  * Release a vm_object reference
436  */
437 void
438 vm_object_release(vm_object_t *vmo)
439 {
440 	ASSERT(vmo != NULL);
441 
442 	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
443 	/* underflow would be a deadly serious mistake */
444 	VERIFY3U(ref, !=, UINT_MAX);
445 	if (ref != 0) {
446 		return;
447 	}
448 
449 	switch (vmo->vmo_type) {
450 	case VMOT_MEM:
451 		vmmr_free((vmmr_region_t *)vmo->vmo_data);
452 		break;
453 	case VMOT_MMIO:
454 		break;
455 	default:
456 		panic("unexpected object type %u", vmo->vmo_type);
457 		break;
458 	}
459 
460 	vmo->vmo_data = NULL;
461 	vmo->vmo_size = 0;
462 	kmem_free(vmo, sizeof (*vmo));
463 }
464 
465 /*
466  * Increase refcount for vm_object reference
467  */
468 void
469 vm_object_reference(vm_object_t *vmo)
470 {
471 	ASSERT(vmo != NULL);
472 
473 	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
474 	/* overflow would be a deadly serious mistake */
475 	VERIFY3U(ref, !=, 0);
476 }
477 
478 /*
479  * Get the host-physical PFN for a given offset into a vm_object.
480  *
481  * The provided `off` must be within the allocated size of the vm_object.
482  */
483 pfn_t
484 vm_object_pfn(vm_object_t *vmo, uintptr_t off)
485 {
486 	const uintptr_t aligned_off = off & PAGEMASK;
487 
488 	switch (vmo->vmo_type) {
489 	case VMOT_MEM:
490 		return (vm_object_pager_reservoir(vmo, aligned_off));
491 	case VMOT_MMIO:
492 		return (vm_object_pager_mmio(vmo, aligned_off));
493 	case VMOT_NONE:
494 		break;
495 	}
496 	panic("unexpected object type %u", vmo->vmo_type);
497 }
498 
499 static vmspace_mapping_t *
500 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
501 {
502 	vmspace_mapping_t *vmsm;
503 	list_t *ml = &vms->vms_maplist;
504 	const uintptr_t range_end = addr + size;
505 
506 	ASSERT3U(addr, <=, range_end);
507 
508 	if (addr >= vms->vms_size) {
509 		return (NULL);
510 	}
511 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
512 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
513 
514 		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
515 			if (range_end <= seg_end) {
516 				return (vmsm);
517 			} else {
518 				return (NULL);
519 			}
520 		}
521 	}
522 	return (NULL);
523 }
524 
525 /*
526  * Check to see if any mappings reside within [addr, addr + size) span in the
527  * vmspace, returning true if that span is indeed empty.
528  */
529 static bool
530 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
531 {
532 	vmspace_mapping_t *vmsm;
533 	list_t *ml = &vms->vms_maplist;
534 	const uintptr_t range_end = addr + size - 1;
535 
536 	ASSERT(MUTEX_HELD(&vms->vms_lock));
537 	ASSERT(size > 0);
538 
539 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
540 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
541 
542 		/*
543 		 * The two ranges do not overlap if the start of either of
544 		 * them is after the end of the other.
545 		 */
546 		if (vmsm->vmsm_addr > range_end || addr > seg_end)
547 			continue;
548 		return (false);
549 	}
550 	return (true);
551 }
552 
553 static void
554 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
555 {
556 	list_t *ml = &vms->vms_maplist;
557 
558 	ASSERT(MUTEX_HELD(&vms->vms_lock));
559 	ASSERT(vms->vms_held);
560 
561 	list_remove(ml, vmsm);
562 	vm_object_release(vmsm->vmsm_object);
563 	kmem_free(vmsm, sizeof (*vmsm));
564 }
565 
566 /*
567  * Enter a hold state on the vmspace.  This ensures that all VM clients
568  * associated with the vmspace are excluded from establishing new page holds,
569  * or any other actions which would require accessing vmspace state subject to
570  * potential change.
571  *
572  * Returns with vmspace_t`vms_lock held.
573  */
574 static void
575 vmspace_hold_enter(vmspace_t *vms)
576 {
577 	mutex_enter(&vms->vms_lock);
578 	VERIFY(!vms->vms_held);
579 
580 	vm_client_t *vmc = list_head(&vms->vms_clients);
581 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
582 		vmc_space_hold(vmc);
583 	}
584 	vms->vms_held = true;
585 }
586 
587 /*
588  * Exit a hold state on the vmspace.  This releases all VM clients associated
589  * with the vmspace to be able to establish new page holds, and partake in other
590  * actions which require accessing changed vmspace state.  If `kick_on_cpu` is
591  * true, then any CPUs actively using the page tables will be IPIed, and the
592  * call will block until they have acknowledged being ready to use the latest
593  * state of the tables.
594  *
595  * Requires vmspace_t`vms_lock be held, which is released as part of the call.
596  */
597 static void
598 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
599 {
600 	ASSERT(MUTEX_HELD(&vms->vms_lock));
601 	VERIFY(vms->vms_held);
602 
603 	vm_client_t *vmc = list_head(&vms->vms_clients);
604 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
605 		vmc_space_release(vmc, kick_on_cpu);
606 	}
607 	vms->vms_held = false;
608 	mutex_exit(&vms->vms_lock);
609 }
610 
611 /*
612  * Attempt to map a vm_object span into the vmspace.
613  *
614  * Requirements:
615  * - `obj_off`, `addr`, and `len` must be page-aligned
616  * - `obj_off` cannot be greater than the allocated size of the object
617  * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
618  *   size of the object
619  * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
620  *   of the vmspace
621  */
622 int
623 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
624     size_t len, uint8_t prot)
625 {
626 	vmspace_mapping_t *vmsm;
627 	int res = 0;
628 
629 	if (len == 0 || (addr + len) < addr ||
630 	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
631 		return (EINVAL);
632 	}
633 	if ((addr + len) >= vms->vms_size) {
634 		return (ENOMEM);
635 	}
636 
637 	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
638 
639 	vmspace_hold_enter(vms);
640 	if (!vm_mapping_gap(vms, addr, len)) {
641 		kmem_free(vmsm, sizeof (*vmsm));
642 		res = ENOMEM;
643 	} else {
644 		vmsm->vmsm_object = vmo;
645 		vmsm->vmsm_addr = addr;
646 		vmsm->vmsm_len = len;
647 		vmsm->vmsm_offset = (off_t)obj_off;
648 		vmsm->vmsm_prot = prot;
649 		list_insert_tail(&vms->vms_maplist, vmsm);
650 
651 		/*
652 		 * Make sure the GPT has tables ready for leaf entries across
653 		 * the entire new mapping.
654 		 */
655 		vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len);
656 	}
657 	vmspace_hold_exit(vms, false);
658 	return (res);
659 }
660 
661 /*
662  * Unmap a region of the vmspace.
663  *
664  * Presently the [start, end) span must equal a region previously mapped by a
665  * call to vmspace_map().
666  */
667 int
668 vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
669 {
670 	const size_t size = (size_t)(end - start);
671 	vmspace_mapping_t *vmsm;
672 	vm_client_t *vmc;
673 	uint64_t gen = 0;
674 
675 	ASSERT(start < end);
676 
677 	vmspace_hold_enter(vms);
678 	/* expect to match existing mapping exactly */
679 	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
680 	    vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
681 		vmspace_hold_exit(vms, false);
682 		return (ENOENT);
683 	}
684 
685 	/* Prepare clients (and their held pages) for the unmap. */
686 	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
687 	    vmc = list_next(&vms->vms_clients, vmc)) {
688 		vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
689 	}
690 
691 	/* Clear all PTEs for region */
692 	if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
693 		vms->vms_pt_gen++;
694 		gen = vms->vms_pt_gen;
695 	}
696 	/* ... and the intermediate (directory) PTEs as well */
697 	vmm_gpt_vacate_region(vms->vms_gpt, start, end);
698 
699 	/*
700 	 * If pages were actually unmapped from the GPT, provide clients with
701 	 * an invalidation notice.
702 	 */
703 	if (gen != 0) {
704 		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
705 		    vmc = list_next(&vms->vms_clients, vmc)) {
706 			vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen);
707 		}
708 	}
709 
710 	vm_mapping_remove(vms, vmsm);
711 	vmspace_hold_exit(vms, true);
712 	return (0);
713 }
714 
715 static int
716 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
717     uint64_t **ptepp)
718 {
719 	vmm_gpt_t *gpt = vms->vms_gpt;
720 	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
721 	pfn_t pfn = PFN_INVALID;
722 	uint_t prot;
723 
724 	ASSERT0(gpa & PAGEOFFSET);
725 	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
726 
727 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
728 	leaf = entries[LEVEL1];
729 	if (leaf == NULL) {
730 		/*
731 		 * Since we populated the intermediate tables for any regions
732 		 * mapped in the GPT, an empty leaf entry indicates there is no
733 		 * mapping, populated or not, at this GPA.
734 		 */
735 		return (FC_NOMAP);
736 	}
737 
738 	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
739 		if ((req_prot & prot) != req_prot) {
740 			return (FC_PROT);
741 		}
742 	} else {
743 		vmspace_mapping_t *vmsm;
744 		vm_object_t *vmo;
745 
746 		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
747 		if (vmsm == NULL) {
748 			return (FC_NOMAP);
749 		}
750 
751 		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
752 			return (FC_PROT);
753 		}
754 		vmo = vmsm->vmsm_object;
755 		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
756 		VERIFY(pfn != PFN_INVALID);
757 
758 		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
759 		    vmo->vmo_attr)) {
760 			atomic_inc_64(&vms->vms_pages_mapped);
761 		}
762 	}
763 
764 	ASSERT(pfn != PFN_INVALID && leaf != NULL);
765 	if (pfnp != NULL) {
766 		*pfnp = pfn;
767 	}
768 	if (ptepp != NULL) {
769 		*ptepp = leaf;
770 	}
771 	return (0);
772 }
773 
774 /*
775  * Populate (make resident in the page tables) a region of the vmspace.
776  *
777  * Presently the [start, end) span must equal a region previously mapped by a
778  * call to vmspace_map().
779  */
780 int
781 vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end)
782 {
783 	const size_t size = end - start;
784 	vmspace_mapping_t *vmsm;
785 
786 	mutex_enter(&vms->vms_lock);
787 
788 	/* For the time being, only exact-match mappings are expected */
789 	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) {
790 		mutex_exit(&vms->vms_lock);
791 		return (FC_NOMAP);
792 	}
793 
794 	vm_object_t *vmo = vmsm->vmsm_object;
795 	const int prot = vmsm->vmsm_prot;
796 	const uint8_t attr = vmo->vmo_attr;
797 	size_t populated = 0;
798 	for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) {
799 		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
800 		VERIFY(pfn != PFN_INVALID);
801 
802 		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
803 			populated++;
804 		}
805 	}
806 	atomic_add_64(&vms->vms_pages_mapped, populated);
807 
808 	mutex_exit(&vms->vms_lock);
809 	return (0);
810 }
811 
812 /*
813  * Allocate a client from a given vmspace.
814  */
815 vm_client_t *
816 vmspace_client_alloc(vmspace_t *vms)
817 {
818 	vm_client_t *vmc;
819 
820 	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
821 	vmc->vmc_space = vms;
822 	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
823 	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
824 	vmc->vmc_state = VCS_IDLE;
825 	vmc->vmc_cpu_active = -1;
826 	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
827 	    offsetof(vm_page_t, vmp_node));
828 	vmc->vmc_track_dirty = vms->vms_track_dirty;
829 
830 	mutex_enter(&vms->vms_lock);
831 	list_insert_tail(&vms->vms_clients, vmc);
832 	mutex_exit(&vms->vms_lock);
833 
834 	return (vmc);
835 }
836 
837 /*
838  * Get the nested page table root pointer (EPTP/NCR3) value.
839  */
840 uint64_t
841 vmspace_table_root(vmspace_t *vms)
842 {
843 	return (vmm_gpt_get_pmtp(vms->vms_gpt));
844 }
845 
846 /*
847  * Get the current generation number of the nested page table.
848  */
849 uint64_t
850 vmspace_table_gen(vmspace_t *vms)
851 {
852 	return (vms->vms_pt_gen);
853 }
854 
855 /*
856  * Mark a vm_client as active.  This will block if/while the client is held by
857  * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It will
858  * fail if the vm_client has been orphaned.
859  */
860 static int
861 vmc_activate(vm_client_t *vmc)
862 {
863 	mutex_enter(&vmc->vmc_lock);
864 	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
865 	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
866 		mutex_exit(&vmc->vmc_lock);
867 		return (ENXIO);
868 	}
869 	while ((vmc->vmc_state & VCS_HOLD) != 0) {
870 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
871 	}
872 	vmc->vmc_state |= VCS_ACTIVE;
873 	return (0);
874 }
875 
876 /*
877  * Mark a vm_client as no longer active.  It must be called with
878  * vm_client_t`vmc_lock already held, and will return with it released.
879  */
880 static void
881 vmc_deactivate(vm_client_t *vmc)
882 {
883 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
884 	VERIFY(vmc->vmc_state & VCS_ACTIVE);
885 
886 	vmc->vmc_state ^= VCS_ACTIVE;
887 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
888 		cv_broadcast(&vmc->vmc_cv);
889 	}
890 	mutex_exit(&vmc->vmc_lock);
891 }
892 
893 /*
894  * Indicate that a CPU will be utilizing the nested page tables through this VM
895  * client.  Interrupts (and/or the GIF) are expected to be disabled when calling
896  * this function.  Returns the generation number of the nested page table (to be
897  * used for TLB invalidations).
898  */
899 uint64_t
900 vmc_table_enter(vm_client_t *vmc)
901 {
902 	vmspace_t *vms = vmc->vmc_space;
903 	uint64_t gen;
904 
905 	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
906 	ASSERT3S(vmc->vmc_cpu_active, ==, -1);
907 
908 	/*
909 	 * Since the NPT activation occurs with interrupts disabled, this must
910 	 * be done without taking vmc_lock like normal.
911 	 */
912 	gen = vms->vms_pt_gen;
913 	vmc->vmc_cpu_active = CPU->cpu_id;
914 	vmc->vmc_cpu_gen = gen;
915 	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);
916 
917 	return (gen);
918 }
919 
920 /*
921  * Indicate that this VM client is no longer (directly) using the underlying
922  * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
923  * this function.
924  */
925 void
926 vmc_table_exit(vm_client_t *vmc)
927 {
928 	mutex_enter(&vmc->vmc_lock);
929 
930 	ASSERT(vmc->vmc_state & VCS_ON_CPU);
931 	vmc->vmc_state ^= VCS_ON_CPU;
932 	vmc->vmc_cpu_active = -1;
933 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
934 		cv_broadcast(&vmc->vmc_cv);
935 	}
936 
937 	mutex_exit(&vmc->vmc_lock);
938 }
939 
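/*
 * Illustrative sketch (hypothetical, not taken from the actual vCPU loop) of
 * how vmc_table_enter()/vmc_table_exit() and the returned generation might be
 * used; `last_seen_gen`, `flush_guest_tlb()`, and `run_guest()` are made-up
 * names standing in for backend-specific logic.
 *
 *	uint64_t gen = vmc_table_enter(vmc);
 *	if (gen != last_seen_gen) {
 *		flush_guest_tlb();
 *		last_seen_gen = gen;
 *	}
 *	run_guest();
 *	vmc_table_exit(vmc);
 */
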
940 static void
941 vmc_space_hold(vm_client_t *vmc)
942 {
943 	mutex_enter(&vmc->vmc_lock);
944 	VERIFY0(vmc->vmc_state & VCS_HOLD);
945 
946 	/*
947 	 * Because vmc_table_enter() alters vmc_state from a context where
948 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
949 	 * VCS_HOLD must be done atomically here.
950 	 */
951 	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);
952 
953 	/* Wait for client to go inactive */
954 	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
955 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
956 	}
957 	mutex_exit(&vmc->vmc_lock);
958 }
959 
960 static void
961 vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
962 {
963 	mutex_enter(&vmc->vmc_lock);
964 	VERIFY(vmc->vmc_state & VCS_HOLD);
965 
966 	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
967 		poke_cpu(vmc->vmc_cpu_active);
968 
969 		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
970 			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
971 		}
972 	}
973 
974 	/*
975 	 * Because vmc_table_enter() alters vmc_state from a context where
976 	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
977 	 * VCS_HOLD must be done atomically here.
978 	 */
979 	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
980 	cv_broadcast(&vmc->vmc_cv);
981 	mutex_exit(&vmc->vmc_lock);
982 }
983 
984 static void
985 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
986     uint64_t gen)
987 {
988 	mutex_enter(&vmc->vmc_lock);
989 	VERIFY(vmc->vmc_state & VCS_HOLD);
990 	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
991 		/*
992 		 * Wait for clients using an old generation of the page tables
993 		 * to exit guest context, where they subsequently flush the TLB
994 		 * for the new generation.
995 		 */
996 		if (vmc->vmc_cpu_gen < gen) {
997 			poke_cpu(vmc->vmc_cpu_active);
998 
999 			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1000 				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1001 			}
1002 		}
1003 	}
1004 	if (vmc->vmc_inval_func != NULL) {
1005 		vmc_inval_cb_t func = vmc->vmc_inval_func;
1006 		void *data = vmc->vmc_inval_data;
1007 
1008 		/*
1009 		 * Perform the actual invalidation call outside vmc_lock to
1010 		 * avoid lock ordering issues in the consumer.  Since the client
1011 		 * is under VCS_HOLD, this is safe.
1012 		 */
1013 		mutex_exit(&vmc->vmc_lock);
1014 		func(data, addr, size);
1015 		mutex_enter(&vmc->vmc_lock);
1016 	}
1017 	mutex_exit(&vmc->vmc_lock);
1018 }
1019 
1020 static void
1021 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
1022     vm_object_t *vmo)
1023 {
1024 	mutex_enter(&vmc->vmc_lock);
1025 	VERIFY(vmc->vmc_state & VCS_HOLD);
1026 
1027 	/*
1028 	 * With the current vCPU exclusion invariants in place, we do not expect
1029 	 * a vCPU to be in guest context during an unmap.
1030 	 */
1031 	VERIFY0(vmc->vmc_state & VCS_ON_CPU);
1032 
1033 	/*
1034 	 * Any holds against the unmapped region need to establish their own
1035 	 * reference to the underlying object to avoid a potential
1036 	 * use-after-free.
1037 	 */
1038 	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1039 	    vmp != NULL;
1040 	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1041 		if (vmp->vmp_gpa < addr ||
1042 		    vmp->vmp_gpa >= (addr + size)) {
1043 			/* Hold outside region in question */
1044 			continue;
1045 		}
1046 		if (vmp->vmp_obj_ref == NULL) {
1047 			vm_object_reference(vmo);
1048 			vmp->vmp_obj_ref = vmo;
1049 			/* For an unmapped region, PTE is now meaningless */
1050 			vmp->vmp_ptep = NULL;
1051 		} else {
1052 			/*
1053 			 * Object could have gone through cycle of
1054 			 * unmap-map-unmap before the hold was released.
1055 			 */
1056 			VERIFY3P(vmp->vmp_ptep, ==, NULL);
1057 		}
1058 	}
1059 	mutex_exit(&vmc->vmc_lock);
1060 }
1061 
1062 static vm_client_t *
1063 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
1064 {
1065 	vm_client_t *next;
1066 
1067 	ASSERT(MUTEX_HELD(&vms->vms_lock));
1068 
1069 	mutex_enter(&vmc->vmc_lock);
1070 	VERIFY3P(vmc->vmc_space, ==, vms);
1071 	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
1072 	if (vmc->vmc_state & VCS_DESTROY) {
1073 		/*
1074 		 * This vm_client is currently undergoing destruction, so it
1075 		 * does not need to be orphaned.  Let it proceed with its own
1076 		 * clean-up task.
1077 		 */
1078 		next = list_next(&vms->vms_clients, vmc);
1079 	} else {
1080 		/*
1081 		 * Clients are only orphaned when the containing vmspace is
1082 		 * being torn down.  All mappings from the vmspace should
1083 		 * already be gone, meaning any remaining held pages should have
1084 		 * direct references to the object.
1085 		 */
1086 		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1087 		    vmp != NULL;
1088 		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1089 			ASSERT3P(vmp->vmp_ptep, ==, NULL);
1090 			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
1091 		}
1092 
1093 		/*
1094 		 * After this point, the client will be orphaned, unable to
1095 		 * establish new page holds (or access any vmspace-related
1096 		 * resources) and is in charge of cleaning up after itself.
1097 		 */
1098 		vmc->vmc_state |= VCS_ORPHANED;
1099 		next = list_next(&vms->vms_clients, vmc);
1100 		list_remove(&vms->vms_clients, vmc);
1101 		vmc->vmc_space = NULL;
1102 	}
1103 	mutex_exit(&vmc->vmc_lock);
1104 	return (next);
1105 }
1106 
1107 /*
1108  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1109  */
1110 vm_page_t *
1111 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
1112 {
1113 	vmspace_t *vms = vmc->vmc_space;
1114 	vm_page_t *vmp;
1115 	pfn_t pfn = PFN_INVALID;
1116 	uint64_t *ptep = NULL;
1117 
1118 	ASSERT0(gpa & PAGEOFFSET);
1119 	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
1120 
1121 	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
1122 	if (vmc_activate(vmc) != 0) {
1123 		kmem_free(vmp, sizeof (*vmp));
1124 		return (NULL);
1125 	}
1126 
1127 	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
1128 		vmc_deactivate(vmc);
1129 		kmem_free(vmp, sizeof (*vmp));
1130 		return (NULL);
1131 	}
1132 	ASSERT(pfn != PFN_INVALID && ptep != NULL);
1133 
1134 	vmp->vmp_client = vmc;
1135 	vmp->vmp_chain = NULL;
1136 	vmp->vmp_gpa = gpa;
1137 	vmp->vmp_pfn = pfn;
1138 	vmp->vmp_ptep = ptep;
1139 	vmp->vmp_obj_ref = NULL;
1140 	vmp->vmp_prot = prot;
1141 	list_insert_tail(&vmc->vmc_held_pages, vmp);
1142 	vmc_deactivate(vmc);
1143 
1144 	return (vmp);
1145 }
1146 
1147 int
1148 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
1149 {
1150 	vmspace_t *vms = vmc->vmc_space;
1151 	int err;
1152 
1153 	err = vmc_activate(vmc);
1154 	if (err == 0) {
1155 		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
1156 		vmc_deactivate(vmc);
1157 	}
1158 
1159 	return (err);
1160 }
1161 
1162 /*
1163  * Allocate an additional vm_client_t, based on an existing one.  Only the
1164  * association with the vmspace is cloned, not existing holds or any
1165  * configured invalidation function.
1166  */
1167 vm_client_t *
1168 vmc_clone(vm_client_t *vmc)
1169 {
1170 	vmspace_t *vms = vmc->vmc_space;
1171 
1172 	return (vmspace_client_alloc(vms));
1173 }
1174 
1175 /*
1176  * Register a function (and associated data pointer) to be called when an
1177  * address range in the vmspace is invalidated.
1178  */
1179 int
1180 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
1181 {
1182 	int err;
1183 
1184 	err = vmc_activate(vmc);
1185 	if (err == 0) {
1186 		vmc->vmc_inval_func = func;
1187 		vmc->vmc_inval_data = data;
1188 		vmc_deactivate(vmc);
1189 	}
1190 
1191 	return (err);
1192 }
1193 
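/*
 * Illustrative sketch (hypothetical consumer): a subsystem caching guest page
 * translations could register a callback so cached entries covering an
 * invalidated range are dropped.  `my_cache` and `my_cache_drop()` are
 * made-up names; a non-zero (ENXIO) result indicates an orphaned client.
 *
 *	static void
 *	my_inval_cb(void *arg, uintptr_t gpa, size_t len)
 *	{
 *		my_cache_drop(arg, gpa, len);
 *	}
 *
 *	int err = vmc_set_inval_cb(vmc, my_inval_cb, my_cache);
 */
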
1194 /*
1195  * Destroy a vm_client_t instance.
1196  *
1197  * No pages held through this vm_client_t may be outstanding when performing a
1198  * vmc_destroy().  For vCPU clients, the client must not be on-CPU (that is,
1199  * vmc_table_exit() must already have been called).
1200  */
1201 void
1202 vmc_destroy(vm_client_t *vmc)
1203 {
1204 	mutex_enter(&vmc->vmc_lock);
1205 
1206 	VERIFY(list_is_empty(&vmc->vmc_held_pages));
1207 	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1208 
1209 	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
1210 		vmspace_t *vms;
1211 
1212 		/*
1213 		 * Deassociation with the parent vmspace must be done carefully:
1214 		 * The vmspace could attempt to orphan this vm_client while we
1215 		 * release vmc_lock in order to take vms_lock (the required
1216 		 * order).  The client is marked to indicate that destruction is
1217 		 * under way.  Doing so prevents any racing orphan operation
1218 		 * from applying to this client, allowing us to deassociate from
1219 		 * the vmspace safely.
1220 		 */
1221 		vmc->vmc_state |= VCS_DESTROY;
1222 		vms = vmc->vmc_space;
1223 		mutex_exit(&vmc->vmc_lock);
1224 
1225 		mutex_enter(&vms->vms_lock);
1226 		mutex_enter(&vmc->vmc_lock);
1227 		list_remove(&vms->vms_clients, vmc);
1228 		/*
1229 		 * If the vmspace began its own destruction operation while we
1230 		 * were navigating the locks, be sure to notify it about this
1231 		 * vm_client being deassociated.
1232 		 */
1233 		cv_signal(&vms->vms_cv);
1234 		mutex_exit(&vmc->vmc_lock);
1235 		mutex_exit(&vms->vms_lock);
1236 	} else {
1237 		VERIFY3P(vmc->vmc_space, ==, NULL);
1238 		mutex_exit(&vmc->vmc_lock);
1239 	}
1240 
1241 	mutex_destroy(&vmc->vmc_lock);
1242 	cv_destroy(&vmc->vmc_cv);
1243 	list_destroy(&vmc->vmc_held_pages);
1244 
1245 	kmem_free(vmc, sizeof (*vmc));
1246 }
1247 
1248 static __inline void *
1249 vmp_ptr(const vm_page_t *vmp)
1250 {
1251 	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
1252 
1253 	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
1254 	return ((void *)((uintptr_t)kpm_vbase + paddr));
1255 }
1256 
1257 /*
1258  * Get a readable kernel-virtual pointer for a held page.
1259  *
1260  * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
1261  * call to acquire this page reference.
1262  */
1263 const void *
1264 vmp_get_readable(const vm_page_t *vmp)
1265 {
1266 	ASSERT(vmp->vmp_prot & PROT_READ);
1267 
1268 	return (vmp_ptr(vmp));
1269 }
1270 
1271 /*
1272  * Get a writable kernel-virtual pointer for a held page.
1273  *
1274  * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
1275  * call to acquire this page reference.
1276  */
1277 void *
1278 vmp_get_writable(const vm_page_t *vmp)
1279 {
1280 	ASSERT(vmp->vmp_prot & PROT_WRITE);
1281 
1282 	return (vmp_ptr(vmp));
1283 }
1284 
1285 /*
1286  * Get the host-physical PFN for a held page.
1287  */
1288 pfn_t
1289 vmp_get_pfn(const vm_page_t *vmp)
1290 {
1291 	return (vmp->vmp_pfn);
1292 }
1293 
1294 /*
1295  * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
1296  */
1297 void
1298 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
1299 {
1300 	ASSERT3P(vmp->vmp_chain, ==, NULL);
1301 
1302 	vmp->vmp_chain = to_chain;
1303 }
1304 
1305 /*
1306  * Retrieve the pointer from the page-chaining in `vmp`.
1307  */
1308 vm_page_t *
1309 vmp_next(const vm_page_t *vmp)
1310 {
1311 	return (vmp->vmp_chain);
1312 }
1313 
1314 static __inline bool
1315 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
1316 {
1317 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1318 
1319 	bool was_unmapped = false;
1320 
1321 	list_remove(&vmc->vmc_held_pages, vmp);
1322 	if (vmp->vmp_obj_ref != NULL) {
1323 		ASSERT3P(vmp->vmp_ptep, ==, NULL);
1324 
1325 		vm_object_release(vmp->vmp_obj_ref);
1326 		was_unmapped = true;
1327 	} else {
1328 		ASSERT3P(vmp->vmp_ptep, !=, NULL);
1329 
1330 		if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) {
1331 			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
1332 			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
1333 		}
1334 	}
1335 	kmem_free(vmp, sizeof (*vmp));
1336 	return (was_unmapped);
1337 }
1338 
1339 /*
1340  * Release held page.  Returns true if page resided on region which was
1341  * subsequently unmapped.
1342  */
1343 bool
1344 vmp_release(vm_page_t *vmp)
1345 {
1346 	vm_client_t *vmc = vmp->vmp_client;
1347 
1348 	VERIFY(vmc != NULL);
1349 
1350 	mutex_enter(&vmc->vmc_lock);
1351 	const bool was_unmapped = vmp_release_inner(vmp, vmc);
1352 	mutex_exit(&vmc->vmc_lock);
1353 	return (was_unmapped);
1354 }
1355 
1356 /*
1357  * Release a chain of pages which were associated via vmp_chain() (setting
1358  * page-chaining pointer).  Returns true if any pages resided upon a region
1359  * which was subsequently unmapped.
1360  *
1361  * All of those pages must have been held through the same vm_client_t.
1362  */
1363 bool
1364 vmp_release_chain(vm_page_t *vmp)
1365 {
1366 	ASSERT(vmp != NULL);
1367 
1368 	vm_client_t *vmc = vmp->vmp_client;
1369 	bool any_unmapped = false;
1370 
1371 	mutex_enter(&vmc->vmc_lock);
1372 	while (vmp != NULL) {
1373 		vm_page_t *next = vmp->vmp_chain;
1374 
1375 		/* We expect all pages in chain to be from same client */
1376 		ASSERT3P(vmp->vmp_client, ==, vmc);
1377 
1378 		if (vmp_release_inner(vmp, vmc)) {
1379 			any_unmapped = true;
1380 		}
1381 		vmp = next;
1382 	}
1383 	mutex_exit(&vmc->vmc_lock);
1384 	return (any_unmapped);
1385 }
1386 
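/*
 * Illustrative sketch (hypothetical consumer): pages backing a multi-page,
 * page-aligned guest buffer can be chained together as they are held and then
 * released in one call.  `base`, `len`, and `consume_guest_buffer()` are
 * made-up names.
 *
 *	vm_page_t *head = NULL;
 *	for (uintptr_t gpa = base; gpa < base + len; gpa += PAGESIZE) {
 *		vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_READ);
 *		if (vmp == NULL)
 *			break;
 *		vmp_chain(vmp, head);
 *		head = vmp;
 *	}
 *	if (head != NULL) {
 *		consume_guest_buffer(head);
 *		(void) vmp_release_chain(head);
 *	}
 */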
1387 
1388 int
1389 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
1390     struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
1391 {
1392 	vm_object_t *vmo;
1393 	int err;
1394 
1395 	if (segoff < 0 || len <= 0 ||
1396 	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
1397 		return (EINVAL);
1398 	}
1399 	if ((prot & PROT_USER) == 0) {
1400 		return (ENOTSUP);
1401 	}
1402 	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
1403 	if (err != 0) {
1404 		return (err);
1405 	}
1406 
1407 	VERIFY(segoff >= 0);
1408 	VERIFY(len <= vmo->vmo_size);
1409 	VERIFY((len + segoff) <= vmo->vmo_size);
1410 
1411 	if (vmo->vmo_type != VMOT_MEM) {
1412 		/* Only support memory objects for now */
1413 		return (ENOTSUP);
1414 	}
1415 
1416 	as_rangelock(as);
1417 
1418 	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
1419 	if (err == 0) {
1420 		segvmm_crargs_t svma;
1421 
1422 		svma.prot = prot;
1423 		svma.offset = segoff;
1424 		svma.vmo = vmo;
1425 		svma.vmc = NULL;
1426 
1427 		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
1428 	}
1429 
1430 	as_rangeunlock(as);
1431 	return (err);
1432 }
1433 
1434 int
1435 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
1436     off_t len, uint_t prot, uint_t maxprot, uint_t flags)
1437 {
1438 
1439 	const uintptr_t gpa = (uintptr_t)off;
1440 	const size_t size = (uintptr_t)len;
1441 	int err;
1442 
1443 	if (off < 0 || len <= 0 ||
1444 	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
1445 		return (EINVAL);
1446 	}
1447 	if ((prot & PROT_USER) == 0) {
1448 		return (ENOTSUP);
1449 	}
1450 
1451 	as_rangelock(as);
1452 
1453 	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
1454 	if (err == 0) {
1455 		segvmm_crargs_t svma;
1456 
1457 		svma.prot = prot;
1458 		svma.offset = gpa;
1459 		svma.vmo = NULL;
1460 		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
1461 
1462 		err = as_map(as, *addrp, len, segvmm_create, &svma);
1463 	}
1464 
1465 	as_rangeunlock(as);
1466 	return (err);
1467 }
1468