/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <x86/segments.h>
#include <machine/specialreg.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#ifndef	__FreeBSD__
#include <sys/vmm_impl.h>
#endif

#include "vmmapi.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifndef __FreeBSD__
/* shim to no-op for now */
#define	MAP_NOCORE		0
#define	MAP_ALIGNED_SUPER	0

/* Rely on PROT_NONE for guard purposes */
#define	MAP_GUARD		(MAP_PRIVATE | MAP_ANON | MAP_NORESERVE)
#endif

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

struct vmctx {
	int	fd;
	uint32_t lowmem_limit;
	int	memflags;
	size_t	lowmem;
	size_t	highmem;
	char	*baseaddr;
	char	*name;
};

#ifdef	__FreeBSD__
#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
#else
#define	CREATE(x)	vm_do_ctl(VMM_CREATE_VM, (x))
#define	DESTROY(x)	vm_do_ctl(VMM_DESTROY_VM, (x))

static int
vm_do_ctl(int cmd, const char *name)
{
	int ctl_fd;

	ctl_fd = open(VMM_CTL_DEV, O_EXCL | O_RDWR);
	if (ctl_fd < 0) {
		return (-1);
	}

	if (ioctl(ctl_fd, cmd, name) == -1) {
		int err = errno;

		/* Do not lose ioctl errno through the close(2) */
		(void) close(ctl_fd);
		errno = err;
		return (-1);
	}
	(void) close(ctl_fd);

	return (0);
}
#endif

static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	assert(vmfile != NULL);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}

int
vm_create(const char *name)
{

	return (CREATE((char *)name));
}

struct vmctx *
vm_open(const char *name)
{
	struct vmctx *vm;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = -1;
	vm->memflags = 0;
	vm->lowmem_limit = 3 * GB;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);

	if ((vm->fd = vm_device_open(vm->name)) < 0)
		goto err;

	return (vm);
err:
#ifdef __FreeBSD__
	vm_destroy(vm);
#else
	/*
	 * As libvmmapi is used by other programs to query and control bhyve
	 * VMs, destroying a VM just because the open failed isn't useful. We
	 * have to free what we have allocated, though.
	 */
	free(vm);
#endif
	return (NULL);
}

#ifndef __FreeBSD__
void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);
	assert(vm->fd >= 0);

	(void) close(vm->fd);

	free(vm);
}
#endif

void
vm_destroy(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		close(vm->fd);
	DESTROY(vm->name);

	free(vm);
}
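
/*
 * Example (illustrative sketch, error handling abbreviated): the typical
 * lifecycle of a VM handle built from the functions above.
 *
 *	if (vm_create("testvm") != 0)
 *		err(1, "vm_create");
 *
 *	struct vmctx *ctx = vm_open("testvm");
 *	if (ctx == NULL)
 *		err(1, "vm_open");
 *
 *	... use the context ...
 *
 *	vm_destroy(ctx);	destroys the VM and frees the handle
 */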

int
vm_parse_memsize(const char *optarg, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(optarg, &endptr, 0);
	if (*optarg != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility, if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(optarg, ret_memsize);

	return (error);
}
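
/*
 * Examples of accepted sizes (illustrative): a bare number below 1 MB is
 * taken as a count of megabytes, larger bare numbers are taken as bytes,
 * and suffixed values are decoded by expand_number(3).
 *
 *	size_t memsize;
 *
 *	vm_parse_memsize("512", &memsize);	512 MB (compat rule)
 *	vm_parse_memsize("2G", &memsize);	2 GB via expand_number()
 *	vm_parse_memsize("0x40000000", &memsize);	1 GB, given in bytes
 */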

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx)
{

	return (ctx->lowmem_limit);
}

void
vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit)
{

	ctx->lowmem_limit = limit;
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range
 * [gpa, gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}
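
/*
 * Example (illustrative sketch): walking every memory mapping of a VM.
 * VM_MMAP_GETNEXT returns the mapping at or above the given gpa, so
 * advancing the probe past each returned range visits them all.
 *
 *	vm_paddr_t gpa = 0;
 *	vm_ooffset_t segoff;
 *	size_t maplen;
 *	int segid, prot, flags;
 *
 *	while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen,
 *	    &prot, &flags) == 0) {
 *		printf("gpa %lx len %lx segid %d\n",
 *		    (long)gpa, (long)maplen, segid);
 *		gpa += maplen;
 *	}
 */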

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
#ifdef __FreeBSD__
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
#else
setup_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len,
    char *base)
#endif
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
#ifdef __FreeBSD__
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
#else
	/*
	 * Because illumos uses separate segments for lowmem and highmem,
	 * the offset within each segment is always 0.
	 */
	error = vm_mmap_memseg(ctx, gpa, segid, 0, len, PROT_ALL);
#endif
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then
	 * create another 'highmem' segment above 4GB for the remainder.
	 */
	if (memsize > ctx->lowmem_limit) {
		ctx->lowmem = ctx->lowmem_limit;
		ctx->highmem = memsize - ctx->lowmem_limit;
		objsize = 4*GB + ctx->highmem;
	} else {
		ctx->lowmem = memsize;
		ctx->highmem = 0;
		objsize = ctx->lowmem;
	}

#ifdef __FreeBSD__
	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);
#endif

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;

#ifdef __FreeBSD__
	if (ctx->highmem > 0) {
		gpa = 4*GB;
		len = ctx->highmem;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->lowmem > 0) {
		gpa = 0;
		len = ctx->lowmem;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}
#else
	if (ctx->highmem > 0) {
		error = vm_alloc_memseg(ctx, VM_HIGHMEM, ctx->highmem, NULL);
		if (error)
			return (error);
		gpa = 4*GB;
		len = ctx->highmem;
		error = setup_memory_segment(ctx, VM_HIGHMEM, gpa, len,
		    baseaddr);
		if (error)
			return (error);
	}

	if (ctx->lowmem > 0) {
		error = vm_alloc_memseg(ctx, VM_LOWMEM, ctx->lowmem, NULL);
		if (error)
			return (error);
		gpa = 0;
		len = ctx->lowmem;
		error = setup_memory_segment(ctx, VM_LOWMEM, gpa, len,
		    baseaddr);
		if (error)
			return (error);
	}
#endif

	ctx->baseaddr = baseaddr;

	return (0);
}
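
/*
 * Example (illustrative sketch): giving a new VM 2 GB of guest memory.
 * With the default 3 GB lowmem limit set in vm_open() this creates a
 * single lowmem segment; sizes above the limit get an additional
 * highmem segment starting at 4 GB.
 *
 *	if (vm_setup_memory(ctx, 2 * GB, VM_MMAP_ALL) != 0)
 *		err(1, "vm_setup_memory");
 */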

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular, it returns NULL if [gaddr, gaddr+len) falls within the
 * guest MMIO region. The instruction emulation code depends on this
 * behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{

	if (ctx->lowmem > 0) {
		if (gaddr < ctx->lowmem && len <= ctx->lowmem &&
		    gaddr + len <= ctx->lowmem)
			return (ctx->baseaddr + gaddr);
	}

	if (ctx->highmem > 0) {
		if (gaddr >= 4*GB) {
			if (gaddr < 4*GB + ctx->highmem &&
			    len <= ctx->highmem &&
			    gaddr + len <= 4*GB + ctx->highmem)
				return (ctx->baseaddr + gaddr);
		}
	}

	return (NULL);
}
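
/*
 * Example (illustrative sketch): peeking at guest memory from the host.
 * The address is an arbitrary illustrative value; the seemingly
 * redundant bounds checks above also guard against gaddr + len
 * wrapping around.
 *
 *	uint32_t *p;
 *
 *	p = vm_map_gpa(ctx, 0x1000, sizeof (*p));
 *	if (p != NULL)
 *		printf("guest word at 0x1000: 0x%x\n", *p);
 */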

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->lowmem);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->highmem);
}

#ifndef __FreeBSD__
int
vm_get_devmem_offset(struct vmctx *ctx, int segid, off_t *mapoff)
{
	struct vm_devmem_offset vdo;
	int error;

	vdo.segid = segid;
	error = ioctl(ctx->fd, VM_DEVMEM_GETOFFSET, &vdo);
	if (error == 0)
		*mapoff = vdo.offset;

	return (error);
}
#endif

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
#ifdef	__FreeBSD__
	char pathname[MAXPATHLEN];
#endif
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;
	off_t mapoff;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

#ifdef	__FreeBSD__
	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;
#else
	if (vm_get_devmem_offset(ctx, segid, &mapoff) != 0)
		goto done;
#endif

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

#ifdef	__FreeBSD__
	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
#else
	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, ctx->fd,
	    mapoff);
#endif
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
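
/*
 * Example (illustrative sketch): backing an emulated framebuffer with a
 * device memory segment.  The segment id, name and size here are
 * hypothetical; real callers such as the bhyve framebuffer device pick
 * their own.  Note that failure is signalled with MAP_FAILED, not NULL.
 *
 *	void *fb;
 *
 *	fb = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", 16 * MB);
 *	if (fb == MAP_FAILED)
 *		err(1, "vm_create_devmem");
 */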

int
vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
	    uint64_t base, uint32_t limit, uint32_t access)
{
	int error;
	struct vm_seg_desc vmsegdesc;

	bzero(&vmsegdesc, sizeof(vmsegdesc));
	vmsegdesc.cpuid = vcpu;
	vmsegdesc.regnum = reg;
	vmsegdesc.desc.base = base;
	vmsegdesc.desc.limit = limit;
	vmsegdesc.desc.access = access;

	error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
	return (error);
}

int
vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
	    uint64_t *base, uint32_t *limit, uint32_t *access)
{
	int error;
	struct vm_seg_desc vmsegdesc;

	bzero(&vmsegdesc, sizeof(vmsegdesc));
	vmsegdesc.cpuid = vcpu;
	vmsegdesc.regnum = reg;

	error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
	if (error == 0) {
		*base = vmsegdesc.desc.base;
		*limit = vmsegdesc.desc.limit;
		*access = vmsegdesc.desc.access;
	}
	return (error);
}

int
vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc)
{
	int error;

	error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
	    &seg_desc->access);
	return (error);
}

int
vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.cpuid = vcpu;
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.cpuid = vcpu;
	vmreg.regnum = reg;

	error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.cpuid = vcpu;
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.cpuid = vcpu;
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit)
{
	int error;
	struct vm_run vmrun;

	bzero(&vmrun, sizeof(vmrun));
	vmrun.cpuid = vcpu;

	error = ioctl(ctx->fd, VM_RUN, &vmrun);
	bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
	return (error);
}
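
/*
 * Example (illustrative sketch): the core of a vcpu loop.  vm_run()
 * blocks until the vcpu exits to userspace; the handle_exit() helper is
 * hypothetical and stands in for bhyve's exit dispatch.
 *
 *	struct vm_exit vmexit;
 *
 *	for (;;) {
 *		if (vm_run(ctx, vcpu, &vmexit) != 0)
 *			break;
 *		if (handle_exit(ctx, vcpu, &vmexit) != 0)
 *			break;
 *	}
 */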

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid,
    uint32_t errcode, int restart_instruction)
{
	struct vm_exception exc;

	exc.cpuid = vcpu;
	exc.vector = vector;
	exc.error_code = errcode;
	exc.error_code_valid = errcode_valid;
	exc.restart_instruction = restart_instruction;

	return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
}

int
vm_apicid2vcpu(struct vmctx *ctx, int apicid)
{
	/*
	 * The apic id associated with the 'vcpu' has the same numerical value
	 * as the 'vcpu' itself.
	 */
	return (apicid);
}

int
vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
{
	struct vm_lapic_irq vmirq;

	bzero(&vmirq, sizeof(vmirq));
	vmirq.cpuid = vcpu;
	vmirq.vector = vector;

	return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
}

int
vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector)
{
	struct vm_lapic_irq vmirq;

	bzero(&vmirq, sizeof(vmirq));
	vmirq.cpuid = vcpu;
	vmirq.vector = vector;

	return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq));
}

int
vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg)
{
	struct vm_lapic_msi vmmsi;

	bzero(&vmmsi, sizeof(vmmsi));
	vmmsi.addr = addr;
	vmmsi.msg = msg;

	return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi));
}

int
vm_ioapic_assert_irq(struct vmctx *ctx, int irq)
{
	struct vm_ioapic_irq ioapic_irq;

	bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
	ioapic_irq.irq = irq;

	return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq));
}

int
vm_ioapic_deassert_irq(struct vmctx *ctx, int irq)
{
	struct vm_ioapic_irq ioapic_irq;

	bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
	ioapic_irq.irq = irq;

	return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq));
}

int
vm_ioapic_pulse_irq(struct vmctx *ctx, int irq)
{
	struct vm_ioapic_irq ioapic_irq;

	bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
	ioapic_irq.irq = irq;

	return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq));
}

int
vm_ioapic_pincount(struct vmctx *ctx, int *pincount)
{

	return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount));
}

int
vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
{
	struct vm_isa_irq isa_irq;

	bzero(&isa_irq, sizeof(struct vm_isa_irq));
	isa_irq.atpic_irq = atpic_irq;
	isa_irq.ioapic_irq = ioapic_irq;

	return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq));
}

int
vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
{
	struct vm_isa_irq isa_irq;

	bzero(&isa_irq, sizeof(struct vm_isa_irq));
	isa_irq.atpic_irq = atpic_irq;
	isa_irq.ioapic_irq = ioapic_irq;

	return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq));
}

int
vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
{
	struct vm_isa_irq isa_irq;

	bzero(&isa_irq, sizeof(struct vm_isa_irq));
	isa_irq.atpic_irq = atpic_irq;
	isa_irq.ioapic_irq = ioapic_irq;

	return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq));
}

int
vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
    enum vm_intr_trigger trigger)
{
	struct vm_isa_irq_trigger isa_irq_trigger;

	bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger));
	isa_irq_trigger.atpic_irq = atpic_irq;
	isa_irq_trigger.trigger = trigger;

	return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger));
}

int
vm_inject_nmi(struct vmctx *ctx, int vcpu)
{
	struct vm_nmi vmnmi;

	bzero(&vmnmi, sizeof(vmnmi));
	vmnmi.cpuid = vcpu;

	return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
}

static struct {
	const char	*name;
	int		type;
} capstrmap[] = {
	{ "hlt_exit",		VM_CAP_HALT_EXIT },
	{ "mtrap_exit",		VM_CAP_MTRAP_EXIT },
	{ "pause_exit",		VM_CAP_PAUSE_EXIT },
	{ "unrestricted_guest",	VM_CAP_UNRESTRICTED_GUEST },
	{ "enable_invpcid",	VM_CAP_ENABLE_INVPCID },
	{ 0 }
};

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
		if (strcmp(capstrmap[i].name, capname) == 0)
			return (capstrmap[i].type);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	int i;

	for (i = 0; capstrmap[i].name != NULL; i++) {
		if (capstrmap[i].type == type)
			return (capstrmap[i].name);
	}

	return (NULL);
}
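
/*
 * Example (illustrative sketch): enabling a capability by its string
 * name, e.g. from a command-line option.
 *
 *	int captype;
 *
 *	captype = vm_capability_name2type("hlt_exit");
 *	if (captype < 0 ||
 *	    vm_set_capability(ctx, vcpu, captype, 1) != 0)
 *		err(1, "cannot enable hlt_exit");
 */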

int
vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
		  int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.cpuid = vcpu;
	vmcap.captype = cap;

	error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.cpuid = vcpu;
	vmcap.captype = cap;
	vmcap.capval = val;

	return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
}

#ifdef __FreeBSD__
int
vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
	struct vm_pptdev pptdev;

	bzero(&pptdev, sizeof(pptdev));
	pptdev.bus = bus;
	pptdev.slot = slot;
	pptdev.func = func;

	return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
}

int
vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
	struct vm_pptdev pptdev;

	bzero(&pptdev, sizeof(pptdev));
	pptdev.bus = bus;
	pptdev.slot = slot;
	pptdev.func = func;

	return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
}

int
vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
		   vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	struct vm_pptdev_mmio pptmmio;

	bzero(&pptmmio, sizeof(pptmmio));
	pptmmio.bus = bus;
	pptmmio.slot = slot;
	pptmmio.func = func;
	pptmmio.gpa = gpa;
	pptmmio.len = len;
	pptmmio.hpa = hpa;

	return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
}

int
vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
    uint64_t addr, uint64_t msg, int numvec)
{
	struct vm_pptdev_msi pptmsi;

	bzero(&pptmsi, sizeof(pptmsi));
	pptmsi.vcpu = vcpu;
	pptmsi.bus = bus;
	pptmsi.slot = slot;
	pptmsi.func = func;
	pptmsi.msg = msg;
	pptmsi.addr = addr;
	pptmsi.numvec = numvec;

	return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
}

int
vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
    int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
{
	struct vm_pptdev_msix pptmsix;

	bzero(&pptmsix, sizeof(pptmsix));
	pptmsix.vcpu = vcpu;
	pptmsix.bus = bus;
	pptmsix.slot = slot;
	pptmsix.func = func;
	pptmsix.idx = idx;
	pptmsix.msg = msg;
	pptmsix.addr = addr;
	pptmsix.vector_control = vector_control;

	return (ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix));
}

int
vm_get_pptdev_limits(struct vmctx *ctx, int bus, int slot, int func,
    int *msi_limit, int *msix_limit)
{
	struct vm_pptdev_limits pptlimits;
	int error;

	bzero(&pptlimits, sizeof (pptlimits));
	pptlimits.bus = bus;
	pptlimits.slot = slot;
	pptlimits.func = func;

	error = ioctl(ctx->fd, VM_GET_PPTDEV_LIMITS, &pptlimits);

	*msi_limit = pptlimits.msi_limit;
	*msix_limit = pptlimits.msix_limit;

	return (error);
}
#else /* __FreeBSD__ */
int
vm_assign_pptdev(struct vmctx *ctx, int pptfd)
{
	struct vm_pptdev pptdev;

	pptdev.pptfd = pptfd;
	return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
}

int
vm_unassign_pptdev(struct vmctx *ctx, int pptfd)
{
	struct vm_pptdev pptdev;

	pptdev.pptfd = pptfd;
	return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
}

int
vm_map_pptdev_mmio(struct vmctx *ctx, int pptfd, vm_paddr_t gpa, size_t len,
    vm_paddr_t hpa)
{
	struct vm_pptdev_mmio pptmmio;

	pptmmio.pptfd = pptfd;
	pptmmio.gpa = gpa;
	pptmmio.len = len;
	pptmmio.hpa = hpa;
	return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
}

int
vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int pptfd, uint64_t addr,
    uint64_t msg, int numvec)
{
	struct vm_pptdev_msi pptmsi;

	pptmsi.vcpu = vcpu;
	pptmsi.pptfd = pptfd;
	pptmsi.msg = msg;
	pptmsi.addr = addr;
	pptmsi.numvec = numvec;
	return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
}

int
vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int pptfd, int idx,
    uint64_t addr, uint64_t msg, uint32_t vector_control)
{
	struct vm_pptdev_msix pptmsix;

	pptmsix.vcpu = vcpu;
	pptmsix.pptfd = pptfd;
	pptmsix.idx = idx;
	pptmsix.msg = msg;
	pptmsix.addr = addr;
	pptmsix.vector_control = vector_control;
	return (ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix));
}

int
vm_get_pptdev_limits(struct vmctx *ctx, int pptfd, int *msi_limit,
    int *msix_limit)
{
	struct vm_pptdev_limits pptlimits;
	int error;

	bzero(&pptlimits, sizeof (pptlimits));
	pptlimits.pptfd = pptfd;
	error = ioctl(ctx->fd, VM_GET_PPTDEV_LIMITS, &pptlimits);

	*msi_limit = pptlimits.msi_limit;
	*msix_limit = pptlimits.msix_limit;
	return (error);
}
#endif /* __FreeBSD__ */

uint64_t *
vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
	     int *ret_entries)
{
	int error;

	static struct vm_stats vmstats;

	vmstats.cpuid = vcpu;

	error = ioctl(ctx->fd, VM_STATS_IOC, &vmstats);
	if (error == 0) {
		if (ret_entries)
			*ret_entries = vmstats.num_entries;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (vmstats.statbuf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}
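
/*
 * Example (illustrative sketch): dumping every statistic for a vcpu.
 * Note that vm_get_stats() returns a pointer into static storage, so
 * it is not thread-safe.
 *
 *	uint64_t *stats;
 *	struct timeval tv;
 *	int i, num_stats;
 *
 *	stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
 *	if (stats != NULL) {
 *		for (i = 0; i < num_stats; i++)
 *			printf("%-40s %ld\n",
 *			    vm_get_stat_desc(ctx, i), (long)stats[i]);
 *	}
 */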

int
vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state)
{
	int error;
	struct vm_x2apic x2apic;

	bzero(&x2apic, sizeof(x2apic));
	x2apic.cpuid = vcpu;

	error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic);
	*state = x2apic.state;
	return (error);
}

int
vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state)
{
	int error;
	struct vm_x2apic x2apic;

	bzero(&x2apic, sizeof(x2apic));
	x2apic.cpuid = vcpu;
	x2apic.state = state;

	error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic);

	return (error);
}

/*
 * From Intel Vol 3a:
 * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
 */
int
vcpu_reset(struct vmctx *vmctx, int vcpu)
{
	int error;
	uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
	uint32_t desc_access, desc_limit;
	uint16_t sel;

	zero = 0;

	rflags = 0x2;
	error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
	if (error)
		goto done;

	rip = 0xfff0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
		goto done;

	cr0 = CR0_NE;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
		goto done;

	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
		goto done;

	cr4 = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
		goto done;

	/*
	 * CS: present, r/w, accessed, 16-bit, byte granularity, usable
	 */
	desc_base = 0xffff0000;
	desc_limit = 0xffff;
	desc_access = 0x0093;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
			    desc_base, desc_limit, desc_access);
	if (error)
		goto done;

	sel = 0xf000;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
		goto done;

	/*
	 * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
	 */
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0x0093;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
			    desc_base, desc_limit, desc_access);
	if (error)
		goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
			    desc_base, desc_limit, desc_access);
	if (error)
		goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
			    desc_base, desc_limit, desc_access);
	if (error)
		goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
			    desc_base, desc_limit, desc_access);
	if (error)
		goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
			    desc_base, desc_limit, desc_access);
	if (error)
		goto done;

	sel = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
		goto done;

	/* General purpose registers */
	rdx = 0xf00;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
		goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
		goto done;

	/* GDTR, IDTR */
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
			    desc_base, desc_limit, desc_access);
	if (error != 0)
		goto done;

	/* TR */
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0x0000008b;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, desc_base,
			    desc_limit, desc_access);
	if (error)
		goto done;

	sel = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
		goto done;

	/* LDTR */
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0x00000082;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
			    desc_limit, desc_access);
	if (error)
		goto done;

	sel = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, sel)) != 0)
		goto done;

	/* XXX cr2, debug registers */

	error = 0;
done:
	return (error);
}

int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities)
{
	int error;
	struct vm_hpet_cap cap;

	bzero(&cap, sizeof(struct vm_hpet_cap));
	error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap);
	if (capabilities != NULL)
		*capabilities = cap.capabilities;
	return (error);
}

int
vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.vcpuid = vcpu;
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = ioctl(ctx->fd, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

int
vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.vcpuid = vcpu;
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

int
vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa;
	int error, i, n, off;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = min(len, PAGE_SIZE - off);

		va = vm_map_gpa(ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}

void
vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt)
{

	return;
}

void
vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
    size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}
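
/*
 * Example (illustrative sketch): reading a buffer from a guest linear
 * address.  vm_copy_setup() translates the (possibly page-spanning)
 * range into host iovecs, which vm_copyin() then gathers.  The paging
 * state is assumed to have been captured from a vmexit.
 *
 *	struct iovec iov[4];
 *	char buf[256];
 *	int error, fault;
 *
 *	error = vm_copy_setup(ctx, vcpu, &paging, gla, sizeof (buf),
 *	    PROT_READ, iov, nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(ctx, vcpu, iov, buf, sizeof (buf));
 */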

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vmctx *ctx, int vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = vcpu;
	error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vmctx *ctx, int vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = vcpu;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vmctx *ctx, int vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = vcpu;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.vcpuid = vcpu;
	error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.vcpuid = vcpu;
	vmii.info1 = info1;
	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
	return (error);
}

int
vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value)
{
	struct vm_rtc_data rtcdata;
	int error;

	bzero(&rtcdata, sizeof(struct vm_rtc_data));
	rtcdata.offset = offset;
	rtcdata.value = value;
	error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata);
	return (error);
}

int
vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval)
{
	struct vm_rtc_data rtcdata;
	int error;

	bzero(&rtcdata, sizeof(struct vm_rtc_data));
	rtcdata.offset = offset;
	error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata);
	if (error == 0)
		*retval = rtcdata.value;
	return (error);
}

int
vm_rtc_settime(struct vmctx *ctx, time_t secs)
{
	struct vm_rtc_time rtctime;
	int error;

	bzero(&rtctime, sizeof(struct vm_rtc_time));
	rtctime.secs = secs;
	error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime);
	return (error);
}

int
vm_rtc_gettime(struct vmctx *ctx, time_t *secs)
{
	struct vm_rtc_time rtctime;
	int error;

	bzero(&rtctime, sizeof(struct vm_rtc_time));
	error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime);
	if (error == 0)
		*secs = rtctime.secs;
	return (error);
}

int
vm_restart_instruction(void *arg, int vcpu)
{
	struct vmctx *ctx = arg;

	return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
}

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

#ifndef __FreeBSD__
int
vm_wrlock_cycle(struct vmctx *ctx)
{
	if (ioctl(ctx->fd, VM_WRLOCK_CYCLE, 0) != 0) {
		return (errno);
	}
	return (0);
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	/* keep in sync with machine/vmm_dev.h */
	static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT,
	    VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG,
	    VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER,
	    VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR,
	    VM_SET_REGISTER_SET, VM_GET_REGISTER_SET,
	    VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ,
	    VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ,
	    VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ,
	    VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER,
	    VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV,
	    VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI,
	    VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC,
	    VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE,
	    VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA,
	    VM_GLA2GPA_NOFAULT,
	    VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU,
	    VM_SET_INTINFO, VM_GET_INTINFO,
	    VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME,
	    VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY };

	if (len == NULL) {
		cmds = malloc(sizeof(vm_ioctl_cmds));
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds));
		return (cmds);
	}

	*len = nitems(vm_ioctl_cmds);
	return (NULL);
}
#endif /* __FreeBSD__ */