xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef _KERNEL
36 #include <sys/param.h>
37 #include <sys/pcpu.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/vmparam.h>
45 #include <machine/vmm.h>
46 #else	/* !_KERNEL */
47 #include <sys/types.h>
48 #include <sys/errno.h>
49 #include <sys/_iovec.h>
50 
51 #include <machine/vmm.h>
52 
53 #include <err.h>
54 #include <assert.h>
55 #include <stdbool.h>
56 #include <stddef.h>
57 #include <stdio.h>
58 #include <string.h>
59 #include <strings.h>
60 #include <vmmapi.h>
61 #define	__diagused
62 #define	KASSERT(exp,msg)	assert((exp))
63 #define	panic(...)		errx(4, __VA_ARGS__)
64 #endif	/* _KERNEL */
65 
66 #include <machine/vmm_instruction_emul.h>
67 #include <x86/psl.h>
68 #include <x86/specialreg.h>
69 
/*
 * struct vie_op.op_type
 *
 * Instruction classes used in the '.op_type' field of the opcode tables
 * in this file.  VIE_OP_TYPE_NONE is explicitly 0, so any table slot that
 * is not given an initializer (and is therefore zero-filled) has this type.
 * The remaining enumerators take consecutive values in declaration order;
 * VIE_OP_TYPE_LAST is the final enumerator.
 */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_LAST
};
93 
/*
 * struct vie_op.op_flags
 *
 * Single-bit flags that are OR-ed together in the '.op_flags' field of the
 * opcode table entries in this file.
 */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
/* NOTE(review): presumably "opcode is not followed by a ModRM byte" -- confirm in the decoder */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
/* NOTE(review): presumably "skip the guest linear address check" -- confirm in the decoder */
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
100 
/*
 * Opcode table with 256 slots in which each populated slot's index equals
 * its '.op_byte'.  Only 0xF7 (BEXTR) is populated; every other slot is
 * zero-filled and therefore has op_type VIE_OP_TYPE_NONE.
 *
 * NOTE(review): the name suggests this covers the 0F 38 escape sequence --
 * confirm against the opcode decoder.
 */
static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};
107 
/*
 * Opcode table with 256 slots in which each populated slot's index equals
 * its '.op_byte'.  Slots without an initializer are zero-filled and
 * therefore have op_type VIE_OP_TYPE_NONE.
 *
 * NOTE(review): presumably consulted for the byte following a 0x0F escape
 * (see the VIE_OP_TYPE_TWO_BYTE entry in one_byte_opcodes) -- confirm
 * against the opcode decoder.
 */
static const struct vie_op two_byte_opcodes[256] = {
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};
131 
/*
 * Opcode table with 256 slots in which each populated slot's index equals
 * its '.op_byte'.  Slots without an initializer are zero-filled and
 * therefore have op_type VIE_OP_TYPE_NONE.
 *
 * The entries use designated initializers and are not listed in numeric
 * order (e.g. 0x0F precedes 0x0B, and 0x23 follows 0xC7); the order in the
 * source has no effect on the resulting table.
 */
static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};
253 
/*
 * struct vie.mod
 *
 * Named values for the 'mod' field of struct vie.
 * NOTE(review): these match the x86 ModRM.mod encodings (register-indirect,
 * indirect + disp8, indirect + disp32, register-direct) -- confirm against
 * the ModRM decoder.
 */
#define	VIE_MOD_INDIRECT		0
#define	VIE_MOD_INDIRECT_DISP8		1
#define	VIE_MOD_INDIRECT_DISP32		2
#define	VIE_MOD_DIRECT			3

/*
 * struct vie.rm
 *
 * Named values for the 'rm' field of struct vie.
 */
#define	VIE_RM_SIB			4
#define	VIE_RM_DISP32			5

/* 2^30; the expression has type int. */
#define	GB				(1024 * 1024 * 1024)
265 
/*
 * Translation from a register number (0-15) to the corresponding
 * 'enum vm_reg_name'.  Code in this file indexes it with 'vie->reg'.
 * The position of each entry is significant: index 0 is %rax, 1 is %rcx,
 * 2 is %rdx, 3 is %rbx, 4 is %rsp, 5 is %rbp, 6 is %rsi, 7 is %rdi and
 * indices 8-15 are %r8-%r15.
 */
static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};
284 
/*
 * Bit mask covering the low 'n' bytes of a 64-bit value, indexed by the
 * size 'n' in bytes.  Only indices 1, 2, 4 and 8 are populated; the array
 * has 9 elements and the remaining ones (0, 3, 5, 6, 7) are zero, so callers
 * must only index it with a valid operand/address size.
 */
static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};
291 
292 static int
293 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
294 {
295 	int error;
296 
297 	error = vm_get_register(vm, vcpuid, reg, rval);
298 
299 	return (error);
300 }
301 
302 static void
303 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
304 {
305 	*lhbr = 0;
306 	*reg = gpr_map[vie->reg];
307 
308 	/*
309 	 * 64-bit mode imposes limitations on accessing legacy high byte
310 	 * registers (lhbr).
311 	 *
312 	 * The legacy high-byte registers cannot be addressed if the REX
313 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
314 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
315 	 *
316 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
317 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
318 	 * %ah, %ch, %dh and %bh respectively.
319 	 */
320 	if (!vie->rex_present) {
321 		if (vie->reg & 0x4) {
322 			*lhbr = 1;
323 			*reg = gpr_map[vie->reg & 0x3];
324 		}
325 	}
326 }
327 
328 static int
329 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
330 {
331 	uint64_t val;
332 	int error, lhbr;
333 	enum vm_reg_name reg;
334 
335 	vie_calc_bytereg(vie, &reg, &lhbr);
336 	error = vm_get_register(vm, vcpuid, reg, &val);
337 
338 	/*
339 	 * To obtain the value of a legacy high byte register shift the
340 	 * base register right by 8 bits (%ah = %rax >> 8).
341 	 */
342 	if (lhbr)
343 		*rval = val >> 8;
344 	else
345 		*rval = val;
346 	return (error);
347 }
348 
349 static int
350 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
351 {
352 	uint64_t origval, val, mask;
353 	int error, lhbr;
354 	enum vm_reg_name reg;
355 
356 	vie_calc_bytereg(vie, &reg, &lhbr);
357 	error = vm_get_register(vm, vcpuid, reg, &origval);
358 	if (error == 0) {
359 		val = byte;
360 		mask = 0xff;
361 		if (lhbr) {
362 			/*
363 			 * Shift left by 8 to store 'byte' in a legacy high
364 			 * byte register.
365 			 */
366 			val <<= 8;
367 			mask <<= 8;
368 		}
369 		val |= origval & ~mask;
370 		error = vm_set_register(vm, vcpuid, reg, val);
371 	}
372 	return (error);
373 }
374 
375 int
376 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
377 		    uint64_t val, int size)
378 {
379 	int error;
380 	uint64_t origval;
381 
382 	switch (size) {
383 	case 1:
384 	case 2:
385 		error = vie_read_register(vm, vcpuid, reg, &origval);
386 		if (error)
387 			return (error);
388 		val &= size2mask[size];
389 		val |= origval & ~size2mask[size];
390 		break;
391 	case 4:
392 		val &= 0xffffffffUL;
393 		break;
394 	case 8:
395 		break;
396 	default:
397 		return (EINVAL);
398 	}
399 
400 	error = vm_set_register(vm, vcpuid, reg, val);
401 	return (error);
402 }
403 
/*
 * Mask of the six %rflags status bits (carry, parity, auxiliary carry,
 * zero, sign, overflow).  The emulation routines in this file clear these
 * bits before merging in freshly computed status flags.
 */
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
405 
/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) defines the function getcc<sz>(), whose operands are
 * uint<sz>_t.  The function executes a 'sub' with 'y' as the memory source
 * operand and 'x' as the register destination, then captures the flags
 * register with 'pushfq; popq' and returns the captured value.
 *
 * The macro expansion ends in 'struct __hack' so that each instantiation
 * below can be terminated with a semicolon.
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

/* Instantiate getcc8(), getcc16(), getcc32() and getcc64(). */
GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
424 
425 static u_long
426 getcc(int opsize, uint64_t x, uint64_t y)
427 {
428 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
429 	    ("getcc: invalid operand size %d", opsize));
430 
431 	if (opsize == 1)
432 		return (getcc8(x, y));
433 	else if (opsize == 2)
434 		return (getcc16(x, y));
435 	else if (opsize == 4)
436 		return (getcc32(x, y));
437 	else
438 		return (getcc64(x, y));
439 }
440 
/*
 * Macro creation of functions getaddflags{8,16,32,64}
 *
 * Each generated function has uint<sz>_t operands.  It executes an 'add'
 * with 'y' as the memory source operand and 'x' as the register
 * destination, then captures the flags register with 'pushfq; popq' and
 * returns the captured value, i.e. the flags resulting from (x + y).
 *
 * The macro expansion ends in 'struct __hack' so that each instantiation
 * below can be terminated with a semicolon.
 */
#define	GETADDFLAGS(sz)							\
static u_long								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);
459 
460 static u_long
461 getaddflags(int opsize, uint64_t x, uint64_t y)
462 {
463 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
464 	    ("getaddflags: invalid operand size %d", opsize));
465 
466 	if (opsize == 1)
467 		return (getaddflags8(x, y));
468 	else if (opsize == 2)
469 		return (getaddflags16(x, y));
470 	else if (opsize == 4)
471 		return (getaddflags32(x, y));
472 	else
473 		return (getaddflags64(x, y));
474 }
475 
/*
 * Return the status flags that would result from doing (x & y).
 *
 * GETANDFLAGS(sz) defines the function getandflags<sz>(), whose operands
 * are uint<sz>_t.  The function executes an 'and' with 'y' as the memory
 * source operand and 'x' as the register destination, then captures the
 * flags register with 'pushfq; popq' and returns the captured value.
 *
 * The macro expansion ends in 'struct __hack' so that each instantiation
 * below can be terminated with a semicolon.
 */
#define	GETANDFLAGS(sz)							\
static u_long								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

/* Instantiate getandflags8(), getandflags16(), getandflags32(), getandflags64(). */
GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);
494 
495 static u_long
496 getandflags(int opsize, uint64_t x, uint64_t y)
497 {
498 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
499 	    ("getandflags: invalid operand size %d", opsize));
500 
501 	if (opsize == 1)
502 		return (getandflags8(x, y));
503 	else if (opsize == 2)
504 		return (getandflags16(x, y));
505 	else if (opsize == 4)
506 		return (getandflags32(x, y));
507 	else
508 		return (getandflags64(x, y));
509 }
510 
511 static int
512 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
513 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
514 {
515 	int error, size;
516 	enum vm_reg_name reg;
517 	uint8_t byte;
518 	uint64_t val;
519 
520 	size = vie->opsize;
521 	error = EINVAL;
522 
523 	switch (vie->op.op_byte) {
524 	case 0x88:
525 		/*
526 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
527 		 * 88/r:	mov r/m8, r8
528 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
529 		 */
530 		size = 1;	/* override for byte operation */
531 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
532 		if (error == 0)
533 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
534 		break;
535 	case 0x89:
536 		/*
537 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
538 		 * 89/r:	mov r/m16, r16
539 		 * 89/r:	mov r/m32, r32
540 		 * REX.W + 89/r	mov r/m64, r64
541 		 */
542 		reg = gpr_map[vie->reg];
543 		error = vie_read_register(vm, vcpuid, reg, &val);
544 		if (error == 0) {
545 			val &= size2mask[size];
546 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
547 		}
548 		break;
549 	case 0x8A:
550 		/*
551 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
552 		 * 8A/r:	mov r8, r/m8
553 		 * REX + 8A/r:	mov r8, r/m8
554 		 */
555 		size = 1;	/* override for byte operation */
556 		error = memread(vm, vcpuid, gpa, &val, size, arg);
557 		if (error == 0)
558 			error = vie_write_bytereg(vm, vcpuid, vie, val);
559 		break;
560 	case 0x8B:
561 		/*
562 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
563 		 * 8B/r:	mov r16, r/m16
564 		 * 8B/r:	mov r32, r/m32
565 		 * REX.W 8B/r:	mov r64, r/m64
566 		 */
567 		error = memread(vm, vcpuid, gpa, &val, size, arg);
568 		if (error == 0) {
569 			reg = gpr_map[vie->reg];
570 			error = vie_update_register(vm, vcpuid, reg, val, size);
571 		}
572 		break;
573 	case 0xA1:
574 		/*
575 		 * MOV from seg:moffset to AX/EAX/RAX
576 		 * A1:		mov AX, moffs16
577 		 * A1:		mov EAX, moffs32
578 		 * REX.W + A1:	mov RAX, moffs64
579 		 */
580 		error = memread(vm, vcpuid, gpa, &val, size, arg);
581 		if (error == 0) {
582 			reg = VM_REG_GUEST_RAX;
583 			error = vie_update_register(vm, vcpuid, reg, val, size);
584 		}
585 		break;
586 	case 0xA3:
587 		/*
588 		 * MOV from AX/EAX/RAX to seg:moffset
589 		 * A3:		mov moffs16, AX
590 		 * A3:		mov moffs32, EAX
591 		 * REX.W + A3:	mov moffs64, RAX
592 		 */
593 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
594 		if (error == 0) {
595 			val &= size2mask[size];
596 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
597 		}
598 		break;
599 	case 0xC6:
600 		/*
601 		 * MOV from imm8 to mem (ModRM:r/m)
602 		 * C6/0		mov r/m8, imm8
603 		 * REX + C6/0	mov r/m8, imm8
604 		 */
605 		size = 1;	/* override for byte operation */
606 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
607 		break;
608 	case 0xC7:
609 		/*
610 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
611 		 * C7/0		mov r/m16, imm16
612 		 * C7/0		mov r/m32, imm32
613 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
614 		 */
615 		val = vie->immediate & size2mask[size];
616 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
617 		break;
618 	default:
619 		break;
620 	}
621 
622 	return (error);
623 }
624 
625 static int
626 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
627 	     mem_region_read_t memread, mem_region_write_t memwrite,
628 	     void *arg)
629 {
630 	int error, size;
631 	enum vm_reg_name reg;
632 	uint64_t val;
633 
634 	size = vie->opsize;
635 	error = EINVAL;
636 
637 	switch (vie->op.op_byte) {
638 	case 0xB6:
639 		/*
640 		 * MOV and zero extend byte from mem (ModRM:r/m) to
641 		 * reg (ModRM:reg).
642 		 *
643 		 * 0F B6/r		movzx r16, r/m8
644 		 * 0F B6/r		movzx r32, r/m8
645 		 * REX.W + 0F B6/r	movzx r64, r/m8
646 		 */
647 
648 		/* get the first operand */
649 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
650 		if (error)
651 			break;
652 
653 		/* get the second operand */
654 		reg = gpr_map[vie->reg];
655 
656 		/* zero-extend byte */
657 		val = (uint8_t)val;
658 
659 		/* write the result */
660 		error = vie_update_register(vm, vcpuid, reg, val, size);
661 		break;
662 	case 0xB7:
663 		/*
664 		 * MOV and zero extend word from mem (ModRM:r/m) to
665 		 * reg (ModRM:reg).
666 		 *
667 		 * 0F B7/r		movzx r32, r/m16
668 		 * REX.W + 0F B7/r	movzx r64, r/m16
669 		 */
670 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
671 		if (error)
672 			return (error);
673 
674 		reg = gpr_map[vie->reg];
675 
676 		/* zero-extend word */
677 		val = (uint16_t)val;
678 
679 		error = vie_update_register(vm, vcpuid, reg, val, size);
680 		break;
681 	case 0xBE:
682 		/*
683 		 * MOV and sign extend byte from mem (ModRM:r/m) to
684 		 * reg (ModRM:reg).
685 		 *
686 		 * 0F BE/r		movsx r16, r/m8
687 		 * 0F BE/r		movsx r32, r/m8
688 		 * REX.W + 0F BE/r	movsx r64, r/m8
689 		 */
690 
691 		/* get the first operand */
692 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
693 		if (error)
694 			break;
695 
696 		/* get the second operand */
697 		reg = gpr_map[vie->reg];
698 
699 		/* sign extend byte */
700 		val = (int8_t)val;
701 
702 		/* write the result */
703 		error = vie_update_register(vm, vcpuid, reg, val, size);
704 		break;
705 	default:
706 		break;
707 	}
708 	return (error);
709 }
710 
711 /*
712  * Helper function to calculate and validate a linear address.
713  */
714 static int
715 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
716     int opsize, int addrsize, int prot, enum vm_reg_name seg,
717     enum vm_reg_name gpr, uint64_t *gla, int *fault)
718 {
719 	struct seg_desc desc;
720 	uint64_t cr0, val, rflags;
721 	int error __diagused;
722 
723 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
724 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
725 
726 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
727 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
728 
729 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
730 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
731 	    __func__, error, seg));
732 
733 	error = vie_read_register(vm, vcpuid, gpr, &val);
734 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
735 	    error, gpr));
736 
737 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
738 	    addrsize, prot, gla)) {
739 		if (seg == VM_REG_GUEST_SS)
740 			vm_inject_ss(vm, vcpuid, 0);
741 		else
742 			vm_inject_gp(vm, vcpuid);
743 		goto guest_fault;
744 	}
745 
746 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
747 		if (seg == VM_REG_GUEST_SS)
748 			vm_inject_ss(vm, vcpuid, 0);
749 		else
750 			vm_inject_gp(vm, vcpuid);
751 		goto guest_fault;
752 	}
753 
754 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
755 		vm_inject_ac(vm, vcpuid, 0);
756 		goto guest_fault;
757 	}
758 
759 	*fault = 0;
760 	return (0);
761 
762 guest_fault:
763 	*fault = 1;
764 	return (0);
765 }
766 
/*
 * Emulate one iteration of MOVS (0xA4 moves a byte, 0xA5 moves
 * vie->opsize bytes) where at least one side of the transfer is MMIO.
 *
 * 'gpa' is used as the MMIO address in cases (2) and (3) described below;
 * in case (4) both addresses are translated from %rsi and %rdi.  MMIO is
 * accessed through 'memread'/'memwrite' with 'arg' passed through.
 *
 * After a successful transfer %rsi and %rdi are advanced (or decremented
 * when the direction flag is set) by the operand size.  With a rep/repnz
 * prefix the count register is decremented and the instruction is
 * restarted while the count is non-zero.
 *
 * Returns 0 or an error from the memory callbacks or address translation.
 * When a fault has been injected into the guest ('fault' set) the function
 * returns without touching any registers.  On exit the error is asserted
 * to be either 0 or EFAULT.
 */
static int
emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;

	/* 0xA4 is the byte form; 0xA5 uses the decoded operand size. */
	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			/* Zero count: nothing to transfer. */
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	/* The source segment defaults to %ds unless overridden by a prefix. */
	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
	if (error || fault)
		goto done;

	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		/* The destination always uses %es:%rdi. */
		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
		    &fault);
		if (error || fault)
			goto done;

		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;    /* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
			if (error)
				goto done;

			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
			   PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
			if (error)
				goto done;

			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
			if (error)
				goto done;
		}
	}

	/* The transfer succeeded: advance the index registers. */
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	/* The direction flag selects decrementing addresses. */
	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	/* Only the low 'addrsize' bytes of each index register are updated. */
	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vm, vcpuid);
	}
done:
	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
	    __func__, error));
	return (error);
}
948 
949 static int
950 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
951     struct vm_guest_paging *paging, mem_region_read_t memread,
952     mem_region_write_t memwrite, void *arg)
953 {
954 	int error, opsize, repeat;
955 	uint64_t val;
956 	uint64_t rcx, rdi, rflags;
957 
958 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
959 	repeat = vie->repz_present | vie->repnz_present;
960 
961 	if (repeat) {
962 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
963 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
964 
965 		/*
966 		 * The count register is %rcx, %ecx or %cx depending on the
967 		 * address size of the instruction.
968 		 */
969 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
970 			return (0);
971 	}
972 
973 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
974 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
975 
976 	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
977 	if (error)
978 		return (error);
979 
980 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
981 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
982 
983 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
984 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
985 
986 	if (rflags & PSL_D)
987 		rdi -= opsize;
988 	else
989 		rdi += opsize;
990 
991 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
992 	    vie->addrsize);
993 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
994 
995 	if (repeat) {
996 		rcx = rcx - 1;
997 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
998 		    rcx, vie->addrsize);
999 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1000 
1001 		/*
1002 		 * Repeat the instruction if the count register is not zero.
1003 		 */
1004 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1005 			vm_restart_instruction(vm, vcpuid);
1006 	}
1007 
1008 	return (0);
1009 }
1010 
1011 static int
1012 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1013 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1014 {
1015 	int error, size;
1016 	enum vm_reg_name reg;
1017 	uint64_t result, rflags, rflags2, val1, val2;
1018 
1019 	size = vie->opsize;
1020 	error = EINVAL;
1021 
1022 	switch (vie->op.op_byte) {
1023 	case 0x23:
1024 		/*
1025 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1026 		 * result in reg.
1027 		 *
1028 		 * 23/r		and r16, r/m16
1029 		 * 23/r		and r32, r/m32
1030 		 * REX.W + 23/r	and r64, r/m64
1031 		 */
1032 
1033 		/* get the first operand */
1034 		reg = gpr_map[vie->reg];
1035 		error = vie_read_register(vm, vcpuid, reg, &val1);
1036 		if (error)
1037 			break;
1038 
1039 		/* get the second operand */
1040 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1041 		if (error)
1042 			break;
1043 
1044 		/* perform the operation and write the result */
1045 		result = val1 & val2;
1046 		error = vie_update_register(vm, vcpuid, reg, result, size);
1047 		break;
1048 	case 0x81:
1049 	case 0x83:
1050 		/*
1051 		 * AND mem (ModRM:r/m) with immediate and store the
1052 		 * result in mem.
1053 		 *
1054 		 * 81 /4		and r/m16, imm16
1055 		 * 81 /4		and r/m32, imm32
1056 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1057 		 *
1058 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1059 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1060 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1061 		 */
1062 
1063 		/* get the first operand */
1064                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1065                 if (error)
1066 			break;
1067 
1068                 /*
1069 		 * perform the operation with the pre-fetched immediate
1070 		 * operand and write the result
1071 		 */
1072                 result = val1 & vie->immediate;
1073                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1074 		break;
1075 	default:
1076 		break;
1077 	}
1078 	if (error)
1079 		return (error);
1080 
1081 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1082 	if (error)
1083 		return (error);
1084 
1085 	/*
1086 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1087 	 * to the result; AF is undefined.
1088 	 *
1089 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1090 	 */
1091 	rflags2 = getcc(size, result, 0);
1092 	rflags &= ~RFLAGS_STATUS_BITS;
1093 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1094 
1095 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1096 	return (error);
1097 }
1098 
1099 static int
1100 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1101 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1102 {
1103 	int error, size;
1104 	enum vm_reg_name reg;
1105 	uint64_t result, rflags, rflags2, val1, val2;
1106 
1107 	size = vie->opsize;
1108 	error = EINVAL;
1109 
1110 	switch (vie->op.op_byte) {
1111 	case 0x0B:
1112 		/*
1113 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1114 		 * result in reg.
1115 		 *
1116 		 * 0b/r         or r16, r/m16
1117 		 * 0b/r         or r32, r/m32
1118 		 * REX.W + 0b/r or r64, r/m64
1119 		 */
1120 
1121 		/* get the first operand */
1122 		reg = gpr_map[vie->reg];
1123 		error = vie_read_register(vm, vcpuid, reg, &val1);
1124 		if (error)
1125 			break;
1126 
1127 		/* get the second operand */
1128 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1129 		if (error)
1130 			break;
1131 
1132 		/* perform the operation and write the result */
1133 		result = val1 | val2;
1134 		error = vie_update_register(vm, vcpuid, reg, result, size);
1135 		break;
1136 	case 0x81:
1137 	case 0x83:
1138 		/*
1139 		 * OR mem (ModRM:r/m) with immediate and store the
1140 		 * result in mem.
1141 		 *
1142 		 * 81 /1		or r/m16, imm16
1143 		 * 81 /1		or r/m32, imm32
1144 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1145 		 *
1146 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1147 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1148 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1149 		 */
1150 
1151 		/* get the first operand */
1152                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1153                 if (error)
1154 			break;
1155 
1156                 /*
1157 		 * perform the operation with the pre-fetched immediate
1158 		 * operand and write the result
1159 		 */
1160                 result = val1 | vie->immediate;
1161                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1162 		break;
1163 	default:
1164 		break;
1165 	}
1166 	if (error)
1167 		return (error);
1168 
1169 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1170 	if (error)
1171 		return (error);
1172 
1173 	/*
1174 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1175 	 * to the result; AF is undefined.
1176 	 *
1177 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1178 	 */
1179 	rflags2 = getcc(size, result, 0);
1180 	rflags &= ~RFLAGS_STATUS_BITS;
1181 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1182 
1183 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1184 	return (error);
1185 }
1186 
1187 static int
1188 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1189 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1190 {
1191 	int error, size;
1192 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1193 	enum vm_reg_name reg;
1194 
1195 	size = vie->opsize;
1196 	switch (vie->op.op_byte) {
1197 	case 0x39:
1198 	case 0x3B:
1199 		/*
1200 		 * 39/r		CMP r/m16, r16
1201 		 * 39/r		CMP r/m32, r32
1202 		 * REX.W 39/r	CMP r/m64, r64
1203 		 *
1204 		 * 3B/r		CMP r16, r/m16
1205 		 * 3B/r		CMP r32, r/m32
1206 		 * REX.W + 3B/r	CMP r64, r/m64
1207 		 *
1208 		 * Compare the first operand with the second operand and
1209 		 * set status flags in EFLAGS register. The comparison is
1210 		 * performed by subtracting the second operand from the first
1211 		 * operand and then setting the status flags.
1212 		 */
1213 
1214 		/* Get the register operand */
1215 		reg = gpr_map[vie->reg];
1216 		error = vie_read_register(vm, vcpuid, reg, &regop);
1217 		if (error)
1218 			return (error);
1219 
1220 		/* Get the memory operand */
1221 		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1222 		if (error)
1223 			return (error);
1224 
1225 		if (vie->op.op_byte == 0x3B) {
1226 			op1 = regop;
1227 			op2 = memop;
1228 		} else {
1229 			op1 = memop;
1230 			op2 = regop;
1231 		}
1232 		rflags2 = getcc(size, op1, op2);
1233 		break;
1234 	case 0x80:
1235 	case 0x81:
1236 	case 0x83:
1237 		/*
1238 		 * 80 /7		cmp r/m8, imm8
1239 		 * REX + 80 /7		cmp r/m8, imm8
1240 		 *
1241 		 * 81 /7		cmp r/m16, imm16
1242 		 * 81 /7		cmp r/m32, imm32
1243 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1244 		 *
1245 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1246 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1247 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1248 		 *
1249 		 * Compare mem (ModRM:r/m) with immediate and set
1250 		 * status flags according to the results.  The
1251 		 * comparison is performed by subtracting the
1252 		 * immediate from the first operand and then setting
1253 		 * the status flags.
1254 		 *
1255 		 */
1256 		if (vie->op.op_byte == 0x80)
1257 			size = 1;
1258 
1259 		/* get the first operand */
1260                 error = memread(vm, vcpuid, gpa, &op1, size, arg);
1261 		if (error)
1262 			return (error);
1263 
1264 		rflags2 = getcc(size, op1, vie->immediate);
1265 		break;
1266 	default:
1267 		return (EINVAL);
1268 	}
1269 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1270 	if (error)
1271 		return (error);
1272 	rflags &= ~RFLAGS_STATUS_BITS;
1273 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1274 
1275 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1276 	return (error);
1277 }
1278 
1279 static int
1280 emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1281     mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1282 {
1283 	int error, size;
1284 	uint64_t op1, rflags, rflags2;
1285 
1286 	size = vie->opsize;
1287 	error = EINVAL;
1288 
1289 	switch (vie->op.op_byte) {
1290 	case 0xF7:
1291 		/*
1292 		 * F7 /0		test r/m16, imm16
1293 		 * F7 /0		test r/m32, imm32
1294 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1295 		 *
1296 		 * Test mem (ModRM:r/m) with immediate and set status
1297 		 * flags according to the results.  The comparison is
1298 		 * performed by anding the immediate from the first
1299 		 * operand and then setting the status flags.
1300 		 */
1301 		if ((vie->reg & 7) != 0)
1302 			return (EINVAL);
1303 
1304 		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1305 		if (error)
1306 			return (error);
1307 
1308 		rflags2 = getandflags(size, op1, vie->immediate);
1309 		break;
1310 	default:
1311 		return (EINVAL);
1312 	}
1313 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1314 	if (error)
1315 		return (error);
1316 
1317 	/*
1318 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1319 	 * to the result; AF is undefined.
1320 	 */
1321 	rflags &= ~RFLAGS_STATUS_BITS;
1322 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1323 
1324 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1325 	return (error);
1326 }
1327 
1328 static int
1329 emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1330     struct vm_guest_paging *paging, mem_region_read_t memread,
1331     mem_region_write_t memwrite, void *arg)
1332 {
1333 	uint64_t src1, src2, dst, rflags;
1334 	unsigned start, len;
1335 	int error, size;
1336 
1337 	size = vie->opsize;
1338 	error = EINVAL;
1339 
1340 	/*
1341 	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
1342 	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
1343 	 *
1344 	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
1345 	 * Vex.vvvv.
1346 	 *
1347 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1348 	 */
1349 	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1350 		size = 4;
1351 
1352 	/*
1353 	 * Extracts contiguous bits from the first /source/ operand (second
1354 	 * operand) using an index and length specified in the second /source/
1355 	 * operand (third operand).
1356 	 */
1357 	error = memread(vm, vcpuid, gpa, &src1, size, arg);
1358 	if (error)
1359 		return (error);
1360 	error = vie_read_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
1361 	if (error)
1362 		return (error);
1363 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1364 	if (error)
1365 		return (error);
1366 
1367 	start = (src2 & 0xff);
1368 	len = (src2 & 0xff00) >> 8;
1369 
1370 	/* If no bits are extracted, the destination register is cleared. */
1371 	dst = 0;
1372 
1373 	/* If START exceeds the operand size, no bits are extracted. */
1374 	if (start > size * 8)
1375 		goto done;
1376 	/* Length is bounded by both the destination size and start offset. */
1377 	if (start + len > size * 8)
1378 		len = (size * 8) - start;
1379 	if (len == 0)
1380 		goto done;
1381 
1382 	if (start > 0)
1383 		src1 = (src1 >> start);
1384 	if (len < 64)
1385 		src1 = src1 & ((1ull << len) - 1);
1386 	dst = src1;
1387 
1388 done:
1389 	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
1390 	if (error)
1391 		return (error);
1392 
1393 	/*
1394 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1395 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1396 	 */
1397 	rflags &= ~RFLAGS_STATUS_BITS;
1398 	if (dst == 0)
1399 		rflags |= PSL_Z;
1400 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
1401 	    8);
1402 	return (error);
1403 }
1404 
1405 static int
1406 emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1407 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1408 {
1409 	int error, size;
1410 	uint64_t nval, rflags, rflags2, val1, val2;
1411 	enum vm_reg_name reg;
1412 
1413 	size = vie->opsize;
1414 	error = EINVAL;
1415 
1416 	switch (vie->op.op_byte) {
1417 	case 0x03:
1418 		/*
1419 		 * ADD r/m to r and store the result in r
1420 		 *
1421 		 * 03/r            ADD r16, r/m16
1422 		 * 03/r            ADD r32, r/m32
1423 		 * REX.W + 03/r    ADD r64, r/m64
1424 		 */
1425 
1426 		/* get the first operand */
1427 		reg = gpr_map[vie->reg];
1428 		error = vie_read_register(vm, vcpuid, reg, &val1);
1429 		if (error)
1430 			break;
1431 
1432 		/* get the second operand */
1433 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1434 		if (error)
1435 			break;
1436 
1437 		/* perform the operation and write the result */
1438 		nval = val1 + val2;
1439 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1440 		break;
1441 	default:
1442 		break;
1443 	}
1444 
1445 	if (!error) {
1446 		rflags2 = getaddflags(size, val1, val2);
1447 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1448 		    &rflags);
1449 		if (error)
1450 			return (error);
1451 
1452 		rflags &= ~RFLAGS_STATUS_BITS;
1453 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1454 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1455 		    rflags, 8);
1456 	}
1457 
1458 	return (error);
1459 }
1460 
1461 static int
1462 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1463 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1464 {
1465 	int error, size;
1466 	uint64_t nval, rflags, rflags2, val1, val2;
1467 	enum vm_reg_name reg;
1468 
1469 	size = vie->opsize;
1470 	error = EINVAL;
1471 
1472 	switch (vie->op.op_byte) {
1473 	case 0x2B:
1474 		/*
1475 		 * SUB r/m from r and store the result in r
1476 		 *
1477 		 * 2B/r            SUB r16, r/m16
1478 		 * 2B/r            SUB r32, r/m32
1479 		 * REX.W + 2B/r    SUB r64, r/m64
1480 		 */
1481 
1482 		/* get the first operand */
1483 		reg = gpr_map[vie->reg];
1484 		error = vie_read_register(vm, vcpuid, reg, &val1);
1485 		if (error)
1486 			break;
1487 
1488 		/* get the second operand */
1489 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1490 		if (error)
1491 			break;
1492 
1493 		/* perform the operation and write the result */
1494 		nval = val1 - val2;
1495 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1496 		break;
1497 	default:
1498 		break;
1499 	}
1500 
1501 	if (!error) {
1502 		rflags2 = getcc(size, val1, val2);
1503 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1504 		    &rflags);
1505 		if (error)
1506 			return (error);
1507 
1508 		rflags &= ~RFLAGS_STATUS_BITS;
1509 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1510 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1511 		    rflags, 8);
1512 	}
1513 
1514 	return (error);
1515 }
1516 
1517 static int
1518 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1519     struct vm_guest_paging *paging, mem_region_read_t memread,
1520     mem_region_write_t memwrite, void *arg)
1521 {
1522 #ifdef _KERNEL
1523 	struct vm_copyinfo copyinfo[2];
1524 #else
1525 	struct iovec copyinfo[2];
1526 #endif
1527 	struct seg_desc ss_desc;
1528 	uint64_t cr0, rflags, rsp, stack_gla, val;
1529 	int error, fault, size, stackaddrsize, pushop;
1530 
1531 	val = 0;
1532 	size = vie->opsize;
1533 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1534 
1535 	/*
1536 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1537 	 */
1538 	if (paging->cpu_mode == CPU_MODE_REAL) {
1539 		stackaddrsize = 2;
1540 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1541 		/*
1542 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1543 		 * - Stack pointer size is always 64-bits.
1544 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1545 		 * - 16-bit PUSH/POP is supported by using the operand size
1546 		 *   override prefix (66H).
1547 		 */
1548 		stackaddrsize = 8;
1549 		size = vie->opsize_override ? 2 : 8;
1550 	} else {
1551 		/*
1552 		 * In protected or compatibility mode the 'B' flag in the
1553 		 * stack-segment descriptor determines the size of the
1554 		 * stack pointer.
1555 		 */
1556 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1557 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1558 		    __func__, error));
1559 		if (SEG_DESC_DEF32(ss_desc.access))
1560 			stackaddrsize = 4;
1561 		else
1562 			stackaddrsize = 2;
1563 	}
1564 
1565 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1566 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1567 
1568 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1569 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1570 
1571 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1572 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1573 	if (pushop) {
1574 		rsp -= size;
1575 	}
1576 
1577 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1578 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1579 	    &stack_gla)) {
1580 		vm_inject_ss(vm, vcpuid, 0);
1581 		return (0);
1582 	}
1583 
1584 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1585 		vm_inject_ss(vm, vcpuid, 0);
1586 		return (0);
1587 	}
1588 
1589 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1590 		vm_inject_ac(vm, vcpuid, 0);
1591 		return (0);
1592 	}
1593 
1594 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1595 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1596 	    &fault);
1597 	if (error || fault)
1598 		return (error);
1599 
1600 	if (pushop) {
1601 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1602 		if (error == 0)
1603 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1604 	} else {
1605 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1606 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1607 		rsp += size;
1608 	}
1609 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1610 
1611 	if (error == 0) {
1612 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1613 		    stackaddrsize);
1614 		KASSERT(error == 0, ("error %d updating rsp", error));
1615 	}
1616 	return (error);
1617 }
1618 
1619 static int
1620 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1621     struct vm_guest_paging *paging, mem_region_read_t memread,
1622     mem_region_write_t memwrite, void *arg)
1623 {
1624 	int error;
1625 
1626 	/*
1627 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1628 	 *
1629 	 * PUSH is part of the group 5 extended opcodes and is identified
1630 	 * by ModRM:reg = b110.
1631 	 */
1632 	if ((vie->reg & 7) != 6)
1633 		return (EINVAL);
1634 
1635 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1636 	    memwrite, arg);
1637 	return (error);
1638 }
1639 
1640 static int
1641 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1642     struct vm_guest_paging *paging, mem_region_read_t memread,
1643     mem_region_write_t memwrite, void *arg)
1644 {
1645 	int error;
1646 
1647 	/*
1648 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1649 	 *
1650 	 * POP is part of the group 1A extended opcodes and is identified
1651 	 * by ModRM:reg = b000.
1652 	 */
1653 	if ((vie->reg & 7) != 0)
1654 		return (EINVAL);
1655 
1656 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1657 	    memwrite, arg);
1658 	return (error);
1659 }
1660 
1661 static int
1662 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1663     struct vm_guest_paging *paging, mem_region_read_t memread,
1664     mem_region_write_t memwrite, void *memarg)
1665 {
1666 	int error;
1667 
1668 	switch (vie->reg & 7) {
1669 	case 0x1:	/* OR */
1670 		error = emulate_or(vm, vcpuid, gpa, vie,
1671 		    memread, memwrite, memarg);
1672 		break;
1673 	case 0x4:	/* AND */
1674 		error = emulate_and(vm, vcpuid, gpa, vie,
1675 		    memread, memwrite, memarg);
1676 		break;
1677 	case 0x7:	/* CMP */
1678 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1679 		    memread, memwrite, memarg);
1680 		break;
1681 	default:
1682 		error = EINVAL;
1683 		break;
1684 	}
1685 
1686 	return (error);
1687 }
1688 
1689 static int
1690 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1691     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1692 {
1693 	uint64_t val, rflags;
1694 	int error, bitmask, bitoff;
1695 
1696 	/*
1697 	 * 0F BA is a Group 8 extended opcode.
1698 	 *
1699 	 * Currently we only emulate the 'Bit Test' instruction which is
1700 	 * identified by a ModR/M:reg encoding of 100b.
1701 	 */
1702 	if ((vie->reg & 7) != 4)
1703 		return (EINVAL);
1704 
1705 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1706 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1707 
1708 	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1709 	if (error)
1710 		return (error);
1711 
1712 	/*
1713 	 * Intel SDM, Vol 2, Table 3-2:
1714 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1715 	 */
1716 	bitmask = vie->opsize * 8 - 1;
1717 	bitoff = vie->immediate & bitmask;
1718 
1719 	/* Copy the bit into the Carry flag in %rflags */
1720 	if (val & (1UL << bitoff))
1721 		rflags |= PSL_C;
1722 	else
1723 		rflags &= ~PSL_C;
1724 
1725 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1726 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1727 
1728 	return (0);
1729 }
1730 
1731 static int
1732 emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1733     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1734 {
1735 	int error;
1736 	uint64_t buf;
1737 
1738 	switch (vie->reg & 7) {
1739 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1740 		if (vie->mod == 0x3) {
1741 			/*
1742 			 * SFENCE.  Ignore it, VM exit provides enough
1743 			 * barriers on its own.
1744 			 */
1745 			error = 0;
1746 		} else {
1747 			/*
1748 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1749 			 * rights.
1750 			 */
1751 			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
1752 		}
1753 		break;
1754 	default:
1755 		error = EINVAL;
1756 		break;
1757 	}
1758 
1759 	return (error);
1760 }
1761 
1762 int
1763 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1764     struct vm_guest_paging *paging, mem_region_read_t memread,
1765     mem_region_write_t memwrite, void *memarg)
1766 {
1767 	int error;
1768 
1769 	if (!vie->decoded)
1770 		return (EINVAL);
1771 
1772 	switch (vie->op.op_type) {
1773 	case VIE_OP_TYPE_GROUP1:
1774 		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1775 		    memwrite, memarg);
1776 		break;
1777 	case VIE_OP_TYPE_POP:
1778 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1779 		    memwrite, memarg);
1780 		break;
1781 	case VIE_OP_TYPE_PUSH:
1782 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1783 		    memwrite, memarg);
1784 		break;
1785 	case VIE_OP_TYPE_CMP:
1786 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1787 				    memread, memwrite, memarg);
1788 		break;
1789 	case VIE_OP_TYPE_MOV:
1790 		error = emulate_mov(vm, vcpuid, gpa, vie,
1791 				    memread, memwrite, memarg);
1792 		break;
1793 	case VIE_OP_TYPE_MOVSX:
1794 	case VIE_OP_TYPE_MOVZX:
1795 		error = emulate_movx(vm, vcpuid, gpa, vie,
1796 				     memread, memwrite, memarg);
1797 		break;
1798 	case VIE_OP_TYPE_MOVS:
1799 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1800 		    memwrite, memarg);
1801 		break;
1802 	case VIE_OP_TYPE_STOS:
1803 		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1804 		    memwrite, memarg);
1805 		break;
1806 	case VIE_OP_TYPE_AND:
1807 		error = emulate_and(vm, vcpuid, gpa, vie,
1808 				    memread, memwrite, memarg);
1809 		break;
1810 	case VIE_OP_TYPE_OR:
1811 		error = emulate_or(vm, vcpuid, gpa, vie,
1812 				    memread, memwrite, memarg);
1813 		break;
1814 	case VIE_OP_TYPE_SUB:
1815 		error = emulate_sub(vm, vcpuid, gpa, vie,
1816 				    memread, memwrite, memarg);
1817 		break;
1818 	case VIE_OP_TYPE_BITTEST:
1819 		error = emulate_bittest(vm, vcpuid, gpa, vie,
1820 		    memread, memwrite, memarg);
1821 		break;
1822 	case VIE_OP_TYPE_TWOB_GRP15:
1823 		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
1824 		    memread, memwrite, memarg);
1825 		break;
1826 	case VIE_OP_TYPE_ADD:
1827 		error = emulate_add(vm, vcpuid, gpa, vie, memread,
1828 		    memwrite, memarg);
1829 		break;
1830 	case VIE_OP_TYPE_TEST:
1831 		error = emulate_test(vm, vcpuid, gpa, vie,
1832 		    memread, memwrite, memarg);
1833 		break;
1834 	case VIE_OP_TYPE_BEXTR:
1835 		error = emulate_bextr(vm, vcpuid, gpa, vie, paging,
1836 		    memread, memwrite, memarg);
1837 		break;
1838 	default:
1839 		error = EINVAL;
1840 		break;
1841 	}
1842 
1843 	return (error);
1844 }
1845 
1846 int
1847 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1848 {
1849 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1850 	    ("%s: invalid size %d", __func__, size));
1851 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1852 
1853 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1854 		return (0);
1855 
1856 	return ((gla & (size - 1)) ? 1 : 0);
1857 }
1858 
1859 int
1860 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1861 {
1862 	uint64_t mask;
1863 
1864 	if (cpu_mode != CPU_MODE_64BIT)
1865 		return (0);
1866 
1867 	/*
1868 	 * The value of the bit 47 in the 'gla' should be replicated in the
1869 	 * most significant 16 bits.
1870 	 */
1871 	mask = ~((1UL << 48) - 1);
1872 	if (gla & (1UL << 47))
1873 		return ((gla & mask) != mask);
1874 	else
1875 		return ((gla & mask) != 0);
1876 }
1877 
1878 uint64_t
1879 vie_size2mask(int size)
1880 {
1881 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1882 	    ("vie_size2mask: invalid size %d", size));
1883 	return (size2mask[size]);
1884 }
1885 
1886 int
1887 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1888     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1889     int prot, uint64_t *gla)
1890 {
1891 	uint64_t firstoff, low_limit, high_limit, segbase;
1892 	int glasize, type;
1893 
1894 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1895 	    ("%s: invalid segment %d", __func__, seg));
1896 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1897 	    ("%s: invalid operand size %d", __func__, length));
1898 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1899 	    ("%s: invalid prot %#x", __func__, prot));
1900 
1901 	firstoff = offset;
1902 	if (cpu_mode == CPU_MODE_64BIT) {
1903 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1904 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1905 		glasize = 8;
1906 	} else {
1907 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1908 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1909 		glasize = 4;
1910 		/*
1911 		 * If the segment selector is loaded with a NULL selector
1912 		 * then the descriptor is unusable and attempting to use
1913 		 * it results in a #GP(0).
1914 		 */
1915 		if (SEG_DESC_UNUSABLE(desc->access))
1916 			return (-1);
1917 
1918 		/*
1919 		 * The processor generates a #NP exception when a segment
1920 		 * register is loaded with a selector that points to a
1921 		 * descriptor that is not present. If this was the case then
1922 		 * it would have been checked before the VM-exit.
1923 		 */
1924 		KASSERT(SEG_DESC_PRESENT(desc->access),
1925 		    ("segment %d not present: %#x", seg, desc->access));
1926 
1927 		/*
1928 		 * The descriptor type must indicate a code/data segment.
1929 		 */
1930 		type = SEG_DESC_TYPE(desc->access);
1931 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1932 		    "descriptor type %#x", seg, type));
1933 
1934 		if (prot & PROT_READ) {
1935 			/* #GP on a read access to a exec-only code segment */
1936 			if ((type & 0xA) == 0x8)
1937 				return (-1);
1938 		}
1939 
1940 		if (prot & PROT_WRITE) {
1941 			/*
1942 			 * #GP on a write access to a code segment or a
1943 			 * read-only data segment.
1944 			 */
1945 			if (type & 0x8)			/* code segment */
1946 				return (-1);
1947 
1948 			if ((type & 0xA) == 0)		/* read-only data seg */
1949 				return (-1);
1950 		}
1951 
1952 		/*
1953 		 * 'desc->limit' is fully expanded taking granularity into
1954 		 * account.
1955 		 */
1956 		if ((type & 0xC) == 0x4) {
1957 			/* expand-down data segment */
1958 			low_limit = desc->limit + 1;
1959 			high_limit = SEG_DESC_DEF32(desc->access) ?
1960 			    0xffffffff : 0xffff;
1961 		} else {
1962 			/* code segment or expand-up data segment */
1963 			low_limit = 0;
1964 			high_limit = desc->limit;
1965 		}
1966 
1967 		while (length > 0) {
1968 			offset &= vie_size2mask(addrsize);
1969 			if (offset < low_limit || offset > high_limit)
1970 				return (-1);
1971 			offset++;
1972 			length--;
1973 		}
1974 	}
1975 
1976 	/*
1977 	 * In 64-bit mode all segments except %fs and %gs have a segment
1978 	 * base address of 0.
1979 	 */
1980 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1981 	    seg != VM_REG_GUEST_GS) {
1982 		segbase = 0;
1983 	} else {
1984 		segbase = desc->base;
1985 	}
1986 
1987 	/*
1988 	 * Truncate 'firstoff' to the effective address size before adding
1989 	 * it to the segment base.
1990 	 */
1991 	firstoff &= vie_size2mask(addrsize);
1992 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1993 	return (0);
1994 }
1995 
1996 /*
1997  * Prepare a partially decoded vie for a 2nd attempt.
1998  */
1999 void
2000 vie_restart(struct vie *vie)
2001 {
2002 	_Static_assert(
2003 	    offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
2004 	    offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
2005 	    "restart should not erase instruction length or contents");
2006 
2007 	memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
2008 	    sizeof(*vie) - offsetof(struct vie, vie_startzero));
2009 
2010 	vie->base_register = VM_REG_LAST;
2011 	vie->index_register = VM_REG_LAST;
2012 	vie->segment_register = VM_REG_LAST;
2013 }
2014 
2015 void
2016 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
2017 {
2018 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
2019 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
2020 
2021 	vie_restart(vie);
2022 	memset(vie->inst, 0, sizeof(vie->inst));
2023 	if (inst_length != 0)
2024 		memcpy(vie->inst, inst_bytes, inst_length);
2025 	vie->num_valid = inst_length;
2026 }
2027 
2028 #ifdef _KERNEL
2029 static int
2030 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2031 {
2032 	int error_code = 0;
2033 
2034 	if (pte & PG_V)
2035 		error_code |= PGEX_P;
2036 	if (prot & VM_PROT_WRITE)
2037 		error_code |= PGEX_W;
2038 	if (usermode)
2039 		error_code |= PGEX_U;
2040 	if (rsvd)
2041 		error_code |= PGEX_RSV;
2042 	if (prot & VM_PROT_EXECUTE)
2043 		error_code |= PGEX_I;
2044 
2045 	return (error_code);
2046 }
2047 
/*
 * Release the hold identified by '*cookie', if there is one, and reset the
 * cookie to NULL so that a repeated call is a no-op.
 */
static void
ptp_release(void **cookie)
{

	if (*cookie == NULL)
		return;

	vm_gpa_release(*cookie);
	*cookie = NULL;
}
2056 
2057 static void *
2058 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
2059 {
2060 	void *ptr;
2061 
2062 	ptp_release(cookie);
2063 	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
2064 	return (ptr);
2065 }
2066 
2067 static int
2068 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2069     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
2070 {
2071 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
2072 	u_int retries;
2073 	uint64_t *ptpbase, ptpphys, pte, pgsize;
2074 	uint32_t *ptpbase32, pte32;
2075 	void *cookie;
2076 
2077 	*guest_fault = 0;
2078 
2079 	usermode = (paging->cpl == 3 ? 1 : 0);
2080 	writable = prot & VM_PROT_WRITE;
2081 	cookie = NULL;
2082 	retval = 0;
2083 	retries = 0;
2084 restart:
2085 	ptpphys = paging->cr3;		/* root of the page tables */
2086 	ptp_release(&cookie);
2087 	if (retries++ > 0)
2088 		maybe_yield();
2089 
2090 	if (vie_canonical_check(paging->cpu_mode, gla)) {
2091 		/*
2092 		 * XXX assuming a non-stack reference otherwise a stack fault
2093 		 * should be generated.
2094 		 */
2095 		if (!check_only)
2096 			vm_inject_gp(vm, vcpuid);
2097 		goto fault;
2098 	}
2099 
2100 	if (paging->paging_mode == PAGING_MODE_FLAT) {
2101 		*gpa = gla;
2102 		goto done;
2103 	}
2104 
2105 	if (paging->paging_mode == PAGING_MODE_32) {
2106 		nlevels = 2;
2107 		while (--nlevels >= 0) {
2108 			/* Zero out the lower 12 bits. */
2109 			ptpphys &= ~0xfff;
2110 
2111 			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
2112 			    &cookie);
2113 
2114 			if (ptpbase32 == NULL)
2115 				goto error;
2116 
2117 			ptpshift = PAGE_SHIFT + nlevels * 10;
2118 			ptpindex = (gla >> ptpshift) & 0x3FF;
2119 			pgsize = 1UL << ptpshift;
2120 
2121 			pte32 = ptpbase32[ptpindex];
2122 
2123 			if ((pte32 & PG_V) == 0 ||
2124 			    (usermode && (pte32 & PG_U) == 0) ||
2125 			    (writable && (pte32 & PG_RW) == 0)) {
2126 				if (!check_only) {
2127 					pfcode = pf_error_code(usermode, prot, 0,
2128 					    pte32);
2129 					vm_inject_pf(vm, vcpuid, pfcode, gla);
2130 				}
2131 				goto fault;
2132 			}
2133 
2134 			/*
2135 			 * Emulate the x86 MMU's management of the accessed
2136 			 * and dirty flags. While the accessed flag is set
2137 			 * at every level of the page table, the dirty flag
2138 			 * is only set at the last level providing the guest
2139 			 * physical address.
2140 			 */
2141 			if (!check_only && (pte32 & PG_A) == 0) {
2142 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
2143 				    pte32, pte32 | PG_A) == 0) {
2144 					goto restart;
2145 				}
2146 			}
2147 
2148 			/* XXX must be ignored if CR4.PSE=0 */
2149 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
2150 				break;
2151 
2152 			ptpphys = pte32;
2153 		}
2154 
2155 		/* Set the dirty bit in the page table entry if necessary */
2156 		if (!check_only && writable && (pte32 & PG_M) == 0) {
2157 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
2158 			    pte32, pte32 | PG_M) == 0) {
2159 				goto restart;
2160 			}
2161 		}
2162 
2163 		/* Zero out the lower 'ptpshift' bits */
2164 		pte32 >>= ptpshift; pte32 <<= ptpshift;
2165 		*gpa = pte32 | (gla & (pgsize - 1));
2166 		goto done;
2167 	}
2168 
2169 	if (paging->paging_mode == PAGING_MODE_PAE) {
2170 		/* Zero out the lower 5 bits and the upper 32 bits */
2171 		ptpphys &= 0xffffffe0UL;
2172 
2173 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
2174 		    &cookie);
2175 		if (ptpbase == NULL)
2176 			goto error;
2177 
2178 		ptpindex = (gla >> 30) & 0x3;
2179 
2180 		pte = ptpbase[ptpindex];
2181 
2182 		if ((pte & PG_V) == 0) {
2183 			if (!check_only) {
2184 				pfcode = pf_error_code(usermode, prot, 0, pte);
2185 				vm_inject_pf(vm, vcpuid, pfcode, gla);
2186 			}
2187 			goto fault;
2188 		}
2189 
2190 		ptpphys = pte;
2191 
2192 		nlevels = 2;
2193 	} else if (paging->paging_mode == PAGING_MODE_64_LA57) {
2194 		nlevels = 5;
2195 	} else {
2196 		nlevels = 4;
2197 	}
2198 
2199 	while (--nlevels >= 0) {
2200 		/* Zero out the lower 12 bits and the upper 12 bits */
2201 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
2202 
2203 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
2204 		if (ptpbase == NULL)
2205 			goto error;
2206 
2207 		ptpshift = PAGE_SHIFT + nlevels * 9;
2208 		ptpindex = (gla >> ptpshift) & 0x1FF;
2209 		pgsize = 1UL << ptpshift;
2210 
2211 		pte = ptpbase[ptpindex];
2212 
2213 		if ((pte & PG_V) == 0 ||
2214 		    (usermode && (pte & PG_U) == 0) ||
2215 		    (writable && (pte & PG_RW) == 0)) {
2216 			if (!check_only) {
2217 				pfcode = pf_error_code(usermode, prot, 0, pte);
2218 				vm_inject_pf(vm, vcpuid, pfcode, gla);
2219 			}
2220 			goto fault;
2221 		}
2222 
2223 		/* Set the accessed bit in the page table entry */
2224 		if (!check_only && (pte & PG_A) == 0) {
2225 			if (atomic_cmpset_64(&ptpbase[ptpindex],
2226 			    pte, pte | PG_A) == 0) {
2227 				goto restart;
2228 			}
2229 		}
2230 
2231 		if (nlevels > 0 && (pte & PG_PS) != 0) {
2232 			if (pgsize > 1 * GB) {
2233 				if (!check_only) {
2234 					pfcode = pf_error_code(usermode, prot, 1,
2235 					    pte);
2236 					vm_inject_pf(vm, vcpuid, pfcode, gla);
2237 				}
2238 				goto fault;
2239 			}
2240 			break;
2241 		}
2242 
2243 		ptpphys = pte;
2244 	}
2245 
2246 	/* Set the dirty bit in the page table entry if necessary */
2247 	if (!check_only && writable && (pte & PG_M) == 0) {
2248 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2249 			goto restart;
2250 	}
2251 
2252 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2253 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2254 	*gpa = pte | (gla & (pgsize - 1));
2255 done:
2256 	ptp_release(&cookie);
2257 	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2258 	    __func__, retval));
2259 	return (retval);
2260 error:
2261 	retval = EFAULT;
2262 	goto done;
2263 fault:
2264 	*guest_fault = 1;
2265 	goto done;
2266 }
2267 
/*
 * Translate the guest linear address 'gla' into a guest physical address
 * by delegating to _vm_gla2gpa() with 'check_only' disabled.
 *
 * With 'check_only' disabled the page-table walk injects a page fault into
 * the guest when a translation is rejected and updates the accessed/dirty
 * bits of the guest page table entries it uses.
 *
 * The walker returns 0 or EFAULT.  '*guest_fault' is set to 1 when the walk
 * faulted; on a successful translation the result is stored in '*gpa'.
 */
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = false;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
2276 
/*
 * Translate the guest linear address 'gla' into a guest physical address
 * by delegating to _vm_gla2gpa() with 'check_only' enabled.
 *
 * With 'check_only' enabled the page-table walk neither injects a page
 * fault into the guest nor modifies the accessed/dirty bits of the guest
 * page table entries; a rejected translation is only reported back through
 * '*guest_fault'.
 *
 * The walker returns 0 or EFAULT; on a successful translation the result
 * is stored in '*gpa'.
 */
int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = true;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
2285 
2286 int
2287 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2288     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2289 {
2290 	struct vm_copyinfo copyinfo[2];
2291 	int error, prot;
2292 
2293 	if (inst_length > VIE_INST_SIZE)
2294 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
2295 
2296 	prot = PROT_READ | PROT_EXEC;
2297 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
2298 	    copyinfo, nitems(copyinfo), faultptr);
2299 	if (error || *faultptr)
2300 		return (error);
2301 
2302 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
2303 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2304 	vie->num_valid = inst_length;
2305 	return (0);
2306 }
2307 #endif	/* _KERNEL */
2308 
2309 static int
2310 vie_peek(struct vie *vie, uint8_t *x)
2311 {
2312 
2313 	if (vie->num_processed < vie->num_valid) {
2314 		*x = vie->inst[vie->num_processed];
2315 		return (0);
2316 	} else
2317 		return (-1);
2318 }
2319 
2320 static void
2321 vie_advance(struct vie *vie)
2322 {
2323 
2324 	vie->num_processed++;
2325 }
2326 
2327 static bool
2328 segment_override(uint8_t x, int *seg)
2329 {
2330 
2331 	switch (x) {
2332 	case 0x2E:
2333 		*seg = VM_REG_GUEST_CS;
2334 		break;
2335 	case 0x36:
2336 		*seg = VM_REG_GUEST_SS;
2337 		break;
2338 	case 0x3E:
2339 		*seg = VM_REG_GUEST_DS;
2340 		break;
2341 	case 0x26:
2342 		*seg = VM_REG_GUEST_ES;
2343 		break;
2344 	case 0x64:
2345 		*seg = VM_REG_GUEST_FS;
2346 		break;
2347 	case 0x65:
2348 		*seg = VM_REG_GUEST_GS;
2349 		break;
2350 	default:
2351 		return (false);
2352 	}
2353 	return (true);
2354 }
2355 
2356 static int
2357 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2358 {
2359 	uint8_t x;
2360 
2361 	while (1) {
2362 		if (vie_peek(vie, &x))
2363 			return (-1);
2364 
2365 		if (x == 0x66)
2366 			vie->opsize_override = 1;
2367 		else if (x == 0x67)
2368 			vie->addrsize_override = 1;
2369 		else if (x == 0xF3)
2370 			vie->repz_present = 1;
2371 		else if (x == 0xF2)
2372 			vie->repnz_present = 1;
2373 		else if (segment_override(x, &vie->segment_register))
2374 			vie->segment_override = 1;
2375 		else
2376 			break;
2377 
2378 		vie_advance(vie);
2379 	}
2380 
2381 	/*
2382 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2383 	 * - Only one REX prefix is allowed per instruction.
2384 	 * - The REX prefix must immediately precede the opcode byte or the
2385 	 *   escape opcode byte.
2386 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2387 	 *   the mandatory prefix must come before the REX prefix.
2388 	 */
2389 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2390 		vie->rex_present = 1;
2391 		vie->rex_w = x & 0x8 ? 1 : 0;
2392 		vie->rex_r = x & 0x4 ? 1 : 0;
2393 		vie->rex_x = x & 0x2 ? 1 : 0;
2394 		vie->rex_b = x & 0x1 ? 1 : 0;
2395 		vie_advance(vie);
2396 	}
2397 
2398 	/*
2399 	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
2400 	 */
2401 	if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
2402 	    && x == 0xC4) {
2403 		const struct vie_op *optab;
2404 
2405 		/* 3-byte VEX prefix. */
2406 		vie->vex_present = 1;
2407 
2408 		vie_advance(vie);
2409 		if (vie_peek(vie, &x))
2410 			return (-1);
2411 
2412 		/*
2413 		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
2414 		 * relative to REX encoding.
2415 		 */
2416 		vie->rex_r = x & 0x80 ? 0 : 1;
2417 		vie->rex_x = x & 0x40 ? 0 : 1;
2418 		vie->rex_b = x & 0x20 ? 0 : 1;
2419 
2420 		switch (x & 0x1F) {
2421 		case 0x2:
2422 			/* 0F 38. */
2423 			optab = three_byte_opcodes_0f38;
2424 			break;
2425 		case 0x1:
2426 			/* 0F class - nothing handled here yet. */
2427 			/* FALLTHROUGH */
2428 		case 0x3:
2429 			/* 0F 3A class - nothing handled here yet. */
2430 			/* FALLTHROUGH */
2431 		default:
2432 			/* Reserved (#UD). */
2433 			return (-1);
2434 		}
2435 
2436 		vie_advance(vie);
2437 		if (vie_peek(vie, &x))
2438 			return (-1);
2439 
2440 		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
2441 		vie->rex_w = x & 0x80 ? 1 : 0;
2442 
2443 		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
2444 		vie->vex_l = !!(x & 0x4);
2445 		vie->vex_pp = (x & 0x3);
2446 
2447 		/* PP: 1=66 2=F3 3=F2 prefixes. */
2448 		switch (vie->vex_pp) {
2449 		case 0x1:
2450 			vie->opsize_override = 1;
2451 			break;
2452 		case 0x2:
2453 			vie->repz_present = 1;
2454 			break;
2455 		case 0x3:
2456 			vie->repnz_present = 1;
2457 			break;
2458 		}
2459 
2460 		vie_advance(vie);
2461 
2462 		/* Opcode, sans literal prefix prefix. */
2463 		if (vie_peek(vie, &x))
2464 			return (-1);
2465 
2466 		vie->op = optab[x];
2467 		if (vie->op.op_type == VIE_OP_TYPE_NONE)
2468 			return (-1);
2469 
2470 		vie_advance(vie);
2471 	}
2472 
2473 	/*
2474 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2475 	 */
2476 	if (cpu_mode == CPU_MODE_64BIT) {
2477 		/*
2478 		 * Default address size is 64-bits and default operand size
2479 		 * is 32-bits.
2480 		 */
2481 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2482 		if (vie->rex_w)
2483 			vie->opsize = 8;
2484 		else if (vie->opsize_override)
2485 			vie->opsize = 2;
2486 		else
2487 			vie->opsize = 4;
2488 	} else if (cs_d) {
2489 		/* Default address and operand sizes are 32-bits */
2490 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2491 		vie->opsize = vie->opsize_override ? 2 : 4;
2492 	} else {
2493 		/* Default address and operand sizes are 16-bits */
2494 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2495 		vie->opsize = vie->opsize_override ? 4 : 2;
2496 	}
2497 	return (0);
2498 }
2499 
2500 static int
2501 decode_two_byte_opcode(struct vie *vie)
2502 {
2503 	uint8_t x;
2504 
2505 	if (vie_peek(vie, &x))
2506 		return (-1);
2507 
2508 	vie->op = two_byte_opcodes[x];
2509 
2510 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2511 		return (-1);
2512 
2513 	vie_advance(vie);
2514 	return (0);
2515 }
2516 
2517 static int
2518 decode_opcode(struct vie *vie)
2519 {
2520 	uint8_t x;
2521 
2522 	if (vie_peek(vie, &x))
2523 		return (-1);
2524 
2525 	/* Already did this via VEX prefix. */
2526 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
2527 		return (0);
2528 
2529 	vie->op = one_byte_opcodes[x];
2530 
2531 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2532 		return (-1);
2533 
2534 	vie_advance(vie);
2535 
2536 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2537 		return (decode_two_byte_opcode(vie));
2538 
2539 	return (0);
2540 }
2541 
2542 static int
2543 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2544 {
2545 	uint8_t x;
2546 
2547 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2548 		return (0);
2549 
2550 	if (cpu_mode == CPU_MODE_REAL)
2551 		return (-1);
2552 
2553 	if (vie_peek(vie, &x))
2554 		return (-1);
2555 
2556 	vie->mod = (x >> 6) & 0x3;
2557 	vie->rm =  (x >> 0) & 0x7;
2558 	vie->reg = (x >> 3) & 0x7;
2559 
2560 	/*
2561 	 * A direct addressing mode makes no sense in the context of an EPT
2562 	 * fault. There has to be a memory access involved to cause the
2563 	 * EPT fault.
2564 	 */
2565 	if (vie->mod == VIE_MOD_DIRECT)
2566 		return (-1);
2567 
2568 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2569 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2570 		/*
2571 		 * Table 2-5: Special Cases of REX Encodings
2572 		 *
2573 		 * mod=0, r/m=5 is used in the compatibility mode to
2574 		 * indicate a disp32 without a base register.
2575 		 *
2576 		 * mod!=3, r/m=4 is used in the compatibility mode to
2577 		 * indicate that the SIB byte is present.
2578 		 *
2579 		 * The 'b' bit in the REX prefix is don't care in
2580 		 * this case.
2581 		 */
2582 	} else {
2583 		vie->rm |= (vie->rex_b << 3);
2584 	}
2585 
2586 	vie->reg |= (vie->rex_r << 3);
2587 
2588 	/* SIB */
2589 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2590 		goto done;
2591 
2592 	vie->base_register = gpr_map[vie->rm];
2593 
2594 	switch (vie->mod) {
2595 	case VIE_MOD_INDIRECT_DISP8:
2596 		vie->disp_bytes = 1;
2597 		break;
2598 	case VIE_MOD_INDIRECT_DISP32:
2599 		vie->disp_bytes = 4;
2600 		break;
2601 	case VIE_MOD_INDIRECT:
2602 		if (vie->rm == VIE_RM_DISP32) {
2603 			vie->disp_bytes = 4;
2604 			/*
2605 			 * Table 2-7. RIP-Relative Addressing
2606 			 *
2607 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2608 			 * whereas in compatibility mode it just implies disp32.
2609 			 */
2610 
2611 			if (cpu_mode == CPU_MODE_64BIT)
2612 				vie->base_register = VM_REG_GUEST_RIP;
2613 			else
2614 				vie->base_register = VM_REG_LAST;
2615 		}
2616 		break;
2617 	}
2618 
2619 done:
2620 	vie_advance(vie);
2621 
2622 	return (0);
2623 }
2624 
2625 static int
2626 decode_sib(struct vie *vie)
2627 {
2628 	uint8_t x;
2629 
2630 	/* Proceed only if SIB byte is present */
2631 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2632 		return (0);
2633 
2634 	if (vie_peek(vie, &x))
2635 		return (-1);
2636 
2637 	/* De-construct the SIB byte */
2638 	vie->ss = (x >> 6) & 0x3;
2639 	vie->index = (x >> 3) & 0x7;
2640 	vie->base = (x >> 0) & 0x7;
2641 
2642 	/* Apply the REX prefix modifiers */
2643 	vie->index |= vie->rex_x << 3;
2644 	vie->base |= vie->rex_b << 3;
2645 
2646 	switch (vie->mod) {
2647 	case VIE_MOD_INDIRECT_DISP8:
2648 		vie->disp_bytes = 1;
2649 		break;
2650 	case VIE_MOD_INDIRECT_DISP32:
2651 		vie->disp_bytes = 4;
2652 		break;
2653 	}
2654 
2655 	if (vie->mod == VIE_MOD_INDIRECT &&
2656 	    (vie->base == 5 || vie->base == 13)) {
2657 		/*
2658 		 * Special case when base register is unused if mod = 0
2659 		 * and base = %rbp or %r13.
2660 		 *
2661 		 * Documented in:
2662 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2663 		 * Table 2-5: Special Cases of REX Encodings
2664 		 */
2665 		vie->disp_bytes = 4;
2666 	} else {
2667 		vie->base_register = gpr_map[vie->base];
2668 	}
2669 
2670 	/*
2671 	 * All encodings of 'index' are valid except for %rsp (4).
2672 	 *
2673 	 * Documented in:
2674 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2675 	 * Table 2-5: Special Cases of REX Encodings
2676 	 */
2677 	if (vie->index != 4)
2678 		vie->index_register = gpr_map[vie->index];
2679 
2680 	/* 'scale' makes sense only in the context of an index register */
2681 	if (vie->index_register < VM_REG_LAST)
2682 		vie->scale = 1 << vie->ss;
2683 
2684 	vie_advance(vie);
2685 
2686 	return (0);
2687 }
2688 
2689 static int
2690 decode_displacement(struct vie *vie)
2691 {
2692 	int n, i;
2693 	uint8_t x;
2694 
2695 	union {
2696 		char	buf[4];
2697 		int8_t	signed8;
2698 		int32_t	signed32;
2699 	} u;
2700 
2701 	if ((n = vie->disp_bytes) == 0)
2702 		return (0);
2703 
2704 	if (n != 1 && n != 4)
2705 		panic("decode_displacement: invalid disp_bytes %d", n);
2706 
2707 	for (i = 0; i < n; i++) {
2708 		if (vie_peek(vie, &x))
2709 			return (-1);
2710 
2711 		u.buf[i] = x;
2712 		vie_advance(vie);
2713 	}
2714 
2715 	if (n == 1)
2716 		vie->displacement = u.signed8;		/* sign-extended */
2717 	else
2718 		vie->displacement = u.signed32;		/* sign-extended */
2719 
2720 	return (0);
2721 }
2722 
2723 static int
2724 decode_immediate(struct vie *vie)
2725 {
2726 	int i, n;
2727 	uint8_t x;
2728 	union {
2729 		char	buf[4];
2730 		int8_t	signed8;
2731 		int16_t	signed16;
2732 		int32_t	signed32;
2733 	} u;
2734 
2735 	/* Figure out immediate operand size (if any) */
2736 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2737 		/*
2738 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2739 		 * In 64-bit mode the typical size of immediate operands
2740 		 * remains 32-bits. When the operand size if 64-bits, the
2741 		 * processor sign-extends all immediates to 64-bits prior
2742 		 * to their use.
2743 		 */
2744 		if (vie->opsize == 4 || vie->opsize == 8)
2745 			vie->imm_bytes = 4;
2746 		else
2747 			vie->imm_bytes = 2;
2748 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2749 		vie->imm_bytes = 1;
2750 	}
2751 
2752 	if ((n = vie->imm_bytes) == 0)
2753 		return (0);
2754 
2755 	KASSERT(n == 1 || n == 2 || n == 4,
2756 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2757 
2758 	for (i = 0; i < n; i++) {
2759 		if (vie_peek(vie, &x))
2760 			return (-1);
2761 
2762 		u.buf[i] = x;
2763 		vie_advance(vie);
2764 	}
2765 
2766 	/* sign-extend the immediate value before use */
2767 	if (n == 1)
2768 		vie->immediate = u.signed8;
2769 	else if (n == 2)
2770 		vie->immediate = u.signed16;
2771 	else
2772 		vie->immediate = u.signed32;
2773 
2774 	return (0);
2775 }
2776 
2777 static int
2778 decode_moffset(struct vie *vie)
2779 {
2780 	int i, n;
2781 	uint8_t x;
2782 	union {
2783 		char	buf[8];
2784 		uint64_t u64;
2785 	} u;
2786 
2787 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2788 		return (0);
2789 
2790 	/*
2791 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2792 	 * The memory offset size follows the address-size of the instruction.
2793 	 */
2794 	n = vie->addrsize;
2795 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2796 
2797 	u.u64 = 0;
2798 	for (i = 0; i < n; i++) {
2799 		if (vie_peek(vie, &x))
2800 			return (-1);
2801 
2802 		u.buf[i] = x;
2803 		vie_advance(vie);
2804 	}
2805 	vie->displacement = u.u64;
2806 	return (0);
2807 }
2808 
2809 #ifdef _KERNEL
2810 /*
2811  * Verify that the 'guest linear address' provided as collateral of the nested
2812  * page table fault matches with our instruction decoding.
2813  */
2814 static int
2815 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
2816     enum vm_cpu_mode cpu_mode)
2817 {
2818 	int error;
2819 	uint64_t base, segbase, idx, gla2;
2820 	enum vm_reg_name seg;
2821 	struct seg_desc desc;
2822 
2823 	/* Skip 'gla' verification */
2824 	if (gla == VIE_INVALID_GLA)
2825 		return (0);
2826 
2827 	base = 0;
2828 	if (vie->base_register != VM_REG_LAST) {
2829 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2830 		if (error) {
2831 			printf("verify_gla: error %d getting base reg %d\n",
2832 				error, vie->base_register);
2833 			return (-1);
2834 		}
2835 
2836 		/*
2837 		 * RIP-relative addressing starts from the following
2838 		 * instruction
2839 		 */
2840 		if (vie->base_register == VM_REG_GUEST_RIP)
2841 			base += vie->num_processed;
2842 	}
2843 
2844 	idx = 0;
2845 	if (vie->index_register != VM_REG_LAST) {
2846 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2847 		if (error) {
2848 			printf("verify_gla: error %d getting index reg %d\n",
2849 				error, vie->index_register);
2850 			return (-1);
2851 		}
2852 	}
2853 
2854 	/*
2855 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2856 	 *
2857 	 * In 64-bit mode, segmentation is generally (but not
2858 	 * completely) disabled.  The exceptions are the FS and GS
2859 	 * segments.
2860 	 *
2861 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2862 	 * as the base, the SS segment is the default segment.  For
2863 	 * other data references, except when relative to stack or
2864 	 * string destination the DS segment is the default.  These
2865 	 * can be overridden to allow other segments to be accessed.
2866 	 */
2867 	if (vie->segment_override)
2868 		seg = vie->segment_register;
2869 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2870 	    vie->base_register == VM_REG_GUEST_RBP)
2871 		seg = VM_REG_GUEST_SS;
2872 	else
2873 		seg = VM_REG_GUEST_DS;
2874 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2875 	    seg != VM_REG_GUEST_GS) {
2876 		segbase = 0;
2877 	} else {
2878 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
2879 		if (error) {
2880 			printf("verify_gla: error %d getting segment"
2881 			       " descriptor %d", error,
2882 			       vie->segment_register);
2883 			return (-1);
2884 		}
2885 		segbase = desc.base;
2886 	}
2887 
2888 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2889 	gla2 &= size2mask[vie->addrsize];
2890 	if (gla != gla2) {
2891 		printf("verify_gla mismatch: segbase(0x%0lx)"
2892 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2893 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2894 		       segbase, base, vie->scale, idx, vie->displacement,
2895 		       gla, gla2);
2896 		return (-1);
2897 	}
2898 
2899 	return (0);
2900 }
2901 #endif	/* _KERNEL */
2902 
2903 int
2904 #ifdef _KERNEL
2905 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2906 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2907 #else
2908 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2909 #endif
2910 {
2911 
2912 	if (decode_prefixes(vie, cpu_mode, cs_d))
2913 		return (-1);
2914 
2915 	if (decode_opcode(vie))
2916 		return (-1);
2917 
2918 	if (decode_modrm(vie, cpu_mode))
2919 		return (-1);
2920 
2921 	if (decode_sib(vie))
2922 		return (-1);
2923 
2924 	if (decode_displacement(vie))
2925 		return (-1);
2926 
2927 	if (decode_immediate(vie))
2928 		return (-1);
2929 
2930 	if (decode_moffset(vie))
2931 		return (-1);
2932 
2933 #ifdef _KERNEL
2934 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2935 		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
2936 			return (-1);
2937 	}
2938 #endif
2939 
2940 	vie->decoded = 1;	/* success */
2941 
2942 	return (0);
2943 }
2944