xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 67ca7330cf34a789afbbff9ae7e4cdc4a4917ae3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef _KERNEL
36 #include <sys/param.h>
37 #include <sys/pcpu.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/vmparam.h>
45 #include <machine/vmm.h>
46 #else	/* !_KERNEL */
47 #include <sys/types.h>
48 #include <sys/errno.h>
49 #include <sys/_iovec.h>
50 
51 #include <machine/vmm.h>
52 
53 #include <assert.h>
54 #include <vmmapi.h>
55 #define	KASSERT(exp,msg)	assert((exp))
56 #endif	/* _KERNEL */
57 
58 #include <machine/vmm_instruction_emul.h>
59 #include <x86/psl.h>
60 #include <x86/specialreg.h>
61 
62 /* struct vie_op.op_type */
63 enum {
64 	VIE_OP_TYPE_NONE = 0,
65 	VIE_OP_TYPE_MOV,
66 	VIE_OP_TYPE_MOVSX,
67 	VIE_OP_TYPE_MOVZX,
68 	VIE_OP_TYPE_AND,
69 	VIE_OP_TYPE_OR,
70 	VIE_OP_TYPE_SUB,
71 	VIE_OP_TYPE_TWO_BYTE,
72 	VIE_OP_TYPE_PUSH,
73 	VIE_OP_TYPE_CMP,
74 	VIE_OP_TYPE_POP,
75 	VIE_OP_TYPE_MOVS,
76 	VIE_OP_TYPE_GROUP1,
77 	VIE_OP_TYPE_STOS,
78 	VIE_OP_TYPE_BITTEST,
79 	VIE_OP_TYPE_TWOB_GRP15,
80 	VIE_OP_TYPE_ADD,
81 	VIE_OP_TYPE_LAST
82 };
83 
84 /* struct vie_op.op_flags */
85 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
86 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
87 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
88 #define	VIE_OP_F_NO_MODRM	(1 << 3)
89 #define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
90 
91 static const struct vie_op two_byte_opcodes[256] = {
92 	[0xAE] = {
93 		  .op_byte = 0xAE,
94 		  .op_type = VIE_OP_TYPE_TWOB_GRP15,
95 	},
96 	[0xB6] = {
97 		.op_byte = 0xB6,
98 		.op_type = VIE_OP_TYPE_MOVZX,
99 	},
100 	[0xB7] = {
101 		.op_byte = 0xB7,
102 		.op_type = VIE_OP_TYPE_MOVZX,
103 	},
104 	[0xBA] = {
105 		.op_byte = 0xBA,
106 		.op_type = VIE_OP_TYPE_BITTEST,
107 		.op_flags = VIE_OP_F_IMM8,
108 	},
109 	[0xBE] = {
110 		.op_byte = 0xBE,
111 		.op_type = VIE_OP_TYPE_MOVSX,
112 	},
113 };
114 
115 static const struct vie_op one_byte_opcodes[256] = {
116 	[0x03] = {
117 		.op_byte = 0x03,
118 		.op_type = VIE_OP_TYPE_ADD,
119 	},
120 	[0x0F] = {
121 		.op_byte = 0x0F,
122 		.op_type = VIE_OP_TYPE_TWO_BYTE
123 	},
124 	[0x0B] = {
125 		.op_byte = 0x0B,
126 		.op_type = VIE_OP_TYPE_OR,
127 	},
128 	[0x2B] = {
129 		.op_byte = 0x2B,
130 		.op_type = VIE_OP_TYPE_SUB,
131 	},
132 	[0x39] = {
133 		.op_byte = 0x39,
134 		.op_type = VIE_OP_TYPE_CMP,
135 	},
136 	[0x3B] = {
137 		.op_byte = 0x3B,
138 		.op_type = VIE_OP_TYPE_CMP,
139 	},
140 	[0x88] = {
141 		.op_byte = 0x88,
142 		.op_type = VIE_OP_TYPE_MOV,
143 	},
144 	[0x89] = {
145 		.op_byte = 0x89,
146 		.op_type = VIE_OP_TYPE_MOV,
147 	},
148 	[0x8A] = {
149 		.op_byte = 0x8A,
150 		.op_type = VIE_OP_TYPE_MOV,
151 	},
152 	[0x8B] = {
153 		.op_byte = 0x8B,
154 		.op_type = VIE_OP_TYPE_MOV,
155 	},
156 	[0xA1] = {
157 		.op_byte = 0xA1,
158 		.op_type = VIE_OP_TYPE_MOV,
159 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
160 	},
161 	[0xA3] = {
162 		.op_byte = 0xA3,
163 		.op_type = VIE_OP_TYPE_MOV,
164 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
165 	},
166 	[0xA4] = {
167 		.op_byte = 0xA4,
168 		.op_type = VIE_OP_TYPE_MOVS,
169 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
170 	},
171 	[0xA5] = {
172 		.op_byte = 0xA5,
173 		.op_type = VIE_OP_TYPE_MOVS,
174 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
175 	},
176 	[0xAA] = {
177 		.op_byte = 0xAA,
178 		.op_type = VIE_OP_TYPE_STOS,
179 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
180 	},
181 	[0xAB] = {
182 		.op_byte = 0xAB,
183 		.op_type = VIE_OP_TYPE_STOS,
184 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
185 	},
186 	[0xC6] = {
187 		/* XXX Group 11 extended opcode - not just MOV */
188 		.op_byte = 0xC6,
189 		.op_type = VIE_OP_TYPE_MOV,
190 		.op_flags = VIE_OP_F_IMM8,
191 	},
192 	[0xC7] = {
193 		.op_byte = 0xC7,
194 		.op_type = VIE_OP_TYPE_MOV,
195 		.op_flags = VIE_OP_F_IMM,
196 	},
197 	[0x23] = {
198 		.op_byte = 0x23,
199 		.op_type = VIE_OP_TYPE_AND,
200 	},
201 	[0x80] = {
202 		/* Group 1 extended opcode */
203 		.op_byte = 0x80,
204 		.op_type = VIE_OP_TYPE_GROUP1,
205 		.op_flags = VIE_OP_F_IMM8,
206 	},
207 	[0x81] = {
208 		/* Group 1 extended opcode */
209 		.op_byte = 0x81,
210 		.op_type = VIE_OP_TYPE_GROUP1,
211 		.op_flags = VIE_OP_F_IMM,
212 	},
213 	[0x83] = {
214 		/* Group 1 extended opcode */
215 		.op_byte = 0x83,
216 		.op_type = VIE_OP_TYPE_GROUP1,
217 		.op_flags = VIE_OP_F_IMM8,
218 	},
219 	[0x8F] = {
220 		/* XXX Group 1A extended opcode - not just POP */
221 		.op_byte = 0x8F,
222 		.op_type = VIE_OP_TYPE_POP,
223 	},
224 	[0xFF] = {
225 		/* XXX Group 5 extended opcode - not just PUSH */
226 		.op_byte = 0xFF,
227 		.op_type = VIE_OP_TYPE_PUSH,
228 	}
229 };
230 
231 /* struct vie.mod */
232 #define	VIE_MOD_INDIRECT		0
233 #define	VIE_MOD_INDIRECT_DISP8		1
234 #define	VIE_MOD_INDIRECT_DISP32		2
235 #define	VIE_MOD_DIRECT			3
236 
237 /* struct vie.rm */
238 #define	VIE_RM_SIB			4
239 #define	VIE_RM_DISP32			5
240 
241 #define	GB				(1024 * 1024 * 1024)
242 
243 static enum vm_reg_name gpr_map[16] = {
244 	VM_REG_GUEST_RAX,
245 	VM_REG_GUEST_RCX,
246 	VM_REG_GUEST_RDX,
247 	VM_REG_GUEST_RBX,
248 	VM_REG_GUEST_RSP,
249 	VM_REG_GUEST_RBP,
250 	VM_REG_GUEST_RSI,
251 	VM_REG_GUEST_RDI,
252 	VM_REG_GUEST_R8,
253 	VM_REG_GUEST_R9,
254 	VM_REG_GUEST_R10,
255 	VM_REG_GUEST_R11,
256 	VM_REG_GUEST_R12,
257 	VM_REG_GUEST_R13,
258 	VM_REG_GUEST_R14,
259 	VM_REG_GUEST_R15
260 };
261 
262 static uint64_t size2mask[] = {
263 	[1] = 0xff,
264 	[2] = 0xffff,
265 	[4] = 0xffffffff,
266 	[8] = 0xffffffffffffffff,
267 };
268 
269 static int
270 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
271 {
272 	int error;
273 
274 	error = vm_get_register(vm, vcpuid, reg, rval);
275 
276 	return (error);
277 }
278 
279 static void
280 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
281 {
282 	*lhbr = 0;
283 	*reg = gpr_map[vie->reg];
284 
285 	/*
286 	 * 64-bit mode imposes limitations on accessing legacy high byte
287 	 * registers (lhbr).
288 	 *
289 	 * The legacy high-byte registers cannot be addressed if the REX
290 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
291 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
292 	 *
293 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
294 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
295 	 * %ah, %ch, %dh and %bh respectively.
296 	 */
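	/*
	 * For example, ModRM:reg = 5 with no REX prefix selects %ch, i.e.
	 * bits 15:8 of %rcx (gpr_map[5 & 0x3] with lhbr set), while the
	 * same encoding with a REX prefix selects %bpl, i.e. bits 7:0 of
	 * %rbp (gpr_map[5]).
	 */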
297 	if (!vie->rex_present) {
298 		if (vie->reg & 0x4) {
299 			*lhbr = 1;
300 			*reg = gpr_map[vie->reg & 0x3];
301 		}
302 	}
303 }
304 
305 static int
306 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
307 {
308 	uint64_t val;
309 	int error, lhbr;
310 	enum vm_reg_name reg;
311 
312 	vie_calc_bytereg(vie, &reg, &lhbr);
313 	error = vm_get_register(vm, vcpuid, reg, &val);
314 
315 	/*
316 	 * To obtain the value of a legacy high byte register shift the
317 	 * base register right by 8 bits (%ah = %rax >> 8).
318 	 */
319 	if (lhbr)
320 		*rval = val >> 8;
321 	else
322 		*rval = val;
323 	return (error);
324 }
325 
326 static int
327 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
328 {
329 	uint64_t origval, val, mask;
330 	int error, lhbr;
331 	enum vm_reg_name reg;
332 
333 	vie_calc_bytereg(vie, &reg, &lhbr);
334 	error = vm_get_register(vm, vcpuid, reg, &origval);
335 	if (error == 0) {
336 		val = byte;
337 		mask = 0xff;
338 		if (lhbr) {
339 			/*
340 			 * Shift left by 8 to store 'byte' in a legacy high
341 			 * byte register.
342 			 */
343 			val <<= 8;
344 			mask <<= 8;
345 		}
346 		val |= origval & ~mask;
347 		error = vm_set_register(vm, vcpuid, reg, val);
348 	}
349 	return (error);
350 }
351 
352 int
353 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
354 		    uint64_t val, int size)
355 {
356 	int error;
357 	uint64_t origval;
358 
359 	switch (size) {
360 	case 1:
361 	case 2:
362 		error = vie_read_register(vm, vcpuid, reg, &origval);
363 		if (error)
364 			return (error);
365 		val &= size2mask[size];
366 		val |= origval & ~size2mask[size];
367 		break;
368 	case 4:
369 		val &= 0xffffffffUL;
370 		break;
371 	case 8:
372 		break;
373 	default:
374 		return (EINVAL);
375 	}
376 
377 	error = vm_set_register(vm, vcpuid, reg, val);
378 	return (error);
379 }
380 
381 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
382 
383 /*
384  * Return the status flags that would result from doing (x - y).
385  */
386 #define	GETCC(sz)							\
387 static u_long								\
388 getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
389 {									\
390 	u_long rflags;							\
391 									\
392 	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
393 	    "=r" (rflags), "+r" (x) : "m" (y));				\
394 	return (rflags);						\
395 } struct __hack
396 
397 GETCC(8);
398 GETCC(16);
399 GETCC(32);
400 GETCC(64);
401 
402 static u_long
403 getcc(int opsize, uint64_t x, uint64_t y)
404 {
405 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
406 	    ("getcc: invalid operand size %d", opsize));
407 
408 	if (opsize == 1)
409 		return (getcc8(x, y));
410 	else if (opsize == 2)
411 		return (getcc16(x, y));
412 	else if (opsize == 4)
413 		return (getcc32(x, y));
414 	else
415 		return (getcc64(x, y));
416 }
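/*
 * For example, getcc(4, 1, 2) performs the 32-bit subtraction 1 - 2 and
 * returns an %rflags image with CF and SF set and ZF clear.
 */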
417 
418 /*
419  * Macro creation of functions getaddflags{8,16,32,64}
420  */
421 #define	GETADDFLAGS(sz)							\
422 static u_long								\
423 getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
424 {									\
425 	u_long rflags;							\
426 									\
427 	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
428 	    "=r" (rflags), "+r" (x) : "m" (y));				\
429 	return (rflags);						\
430 } struct __hack
431 
432 GETADDFLAGS(8);
433 GETADDFLAGS(16);
434 GETADDFLAGS(32);
435 GETADDFLAGS(64);
436 
437 static u_long
438 getaddflags(int opsize, uint64_t x, uint64_t y)
439 {
440 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
441 	    ("getaddflags: invalid operand size %d", opsize));
442 
443 	if (opsize == 1)
444 		return (getaddflags8(x, y));
445 	else if (opsize == 2)
446 		return (getaddflags16(x, y));
447 	else if (opsize == 4)
448 		return (getaddflags32(x, y));
449 	else
450 		return (getaddflags64(x, y));
451 }
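/*
 * For example, getaddflags(1, 0xff, 1) performs the 8-bit addition
 * 0xff + 1, which wraps to 0, so the returned flags have CF and ZF set.
 */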
452 
453 static int
454 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
455 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
456 {
457 	int error, size;
458 	enum vm_reg_name reg;
459 	uint8_t byte;
460 	uint64_t val;
461 
462 	size = vie->opsize;
463 	error = EINVAL;
464 
465 	switch (vie->op.op_byte) {
466 	case 0x88:
467 		/*
468 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
469 		 * 88/r:	mov r/m8, r8
470 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
471 		 */
472 		size = 1;	/* override for byte operation */
473 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
474 		if (error == 0)
475 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
476 		break;
477 	case 0x89:
478 		/*
479 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
480 		 * 89/r:	mov r/m16, r16
481 		 * 89/r:	mov r/m32, r32
482 		 * REX.W + 89/r	mov r/m64, r64
483 		 */
484 		reg = gpr_map[vie->reg];
485 		error = vie_read_register(vm, vcpuid, reg, &val);
486 		if (error == 0) {
487 			val &= size2mask[size];
488 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
489 		}
490 		break;
491 	case 0x8A:
492 		/*
493 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
494 		 * 8A/r:	mov r8, r/m8
495 		 * REX + 8A/r:	mov r8, r/m8
496 		 */
497 		size = 1;	/* override for byte operation */
498 		error = memread(vm, vcpuid, gpa, &val, size, arg);
499 		if (error == 0)
500 			error = vie_write_bytereg(vm, vcpuid, vie, val);
501 		break;
502 	case 0x8B:
503 		/*
504 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
505 		 * 8B/r:	mov r16, r/m16
506 		 * 8B/r:	mov r32, r/m32
507 		 * REX.W 8B/r:	mov r64, r/m64
508 		 */
509 		error = memread(vm, vcpuid, gpa, &val, size, arg);
510 		if (error == 0) {
511 			reg = gpr_map[vie->reg];
512 			error = vie_update_register(vm, vcpuid, reg, val, size);
513 		}
514 		break;
515 	case 0xA1:
516 		/*
517 		 * MOV from seg:moffset to AX/EAX/RAX
518 		 * A1:		mov AX, moffs16
519 		 * A1:		mov EAX, moffs32
520 		 * REX.W + A1:	mov RAX, moffs64
521 		 */
522 		error = memread(vm, vcpuid, gpa, &val, size, arg);
523 		if (error == 0) {
524 			reg = VM_REG_GUEST_RAX;
525 			error = vie_update_register(vm, vcpuid, reg, val, size);
526 		}
527 		break;
528 	case 0xA3:
529 		/*
530 		 * MOV from AX/EAX/RAX to seg:moffset
531 		 * A3:		mov moffs16, AX
532 		 * A3:		mov moffs32, EAX
533 		 * REX.W + A3:	mov moffs64, RAX
534 		 */
535 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
536 		if (error == 0) {
537 			val &= size2mask[size];
538 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
539 		}
540 		break;
541 	case 0xC6:
542 		/*
543 		 * MOV from imm8 to mem (ModRM:r/m)
544 		 * C6/0		mov r/m8, imm8
545 		 * REX + C6/0	mov r/m8, imm8
546 		 */
547 		size = 1;	/* override for byte operation */
548 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
549 		break;
550 	case 0xC7:
551 		/*
552 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
553 		 * C7/0		mov r/m16, imm16
554 		 * C7/0		mov r/m32, imm32
555 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
556 		 */
557 		val = vie->immediate & size2mask[size];
558 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
559 		break;
560 	default:
561 		break;
562 	}
563 
564 	return (error);
565 }
566 
567 static int
568 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
569 	     mem_region_read_t memread, mem_region_write_t memwrite,
570 	     void *arg)
571 {
572 	int error, size;
573 	enum vm_reg_name reg;
574 	uint64_t val;
575 
576 	size = vie->opsize;
577 	error = EINVAL;
578 
579 	switch (vie->op.op_byte) {
580 	case 0xB6:
581 		/*
582 		 * MOV and zero extend byte from mem (ModRM:r/m) to
583 		 * reg (ModRM:reg).
584 		 *
585 		 * 0F B6/r		movzx r16, r/m8
586 		 * 0F B6/r		movzx r32, r/m8
587 		 * REX.W + 0F B6/r	movzx r64, r/m8
588 		 */
589 
590 		/* get the first operand */
591 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
592 		if (error)
593 			break;
594 
595 		/* get the second operand */
596 		reg = gpr_map[vie->reg];
597 
598 		/* zero-extend byte */
599 		val = (uint8_t)val;
600 
601 		/* write the result */
602 		error = vie_update_register(vm, vcpuid, reg, val, size);
603 		break;
604 	case 0xB7:
605 		/*
606 		 * MOV and zero extend word from mem (ModRM:r/m) to
607 		 * reg (ModRM:reg).
608 		 *
609 		 * 0F B7/r		movzx r32, r/m16
610 		 * REX.W + 0F B7/r	movzx r64, r/m16
611 		 */
612 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
613 		if (error)
614 			return (error);
615 
616 		reg = gpr_map[vie->reg];
617 
618 		/* zero-extend word */
619 		val = (uint16_t)val;
620 
621 		error = vie_update_register(vm, vcpuid, reg, val, size);
622 		break;
623 	case 0xBE:
624 		/*
625 		 * MOV and sign extend byte from mem (ModRM:r/m) to
626 		 * reg (ModRM:reg).
627 		 *
628 		 * 0F BE/r		movsx r16, r/m8
629 		 * 0F BE/r		movsx r32, r/m8
630 		 * REX.W + 0F BE/r	movsx r64, r/m8
631 		 */
632 
633 		/* get the first operand */
634 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
635 		if (error)
636 			break;
637 
638 		/* get the second operand */
639 		reg = gpr_map[vie->reg];
640 
641 		/* sign extend byte */
642 		val = (int8_t)val;
643 
644 		/* write the result */
645 		error = vie_update_register(vm, vcpuid, reg, val, size);
646 		break;
647 	default:
648 		break;
649 	}
650 	return (error);
651 }
652 
653 /*
654  * Helper function to calculate and validate a linear address.
655  */
656 static int
657 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
658     int opsize, int addrsize, int prot, enum vm_reg_name seg,
659     enum vm_reg_name gpr, uint64_t *gla, int *fault)
660 {
661 	struct seg_desc desc;
662 	uint64_t cr0, val, rflags;
663 	int error;
664 
665 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
666 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
667 
668 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
669 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
670 
671 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
672 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
673 	    __func__, error, seg));
674 
675 	error = vie_read_register(vm, vcpuid, gpr, &val);
676 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
677 	    error, gpr));
678 
679 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
680 	    addrsize, prot, gla)) {
681 		if (seg == VM_REG_GUEST_SS)
682 			vm_inject_ss(vm, vcpuid, 0);
683 		else
684 			vm_inject_gp(vm, vcpuid);
685 		goto guest_fault;
686 	}
687 
688 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
689 		if (seg == VM_REG_GUEST_SS)
690 			vm_inject_ss(vm, vcpuid, 0);
691 		else
692 			vm_inject_gp(vm, vcpuid);
693 		goto guest_fault;
694 	}
695 
696 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
697 		vm_inject_ac(vm, vcpuid, 0);
698 		goto guest_fault;
699 	}
700 
701 	*fault = 0;
702 	return (0);
703 
704 guest_fault:
705 	*fault = 1;
706 	return (0);
707 }
708 
709 static int
710 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
711     struct vm_guest_paging *paging, mem_region_read_t memread,
712     mem_region_write_t memwrite, void *arg)
713 {
714 #ifdef _KERNEL
715 	struct vm_copyinfo copyinfo[2];
716 #else
717 	struct iovec copyinfo[2];
718 #endif
719 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
720 	uint64_t rcx, rdi, rsi, rflags;
721 	int error, fault, opsize, seg, repeat;
722 
723 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
724 	val = 0;
725 	error = 0;
726 
727 	/*
728 	 * XXX although the MOVS instruction is only supposed to be used with
729 	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
730 	 *
731 	 * Empirically the "repnz" prefix has identical behavior to "rep"
732 	 * and the zero flag does not make a difference.
733 	 */
734 	repeat = vie->repz_present | vie->repnz_present;
735 
736 	if (repeat) {
737 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
738 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
739 
740 		/*
741 		 * The count register is %rcx, %ecx or %cx depending on the
742 		 * address size of the instruction.
743 		 */
744 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
745 			error = 0;
746 			goto done;
747 		}
748 	}
749 
750 	/*
751 	 *	Source		Destination	Comments
752 	 *	--------------------------------------------
753 	 * (1)  memory		memory		n/a
754 	 * (2)  memory		mmio		emulated
755 	 * (3)  mmio		memory		emulated
756 	 * (4)  mmio		mmio		emulated
757 	 *
758 	 * At this point we don't have sufficient information to distinguish
759 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
760 	 * out because it will succeed only when operating on regular memory.
761 	 *
762 	 * XXX the emulation doesn't properly handle the case where 'gpa'
763 	 * is straddling the boundary between the normal memory and MMIO.
764 	 */
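	/*
	 * For example, a guest "rep movsb" that copies from a frame buffer
	 * (MMIO) into RAM is case (3): vm_copy_setup() on the source fails,
	 * vm_copy_setup() on the destination succeeds, and the data is read
	 * via memread().
	 */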
765 
766 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
767 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
768 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
769 	if (error || fault)
770 		goto done;
771 
772 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
773 	    copyinfo, nitems(copyinfo), &fault);
774 	if (error == 0) {
775 		if (fault)
776 			goto done;	/* Resume guest to handle fault */
777 
778 		/*
779 		 * case (2): read from system memory and write to mmio.
780 		 */
781 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
782 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
783 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
784 		if (error)
785 			goto done;
786 	} else {
787 		/*
788 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
789 		 * if 'srcaddr' is in the mmio space.
790 		 */
791 
792 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
793 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
794 		    &fault);
795 		if (error || fault)
796 			goto done;
797 
798 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
799 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
800 		if (error == 0) {
801 			if (fault)
802 				goto done;    /* Resume guest to handle fault */
803 
804 			/*
805 			 * case (3): read from MMIO and write to system memory.
806 			 *
807 			 * An MMIO read can have side-effects so we
808 			 * commit to it only after vm_copy_setup() is
809 			 * successful. If a page-fault needs to be
810 			 * injected into the guest then it will happen
811 			 * before the MMIO read is attempted.
812 			 */
813 			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
814 			if (error)
815 				goto done;
816 
817 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
818 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
819 		} else {
820 			/*
821 			 * Case (4): read from and write to mmio.
822 			 *
823 			 * Commit to the MMIO read/write (with potential
824 			 * side-effects) only after we are sure that the
825 			 * instruction is not going to be restarted due
826 			 * to address translation faults.
827 			 */
828 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
829 			    PROT_READ, &srcgpa, &fault);
830 			if (error || fault)
831 				goto done;
832 
833 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
834 			   PROT_WRITE, &dstgpa, &fault);
835 			if (error || fault)
836 				goto done;
837 
838 			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
839 			if (error)
840 				goto done;
841 
842 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
843 			if (error)
844 				goto done;
845 		}
846 	}
847 
848 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
849 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
850 
851 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
852 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
853 
854 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
855 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
856 
857 	if (rflags & PSL_D) {
858 		rsi -= opsize;
859 		rdi -= opsize;
860 	} else {
861 		rsi += opsize;
862 		rdi += opsize;
863 	}
864 
865 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
866 	    vie->addrsize);
867 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
868 
869 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
870 	    vie->addrsize);
871 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
872 
873 	if (repeat) {
874 		rcx = rcx - 1;
875 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
876 		    rcx, vie->addrsize);
877 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
878 
879 		/*
880 		 * Repeat the instruction if the count register is not zero.
881 		 */
882 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
883 			vm_restart_instruction(vm, vcpuid);
884 	}
885 done:
886 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
887 	    __func__, error));
888 	return (error);
889 }
890 
891 static int
892 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
893     struct vm_guest_paging *paging, mem_region_read_t memread,
894     mem_region_write_t memwrite, void *arg)
895 {
896 	int error, opsize, repeat;
897 	uint64_t val;
898 	uint64_t rcx, rdi, rflags;
899 
900 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
901 	repeat = vie->repz_present | vie->repnz_present;
902 
903 	if (repeat) {
904 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
905 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
906 
907 		/*
908 		 * The count register is %rcx, %ecx or %cx depending on the
909 		 * address size of the instruction.
910 		 */
911 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
912 			return (0);
913 	}
914 
915 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
916 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
917 
918 	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
919 	if (error)
920 		return (error);
921 
922 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
923 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
924 
925 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
926 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
927 
928 	if (rflags & PSL_D)
929 		rdi -= opsize;
930 	else
931 		rdi += opsize;
932 
933 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
934 	    vie->addrsize);
935 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
936 
937 	if (repeat) {
938 		rcx = rcx - 1;
939 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
940 		    rcx, vie->addrsize);
941 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
942 
943 		/*
944 		 * Repeat the instruction if the count register is not zero.
945 		 */
946 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
947 			vm_restart_instruction(vm, vcpuid);
948 	}
949 
950 	return (0);
951 }
952 
953 static int
954 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
955 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
956 {
957 	int error, size;
958 	enum vm_reg_name reg;
959 	uint64_t result, rflags, rflags2, val1, val2;
960 
961 	size = vie->opsize;
962 	error = EINVAL;
963 
964 	switch (vie->op.op_byte) {
965 	case 0x23:
966 		/*
967 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
968 		 * result in reg.
969 		 *
970 		 * 23/r		and r16, r/m16
971 		 * 23/r		and r32, r/m32
972 		 * REX.W + 23/r	and r64, r/m64
973 		 */
974 
975 		/* get the first operand */
976 		reg = gpr_map[vie->reg];
977 		error = vie_read_register(vm, vcpuid, reg, &val1);
978 		if (error)
979 			break;
980 
981 		/* get the second operand */
982 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
983 		if (error)
984 			break;
985 
986 		/* perform the operation and write the result */
987 		result = val1 & val2;
988 		error = vie_update_register(vm, vcpuid, reg, result, size);
989 		break;
990 	case 0x81:
991 	case 0x83:
992 		/*
993 		 * AND mem (ModRM:r/m) with immediate and store the
994 		 * result in mem.
995 		 *
996 		 * 81 /4		and r/m16, imm16
997 		 * 81 /4		and r/m32, imm32
998 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
999 		 *
1000 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1001 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1002 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1003 		 */
1004 
1005 		/* get the first operand */
1006 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
1007 		if (error)
1008 			break;
1009 
1010 		/*
1011 		 * perform the operation with the pre-fetched immediate
1012 		 * operand and write the result
1013 		 */
1014 		result = val1 & vie->immediate;
1015 		error = memwrite(vm, vcpuid, gpa, result, size, arg);
1016 		break;
1017 	default:
1018 		break;
1019 	}
1020 	if (error)
1021 		return (error);
1022 
1023 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1024 	if (error)
1025 		return (error);
1026 
1027 	/*
1028 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1029 	 * to the result; AF is undefined.
1030 	 *
1031 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1032 	 */
1033 	rflags2 = getcc(size, result, 0);
1034 	rflags &= ~RFLAGS_STATUS_BITS;
1035 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1036 
1037 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1038 	return (error);
1039 }
1040 
1041 static int
1042 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1043 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1044 {
1045 	int error, size;
1046 	enum vm_reg_name reg;
1047 	uint64_t result, rflags, rflags2, val1, val2;
1048 
1049 	size = vie->opsize;
1050 	error = EINVAL;
1051 
1052 	switch (vie->op.op_byte) {
1053 	case 0x0B:
1054 		/*
1055 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1056 		 * result in reg.
1057 		 *
1058 		 * 0b/r         or r16, r/m16
1059 		 * 0b/r         or r32, r/m32
1060 		 * REX.W + 0b/r or r64, r/m64
1061 		 */
1062 
1063 		/* get the first operand */
1064 		reg = gpr_map[vie->reg];
1065 		error = vie_read_register(vm, vcpuid, reg, &val1);
1066 		if (error)
1067 			break;
1068 
1069 		/* get the second operand */
1070 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1071 		if (error)
1072 			break;
1073 
1074 		/* perform the operation and write the result */
1075 		result = val1 | val2;
1076 		error = vie_update_register(vm, vcpuid, reg, result, size);
1077 		break;
1078 	case 0x81:
1079 	case 0x83:
1080 		/*
1081 		 * OR mem (ModRM:r/m) with immediate and store the
1082 		 * result in mem.
1083 		 *
1084 		 * 81 /1		or r/m16, imm16
1085 		 * 81 /1		or r/m32, imm32
1086 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1087 		 *
1088 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1089 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1090 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1091 		 */
1092 
1093 		/* get the first operand */
1094 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
1095 		if (error)
1096 			break;
1097 
1098 		/*
1099 		 * perform the operation with the pre-fetched immediate
1100 		 * operand and write the result
1101 		 */
1102 		result = val1 | vie->immediate;
1103 		error = memwrite(vm, vcpuid, gpa, result, size, arg);
1104 		break;
1105 	default:
1106 		break;
1107 	}
1108 	if (error)
1109 		return (error);
1110 
1111 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1112 	if (error)
1113 		return (error);
1114 
1115 	/*
1116 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1117 	 * to the result; AF is undefined.
1118 	 *
1119 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1120 	 */
1121 	rflags2 = getcc(size, result, 0);
1122 	rflags &= ~RFLAGS_STATUS_BITS;
1123 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1124 
1125 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1126 	return (error);
1127 }
1128 
1129 static int
1130 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1131 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1132 {
1133 	int error, size;
1134 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1135 	enum vm_reg_name reg;
1136 
1137 	size = vie->opsize;
1138 	switch (vie->op.op_byte) {
1139 	case 0x39:
1140 	case 0x3B:
1141 		/*
1142 		 * 39/r		CMP r/m16, r16
1143 		 * 39/r		CMP r/m32, r32
1144 		 * REX.W 39/r	CMP r/m64, r64
1145 		 *
1146 		 * 3B/r		CMP r16, r/m16
1147 		 * 3B/r		CMP r32, r/m32
1148 		 * REX.W + 3B/r	CMP r64, r/m64
1149 		 *
1150 		 * Compare the first operand with the second operand and
1151 		 * set status flags in EFLAGS register. The comparison is
1152 		 * performed by subtracting the second operand from the first
1153 		 * operand and then setting the status flags.
1154 		 */
1155 
1156 		/* Get the register operand */
1157 		reg = gpr_map[vie->reg];
1158 		error = vie_read_register(vm, vcpuid, reg, &regop);
1159 		if (error)
1160 			return (error);
1161 
1162 		/* Get the memory operand */
1163 		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1164 		if (error)
1165 			return (error);
1166 
1167 		if (vie->op.op_byte == 0x3B) {
1168 			op1 = regop;
1169 			op2 = memop;
1170 		} else {
1171 			op1 = memop;
1172 			op2 = regop;
1173 		}
1174 		rflags2 = getcc(size, op1, op2);
1175 		break;
1176 	case 0x80:
1177 	case 0x81:
1178 	case 0x83:
1179 		/*
1180 		 * 80 /7		cmp r/m8, imm8
1181 		 * REX + 80 /7		cmp r/m8, imm8
1182 		 *
1183 		 * 81 /7		cmp r/m16, imm16
1184 		 * 81 /7		cmp r/m32, imm32
1185 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1186 		 *
1187 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1188 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1189 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1190 		 *
1191 		 * Compare mem (ModRM:r/m) with immediate and set
1192 		 * status flags according to the results.  The
1193 		 * comparison is performed by subtracting the
1194 		 * immediate from the first operand and then setting
1195 		 * the status flags.
1196 		 *
1197 		 */
1198 		if (vie->op.op_byte == 0x80)
1199 			size = 1;
1200 
1201 		/* get the first operand */
1202 		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1203 		if (error)
1204 			return (error);
1205 
1206 		rflags2 = getcc(size, op1, vie->immediate);
1207 		break;
1208 	default:
1209 		return (EINVAL);
1210 	}
1211 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1212 	if (error)
1213 		return (error);
1214 	rflags &= ~RFLAGS_STATUS_BITS;
1215 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1216 
1217 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1218 	return (error);
1219 }
1220 
1221 static int
1222 emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1223 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1224 {
1225 	int error, size;
1226 	uint64_t nval, rflags, rflags2, val1, val2;
1227 	enum vm_reg_name reg;
1228 
1229 	size = vie->opsize;
1230 	error = EINVAL;
1231 
1232 	switch (vie->op.op_byte) {
1233 	case 0x03:
1234 		/*
1235 		 * ADD r/m to r and store the result in r
1236 		 *
1237 		 * 03/r            ADD r16, r/m16
1238 		 * 03/r            ADD r32, r/m32
1239 		 * REX.W + 03/r    ADD r64, r/m64
1240 		 */
1241 
1242 		/* get the first operand */
1243 		reg = gpr_map[vie->reg];
1244 		error = vie_read_register(vm, vcpuid, reg, &val1);
1245 		if (error)
1246 			break;
1247 
1248 		/* get the second operand */
1249 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1250 		if (error)
1251 			break;
1252 
1253 		/* perform the operation and write the result */
1254 		nval = val1 + val2;
1255 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1256 		break;
1257 	default:
1258 		break;
1259 	}
1260 
1261 	if (!error) {
1262 		rflags2 = getaddflags(size, val1, val2);
1263 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1264 		    &rflags);
1265 		if (error)
1266 			return (error);
1267 
1268 		rflags &= ~RFLAGS_STATUS_BITS;
1269 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1270 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1271 		    rflags, 8);
1272 	}
1273 
1274 	return (error);
1275 }
1276 
1277 static int
1278 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1279 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1280 {
1281 	int error, size;
1282 	uint64_t nval, rflags, rflags2, val1, val2;
1283 	enum vm_reg_name reg;
1284 
1285 	size = vie->opsize;
1286 	error = EINVAL;
1287 
1288 	switch (vie->op.op_byte) {
1289 	case 0x2B:
1290 		/*
1291 		 * SUB r/m from r and store the result in r
1292 		 *
1293 		 * 2B/r            SUB r16, r/m16
1294 		 * 2B/r            SUB r32, r/m32
1295 		 * REX.W + 2B/r    SUB r64, r/m64
1296 		 */
1297 
1298 		/* get the first operand */
1299 		reg = gpr_map[vie->reg];
1300 		error = vie_read_register(vm, vcpuid, reg, &val1);
1301 		if (error)
1302 			break;
1303 
1304 		/* get the second operand */
1305 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1306 		if (error)
1307 			break;
1308 
1309 		/* perform the operation and write the result */
1310 		nval = val1 - val2;
1311 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1312 		break;
1313 	default:
1314 		break;
1315 	}
1316 
1317 	if (!error) {
1318 		rflags2 = getcc(size, val1, val2);
1319 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1320 		    &rflags);
1321 		if (error)
1322 			return (error);
1323 
1324 		rflags &= ~RFLAGS_STATUS_BITS;
1325 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1326 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1327 		    rflags, 8);
1328 	}
1329 
1330 	return (error);
1331 }
1332 
1333 static int
1334 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1335     struct vm_guest_paging *paging, mem_region_read_t memread,
1336     mem_region_write_t memwrite, void *arg)
1337 {
1338 #ifdef _KERNEL
1339 	struct vm_copyinfo copyinfo[2];
1340 #else
1341 	struct iovec copyinfo[2];
1342 #endif
1343 	struct seg_desc ss_desc;
1344 	uint64_t cr0, rflags, rsp, stack_gla, val;
1345 	int error, fault, size, stackaddrsize, pushop;
1346 
1347 	val = 0;
1348 	size = vie->opsize;
1349 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1350 
1351 	/*
1352 	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1353 	 */
1354 	if (paging->cpu_mode == CPU_MODE_REAL) {
1355 		stackaddrsize = 2;
1356 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1357 		/*
1358 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1359 		 * - Stack pointer size is always 64-bits.
1360 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1361 		 * - 16-bit PUSH/POP is supported by using the operand size
1362 		 *   override prefix (66H).
1363 		 */
1364 		stackaddrsize = 8;
1365 		size = vie->opsize_override ? 2 : 8;
1366 	} else {
1367 		/*
1368 		 * In protected or compatibility mode the 'B' flag in the
1369 		 * stack-segment descriptor determines the size of the
1370 		 * stack pointer.
1371 		 */
1372 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1373 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1374 		    __func__, error));
1375 		if (SEG_DESC_DEF32(ss_desc.access))
1376 			stackaddrsize = 4;
1377 		else
1378 			stackaddrsize = 2;
1379 	}
1380 
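	/*
	 * For example, a 16-bit push in 64-bit mode (operand size override
	 * prefix) stores 2 bytes and adjusts %rsp by 2, while the stack
	 * pointer itself is still treated as a full 64-bit register
	 * (stackaddrsize is 8).
	 */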
1381 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1382 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1383 
1384 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1385 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1386 
1387 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1388 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1389 	if (pushop) {
1390 		rsp -= size;
1391 	}
1392 
1393 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1394 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1395 	    &stack_gla)) {
1396 		vm_inject_ss(vm, vcpuid, 0);
1397 		return (0);
1398 	}
1399 
1400 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1401 		vm_inject_ss(vm, vcpuid, 0);
1402 		return (0);
1403 	}
1404 
1405 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1406 		vm_inject_ac(vm, vcpuid, 0);
1407 		return (0);
1408 	}
1409 
1410 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1411 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1412 	    &fault);
1413 	if (error || fault)
1414 		return (error);
1415 
1416 	if (pushop) {
1417 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1418 		if (error == 0)
1419 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1420 	} else {
1421 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1422 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1423 		rsp += size;
1424 	}
1425 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1426 
1427 	if (error == 0) {
1428 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1429 		    stackaddrsize);
1430 		KASSERT(error == 0, ("error %d updating rsp", error));
1431 	}
1432 	return (error);
1433 }
1434 
1435 static int
1436 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1437     struct vm_guest_paging *paging, mem_region_read_t memread,
1438     mem_region_write_t memwrite, void *arg)
1439 {
1440 	int error;
1441 
1442 	/*
1443 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1444 	 *
1445 	 * PUSH is part of the group 5 extended opcodes and is identified
1446 	 * by ModRM:reg = b110.
1447 	 */
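	/*
	 * For example, the encoding "ff 30" (mod 00, reg 110, rm 000) pushes
	 * the memory operand at [%rax] and is accepted here; the other
	 * group 5 encodings (INC, DEC, CALL, JMP) are rejected with EINVAL.
	 */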
1448 	if ((vie->reg & 7) != 6)
1449 		return (EINVAL);
1450 
1451 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1452 	    memwrite, arg);
1453 	return (error);
1454 }
1455 
1456 static int
1457 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1458     struct vm_guest_paging *paging, mem_region_read_t memread,
1459     mem_region_write_t memwrite, void *arg)
1460 {
1461 	int error;
1462 
1463 	/*
1464 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1465 	 *
1466 	 * POP is part of the group 1A extended opcodes and is identified
1467 	 * by ModRM:reg = b000.
1468 	 */
1469 	if ((vie->reg & 7) != 0)
1470 		return (EINVAL);
1471 
1472 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1473 	    memwrite, arg);
1474 	return (error);
1475 }
1476 
1477 static int
1478 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1479     struct vm_guest_paging *paging, mem_region_read_t memread,
1480     mem_region_write_t memwrite, void *memarg)
1481 {
1482 	int error;
1483 
1484 	switch (vie->reg & 7) {
1485 	case 0x1:	/* OR */
1486 		error = emulate_or(vm, vcpuid, gpa, vie,
1487 		    memread, memwrite, memarg);
1488 		break;
1489 	case 0x4:	/* AND */
1490 		error = emulate_and(vm, vcpuid, gpa, vie,
1491 		    memread, memwrite, memarg);
1492 		break;
1493 	case 0x7:	/* CMP */
1494 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1495 		    memread, memwrite, memarg);
1496 		break;
1497 	default:
1498 		error = EINVAL;
1499 		break;
1500 	}
1501 
1502 	return (error);
1503 }
1504 
1505 static int
1506 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1507     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1508 {
1509 	uint64_t val, rflags;
1510 	int error, bitmask, bitoff;
1511 
1512 	/*
1513 	 * 0F BA is a Group 8 extended opcode.
1514 	 *
1515 	 * Currently we only emulate the 'Bit Test' instruction which is
1516 	 * identified by a ModR/M:reg encoding of 100b.
1517 	 */
1518 	if ((vie->reg & 7) != 4)
1519 		return (EINVAL);
1520 
1521 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1522 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1523 
1524 	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1525 	if (error)
1526 		return (error);
1527 
1528 	/*
1529 	 * Intel SDM, Vol 2, Table 3-2:
1530 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1531 	 */
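	/*
	 * For example, with a 4-byte operand the immediate bit offset is
	 * masked to the range 0-31, so an immediate of 35 tests bit 3.
	 */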
1532 	bitmask = vie->opsize * 8 - 1;
1533 	bitoff = vie->immediate & bitmask;
1534 
1535 	/* Copy the bit into the Carry flag in %rflags */
1536 	if (val & (1UL << bitoff))
1537 		rflags |= PSL_C;
1538 	else
1539 		rflags &= ~PSL_C;
1540 
1541 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1542 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1543 
1544 	return (0);
1545 }
1546 
1547 static int
1548 emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1549     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1550 {
1551 	int error;
1552 	uint64_t buf;
1553 
1554 	switch (vie->reg & 7) {
1555 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1556 		if (vie->mod == 0x3) {
1557 			/*
1558 			 * SFENCE.  Ignore it; the VM exit provides enough
1559 			 * barriers on its own.
1560 			 */
1561 			error = 0;
1562 		} else {
1563 			/*
1564 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1565 			 * rights.
1566 			 */
1567 			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
1568 		}
1569 		break;
1570 	default:
1571 		error = EINVAL;
1572 		break;
1573 	}
1574 
1575 	return (error);
1576 }
1577 
1578 int
1579 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1580     struct vm_guest_paging *paging, mem_region_read_t memread,
1581     mem_region_write_t memwrite, void *memarg)
1582 {
1583 	int error;
1584 
1585 	if (!vie->decoded)
1586 		return (EINVAL);
1587 
1588 	switch (vie->op.op_type) {
1589 	case VIE_OP_TYPE_GROUP1:
1590 		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1591 		    memwrite, memarg);
1592 		break;
1593 	case VIE_OP_TYPE_POP:
1594 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1595 		    memwrite, memarg);
1596 		break;
1597 	case VIE_OP_TYPE_PUSH:
1598 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1599 		    memwrite, memarg);
1600 		break;
1601 	case VIE_OP_TYPE_CMP:
1602 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1603 				    memread, memwrite, memarg);
1604 		break;
1605 	case VIE_OP_TYPE_MOV:
1606 		error = emulate_mov(vm, vcpuid, gpa, vie,
1607 				    memread, memwrite, memarg);
1608 		break;
1609 	case VIE_OP_TYPE_MOVSX:
1610 	case VIE_OP_TYPE_MOVZX:
1611 		error = emulate_movx(vm, vcpuid, gpa, vie,
1612 				     memread, memwrite, memarg);
1613 		break;
1614 	case VIE_OP_TYPE_MOVS:
1615 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1616 		    memwrite, memarg);
1617 		break;
1618 	case VIE_OP_TYPE_STOS:
1619 		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1620 		    memwrite, memarg);
1621 		break;
1622 	case VIE_OP_TYPE_AND:
1623 		error = emulate_and(vm, vcpuid, gpa, vie,
1624 				    memread, memwrite, memarg);
1625 		break;
1626 	case VIE_OP_TYPE_OR:
1627 		error = emulate_or(vm, vcpuid, gpa, vie,
1628 				    memread, memwrite, memarg);
1629 		break;
1630 	case VIE_OP_TYPE_SUB:
1631 		error = emulate_sub(vm, vcpuid, gpa, vie,
1632 				    memread, memwrite, memarg);
1633 		break;
1634 	case VIE_OP_TYPE_BITTEST:
1635 		error = emulate_bittest(vm, vcpuid, gpa, vie,
1636 		    memread, memwrite, memarg);
1637 		break;
1638 	case VIE_OP_TYPE_TWOB_GRP15:
1639 		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
1640 		    memread, memwrite, memarg);
1641 		break;
1642 	case VIE_OP_TYPE_ADD:
1643 		error = emulate_add(vm, vcpuid, gpa, vie, memread,
1644 		    memwrite, memarg);
1645 		break;
1646 	default:
1647 		error = EINVAL;
1648 		break;
1649 	}
1650 
1651 	return (error);
1652 }
1653 
1654 int
1655 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1656 {
1657 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1658 	    ("%s: invalid size %d", __func__, size));
1659 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1660 
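	/*
	 * An alignment-check fault is only possible for an access made at
	 * CPL 3 with both CR0.AM and RFLAGS.AC set; in that case, e.g., a
	 * 4-byte access at an address that is not 4-byte aligned returns 1.
	 */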
1661 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1662 		return (0);
1663 
1664 	return ((gla & (size - 1)) ? 1 : 0);
1665 }
1666 
1667 int
1668 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1669 {
1670 	uint64_t mask;
1671 
1672 	if (cpu_mode != CPU_MODE_64BIT)
1673 		return (0);
1674 
1675 	/*
1676 	 * The value of bit 47 in the 'gla' should be replicated in the
1677 	 * most significant 16 bits.
1678 	 */
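	/*
	 * For example, 0xffff800000000000 and 0x00007fffffffffff are
	 * canonical, while 0x0000800000000000 (bit 47 set but the upper
	 * 16 bits clear) is not.
	 */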
1679 	mask = ~((1UL << 48) - 1);
1680 	if (gla & (1UL << 47))
1681 		return ((gla & mask) != mask);
1682 	else
1683 		return ((gla & mask) != 0);
1684 }
1685 
1686 uint64_t
1687 vie_size2mask(int size)
1688 {
1689 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1690 	    ("vie_size2mask: invalid size %d", size));
1691 	return (size2mask[size]);
1692 }
1693 
1694 int
1695 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1696     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1697     int prot, uint64_t *gla)
1698 {
1699 	uint64_t firstoff, low_limit, high_limit, segbase;
1700 	int glasize, type;
1701 
1702 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1703 	    ("%s: invalid segment %d", __func__, seg));
1704 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1705 	    ("%s: invalid operand size %d", __func__, length));
1706 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1707 	    ("%s: invalid prot %#x", __func__, prot));
1708 
1709 	firstoff = offset;
1710 	if (cpu_mode == CPU_MODE_64BIT) {
1711 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1712 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1713 		glasize = 8;
1714 	} else {
1715 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1716 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1717 		glasize = 4;
1718 		/*
1719 		 * If the segment register is loaded with a NULL selector
1720 		 * then the descriptor is unusable and attempting to use
1721 		 * it results in a #GP(0).
1722 		 */
1723 		if (SEG_DESC_UNUSABLE(desc->access))
1724 			return (-1);
1725 
1726 		/*
1727 		 * The processor generates a #NP exception when a segment
1728 		 * register is loaded with a selector that points to a
1729 		 * descriptor that is not present. If this was the case then
1730 		 * it would have been checked before the VM-exit.
1731 		 */
1732 		KASSERT(SEG_DESC_PRESENT(desc->access),
1733 		    ("segment %d not present: %#x", seg, desc->access));
1734 
1735 		/*
1736 		 * The descriptor type must indicate a code/data segment.
1737 		 */
1738 		type = SEG_DESC_TYPE(desc->access);
1739 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1740 		    "descriptor type %#x", seg, type));
1741 
1742 		if (prot & PROT_READ) {
1743 			/* #GP on a read access to an exec-only code segment */
1744 			if ((type & 0xA) == 0x8)
1745 				return (-1);
1746 		}
1747 
1748 		if (prot & PROT_WRITE) {
1749 			/*
1750 			 * #GP on a write access to a code segment or a
1751 			 * read-only data segment.
1752 			 */
1753 			if (type & 0x8)			/* code segment */
1754 				return (-1);
1755 
1756 			if ((type & 0xA) == 0)		/* read-only data seg */
1757 				return (-1);
1758 		}
1759 
1760 		/*
1761 		 * 'desc->limit' is fully expanded taking granularity into
1762 		 * account.
1763 		 */
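		/*
		 * For example, an expand-down data segment with a limit of
		 * 0xfff and the D/B bit set accepts offsets 0x1000 through
		 * 0xffffffff, whereas an expand-up segment with the same
		 * limit accepts offsets 0 through 0xfff.
		 */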
1764 		if ((type & 0xC) == 0x4) {
1765 			/* expand-down data segment */
1766 			low_limit = desc->limit + 1;
1767 			high_limit = SEG_DESC_DEF32(desc->access) ?
1768 			    0xffffffff : 0xffff;
1769 		} else {
1770 			/* code segment or expand-up data segment */
1771 			low_limit = 0;
1772 			high_limit = desc->limit;
1773 		}
1774 
1775 		while (length > 0) {
1776 			offset &= vie_size2mask(addrsize);
1777 			if (offset < low_limit || offset > high_limit)
1778 				return (-1);
1779 			offset++;
1780 			length--;
1781 		}
1782 	}
1783 
1784 	/*
1785 	 * In 64-bit mode all segments except %fs and %gs have a segment
1786 	 * base address of 0.
1787 	 */
1788 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1789 	    seg != VM_REG_GUEST_GS) {
1790 		segbase = 0;
1791 	} else {
1792 		segbase = desc->base;
1793 	}
1794 
1795 	/*
1796 	 * Truncate 'firstoff' to the effective address size before adding
1797 	 * it to the segment base.
1798 	 */
1799 	firstoff &= vie_size2mask(addrsize);
1800 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1801 	return (0);
1802 }
1803 
1804 #ifdef _KERNEL
1805 void
1806 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1807 {
1808 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1809 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1810 
1811 	bzero(vie, sizeof(struct vie));
1812 
1813 	vie->base_register = VM_REG_LAST;
1814 	vie->index_register = VM_REG_LAST;
1815 	vie->segment_register = VM_REG_LAST;
1816 
1817 	if (inst_length) {
1818 		bcopy(inst_bytes, vie->inst, inst_length);
1819 		vie->num_valid = inst_length;
1820 	}
1821 }
1822 
1823 static int
1824 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1825 {
1826 	int error_code = 0;
1827 
1828 	if (pte & PG_V)
1829 		error_code |= PGEX_P;
1830 	if (prot & VM_PROT_WRITE)
1831 		error_code |= PGEX_W;
1832 	if (usermode)
1833 		error_code |= PGEX_U;
1834 	if (rsvd)
1835 		error_code |= PGEX_RSV;
1836 	if (prot & VM_PROT_EXECUTE)
1837 		error_code |= PGEX_I;
1838 
1839 	return (error_code);
1840 }
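/*
 * For example, a user-mode write that faults on a present, read-only page
 * produces an error code of PGEX_P | PGEX_W | PGEX_U.
 */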
1841 
1842 static void
1843 ptp_release(void **cookie)
1844 {
1845 	if (*cookie != NULL) {
1846 		vm_gpa_release(*cookie);
1847 		*cookie = NULL;
1848 	}
1849 }
1850 
1851 static void *
1852 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
1853 {
1854 	void *ptr;
1855 
1856 	ptp_release(cookie);
1857 	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
1858 	return (ptr);
1859 }
1860 
1861 static int
1862 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1863     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
1864 {
1865 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1866 	u_int retries;
1867 	uint64_t *ptpbase, ptpphys, pte, pgsize;
1868 	uint32_t *ptpbase32, pte32;
1869 	void *cookie;
1870 
1871 	*guest_fault = 0;
1872 
1873 	usermode = (paging->cpl == 3 ? 1 : 0);
1874 	writable = prot & VM_PROT_WRITE;
1875 	cookie = NULL;
1876 	retval = 0;
1877 	retries = 0;
1878 restart:
1879 	ptpphys = paging->cr3;		/* root of the page tables */
1880 	ptp_release(&cookie);
1881 	if (retries++ > 0)
1882 		maybe_yield();
1883 
1884 	if (vie_canonical_check(paging->cpu_mode, gla)) {
1885 		/*
1886 		 * XXX assuming a non-stack reference; otherwise a stack fault
1887 		 * should be generated.
1888 		 */
1889 		if (!check_only)
1890 			vm_inject_gp(vm, vcpuid);
1891 		goto fault;
1892 	}
1893 
1894 	if (paging->paging_mode == PAGING_MODE_FLAT) {
1895 		*gpa = gla;
1896 		goto done;
1897 	}
1898 
1899 	if (paging->paging_mode == PAGING_MODE_32) {
1900 		nlevels = 2;
1901 		while (--nlevels >= 0) {
1902 			/* Zero out the lower 12 bits. */
1903 			ptpphys &= ~0xfff;
1904 
1905 			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
1906 			    &cookie);
1907 
1908 			if (ptpbase32 == NULL)
1909 				goto error;
1910 
1911 			ptpshift = PAGE_SHIFT + nlevels * 10;
1912 			ptpindex = (gla >> ptpshift) & 0x3FF;
1913 			pgsize = 1UL << ptpshift;
1914 
1915 			pte32 = ptpbase32[ptpindex];
1916 
1917 			if ((pte32 & PG_V) == 0 ||
1918 			    (usermode && (pte32 & PG_U) == 0) ||
1919 			    (writable && (pte32 & PG_RW) == 0)) {
1920 				if (!check_only) {
1921 					pfcode = pf_error_code(usermode, prot, 0,
1922 					    pte32);
1923 					vm_inject_pf(vm, vcpuid, pfcode, gla);
1924 				}
1925 				goto fault;
1926 			}
1927 
1928 			/*
1929 			 * Emulate the x86 MMU's management of the accessed
1930 			 * and dirty flags. While the accessed flag is set
1931 			 * at every level of the page table, the dirty flag
1932 			 * is only set at the last level providing the guest
1933 			 * physical address.
1934 			 */
1935 			if (!check_only && (pte32 & PG_A) == 0) {
1936 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
1937 				    pte32, pte32 | PG_A) == 0) {
1938 					goto restart;
1939 				}
1940 			}
1941 
1942 			/* XXX must be ignored if CR4.PSE=0 */
1943 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
1944 				break;
1945 
1946 			ptpphys = pte32;
1947 		}
1948 
1949 		/* Set the dirty bit in the page table entry if necessary */
1950 		if (!check_only && writable && (pte32 & PG_M) == 0) {
1951 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
1952 			    pte32, pte32 | PG_M) == 0) {
1953 				goto restart;
1954 			}
1955 		}
1956 
1957 		/* Zero out the lower 'ptpshift' bits */
1958 		pte32 >>= ptpshift; pte32 <<= ptpshift;
1959 		*gpa = pte32 | (gla & (pgsize - 1));
1960 		goto done;
1961 	}
1962 
1963 	if (paging->paging_mode == PAGING_MODE_PAE) {
1964 		/* Zero out the lower 5 bits and the upper 32 bits */
1965 		ptpphys &= 0xffffffe0UL;
1966 
1967 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
1968 		    &cookie);
1969 		if (ptpbase == NULL)
1970 			goto error;
1971 
1972 		ptpindex = (gla >> 30) & 0x3;
1973 
1974 		pte = ptpbase[ptpindex];
1975 
1976 		if ((pte & PG_V) == 0) {
1977 			if (!check_only) {
1978 				pfcode = pf_error_code(usermode, prot, 0, pte);
1979 				vm_inject_pf(vm, vcpuid, pfcode, gla);
1980 			}
1981 			goto fault;
1982 		}
1983 
1984 		ptpphys = pte;
1985 
1986 		nlevels = 2;
1987 	} else
1988 		nlevels = 4;
1989 	while (--nlevels >= 0) {
1990 		/* Zero out the lower 12 bits and the upper 12 bits */
1991 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
1992 
1993 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
1994 		if (ptpbase == NULL)
1995 			goto error;
1996 
1997 		ptpshift = PAGE_SHIFT + nlevels * 9;
1998 		ptpindex = (gla >> ptpshift) & 0x1FF;
1999 		pgsize = 1UL << ptpshift;
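		/*
		 * Each remaining level decodes 9 bits of the linear address:
		 * nlevels=3 gives ptpshift=39 (PML4E), 2 gives 30 (PDPTE, a
		 * 1GB page when PG_PS is set), 1 gives 21 (PDE, a 2MB page)
		 * and 0 gives 12 (a 4KB PTE).  The PAE walk enters this loop
		 * with nlevels=2, after the PDPTE lookup above.
		 */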
2000 
2001 		pte = ptpbase[ptpindex];
2002 
2003 		if ((pte & PG_V) == 0 ||
2004 		    (usermode && (pte & PG_U) == 0) ||
2005 		    (writable && (pte & PG_RW) == 0)) {
2006 			if (!check_only) {
2007 				pfcode = pf_error_code(usermode, prot, 0, pte);
2008 				vm_inject_pf(vm, vcpuid, pfcode, gla);
2009 			}
2010 			goto fault;
2011 		}
2012 
2013 		/* Set the accessed bit in the page table entry */
2014 		if (!check_only && (pte & PG_A) == 0) {
2015 			if (atomic_cmpset_64(&ptpbase[ptpindex],
2016 			    pte, pte | PG_A) == 0) {
2017 				goto restart;
2018 			}
2019 		}
2020 
2021 		if (nlevels > 0 && (pte & PG_PS) != 0) {
2022 			if (pgsize > 1 * GB) {
2023 				if (!check_only) {
2024 					pfcode = pf_error_code(usermode, prot, 1,
2025 					    pte);
2026 					vm_inject_pf(vm, vcpuid, pfcode, gla);
2027 				}
2028 				goto fault;
2029 			}
2030 			break;
2031 		}
2032 
2033 		ptpphys = pte;
2034 	}
2035 
2036 	/* Set the dirty bit in the page table entry if necessary */
2037 	if (!check_only && writable && (pte & PG_M) == 0) {
2038 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2039 			goto restart;
2040 	}
2041 
2042 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2043 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2044 	*gpa = pte | (gla & (pgsize - 1));
2045 done:
2046 	ptp_release(&cookie);
2047 	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2048 	    __func__, retval));
2049 	return (retval);
2050 error:
2051 	retval = EFAULT;
2052 	goto done;
2053 fault:
2054 	*guest_fault = 1;
2055 	goto done;
2056 }
2057 
2058 int
2059 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2060     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2061 {
2062 
2063 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
2064 	    false));
2065 }
2066 
2067 int
2068 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2069     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2070 {
2071 
2072 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
2073 	    true));
2074 }
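
/*
 * Illustrative caller-side sketch (hypothetical variable names): the
 * 'nofault' variant performs the same walk in check_only mode, so it never
 * injects an exception and never touches the accessed/dirty bits.
 *
 *	uint64_t gpa;
 *	int error, fault;
 *
 *	error = vm_gla2gpa(vm, vcpuid, &paging, gla, VM_PROT_READ, &gpa,
 *	    &fault);
 *	if (error)
 *		return (error);		(page table walk failed)
 *	if (fault)
 *		return (0);		(exception injected; resume the guest)
 */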
2075 
2076 int
2077 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2078     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2079 {
2080 	struct vm_copyinfo copyinfo[2];
2081 	int error, prot;
2082 
2083 	if (inst_length > VIE_INST_SIZE)
2084 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
2085 
2086 	prot = PROT_READ | PROT_EXEC;
2087 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
2088 	    copyinfo, nitems(copyinfo), faultptr);
2089 	if (error || *faultptr)
2090 		return (error);
2091 
2092 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
2093 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2094 	vie->num_valid = inst_length;
2095 	return (0);
2096 }
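
/*
 * A minimal sketch of the fetch/decode sequence (hypothetical kernel-side
 * caller): fetch the bytes at the guest %rip and decode them before
 * emulating the access that caused the nested page table fault.
 *
 *	struct vie vie;
 *	int error, fault;
 *
 *	vie_init(&vie, NULL, 0);
 *	error = vmm_fetch_instruction(vm, vcpuid, &paging, rip, inst_length,
 *	    &vie, &fault);
 *	if (error == 0 && !fault)
 *		error = vmm_decode_instruction(vm, vcpuid, gla, cpu_mode,
 *		    cs_d, &vie);
 */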
2097 
2098 static int
2099 vie_peek(struct vie *vie, uint8_t *x)
2100 {
2101 
2102 	if (vie->num_processed < vie->num_valid) {
2103 		*x = vie->inst[vie->num_processed];
2104 		return (0);
2105 	} else
2106 		return (-1);
2107 }
2108 
2109 static void
2110 vie_advance(struct vie *vie)
2111 {
2112 
2113 	vie->num_processed++;
2114 }
2115 
2116 static bool
2117 segment_override(uint8_t x, int *seg)
2118 {
2119 
2120 	switch (x) {
2121 	case 0x2E:
2122 		*seg = VM_REG_GUEST_CS;
2123 		break;
2124 	case 0x36:
2125 		*seg = VM_REG_GUEST_SS;
2126 		break;
2127 	case 0x3E:
2128 		*seg = VM_REG_GUEST_DS;
2129 		break;
2130 	case 0x26:
2131 		*seg = VM_REG_GUEST_ES;
2132 		break;
2133 	case 0x64:
2134 		*seg = VM_REG_GUEST_FS;
2135 		break;
2136 	case 0x65:
2137 		*seg = VM_REG_GUEST_GS;
2138 		break;
2139 	default:
2140 		return (false);
2141 	}
2142 	return (true);
2143 }
2144 
2145 static int
2146 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2147 {
2148 	uint8_t x;
2149 
2150 	while (1) {
2151 		if (vie_peek(vie, &x))
2152 			return (-1);
2153 
2154 		if (x == 0x66)
2155 			vie->opsize_override = 1;
2156 		else if (x == 0x67)
2157 			vie->addrsize_override = 1;
2158 		else if (x == 0xF3)
2159 			vie->repz_present = 1;
2160 		else if (x == 0xF2)
2161 			vie->repnz_present = 1;
2162 		else if (segment_override(x, &vie->segment_register))
2163 			vie->segment_override = 1;
2164 		else
2165 			break;
2166 
2167 		vie_advance(vie);
2168 	}
2169 
2170 	/*
2171 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2172 	 * - Only one REX prefix is allowed per instruction.
2173 	 * - The REX prefix must immediately precede the opcode byte or the
2174 	 *   escape opcode byte.
2175 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2176 	 *   the mandatory prefix must come before the REX prefix.
2177 	 */
2178 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2179 		vie->rex_present = 1;
2180 		vie->rex_w = x & 0x8 ? 1 : 0;
2181 		vie->rex_r = x & 0x4 ? 1 : 0;
2182 		vie->rex_x = x & 0x2 ? 1 : 0;
2183 		vie->rex_b = x & 0x1 ? 1 : 0;
2184 		vie_advance(vie);
2185 	}
2186 
2187 	/*
2188 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2189 	 */
2190 	if (cpu_mode == CPU_MODE_64BIT) {
2191 		/*
2192 		 * Default address size is 64-bits and default operand size
2193 		 * is 32-bits.
2194 		 */
2195 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2196 		if (vie->rex_w)
2197 			vie->opsize = 8;
2198 		else if (vie->opsize_override)
2199 			vie->opsize = 2;
2200 		else
2201 			vie->opsize = 4;
2202 	} else if (cs_d) {
2203 		/* Default address and operand sizes are 32-bits */
2204 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2205 		vie->opsize = vie->opsize_override ? 2 : 4;
2206 	} else {
2207 		/* Default address and operand sizes are 16-bits */
2208 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2209 		vie->opsize = vie->opsize_override ? 4 : 2;
2210 	}
2211 	return (0);
2212 }
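
/*
 * For example, in 64-bit mode the prefix byte 0x66 yields opsize=2 with
 * addrsize=8, a REX.W prefix (0x48-0x4F) yields opsize=8, and with no
 * prefixes at all the defaults are opsize=4 and addrsize=8.  In 32-bit
 * protected mode with CS.D=1 the defaults are opsize=addrsize=4.
 */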
2213 
2214 static int
2215 decode_two_byte_opcode(struct vie *vie)
2216 {
2217 	uint8_t x;
2218 
2219 	if (vie_peek(vie, &x))
2220 		return (-1);
2221 
2222 	vie->op = two_byte_opcodes[x];
2223 
2224 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2225 		return (-1);
2226 
2227 	vie_advance(vie);
2228 	return (0);
2229 }
2230 
2231 static int
2232 decode_opcode(struct vie *vie)
2233 {
2234 	uint8_t x;
2235 
2236 	if (vie_peek(vie, &x))
2237 		return (-1);
2238 
2239 	vie->op = one_byte_opcodes[x];
2240 
2241 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2242 		return (-1);
2243 
2244 	vie_advance(vie);
2245 
2246 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2247 		return (decode_two_byte_opcode(vie));
2248 
2249 	return (0);
2250 }
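
/*
 * A 0x0F escape byte re-dispatches through two_byte_opcodes[]; e.g. the
 * sequence 0x0f 0xb6 (movzx r, r/m8) is looked up there rather than in
 * one_byte_opcodes[].
 */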
2251 
2252 static int
2253 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2254 {
2255 	uint8_t x;
2256 
2257 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2258 		return (0);
2259 
2260 	if (cpu_mode == CPU_MODE_REAL)
2261 		return (-1);
2262 
2263 	if (vie_peek(vie, &x))
2264 		return (-1);
2265 
2266 	vie->mod = (x >> 6) & 0x3;
2267 	vie->rm =  (x >> 0) & 0x7;
2268 	vie->reg = (x >> 3) & 0x7;
2269 
2270 	/*
2271 	 * A direct addressing mode makes no sense in the context of an EPT
2272 	 * fault. There has to be a memory access involved to cause the
2273 	 * EPT fault.
2274 	 */
2275 	if (vie->mod == VIE_MOD_DIRECT)
2276 		return (-1);
2277 
2278 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2279 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2280 		/*
2281 		 * Table 2-5: Special Cases of REX Encodings
2282 		 *
2283 		 * mod=0, r/m=5 is used in the compatibility mode to
2284 		 * indicate a disp32 without a base register.
2285 		 *
2286 		 * mod!=3, r/m=4 is used in the compatibility mode to
2287 		 * indicate that the SIB byte is present.
2288 		 *
2289 		 * The 'b' bit in the REX prefix is a don't-care in
2290 		 * this case.
2291 		 */
2292 	} else {
2293 		vie->rm |= (vie->rex_b << 3);
2294 	}
2295 
2296 	vie->reg |= (vie->rex_r << 3);
2297 
2298 	/* SIB */
2299 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2300 		goto done;
2301 
2302 	vie->base_register = gpr_map[vie->rm];
2303 
2304 	switch (vie->mod) {
2305 	case VIE_MOD_INDIRECT_DISP8:
2306 		vie->disp_bytes = 1;
2307 		break;
2308 	case VIE_MOD_INDIRECT_DISP32:
2309 		vie->disp_bytes = 4;
2310 		break;
2311 	case VIE_MOD_INDIRECT:
2312 		if (vie->rm == VIE_RM_DISP32) {
2313 			vie->disp_bytes = 4;
2314 			/*
2315 			 * Table 2-7. RIP-Relative Addressing
2316 			 *
2317 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2318 			 * whereas in compatibility mode it just implies disp32.
2319 			 */
2320 
2321 			if (cpu_mode == CPU_MODE_64BIT)
2322 				vie->base_register = VM_REG_GUEST_RIP;
2323 			else
2324 				vie->base_register = VM_REG_LAST;
2325 		}
2326 		break;
2327 	}
2328 
2329 done:
2330 	vie_advance(vie);
2331 
2332 	return (0);
2333 }
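
/*
 * For example, the ModRM byte 0x48 decodes as mod=1, reg=1, rm=0: an
 * indirect access through %rax (assuming no REX.B) with an 8-bit
 * displacement (disp_bytes=1).  The mod=0/rm=5 encoding instead selects
 * RIP-relative disp32 addressing in 64-bit mode, as handled above.
 */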
2334 
2335 static int
2336 decode_sib(struct vie *vie)
2337 {
2338 	uint8_t x;
2339 
2340 	/* Proceed only if SIB byte is present */
2341 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2342 		return (0);
2343 
2344 	if (vie_peek(vie, &x))
2345 		return (-1);
2346 
2347 	/* De-construct the SIB byte */
2348 	vie->ss = (x >> 6) & 0x3;
2349 	vie->index = (x >> 3) & 0x7;
2350 	vie->base = (x >> 0) & 0x7;
2351 
2352 	/* Apply the REX prefix modifiers */
2353 	vie->index |= vie->rex_x << 3;
2354 	vie->base |= vie->rex_b << 3;
2355 
2356 	switch (vie->mod) {
2357 	case VIE_MOD_INDIRECT_DISP8:
2358 		vie->disp_bytes = 1;
2359 		break;
2360 	case VIE_MOD_INDIRECT_DISP32:
2361 		vie->disp_bytes = 4;
2362 		break;
2363 	}
2364 
2365 	if (vie->mod == VIE_MOD_INDIRECT &&
2366 	    (vie->base == 5 || vie->base == 13)) {
2367 		/*
2368 		 * Special case: the base register is unused if mod = 0
2369 		 * and base = %rbp or %r13.
2370 		 *
2371 		 * Documented in:
2372 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2373 		 * Table 2-5: Special Cases of REX Encodings
2374 		 */
2375 		vie->disp_bytes = 4;
2376 	} else {
2377 		vie->base_register = gpr_map[vie->base];
2378 	}
2379 
2380 	/*
2381 	 * All encodings of 'index' are valid except for %rsp (4).
2382 	 *
2383 	 * Documented in:
2384 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2385 	 * Table 2-5: Special Cases of REX Encodings
2386 	 */
2387 	if (vie->index != 4)
2388 		vie->index_register = gpr_map[vie->index];
2389 
2390 	/* 'scale' makes sense only in the context of an index register */
2391 	if (vie->index_register < VM_REG_LAST)
2392 		vie->scale = 1 << vie->ss;
2393 
2394 	vie_advance(vie);
2395 
2396 	return (0);
2397 }
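
/*
 * For example, the SIB byte 0x88 decodes as ss=2, index=1, base=0: a base
 * of %rax with %rcx as the index, scaled by 4 (assuming no REX.X/REX.B).
 * An index field of 4 means "no index register", so no scale is applied.
 */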
2398 
2399 static int
2400 decode_displacement(struct vie *vie)
2401 {
2402 	int n, i;
2403 	uint8_t x;
2404 
2405 	union {
2406 		char	buf[4];
2407 		int8_t	signed8;
2408 		int32_t	signed32;
2409 	} u;
2410 
2411 	if ((n = vie->disp_bytes) == 0)
2412 		return (0);
2413 
2414 	if (n != 1 && n != 4)
2415 		panic("decode_displacement: invalid disp_bytes %d", n);
2416 
2417 	for (i = 0; i < n; i++) {
2418 		if (vie_peek(vie, &x))
2419 			return (-1);
2420 
2421 		u.buf[i] = x;
2422 		vie_advance(vie);
2423 	}
2424 
2425 	if (n == 1)
2426 		vie->displacement = u.signed8;		/* sign-extended */
2427 	else
2428 		vie->displacement = u.signed32;		/* sign-extended */
2429 
2430 	return (0);
2431 }
2432 
2433 static int
2434 decode_immediate(struct vie *vie)
2435 {
2436 	int i, n;
2437 	uint8_t x;
2438 	union {
2439 		char	buf[4];
2440 		int8_t	signed8;
2441 		int16_t	signed16;
2442 		int32_t	signed32;
2443 	} u;
2444 
2445 	/* Figure out immediate operand size (if any) */
2446 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2447 		/*
2448 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2449 		 * In 64-bit mode the typical size of immediate operands
2450 		 * remains 32-bits. When the operand size is 64-bits, the
2451 		 * processor sign-extends all immediates to 64-bits prior
2452 		 * to their use.
2453 		 */
2454 		if (vie->opsize == 4 || vie->opsize == 8)
2455 			vie->imm_bytes = 4;
2456 		else
2457 			vie->imm_bytes = 2;
2458 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2459 		vie->imm_bytes = 1;
2460 	}
2461 
2462 	if ((n = vie->imm_bytes) == 0)
2463 		return (0);
2464 
2465 	KASSERT(n == 1 || n == 2 || n == 4,
2466 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2467 
2468 	for (i = 0; i < n; i++) {
2469 		if (vie_peek(vie, &x))
2470 			return (-1);
2471 
2472 		u.buf[i] = x;
2473 		vie_advance(vie);
2474 	}
2475 
2476 	/* sign-extend the immediate value before use */
2477 	if (n == 1)
2478 		vie->immediate = u.signed8;
2479 	else if (n == 2)
2480 		vie->immediate = u.signed16;
2481 	else
2482 		vie->immediate = u.signed32;
2483 
2484 	return (0);
2485 }
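
/*
 * Note that even with a 64-bit operand size only 4 immediate bytes are
 * consumed here; the sign extension into 'immediate' supplies the upper
 * 32 bits, matching the processor behavior described above.
 */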
2486 
2487 static int
2488 decode_moffset(struct vie *vie)
2489 {
2490 	int i, n;
2491 	uint8_t x;
2492 	union {
2493 		char	buf[8];
2494 		uint64_t u64;
2495 	} u;
2496 
2497 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2498 		return (0);
2499 
2500 	/*
2501 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2502 	 * The memory offset size follows the address-size of the instruction.
2503 	 */
2504 	n = vie->addrsize;
2505 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2506 
2507 	u.u64 = 0;
2508 	for (i = 0; i < n; i++) {
2509 		if (vie_peek(vie, &x))
2510 			return (-1);
2511 
2512 		u.buf[i] = x;
2513 		vie_advance(vie);
2514 	}
2515 	vie->displacement = u.u64;
2516 	return (0);
2517 }
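
/*
 * The moffset is an absolute address rather than a register-relative
 * displacement, so it is stored in 'displacement', typically leaving
 * base_register and index_register at their VM_REG_LAST defaults; e.g. a
 * MOV with a 64-bit address size consumes 8 address bytes here.
 */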
2518 
2519 /*
2520  * Verify that the 'guest linear address' provided as collateral of the nested
2521  * page table fault matches our instruction decoding.
2522  */
2523 static int
2524 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
2525     enum vm_cpu_mode cpu_mode)
2526 {
2527 	int error;
2528 	uint64_t base, segbase, idx, gla2;
2529 	enum vm_reg_name seg;
2530 	struct seg_desc desc;
2531 
2532 	/* Skip 'gla' verification */
2533 	if (gla == VIE_INVALID_GLA)
2534 		return (0);
2535 
2536 	base = 0;
2537 	if (vie->base_register != VM_REG_LAST) {
2538 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2539 		if (error) {
2540 			printf("verify_gla: error %d getting base reg %d\n",
2541 				error, vie->base_register);
2542 			return (-1);
2543 		}
2544 
2545 		/*
2546 		 * RIP-relative addressing starts from the following
2547 		 * instruction
2548 		 */
2549 		if (vie->base_register == VM_REG_GUEST_RIP)
2550 			base += vie->num_processed;
2551 	}
2552 
2553 	idx = 0;
2554 	if (vie->index_register != VM_REG_LAST) {
2555 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2556 		if (error) {
2557 			printf("verify_gla: error %d getting index reg %d\n",
2558 				error, vie->index_register);
2559 			return (-1);
2560 		}
2561 	}
2562 
2563 	/*
2564 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2565 	 *
2566 	 * In 64-bit mode, segmentation is generally (but not
2567 	 * completely) disabled.  The exceptions are the FS and GS
2568 	 * segments.
2569 	 *
2570 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2571 	 * as the base, the SS segment is the default segment.  For
2572 	 * other data references, except those relative to the stack or
2573 	 * a string destination, the DS segment is the default.  These
2574 	 * can be overridden to allow other segments to be accessed.
2575 	 */
2576 	if (vie->segment_override)
2577 		seg = vie->segment_register;
2578 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2579 	    vie->base_register == VM_REG_GUEST_RBP)
2580 		seg = VM_REG_GUEST_SS;
2581 	else
2582 		seg = VM_REG_GUEST_DS;
2583 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2584 	    seg != VM_REG_GUEST_GS) {
2585 		segbase = 0;
2586 	} else {
2587 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
2588 		if (error) {
2589 			printf("verify_gla: error %d getting segment"
2590 			       " descriptor %d\n", error,
2591 			       seg);
2592 			return (-1);
2593 		}
2594 		segbase = desc.base;
2595 	}
2596 
2597 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2598 	gla2 &= size2mask[vie->addrsize];
2599 	if (gla != gla2) {
2600 		printf("verify_gla mismatch: segbase(0x%0lx), "
2601 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2602 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2603 		       segbase, base, vie->scale, idx, vie->displacement,
2604 		       gla, gla2);
2605 		return (-1);
2606 	}
2607 
2608 	return (0);
2609 }
2610 
2611 int
2612 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2613 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2614 {
2615 
2616 	if (decode_prefixes(vie, cpu_mode, cs_d))
2617 		return (-1);
2618 
2619 	if (decode_opcode(vie))
2620 		return (-1);
2621 
2622 	if (decode_modrm(vie, cpu_mode))
2623 		return (-1);
2624 
2625 	if (decode_sib(vie))
2626 		return (-1);
2627 
2628 	if (decode_displacement(vie))
2629 		return (-1);
2630 
2631 	if (decode_immediate(vie))
2632 		return (-1);
2633 
2634 	if (decode_moffset(vie))
2635 		return (-1);
2636 
2637 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2638 		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
2639 			return (-1);
2640 	}
2641 
2642 	vie->decoded = 1;	/* success */
2643 
2644 	return (0);
2645 }
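
/*
 * Illustrative example: in 64-bit mode the byte sequence 0x88 0x0a (mov
 * %cl,(%rdx)) decodes with no prefixes, opcode 0x88, ModRM 0x0a (mod=0,
 * reg=1, rm=2), no SIB byte, no displacement and no immediate; verify_gla()
 * would then check the faulting linear address against the value of %rdx
 * (the DS segment base is treated as 0 in 64-bit mode).
 */
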
2646 #endif	/* _KERNEL */
2647