xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 5c1d97100348ef19878fa14671a9b70f3d963ed4)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef _KERNEL
36 #include <sys/param.h>
37 #include <sys/pcpu.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/vmparam.h>
45 #include <machine/vmm.h>
46 #else	/* !_KERNEL */
47 #include <sys/types.h>
48 #include <sys/errno.h>
49 #include <sys/_iovec.h>
50 
51 #include <machine/vmm.h>
52 
53 #include <assert.h>
54 #include <vmmapi.h>
55 #define	KASSERT(exp,msg)	assert((exp))
56 #endif	/* _KERNEL */
57 
58 #include <machine/vmm_instruction_emul.h>
59 #include <x86/psl.h>
60 #include <x86/specialreg.h>
61 
/*
 * struct vie_op.op_type
 *
 * Tags each entry of the opcode tables in this file with the kind of
 * operation it encodes.  VIE_OP_TYPE_NONE is 0, so table slots that have no
 * explicit initializer (and are therefore zero-filled) carry it.
 */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,	/* used by the 0x0F entry of one_byte_opcodes */
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_LAST	/* keep last; presumably a sentinel/count */
};
84 
/*
 * struct vie_op.op_flags
 *
 * Bit flags that are OR-ed together in the opcode table entries.
 */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
/*
 * Set on the moffset MOV (0xA1/0xA3) and the MOVS/STOS table entries;
 * presumably means that no ModRM byte follows the opcode -- confirm against
 * the decoder.
 */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
/*
 * Set only on the MOVS/STOS table entries.
 * NOTE(review): the name suggests the guest linear address check is skipped
 * for these opcodes; the consumer of this flag is not visible here.
 */
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
91 
92 static const struct vie_op two_byte_opcodes[256] = {
93 	[0xAE] = {
94 		  .op_byte = 0xAE,
95 		  .op_type = VIE_OP_TYPE_TWOB_GRP15,
96 	},
97 	[0xB6] = {
98 		.op_byte = 0xB6,
99 		.op_type = VIE_OP_TYPE_MOVZX,
100 	},
101 	[0xB7] = {
102 		.op_byte = 0xB7,
103 		.op_type = VIE_OP_TYPE_MOVZX,
104 	},
105 	[0xBA] = {
106 		.op_byte = 0xBA,
107 		.op_type = VIE_OP_TYPE_BITTEST,
108 		.op_flags = VIE_OP_F_IMM8,
109 	},
110 	[0xBE] = {
111 		.op_byte = 0xBE,
112 		.op_type = VIE_OP_TYPE_MOVSX,
113 	},
114 };
115 
116 static const struct vie_op one_byte_opcodes[256] = {
117 	[0x03] = {
118 		.op_byte = 0x03,
119 		.op_type = VIE_OP_TYPE_ADD,
120 	},
121 	[0x0F] = {
122 		.op_byte = 0x0F,
123 		.op_type = VIE_OP_TYPE_TWO_BYTE
124 	},
125 	[0x0B] = {
126 		.op_byte = 0x0B,
127 		.op_type = VIE_OP_TYPE_OR,
128 	},
129 	[0x2B] = {
130 		.op_byte = 0x2B,
131 		.op_type = VIE_OP_TYPE_SUB,
132 	},
133 	[0x39] = {
134 		.op_byte = 0x39,
135 		.op_type = VIE_OP_TYPE_CMP,
136 	},
137 	[0x3B] = {
138 		.op_byte = 0x3B,
139 		.op_type = VIE_OP_TYPE_CMP,
140 	},
141 	[0x88] = {
142 		.op_byte = 0x88,
143 		.op_type = VIE_OP_TYPE_MOV,
144 	},
145 	[0x89] = {
146 		.op_byte = 0x89,
147 		.op_type = VIE_OP_TYPE_MOV,
148 	},
149 	[0x8A] = {
150 		.op_byte = 0x8A,
151 		.op_type = VIE_OP_TYPE_MOV,
152 	},
153 	[0x8B] = {
154 		.op_byte = 0x8B,
155 		.op_type = VIE_OP_TYPE_MOV,
156 	},
157 	[0xA1] = {
158 		.op_byte = 0xA1,
159 		.op_type = VIE_OP_TYPE_MOV,
160 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
161 	},
162 	[0xA3] = {
163 		.op_byte = 0xA3,
164 		.op_type = VIE_OP_TYPE_MOV,
165 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
166 	},
167 	[0xA4] = {
168 		.op_byte = 0xA4,
169 		.op_type = VIE_OP_TYPE_MOVS,
170 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
171 	},
172 	[0xA5] = {
173 		.op_byte = 0xA5,
174 		.op_type = VIE_OP_TYPE_MOVS,
175 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
176 	},
177 	[0xAA] = {
178 		.op_byte = 0xAA,
179 		.op_type = VIE_OP_TYPE_STOS,
180 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
181 	},
182 	[0xAB] = {
183 		.op_byte = 0xAB,
184 		.op_type = VIE_OP_TYPE_STOS,
185 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
186 	},
187 	[0xC6] = {
188 		/* XXX Group 11 extended opcode - not just MOV */
189 		.op_byte = 0xC6,
190 		.op_type = VIE_OP_TYPE_MOV,
191 		.op_flags = VIE_OP_F_IMM8,
192 	},
193 	[0xC7] = {
194 		.op_byte = 0xC7,
195 		.op_type = VIE_OP_TYPE_MOV,
196 		.op_flags = VIE_OP_F_IMM,
197 	},
198 	[0x23] = {
199 		.op_byte = 0x23,
200 		.op_type = VIE_OP_TYPE_AND,
201 	},
202 	[0x80] = {
203 		/* Group 1 extended opcode */
204 		.op_byte = 0x80,
205 		.op_type = VIE_OP_TYPE_GROUP1,
206 		.op_flags = VIE_OP_F_IMM8,
207 	},
208 	[0x81] = {
209 		/* Group 1 extended opcode */
210 		.op_byte = 0x81,
211 		.op_type = VIE_OP_TYPE_GROUP1,
212 		.op_flags = VIE_OP_F_IMM,
213 	},
214 	[0x83] = {
215 		/* Group 1 extended opcode */
216 		.op_byte = 0x83,
217 		.op_type = VIE_OP_TYPE_GROUP1,
218 		.op_flags = VIE_OP_F_IMM8,
219 	},
220 	[0x8F] = {
221 		/* XXX Group 1A extended opcode - not just POP */
222 		.op_byte = 0x8F,
223 		.op_type = VIE_OP_TYPE_POP,
224 	},
225 	[0xF7] = {
226 		/* XXX Group 3 extended opcode - not just TEST */
227 		.op_byte = 0xF7,
228 		.op_type = VIE_OP_TYPE_TEST,
229 		.op_flags = VIE_OP_F_IMM,
230 	},
231 	[0xFF] = {
232 		/* XXX Group 5 extended opcode - not just PUSH */
233 		.op_byte = 0xFF,
234 		.op_type = VIE_OP_TYPE_PUSH,
235 	}
236 };
237 
/*
 * struct vie.mod
 *
 * Values of the 2-bit ModRM 'mod' field: register-indirect with no
 * displacement, with an 8-bit displacement, with a 32-bit displacement, or a
 * direct register operand.
 */
#define	VIE_MOD_INDIRECT		0
#define	VIE_MOD_INDIRECT_DISP8		1
#define	VIE_MOD_INDIRECT_DISP32		2
#define	VIE_MOD_DIRECT			3

/*
 * struct vie.rm
 *
 * Special ModRM 'r/m' encodings; presumably 4 selects a SIB byte and 5 a
 * bare 32-bit displacement -- confirm against the decoder.
 */
#define	VIE_RM_SIB			4
#define	VIE_RM_DISP32			5

/* 2^30 (one gibibyte); the product is evaluated as an int. */
#define	GB				(1024 * 1024 * 1024)
249 
250 static enum vm_reg_name gpr_map[16] = {
251 	VM_REG_GUEST_RAX,
252 	VM_REG_GUEST_RCX,
253 	VM_REG_GUEST_RDX,
254 	VM_REG_GUEST_RBX,
255 	VM_REG_GUEST_RSP,
256 	VM_REG_GUEST_RBP,
257 	VM_REG_GUEST_RSI,
258 	VM_REG_GUEST_RDI,
259 	VM_REG_GUEST_R8,
260 	VM_REG_GUEST_R9,
261 	VM_REG_GUEST_R10,
262 	VM_REG_GUEST_R11,
263 	VM_REG_GUEST_R12,
264 	VM_REG_GUEST_R13,
265 	VM_REG_GUEST_R14,
266 	VM_REG_GUEST_R15
267 };
268 
/*
 * Bit mask covering the low 'size' bytes of a 64-bit value, indexed by the
 * operand size in bytes.  Only indices 1, 2, 4 and 8 are meaningful; every
 * other slot is zero.
 */
static uint64_t size2mask[] = {
	[1] = ((uint64_t)1 << 8) - 1,
	[2] = ((uint64_t)1 << 16) - 1,
	[4] = ((uint64_t)1 << 32) - 1,
	[8] = ~(uint64_t)0,
};
275 
276 static int
277 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
278 {
279 	int error;
280 
281 	error = vm_get_register(vm, vcpuid, reg, rval);
282 
283 	return (error);
284 }
285 
286 static void
287 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
288 {
289 	*lhbr = 0;
290 	*reg = gpr_map[vie->reg];
291 
292 	/*
293 	 * 64-bit mode imposes limitations on accessing legacy high byte
294 	 * registers (lhbr).
295 	 *
296 	 * The legacy high-byte registers cannot be addressed if the REX
297 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
298 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
299 	 *
300 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
301 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
302 	 * %ah, %ch, %dh and %bh respectively.
303 	 */
304 	if (!vie->rex_present) {
305 		if (vie->reg & 0x4) {
306 			*lhbr = 1;
307 			*reg = gpr_map[vie->reg & 0x3];
308 		}
309 	}
310 }
311 
312 static int
313 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
314 {
315 	uint64_t val;
316 	int error, lhbr;
317 	enum vm_reg_name reg;
318 
319 	vie_calc_bytereg(vie, &reg, &lhbr);
320 	error = vm_get_register(vm, vcpuid, reg, &val);
321 
322 	/*
323 	 * To obtain the value of a legacy high byte register shift the
324 	 * base register right by 8 bits (%ah = %rax >> 8).
325 	 */
326 	if (lhbr)
327 		*rval = val >> 8;
328 	else
329 		*rval = val;
330 	return (error);
331 }
332 
333 static int
334 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
335 {
336 	uint64_t origval, val, mask;
337 	int error, lhbr;
338 	enum vm_reg_name reg;
339 
340 	vie_calc_bytereg(vie, &reg, &lhbr);
341 	error = vm_get_register(vm, vcpuid, reg, &origval);
342 	if (error == 0) {
343 		val = byte;
344 		mask = 0xff;
345 		if (lhbr) {
346 			/*
347 			 * Shift left by 8 to store 'byte' in a legacy high
348 			 * byte register.
349 			 */
350 			val <<= 8;
351 			mask <<= 8;
352 		}
353 		val |= origval & ~mask;
354 		error = vm_set_register(vm, vcpuid, reg, val);
355 	}
356 	return (error);
357 }
358 
359 int
360 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
361 		    uint64_t val, int size)
362 {
363 	int error;
364 	uint64_t origval;
365 
366 	switch (size) {
367 	case 1:
368 	case 2:
369 		error = vie_read_register(vm, vcpuid, reg, &origval);
370 		if (error)
371 			return (error);
372 		val &= size2mask[size];
373 		val |= origval & ~size2mask[size];
374 		break;
375 	case 4:
376 		val &= 0xffffffffUL;
377 		break;
378 	case 8:
379 		break;
380 	default:
381 		return (EINVAL);
382 	}
383 
384 	error = vm_set_register(vm, vcpuid, reg, val);
385 	return (error);
386 }
387 
/*
 * The RFLAGS bits that the emulation routines in this file clear before
 * merging in freshly computed status flags.
 */
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
389 
/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) defines getcc<sz>(), which takes two sz-bit unsigned operands,
 * executes "sub" with 'y' as the memory source and 'x' as the register
 * destination, and returns the RFLAGS image captured right afterwards with
 * pushfq/popq.  The whole register image is returned; callers mask out the
 * bits they want.
 *
 * The trailing "struct __hack" absorbs the ';' that follows each GETCC(n)
 * instantiation below.
 *
 * NOTE(review): pushfq stores below the current stack pointer without the
 * compiler's knowledge -- confirm this is safe for every build (e.g. with
 * respect to a stack red zone in the !_KERNEL case).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

/* Instantiate getcc8(), getcc16(), getcc32() and getcc64(). */
GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
408 
409 static u_long
410 getcc(int opsize, uint64_t x, uint64_t y)
411 {
412 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
413 	    ("getcc: invalid operand size %d", opsize));
414 
415 	if (opsize == 1)
416 		return (getcc8(x, y));
417 	else if (opsize == 2)
418 		return (getcc16(x, y));
419 	else if (opsize == 4)
420 		return (getcc32(x, y));
421 	else
422 		return (getcc64(x, y));
423 }
424 
/*
 * Return the status flags that would result from doing (x + y).
 *
 * GETADDFLAGS(sz) defines getaddflags<sz>(), which takes two sz-bit unsigned
 * operands, executes "add" with 'y' as the memory source and 'x' as the
 * register destination, and returns the RFLAGS image captured right
 * afterwards with pushfq/popq.
 *
 * The trailing "struct __hack" absorbs the ';' that follows each
 * GETADDFLAGS(n) instantiation below.
 */
#define	GETADDFLAGS(sz)							\
static u_long								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

/* Instantiate getaddflags8(), getaddflags16(), getaddflags32(), getaddflags64(). */
GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);
443 
444 static u_long
445 getaddflags(int opsize, uint64_t x, uint64_t y)
446 {
447 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
448 	    ("getaddflags: invalid operand size %d", opsize));
449 
450 	if (opsize == 1)
451 		return (getaddflags8(x, y));
452 	else if (opsize == 2)
453 		return (getaddflags16(x, y));
454 	else if (opsize == 4)
455 		return (getaddflags32(x, y));
456 	else
457 		return (getaddflags64(x, y));
458 }
459 
/*
 * Return the status flags that would result from doing (x & y).
 *
 * GETANDFLAGS(sz) defines getandflags<sz>(), which takes two sz-bit unsigned
 * operands, executes "and" with 'y' as the memory source and 'x' as the
 * register destination, and returns the RFLAGS image captured right
 * afterwards with pushfq/popq.
 *
 * The trailing "struct __hack" absorbs the ';' that follows each
 * GETANDFLAGS(n) instantiation below.
 */
#define	GETANDFLAGS(sz)							\
static u_long								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

/* Instantiate getandflags8(), getandflags16(), getandflags32(), getandflags64(). */
GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);
478 
479 static u_long
480 getandflags(int opsize, uint64_t x, uint64_t y)
481 {
482 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
483 	    ("getandflags: invalid operand size %d", opsize));
484 
485 	if (opsize == 1)
486 		return (getandflags8(x, y));
487 	else if (opsize == 2)
488 		return (getandflags16(x, y));
489 	else if (opsize == 4)
490 		return (getandflags32(x, y));
491 	else
492 		return (getandflags64(x, y));
493 }
494 
495 static int
496 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
497 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
498 {
499 	int error, size;
500 	enum vm_reg_name reg;
501 	uint8_t byte;
502 	uint64_t val;
503 
504 	size = vie->opsize;
505 	error = EINVAL;
506 
507 	switch (vie->op.op_byte) {
508 	case 0x88:
509 		/*
510 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
511 		 * 88/r:	mov r/m8, r8
512 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
513 		 */
514 		size = 1;	/* override for byte operation */
515 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
516 		if (error == 0)
517 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
518 		break;
519 	case 0x89:
520 		/*
521 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
522 		 * 89/r:	mov r/m16, r16
523 		 * 89/r:	mov r/m32, r32
524 		 * REX.W + 89/r	mov r/m64, r64
525 		 */
526 		reg = gpr_map[vie->reg];
527 		error = vie_read_register(vm, vcpuid, reg, &val);
528 		if (error == 0) {
529 			val &= size2mask[size];
530 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
531 		}
532 		break;
533 	case 0x8A:
534 		/*
535 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
536 		 * 8A/r:	mov r8, r/m8
537 		 * REX + 8A/r:	mov r8, r/m8
538 		 */
539 		size = 1;	/* override for byte operation */
540 		error = memread(vm, vcpuid, gpa, &val, size, arg);
541 		if (error == 0)
542 			error = vie_write_bytereg(vm, vcpuid, vie, val);
543 		break;
544 	case 0x8B:
545 		/*
546 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
547 		 * 8B/r:	mov r16, r/m16
548 		 * 8B/r:	mov r32, r/m32
549 		 * REX.W 8B/r:	mov r64, r/m64
550 		 */
551 		error = memread(vm, vcpuid, gpa, &val, size, arg);
552 		if (error == 0) {
553 			reg = gpr_map[vie->reg];
554 			error = vie_update_register(vm, vcpuid, reg, val, size);
555 		}
556 		break;
557 	case 0xA1:
558 		/*
559 		 * MOV from seg:moffset to AX/EAX/RAX
560 		 * A1:		mov AX, moffs16
561 		 * A1:		mov EAX, moffs32
562 		 * REX.W + A1:	mov RAX, moffs64
563 		 */
564 		error = memread(vm, vcpuid, gpa, &val, size, arg);
565 		if (error == 0) {
566 			reg = VM_REG_GUEST_RAX;
567 			error = vie_update_register(vm, vcpuid, reg, val, size);
568 		}
569 		break;
570 	case 0xA3:
571 		/*
572 		 * MOV from AX/EAX/RAX to seg:moffset
573 		 * A3:		mov moffs16, AX
574 		 * A3:		mov moffs32, EAX
575 		 * REX.W + A3:	mov moffs64, RAX
576 		 */
577 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
578 		if (error == 0) {
579 			val &= size2mask[size];
580 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
581 		}
582 		break;
583 	case 0xC6:
584 		/*
585 		 * MOV from imm8 to mem (ModRM:r/m)
586 		 * C6/0		mov r/m8, imm8
587 		 * REX + C6/0	mov r/m8, imm8
588 		 */
589 		size = 1;	/* override for byte operation */
590 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
591 		break;
592 	case 0xC7:
593 		/*
594 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
595 		 * C7/0		mov r/m16, imm16
596 		 * C7/0		mov r/m32, imm32
597 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
598 		 */
599 		val = vie->immediate & size2mask[size];
600 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
601 		break;
602 	default:
603 		break;
604 	}
605 
606 	return (error);
607 }
608 
609 static int
610 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
611 	     mem_region_read_t memread, mem_region_write_t memwrite,
612 	     void *arg)
613 {
614 	int error, size;
615 	enum vm_reg_name reg;
616 	uint64_t val;
617 
618 	size = vie->opsize;
619 	error = EINVAL;
620 
621 	switch (vie->op.op_byte) {
622 	case 0xB6:
623 		/*
624 		 * MOV and zero extend byte from mem (ModRM:r/m) to
625 		 * reg (ModRM:reg).
626 		 *
627 		 * 0F B6/r		movzx r16, r/m8
628 		 * 0F B6/r		movzx r32, r/m8
629 		 * REX.W + 0F B6/r	movzx r64, r/m8
630 		 */
631 
632 		/* get the first operand */
633 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
634 		if (error)
635 			break;
636 
637 		/* get the second operand */
638 		reg = gpr_map[vie->reg];
639 
640 		/* zero-extend byte */
641 		val = (uint8_t)val;
642 
643 		/* write the result */
644 		error = vie_update_register(vm, vcpuid, reg, val, size);
645 		break;
646 	case 0xB7:
647 		/*
648 		 * MOV and zero extend word from mem (ModRM:r/m) to
649 		 * reg (ModRM:reg).
650 		 *
651 		 * 0F B7/r		movzx r32, r/m16
652 		 * REX.W + 0F B7/r	movzx r64, r/m16
653 		 */
654 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
655 		if (error)
656 			return (error);
657 
658 		reg = gpr_map[vie->reg];
659 
660 		/* zero-extend word */
661 		val = (uint16_t)val;
662 
663 		error = vie_update_register(vm, vcpuid, reg, val, size);
664 		break;
665 	case 0xBE:
666 		/*
667 		 * MOV and sign extend byte from mem (ModRM:r/m) to
668 		 * reg (ModRM:reg).
669 		 *
670 		 * 0F BE/r		movsx r16, r/m8
671 		 * 0F BE/r		movsx r32, r/m8
672 		 * REX.W + 0F BE/r	movsx r64, r/m8
673 		 */
674 
675 		/* get the first operand */
676 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
677 		if (error)
678 			break;
679 
680 		/* get the second operand */
681 		reg = gpr_map[vie->reg];
682 
683 		/* sign extend byte */
684 		val = (int8_t)val;
685 
686 		/* write the result */
687 		error = vie_update_register(vm, vcpuid, reg, val, size);
688 		break;
689 	default:
690 		break;
691 	}
692 	return (error);
693 }
694 
695 /*
696  * Helper function to calculate and validate a linear address.
697  */
698 static int
699 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
700     int opsize, int addrsize, int prot, enum vm_reg_name seg,
701     enum vm_reg_name gpr, uint64_t *gla, int *fault)
702 {
703 	struct seg_desc desc;
704 	uint64_t cr0, val, rflags;
705 	int error;
706 
707 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
708 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
709 
710 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
711 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
712 
713 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
714 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
715 	    __func__, error, seg));
716 
717 	error = vie_read_register(vm, vcpuid, gpr, &val);
718 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
719 	    error, gpr));
720 
721 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
722 	    addrsize, prot, gla)) {
723 		if (seg == VM_REG_GUEST_SS)
724 			vm_inject_ss(vm, vcpuid, 0);
725 		else
726 			vm_inject_gp(vm, vcpuid);
727 		goto guest_fault;
728 	}
729 
730 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
731 		if (seg == VM_REG_GUEST_SS)
732 			vm_inject_ss(vm, vcpuid, 0);
733 		else
734 			vm_inject_gp(vm, vcpuid);
735 		goto guest_fault;
736 	}
737 
738 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
739 		vm_inject_ac(vm, vcpuid, 0);
740 		goto guest_fault;
741 	}
742 
743 	*fault = 0;
744 	return (0);
745 
746 guest_fault:
747 	*fault = 1;
748 	return (0);
749 }
750 
751 static int
752 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
753     struct vm_guest_paging *paging, mem_region_read_t memread,
754     mem_region_write_t memwrite, void *arg)
755 {
756 #ifdef _KERNEL
757 	struct vm_copyinfo copyinfo[2];
758 #else
759 	struct iovec copyinfo[2];
760 #endif
761 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
762 	uint64_t rcx, rdi, rsi, rflags;
763 	int error, fault, opsize, seg, repeat;
764 
765 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
766 	val = 0;
767 	error = 0;
768 
769 	/*
770 	 * XXX although the MOVS instruction is only supposed to be used with
771 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
772 	 *
773 	 * Empirically the "repnz" prefix has identical behavior to "rep"
774 	 * and the zero flag does not make a difference.
775 	 */
776 	repeat = vie->repz_present | vie->repnz_present;
777 
778 	if (repeat) {
779 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
780 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
781 
782 		/*
783 		 * The count register is %rcx, %ecx or %cx depending on the
784 		 * address size of the instruction.
785 		 */
786 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
787 			error = 0;
788 			goto done;
789 		}
790 	}
791 
792 	/*
793 	 *	Source		Destination	Comments
794 	 *	--------------------------------------------
795 	 * (1)  memory		memory		n/a
796 	 * (2)  memory		mmio		emulated
797 	 * (3)  mmio		memory		emulated
798 	 * (4)  mmio		mmio		emulated
799 	 *
800 	 * At this point we don't have sufficient information to distinguish
801 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
802 	 * out because it will succeed only when operating on regular memory.
803 	 *
804 	 * XXX the emulation doesn't properly handle the case where 'gpa'
805 	 * is straddling the boundary between the normal memory and MMIO.
806 	 */
807 
808 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
809 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
810 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
811 	if (error || fault)
812 		goto done;
813 
814 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
815 	    copyinfo, nitems(copyinfo), &fault);
816 	if (error == 0) {
817 		if (fault)
818 			goto done;	/* Resume guest to handle fault */
819 
820 		/*
821 		 * case (2): read from system memory and write to mmio.
822 		 */
823 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
824 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
825 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
826 		if (error)
827 			goto done;
828 	} else {
829 		/*
830 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
831 		 * if 'srcaddr' is in the mmio space.
832 		 */
833 
834 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
835 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
836 		    &fault);
837 		if (error || fault)
838 			goto done;
839 
840 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
841 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
842 		if (error == 0) {
843 			if (fault)
844 				goto done;    /* Resume guest to handle fault */
845 
846 			/*
847 			 * case (3): read from MMIO and write to system memory.
848 			 *
849 			 * A MMIO read can have side-effects so we
850 			 * commit to it only after vm_copy_setup() is
851 			 * successful. If a page-fault needs to be
852 			 * injected into the guest then it will happen
853 			 * before the MMIO read is attempted.
854 			 */
855 			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
856 			if (error)
857 				goto done;
858 
859 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
860 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
861 		} else {
862 			/*
863 			 * Case (4): read from and write to mmio.
864 			 *
865 			 * Commit to the MMIO read/write (with potential
866 			 * side-effects) only after we are sure that the
867 			 * instruction is not going to be restarted due
868 			 * to address translation faults.
869 			 */
870 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
871 			    PROT_READ, &srcgpa, &fault);
872 			if (error || fault)
873 				goto done;
874 
875 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
876 			   PROT_WRITE, &dstgpa, &fault);
877 			if (error || fault)
878 				goto done;
879 
880 			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
881 			if (error)
882 				goto done;
883 
884 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
885 			if (error)
886 				goto done;
887 		}
888 	}
889 
890 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
891 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
892 
893 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
894 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
895 
896 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
897 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
898 
899 	if (rflags & PSL_D) {
900 		rsi -= opsize;
901 		rdi -= opsize;
902 	} else {
903 		rsi += opsize;
904 		rdi += opsize;
905 	}
906 
907 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
908 	    vie->addrsize);
909 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
910 
911 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
912 	    vie->addrsize);
913 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
914 
915 	if (repeat) {
916 		rcx = rcx - 1;
917 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
918 		    rcx, vie->addrsize);
919 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
920 
921 		/*
922 		 * Repeat the instruction if the count register is not zero.
923 		 */
924 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
925 			vm_restart_instruction(vm, vcpuid);
926 	}
927 done:
928 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
929 	    __func__, error));
930 	return (error);
931 }
932 
933 static int
934 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
935     struct vm_guest_paging *paging, mem_region_read_t memread,
936     mem_region_write_t memwrite, void *arg)
937 {
938 	int error, opsize, repeat;
939 	uint64_t val;
940 	uint64_t rcx, rdi, rflags;
941 
942 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
943 	repeat = vie->repz_present | vie->repnz_present;
944 
945 	if (repeat) {
946 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
947 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
948 
949 		/*
950 		 * The count register is %rcx, %ecx or %cx depending on the
951 		 * address size of the instruction.
952 		 */
953 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
954 			return (0);
955 	}
956 
957 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
958 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
959 
960 	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
961 	if (error)
962 		return (error);
963 
964 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
965 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
966 
967 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
968 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
969 
970 	if (rflags & PSL_D)
971 		rdi -= opsize;
972 	else
973 		rdi += opsize;
974 
975 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
976 	    vie->addrsize);
977 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
978 
979 	if (repeat) {
980 		rcx = rcx - 1;
981 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
982 		    rcx, vie->addrsize);
983 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
984 
985 		/*
986 		 * Repeat the instruction if the count register is not zero.
987 		 */
988 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
989 			vm_restart_instruction(vm, vcpuid);
990 	}
991 
992 	return (0);
993 }
994 
995 static int
996 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
997 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
998 {
999 	int error, size;
1000 	enum vm_reg_name reg;
1001 	uint64_t result, rflags, rflags2, val1, val2;
1002 
1003 	size = vie->opsize;
1004 	error = EINVAL;
1005 
1006 	switch (vie->op.op_byte) {
1007 	case 0x23:
1008 		/*
1009 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1010 		 * result in reg.
1011 		 *
1012 		 * 23/r		and r16, r/m16
1013 		 * 23/r		and r32, r/m32
1014 		 * REX.W + 23/r	and r64, r/m64
1015 		 */
1016 
1017 		/* get the first operand */
1018 		reg = gpr_map[vie->reg];
1019 		error = vie_read_register(vm, vcpuid, reg, &val1);
1020 		if (error)
1021 			break;
1022 
1023 		/* get the second operand */
1024 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1025 		if (error)
1026 			break;
1027 
1028 		/* perform the operation and write the result */
1029 		result = val1 & val2;
1030 		error = vie_update_register(vm, vcpuid, reg, result, size);
1031 		break;
1032 	case 0x81:
1033 	case 0x83:
1034 		/*
1035 		 * AND mem (ModRM:r/m) with immediate and store the
1036 		 * result in mem.
1037 		 *
1038 		 * 81 /4		and r/m16, imm16
1039 		 * 81 /4		and r/m32, imm32
1040 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1041 		 *
1042 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1043 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1044 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1045 		 */
1046 
1047 		/* get the first operand */
1048                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1049                 if (error)
1050 			break;
1051 
1052                 /*
1053 		 * perform the operation with the pre-fetched immediate
1054 		 * operand and write the result
1055 		 */
1056                 result = val1 & vie->immediate;
1057                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1058 		break;
1059 	default:
1060 		break;
1061 	}
1062 	if (error)
1063 		return (error);
1064 
1065 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1066 	if (error)
1067 		return (error);
1068 
1069 	/*
1070 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1071 	 * to the result; AF is undefined.
1072 	 *
1073 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1074 	 */
1075 	rflags2 = getcc(size, result, 0);
1076 	rflags &= ~RFLAGS_STATUS_BITS;
1077 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1078 
1079 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1080 	return (error);
1081 }
1082 
1083 static int
1084 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1085 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1086 {
1087 	int error, size;
1088 	enum vm_reg_name reg;
1089 	uint64_t result, rflags, rflags2, val1, val2;
1090 
1091 	size = vie->opsize;
1092 	error = EINVAL;
1093 
1094 	switch (vie->op.op_byte) {
1095 	case 0x0B:
1096 		/*
1097 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1098 		 * result in reg.
1099 		 *
1100 		 * 0b/r         or r16, r/m16
1101 		 * 0b/r         or r32, r/m32
1102 		 * REX.W + 0b/r or r64, r/m64
1103 		 */
1104 
1105 		/* get the first operand */
1106 		reg = gpr_map[vie->reg];
1107 		error = vie_read_register(vm, vcpuid, reg, &val1);
1108 		if (error)
1109 			break;
1110 
1111 		/* get the second operand */
1112 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1113 		if (error)
1114 			break;
1115 
1116 		/* perform the operation and write the result */
1117 		result = val1 | val2;
1118 		error = vie_update_register(vm, vcpuid, reg, result, size);
1119 		break;
1120 	case 0x81:
1121 	case 0x83:
1122 		/*
1123 		 * OR mem (ModRM:r/m) with immediate and store the
1124 		 * result in mem.
1125 		 *
1126 		 * 81 /1		or r/m16, imm16
1127 		 * 81 /1		or r/m32, imm32
1128 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1129 		 *
1130 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1131 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1132 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1133 		 */
1134 
1135 		/* get the first operand */
1136                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1137                 if (error)
1138 			break;
1139 
1140                 /*
1141 		 * perform the operation with the pre-fetched immediate
1142 		 * operand and write the result
1143 		 */
1144                 result = val1 | vie->immediate;
1145                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1146 		break;
1147 	default:
1148 		break;
1149 	}
1150 	if (error)
1151 		return (error);
1152 
1153 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1154 	if (error)
1155 		return (error);
1156 
1157 	/*
1158 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1159 	 * to the result; AF is undefined.
1160 	 *
1161 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1162 	 */
1163 	rflags2 = getcc(size, result, 0);
1164 	rflags &= ~RFLAGS_STATUS_BITS;
1165 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1166 
1167 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1168 	return (error);
1169 }
1170 
1171 static int
1172 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1173 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1174 {
1175 	int error, size;
1176 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1177 	enum vm_reg_name reg;
1178 
1179 	size = vie->opsize;
1180 	switch (vie->op.op_byte) {
1181 	case 0x39:
1182 	case 0x3B:
1183 		/*
1184 		 * 39/r		CMP r/m16, r16
1185 		 * 39/r		CMP r/m32, r32
1186 		 * REX.W 39/r	CMP r/m64, r64
1187 		 *
1188 		 * 3B/r		CMP r16, r/m16
1189 		 * 3B/r		CMP r32, r/m32
1190 		 * REX.W + 3B/r	CMP r64, r/m64
1191 		 *
1192 		 * Compare the first operand with the second operand and
1193 		 * set status flags in EFLAGS register. The comparison is
1194 		 * performed by subtracting the second operand from the first
1195 		 * operand and then setting the status flags.
1196 		 */
1197 
1198 		/* Get the register operand */
1199 		reg = gpr_map[vie->reg];
1200 		error = vie_read_register(vm, vcpuid, reg, &regop);
1201 		if (error)
1202 			return (error);
1203 
1204 		/* Get the memory operand */
1205 		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1206 		if (error)
1207 			return (error);
1208 
1209 		if (vie->op.op_byte == 0x3B) {
1210 			op1 = regop;
1211 			op2 = memop;
1212 		} else {
1213 			op1 = memop;
1214 			op2 = regop;
1215 		}
1216 		rflags2 = getcc(size, op1, op2);
1217 		break;
1218 	case 0x80:
1219 	case 0x81:
1220 	case 0x83:
1221 		/*
1222 		 * 80 /7		cmp r/m8, imm8
1223 		 * REX + 80 /7		cmp r/m8, imm8
1224 		 *
1225 		 * 81 /7		cmp r/m16, imm16
1226 		 * 81 /7		cmp r/m32, imm32
1227 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1228 		 *
1229 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1230 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1231 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1232 		 *
1233 		 * Compare mem (ModRM:r/m) with immediate and set
1234 		 * status flags according to the results.  The
1235 		 * comparison is performed by subtracting the
1236 		 * immediate from the first operand and then setting
1237 		 * the status flags.
1238 		 *
1239 		 */
1240 		if (vie->op.op_byte == 0x80)
1241 			size = 1;
1242 
1243 		/* get the first operand */
1244                 error = memread(vm, vcpuid, gpa, &op1, size, arg);
1245 		if (error)
1246 			return (error);
1247 
1248 		rflags2 = getcc(size, op1, vie->immediate);
1249 		break;
1250 	default:
1251 		return (EINVAL);
1252 	}
1253 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1254 	if (error)
1255 		return (error);
1256 	rflags &= ~RFLAGS_STATUS_BITS;
1257 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1258 
1259 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1260 	return (error);
1261 }
1262 
1263 static int
1264 emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1265     mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1266 {
1267 	int error, size;
1268 	uint64_t op1, rflags, rflags2;
1269 
1270 	size = vie->opsize;
1271 	error = EINVAL;
1272 
1273 	switch (vie->op.op_byte) {
1274 	case 0xF7:
1275 		/*
1276 		 * F7 /0		test r/m16, imm16
1277 		 * F7 /0		test r/m32, imm32
1278 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1279 		 *
1280 		 * Test mem (ModRM:r/m) with immediate and set status
1281 		 * flags according to the results.  The comparison is
1282 		 * performed by anding the immediate from the first
1283 		 * operand and then setting the status flags.
1284 		 */
1285 		if ((vie->reg & 7) != 0)
1286 			return (EINVAL);
1287 
1288 		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1289 		if (error)
1290 			return (error);
1291 
1292 		rflags2 = getandflags(size, op1, vie->immediate);
1293 		break;
1294 	default:
1295 		return (EINVAL);
1296 	}
1297 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1298 	if (error)
1299 		return (error);
1300 
1301 	/*
1302 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1303 	 * to the result; AF is undefined.
1304 	 */
1305 	rflags &= ~RFLAGS_STATUS_BITS;
1306 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1307 
1308 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1309 	return (error);
1310 }
1311 
1312 static int
1313 emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1314 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1315 {
1316 	int error, size;
1317 	uint64_t nval, rflags, rflags2, val1, val2;
1318 	enum vm_reg_name reg;
1319 
1320 	size = vie->opsize;
1321 	error = EINVAL;
1322 
1323 	switch (vie->op.op_byte) {
1324 	case 0x03:
1325 		/*
1326 		 * ADD r/m to r and store the result in r
1327 		 *
1328 		 * 03/r            ADD r16, r/m16
1329 		 * 03/r            ADD r32, r/m32
1330 		 * REX.W + 03/r    ADD r64, r/m64
1331 		 */
1332 
1333 		/* get the first operand */
1334 		reg = gpr_map[vie->reg];
1335 		error = vie_read_register(vm, vcpuid, reg, &val1);
1336 		if (error)
1337 			break;
1338 
1339 		/* get the second operand */
1340 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1341 		if (error)
1342 			break;
1343 
1344 		/* perform the operation and write the result */
1345 		nval = val1 + val2;
1346 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1347 		break;
1348 	default:
1349 		break;
1350 	}
1351 
1352 	if (!error) {
1353 		rflags2 = getaddflags(size, val1, val2);
1354 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1355 		    &rflags);
1356 		if (error)
1357 			return (error);
1358 
1359 		rflags &= ~RFLAGS_STATUS_BITS;
1360 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1361 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1362 		    rflags, 8);
1363 	}
1364 
1365 	return (error);
1366 }
1367 
1368 static int
1369 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1370 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1371 {
1372 	int error, size;
1373 	uint64_t nval, rflags, rflags2, val1, val2;
1374 	enum vm_reg_name reg;
1375 
1376 	size = vie->opsize;
1377 	error = EINVAL;
1378 
1379 	switch (vie->op.op_byte) {
1380 	case 0x2B:
1381 		/*
1382 		 * SUB r/m from r and store the result in r
1383 		 *
1384 		 * 2B/r            SUB r16, r/m16
1385 		 * 2B/r            SUB r32, r/m32
1386 		 * REX.W + 2B/r    SUB r64, r/m64
1387 		 */
1388 
1389 		/* get the first operand */
1390 		reg = gpr_map[vie->reg];
1391 		error = vie_read_register(vm, vcpuid, reg, &val1);
1392 		if (error)
1393 			break;
1394 
1395 		/* get the second operand */
1396 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1397 		if (error)
1398 			break;
1399 
1400 		/* perform the operation and write the result */
1401 		nval = val1 - val2;
1402 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1403 		break;
1404 	default:
1405 		break;
1406 	}
1407 
1408 	if (!error) {
1409 		rflags2 = getcc(size, val1, val2);
1410 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1411 		    &rflags);
1412 		if (error)
1413 			return (error);
1414 
1415 		rflags &= ~RFLAGS_STATUS_BITS;
1416 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1417 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1418 		    rflags, 8);
1419 	}
1420 
1421 	return (error);
1422 }
1423 
1424 static int
1425 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1426     struct vm_guest_paging *paging, mem_region_read_t memread,
1427     mem_region_write_t memwrite, void *arg)
1428 {
1429 #ifdef _KERNEL
1430 	struct vm_copyinfo copyinfo[2];
1431 #else
1432 	struct iovec copyinfo[2];
1433 #endif
1434 	struct seg_desc ss_desc;
1435 	uint64_t cr0, rflags, rsp, stack_gla, val;
1436 	int error, fault, size, stackaddrsize, pushop;
1437 
1438 	val = 0;
1439 	size = vie->opsize;
1440 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1441 
1442 	/*
1443 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1444 	 */
1445 	if (paging->cpu_mode == CPU_MODE_REAL) {
1446 		stackaddrsize = 2;
1447 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1448 		/*
1449 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1450 		 * - Stack pointer size is always 64-bits.
1451 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1452 		 * - 16-bit PUSH/POP is supported by using the operand size
1453 		 *   override prefix (66H).
1454 		 */
1455 		stackaddrsize = 8;
1456 		size = vie->opsize_override ? 2 : 8;
1457 	} else {
1458 		/*
1459 		 * In protected or compatibility mode the 'B' flag in the
1460 		 * stack-segment descriptor determines the size of the
1461 		 * stack pointer.
1462 		 */
1463 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1464 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1465 		    __func__, error));
1466 		if (SEG_DESC_DEF32(ss_desc.access))
1467 			stackaddrsize = 4;
1468 		else
1469 			stackaddrsize = 2;
1470 	}
1471 
1472 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1473 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1474 
1475 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1476 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1477 
1478 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1479 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1480 	if (pushop) {
1481 		rsp -= size;
1482 	}
1483 
1484 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1485 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1486 	    &stack_gla)) {
1487 		vm_inject_ss(vm, vcpuid, 0);
1488 		return (0);
1489 	}
1490 
1491 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1492 		vm_inject_ss(vm, vcpuid, 0);
1493 		return (0);
1494 	}
1495 
1496 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1497 		vm_inject_ac(vm, vcpuid, 0);
1498 		return (0);
1499 	}
1500 
1501 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1502 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1503 	    &fault);
1504 	if (error || fault)
1505 		return (error);
1506 
1507 	if (pushop) {
1508 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1509 		if (error == 0)
1510 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1511 	} else {
1512 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1513 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1514 		rsp += size;
1515 	}
1516 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1517 
1518 	if (error == 0) {
1519 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1520 		    stackaddrsize);
1521 		KASSERT(error == 0, ("error %d updating rsp", error));
1522 	}
1523 	return (error);
1524 }
1525 
1526 static int
1527 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1528     struct vm_guest_paging *paging, mem_region_read_t memread,
1529     mem_region_write_t memwrite, void *arg)
1530 {
1531 	int error;
1532 
1533 	/*
1534 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1535 	 *
1536 	 * PUSH is part of the group 5 extended opcodes and is identified
1537 	 * by ModRM:reg = b110.
1538 	 */
1539 	if ((vie->reg & 7) != 6)
1540 		return (EINVAL);
1541 
1542 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1543 	    memwrite, arg);
1544 	return (error);
1545 }
1546 
1547 static int
1548 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1549     struct vm_guest_paging *paging, mem_region_read_t memread,
1550     mem_region_write_t memwrite, void *arg)
1551 {
1552 	int error;
1553 
1554 	/*
1555 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1556 	 *
1557 	 * POP is part of the group 1A extended opcodes and is identified
1558 	 * by ModRM:reg = b000.
1559 	 */
1560 	if ((vie->reg & 7) != 0)
1561 		return (EINVAL);
1562 
1563 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1564 	    memwrite, arg);
1565 	return (error);
1566 }
1567 
1568 static int
1569 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1570     struct vm_guest_paging *paging, mem_region_read_t memread,
1571     mem_region_write_t memwrite, void *memarg)
1572 {
1573 	int error;
1574 
1575 	switch (vie->reg & 7) {
1576 	case 0x1:	/* OR */
1577 		error = emulate_or(vm, vcpuid, gpa, vie,
1578 		    memread, memwrite, memarg);
1579 		break;
1580 	case 0x4:	/* AND */
1581 		error = emulate_and(vm, vcpuid, gpa, vie,
1582 		    memread, memwrite, memarg);
1583 		break;
1584 	case 0x7:	/* CMP */
1585 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1586 		    memread, memwrite, memarg);
1587 		break;
1588 	default:
1589 		error = EINVAL;
1590 		break;
1591 	}
1592 
1593 	return (error);
1594 }
1595 
1596 static int
1597 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1598     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1599 {
1600 	uint64_t val, rflags;
1601 	int error, bitmask, bitoff;
1602 
1603 	/*
1604 	 * 0F BA is a Group 8 extended opcode.
1605 	 *
1606 	 * Currently we only emulate the 'Bit Test' instruction which is
1607 	 * identified by a ModR/M:reg encoding of 100b.
1608 	 */
1609 	if ((vie->reg & 7) != 4)
1610 		return (EINVAL);
1611 
1612 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1613 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1614 
1615 	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1616 	if (error)
1617 		return (error);
1618 
1619 	/*
1620 	 * Intel SDM, Vol 2, Table 3-2:
1621 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1622 	 */
1623 	bitmask = vie->opsize * 8 - 1;
1624 	bitoff = vie->immediate & bitmask;
1625 
1626 	/* Copy the bit into the Carry flag in %rflags */
1627 	if (val & (1UL << bitoff))
1628 		rflags |= PSL_C;
1629 	else
1630 		rflags &= ~PSL_C;
1631 
1632 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1633 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1634 
1635 	return (0);
1636 }
1637 
1638 static int
1639 emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1640     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1641 {
1642 	int error;
1643 	uint64_t buf;
1644 
1645 	switch (vie->reg & 7) {
1646 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1647 		if (vie->mod == 0x3) {
1648 			/*
1649 			 * SFENCE.  Ignore it, VM exit provides enough
1650 			 * barriers on its own.
1651 			 */
1652 			error = 0;
1653 		} else {
1654 			/*
1655 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1656 			 * rights.
1657 			 */
1658 			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
1659 		}
1660 		break;
1661 	default:
1662 		error = EINVAL;
1663 		break;
1664 	}
1665 
1666 	return (error);
1667 }
1668 
1669 int
1670 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1671     struct vm_guest_paging *paging, mem_region_read_t memread,
1672     mem_region_write_t memwrite, void *memarg)
1673 {
1674 	int error;
1675 
1676 	if (!vie->decoded)
1677 		return (EINVAL);
1678 
1679 	switch (vie->op.op_type) {
1680 	case VIE_OP_TYPE_GROUP1:
1681 		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1682 		    memwrite, memarg);
1683 		break;
1684 	case VIE_OP_TYPE_POP:
1685 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1686 		    memwrite, memarg);
1687 		break;
1688 	case VIE_OP_TYPE_PUSH:
1689 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1690 		    memwrite, memarg);
1691 		break;
1692 	case VIE_OP_TYPE_CMP:
1693 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1694 				    memread, memwrite, memarg);
1695 		break;
1696 	case VIE_OP_TYPE_MOV:
1697 		error = emulate_mov(vm, vcpuid, gpa, vie,
1698 				    memread, memwrite, memarg);
1699 		break;
1700 	case VIE_OP_TYPE_MOVSX:
1701 	case VIE_OP_TYPE_MOVZX:
1702 		error = emulate_movx(vm, vcpuid, gpa, vie,
1703 				     memread, memwrite, memarg);
1704 		break;
1705 	case VIE_OP_TYPE_MOVS:
1706 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1707 		    memwrite, memarg);
1708 		break;
1709 	case VIE_OP_TYPE_STOS:
1710 		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1711 		    memwrite, memarg);
1712 		break;
1713 	case VIE_OP_TYPE_AND:
1714 		error = emulate_and(vm, vcpuid, gpa, vie,
1715 				    memread, memwrite, memarg);
1716 		break;
1717 	case VIE_OP_TYPE_OR:
1718 		error = emulate_or(vm, vcpuid, gpa, vie,
1719 				    memread, memwrite, memarg);
1720 		break;
1721 	case VIE_OP_TYPE_SUB:
1722 		error = emulate_sub(vm, vcpuid, gpa, vie,
1723 				    memread, memwrite, memarg);
1724 		break;
1725 	case VIE_OP_TYPE_BITTEST:
1726 		error = emulate_bittest(vm, vcpuid, gpa, vie,
1727 		    memread, memwrite, memarg);
1728 		break;
1729 	case VIE_OP_TYPE_TWOB_GRP15:
1730 		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
1731 		    memread, memwrite, memarg);
1732 		break;
1733 	case VIE_OP_TYPE_ADD:
1734 		error = emulate_add(vm, vcpuid, gpa, vie, memread,
1735 		    memwrite, memarg);
1736 		break;
1737 	case VIE_OP_TYPE_TEST:
1738 		error = emulate_test(vm, vcpuid, gpa, vie,
1739 		    memread, memwrite, memarg);
1740 		break;
1741 	default:
1742 		error = EINVAL;
1743 		break;
1744 	}
1745 
1746 	return (error);
1747 }
1748 
1749 int
1750 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1751 {
1752 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1753 	    ("%s: invalid size %d", __func__, size));
1754 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1755 
1756 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1757 		return (0);
1758 
1759 	return ((gla & (size - 1)) ? 1 : 0);
1760 }
1761 
1762 int
1763 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1764 {
1765 	uint64_t mask;
1766 
1767 	if (cpu_mode != CPU_MODE_64BIT)
1768 		return (0);
1769 
1770 	/*
1771 	 * The value of the bit 47 in the 'gla' should be replicated in the
1772 	 * most significant 16 bits.
1773 	 */
1774 	mask = ~((1UL << 48) - 1);
1775 	if (gla & (1UL << 47))
1776 		return ((gla & mask) != mask);
1777 	else
1778 		return ((gla & mask) != 0);
1779 }
1780 
1781 uint64_t
1782 vie_size2mask(int size)
1783 {
1784 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1785 	    ("vie_size2mask: invalid size %d", size));
1786 	return (size2mask[size]);
1787 }
1788 
1789 int
1790 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1791     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1792     int prot, uint64_t *gla)
1793 {
1794 	uint64_t firstoff, low_limit, high_limit, segbase;
1795 	int glasize, type;
1796 
1797 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1798 	    ("%s: invalid segment %d", __func__, seg));
1799 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1800 	    ("%s: invalid operand size %d", __func__, length));
1801 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1802 	    ("%s: invalid prot %#x", __func__, prot));
1803 
1804 	firstoff = offset;
1805 	if (cpu_mode == CPU_MODE_64BIT) {
1806 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1807 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1808 		glasize = 8;
1809 	} else {
1810 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1811 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1812 		glasize = 4;
1813 		/*
1814 		 * If the segment selector is loaded with a NULL selector
1815 		 * then the descriptor is unusable and attempting to use
1816 		 * it results in a #GP(0).
1817 		 */
1818 		if (SEG_DESC_UNUSABLE(desc->access))
1819 			return (-1);
1820 
1821 		/*
1822 		 * The processor generates a #NP exception when a segment
1823 		 * register is loaded with a selector that points to a
1824 		 * descriptor that is not present. If this was the case then
1825 		 * it would have been checked before the VM-exit.
1826 		 */
1827 		KASSERT(SEG_DESC_PRESENT(desc->access),
1828 		    ("segment %d not present: %#x", seg, desc->access));
1829 
1830 		/*
1831 		 * The descriptor type must indicate a code/data segment.
1832 		 */
1833 		type = SEG_DESC_TYPE(desc->access);
1834 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1835 		    "descriptor type %#x", seg, type));
1836 
1837 		if (prot & PROT_READ) {
1838 			/* #GP on a read access to a exec-only code segment */
1839 			if ((type & 0xA) == 0x8)
1840 				return (-1);
1841 		}
1842 
1843 		if (prot & PROT_WRITE) {
1844 			/*
1845 			 * #GP on a write access to a code segment or a
1846 			 * read-only data segment.
1847 			 */
1848 			if (type & 0x8)			/* code segment */
1849 				return (-1);
1850 
1851 			if ((type & 0xA) == 0)		/* read-only data seg */
1852 				return (-1);
1853 		}
1854 
1855 		/*
1856 		 * 'desc->limit' is fully expanded taking granularity into
1857 		 * account.
1858 		 */
1859 		if ((type & 0xC) == 0x4) {
1860 			/* expand-down data segment */
1861 			low_limit = desc->limit + 1;
1862 			high_limit = SEG_DESC_DEF32(desc->access) ?
1863 			    0xffffffff : 0xffff;
1864 		} else {
1865 			/* code segment or expand-up data segment */
1866 			low_limit = 0;
1867 			high_limit = desc->limit;
1868 		}
1869 
1870 		while (length > 0) {
1871 			offset &= vie_size2mask(addrsize);
1872 			if (offset < low_limit || offset > high_limit)
1873 				return (-1);
1874 			offset++;
1875 			length--;
1876 		}
1877 	}
1878 
1879 	/*
1880 	 * In 64-bit mode all segments except %fs and %gs have a segment
1881 	 * base address of 0.
1882 	 */
1883 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1884 	    seg != VM_REG_GUEST_GS) {
1885 		segbase = 0;
1886 	} else {
1887 		segbase = desc->base;
1888 	}
1889 
1890 	/*
1891 	 * Truncate 'firstoff' to the effective address size before adding
1892 	 * it to the segment base.
1893 	 */
1894 	firstoff &= vie_size2mask(addrsize);
1895 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1896 	return (0);
1897 }
1898 
1899 #ifdef _KERNEL
1900 void
1901 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1902 {
1903 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1904 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1905 
1906 	bzero(vie, sizeof(struct vie));
1907 
1908 	vie->base_register = VM_REG_LAST;
1909 	vie->index_register = VM_REG_LAST;
1910 	vie->segment_register = VM_REG_LAST;
1911 
1912 	if (inst_length) {
1913 		bcopy(inst_bytes, vie->inst, inst_length);
1914 		vie->num_valid = inst_length;
1915 	}
1916 }
1917 
1918 static int
1919 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1920 {
1921 	int error_code = 0;
1922 
1923 	if (pte & PG_V)
1924 		error_code |= PGEX_P;
1925 	if (prot & VM_PROT_WRITE)
1926 		error_code |= PGEX_W;
1927 	if (usermode)
1928 		error_code |= PGEX_U;
1929 	if (rsvd)
1930 		error_code |= PGEX_RSV;
1931 	if (prot & VM_PROT_EXECUTE)
1932 		error_code |= PGEX_I;
1933 
1934 	return (error_code);
1935 }
1936 
/*
 * Release the page held through '*cookie', if any, and clear the cookie
 * so that a repeated call is a no-op.
 */
static void
ptp_release(void **cookie)
{
	if (*cookie == NULL)
		return;

	vm_gpa_release(*cookie);
	*cookie = NULL;
}
1945 
1946 static void *
1947 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
1948 {
1949 	void *ptr;
1950 
1951 	ptp_release(cookie);
1952 	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
1953 	return (ptr);
1954 }
1955 
/*
 * Translate the guest linear address 'gla' to a guest physical address by
 * walking the guest's page tables rooted at 'paging->cr3'.
 *
 * Outcomes:
 * - 0 with '*guest_fault' == 0: the translation succeeded and the result
 *   is stored in '*gpa'.
 * - 0 with '*guest_fault' == 1: the access would fault in the guest
 *   (non-canonical address, or a page-table entry that is not valid or
 *   does not permit the access).
 * - EFAULT: a page-table page could not be held with ptp_hold().
 *
 * When 'check_only' is false the walk has side effects on the guest: the
 * corresponding exception is injected on a fault and the accessed/dirty
 * bits are set in the page-table entries.  When 'check_only' is true
 * neither of those happens.
 */
static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
	/*
	 * The walk restarts from the root whenever an atomic update of an
	 * accessed/dirty bit fails, i.e. the entry changed underneath us.
	 * Every pass after the first calls maybe_yield().
	 */
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	/* Without paging the linear address is the physical address. */
	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	/* Two-level walk with 32-bit entries and 10 index bits per level. */
	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			/*
			 * Fault if the entry is not valid, or denies user
			 * or write access that this translation requires.
			 */
			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 0,
					    pte32);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			/* A set PS bit above the last level ends the walk. */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		/* Combine the page frame with the offset within the page. */
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	/*
	 * PAE first consults a 4-entry table indexed by bits 31:30 of the
	 * address and then shares the 64-bit entry walk below with two
	 * remaining levels.  Any other paging mode reaching this point
	 * walks four levels.
	 */
	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		/* Only the valid bit is checked at this level. */
		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	/* Walk with 64-bit entries and 9 index bits per level. */
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		/*
		 * Fault if the entry is not valid, or denies user or write
		 * access that this translation requires.
		 */
		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		/*
		 * A set PS bit above the last level ends the walk with a
		 * large page.  Pages larger than 1GB are rejected with a
		 * page fault that has the 'rsvd' argument set.
		 */
		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 1,
					    pte);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	/* Combine the page frame with the offset within the page. */
	*gpa = pte | (gla & (pgsize - 1));
	/* Common exit: always drop the page-table page still being held. */
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
	/* A page-table page could not be held. */
error:
	retval = EFAULT;
	goto done;
	/* The access faults in the guest; 'retval' stays 0. */
fault:
	*guest_fault = 1;
	goto done;
}
2152 
/*
 * Translate 'gla' to a guest physical address via _vm_gla2gpa() with
 * 'check_only' disabled.
 */
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = false;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
2161 
/*
 * Translate 'gla' to a guest physical address via _vm_gla2gpa() with
 * 'check_only' enabled.
 */
int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = true;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
2170 
2171 int
2172 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2173     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2174 {
2175 	struct vm_copyinfo copyinfo[2];
2176 	int error, prot;
2177 
2178 	if (inst_length > VIE_INST_SIZE)
2179 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
2180 
2181 	prot = PROT_READ | PROT_EXEC;
2182 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
2183 	    copyinfo, nitems(copyinfo), faultptr);
2184 	if (error || *faultptr)
2185 		return (error);
2186 
2187 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
2188 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2189 	vie->num_valid = inst_length;
2190 	return (0);
2191 }
2192 
2193 static int
2194 vie_peek(struct vie *vie, uint8_t *x)
2195 {
2196 
2197 	if (vie->num_processed < vie->num_valid) {
2198 		*x = vie->inst[vie->num_processed];
2199 		return (0);
2200 	} else
2201 		return (-1);
2202 }
2203 
2204 static void
2205 vie_advance(struct vie *vie)
2206 {
2207 
2208 	vie->num_processed++;
2209 }
2210 
2211 static bool
2212 segment_override(uint8_t x, int *seg)
2213 {
2214 
2215 	switch (x) {
2216 	case 0x2E:
2217 		*seg = VM_REG_GUEST_CS;
2218 		break;
2219 	case 0x36:
2220 		*seg = VM_REG_GUEST_SS;
2221 		break;
2222 	case 0x3E:
2223 		*seg = VM_REG_GUEST_DS;
2224 		break;
2225 	case 0x26:
2226 		*seg = VM_REG_GUEST_ES;
2227 		break;
2228 	case 0x64:
2229 		*seg = VM_REG_GUEST_FS;
2230 		break;
2231 	case 0x65:
2232 		*seg = VM_REG_GUEST_GS;
2233 		break;
2234 	default:
2235 		return (false);
2236 	}
2237 	return (true);
2238 }
2239 
2240 static int
2241 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2242 {
2243 	uint8_t x;
2244 
2245 	while (1) {
2246 		if (vie_peek(vie, &x))
2247 			return (-1);
2248 
2249 		if (x == 0x66)
2250 			vie->opsize_override = 1;
2251 		else if (x == 0x67)
2252 			vie->addrsize_override = 1;
2253 		else if (x == 0xF3)
2254 			vie->repz_present = 1;
2255 		else if (x == 0xF2)
2256 			vie->repnz_present = 1;
2257 		else if (segment_override(x, &vie->segment_register))
2258 			vie->segment_override = 1;
2259 		else
2260 			break;
2261 
2262 		vie_advance(vie);
2263 	}
2264 
2265 	/*
2266 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2267 	 * - Only one REX prefix is allowed per instruction.
2268 	 * - The REX prefix must immediately precede the opcode byte or the
2269 	 *   escape opcode byte.
2270 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2271 	 *   the mandatory prefix must come before the REX prefix.
2272 	 */
2273 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2274 		vie->rex_present = 1;
2275 		vie->rex_w = x & 0x8 ? 1 : 0;
2276 		vie->rex_r = x & 0x4 ? 1 : 0;
2277 		vie->rex_x = x & 0x2 ? 1 : 0;
2278 		vie->rex_b = x & 0x1 ? 1 : 0;
2279 		vie_advance(vie);
2280 	}
2281 
2282 	/*
2283 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2284 	 */
2285 	if (cpu_mode == CPU_MODE_64BIT) {
2286 		/*
2287 		 * Default address size is 64-bits and default operand size
2288 		 * is 32-bits.
2289 		 */
2290 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2291 		if (vie->rex_w)
2292 			vie->opsize = 8;
2293 		else if (vie->opsize_override)
2294 			vie->opsize = 2;
2295 		else
2296 			vie->opsize = 4;
2297 	} else if (cs_d) {
2298 		/* Default address and operand sizes are 32-bits */
2299 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2300 		vie->opsize = vie->opsize_override ? 2 : 4;
2301 	} else {
2302 		/* Default address and operand sizes are 16-bits */
2303 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2304 		vie->opsize = vie->opsize_override ? 4 : 2;
2305 	}
2306 	return (0);
2307 }
2308 
2309 static int
2310 decode_two_byte_opcode(struct vie *vie)
2311 {
2312 	uint8_t x;
2313 
2314 	if (vie_peek(vie, &x))
2315 		return (-1);
2316 
2317 	vie->op = two_byte_opcodes[x];
2318 
2319 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2320 		return (-1);
2321 
2322 	vie_advance(vie);
2323 	return (0);
2324 }
2325 
2326 static int
2327 decode_opcode(struct vie *vie)
2328 {
2329 	uint8_t x;
2330 
2331 	if (vie_peek(vie, &x))
2332 		return (-1);
2333 
2334 	vie->op = one_byte_opcodes[x];
2335 
2336 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2337 		return (-1);
2338 
2339 	vie_advance(vie);
2340 
2341 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2342 		return (decode_two_byte_opcode(vie));
2343 
2344 	return (0);
2345 }
2346 
2347 static int
2348 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2349 {
2350 	uint8_t x;
2351 
2352 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2353 		return (0);
2354 
2355 	if (cpu_mode == CPU_MODE_REAL)
2356 		return (-1);
2357 
2358 	if (vie_peek(vie, &x))
2359 		return (-1);
2360 
2361 	vie->mod = (x >> 6) & 0x3;
2362 	vie->rm =  (x >> 0) & 0x7;
2363 	vie->reg = (x >> 3) & 0x7;
2364 
2365 	/*
2366 	 * A direct addressing mode makes no sense in the context of an EPT
2367 	 * fault. There has to be a memory access involved to cause the
2368 	 * EPT fault.
2369 	 */
2370 	if (vie->mod == VIE_MOD_DIRECT)
2371 		return (-1);
2372 
2373 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2374 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2375 		/*
2376 		 * Table 2-5: Special Cases of REX Encodings
2377 		 *
2378 		 * mod=0, r/m=5 is used in the compatibility mode to
2379 		 * indicate a disp32 without a base register.
2380 		 *
2381 		 * mod!=3, r/m=4 is used in the compatibility mode to
2382 		 * indicate that the SIB byte is present.
2383 		 *
2384 		 * The 'b' bit in the REX prefix is don't care in
2385 		 * this case.
2386 		 */
2387 	} else {
2388 		vie->rm |= (vie->rex_b << 3);
2389 	}
2390 
2391 	vie->reg |= (vie->rex_r << 3);
2392 
2393 	/* SIB */
2394 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2395 		goto done;
2396 
2397 	vie->base_register = gpr_map[vie->rm];
2398 
2399 	switch (vie->mod) {
2400 	case VIE_MOD_INDIRECT_DISP8:
2401 		vie->disp_bytes = 1;
2402 		break;
2403 	case VIE_MOD_INDIRECT_DISP32:
2404 		vie->disp_bytes = 4;
2405 		break;
2406 	case VIE_MOD_INDIRECT:
2407 		if (vie->rm == VIE_RM_DISP32) {
2408 			vie->disp_bytes = 4;
2409 			/*
2410 			 * Table 2-7. RIP-Relative Addressing
2411 			 *
2412 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2413 			 * whereas in compatibility mode it just implies disp32.
2414 			 */
2415 
2416 			if (cpu_mode == CPU_MODE_64BIT)
2417 				vie->base_register = VM_REG_GUEST_RIP;
2418 			else
2419 				vie->base_register = VM_REG_LAST;
2420 		}
2421 		break;
2422 	}
2423 
2424 done:
2425 	vie_advance(vie);
2426 
2427 	return (0);
2428 }
2429 
2430 static int
2431 decode_sib(struct vie *vie)
2432 {
2433 	uint8_t x;
2434 
2435 	/* Proceed only if SIB byte is present */
2436 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2437 		return (0);
2438 
2439 	if (vie_peek(vie, &x))
2440 		return (-1);
2441 
2442 	/* De-construct the SIB byte */
2443 	vie->ss = (x >> 6) & 0x3;
2444 	vie->index = (x >> 3) & 0x7;
2445 	vie->base = (x >> 0) & 0x7;
2446 
2447 	/* Apply the REX prefix modifiers */
2448 	vie->index |= vie->rex_x << 3;
2449 	vie->base |= vie->rex_b << 3;
2450 
2451 	switch (vie->mod) {
2452 	case VIE_MOD_INDIRECT_DISP8:
2453 		vie->disp_bytes = 1;
2454 		break;
2455 	case VIE_MOD_INDIRECT_DISP32:
2456 		vie->disp_bytes = 4;
2457 		break;
2458 	}
2459 
2460 	if (vie->mod == VIE_MOD_INDIRECT &&
2461 	    (vie->base == 5 || vie->base == 13)) {
2462 		/*
2463 		 * Special case when base register is unused if mod = 0
2464 		 * and base = %rbp or %r13.
2465 		 *
2466 		 * Documented in:
2467 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2468 		 * Table 2-5: Special Cases of REX Encodings
2469 		 */
2470 		vie->disp_bytes = 4;
2471 	} else {
2472 		vie->base_register = gpr_map[vie->base];
2473 	}
2474 
2475 	/*
2476 	 * All encodings of 'index' are valid except for %rsp (4).
2477 	 *
2478 	 * Documented in:
2479 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2480 	 * Table 2-5: Special Cases of REX Encodings
2481 	 */
2482 	if (vie->index != 4)
2483 		vie->index_register = gpr_map[vie->index];
2484 
2485 	/* 'scale' makes sense only in the context of an index register */
2486 	if (vie->index_register < VM_REG_LAST)
2487 		vie->scale = 1 << vie->ss;
2488 
2489 	vie_advance(vie);
2490 
2491 	return (0);
2492 }
2493 
2494 static int
2495 decode_displacement(struct vie *vie)
2496 {
2497 	int n, i;
2498 	uint8_t x;
2499 
2500 	union {
2501 		char	buf[4];
2502 		int8_t	signed8;
2503 		int32_t	signed32;
2504 	} u;
2505 
2506 	if ((n = vie->disp_bytes) == 0)
2507 		return (0);
2508 
2509 	if (n != 1 && n != 4)
2510 		panic("decode_displacement: invalid disp_bytes %d", n);
2511 
2512 	for (i = 0; i < n; i++) {
2513 		if (vie_peek(vie, &x))
2514 			return (-1);
2515 
2516 		u.buf[i] = x;
2517 		vie_advance(vie);
2518 	}
2519 
2520 	if (n == 1)
2521 		vie->displacement = u.signed8;		/* sign-extended */
2522 	else
2523 		vie->displacement = u.signed32;		/* sign-extended */
2524 
2525 	return (0);
2526 }
2527 
2528 static int
2529 decode_immediate(struct vie *vie)
2530 {
2531 	int i, n;
2532 	uint8_t x;
2533 	union {
2534 		char	buf[4];
2535 		int8_t	signed8;
2536 		int16_t	signed16;
2537 		int32_t	signed32;
2538 	} u;
2539 
2540 	/* Figure out immediate operand size (if any) */
2541 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2542 		/*
2543 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2544 		 * In 64-bit mode the typical size of immediate operands
2545 		 * remains 32-bits. When the operand size if 64-bits, the
2546 		 * processor sign-extends all immediates to 64-bits prior
2547 		 * to their use.
2548 		 */
2549 		if (vie->opsize == 4 || vie->opsize == 8)
2550 			vie->imm_bytes = 4;
2551 		else
2552 			vie->imm_bytes = 2;
2553 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2554 		vie->imm_bytes = 1;
2555 	}
2556 
2557 	if ((n = vie->imm_bytes) == 0)
2558 		return (0);
2559 
2560 	KASSERT(n == 1 || n == 2 || n == 4,
2561 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2562 
2563 	for (i = 0; i < n; i++) {
2564 		if (vie_peek(vie, &x))
2565 			return (-1);
2566 
2567 		u.buf[i] = x;
2568 		vie_advance(vie);
2569 	}
2570 
2571 	/* sign-extend the immediate value before use */
2572 	if (n == 1)
2573 		vie->immediate = u.signed8;
2574 	else if (n == 2)
2575 		vie->immediate = u.signed16;
2576 	else
2577 		vie->immediate = u.signed32;
2578 
2579 	return (0);
2580 }
2581 
2582 static int
2583 decode_moffset(struct vie *vie)
2584 {
2585 	int i, n;
2586 	uint8_t x;
2587 	union {
2588 		char	buf[8];
2589 		uint64_t u64;
2590 	} u;
2591 
2592 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2593 		return (0);
2594 
2595 	/*
2596 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2597 	 * The memory offset size follows the address-size of the instruction.
2598 	 */
2599 	n = vie->addrsize;
2600 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2601 
2602 	u.u64 = 0;
2603 	for (i = 0; i < n; i++) {
2604 		if (vie_peek(vie, &x))
2605 			return (-1);
2606 
2607 		u.buf[i] = x;
2608 		vie_advance(vie);
2609 	}
2610 	vie->displacement = u.u64;
2611 	return (0);
2612 }
2613 
2614 /*
2615  * Verify that the 'guest linear address' provided as collateral of the nested
2616  * page table fault matches with our instruction decoding.
2617  */
2618 static int
2619 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
2620     enum vm_cpu_mode cpu_mode)
2621 {
2622 	int error;
2623 	uint64_t base, segbase, idx, gla2;
2624 	enum vm_reg_name seg;
2625 	struct seg_desc desc;
2626 
2627 	/* Skip 'gla' verification */
2628 	if (gla == VIE_INVALID_GLA)
2629 		return (0);
2630 
2631 	base = 0;
2632 	if (vie->base_register != VM_REG_LAST) {
2633 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2634 		if (error) {
2635 			printf("verify_gla: error %d getting base reg %d\n",
2636 				error, vie->base_register);
2637 			return (-1);
2638 		}
2639 
2640 		/*
2641 		 * RIP-relative addressing starts from the following
2642 		 * instruction
2643 		 */
2644 		if (vie->base_register == VM_REG_GUEST_RIP)
2645 			base += vie->num_processed;
2646 	}
2647 
2648 	idx = 0;
2649 	if (vie->index_register != VM_REG_LAST) {
2650 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2651 		if (error) {
2652 			printf("verify_gla: error %d getting index reg %d\n",
2653 				error, vie->index_register);
2654 			return (-1);
2655 		}
2656 	}
2657 
2658 	/*
2659 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2660 	 *
2661 	 * In 64-bit mode, segmentation is generally (but not
2662 	 * completely) disabled.  The exceptions are the FS and GS
2663 	 * segments.
2664 	 *
2665 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2666 	 * as the base, the SS segment is the default segment.  For
2667 	 * other data references, except when relative to stack or
2668 	 * string destination the DS segment is the default.  These
2669 	 * can be overridden to allow other segments to be accessed.
2670 	 */
2671 	if (vie->segment_override)
2672 		seg = vie->segment_register;
2673 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2674 	    vie->base_register == VM_REG_GUEST_RBP)
2675 		seg = VM_REG_GUEST_SS;
2676 	else
2677 		seg = VM_REG_GUEST_DS;
2678 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2679 	    seg != VM_REG_GUEST_GS) {
2680 		segbase = 0;
2681 	} else {
2682 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
2683 		if (error) {
2684 			printf("verify_gla: error %d getting segment"
2685 			       " descriptor %d", error,
2686 			       vie->segment_register);
2687 			return (-1);
2688 		}
2689 		segbase = desc.base;
2690 	}
2691 
2692 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2693 	gla2 &= size2mask[vie->addrsize];
2694 	if (gla != gla2) {
2695 		printf("verify_gla mismatch: segbase(0x%0lx)"
2696 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2697 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2698 		       segbase, base, vie->scale, idx, vie->displacement,
2699 		       gla, gla2);
2700 		return (-1);
2701 	}
2702 
2703 	return (0);
2704 }
2705 
2706 int
2707 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2708 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2709 {
2710 
2711 	if (decode_prefixes(vie, cpu_mode, cs_d))
2712 		return (-1);
2713 
2714 	if (decode_opcode(vie))
2715 		return (-1);
2716 
2717 	if (decode_modrm(vie, cpu_mode))
2718 		return (-1);
2719 
2720 	if (decode_sib(vie))
2721 		return (-1);
2722 
2723 	if (decode_displacement(vie))
2724 		return (-1);
2725 
2726 	if (decode_immediate(vie))
2727 		return (-1);
2728 
2729 	if (decode_moffset(vie))
2730 		return (-1);
2731 
2732 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2733 		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
2734 			return (-1);
2735 	}
2736 
2737 	vie->decoded = 1;	/* success */
2738 
2739 	return (0);
2740 }
2741 #endif	/* _KERNEL */
2742