xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 732a02b4e77866604a120a275c082bb6221bd2ff)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef _KERNEL
36 #include <sys/param.h>
37 #include <sys/pcpu.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/vmparam.h>
45 #include <machine/vmm.h>
46 #else	/* !_KERNEL */
47 #include <sys/types.h>
48 #include <sys/errno.h>
49 #include <sys/_iovec.h>
50 
51 #include <machine/vmm.h>
52 
53 #include <err.h>
54 #include <assert.h>
55 #include <stdbool.h>
56 #include <stdio.h>
57 #include <strings.h>
58 #include <vmmapi.h>
59 #define	KASSERT(exp,msg)	assert((exp))
60 #define	panic(...)		errx(4, __VA_ARGS__)
61 #endif	/* _KERNEL */
62 
63 #include <machine/vmm_instruction_emul.h>
64 #include <x86/psl.h>
65 #include <x86/specialreg.h>
66 
67 /* struct vie_op.op_type */
68 enum {
69 	VIE_OP_TYPE_NONE = 0,
70 	VIE_OP_TYPE_MOV,
71 	VIE_OP_TYPE_MOVSX,
72 	VIE_OP_TYPE_MOVZX,
73 	VIE_OP_TYPE_AND,
74 	VIE_OP_TYPE_OR,
75 	VIE_OP_TYPE_SUB,
76 	VIE_OP_TYPE_TWO_BYTE,
77 	VIE_OP_TYPE_PUSH,
78 	VIE_OP_TYPE_CMP,
79 	VIE_OP_TYPE_POP,
80 	VIE_OP_TYPE_MOVS,
81 	VIE_OP_TYPE_GROUP1,
82 	VIE_OP_TYPE_STOS,
83 	VIE_OP_TYPE_BITTEST,
84 	VIE_OP_TYPE_TWOB_GRP15,
85 	VIE_OP_TYPE_ADD,
86 	VIE_OP_TYPE_TEST,
87 	VIE_OP_TYPE_LAST
88 };
89 
90 /* struct vie_op.op_flags */
91 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
92 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
93 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
94 #define	VIE_OP_F_NO_MODRM	(1 << 3)
95 #define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
96 
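/*
 * Opcodes that are not explicitly initialized in the tables below are
 * zero-filled, i.e. their op_type is VIE_OP_TYPE_NONE.
 */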
97 static const struct vie_op two_byte_opcodes[256] = {
98 	[0xAE] = {
99 		  .op_byte = 0xAE,
100 		  .op_type = VIE_OP_TYPE_TWOB_GRP15,
101 	},
102 	[0xB6] = {
103 		.op_byte = 0xB6,
104 		.op_type = VIE_OP_TYPE_MOVZX,
105 	},
106 	[0xB7] = {
107 		.op_byte = 0xB7,
108 		.op_type = VIE_OP_TYPE_MOVZX,
109 	},
110 	[0xBA] = {
111 		.op_byte = 0xBA,
112 		.op_type = VIE_OP_TYPE_BITTEST,
113 		.op_flags = VIE_OP_F_IMM8,
114 	},
115 	[0xBE] = {
116 		.op_byte = 0xBE,
117 		.op_type = VIE_OP_TYPE_MOVSX,
118 	},
119 };
120 
121 static const struct vie_op one_byte_opcodes[256] = {
122 	[0x03] = {
123 		.op_byte = 0x03,
124 		.op_type = VIE_OP_TYPE_ADD,
125 	},
126 	[0x0F] = {
127 		.op_byte = 0x0F,
128 		.op_type = VIE_OP_TYPE_TWO_BYTE
129 	},
130 	[0x0B] = {
131 		.op_byte = 0x0B,
132 		.op_type = VIE_OP_TYPE_OR,
133 	},
134 	[0x2B] = {
135 		.op_byte = 0x2B,
136 		.op_type = VIE_OP_TYPE_SUB,
137 	},
138 	[0x39] = {
139 		.op_byte = 0x39,
140 		.op_type = VIE_OP_TYPE_CMP,
141 	},
142 	[0x3B] = {
143 		.op_byte = 0x3B,
144 		.op_type = VIE_OP_TYPE_CMP,
145 	},
146 	[0x88] = {
147 		.op_byte = 0x88,
148 		.op_type = VIE_OP_TYPE_MOV,
149 	},
150 	[0x89] = {
151 		.op_byte = 0x89,
152 		.op_type = VIE_OP_TYPE_MOV,
153 	},
154 	[0x8A] = {
155 		.op_byte = 0x8A,
156 		.op_type = VIE_OP_TYPE_MOV,
157 	},
158 	[0x8B] = {
159 		.op_byte = 0x8B,
160 		.op_type = VIE_OP_TYPE_MOV,
161 	},
162 	[0xA1] = {
163 		.op_byte = 0xA1,
164 		.op_type = VIE_OP_TYPE_MOV,
165 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
166 	},
167 	[0xA3] = {
168 		.op_byte = 0xA3,
169 		.op_type = VIE_OP_TYPE_MOV,
170 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
171 	},
172 	[0xA4] = {
173 		.op_byte = 0xA4,
174 		.op_type = VIE_OP_TYPE_MOVS,
175 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
176 	},
177 	[0xA5] = {
178 		.op_byte = 0xA5,
179 		.op_type = VIE_OP_TYPE_MOVS,
180 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
181 	},
182 	[0xAA] = {
183 		.op_byte = 0xAA,
184 		.op_type = VIE_OP_TYPE_STOS,
185 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
186 	},
187 	[0xAB] = {
188 		.op_byte = 0xAB,
189 		.op_type = VIE_OP_TYPE_STOS,
190 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
191 	},
192 	[0xC6] = {
193 		/* XXX Group 11 extended opcode - not just MOV */
194 		.op_byte = 0xC6,
195 		.op_type = VIE_OP_TYPE_MOV,
196 		.op_flags = VIE_OP_F_IMM8,
197 	},
198 	[0xC7] = {
199 		.op_byte = 0xC7,
200 		.op_type = VIE_OP_TYPE_MOV,
201 		.op_flags = VIE_OP_F_IMM,
202 	},
203 	[0x23] = {
204 		.op_byte = 0x23,
205 		.op_type = VIE_OP_TYPE_AND,
206 	},
207 	[0x80] = {
208 		/* Group 1 extended opcode */
209 		.op_byte = 0x80,
210 		.op_type = VIE_OP_TYPE_GROUP1,
211 		.op_flags = VIE_OP_F_IMM8,
212 	},
213 	[0x81] = {
214 		/* Group 1 extended opcode */
215 		.op_byte = 0x81,
216 		.op_type = VIE_OP_TYPE_GROUP1,
217 		.op_flags = VIE_OP_F_IMM,
218 	},
219 	[0x83] = {
220 		/* Group 1 extended opcode */
221 		.op_byte = 0x83,
222 		.op_type = VIE_OP_TYPE_GROUP1,
223 		.op_flags = VIE_OP_F_IMM8,
224 	},
225 	[0x8F] = {
226 		/* XXX Group 1A extended opcode - not just POP */
227 		.op_byte = 0x8F,
228 		.op_type = VIE_OP_TYPE_POP,
229 	},
230 	[0xF7] = {
231 		/* XXX Group 3 extended opcode - not just TEST */
232 		.op_byte = 0xF7,
233 		.op_type = VIE_OP_TYPE_TEST,
234 		.op_flags = VIE_OP_F_IMM,
235 	},
236 	[0xFF] = {
237 		/* XXX Group 5 extended opcode - not just PUSH */
238 		.op_byte = 0xFF,
239 		.op_type = VIE_OP_TYPE_PUSH,
240 	}
241 };
242 
243 /* struct vie.mod */
244 #define	VIE_MOD_INDIRECT		0
245 #define	VIE_MOD_INDIRECT_DISP8		1
246 #define	VIE_MOD_INDIRECT_DISP32		2
247 #define	VIE_MOD_DIRECT			3
248 
249 /* struct vie.rm */
250 #define	VIE_RM_SIB			4
251 #define	VIE_RM_DISP32			5
252 
253 #define	GB				(1024 * 1024 * 1024)
254 
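/*
 * Map a ModRM/REX register encoding (0-15) to the corresponding vmm
 * register name.
 */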
255 static enum vm_reg_name gpr_map[16] = {
256 	VM_REG_GUEST_RAX,
257 	VM_REG_GUEST_RCX,
258 	VM_REG_GUEST_RDX,
259 	VM_REG_GUEST_RBX,
260 	VM_REG_GUEST_RSP,
261 	VM_REG_GUEST_RBP,
262 	VM_REG_GUEST_RSI,
263 	VM_REG_GUEST_RDI,
264 	VM_REG_GUEST_R8,
265 	VM_REG_GUEST_R9,
266 	VM_REG_GUEST_R10,
267 	VM_REG_GUEST_R11,
268 	VM_REG_GUEST_R12,
269 	VM_REG_GUEST_R13,
270 	VM_REG_GUEST_R14,
271 	VM_REG_GUEST_R15
272 };
273 
274 static uint64_t size2mask[] = {
275 	[1] = 0xff,
276 	[2] = 0xffff,
277 	[4] = 0xffffffff,
278 	[8] = 0xffffffffffffffff,
279 };
280 
281 static int
282 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
283 {
284 	int error;
285 
286 	error = vm_get_register(vm, vcpuid, reg, rval);
287 
288 	return (error);
289 }
290 
291 static void
292 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
293 {
294 	*lhbr = 0;
295 	*reg = gpr_map[vie->reg];
296 
297 	/*
298 	 * 64-bit mode imposes limitations on accessing legacy high byte
299 	 * registers (lhbr).
300 	 *
301 	 * The legacy high-byte registers cannot be addressed if the REX
302 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
303 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
304 	 *
305 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
306 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
307 	 * %ah, %ch, %dh and %bh respectively.
308 	 */
309 	if (!vie->rex_present) {
310 		if (vie->reg & 0x4) {
311 			*lhbr = 1;
312 			*reg = gpr_map[vie->reg & 0x3];
313 		}
314 	}
315 }
316 
317 static int
318 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
319 {
320 	uint64_t val;
321 	int error, lhbr;
322 	enum vm_reg_name reg;
323 
324 	vie_calc_bytereg(vie, &reg, &lhbr);
325 	error = vm_get_register(vm, vcpuid, reg, &val);
326 
327 	/*
328 	 * To obtain the value of a legacy high byte register shift the
329 	 * base register right by 8 bits (%ah = %rax >> 8).
330 	 */
331 	if (lhbr)
332 		*rval = val >> 8;
333 	else
334 		*rval = val;
335 	return (error);
336 }
337 
338 static int
339 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
340 {
341 	uint64_t origval, val, mask;
342 	int error, lhbr;
343 	enum vm_reg_name reg;
344 
345 	vie_calc_bytereg(vie, &reg, &lhbr);
346 	error = vm_get_register(vm, vcpuid, reg, &origval);
347 	if (error == 0) {
348 		val = byte;
349 		mask = 0xff;
350 		if (lhbr) {
351 			/*
352 			 * Shift left by 8 to store 'byte' in a legacy high
353 			 * byte register.
354 			 */
355 			val <<= 8;
356 			mask <<= 8;
357 		}
358 		val |= origval & ~mask;
359 		error = vm_set_register(vm, vcpuid, reg, val);
360 	}
361 	return (error);
362 }
363 
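/*
 * Write 'val' to 'reg' using x86 partial-register semantics: 1- and 2-byte
 * writes preserve the remaining high bits, while a 4-byte write zero-extends
 * into the upper half of the 64-bit register.
 */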
364 int
365 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
366 		    uint64_t val, int size)
367 {
368 	int error;
369 	uint64_t origval;
370 
371 	switch (size) {
372 	case 1:
373 	case 2:
374 		error = vie_read_register(vm, vcpuid, reg, &origval);
375 		if (error)
376 			return (error);
377 		val &= size2mask[size];
378 		val |= origval & ~size2mask[size];
379 		break;
380 	case 4:
381 		val &= 0xffffffffUL;
382 		break;
383 	case 8:
384 		break;
385 	default:
386 		return (EINVAL);
387 	}
388 
389 	error = vm_set_register(vm, vcpuid, reg, val);
390 	return (error);
391 }
392 
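/* Arithmetic status flags in %rflags: CF, PF, AF, ZF, SF and OF. */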
393 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
394 
395 /*
396  * Return the status flags that would result from doing (x - y).
397  */
398 #define	GETCC(sz)							\
399 static u_long								\
400 getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
401 {									\
402 	u_long rflags;							\
403 									\
404 	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
405 	    "=r" (rflags), "+r" (x) : "m" (y));				\
406 	return (rflags);						\
407 } struct __hack
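/*
 * The trailing 'struct __hack' in the macro body allows each GETCC(n)
 * expansion below to be terminated with a semicolon at file scope.
 */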
408 
409 GETCC(8);
410 GETCC(16);
411 GETCC(32);
412 GETCC(64);
413 
414 static u_long
415 getcc(int opsize, uint64_t x, uint64_t y)
416 {
417 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
418 	    ("getcc: invalid operand size %d", opsize));
419 
420 	if (opsize == 1)
421 		return (getcc8(x, y));
422 	else if (opsize == 2)
423 		return (getcc16(x, y));
424 	else if (opsize == 4)
425 		return (getcc32(x, y));
426 	else
427 		return (getcc64(x, y));
428 }
429 
430 /*
431  * Return the status flags that would result from doing (x + y).
432  */
433 #define	GETADDFLAGS(sz)							\
434 static u_long								\
435 getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
436 {									\
437 	u_long rflags;							\
438 									\
439 	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
440 	    "=r" (rflags), "+r" (x) : "m" (y));				\
441 	return (rflags);						\
442 } struct __hack
443 
444 GETADDFLAGS(8);
445 GETADDFLAGS(16);
446 GETADDFLAGS(32);
447 GETADDFLAGS(64);
448 
449 static u_long
450 getaddflags(int opsize, uint64_t x, uint64_t y)
451 {
452 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
453 	    ("getaddflags: invalid operand size %d", opsize));
454 
455 	if (opsize == 1)
456 		return (getaddflags8(x, y));
457 	else if (opsize == 2)
458 		return (getaddflags16(x, y));
459 	else if (opsize == 4)
460 		return (getaddflags32(x, y));
461 	else
462 		return (getaddflags64(x, y));
463 }
464 
465 /*
466  * Return the status flags that would result from doing (x & y).
467  */
468 #define	GETANDFLAGS(sz)							\
469 static u_long								\
470 getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
471 {									\
472 	u_long rflags;							\
473 									\
474 	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
475 	    "=r" (rflags), "+r" (x) : "m" (y));				\
476 	return (rflags);						\
477 } struct __hack
478 
479 GETANDFLAGS(8);
480 GETANDFLAGS(16);
481 GETANDFLAGS(32);
482 GETANDFLAGS(64);
483 
484 static u_long
485 getandflags(int opsize, uint64_t x, uint64_t y)
486 {
487 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
488 	    ("getandflags: invalid operand size %d", opsize));
489 
490 	if (opsize == 1)
491 		return (getandflags8(x, y));
492 	else if (opsize == 2)
493 		return (getandflags16(x, y));
494 	else if (opsize == 4)
495 		return (getandflags32(x, y));
496 	else
497 		return (getandflags64(x, y));
498 }
499 
500 static int
501 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
502 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
503 {
504 	int error, size;
505 	enum vm_reg_name reg;
506 	uint8_t byte;
507 	uint64_t val;
508 
509 	size = vie->opsize;
510 	error = EINVAL;
511 
512 	switch (vie->op.op_byte) {
513 	case 0x88:
514 		/*
515 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
516 		 * 88/r:	mov r/m8, r8
517 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
518 		 */
519 		size = 1;	/* override for byte operation */
520 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
521 		if (error == 0)
522 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
523 		break;
524 	case 0x89:
525 		/*
526 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
527 		 * 89/r:	mov r/m16, r16
528 		 * 89/r:	mov r/m32, r32
529 		 * REX.W + 89/r	mov r/m64, r64
530 		 */
531 		reg = gpr_map[vie->reg];
532 		error = vie_read_register(vm, vcpuid, reg, &val);
533 		if (error == 0) {
534 			val &= size2mask[size];
535 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
536 		}
537 		break;
538 	case 0x8A:
539 		/*
540 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
541 		 * 8A/r:	mov r8, r/m8
542 		 * REX + 8A/r:	mov r8, r/m8
543 		 */
544 		size = 1;	/* override for byte operation */
545 		error = memread(vm, vcpuid, gpa, &val, size, arg);
546 		if (error == 0)
547 			error = vie_write_bytereg(vm, vcpuid, vie, val);
548 		break;
549 	case 0x8B:
550 		/*
551 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
552 		 * 8B/r:	mov r16, r/m16
553 		 * 8B/r:	mov r32, r/m32
554 		 * REX.W 8B/r:	mov r64, r/m64
555 		 */
556 		error = memread(vm, vcpuid, gpa, &val, size, arg);
557 		if (error == 0) {
558 			reg = gpr_map[vie->reg];
559 			error = vie_update_register(vm, vcpuid, reg, val, size);
560 		}
561 		break;
562 	case 0xA1:
563 		/*
564 		 * MOV from seg:moffset to AX/EAX/RAX
565 		 * A1:		mov AX, moffs16
566 		 * A1:		mov EAX, moffs32
567 		 * REX.W + A1:	mov RAX, moffs64
568 		 */
569 		error = memread(vm, vcpuid, gpa, &val, size, arg);
570 		if (error == 0) {
571 			reg = VM_REG_GUEST_RAX;
572 			error = vie_update_register(vm, vcpuid, reg, val, size);
573 		}
574 		break;
575 	case 0xA3:
576 		/*
577 		 * MOV from AX/EAX/RAX to seg:moffset
578 		 * A3:		mov moffs16, AX
579 		 * A3:		mov moffs32, EAX
580 		 * REX.W + A3:	mov moffs64, RAX
581 		 */
582 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
583 		if (error == 0) {
584 			val &= size2mask[size];
585 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
586 		}
587 		break;
588 	case 0xC6:
589 		/*
590 		 * MOV from imm8 to mem (ModRM:r/m)
591 		 * C6/0		mov r/m8, imm8
592 		 * REX + C6/0	mov r/m8, imm8
593 		 */
594 		size = 1;	/* override for byte operation */
595 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
596 		break;
597 	case 0xC7:
598 		/*
599 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
600 		 * C7/0		mov r/m16, imm16
601 		 * C7/0		mov r/m32, imm32
602 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
603 		 */
604 		val = vie->immediate & size2mask[size];
605 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
606 		break;
607 	default:
608 		break;
609 	}
610 
611 	return (error);
612 }
613 
614 static int
615 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
616 	     mem_region_read_t memread, mem_region_write_t memwrite,
617 	     void *arg)
618 {
619 	int error, size;
620 	enum vm_reg_name reg;
621 	uint64_t val;
622 
623 	size = vie->opsize;
624 	error = EINVAL;
625 
626 	switch (vie->op.op_byte) {
627 	case 0xB6:
628 		/*
629 		 * MOV and zero extend byte from mem (ModRM:r/m) to
630 		 * reg (ModRM:reg).
631 		 *
632 		 * 0F B6/r		movzx r16, r/m8
633 		 * 0F B6/r		movzx r32, r/m8
634 		 * REX.W + 0F B6/r	movzx r64, r/m8
635 		 */
636 
637 		/* get the first operand */
638 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
639 		if (error)
640 			break;
641 
642 		/* get the second operand */
643 		reg = gpr_map[vie->reg];
644 
645 		/* zero-extend byte */
646 		val = (uint8_t)val;
647 
648 		/* write the result */
649 		error = vie_update_register(vm, vcpuid, reg, val, size);
650 		break;
651 	case 0xB7:
652 		/*
653 		 * MOV and zero extend word from mem (ModRM:r/m) to
654 		 * reg (ModRM:reg).
655 		 *
656 		 * 0F B7/r		movzx r32, r/m16
657 		 * REX.W + 0F B7/r	movzx r64, r/m16
658 		 */
659 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
660 		if (error)
661 			return (error);
662 
663 		reg = gpr_map[vie->reg];
664 
665 		/* zero-extend word */
666 		val = (uint16_t)val;
667 
668 		error = vie_update_register(vm, vcpuid, reg, val, size);
669 		break;
670 	case 0xBE:
671 		/*
672 		 * MOV and sign extend byte from mem (ModRM:r/m) to
673 		 * reg (ModRM:reg).
674 		 *
675 		 * 0F BE/r		movsx r16, r/m8
676 		 * 0F BE/r		movsx r32, r/m8
677 		 * REX.W + 0F BE/r	movsx r64, r/m8
678 		 */
679 
680 		/* get the first operand */
681 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
682 		if (error)
683 			break;
684 
685 		/* get the second operand */
686 		reg = gpr_map[vie->reg];
687 
688 		/* sign extend byte */
689 		val = (int8_t)val;
690 
691 		/* write the result */
692 		error = vie_update_register(vm, vcpuid, reg, val, size);
693 		break;
694 	default:
695 		break;
696 	}
697 	return (error);
698 }
699 
700 /*
701  * Helper function to calculate and validate a linear address.
702  */
703 static int
704 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
705     int opsize, int addrsize, int prot, enum vm_reg_name seg,
706     enum vm_reg_name gpr, uint64_t *gla, int *fault)
707 {
708 	struct seg_desc desc;
709 	uint64_t cr0, val, rflags;
710 	int error;
711 
712 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
713 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
714 
715 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
716 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
717 
718 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
719 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
720 	    __func__, error, seg));
721 
722 	error = vie_read_register(vm, vcpuid, gpr, &val);
723 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
724 	    error, gpr));
725 
726 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
727 	    addrsize, prot, gla)) {
728 		if (seg == VM_REG_GUEST_SS)
729 			vm_inject_ss(vm, vcpuid, 0);
730 		else
731 			vm_inject_gp(vm, vcpuid);
732 		goto guest_fault;
733 	}
734 
735 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
736 		if (seg == VM_REG_GUEST_SS)
737 			vm_inject_ss(vm, vcpuid, 0);
738 		else
739 			vm_inject_gp(vm, vcpuid);
740 		goto guest_fault;
741 	}
742 
743 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
744 		vm_inject_ac(vm, vcpuid, 0);
745 		goto guest_fault;
746 	}
747 
748 	*fault = 0;
749 	return (0);
750 
751 guest_fault:
752 	*fault = 1;
753 	return (0);
754 }
755 
756 static int
757 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
758     struct vm_guest_paging *paging, mem_region_read_t memread,
759     mem_region_write_t memwrite, void *arg)
760 {
761 #ifdef _KERNEL
762 	struct vm_copyinfo copyinfo[2];
763 #else
764 	struct iovec copyinfo[2];
765 #endif
766 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
767 	uint64_t rcx, rdi, rsi, rflags;
768 	int error, fault, opsize, seg, repeat;
769 
770 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
771 	val = 0;
772 	error = 0;
773 
774 	/*
775 	 * XXX although the MOVS instruction is only supposed to be used with
776 	 * the "rep" prefix, some guests like FreeBSD use "repnz" instead.
777 	 *
778 	 * Empirically, the "repnz" prefix behaves identically to "rep" and
779 	 * the zero flag does not make a difference.
780 	 */
781 	repeat = vie->repz_present | vie->repnz_present;
782 
783 	if (repeat) {
784 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
785 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
786 
787 		/*
788 		 * The count register is %rcx, %ecx or %cx depending on the
789 		 * address size of the instruction.
790 		 */
791 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
792 			error = 0;
793 			goto done;
794 		}
795 	}
796 
797 	/*
798 	 *	Source		Destination	Comments
799 	 *	--------------------------------------------
800 	 * (1)  memory		memory		n/a
801 	 * (2)  memory		mmio		emulated
802 	 * (3)  mmio		memory		emulated
803 	 * (4)  mmio		mmio		emulated
804 	 *
805 	 * At this point we don't have sufficient information to distinguish
806 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
807 	 * out because it will succeed only when operating on regular memory.
808 	 *
809 	 * XXX the emulation doesn't properly handle the case where 'gpa'
810 	 * straddles the boundary between normal memory and MMIO.
811 	 */
812 
813 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
814 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
815 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
816 	if (error || fault)
817 		goto done;
818 
819 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
820 	    copyinfo, nitems(copyinfo), &fault);
821 	if (error == 0) {
822 		if (fault)
823 			goto done;	/* Resume guest to handle fault */
824 
825 		/*
826 		 * case (2): read from system memory and write to mmio.
827 		 */
828 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
829 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
830 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
831 		if (error)
832 			goto done;
833 	} else {
834 		/*
835 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
836 		 * if 'srcaddr' is in the mmio space.
837 		 */
838 
839 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
840 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
841 		    &fault);
842 		if (error || fault)
843 			goto done;
844 
845 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
846 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
847 		if (error == 0) {
848 			if (fault)
849 				goto done;    /* Resume guest to handle fault */
850 
851 			/*
852 			 * case (3): read from MMIO and write to system memory.
853 			 *
854 			 * An MMIO read can have side effects, so we
855 			 * commit to it only after vm_copy_setup() is
856 			 * successful. If a page-fault needs to be
857 			 * injected into the guest then it will happen
858 			 * before the MMIO read is attempted.
859 			 */
860 			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
861 			if (error)
862 				goto done;
863 
864 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
865 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
866 		} else {
867 			/*
868 			 * Case (4): read from and write to mmio.
869 			 *
870 			 * Commit to the MMIO read/write (with potential
871 			 * side-effects) only after we are sure that the
872 			 * instruction is not going to be restarted due
873 			 * to address translation faults.
874 			 */
875 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
876 			    PROT_READ, &srcgpa, &fault);
877 			if (error || fault)
878 				goto done;
879 
880 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
881 			   PROT_WRITE, &dstgpa, &fault);
882 			if (error || fault)
883 				goto done;
884 
885 			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
886 			if (error)
887 				goto done;
888 
889 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
890 			if (error)
891 				goto done;
892 		}
893 	}
894 
895 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
896 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
897 
898 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
899 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
900 
901 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
902 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
903 
904 	if (rflags & PSL_D) {
905 		rsi -= opsize;
906 		rdi -= opsize;
907 	} else {
908 		rsi += opsize;
909 		rdi += opsize;
910 	}
911 
912 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
913 	    vie->addrsize);
914 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
915 
916 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
917 	    vie->addrsize);
918 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
919 
920 	if (repeat) {
921 		rcx = rcx - 1;
922 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
923 		    rcx, vie->addrsize);
924 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
925 
926 		/*
927 		 * Repeat the instruction if the count register is not zero.
928 		 */
929 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
930 			vm_restart_instruction(vm, vcpuid);
931 	}
932 done:
933 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
934 	    __func__, error));
935 	return (error);
936 }
937 
938 static int
939 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
940     struct vm_guest_paging *paging, mem_region_read_t memread,
941     mem_region_write_t memwrite, void *arg)
942 {
943 	int error, opsize, repeat;
944 	uint64_t val;
945 	uint64_t rcx, rdi, rflags;
946 
947 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
948 	repeat = vie->repz_present | vie->repnz_present;
949 
950 	if (repeat) {
951 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
952 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
953 
954 		/*
955 		 * The count register is %rcx, %ecx or %cx depending on the
956 		 * address size of the instruction.
957 		 */
958 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
959 			return (0);
960 	}
961 
962 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
963 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
964 
965 	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
966 	if (error)
967 		return (error);
968 
969 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
970 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
971 
972 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
973 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
974 
975 	if (rflags & PSL_D)
976 		rdi -= opsize;
977 	else
978 		rdi += opsize;
979 
980 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
981 	    vie->addrsize);
982 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
983 
984 	if (repeat) {
985 		rcx = rcx - 1;
986 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
987 		    rcx, vie->addrsize);
988 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
989 
990 		/*
991 		 * Repeat the instruction if the count register is not zero.
992 		 */
993 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
994 			vm_restart_instruction(vm, vcpuid);
995 	}
996 
997 	return (0);
998 }
999 
1000 static int
1001 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1002 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1003 {
1004 	int error, size;
1005 	enum vm_reg_name reg;
1006 	uint64_t result, rflags, rflags2, val1, val2;
1007 
1008 	size = vie->opsize;
1009 	error = EINVAL;
1010 
1011 	switch (vie->op.op_byte) {
1012 	case 0x23:
1013 		/*
1014 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1015 		 * result in reg.
1016 		 *
1017 		 * 23/r		and r16, r/m16
1018 		 * 23/r		and r32, r/m32
1019 		 * REX.W + 23/r	and r64, r/m64
1020 		 */
1021 
1022 		/* get the first operand */
1023 		reg = gpr_map[vie->reg];
1024 		error = vie_read_register(vm, vcpuid, reg, &val1);
1025 		if (error)
1026 			break;
1027 
1028 		/* get the second operand */
1029 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1030 		if (error)
1031 			break;
1032 
1033 		/* perform the operation and write the result */
1034 		result = val1 & val2;
1035 		error = vie_update_register(vm, vcpuid, reg, result, size);
1036 		break;
1037 	case 0x81:
1038 	case 0x83:
1039 		/*
1040 		 * AND mem (ModRM:r/m) with immediate and store the
1041 		 * result in mem.
1042 		 *
1043 		 * 81 /4		and r/m16, imm16
1044 		 * 81 /4		and r/m32, imm32
1045 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1046 		 *
1047 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1048 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1049 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1050 		 */
1051 
1052 		/* get the first operand */
1053 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
1054 		if (error)
1055 			break;
1056 
1057 		/*
1058 		 * perform the operation with the pre-fetched immediate
1059 		 * operand and write the result
1060 		 */
1061 		result = val1 & vie->immediate;
1062 		error = memwrite(vm, vcpuid, gpa, result, size, arg);
1063 		break;
1064 	default:
1065 		break;
1066 	}
1067 	if (error)
1068 		return (error);
1069 
1070 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1071 	if (error)
1072 		return (error);
1073 
1074 	/*
1075 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1076 	 * to the result; AF is undefined.
1077 	 *
1078 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1079 	 */
1080 	rflags2 = getcc(size, result, 0);
1081 	rflags &= ~RFLAGS_STATUS_BITS;
1082 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1083 
1084 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1085 	return (error);
1086 }
1087 
1088 static int
1089 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1090 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1091 {
1092 	int error, size;
1093 	enum vm_reg_name reg;
1094 	uint64_t result, rflags, rflags2, val1, val2;
1095 
1096 	size = vie->opsize;
1097 	error = EINVAL;
1098 
1099 	switch (vie->op.op_byte) {
1100 	case 0x0B:
1101 		/*
1102 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1103 		 * result in reg.
1104 		 *
1105 		 * 0b/r         or r16, r/m16
1106 		 * 0b/r         or r32, r/m32
1107 		 * REX.W + 0b/r or r64, r/m64
1108 		 */
1109 
1110 		/* get the first operand */
1111 		reg = gpr_map[vie->reg];
1112 		error = vie_read_register(vm, vcpuid, reg, &val1);
1113 		if (error)
1114 			break;
1115 
1116 		/* get the second operand */
1117 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1118 		if (error)
1119 			break;
1120 
1121 		/* perform the operation and write the result */
1122 		result = val1 | val2;
1123 		error = vie_update_register(vm, vcpuid, reg, result, size);
1124 		break;
1125 	case 0x81:
1126 	case 0x83:
1127 		/*
1128 		 * OR mem (ModRM:r/m) with immediate and store the
1129 		 * result in mem.
1130 		 *
1131 		 * 81 /1		or r/m16, imm16
1132 		 * 81 /1		or r/m32, imm32
1133 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1134 		 *
1135 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1136 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1137 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1138 		 */
1139 
1140 		/* get the first operand */
1141 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
1142 		if (error)
1143 			break;
1144 
1145 		/*
1146 		 * perform the operation with the pre-fetched immediate
1147 		 * operand and write the result
1148 		 */
1149 		result = val1 | vie->immediate;
1150 		error = memwrite(vm, vcpuid, gpa, result, size, arg);
1151 		break;
1152 	default:
1153 		break;
1154 	}
1155 	if (error)
1156 		return (error);
1157 
1158 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1159 	if (error)
1160 		return (error);
1161 
1162 	/*
1163 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1164 	 * to the result; AF is undefined.
1165 	 *
1166 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1167 	 */
1168 	rflags2 = getcc(size, result, 0);
1169 	rflags &= ~RFLAGS_STATUS_BITS;
1170 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1171 
1172 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1173 	return (error);
1174 }
1175 
1176 static int
1177 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1178 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1179 {
1180 	int error, size;
1181 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1182 	enum vm_reg_name reg;
1183 
1184 	size = vie->opsize;
1185 	switch (vie->op.op_byte) {
1186 	case 0x39:
1187 	case 0x3B:
1188 		/*
1189 		 * 39/r		CMP r/m16, r16
1190 		 * 39/r		CMP r/m32, r32
1191 		 * REX.W 39/r	CMP r/m64, r64
1192 		 *
1193 		 * 3B/r		CMP r16, r/m16
1194 		 * 3B/r		CMP r32, r/m32
1195 		 * REX.W + 3B/r	CMP r64, r/m64
1196 		 *
1197 		 * Compare the first operand with the second operand and
1198 		 * set status flags in EFLAGS register. The comparison is
1199 		 * performed by subtracting the second operand from the first
1200 		 * operand and then setting the status flags.
1201 		 */
1202 
1203 		/* Get the register operand */
1204 		reg = gpr_map[vie->reg];
1205 		error = vie_read_register(vm, vcpuid, reg, &regop);
1206 		if (error)
1207 			return (error);
1208 
1209 		/* Get the memory operand */
1210 		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1211 		if (error)
1212 			return (error);
1213 
1214 		if (vie->op.op_byte == 0x3B) {
1215 			op1 = regop;
1216 			op2 = memop;
1217 		} else {
1218 			op1 = memop;
1219 			op2 = regop;
1220 		}
1221 		rflags2 = getcc(size, op1, op2);
1222 		break;
1223 	case 0x80:
1224 	case 0x81:
1225 	case 0x83:
1226 		/*
1227 		 * 80 /7		cmp r/m8, imm8
1228 		 * REX + 80 /7		cmp r/m8, imm8
1229 		 *
1230 		 * 81 /7		cmp r/m16, imm16
1231 		 * 81 /7		cmp r/m32, imm32
1232 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1233 		 *
1234 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1235 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1236 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1237 		 *
1238 		 * Compare mem (ModRM:r/m) with immediate and set
1239 		 * status flags according to the results.  The
1240 		 * comparison is performed by subtracting the
1241 		 * immediate from the first operand and then setting
1242 		 * the status flags.
1243 		 *
1244 		 */
1245 		if (vie->op.op_byte == 0x80)
1246 			size = 1;
1247 
1248 		/* get the first operand */
1249 		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1250 		if (error)
1251 			return (error);
1252 
1253 		rflags2 = getcc(size, op1, vie->immediate);
1254 		break;
1255 	default:
1256 		return (EINVAL);
1257 	}
1258 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1259 	if (error)
1260 		return (error);
1261 	rflags &= ~RFLAGS_STATUS_BITS;
1262 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1263 
1264 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1265 	return (error);
1266 }
1267 
1268 static int
1269 emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1270     mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1271 {
1272 	int error, size;
1273 	uint64_t op1, rflags, rflags2;
1274 
1275 	size = vie->opsize;
1276 	error = EINVAL;
1277 
1278 	switch (vie->op.op_byte) {
1279 	case 0xF7:
1280 		/*
1281 		 * F7 /0		test r/m16, imm16
1282 		 * F7 /0		test r/m32, imm32
1283 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1284 		 *
1285 		 * Test mem (ModRM:r/m) with immediate and set status
1286 		 * flags according to the results.  The comparison is
1287 		 * performed by ANDing the immediate with the first
1288 		 * operand and then setting the status flags.
1289 		 */
1290 		if ((vie->reg & 7) != 0)
1291 			return (EINVAL);
1292 
1293 		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1294 		if (error)
1295 			return (error);
1296 
1297 		rflags2 = getandflags(size, op1, vie->immediate);
1298 		break;
1299 	default:
1300 		return (EINVAL);
1301 	}
1302 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1303 	if (error)
1304 		return (error);
1305 
1306 	/*
1307 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1308 	 * to the result; AF is undefined.
1309 	 */
1310 	rflags &= ~RFLAGS_STATUS_BITS;
1311 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1312 
1313 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1314 	return (error);
1315 }
1316 
1317 static int
1318 emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1319 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1320 {
1321 	int error, size;
1322 	uint64_t nval, rflags, rflags2, val1, val2;
1323 	enum vm_reg_name reg;
1324 
1325 	size = vie->opsize;
1326 	error = EINVAL;
1327 
1328 	switch (vie->op.op_byte) {
1329 	case 0x03:
1330 		/*
1331 		 * ADD r/m to r and store the result in r
1332 		 *
1333 		 * 03/r            ADD r16, r/m16
1334 		 * 03/r            ADD r32, r/m32
1335 		 * REX.W + 03/r    ADD r64, r/m64
1336 		 */
1337 
1338 		/* get the first operand */
1339 		reg = gpr_map[vie->reg];
1340 		error = vie_read_register(vm, vcpuid, reg, &val1);
1341 		if (error)
1342 			break;
1343 
1344 		/* get the second operand */
1345 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1346 		if (error)
1347 			break;
1348 
1349 		/* perform the operation and write the result */
1350 		nval = val1 + val2;
1351 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1352 		break;
1353 	default:
1354 		break;
1355 	}
1356 
1357 	if (!error) {
1358 		rflags2 = getaddflags(size, val1, val2);
1359 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1360 		    &rflags);
1361 		if (error)
1362 			return (error);
1363 
1364 		rflags &= ~RFLAGS_STATUS_BITS;
1365 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1366 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1367 		    rflags, 8);
1368 	}
1369 
1370 	return (error);
1371 }
1372 
1373 static int
1374 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1375 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1376 {
1377 	int error, size;
1378 	uint64_t nval, rflags, rflags2, val1, val2;
1379 	enum vm_reg_name reg;
1380 
1381 	size = vie->opsize;
1382 	error = EINVAL;
1383 
1384 	switch (vie->op.op_byte) {
1385 	case 0x2B:
1386 		/*
1387 		 * SUB r/m from r and store the result in r
1388 		 *
1389 		 * 2B/r            SUB r16, r/m16
1390 		 * 2B/r            SUB r32, r/m32
1391 		 * REX.W + 2B/r    SUB r64, r/m64
1392 		 */
1393 
1394 		/* get the first operand */
1395 		reg = gpr_map[vie->reg];
1396 		error = vie_read_register(vm, vcpuid, reg, &val1);
1397 		if (error)
1398 			break;
1399 
1400 		/* get the second operand */
1401 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1402 		if (error)
1403 			break;
1404 
1405 		/* perform the operation and write the result */
1406 		nval = val1 - val2;
1407 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1408 		break;
1409 	default:
1410 		break;
1411 	}
1412 
1413 	if (!error) {
1414 		rflags2 = getcc(size, val1, val2);
1415 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1416 		    &rflags);
1417 		if (error)
1418 			return (error);
1419 
1420 		rflags &= ~RFLAGS_STATUS_BITS;
1421 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1422 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1423 		    rflags, 8);
1424 	}
1425 
1426 	return (error);
1427 }
1428 
1429 static int
1430 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1431     struct vm_guest_paging *paging, mem_region_read_t memread,
1432     mem_region_write_t memwrite, void *arg)
1433 {
1434 #ifdef _KERNEL
1435 	struct vm_copyinfo copyinfo[2];
1436 #else
1437 	struct iovec copyinfo[2];
1438 #endif
1439 	struct seg_desc ss_desc;
1440 	uint64_t cr0, rflags, rsp, stack_gla, val;
1441 	int error, fault, size, stackaddrsize, pushop;
1442 
1443 	val = 0;
1444 	size = vie->opsize;
1445 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1446 
1447 	/*
1448 	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1449 	 */
1450 	if (paging->cpu_mode == CPU_MODE_REAL) {
1451 		stackaddrsize = 2;
1452 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1453 		/*
1454 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1455 		 * - Stack pointer size is always 64-bits.
1456 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1457 		 * - 16-bit PUSH/POP is supported by using the operand size
1458 		 *   override prefix (66H).
1459 		 */
1460 		stackaddrsize = 8;
1461 		size = vie->opsize_override ? 2 : 8;
1462 	} else {
1463 		/*
1464 		 * In protected or compatibility mode the 'B' flag in the
1465 		 * stack-segment descriptor determines the size of the
1466 		 * stack pointer.
1467 		 */
1468 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1469 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1470 		    __func__, error));
1471 		if (SEG_DESC_DEF32(ss_desc.access))
1472 			stackaddrsize = 4;
1473 		else
1474 			stackaddrsize = 2;
1475 	}
1476 
1477 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1478 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1479 
1480 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1481 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1482 
1483 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1484 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1485 	if (pushop) {
1486 		rsp -= size;
1487 	}
1488 
1489 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1490 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1491 	    &stack_gla)) {
1492 		vm_inject_ss(vm, vcpuid, 0);
1493 		return (0);
1494 	}
1495 
1496 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1497 		vm_inject_ss(vm, vcpuid, 0);
1498 		return (0);
1499 	}
1500 
1501 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1502 		vm_inject_ac(vm, vcpuid, 0);
1503 		return (0);
1504 	}
1505 
1506 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1507 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1508 	    &fault);
1509 	if (error || fault)
1510 		return (error);
1511 
1512 	if (pushop) {
1513 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1514 		if (error == 0)
1515 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1516 	} else {
1517 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1518 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1519 		rsp += size;
1520 	}
1521 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1522 
1523 	if (error == 0) {
1524 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1525 		    stackaddrsize);
1526 		KASSERT(error == 0, ("error %d updating rsp", error));
1527 	}
1528 	return (error);
1529 }
1530 
1531 static int
1532 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1533     struct vm_guest_paging *paging, mem_region_read_t memread,
1534     mem_region_write_t memwrite, void *arg)
1535 {
1536 	int error;
1537 
1538 	/*
1539 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1540 	 *
1541 	 * PUSH is part of the group 5 extended opcodes and is identified
1542 	 * by ModRM:reg = b110.
1543 	 */
1544 	if ((vie->reg & 7) != 6)
1545 		return (EINVAL);
1546 
1547 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1548 	    memwrite, arg);
1549 	return (error);
1550 }
1551 
1552 static int
1553 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1554     struct vm_guest_paging *paging, mem_region_read_t memread,
1555     mem_region_write_t memwrite, void *arg)
1556 {
1557 	int error;
1558 
1559 	/*
1560 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1561 	 *
1562 	 * POP is part of the group 1A extended opcodes and is identified
1563 	 * by ModRM:reg = b000.
1564 	 */
1565 	if ((vie->reg & 7) != 0)
1566 		return (EINVAL);
1567 
1568 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1569 	    memwrite, arg);
1570 	return (error);
1571 }
1572 
1573 static int
1574 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1575     struct vm_guest_paging *paging, mem_region_read_t memread,
1576     mem_region_write_t memwrite, void *memarg)
1577 {
1578 	int error;
1579 
1580 	switch (vie->reg & 7) {
1581 	case 0x1:	/* OR */
1582 		error = emulate_or(vm, vcpuid, gpa, vie,
1583 		    memread, memwrite, memarg);
1584 		break;
1585 	case 0x4:	/* AND */
1586 		error = emulate_and(vm, vcpuid, gpa, vie,
1587 		    memread, memwrite, memarg);
1588 		break;
1589 	case 0x7:	/* CMP */
1590 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1591 		    memread, memwrite, memarg);
1592 		break;
1593 	default:
1594 		error = EINVAL;
1595 		break;
1596 	}
1597 
1598 	return (error);
1599 }
1600 
1601 static int
1602 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1603     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1604 {
1605 	uint64_t val, rflags;
1606 	int error, bitmask, bitoff;
1607 
1608 	/*
1609 	 * 0F BA is a Group 8 extended opcode.
1610 	 *
1611 	 * Currently we only emulate the 'Bit Test' instruction, which is
1612 	 * identified by a ModR/M:reg encoding of 100b.
1613 	 */
1614 	if ((vie->reg & 7) != 4)
1615 		return (EINVAL);
1616 
1617 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1618 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1619 
1620 	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1621 	if (error)
1622 		return (error);
1623 
1624 	/*
1625 	 * Intel SDM, Vol 2, Table 3-2:
1626 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1627 	 */
1628 	bitmask = vie->opsize * 8 - 1;
1629 	bitoff = vie->immediate & bitmask;
1630 
1631 	/* Copy the bit into the Carry flag in %rflags */
1632 	if (val & (1UL << bitoff))
1633 		rflags |= PSL_C;
1634 	else
1635 		rflags &= ~PSL_C;
1636 
1637 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1638 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1639 
1640 	return (0);
1641 }
1642 
1643 static int
1644 emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1645     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1646 {
1647 	int error;
1648 	uint64_t buf;
1649 
1650 	switch (vie->reg & 7) {
1651 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1652 		if (vie->mod == 0x3) {
1653 			/*
1654 			 * SFENCE.  Ignore it; the VM exit provides enough
1655 			 * barriers on its own.
1656 			 */
1657 			error = 0;
1658 		} else {
1659 			/*
1660 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1661 			 * rights.
1662 			 */
1663 			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
1664 		}
1665 		break;
1666 	default:
1667 		error = EINVAL;
1668 		break;
1669 	}
1670 
1671 	return (error);
1672 }
1673 
1674 int
1675 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1676     struct vm_guest_paging *paging, mem_region_read_t memread,
1677     mem_region_write_t memwrite, void *memarg)
1678 {
1679 	int error;
1680 
1681 	if (!vie->decoded)
1682 		return (EINVAL);
1683 
1684 	switch (vie->op.op_type) {
1685 	case VIE_OP_TYPE_GROUP1:
1686 		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1687 		    memwrite, memarg);
1688 		break;
1689 	case VIE_OP_TYPE_POP:
1690 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1691 		    memwrite, memarg);
1692 		break;
1693 	case VIE_OP_TYPE_PUSH:
1694 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1695 		    memwrite, memarg);
1696 		break;
1697 	case VIE_OP_TYPE_CMP:
1698 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1699 				    memread, memwrite, memarg);
1700 		break;
1701 	case VIE_OP_TYPE_MOV:
1702 		error = emulate_mov(vm, vcpuid, gpa, vie,
1703 				    memread, memwrite, memarg);
1704 		break;
1705 	case VIE_OP_TYPE_MOVSX:
1706 	case VIE_OP_TYPE_MOVZX:
1707 		error = emulate_movx(vm, vcpuid, gpa, vie,
1708 				     memread, memwrite, memarg);
1709 		break;
1710 	case VIE_OP_TYPE_MOVS:
1711 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1712 		    memwrite, memarg);
1713 		break;
1714 	case VIE_OP_TYPE_STOS:
1715 		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1716 		    memwrite, memarg);
1717 		break;
1718 	case VIE_OP_TYPE_AND:
1719 		error = emulate_and(vm, vcpuid, gpa, vie,
1720 				    memread, memwrite, memarg);
1721 		break;
1722 	case VIE_OP_TYPE_OR:
1723 		error = emulate_or(vm, vcpuid, gpa, vie,
1724 				    memread, memwrite, memarg);
1725 		break;
1726 	case VIE_OP_TYPE_SUB:
1727 		error = emulate_sub(vm, vcpuid, gpa, vie,
1728 				    memread, memwrite, memarg);
1729 		break;
1730 	case VIE_OP_TYPE_BITTEST:
1731 		error = emulate_bittest(vm, vcpuid, gpa, vie,
1732 		    memread, memwrite, memarg);
1733 		break;
1734 	case VIE_OP_TYPE_TWOB_GRP15:
1735 		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
1736 		    memread, memwrite, memarg);
1737 		break;
1738 	case VIE_OP_TYPE_ADD:
1739 		error = emulate_add(vm, vcpuid, gpa, vie, memread,
1740 		    memwrite, memarg);
1741 		break;
1742 	case VIE_OP_TYPE_TEST:
1743 		error = emulate_test(vm, vcpuid, gpa, vie,
1744 		    memread, memwrite, memarg);
1745 		break;
1746 	default:
1747 		error = EINVAL;
1748 		break;
1749 	}
1750 
1751 	return (error);
1752 }
1753 
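/*
 * Alignment checks (#AC) apply only to unaligned data accesses made at
 * CPL 3 with both CR0.AM and RFLAGS.AC set.
 */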
1754 int
1755 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1756 {
1757 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1758 	    ("%s: invalid size %d", __func__, size));
1759 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1760 
1761 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1762 		return (0);
1763 
1764 	return ((gla & (size - 1)) ? 1 : 0);
1765 }
1766 
1767 int
1768 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1769 {
1770 	uint64_t mask;
1771 
1772 	if (cpu_mode != CPU_MODE_64BIT)
1773 		return (0);
1774 
1775 	/*
1776 	 * The value of bit 47 in 'gla' must be replicated in the
1777 	 * most significant 16 bits.
1778 	 */
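	/*
	 * For example, 0x00007fffffffffff and 0xffff800000000000 are
	 * canonical while 0x0000800000000000 is not.
	 */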
1779 	mask = ~((1UL << 48) - 1);
1780 	if (gla & (1UL << 47))
1781 		return ((gla & mask) != mask);
1782 	else
1783 		return ((gla & mask) != 0);
1784 }
1785 
1786 uint64_t
1787 vie_size2mask(int size)
1788 {
1789 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1790 	    ("vie_size2mask: invalid size %d", size));
1791 	return (size2mask[size]);
1792 }
1793 
1794 int
1795 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1796     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1797     int prot, uint64_t *gla)
1798 {
1799 	uint64_t firstoff, low_limit, high_limit, segbase;
1800 	int glasize, type;
1801 
1802 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1803 	    ("%s: invalid segment %d", __func__, seg));
1804 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1805 	    ("%s: invalid operand size %d", __func__, length));
1806 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1807 	    ("%s: invalid prot %#x", __func__, prot));
1808 
1809 	firstoff = offset;
1810 	if (cpu_mode == CPU_MODE_64BIT) {
1811 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1812 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1813 		glasize = 8;
1814 	} else {
1815 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1816 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1817 		glasize = 4;
1818 		/*
1819 		 * If the segment selector is loaded with a NULL selector
1820 		 * then the descriptor is unusable and attempting to use
1821 		 * it results in a #GP(0).
1822 		 */
1823 		if (SEG_DESC_UNUSABLE(desc->access))
1824 			return (-1);
1825 
1826 		/*
1827 		 * The processor generates a #NP exception when a segment
1828 		 * register is loaded with a selector that points to a
1829 		 * descriptor that is not present. If this was the case then
1830 		 * it would have been checked before the VM-exit.
1831 		 */
1832 		KASSERT(SEG_DESC_PRESENT(desc->access),
1833 		    ("segment %d not present: %#x", seg, desc->access));
1834 
1835 		/*
1836 		 * The descriptor type must indicate a code/data segment.
1837 		 */
1838 		type = SEG_DESC_TYPE(desc->access);
1839 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1840 		    "descriptor type %#x", seg, type));
1841 
1842 		if (prot & PROT_READ) {
1843 			/* #GP on a read access to an exec-only code segment */
1844 			if ((type & 0xA) == 0x8)
1845 				return (-1);
1846 		}
1847 
1848 		if (prot & PROT_WRITE) {
1849 			/*
1850 			 * #GP on a write access to a code segment or a
1851 			 * read-only data segment.
1852 			 */
1853 			if (type & 0x8)			/* code segment */
1854 				return (-1);
1855 
1856 			if ((type & 0xA) == 0)		/* read-only data seg */
1857 				return (-1);
1858 		}
1859 
1860 		/*
1861 		 * 'desc->limit' is fully expanded taking granularity into
1862 		 * account.
1863 		 */
1864 		if ((type & 0xC) == 0x4) {
1865 			/* expand-down data segment */
1866 			low_limit = desc->limit + 1;
1867 			high_limit = SEG_DESC_DEF32(desc->access) ?
1868 			    0xffffffff : 0xffff;
1869 		} else {
1870 			/* code segment or expand-up data segment */
1871 			low_limit = 0;
1872 			high_limit = desc->limit;
1873 		}
1874 
1875 		while (length > 0) {
1876 			offset &= vie_size2mask(addrsize);
1877 			if (offset < low_limit || offset > high_limit)
1878 				return (-1);
1879 			offset++;
1880 			length--;
1881 		}
1882 	}
1883 
1884 	/*
1885 	 * In 64-bit mode all segments except %fs and %gs have a segment
1886 	 * base address of 0.
1887 	 */
1888 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1889 	    seg != VM_REG_GUEST_GS) {
1890 		segbase = 0;
1891 	} else {
1892 		segbase = desc->base;
1893 	}
1894 
1895 	/*
1896 	 * Truncate 'firstoff' to the effective address size before adding
1897 	 * it to the segment base.
1898 	 */
1899 	firstoff &= vie_size2mask(addrsize);
1900 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1901 	return (0);
1902 }
1903 
1904 void
1905 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1906 {
1907 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1908 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1909 
1910 	bzero(vie, sizeof(struct vie));
1911 
1912 	vie->base_register = VM_REG_LAST;
1913 	vie->index_register = VM_REG_LAST;
1914 	vie->segment_register = VM_REG_LAST;
1915 
1916 	if (inst_length) {
1917 		bcopy(inst_bytes, vie->inst, inst_length);
1918 		vie->num_valid = inst_length;
1919 	}
1920 }
1921 
1922 #ifdef _KERNEL
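/*
 * Construct the error code for a #PF injected into the guest (PGEX_P:
 * fault on a present page, PGEX_W: write access, PGEX_U: user mode,
 * PGEX_RSV: reserved bit set, PGEX_I: instruction fetch).
 */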
1923 static int
1924 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1925 {
1926 	int error_code = 0;
1927 
1928 	if (pte & PG_V)
1929 		error_code |= PGEX_P;
1930 	if (prot & VM_PROT_WRITE)
1931 		error_code |= PGEX_W;
1932 	if (usermode)
1933 		error_code |= PGEX_U;
1934 	if (rsvd)
1935 		error_code |= PGEX_RSV;
1936 	if (prot & VM_PROT_EXECUTE)
1937 		error_code |= PGEX_I;
1938 
1939 	return (error_code);
1940 }
1941 
1942 static void
1943 ptp_release(void **cookie)
1944 {
1945 	if (*cookie != NULL) {
1946 		vm_gpa_release(*cookie);
1947 		*cookie = NULL;
1948 	}
1949 }
1950 
1951 static void *
1952 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
1953 {
1954 	void *ptr;
1955 
1956 	ptp_release(cookie);
1957 	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
1958 	return (ptr);
1959 }
1960 
1961 static int
1962 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1963     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
1964 {
1965 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1966 	u_int retries;
1967 	uint64_t *ptpbase, ptpphys, pte, pgsize;
1968 	uint32_t *ptpbase32, pte32;
1969 	void *cookie;
1970 
1971 	*guest_fault = 0;
1972 
1973 	usermode = (paging->cpl == 3 ? 1 : 0);
1974 	writable = prot & VM_PROT_WRITE;
1975 	cookie = NULL;
1976 	retval = 0;
1977 	retries = 0;
1978 restart:
1979 	ptpphys = paging->cr3;		/* root of the page tables */
1980 	ptp_release(&cookie);
1981 	if (retries++ > 0)
1982 		maybe_yield();
1983 
1984 	if (vie_canonical_check(paging->cpu_mode, gla)) {
1985 		/*
1986 		 * XXX assuming a non-stack reference; otherwise a stack fault
1987 		 * should be generated.
1988 		 */
1989 		if (!check_only)
1990 			vm_inject_gp(vm, vcpuid);
1991 		goto fault;
1992 	}
1993 
1994 	if (paging->paging_mode == PAGING_MODE_FLAT) {
1995 		*gpa = gla;
1996 		goto done;
1997 	}
1998 
1999 	if (paging->paging_mode == PAGING_MODE_32) {
2000 		nlevels = 2;
2001 		while (--nlevels >= 0) {
2002 			/* Zero out the lower 12 bits. */
2003 			ptpphys &= ~0xfff;
2004 
2005 			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
2006 			    &cookie);
2007 
2008 			if (ptpbase32 == NULL)
2009 				goto error;
2010 
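			/*
			 * Two-level 32-bit paging: the page directory is
			 * indexed by bits 31:22 of the linear address
			 * (nlevels = 1, ptpshift = 22) and the page table
			 * by bits 21:12 (nlevels = 0, ptpshift = 12).
			 */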
2011 			ptpshift = PAGE_SHIFT + nlevels * 10;
2012 			ptpindex = (gla >> ptpshift) & 0x3FF;
2013 			pgsize = 1UL << ptpshift;
2014 
2015 			pte32 = ptpbase32[ptpindex];
2016 
2017 			if ((pte32 & PG_V) == 0 ||
2018 			    (usermode && (pte32 & PG_U) == 0) ||
2019 			    (writable && (pte32 & PG_RW) == 0)) {
2020 				if (!check_only) {
2021 					pfcode = pf_error_code(usermode, prot, 0,
2022 					    pte32);
2023 					vm_inject_pf(vm, vcpuid, pfcode, gla);
2024 				}
2025 				goto fault;
2026 			}
2027 
2028 			/*
2029 			 * Emulate the x86 MMU's management of the accessed
2030 			 * and dirty flags. While the accessed flag is set
2031 			 * at every level of the page table, the dirty flag
2032 			 * is only set at the last level providing the guest
2033 			 * physical address.
2034 			 */
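			/*
			 * A failed compare-and-set means the PTE was changed
			 * concurrently (e.g. by the guest on another vcpu),
			 * in which case the walk restarts from CR3.
			 */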
2035 			if (!check_only && (pte32 & PG_A) == 0) {
2036 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
2037 				    pte32, pte32 | PG_A) == 0) {
2038 					goto restart;
2039 				}
2040 			}
2041 
2042 			/* XXX must be ignored if CR4.PSE=0 */
2043 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
2044 				break;
2045 
2046 			ptpphys = pte32;
2047 		}
2048 
2049 		/* Set the dirty bit in the page table entry if necessary */
2050 		if (!check_only && writable && (pte32 & PG_M) == 0) {
2051 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
2052 			    pte32, pte32 | PG_M) == 0) {
2053 				goto restart;
2054 			}
2055 		}
2056 
2057 		/* Zero out the lower 'ptpshift' bits */
2058 		pte32 >>= ptpshift; pte32 <<= ptpshift;
2059 		*gpa = pte32 | (gla & (pgsize - 1));
2060 		goto done;
2061 	}
2062 
2063 	if (paging->paging_mode == PAGING_MODE_PAE) {
2064 		/* Zero out the lower 5 bits and the upper 32 bits */
2065 		ptpphys &= 0xffffffe0UL;
2066 
2067 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
2068 		    &cookie);
2069 		if (ptpbase == NULL)
2070 			goto error;
2071 
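		/*
		 * In PAE mode the 4-entry page-directory-pointer table maps
		 * 1GB per entry, so bits 31:30 of the linear address select
		 * the PDPTE.
		 */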
2072 		ptpindex = (gla >> 30) & 0x3;
2073 
2074 		pte = ptpbase[ptpindex];
2075 
2076 		if ((pte & PG_V) == 0) {
2077 			if (!check_only) {
2078 				pfcode = pf_error_code(usermode, prot, 0, pte);
2079 				vm_inject_pf(vm, vcpuid, pfcode, gla);
2080 			}
2081 			goto fault;
2082 		}
2083 
2084 		ptpphys = pte;
2085 
2086 		nlevels = 2;
2087 	} else
2088 		nlevels = 4;
2089 	while (--nlevels >= 0) {
2090 		/* Zero out the lower 12 bits and the upper 12 bits */
2091 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
2092 
2093 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
2094 		if (ptpbase == NULL)
2095 			goto error;
2096 
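		/*
		 * Four-level paging uses 9 index bits per level: PML4
		 * bits 47:39, PDPT bits 38:30, PD bits 29:21 and PT
		 * bits 20:12.  The PAE walk enters this loop with
		 * nlevels = 2 and therefore skips the PML4 step.
		 */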
2097 		ptpshift = PAGE_SHIFT + nlevels * 9;
2098 		ptpindex = (gla >> ptpshift) & 0x1FF;
2099 		pgsize = 1UL << ptpshift;
2100 
2101 		pte = ptpbase[ptpindex];
2102 
2103 		if ((pte & PG_V) == 0 ||
2104 		    (usermode && (pte & PG_U) == 0) ||
2105 		    (writable && (pte & PG_RW) == 0)) {
2106 			if (!check_only) {
2107 				pfcode = pf_error_code(usermode, prot, 0, pte);
2108 				vm_inject_pf(vm, vcpuid, pfcode, gla);
2109 			}
2110 			goto fault;
2111 		}
2112 
2113 		/* Set the accessed bit in the page table entry */
2114 		if (!check_only && (pte & PG_A) == 0) {
2115 			if (atomic_cmpset_64(&ptpbase[ptpindex],
2116 			    pte, pte | PG_A) == 0) {
2117 				goto restart;
2118 			}
2119 		}
2120 
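		/*
		 * PG_PS is valid only at the PDPT (1GB pages) and PD
		 * (2MB pages) levels.  At higher levels the bit is
		 * reserved, which is reported as a #PF with PGEX_RSV.
		 */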
2121 		if (nlevels > 0 && (pte & PG_PS) != 0) {
2122 			if (pgsize > 1 * GB) {
2123 				if (!check_only) {
2124 					pfcode = pf_error_code(usermode, prot, 1,
2125 					    pte);
2126 					vm_inject_pf(vm, vcpuid, pfcode, gla);
2127 				}
2128 				goto fault;
2129 			}
2130 			break;
2131 		}
2132 
2133 		ptpphys = pte;
2134 	}
2135 
2136 	/* Set the dirty bit in the page table entry if necessary */
2137 	if (!check_only && writable && (pte & PG_M) == 0) {
2138 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2139 			goto restart;
2140 	}
2141 
2142 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2143 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2144 	*gpa = pte | (gla & (pgsize - 1));
2145 done:
2146 	ptp_release(&cookie);
2147 	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2148 	    __func__, retval));
2149 	return (retval);
2150 error:
2151 	retval = EFAULT;
2152 	goto done;
2153 fault:
2154 	*guest_fault = 1;
2155 	goto done;
2156 }
2157 
2158 int
2159 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2160     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2161 {
2162 
2163 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
2164 	    false));
2165 }
2166 
2167 int
2168 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2169     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2170 {
2171 
2172 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
2173 	    true));
2174 }
2175 
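/*
 * Fetch up to 'inst_length' bytes of the instruction starting at guest
 * linear address 'rip' into 'vie->inst', translating through the guest
 * page tables.  The fetch may span two pages, hence the two-entry
 * copyinfo array.
 */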
2176 int
2177 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2178     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2179 {
2180 	struct vm_copyinfo copyinfo[2];
2181 	int error, prot;
2182 
2183 	if (inst_length > VIE_INST_SIZE)
2184 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
2185 
2186 	prot = PROT_READ | PROT_EXEC;
2187 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
2188 	    copyinfo, nitems(copyinfo), faultptr);
2189 	if (error || *faultptr)
2190 		return (error);
2191 
2192 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
2193 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2194 	vie->num_valid = inst_length;
2195 	return (0);
2196 }
2197 #endif	/* _KERNEL */
2198 
2199 static int
2200 vie_peek(struct vie *vie, uint8_t *x)
2201 {
2202 
2203 	if (vie->num_processed < vie->num_valid) {
2204 		*x = vie->inst[vie->num_processed];
2205 		return (0);
2206 	} else
2207 		return (-1);
2208 }
2209 
2210 static void
2211 vie_advance(struct vie *vie)
2212 {
2213 
2214 	vie->num_processed++;
2215 }
2216 
2217 static bool
2218 segment_override(uint8_t x, int *seg)
2219 {
2220 
2221 	switch (x) {
2222 	case 0x2E:
2223 		*seg = VM_REG_GUEST_CS;
2224 		break;
2225 	case 0x36:
2226 		*seg = VM_REG_GUEST_SS;
2227 		break;
2228 	case 0x3E:
2229 		*seg = VM_REG_GUEST_DS;
2230 		break;
2231 	case 0x26:
2232 		*seg = VM_REG_GUEST_ES;
2233 		break;
2234 	case 0x64:
2235 		*seg = VM_REG_GUEST_FS;
2236 		break;
2237 	case 0x65:
2238 		*seg = VM_REG_GUEST_GS;
2239 		break;
2240 	default:
2241 		return (false);
2242 	}
2243 	return (true);
2244 }
2245 
2246 static int
2247 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2248 {
2249 	uint8_t x;
2250 
2251 	while (1) {
2252 		if (vie_peek(vie, &x))
2253 			return (-1);
2254 
2255 		if (x == 0x66)
2256 			vie->opsize_override = 1;
2257 		else if (x == 0x67)
2258 			vie->addrsize_override = 1;
2259 		else if (x == 0xF3)
2260 			vie->repz_present = 1;
2261 		else if (x == 0xF2)
2262 			vie->repnz_present = 1;
2263 		else if (segment_override(x, &vie->segment_register))
2264 			vie->segment_override = 1;
2265 		else
2266 			break;
2267 
2268 		vie_advance(vie);
2269 	}
2270 
2271 	/*
2272 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2273 	 * - Only one REX prefix is allowed per instruction.
2274 	 * - The REX prefix must immediately precede the opcode byte or the
2275 	 *   escape opcode byte.
2276 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2277 	 *   the mandatory prefix must come before the REX prefix.
2278 	 */
2279 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2280 		vie->rex_present = 1;
2281 		vie->rex_w = x & 0x8 ? 1 : 0;
2282 		vie->rex_r = x & 0x4 ? 1 : 0;
2283 		vie->rex_x = x & 0x2 ? 1 : 0;
2284 		vie->rex_b = x & 0x1 ? 1 : 0;
2285 		vie_advance(vie);
2286 	}
2287 
2288 	/*
2289 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2290 	 */
2291 	if (cpu_mode == CPU_MODE_64BIT) {
2292 		/*
2293 		 * Default address size is 64-bits and default operand size
2294 		 * is 32-bits.
2295 		 */
2296 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2297 		if (vie->rex_w)
2298 			vie->opsize = 8;
2299 		else if (vie->opsize_override)
2300 			vie->opsize = 2;
2301 		else
2302 			vie->opsize = 4;
2303 	} else if (cs_d) {
2304 		/* Default address and operand sizes are 32-bits */
2305 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2306 		vie->opsize = vie->opsize_override ? 2 : 4;
2307 	} else {
2308 		/* Default address and operand sizes are 16-bits */
2309 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2310 		vie->opsize = vie->opsize_override ? 4 : 2;
2311 	}
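	/*
	 * For example, "66 89 08" executed in 64-bit mode decodes with
	 * addrsize = 8 and opsize = 2 (mov %cx,(%rax)); REX.W takes
	 * precedence over 0x66, so "66 48 89 08" still has opsize = 8.
	 */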
2312 	return (0);
2313 }
2314 
2315 static int
2316 decode_two_byte_opcode(struct vie *vie)
2317 {
2318 	uint8_t x;
2319 
2320 	if (vie_peek(vie, &x))
2321 		return (-1);
2322 
2323 	vie->op = two_byte_opcodes[x];
2324 
2325 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2326 		return (-1);
2327 
2328 	vie_advance(vie);
2329 	return (0);
2330 }
2331 
2332 static int
2333 decode_opcode(struct vie *vie)
2334 {
2335 	uint8_t x;
2336 
2337 	if (vie_peek(vie, &x))
2338 		return (-1);
2339 
2340 	vie->op = one_byte_opcodes[x];
2341 
2342 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2343 		return (-1);
2344 
2345 	vie_advance(vie);
2346 
2347 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2348 		return (decode_two_byte_opcode(vie));
2349 
2350 	return (0);
2351 }
2352 
2353 static int
2354 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2355 {
2356 	uint8_t x;
2357 
2358 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2359 		return (0);
2360 
2361 	if (cpu_mode == CPU_MODE_REAL)
2362 		return (-1);
2363 
2364 	if (vie_peek(vie, &x))
2365 		return (-1);
2366 
2367 	vie->mod = (x >> 6) & 0x3;
2368 	vie->rm =  (x >> 0) & 0x7;
2369 	vie->reg = (x >> 3) & 0x7;
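	/*
	 * For example, the ModRM byte 0x51 (01 010 001b) decodes to
	 * mod = 1 (indirect + disp8), reg = 2 and r/m = 1.
	 */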
2370 
2371 	/*
2372 	 * A direct addressing mode makes no sense in the context of an EPT
2373 	 * fault. There has to be a memory access involved to cause the
2374 	 * EPT fault.
2375 	 */
2376 	if (vie->mod == VIE_MOD_DIRECT)
2377 		return (-1);
2378 
2379 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2380 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2381 		/*
2382 		 * Table 2-5: Special Cases of REX Encodings
2383 		 *
2384 		 * mod=0, r/m=5 is used in the compatibility mode to
2385 		 * indicate a disp32 without a base register.
2386 		 *
2387 		 * mod!=3, r/m=4 is used in the compatibility mode to
2388 		 * indicate that the SIB byte is present.
2389 		 *
2390 		 * The 'b' bit in the REX prefix is ignored in
2391 		 * this case.
2392 		 */
2393 	} else {
2394 		vie->rm |= (vie->rex_b << 3);
2395 	}
2396 
2397 	vie->reg |= (vie->rex_r << 3);
2398 
2399 	/* SIB */
2400 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2401 		goto done;
2402 
2403 	vie->base_register = gpr_map[vie->rm];
2404 
2405 	switch (vie->mod) {
2406 	case VIE_MOD_INDIRECT_DISP8:
2407 		vie->disp_bytes = 1;
2408 		break;
2409 	case VIE_MOD_INDIRECT_DISP32:
2410 		vie->disp_bytes = 4;
2411 		break;
2412 	case VIE_MOD_INDIRECT:
2413 		if (vie->rm == VIE_RM_DISP32) {
2414 			vie->disp_bytes = 4;
2415 			/*
2416 			 * Table 2-7. RIP-Relative Addressing
2417 			 *
2418 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2419 			 * whereas in compatibility mode it just implies disp32.
2420 			 */
2421 
2422 			if (cpu_mode == CPU_MODE_64BIT)
2423 				vie->base_register = VM_REG_GUEST_RIP;
2424 			else
2425 				vie->base_register = VM_REG_LAST;
2426 		}
2427 		break;
2428 	}
2429 
2430 done:
2431 	vie_advance(vie);
2432 
2433 	return (0);
2434 }
2435 
2436 static int
2437 decode_sib(struct vie *vie)
2438 {
2439 	uint8_t x;
2440 
2441 	/* Proceed only if SIB byte is present */
2442 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2443 		return (0);
2444 
2445 	if (vie_peek(vie, &x))
2446 		return (-1);
2447 
2448 	/* De-construct the SIB byte */
2449 	vie->ss = (x >> 6) & 0x3;
2450 	vie->index = (x >> 3) & 0x7;
2451 	vie->base = (x >> 0) & 0x7;
2452 
2453 	/* Apply the REX prefix modifiers */
2454 	vie->index |= vie->rex_x << 3;
2455 	vie->base |= vie->rex_b << 3;
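	/*
	 * For example, the SIB byte 0x98 (10 011 000b) with no REX
	 * prefix decodes to scale = 4 (ss = 2), index = %rbx and
	 * base = %rax.
	 */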
2456 
2457 	switch (vie->mod) {
2458 	case VIE_MOD_INDIRECT_DISP8:
2459 		vie->disp_bytes = 1;
2460 		break;
2461 	case VIE_MOD_INDIRECT_DISP32:
2462 		vie->disp_bytes = 4;
2463 		break;
2464 	}
2465 
2466 	if (vie->mod == VIE_MOD_INDIRECT &&
2467 	    (vie->base == 5 || vie->base == 13)) {
2468 		/*
2469 		 * Special case: the base register is unused when mod = 0
2470 		 * and base = %rbp or %r13.
2471 		 *
2472 		 * Documented in:
2473 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2474 		 * Table 2-5: Special Cases of REX Encodings
2475 		 */
2476 		vie->disp_bytes = 4;
2477 	} else {
2478 		vie->base_register = gpr_map[vie->base];
2479 	}
2480 
2481 	/*
2482 	 * All encodings of 'index' are valid except for %rsp (4).
2483 	 *
2484 	 * Documented in:
2485 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2486 	 * Table 2-5: Special Cases of REX Encodings
2487 	 */
2488 	if (vie->index != 4)
2489 		vie->index_register = gpr_map[vie->index];
2490 
2491 	/* 'scale' makes sense only in the context of an index register */
2492 	if (vie->index_register < VM_REG_LAST)
2493 		vie->scale = 1 << vie->ss;
2494 
2495 	vie_advance(vie);
2496 
2497 	return (0);
2498 }
2499 
2500 static int
2501 decode_displacement(struct vie *vie)
2502 {
2503 	int n, i;
2504 	uint8_t x;
2505 
2506 	union {
2507 		char	buf[4];
2508 		int8_t	signed8;
2509 		int32_t	signed32;
2510 	} u;
2511 
2512 	if ((n = vie->disp_bytes) == 0)
2513 		return (0);
2514 
2515 	if (n != 1 && n != 4)
2516 		panic("decode_displacement: invalid disp_bytes %d", n);
2517 
2518 	for (i = 0; i < n; i++) {
2519 		if (vie_peek(vie, &x))
2520 			return (-1);
2521 
2522 		u.buf[i] = x;
2523 		vie_advance(vie);
2524 	}
2525 
2526 	if (n == 1)
2527 		vie->displacement = u.signed8;		/* sign-extended */
2528 	else
2529 		vie->displacement = u.signed32;		/* sign-extended */
2530 
2531 	return (0);
2532 }
2533 
2534 static int
2535 decode_immediate(struct vie *vie)
2536 {
2537 	int i, n;
2538 	uint8_t x;
2539 	union {
2540 		char	buf[4];
2541 		int8_t	signed8;
2542 		int16_t	signed16;
2543 		int32_t	signed32;
2544 	} u;
2545 
2546 	/* Figure out immediate operand size (if any) */
2547 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2548 		/*
2549 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2550 		 * In 64-bit mode the typical size of immediate operands
2551 		 * remains 32-bits. When the operand size is 64-bits, the
2552 		 * processor sign-extends all immediates to 64-bits prior
2553 		 * to their use.
2554 		 */
2555 		if (vie->opsize == 4 || vie->opsize == 8)
2556 			vie->imm_bytes = 4;
2557 		else
2558 			vie->imm_bytes = 2;
2559 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2560 		vie->imm_bytes = 1;
2561 	}
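	/*
	 * For example, group 1 opcode 0x81 carries a 2- or 4-byte
	 * immediate depending on the operand size, whereas 0x83 always
	 * carries a sign-extended 1-byte immediate.
	 */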
2562 
2563 	if ((n = vie->imm_bytes) == 0)
2564 		return (0);
2565 
2566 	KASSERT(n == 1 || n == 2 || n == 4,
2567 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2568 
2569 	for (i = 0; i < n; i++) {
2570 		if (vie_peek(vie, &x))
2571 			return (-1);
2572 
2573 		u.buf[i] = x;
2574 		vie_advance(vie);
2575 	}
2576 
2577 	/* sign-extend the immediate value before use */
2578 	if (n == 1)
2579 		vie->immediate = u.signed8;
2580 	else if (n == 2)
2581 		vie->immediate = u.signed16;
2582 	else
2583 		vie->immediate = u.signed32;
2584 
2585 	return (0);
2586 }
2587 
2588 static int
2589 decode_moffset(struct vie *vie)
2590 {
2591 	int i, n;
2592 	uint8_t x;
2593 	union {
2594 		char	buf[8];
2595 		uint64_t u64;
2596 	} u;
2597 
2598 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2599 		return (0);
2600 
2601 	/*
2602 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2603 	 * The memory offset size follows the address-size of the instruction.
2604 	 */
2605 	n = vie->addrsize;
2606 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
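	/*
	 * For example, "a1 88 77 66 55 44 33 22 11" in 64-bit mode loads
	 * %eax from absolute address 0x1122334455667788; the 8-byte
	 * moffset is stored in 'displacement'.
	 */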
2607 
2608 	u.u64 = 0;
2609 	for (i = 0; i < n; i++) {
2610 		if (vie_peek(vie, &x))
2611 			return (-1);
2612 
2613 		u.buf[i] = x;
2614 		vie_advance(vie);
2615 	}
2616 	vie->displacement = u.u64;
2617 	return (0);
2618 }
2619 
2620 #ifdef _KERNEL
2621 /*
2622  * Verify that the 'guest linear address' provided as collateral of the nested
2623  * page table fault matches our instruction decoding.
2624  */
2625 static int
2626 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
2627     enum vm_cpu_mode cpu_mode)
2628 {
2629 	int error;
2630 	uint64_t base, segbase, idx, gla2;
2631 	enum vm_reg_name seg;
2632 	struct seg_desc desc;
2633 
2634 	/* Skip 'gla' verification */
2635 	if (gla == VIE_INVALID_GLA)
2636 		return (0);
2637 
2638 	base = 0;
2639 	if (vie->base_register != VM_REG_LAST) {
2640 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2641 		if (error) {
2642 			printf("verify_gla: error %d getting base reg %d\n",
2643 				error, vie->base_register);
2644 			return (-1);
2645 		}
2646 
2647 		/*
2648 		 * RIP-relative addressing starts from the following instruction;
2649 		 * 'num_processed' is the full instruction length at this point.
2650 		 */
2651 		if (vie->base_register == VM_REG_GUEST_RIP)
2652 			base += vie->num_processed;
2653 	}
2654 
2655 	idx = 0;
2656 	if (vie->index_register != VM_REG_LAST) {
2657 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2658 		if (error) {
2659 			printf("verify_gla: error %d getting index reg %d\n",
2660 				error, vie->index_register);
2661 			return (-1);
2662 		}
2663 	}
2664 
2665 	/*
2666 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2667 	 *
2668 	 * In 64-bit mode, segmentation is generally (but not
2669 	 * completely) disabled.  The exceptions are the FS and GS
2670 	 * segments.
2671 	 *
2672 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2673 	 * as the base, the SS segment is the default segment.  For
2674 	 * other data references, except those relative to the stack or
2675 	 * to a string destination, the DS segment is the default.  These
2676 	 * defaults can be overridden to allow other segments to be accessed.
2677 	 */
2678 	if (vie->segment_override)
2679 		seg = vie->segment_register;
2680 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2681 	    vie->base_register == VM_REG_GUEST_RBP)
2682 		seg = VM_REG_GUEST_SS;
2683 	else
2684 		seg = VM_REG_GUEST_DS;
2685 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2686 	    seg != VM_REG_GUEST_GS) {
2687 		segbase = 0;
2688 	} else {
2689 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
2690 		if (error) {
2691 			printf("verify_gla: error %d getting segment"
2692 			       " descriptor %d\n", error,
2693 			       seg);
2694 			return (-1);
2695 		}
2696 		segbase = desc.base;
2697 	}
2698 
2699 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2700 	gla2 &= size2mask[vie->addrsize];
2701 	if (gla != gla2) {
2702 		printf("verify_gla mismatch: segbase(0x%0lx)"
2703 		       ", base(0x%0lx), scale(%d), index(0x%0lx), "
2704 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2705 		       segbase, base, vie->scale, idx, vie->displacement,
2706 		       gla, gla2);
2707 		return (-1);
2708 	}
2709 
2710 	return (0);
2711 }
2712 #endif	/* _KERNEL */
2713 
2714 int
2715 #ifdef _KERNEL
2716 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2717 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2718 #else
2719 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2720 #endif
2721 {
2722 
2723 	if (decode_prefixes(vie, cpu_mode, cs_d))
2724 		return (-1);
2725 
2726 	if (decode_opcode(vie))
2727 		return (-1);
2728 
2729 	if (decode_modrm(vie, cpu_mode))
2730 		return (-1);
2731 
2732 	if (decode_sib(vie))
2733 		return (-1);
2734 
2735 	if (decode_displacement(vie))
2736 		return (-1);
2737 
2738 	if (decode_immediate(vie))
2739 		return (-1);
2740 
2741 	if (decode_moffset(vie))
2742 		return (-1);
2743 
2744 #ifdef _KERNEL
2745 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2746 		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
2747 			return (-1);
2748 	}
2749 #endif
2750 
2751 	vie->decoded = 1;	/* success */
2752 
2753 	return (0);
2754 }
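
/*
 * Illustrative sketch (not part of the emulator): driving the decoder from
 * a userspace (!_KERNEL) build, where the kernel-only gla verification is
 * skipped.  This assumes opcode 0x89 (MOV r/m, r) is handled by the
 * one-byte opcode table.
 *
 *	struct vie vie;
 *	const char bytes[] = { 0x89, 0x08 };	// mov %ecx,(%rax)
 *
 *	vie_init(&vie, bytes, sizeof(bytes));
 *	if (vmm_decode_instruction(CPU_MODE_64BIT, 0, &vie) == 0) {
 *		// vie.base_register == VM_REG_GUEST_RAX, vie.reg == 1
 *		// (%ecx) and vie.opsize == 4 now describe the access.
 *	}
 */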
2755