xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 6f63e88c0166ed3e5f2805a9e667c7d24d304cf1)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef _KERNEL
36 #include <sys/param.h>
37 #include <sys/pcpu.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/vmparam.h>
45 #include <machine/vmm.h>
46 #else	/* !_KERNEL */
47 #include <sys/types.h>
48 #include <sys/errno.h>
49 #include <sys/_iovec.h>
50 
51 #include <machine/vmm.h>
52 
53 #include <err.h>
54 #include <assert.h>
55 #include <stdbool.h>
56 #include <stdio.h>
57 #include <strings.h>
58 #include <vmmapi.h>
59 #define	KASSERT(exp,msg)	assert((exp))
60 #define	panic(...)		errx(4, __VA_ARGS__)
61 #endif	/* _KERNEL */
62 
63 #include <machine/vmm_instruction_emul.h>
64 #include <x86/psl.h>
65 #include <x86/specialreg.h>
66 
67 /* struct vie_op.op_type */
68 enum {
69 	VIE_OP_TYPE_NONE = 0,
70 	VIE_OP_TYPE_MOV,
71 	VIE_OP_TYPE_MOVSX,
72 	VIE_OP_TYPE_MOVZX,
73 	VIE_OP_TYPE_AND,
74 	VIE_OP_TYPE_OR,
75 	VIE_OP_TYPE_SUB,
76 	VIE_OP_TYPE_TWO_BYTE,
77 	VIE_OP_TYPE_PUSH,
78 	VIE_OP_TYPE_CMP,
79 	VIE_OP_TYPE_POP,
80 	VIE_OP_TYPE_MOVS,
81 	VIE_OP_TYPE_GROUP1,
82 	VIE_OP_TYPE_STOS,
83 	VIE_OP_TYPE_BITTEST,
84 	VIE_OP_TYPE_TWOB_GRP15,
85 	VIE_OP_TYPE_ADD,
86 	VIE_OP_TYPE_TEST,
87 	VIE_OP_TYPE_BEXTR,
88 	VIE_OP_TYPE_LAST
89 };
90 
91 /* struct vie_op.op_flags */
92 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
93 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
94 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
95 #define	VIE_OP_F_NO_MODRM	(1 << 3)
96 #define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
97 
98 static const struct vie_op three_byte_opcodes_0f38[256] = {
99 	[0xF7] = {
100 		.op_byte = 0xF7,
101 		.op_type = VIE_OP_TYPE_BEXTR,
102 	},
103 };
104 
105 static const struct vie_op two_byte_opcodes[256] = {
106 	[0xAE] = {
107 		.op_byte = 0xAE,
108 		.op_type = VIE_OP_TYPE_TWOB_GRP15,
109 	},
110 	[0xB6] = {
111 		.op_byte = 0xB6,
112 		.op_type = VIE_OP_TYPE_MOVZX,
113 	},
114 	[0xB7] = {
115 		.op_byte = 0xB7,
116 		.op_type = VIE_OP_TYPE_MOVZX,
117 	},
118 	[0xBA] = {
119 		.op_byte = 0xBA,
120 		.op_type = VIE_OP_TYPE_BITTEST,
121 		.op_flags = VIE_OP_F_IMM8,
122 	},
123 	[0xBE] = {
124 		.op_byte = 0xBE,
125 		.op_type = VIE_OP_TYPE_MOVSX,
126 	},
127 };
128 
129 static const struct vie_op one_byte_opcodes[256] = {
130 	[0x03] = {
131 		.op_byte = 0x03,
132 		.op_type = VIE_OP_TYPE_ADD,
133 	},
134 	[0x0F] = {
135 		.op_byte = 0x0F,
136 		.op_type = VIE_OP_TYPE_TWO_BYTE
137 	},
138 	[0x0B] = {
139 		.op_byte = 0x0B,
140 		.op_type = VIE_OP_TYPE_OR,
141 	},
142 	[0x2B] = {
143 		.op_byte = 0x2B,
144 		.op_type = VIE_OP_TYPE_SUB,
145 	},
146 	[0x39] = {
147 		.op_byte = 0x39,
148 		.op_type = VIE_OP_TYPE_CMP,
149 	},
150 	[0x3B] = {
151 		.op_byte = 0x3B,
152 		.op_type = VIE_OP_TYPE_CMP,
153 	},
154 	[0x88] = {
155 		.op_byte = 0x88,
156 		.op_type = VIE_OP_TYPE_MOV,
157 	},
158 	[0x89] = {
159 		.op_byte = 0x89,
160 		.op_type = VIE_OP_TYPE_MOV,
161 	},
162 	[0x8A] = {
163 		.op_byte = 0x8A,
164 		.op_type = VIE_OP_TYPE_MOV,
165 	},
166 	[0x8B] = {
167 		.op_byte = 0x8B,
168 		.op_type = VIE_OP_TYPE_MOV,
169 	},
170 	[0xA1] = {
171 		.op_byte = 0xA1,
172 		.op_type = VIE_OP_TYPE_MOV,
173 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
174 	},
175 	[0xA3] = {
176 		.op_byte = 0xA3,
177 		.op_type = VIE_OP_TYPE_MOV,
178 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
179 	},
180 	[0xA4] = {
181 		.op_byte = 0xA4,
182 		.op_type = VIE_OP_TYPE_MOVS,
183 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
184 	},
185 	[0xA5] = {
186 		.op_byte = 0xA5,
187 		.op_type = VIE_OP_TYPE_MOVS,
188 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
189 	},
190 	[0xAA] = {
191 		.op_byte = 0xAA,
192 		.op_type = VIE_OP_TYPE_STOS,
193 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
194 	},
195 	[0xAB] = {
196 		.op_byte = 0xAB,
197 		.op_type = VIE_OP_TYPE_STOS,
198 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
199 	},
200 	[0xC6] = {
201 		/* XXX Group 11 extended opcode - not just MOV */
202 		.op_byte = 0xC6,
203 		.op_type = VIE_OP_TYPE_MOV,
204 		.op_flags = VIE_OP_F_IMM8,
205 	},
206 	[0xC7] = {
207 		.op_byte = 0xC7,
208 		.op_type = VIE_OP_TYPE_MOV,
209 		.op_flags = VIE_OP_F_IMM,
210 	},
211 	[0x23] = {
212 		.op_byte = 0x23,
213 		.op_type = VIE_OP_TYPE_AND,
214 	},
215 	[0x80] = {
216 		/* Group 1 extended opcode */
217 		.op_byte = 0x80,
218 		.op_type = VIE_OP_TYPE_GROUP1,
219 		.op_flags = VIE_OP_F_IMM8,
220 	},
221 	[0x81] = {
222 		/* Group 1 extended opcode */
223 		.op_byte = 0x81,
224 		.op_type = VIE_OP_TYPE_GROUP1,
225 		.op_flags = VIE_OP_F_IMM,
226 	},
227 	[0x83] = {
228 		/* Group 1 extended opcode */
229 		.op_byte = 0x83,
230 		.op_type = VIE_OP_TYPE_GROUP1,
231 		.op_flags = VIE_OP_F_IMM8,
232 	},
233 	[0x8F] = {
234 		/* XXX Group 1A extended opcode - not just POP */
235 		.op_byte = 0x8F,
236 		.op_type = VIE_OP_TYPE_POP,
237 	},
238 	[0xF7] = {
239 		/* XXX Group 3 extended opcode - not just TEST */
240 		.op_byte = 0xF7,
241 		.op_type = VIE_OP_TYPE_TEST,
242 		.op_flags = VIE_OP_F_IMM,
243 	},
244 	[0xFF] = {
245 		/* XXX Group 5 extended opcode - not just PUSH */
246 		.op_byte = 0xFF,
247 		.op_type = VIE_OP_TYPE_PUSH,
248 	}
249 };
250 
251 /* struct vie.mod */
252 #define	VIE_MOD_INDIRECT		0
253 #define	VIE_MOD_INDIRECT_DISP8		1
254 #define	VIE_MOD_INDIRECT_DISP32		2
255 #define	VIE_MOD_DIRECT			3
256 
257 /* struct vie.rm */
258 #define	VIE_RM_SIB			4
259 #define	VIE_RM_DISP32			5
260 
261 #define	GB				(1024 * 1024 * 1024)
262 
263 static enum vm_reg_name gpr_map[16] = {
264 	VM_REG_GUEST_RAX,
265 	VM_REG_GUEST_RCX,
266 	VM_REG_GUEST_RDX,
267 	VM_REG_GUEST_RBX,
268 	VM_REG_GUEST_RSP,
269 	VM_REG_GUEST_RBP,
270 	VM_REG_GUEST_RSI,
271 	VM_REG_GUEST_RDI,
272 	VM_REG_GUEST_R8,
273 	VM_REG_GUEST_R9,
274 	VM_REG_GUEST_R10,
275 	VM_REG_GUEST_R11,
276 	VM_REG_GUEST_R12,
277 	VM_REG_GUEST_R13,
278 	VM_REG_GUEST_R14,
279 	VM_REG_GUEST_R15
280 };
281 
282 static uint64_t size2mask[] = {
283 	[1] = 0xff,
284 	[2] = 0xffff,
285 	[4] = 0xffffffff,
286 	[8] = 0xffffffffffffffff,
287 };
288 
289 static int
290 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
291 {
292 	int error;
293 
294 	error = vm_get_register(vm, vcpuid, reg, rval);
295 
296 	return (error);
297 }
298 
299 static void
300 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
301 {
302 	*lhbr = 0;
303 	*reg = gpr_map[vie->reg];
304 
305 	/*
306 	 * 64-bit mode imposes limitations on accessing legacy high byte
307 	 * registers (lhbr).
308 	 *
309 	 * The legacy high-byte registers cannot be addressed if the REX
310 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
311 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
312 	 *
313 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
314 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
315 	 * %ah, %ch, %dh and %bh respectively.
316 	 */
317 	if (!vie->rex_present) {
318 		if (vie->reg & 0x4) {
319 			*lhbr = 1;
320 			*reg = gpr_map[vie->reg & 0x3];
321 		}
322 	}
323 }
324 
325 static int
326 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
327 {
328 	uint64_t val;
329 	int error, lhbr;
330 	enum vm_reg_name reg;
331 
332 	vie_calc_bytereg(vie, &reg, &lhbr);
333 	error = vm_get_register(vm, vcpuid, reg, &val);
334 
335 	/*
336 	 * To obtain the value of a legacy high byte register shift the
337 	 * base register right by 8 bits (%ah = %rax >> 8).
338 	 */
339 	if (lhbr)
340 		*rval = val >> 8;
341 	else
342 		*rval = val;
343 	return (error);
344 }
345 
346 static int
347 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
348 {
349 	uint64_t origval, val, mask;
350 	int error, lhbr;
351 	enum vm_reg_name reg;
352 
353 	vie_calc_bytereg(vie, &reg, &lhbr);
354 	error = vm_get_register(vm, vcpuid, reg, &origval);
355 	if (error == 0) {
356 		val = byte;
357 		mask = 0xff;
358 		if (lhbr) {
359 			/*
360 			 * Shift left by 8 to store 'byte' in a legacy high
361 			 * byte register.
362 			 */
363 			val <<= 8;
364 			mask <<= 8;
365 		}
366 		val |= origval & ~mask;
367 		error = vm_set_register(vm, vcpuid, reg, val);
368 	}
369 	return (error);
370 }
371 
372 int
373 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
374 		    uint64_t val, int size)
375 {
376 	int error;
377 	uint64_t origval;
378 
379 	switch (size) {
380 	case 1:
381 	case 2:
382 		error = vie_read_register(vm, vcpuid, reg, &origval);
383 		if (error)
384 			return (error);
385 		val &= size2mask[size];
386 		val |= origval & ~size2mask[size];
387 		break;
388 	case 4:
389 		val &= 0xffffffffUL;
390 		break;
391 	case 8:
392 		break;
393 	default:
394 		return (EINVAL);
395 	}
396 
397 	error = vm_set_register(vm, vcpuid, reg, val);
398 	return (error);
399 }
400 
401 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
402 
403 /*
404  * Return the status flags that would result from doing (x - y).
405  */
406 #define	GETCC(sz)							\
407 static u_long								\
408 getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
409 {									\
410 	u_long rflags;							\
411 									\
412 	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
413 	    "=r" (rflags), "+r" (x) : "m" (y));				\
414 	return (rflags);						\
415 } struct __hack
416 
417 GETCC(8);
418 GETCC(16);
419 GETCC(32);
420 GETCC(64);
421 
422 static u_long
423 getcc(int opsize, uint64_t x, uint64_t y)
424 {
425 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
426 	    ("getcc: invalid operand size %d", opsize));
427 
428 	if (opsize == 1)
429 		return (getcc8(x, y));
430 	else if (opsize == 2)
431 		return (getcc16(x, y));
432 	else if (opsize == 4)
433 		return (getcc32(x, y));
434 	else
435 		return (getcc64(x, y));
436 }
437 
438 /*
439  * Macro creation of functions getaddflags{8,16,32,64}
440  */
441 #define	GETADDFLAGS(sz)							\
442 static u_long								\
443 getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
444 {									\
445 	u_long rflags;							\
446 									\
447 	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
448 	    "=r" (rflags), "+r" (x) : "m" (y));				\
449 	return (rflags);						\
450 } struct __hack
451 
452 GETADDFLAGS(8);
453 GETADDFLAGS(16);
454 GETADDFLAGS(32);
455 GETADDFLAGS(64);
456 
457 static u_long
458 getaddflags(int opsize, uint64_t x, uint64_t y)
459 {
460 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
461 	    ("getaddflags: invalid operand size %d", opsize));
462 
463 	if (opsize == 1)
464 		return (getaddflags8(x, y));
465 	else if (opsize == 2)
466 		return (getaddflags16(x, y));
467 	else if (opsize == 4)
468 		return (getaddflags32(x, y));
469 	else
470 		return (getaddflags64(x, y));
471 }
472 
473 /*
474  * Return the status flags that would result from doing (x & y).
475  */
476 #define	GETANDFLAGS(sz)							\
477 static u_long								\
478 getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
479 {									\
480 	u_long rflags;							\
481 									\
482 	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
483 	    "=r" (rflags), "+r" (x) : "m" (y));				\
484 	return (rflags);						\
485 } struct __hack
486 
487 GETANDFLAGS(8);
488 GETANDFLAGS(16);
489 GETANDFLAGS(32);
490 GETANDFLAGS(64);
491 
492 static u_long
493 getandflags(int opsize, uint64_t x, uint64_t y)
494 {
495 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
496 	    ("getandflags: invalid operand size %d", opsize));
497 
498 	if (opsize == 1)
499 		return (getandflags8(x, y));
500 	else if (opsize == 2)
501 		return (getandflags16(x, y));
502 	else if (opsize == 4)
503 		return (getandflags32(x, y));
504 	else
505 		return (getandflags64(x, y));
506 }
507 
508 static int
509 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
510 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
511 {
512 	int error, size;
513 	enum vm_reg_name reg;
514 	uint8_t byte;
515 	uint64_t val;
516 
517 	size = vie->opsize;
518 	error = EINVAL;
519 
520 	switch (vie->op.op_byte) {
521 	case 0x88:
522 		/*
523 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
524 		 * 88/r:	mov r/m8, r8
525 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
526 		 */
527 		size = 1;	/* override for byte operation */
528 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
529 		if (error == 0)
530 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
531 		break;
532 	case 0x89:
533 		/*
534 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
535 		 * 89/r:	mov r/m16, r16
536 		 * 89/r:	mov r/m32, r32
537 		 * REX.W + 89/r	mov r/m64, r64
538 		 */
539 		reg = gpr_map[vie->reg];
540 		error = vie_read_register(vm, vcpuid, reg, &val);
541 		if (error == 0) {
542 			val &= size2mask[size];
543 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
544 		}
545 		break;
546 	case 0x8A:
547 		/*
548 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
549 		 * 8A/r:	mov r8, r/m8
550 		 * REX + 8A/r:	mov r8, r/m8
551 		 */
552 		size = 1;	/* override for byte operation */
553 		error = memread(vm, vcpuid, gpa, &val, size, arg);
554 		if (error == 0)
555 			error = vie_write_bytereg(vm, vcpuid, vie, val);
556 		break;
557 	case 0x8B:
558 		/*
559 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
560 		 * 8B/r:	mov r16, r/m16
561 		 * 8B/r:	mov r32, r/m32
562 		 * REX.W 8B/r:	mov r64, r/m64
563 		 */
564 		error = memread(vm, vcpuid, gpa, &val, size, arg);
565 		if (error == 0) {
566 			reg = gpr_map[vie->reg];
567 			error = vie_update_register(vm, vcpuid, reg, val, size);
568 		}
569 		break;
570 	case 0xA1:
571 		/*
572 		 * MOV from seg:moffset to AX/EAX/RAX
573 		 * A1:		mov AX, moffs16
574 		 * A1:		mov EAX, moffs32
575 		 * REX.W + A1:	mov RAX, moffs64
576 		 */
577 		error = memread(vm, vcpuid, gpa, &val, size, arg);
578 		if (error == 0) {
579 			reg = VM_REG_GUEST_RAX;
580 			error = vie_update_register(vm, vcpuid, reg, val, size);
581 		}
582 		break;
583 	case 0xA3:
584 		/*
585 		 * MOV from AX/EAX/RAX to seg:moffset
586 		 * A3:		mov moffs16, AX
587 		 * A3:		mov moffs32, EAX
588 		 * REX.W + A3:	mov moffs64, RAX
589 		 */
590 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
591 		if (error == 0) {
592 			val &= size2mask[size];
593 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
594 		}
595 		break;
596 	case 0xC6:
597 		/*
598 		 * MOV from imm8 to mem (ModRM:r/m)
599 		 * C6/0		mov r/m8, imm8
600 		 * REX + C6/0	mov r/m8, imm8
601 		 */
602 		size = 1;	/* override for byte operation */
603 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
604 		break;
605 	case 0xC7:
606 		/*
607 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
608 		 * C7/0		mov r/m16, imm16
609 		 * C7/0		mov r/m32, imm32
610 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
611 		 */
612 		val = vie->immediate & size2mask[size];
613 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
614 		break;
615 	default:
616 		break;
617 	}
618 
619 	return (error);
620 }
621 
622 static int
623 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
624 	     mem_region_read_t memread, mem_region_write_t memwrite,
625 	     void *arg)
626 {
627 	int error, size;
628 	enum vm_reg_name reg;
629 	uint64_t val;
630 
631 	size = vie->opsize;
632 	error = EINVAL;
633 
634 	switch (vie->op.op_byte) {
635 	case 0xB6:
636 		/*
637 		 * MOV and zero extend byte from mem (ModRM:r/m) to
638 		 * reg (ModRM:reg).
639 		 *
640 		 * 0F B6/r		movzx r16, r/m8
641 		 * 0F B6/r		movzx r32, r/m8
642 		 * REX.W + 0F B6/r	movzx r64, r/m8
643 		 */
644 
645 		/* get the first operand */
646 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
647 		if (error)
648 			break;
649 
650 		/* get the second operand */
651 		reg = gpr_map[vie->reg];
652 
653 		/* zero-extend byte */
654 		val = (uint8_t)val;
655 
656 		/* write the result */
657 		error = vie_update_register(vm, vcpuid, reg, val, size);
658 		break;
659 	case 0xB7:
660 		/*
661 		 * MOV and zero extend word from mem (ModRM:r/m) to
662 		 * reg (ModRM:reg).
663 		 *
664 		 * 0F B7/r		movzx r32, r/m16
665 		 * REX.W + 0F B7/r	movzx r64, r/m16
666 		 */
667 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
668 		if (error)
669 			return (error);
670 
671 		reg = gpr_map[vie->reg];
672 
673 		/* zero-extend word */
674 		val = (uint16_t)val;
675 
676 		error = vie_update_register(vm, vcpuid, reg, val, size);
677 		break;
678 	case 0xBE:
679 		/*
680 		 * MOV and sign extend byte from mem (ModRM:r/m) to
681 		 * reg (ModRM:reg).
682 		 *
683 		 * 0F BE/r		movsx r16, r/m8
684 		 * 0F BE/r		movsx r32, r/m8
685 		 * REX.W + 0F BE/r	movsx r64, r/m8
686 		 */
687 
688 		/* get the first operand */
689 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
690 		if (error)
691 			break;
692 
693 		/* get the second operand */
694 		reg = gpr_map[vie->reg];
695 
696 		/* sign extend byte */
697 		val = (int8_t)val;
698 
699 		/* write the result */
700 		error = vie_update_register(vm, vcpuid, reg, val, size);
701 		break;
702 	default:
703 		break;
704 	}
705 	return (error);
706 }
707 
708 /*
709  * Helper function to calculate and validate a linear address.
710  */
711 static int
712 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
713     int opsize, int addrsize, int prot, enum vm_reg_name seg,
714     enum vm_reg_name gpr, uint64_t *gla, int *fault)
715 {
716 	struct seg_desc desc;
717 	uint64_t cr0, val, rflags;
718 	int error;
719 
720 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
721 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
722 
723 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
724 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
725 
726 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
727 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
728 	    __func__, error, seg));
729 
730 	error = vie_read_register(vm, vcpuid, gpr, &val);
731 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
732 	    error, gpr));
733 
734 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
735 	    addrsize, prot, gla)) {
736 		if (seg == VM_REG_GUEST_SS)
737 			vm_inject_ss(vm, vcpuid, 0);
738 		else
739 			vm_inject_gp(vm, vcpuid);
740 		goto guest_fault;
741 	}
742 
743 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
744 		if (seg == VM_REG_GUEST_SS)
745 			vm_inject_ss(vm, vcpuid, 0);
746 		else
747 			vm_inject_gp(vm, vcpuid);
748 		goto guest_fault;
749 	}
750 
751 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
752 		vm_inject_ac(vm, vcpuid, 0);
753 		goto guest_fault;
754 	}
755 
756 	*fault = 0;
757 	return (0);
758 
759 guest_fault:
760 	*fault = 1;
761 	return (0);
762 }
763 
764 static int
765 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
766     struct vm_guest_paging *paging, mem_region_read_t memread,
767     mem_region_write_t memwrite, void *arg)
768 {
769 #ifdef _KERNEL
770 	struct vm_copyinfo copyinfo[2];
771 #else
772 	struct iovec copyinfo[2];
773 #endif
774 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
775 	uint64_t rcx, rdi, rsi, rflags;
776 	int error, fault, opsize, seg, repeat;
777 
778 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
779 	val = 0;
780 	error = 0;
781 
782 	/*
783 	 * XXX although the MOVS instruction is only supposed to be used with
784 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
785 	 *
786 	 * Empirically the "repnz" prefix has identical behavior to "rep"
787 	 * and the zero flag does not make a difference.
788 	 */
789 	repeat = vie->repz_present | vie->repnz_present;
790 
791 	if (repeat) {
792 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
793 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
794 
795 		/*
796 		 * The count register is %rcx, %ecx or %cx depending on the
797 		 * address size of the instruction.
798 		 */
799 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
800 			error = 0;
801 			goto done;
802 		}
803 	}
804 
805 	/*
806 	 *	Source		Destination	Comments
807 	 *	--------------------------------------------
808 	 * (1)  memory		memory		n/a
809 	 * (2)  memory		mmio		emulated
810 	 * (3)  mmio		memory		emulated
811 	 * (4)  mmio		mmio		emulated
812 	 *
813 	 * At this point we don't have sufficient information to distinguish
814 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
815 	 * out because it will succeed only when operating on regular memory.
816 	 *
817 	 * XXX the emulation doesn't properly handle the case where 'gpa'
818 	 * is straddling the boundary between the normal memory and MMIO.
819 	 */
820 
821 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
822 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
823 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
824 	if (error || fault)
825 		goto done;
826 
827 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
828 	    copyinfo, nitems(copyinfo), &fault);
829 	if (error == 0) {
830 		if (fault)
831 			goto done;	/* Resume guest to handle fault */
832 
833 		/*
834 		 * case (2): read from system memory and write to mmio.
835 		 */
836 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
837 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
838 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
839 		if (error)
840 			goto done;
841 	} else {
842 		/*
843 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
844 		 * if 'srcaddr' is in the mmio space.
845 		 */
846 
847 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
848 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
849 		    &fault);
850 		if (error || fault)
851 			goto done;
852 
853 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
854 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
855 		if (error == 0) {
856 			if (fault)
857 				goto done;    /* Resume guest to handle fault */
858 
859 			/*
860 			 * case (3): read from MMIO and write to system memory.
861 			 *
862 			 * A MMIO read can have side-effects so we
863 			 * commit to it only after vm_copy_setup() is
864 			 * successful. If a page-fault needs to be
865 			 * injected into the guest then it will happen
866 			 * before the MMIO read is attempted.
867 			 */
868 			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
869 			if (error)
870 				goto done;
871 
872 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
873 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
874 		} else {
875 			/*
876 			 * Case (4): read from and write to mmio.
877 			 *
878 			 * Commit to the MMIO read/write (with potential
879 			 * side-effects) only after we are sure that the
880 			 * instruction is not going to be restarted due
881 			 * to address translation faults.
882 			 */
883 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
884 			    PROT_READ, &srcgpa, &fault);
885 			if (error || fault)
886 				goto done;
887 
888 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
889 			   PROT_WRITE, &dstgpa, &fault);
890 			if (error || fault)
891 				goto done;
892 
893 			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
894 			if (error)
895 				goto done;
896 
897 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
898 			if (error)
899 				goto done;
900 		}
901 	}
902 
903 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
904 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
905 
906 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
907 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
908 
909 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
910 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
911 
912 	if (rflags & PSL_D) {
913 		rsi -= opsize;
914 		rdi -= opsize;
915 	} else {
916 		rsi += opsize;
917 		rdi += opsize;
918 	}
919 
920 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
921 	    vie->addrsize);
922 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
923 
924 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
925 	    vie->addrsize);
926 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
927 
928 	if (repeat) {
929 		rcx = rcx - 1;
930 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
931 		    rcx, vie->addrsize);
932 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
933 
934 		/*
935 		 * Repeat the instruction if the count register is not zero.
936 		 */
937 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
938 			vm_restart_instruction(vm, vcpuid);
939 	}
940 done:
941 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
942 	    __func__, error));
943 	return (error);
944 }
945 
946 static int
947 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
948     struct vm_guest_paging *paging, mem_region_read_t memread,
949     mem_region_write_t memwrite, void *arg)
950 {
951 	int error, opsize, repeat;
952 	uint64_t val;
953 	uint64_t rcx, rdi, rflags;
954 
955 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
956 	repeat = vie->repz_present | vie->repnz_present;
957 
958 	if (repeat) {
959 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
960 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
961 
962 		/*
963 		 * The count register is %rcx, %ecx or %cx depending on the
964 		 * address size of the instruction.
965 		 */
966 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
967 			return (0);
968 	}
969 
970 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
971 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
972 
973 	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
974 	if (error)
975 		return (error);
976 
977 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
978 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
979 
980 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
981 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
982 
983 	if (rflags & PSL_D)
984 		rdi -= opsize;
985 	else
986 		rdi += opsize;
987 
988 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
989 	    vie->addrsize);
990 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
991 
992 	if (repeat) {
993 		rcx = rcx - 1;
994 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
995 		    rcx, vie->addrsize);
996 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
997 
998 		/*
999 		 * Repeat the instruction if the count register is not zero.
1000 		 */
1001 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1002 			vm_restart_instruction(vm, vcpuid);
1003 	}
1004 
1005 	return (0);
1006 }
1007 
1008 static int
1009 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1010 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1011 {
1012 	int error, size;
1013 	enum vm_reg_name reg;
1014 	uint64_t result, rflags, rflags2, val1, val2;
1015 
1016 	size = vie->opsize;
1017 	error = EINVAL;
1018 
1019 	switch (vie->op.op_byte) {
1020 	case 0x23:
1021 		/*
1022 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1023 		 * result in reg.
1024 		 *
1025 		 * 23/r		and r16, r/m16
1026 		 * 23/r		and r32, r/m32
1027 		 * REX.W + 23/r	and r64, r/m64
1028 		 */
1029 
1030 		/* get the first operand */
1031 		reg = gpr_map[vie->reg];
1032 		error = vie_read_register(vm, vcpuid, reg, &val1);
1033 		if (error)
1034 			break;
1035 
1036 		/* get the second operand */
1037 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1038 		if (error)
1039 			break;
1040 
1041 		/* perform the operation and write the result */
1042 		result = val1 & val2;
1043 		error = vie_update_register(vm, vcpuid, reg, result, size);
1044 		break;
1045 	case 0x81:
1046 	case 0x83:
1047 		/*
1048 		 * AND mem (ModRM:r/m) with immediate and store the
1049 		 * result in mem.
1050 		 *
1051 		 * 81 /4		and r/m16, imm16
1052 		 * 81 /4		and r/m32, imm32
1053 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1054 		 *
1055 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1056 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1057 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1058 		 */
1059 
1060 		/* get the first operand */
1061                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1062                 if (error)
1063 			break;
1064 
1065                 /*
1066 		 * perform the operation with the pre-fetched immediate
1067 		 * operand and write the result
1068 		 */
1069                 result = val1 & vie->immediate;
1070                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1071 		break;
1072 	default:
1073 		break;
1074 	}
1075 	if (error)
1076 		return (error);
1077 
1078 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1079 	if (error)
1080 		return (error);
1081 
1082 	/*
1083 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1084 	 * to the result; AF is undefined.
1085 	 *
1086 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1087 	 */
1088 	rflags2 = getcc(size, result, 0);
1089 	rflags &= ~RFLAGS_STATUS_BITS;
1090 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1091 
1092 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1093 	return (error);
1094 }
1095 
1096 static int
1097 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1098 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1099 {
1100 	int error, size;
1101 	enum vm_reg_name reg;
1102 	uint64_t result, rflags, rflags2, val1, val2;
1103 
1104 	size = vie->opsize;
1105 	error = EINVAL;
1106 
1107 	switch (vie->op.op_byte) {
1108 	case 0x0B:
1109 		/*
1110 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1111 		 * result in reg.
1112 		 *
1113 		 * 0b/r         or r16, r/m16
1114 		 * 0b/r         or r32, r/m32
1115 		 * REX.W + 0b/r or r64, r/m64
1116 		 */
1117 
1118 		/* get the first operand */
1119 		reg = gpr_map[vie->reg];
1120 		error = vie_read_register(vm, vcpuid, reg, &val1);
1121 		if (error)
1122 			break;
1123 
1124 		/* get the second operand */
1125 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1126 		if (error)
1127 			break;
1128 
1129 		/* perform the operation and write the result */
1130 		result = val1 | val2;
1131 		error = vie_update_register(vm, vcpuid, reg, result, size);
1132 		break;
1133 	case 0x81:
1134 	case 0x83:
1135 		/*
1136 		 * OR mem (ModRM:r/m) with immediate and store the
1137 		 * result in mem.
1138 		 *
1139 		 * 81 /1		or r/m16, imm16
1140 		 * 81 /1		or r/m32, imm32
1141 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1142 		 *
1143 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1144 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1145 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1146 		 */
1147 
1148 		/* get the first operand */
1149                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1150                 if (error)
1151 			break;
1152 
1153                 /*
1154 		 * perform the operation with the pre-fetched immediate
1155 		 * operand and write the result
1156 		 */
1157                 result = val1 | vie->immediate;
1158                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1159 		break;
1160 	default:
1161 		break;
1162 	}
1163 	if (error)
1164 		return (error);
1165 
1166 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1167 	if (error)
1168 		return (error);
1169 
1170 	/*
1171 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1172 	 * to the result; AF is undefined.
1173 	 *
1174 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1175 	 */
1176 	rflags2 = getcc(size, result, 0);
1177 	rflags &= ~RFLAGS_STATUS_BITS;
1178 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1179 
1180 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1181 	return (error);
1182 }
1183 
1184 static int
1185 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1186 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1187 {
1188 	int error, size;
1189 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1190 	enum vm_reg_name reg;
1191 
1192 	size = vie->opsize;
1193 	switch (vie->op.op_byte) {
1194 	case 0x39:
1195 	case 0x3B:
1196 		/*
1197 		 * 39/r		CMP r/m16, r16
1198 		 * 39/r		CMP r/m32, r32
1199 		 * REX.W 39/r	CMP r/m64, r64
1200 		 *
1201 		 * 3B/r		CMP r16, r/m16
1202 		 * 3B/r		CMP r32, r/m32
1203 		 * REX.W + 3B/r	CMP r64, r/m64
1204 		 *
1205 		 * Compare the first operand with the second operand and
1206 		 * set status flags in EFLAGS register. The comparison is
1207 		 * performed by subtracting the second operand from the first
1208 		 * operand and then setting the status flags.
1209 		 */
1210 
1211 		/* Get the register operand */
1212 		reg = gpr_map[vie->reg];
1213 		error = vie_read_register(vm, vcpuid, reg, &regop);
1214 		if (error)
1215 			return (error);
1216 
1217 		/* Get the memory operand */
1218 		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1219 		if (error)
1220 			return (error);
1221 
1222 		if (vie->op.op_byte == 0x3B) {
1223 			op1 = regop;
1224 			op2 = memop;
1225 		} else {
1226 			op1 = memop;
1227 			op2 = regop;
1228 		}
1229 		rflags2 = getcc(size, op1, op2);
1230 		break;
1231 	case 0x80:
1232 	case 0x81:
1233 	case 0x83:
1234 		/*
1235 		 * 80 /7		cmp r/m8, imm8
1236 		 * REX + 80 /7		cmp r/m8, imm8
1237 		 *
1238 		 * 81 /7		cmp r/m16, imm16
1239 		 * 81 /7		cmp r/m32, imm32
1240 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1241 		 *
1242 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1243 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1244 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1245 		 *
1246 		 * Compare mem (ModRM:r/m) with immediate and set
1247 		 * status flags according to the results.  The
1248 		 * comparison is performed by subtracting the
1249 		 * immediate from the first operand and then setting
1250 		 * the status flags.
1251 		 *
1252 		 */
1253 		if (vie->op.op_byte == 0x80)
1254 			size = 1;
1255 
1256 		/* get the first operand */
1257                 error = memread(vm, vcpuid, gpa, &op1, size, arg);
1258 		if (error)
1259 			return (error);
1260 
1261 		rflags2 = getcc(size, op1, vie->immediate);
1262 		break;
1263 	default:
1264 		return (EINVAL);
1265 	}
1266 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1267 	if (error)
1268 		return (error);
1269 	rflags &= ~RFLAGS_STATUS_BITS;
1270 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1271 
1272 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1273 	return (error);
1274 }
1275 
1276 static int
1277 emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1278     mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1279 {
1280 	int error, size;
1281 	uint64_t op1, rflags, rflags2;
1282 
1283 	size = vie->opsize;
1284 	error = EINVAL;
1285 
1286 	switch (vie->op.op_byte) {
1287 	case 0xF7:
1288 		/*
1289 		 * F7 /0		test r/m16, imm16
1290 		 * F7 /0		test r/m32, imm32
1291 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1292 		 *
1293 		 * Test mem (ModRM:r/m) with immediate and set status
1294 		 * flags according to the results.  The comparison is
1295 		 * performed by anding the immediate from the first
1296 		 * operand and then setting the status flags.
1297 		 */
1298 		if ((vie->reg & 7) != 0)
1299 			return (EINVAL);
1300 
1301 		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1302 		if (error)
1303 			return (error);
1304 
1305 		rflags2 = getandflags(size, op1, vie->immediate);
1306 		break;
1307 	default:
1308 		return (EINVAL);
1309 	}
1310 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1311 	if (error)
1312 		return (error);
1313 
1314 	/*
1315 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1316 	 * to the result; AF is undefined.
1317 	 */
1318 	rflags &= ~RFLAGS_STATUS_BITS;
1319 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1320 
1321 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1322 	return (error);
1323 }
1324 
1325 static int
1326 emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1327     struct vm_guest_paging *paging, mem_region_read_t memread,
1328     mem_region_write_t memwrite, void *arg)
1329 {
1330 	uint64_t src1, src2, dst, rflags;
1331 	unsigned start, len;
1332 	int error, size;
1333 
1334 	size = vie->opsize;
1335 	error = EINVAL;
1336 
1337 	/*
1338 	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
1339 	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
1340 	 *
1341 	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
1342 	 * Vex.vvvv.
1343 	 *
1344 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1345 	 */
1346 	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1347 		size = 4;
1348 
1349 	/*
1350 	 * Extracts contiguous bits from the first /source/ operand (second
1351 	 * operand) using an index and length specified in the second /source/
1352 	 * operand (third operand).
1353 	 */
1354 	error = memread(vm, vcpuid, gpa, &src1, size, arg);
1355 	if (error)
1356 		return (error);
1357 	error = vie_read_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
1358 	if (error)
1359 		return (error);
1360 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1361 	if (error)
1362 		return (error);
1363 
1364 	start = (src2 & 0xff);
1365 	len = (src2 & 0xff00) >> 8;
1366 
1367 	/* If no bits are extracted, the destination register is cleared. */
1368 	dst = 0;
1369 
1370 	/* If START exceeds the operand size, no bits are extracted. */
1371 	if (start > size * 8)
1372 		goto done;
1373 	/* Length is bounded by both the destination size and start offset. */
1374 	if (start + len > size * 8)
1375 		len = (size * 8) - start;
1376 	if (len == 0)
1377 		goto done;
1378 
1379 	if (start > 0)
1380 		src1 = (src1 >> start);
1381 	if (len < 64)
1382 		src1 = src1 & ((1ull << len) - 1);
1383 	dst = src1;
1384 
1385 done:
1386 	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
1387 	if (error)
1388 		return (error);
1389 
1390 	/*
1391 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1392 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1393 	 */
1394 	rflags &= ~RFLAGS_STATUS_BITS;
1395 	if (dst == 0)
1396 		rflags |= PSL_Z;
1397 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
1398 	    8);
1399 	return (error);
1400 }
1401 
1402 static int
1403 emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1404 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1405 {
1406 	int error, size;
1407 	uint64_t nval, rflags, rflags2, val1, val2;
1408 	enum vm_reg_name reg;
1409 
1410 	size = vie->opsize;
1411 	error = EINVAL;
1412 
1413 	switch (vie->op.op_byte) {
1414 	case 0x03:
1415 		/*
1416 		 * ADD r/m to r and store the result in r
1417 		 *
1418 		 * 03/r            ADD r16, r/m16
1419 		 * 03/r            ADD r32, r/m32
1420 		 * REX.W + 03/r    ADD r64, r/m64
1421 		 */
1422 
1423 		/* get the first operand */
1424 		reg = gpr_map[vie->reg];
1425 		error = vie_read_register(vm, vcpuid, reg, &val1);
1426 		if (error)
1427 			break;
1428 
1429 		/* get the second operand */
1430 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1431 		if (error)
1432 			break;
1433 
1434 		/* perform the operation and write the result */
1435 		nval = val1 + val2;
1436 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1437 		break;
1438 	default:
1439 		break;
1440 	}
1441 
1442 	if (!error) {
1443 		rflags2 = getaddflags(size, val1, val2);
1444 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1445 		    &rflags);
1446 		if (error)
1447 			return (error);
1448 
1449 		rflags &= ~RFLAGS_STATUS_BITS;
1450 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1451 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1452 		    rflags, 8);
1453 	}
1454 
1455 	return (error);
1456 }
1457 
1458 static int
1459 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1460 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1461 {
1462 	int error, size;
1463 	uint64_t nval, rflags, rflags2, val1, val2;
1464 	enum vm_reg_name reg;
1465 
1466 	size = vie->opsize;
1467 	error = EINVAL;
1468 
1469 	switch (vie->op.op_byte) {
1470 	case 0x2B:
1471 		/*
1472 		 * SUB r/m from r and store the result in r
1473 		 *
1474 		 * 2B/r            SUB r16, r/m16
1475 		 * 2B/r            SUB r32, r/m32
1476 		 * REX.W + 2B/r    SUB r64, r/m64
1477 		 */
1478 
1479 		/* get the first operand */
1480 		reg = gpr_map[vie->reg];
1481 		error = vie_read_register(vm, vcpuid, reg, &val1);
1482 		if (error)
1483 			break;
1484 
1485 		/* get the second operand */
1486 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1487 		if (error)
1488 			break;
1489 
1490 		/* perform the operation and write the result */
1491 		nval = val1 - val2;
1492 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1493 		break;
1494 	default:
1495 		break;
1496 	}
1497 
1498 	if (!error) {
1499 		rflags2 = getcc(size, val1, val2);
1500 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1501 		    &rflags);
1502 		if (error)
1503 			return (error);
1504 
1505 		rflags &= ~RFLAGS_STATUS_BITS;
1506 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1507 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1508 		    rflags, 8);
1509 	}
1510 
1511 	return (error);
1512 }
1513 
1514 static int
1515 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1516     struct vm_guest_paging *paging, mem_region_read_t memread,
1517     mem_region_write_t memwrite, void *arg)
1518 {
1519 #ifdef _KERNEL
1520 	struct vm_copyinfo copyinfo[2];
1521 #else
1522 	struct iovec copyinfo[2];
1523 #endif
1524 	struct seg_desc ss_desc;
1525 	uint64_t cr0, rflags, rsp, stack_gla, val;
1526 	int error, fault, size, stackaddrsize, pushop;
1527 
1528 	val = 0;
1529 	size = vie->opsize;
1530 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1531 
1532 	/*
1533 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1534 	 */
1535 	if (paging->cpu_mode == CPU_MODE_REAL) {
1536 		stackaddrsize = 2;
1537 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1538 		/*
1539 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1540 		 * - Stack pointer size is always 64-bits.
1541 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1542 		 * - 16-bit PUSH/POP is supported by using the operand size
1543 		 *   override prefix (66H).
1544 		 */
1545 		stackaddrsize = 8;
1546 		size = vie->opsize_override ? 2 : 8;
1547 	} else {
1548 		/*
1549 		 * In protected or compatibility mode the 'B' flag in the
1550 		 * stack-segment descriptor determines the size of the
1551 		 * stack pointer.
1552 		 */
1553 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1554 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1555 		    __func__, error));
1556 		if (SEG_DESC_DEF32(ss_desc.access))
1557 			stackaddrsize = 4;
1558 		else
1559 			stackaddrsize = 2;
1560 	}
1561 
1562 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1563 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1564 
1565 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1566 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1567 
1568 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1569 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1570 	if (pushop) {
1571 		rsp -= size;
1572 	}
1573 
1574 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1575 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1576 	    &stack_gla)) {
1577 		vm_inject_ss(vm, vcpuid, 0);
1578 		return (0);
1579 	}
1580 
1581 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1582 		vm_inject_ss(vm, vcpuid, 0);
1583 		return (0);
1584 	}
1585 
1586 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1587 		vm_inject_ac(vm, vcpuid, 0);
1588 		return (0);
1589 	}
1590 
1591 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1592 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1593 	    &fault);
1594 	if (error || fault)
1595 		return (error);
1596 
1597 	if (pushop) {
1598 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1599 		if (error == 0)
1600 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1601 	} else {
1602 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1603 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1604 		rsp += size;
1605 	}
1606 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1607 
1608 	if (error == 0) {
1609 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1610 		    stackaddrsize);
1611 		KASSERT(error == 0, ("error %d updating rsp", error));
1612 	}
1613 	return (error);
1614 }
1615 
1616 static int
1617 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1618     struct vm_guest_paging *paging, mem_region_read_t memread,
1619     mem_region_write_t memwrite, void *arg)
1620 {
1621 	int error;
1622 
1623 	/*
1624 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1625 	 *
1626 	 * PUSH is part of the group 5 extended opcodes and is identified
1627 	 * by ModRM:reg = b110.
1628 	 */
1629 	if ((vie->reg & 7) != 6)
1630 		return (EINVAL);
1631 
1632 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1633 	    memwrite, arg);
1634 	return (error);
1635 }
1636 
1637 static int
1638 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1639     struct vm_guest_paging *paging, mem_region_read_t memread,
1640     mem_region_write_t memwrite, void *arg)
1641 {
1642 	int error;
1643 
1644 	/*
1645 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1646 	 *
1647 	 * POP is part of the group 1A extended opcodes and is identified
1648 	 * by ModRM:reg = b000.
1649 	 */
1650 	if ((vie->reg & 7) != 0)
1651 		return (EINVAL);
1652 
1653 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1654 	    memwrite, arg);
1655 	return (error);
1656 }
1657 
1658 static int
1659 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1660     struct vm_guest_paging *paging, mem_region_read_t memread,
1661     mem_region_write_t memwrite, void *memarg)
1662 {
1663 	int error;
1664 
1665 	switch (vie->reg & 7) {
1666 	case 0x1:	/* OR */
1667 		error = emulate_or(vm, vcpuid, gpa, vie,
1668 		    memread, memwrite, memarg);
1669 		break;
1670 	case 0x4:	/* AND */
1671 		error = emulate_and(vm, vcpuid, gpa, vie,
1672 		    memread, memwrite, memarg);
1673 		break;
1674 	case 0x7:	/* CMP */
1675 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1676 		    memread, memwrite, memarg);
1677 		break;
1678 	default:
1679 		error = EINVAL;
1680 		break;
1681 	}
1682 
1683 	return (error);
1684 }
1685 
1686 static int
1687 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1688     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1689 {
1690 	uint64_t val, rflags;
1691 	int error, bitmask, bitoff;
1692 
1693 	/*
1694 	 * 0F BA is a Group 8 extended opcode.
1695 	 *
1696 	 * Currently we only emulate the 'Bit Test' instruction which is
1697 	 * identified by a ModR/M:reg encoding of 100b.
1698 	 */
1699 	if ((vie->reg & 7) != 4)
1700 		return (EINVAL);
1701 
1702 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1703 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1704 
1705 	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1706 	if (error)
1707 		return (error);
1708 
1709 	/*
1710 	 * Intel SDM, Vol 2, Table 3-2:
1711 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1712 	 */
1713 	bitmask = vie->opsize * 8 - 1;
1714 	bitoff = vie->immediate & bitmask;
1715 
1716 	/* Copy the bit into the Carry flag in %rflags */
1717 	if (val & (1UL << bitoff))
1718 		rflags |= PSL_C;
1719 	else
1720 		rflags &= ~PSL_C;
1721 
1722 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1723 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1724 
1725 	return (0);
1726 }
1727 
1728 static int
1729 emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1730     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1731 {
1732 	int error;
1733 	uint64_t buf;
1734 
1735 	switch (vie->reg & 7) {
1736 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1737 		if (vie->mod == 0x3) {
1738 			/*
1739 			 * SFENCE.  Ignore it, VM exit provides enough
1740 			 * barriers on its own.
1741 			 */
1742 			error = 0;
1743 		} else {
1744 			/*
1745 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1746 			 * rights.
1747 			 */
1748 			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
1749 		}
1750 		break;
1751 	default:
1752 		error = EINVAL;
1753 		break;
1754 	}
1755 
1756 	return (error);
1757 }
1758 
1759 int
1760 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1761     struct vm_guest_paging *paging, mem_region_read_t memread,
1762     mem_region_write_t memwrite, void *memarg)
1763 {
1764 	int error;
1765 
1766 	if (!vie->decoded)
1767 		return (EINVAL);
1768 
1769 	switch (vie->op.op_type) {
1770 	case VIE_OP_TYPE_GROUP1:
1771 		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1772 		    memwrite, memarg);
1773 		break;
1774 	case VIE_OP_TYPE_POP:
1775 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1776 		    memwrite, memarg);
1777 		break;
1778 	case VIE_OP_TYPE_PUSH:
1779 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1780 		    memwrite, memarg);
1781 		break;
1782 	case VIE_OP_TYPE_CMP:
1783 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1784 				    memread, memwrite, memarg);
1785 		break;
1786 	case VIE_OP_TYPE_MOV:
1787 		error = emulate_mov(vm, vcpuid, gpa, vie,
1788 				    memread, memwrite, memarg);
1789 		break;
1790 	case VIE_OP_TYPE_MOVSX:
1791 	case VIE_OP_TYPE_MOVZX:
1792 		error = emulate_movx(vm, vcpuid, gpa, vie,
1793 				     memread, memwrite, memarg);
1794 		break;
1795 	case VIE_OP_TYPE_MOVS:
1796 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1797 		    memwrite, memarg);
1798 		break;
1799 	case VIE_OP_TYPE_STOS:
1800 		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1801 		    memwrite, memarg);
1802 		break;
1803 	case VIE_OP_TYPE_AND:
1804 		error = emulate_and(vm, vcpuid, gpa, vie,
1805 				    memread, memwrite, memarg);
1806 		break;
1807 	case VIE_OP_TYPE_OR:
1808 		error = emulate_or(vm, vcpuid, gpa, vie,
1809 				    memread, memwrite, memarg);
1810 		break;
1811 	case VIE_OP_TYPE_SUB:
1812 		error = emulate_sub(vm, vcpuid, gpa, vie,
1813 				    memread, memwrite, memarg);
1814 		break;
1815 	case VIE_OP_TYPE_BITTEST:
1816 		error = emulate_bittest(vm, vcpuid, gpa, vie,
1817 		    memread, memwrite, memarg);
1818 		break;
1819 	case VIE_OP_TYPE_TWOB_GRP15:
1820 		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
1821 		    memread, memwrite, memarg);
1822 		break;
1823 	case VIE_OP_TYPE_ADD:
1824 		error = emulate_add(vm, vcpuid, gpa, vie, memread,
1825 		    memwrite, memarg);
1826 		break;
1827 	case VIE_OP_TYPE_TEST:
1828 		error = emulate_test(vm, vcpuid, gpa, vie,
1829 		    memread, memwrite, memarg);
1830 		break;
1831 	case VIE_OP_TYPE_BEXTR:
1832 		error = emulate_bextr(vm, vcpuid, gpa, vie, paging,
1833 		    memread, memwrite, memarg);
1834 		break;
1835 	default:
1836 		error = EINVAL;
1837 		break;
1838 	}
1839 
1840 	return (error);
1841 }
1842 
1843 int
1844 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1845 {
1846 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1847 	    ("%s: invalid size %d", __func__, size));
1848 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1849 
1850 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1851 		return (0);
1852 
1853 	return ((gla & (size - 1)) ? 1 : 0);
1854 }
1855 
1856 int
1857 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1858 {
1859 	uint64_t mask;
1860 
1861 	if (cpu_mode != CPU_MODE_64BIT)
1862 		return (0);
1863 
1864 	/*
1865 	 * The value of the bit 47 in the 'gla' should be replicated in the
1866 	 * most significant 16 bits.
1867 	 */
1868 	mask = ~((1UL << 48) - 1);
1869 	if (gla & (1UL << 47))
1870 		return ((gla & mask) != mask);
1871 	else
1872 		return ((gla & mask) != 0);
1873 }
1874 
1875 uint64_t
1876 vie_size2mask(int size)
1877 {
1878 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1879 	    ("vie_size2mask: invalid size %d", size));
1880 	return (size2mask[size]);
1881 }
1882 
1883 int
1884 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1885     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1886     int prot, uint64_t *gla)
1887 {
1888 	uint64_t firstoff, low_limit, high_limit, segbase;
1889 	int glasize, type;
1890 
1891 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1892 	    ("%s: invalid segment %d", __func__, seg));
1893 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1894 	    ("%s: invalid operand size %d", __func__, length));
1895 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1896 	    ("%s: invalid prot %#x", __func__, prot));
1897 
1898 	firstoff = offset;
1899 	if (cpu_mode == CPU_MODE_64BIT) {
1900 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1901 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1902 		glasize = 8;
1903 	} else {
1904 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1905 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1906 		glasize = 4;
1907 		/*
1908 		 * If the segment selector is loaded with a NULL selector
1909 		 * then the descriptor is unusable and attempting to use
1910 		 * it results in a #GP(0).
1911 		 */
1912 		if (SEG_DESC_UNUSABLE(desc->access))
1913 			return (-1);
1914 
1915 		/*
1916 		 * The processor generates a #NP exception when a segment
1917 		 * register is loaded with a selector that points to a
1918 		 * descriptor that is not present. If this was the case then
1919 		 * it would have been checked before the VM-exit.
1920 		 */
1921 		KASSERT(SEG_DESC_PRESENT(desc->access),
1922 		    ("segment %d not present: %#x", seg, desc->access));
1923 
1924 		/*
1925 		 * The descriptor type must indicate a code/data segment.
1926 		 */
1927 		type = SEG_DESC_TYPE(desc->access);
1928 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1929 		    "descriptor type %#x", seg, type));
1930 
1931 		if (prot & PROT_READ) {
1932 			/* #GP on a read access to a exec-only code segment */
1933 			if ((type & 0xA) == 0x8)
1934 				return (-1);
1935 		}
1936 
1937 		if (prot & PROT_WRITE) {
1938 			/*
1939 			 * #GP on a write access to a code segment or a
1940 			 * read-only data segment.
1941 			 */
1942 			if (type & 0x8)			/* code segment */
1943 				return (-1);
1944 
1945 			if ((type & 0xA) == 0)		/* read-only data seg */
1946 				return (-1);
1947 		}
1948 
1949 		/*
1950 		 * 'desc->limit' is fully expanded taking granularity into
1951 		 * account.
1952 		 */
1953 		if ((type & 0xC) == 0x4) {
1954 			/* expand-down data segment */
1955 			low_limit = desc->limit + 1;
1956 			high_limit = SEG_DESC_DEF32(desc->access) ?
1957 			    0xffffffff : 0xffff;
1958 		} else {
1959 			/* code segment or expand-up data segment */
1960 			low_limit = 0;
1961 			high_limit = desc->limit;
1962 		}
1963 
1964 		while (length > 0) {
1965 			offset &= vie_size2mask(addrsize);
1966 			if (offset < low_limit || offset > high_limit)
1967 				return (-1);
1968 			offset++;
1969 			length--;
1970 		}
1971 	}
1972 
1973 	/*
1974 	 * In 64-bit mode all segments except %fs and %gs have a segment
1975 	 * base address of 0.
1976 	 */
1977 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1978 	    seg != VM_REG_GUEST_GS) {
1979 		segbase = 0;
1980 	} else {
1981 		segbase = desc->base;
1982 	}
1983 
1984 	/*
1985 	 * Truncate 'firstoff' to the effective address size before adding
1986 	 * it to the segment base.
1987 	 */
1988 	firstoff &= vie_size2mask(addrsize);
1989 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1990 	return (0);
1991 }
1992 
1993 void
1994 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1995 {
1996 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1997 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1998 
1999 	bzero(vie, sizeof(struct vie));
2000 
2001 	vie->base_register = VM_REG_LAST;
2002 	vie->index_register = VM_REG_LAST;
2003 	vie->segment_register = VM_REG_LAST;
2004 
2005 	if (inst_length) {
2006 		bcopy(inst_bytes, vie->inst, inst_length);
2007 		vie->num_valid = inst_length;
2008 	}
2009 }
2010 
2011 #ifdef _KERNEL
2012 static int
2013 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2014 {
2015 	int error_code = 0;
2016 
2017 	if (pte & PG_V)
2018 		error_code |= PGEX_P;
2019 	if (prot & VM_PROT_WRITE)
2020 		error_code |= PGEX_W;
2021 	if (usermode)
2022 		error_code |= PGEX_U;
2023 	if (rsvd)
2024 		error_code |= PGEX_RSV;
2025 	if (prot & VM_PROT_EXECUTE)
2026 		error_code |= PGEX_I;
2027 
2028 	return (error_code);
2029 }
2030 
2031 static void
2032 ptp_release(void **cookie)
2033 {
2034 	if (*cookie != NULL) {
2035 		vm_gpa_release(*cookie);
2036 		*cookie = NULL;
2037 	}
2038 }
2039 
2040 static void *
2041 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
2042 {
2043 	void *ptr;
2044 
2045 	ptp_release(cookie);
2046 	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
2047 	return (ptr);
2048 }
2049 
2050 static int
2051 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2052     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
2053 {
2054 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
2055 	u_int retries;
2056 	uint64_t *ptpbase, ptpphys, pte, pgsize;
2057 	uint32_t *ptpbase32, pte32;
2058 	void *cookie;
2059 
2060 	*guest_fault = 0;
2061 
2062 	usermode = (paging->cpl == 3 ? 1 : 0);
2063 	writable = prot & VM_PROT_WRITE;
2064 	cookie = NULL;
2065 	retval = 0;
2066 	retries = 0;
2067 restart:
2068 	ptpphys = paging->cr3;		/* root of the page tables */
2069 	ptp_release(&cookie);
2070 	if (retries++ > 0)
2071 		maybe_yield();
2072 
2073 	if (vie_canonical_check(paging->cpu_mode, gla)) {
2074 		/*
2075 		 * XXX assuming a non-stack reference otherwise a stack fault
2076 		 * should be generated.
2077 		 */
2078 		if (!check_only)
2079 			vm_inject_gp(vm, vcpuid);
2080 		goto fault;
2081 	}
2082 
2083 	if (paging->paging_mode == PAGING_MODE_FLAT) {
2084 		*gpa = gla;
2085 		goto done;
2086 	}
2087 
2088 	if (paging->paging_mode == PAGING_MODE_32) {
2089 		nlevels = 2;
2090 		while (--nlevels >= 0) {
2091 			/* Zero out the lower 12 bits. */
2092 			ptpphys &= ~0xfff;
2093 
2094 			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
2095 			    &cookie);
2096 
2097 			if (ptpbase32 == NULL)
2098 				goto error;
2099 
2100 			ptpshift = PAGE_SHIFT + nlevels * 10;
2101 			ptpindex = (gla >> ptpshift) & 0x3FF;
2102 			pgsize = 1UL << ptpshift;
2103 
2104 			pte32 = ptpbase32[ptpindex];
2105 
2106 			if ((pte32 & PG_V) == 0 ||
2107 			    (usermode && (pte32 & PG_U) == 0) ||
2108 			    (writable && (pte32 & PG_RW) == 0)) {
2109 				if (!check_only) {
2110 					pfcode = pf_error_code(usermode, prot, 0,
2111 					    pte32);
2112 					vm_inject_pf(vm, vcpuid, pfcode, gla);
2113 				}
2114 				goto fault;
2115 			}
2116 
2117 			/*
2118 			 * Emulate the x86 MMU's management of the accessed
2119 			 * and dirty flags. While the accessed flag is set
2120 			 * at every level of the page table, the dirty flag
2121 			 * is only set at the last level providing the guest
2122 			 * physical address.
2123 			 */
2124 			if (!check_only && (pte32 & PG_A) == 0) {
2125 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
2126 				    pte32, pte32 | PG_A) == 0) {
2127 					goto restart;
2128 				}
2129 			}
2130 
2131 			/* XXX must be ignored if CR4.PSE=0 */
2132 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
2133 				break;
2134 
2135 			ptpphys = pte32;
2136 		}
2137 
2138 		/* Set the dirty bit in the page table entry if necessary */
2139 		if (!check_only && writable && (pte32 & PG_M) == 0) {
2140 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
2141 			    pte32, pte32 | PG_M) == 0) {
2142 				goto restart;
2143 			}
2144 		}
2145 
2146 		/* Zero out the lower 'ptpshift' bits */
2147 		pte32 >>= ptpshift; pte32 <<= ptpshift;
2148 		*gpa = pte32 | (gla & (pgsize - 1));
2149 		goto done;
2150 	}
2151 
2152 	if (paging->paging_mode == PAGING_MODE_PAE) {
2153 		/* Zero out the lower 5 bits and the upper 32 bits */
2154 		ptpphys &= 0xffffffe0UL;
2155 
2156 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
2157 		    &cookie);
2158 		if (ptpbase == NULL)
2159 			goto error;
2160 
2161 		ptpindex = (gla >> 30) & 0x3;
2162 
2163 		pte = ptpbase[ptpindex];
2164 
2165 		if ((pte & PG_V) == 0) {
2166 			if (!check_only) {
2167 				pfcode = pf_error_code(usermode, prot, 0, pte);
2168 				vm_inject_pf(vm, vcpuid, pfcode, gla);
2169 			}
2170 			goto fault;
2171 		}
2172 
2173 		ptpphys = pte;
2174 
2175 		nlevels = 2;
2176 	} else
2177 		nlevels = 4;
2178 	while (--nlevels >= 0) {
2179 		/* Zero out the lower 12 bits and the upper 12 bits */
2180 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
2181 
2182 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
2183 		if (ptpbase == NULL)
2184 			goto error;
2185 
2186 		ptpshift = PAGE_SHIFT + nlevels * 9;
2187 		ptpindex = (gla >> ptpshift) & 0x1FF;
2188 		pgsize = 1UL << ptpshift;
2189 
2190 		pte = ptpbase[ptpindex];
2191 
2192 		if ((pte & PG_V) == 0 ||
2193 		    (usermode && (pte & PG_U) == 0) ||
2194 		    (writable && (pte & PG_RW) == 0)) {
2195 			if (!check_only) {
2196 				pfcode = pf_error_code(usermode, prot, 0, pte);
2197 				vm_inject_pf(vm, vcpuid, pfcode, gla);
2198 			}
2199 			goto fault;
2200 		}
2201 
2202 		/* Set the accessed bit in the page table entry */
2203 		if (!check_only && (pte & PG_A) == 0) {
2204 			if (atomic_cmpset_64(&ptpbase[ptpindex],
2205 			    pte, pte | PG_A) == 0) {
2206 				goto restart;
2207 			}
2208 		}
2209 
2210 		if (nlevels > 0 && (pte & PG_PS) != 0) {
2211 			if (pgsize > 1 * GB) {
2212 				if (!check_only) {
2213 					pfcode = pf_error_code(usermode, prot, 1,
2214 					    pte);
2215 					vm_inject_pf(vm, vcpuid, pfcode, gla);
2216 				}
2217 				goto fault;
2218 			}
2219 			break;
2220 		}
2221 
2222 		ptpphys = pte;
2223 	}
2224 
2225 	/* Set the dirty bit in the page table entry if necessary */
2226 	if (!check_only && writable && (pte & PG_M) == 0) {
2227 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2228 			goto restart;
2229 	}
2230 
2231 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2232 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2233 	*gpa = pte | (gla & (pgsize - 1));
2234 done:
2235 	ptp_release(&cookie);
2236 	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2237 	    __func__, retval));
2238 	return (retval);
2239 error:
2240 	retval = EFAULT;
2241 	goto done;
2242 fault:
2243 	*guest_fault = 1;
2244 	goto done;
2245 }
2246 
2247 int
2248 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2249     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2250 {
2251 
2252 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
2253 	    false));
2254 }
2255 
2256 int
2257 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2258     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2259 {
2260 
2261 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
2262 	    true));
2263 }
2264 
2265 int
2266 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2267     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2268 {
2269 	struct vm_copyinfo copyinfo[2];
2270 	int error, prot;
2271 
2272 	if (inst_length > VIE_INST_SIZE)
2273 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
2274 
2275 	prot = PROT_READ | PROT_EXEC;
2276 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
2277 	    copyinfo, nitems(copyinfo), faultptr);
2278 	if (error || *faultptr)
2279 		return (error);
2280 
2281 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
2282 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2283 	vie->num_valid = inst_length;
2284 	return (0);
2285 }
2286 #endif	/* _KERNEL */
2287 
2288 static int
2289 vie_peek(struct vie *vie, uint8_t *x)
2290 {
2291 
2292 	if (vie->num_processed < vie->num_valid) {
2293 		*x = vie->inst[vie->num_processed];
2294 		return (0);
2295 	} else
2296 		return (-1);
2297 }
2298 
2299 static void
2300 vie_advance(struct vie *vie)
2301 {
2302 
2303 	vie->num_processed++;
2304 }
2305 
2306 static bool
2307 segment_override(uint8_t x, int *seg)
2308 {
2309 
2310 	switch (x) {
2311 	case 0x2E:
2312 		*seg = VM_REG_GUEST_CS;
2313 		break;
2314 	case 0x36:
2315 		*seg = VM_REG_GUEST_SS;
2316 		break;
2317 	case 0x3E:
2318 		*seg = VM_REG_GUEST_DS;
2319 		break;
2320 	case 0x26:
2321 		*seg = VM_REG_GUEST_ES;
2322 		break;
2323 	case 0x64:
2324 		*seg = VM_REG_GUEST_FS;
2325 		break;
2326 	case 0x65:
2327 		*seg = VM_REG_GUEST_GS;
2328 		break;
2329 	default:
2330 		return (false);
2331 	}
2332 	return (true);
2333 }
2334 
2335 static int
2336 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2337 {
2338 	uint8_t x;
2339 
2340 	while (1) {
2341 		if (vie_peek(vie, &x))
2342 			return (-1);
2343 
2344 		if (x == 0x66)
2345 			vie->opsize_override = 1;
2346 		else if (x == 0x67)
2347 			vie->addrsize_override = 1;
2348 		else if (x == 0xF3)
2349 			vie->repz_present = 1;
2350 		else if (x == 0xF2)
2351 			vie->repnz_present = 1;
2352 		else if (segment_override(x, &vie->segment_register))
2353 			vie->segment_override = 1;
2354 		else
2355 			break;
2356 
2357 		vie_advance(vie);
2358 	}
2359 
2360 	/*
2361 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2362 	 * - Only one REX prefix is allowed per instruction.
2363 	 * - The REX prefix must immediately precede the opcode byte or the
2364 	 *   escape opcode byte.
2365 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2366 	 *   the mandatory prefix must come before the REX prefix.
2367 	 */
2368 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2369 		vie->rex_present = 1;
2370 		vie->rex_w = x & 0x8 ? 1 : 0;
2371 		vie->rex_r = x & 0x4 ? 1 : 0;
2372 		vie->rex_x = x & 0x2 ? 1 : 0;
2373 		vie->rex_b = x & 0x1 ? 1 : 0;
2374 		vie_advance(vie);
2375 	}
2376 
2377 	/*
2378 	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
2379 	 */
2380 	if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
2381 	    && x == 0xC4) {
2382 		const struct vie_op *optab;
2383 
2384 		/* 3-byte VEX prefix. */
2385 		vie->vex_present = 1;
2386 
2387 		vie_advance(vie);
2388 		if (vie_peek(vie, &x))
2389 			return (-1);
2390 
2391 		/*
2392 		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
2393 		 * relative to REX encoding.
2394 		 */
2395 		vie->rex_r = x & 0x80 ? 0 : 1;
2396 		vie->rex_x = x & 0x40 ? 0 : 1;
2397 		vie->rex_b = x & 0x20 ? 0 : 1;
2398 
2399 		switch (x & 0x1F) {
2400 		case 0x2:
2401 			/* 0F 38. */
2402 			optab = three_byte_opcodes_0f38;
2403 			break;
2404 		case 0x1:
2405 			/* 0F class - nothing handled here yet. */
2406 			/* FALLTHROUGH */
2407 		case 0x3:
2408 			/* 0F 3A class - nothing handled here yet. */
2409 			/* FALLTHROUGH */
2410 		default:
2411 			/* Reserved (#UD). */
2412 			return (-1);
2413 		}
2414 
2415 		vie_advance(vie);
2416 		if (vie_peek(vie, &x))
2417 			return (-1);
2418 
2419 		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
2420 		vie->rex_w = x & 0x80 ? 1 : 0;
2421 
2422 		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
2423 		vie->vex_l = !!(x & 0x4);
2424 		vie->vex_pp = (x & 0x3);
2425 
2426 		/* PP: 1=66 2=F3 3=F2 prefixes. */
2427 		switch (vie->vex_pp) {
2428 		case 0x1:
2429 			vie->opsize_override = 1;
2430 			break;
2431 		case 0x2:
2432 			vie->repz_present = 1;
2433 			break;
2434 		case 0x3:
2435 			vie->repnz_present = 1;
2436 			break;
2437 		}
2438 
2439 		vie_advance(vie);
2440 
2441 		/* Opcode, sans literal prefix prefix. */
2442 		if (vie_peek(vie, &x))
2443 			return (-1);
2444 
2445 		vie->op = optab[x];
2446 		if (vie->op.op_type == VIE_OP_TYPE_NONE)
2447 			return (-1);
2448 
2449 		vie_advance(vie);
2450 	}
2451 
2452 	/*
2453 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2454 	 */
2455 	if (cpu_mode == CPU_MODE_64BIT) {
2456 		/*
2457 		 * Default address size is 64-bits and default operand size
2458 		 * is 32-bits.
2459 		 */
2460 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2461 		if (vie->rex_w)
2462 			vie->opsize = 8;
2463 		else if (vie->opsize_override)
2464 			vie->opsize = 2;
2465 		else
2466 			vie->opsize = 4;
2467 	} else if (cs_d) {
2468 		/* Default address and operand sizes are 32-bits */
2469 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2470 		vie->opsize = vie->opsize_override ? 2 : 4;
2471 	} else {
2472 		/* Default address and operand sizes are 16-bits */
2473 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2474 		vie->opsize = vie->opsize_override ? 4 : 2;
2475 	}
2476 	return (0);
2477 }
2478 
2479 static int
2480 decode_two_byte_opcode(struct vie *vie)
2481 {
2482 	uint8_t x;
2483 
2484 	if (vie_peek(vie, &x))
2485 		return (-1);
2486 
2487 	vie->op = two_byte_opcodes[x];
2488 
2489 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2490 		return (-1);
2491 
2492 	vie_advance(vie);
2493 	return (0);
2494 }
2495 
2496 static int
2497 decode_opcode(struct vie *vie)
2498 {
2499 	uint8_t x;
2500 
2501 	if (vie_peek(vie, &x))
2502 		return (-1);
2503 
2504 	/* Already did this via VEX prefix. */
2505 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
2506 		return (0);
2507 
2508 	vie->op = one_byte_opcodes[x];
2509 
2510 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2511 		return (-1);
2512 
2513 	vie_advance(vie);
2514 
2515 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2516 		return (decode_two_byte_opcode(vie));
2517 
2518 	return (0);
2519 }
2520 
2521 static int
2522 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2523 {
2524 	uint8_t x;
2525 
2526 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2527 		return (0);
2528 
2529 	if (cpu_mode == CPU_MODE_REAL)
2530 		return (-1);
2531 
2532 	if (vie_peek(vie, &x))
2533 		return (-1);
2534 
2535 	vie->mod = (x >> 6) & 0x3;
2536 	vie->rm =  (x >> 0) & 0x7;
2537 	vie->reg = (x >> 3) & 0x7;
2538 
2539 	/*
2540 	 * A direct addressing mode makes no sense in the context of an EPT
2541 	 * fault. There has to be a memory access involved to cause the
2542 	 * EPT fault.
2543 	 */
2544 	if (vie->mod == VIE_MOD_DIRECT)
2545 		return (-1);
2546 
2547 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2548 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2549 		/*
2550 		 * Table 2-5: Special Cases of REX Encodings
2551 		 *
2552 		 * mod=0, r/m=5 is used in the compatibility mode to
2553 		 * indicate a disp32 without a base register.
2554 		 *
2555 		 * mod!=3, r/m=4 is used in the compatibility mode to
2556 		 * indicate that the SIB byte is present.
2557 		 *
2558 		 * The 'b' bit in the REX prefix is don't care in
2559 		 * this case.
2560 		 */
2561 	} else {
2562 		vie->rm |= (vie->rex_b << 3);
2563 	}
2564 
2565 	vie->reg |= (vie->rex_r << 3);
2566 
2567 	/* SIB */
2568 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2569 		goto done;
2570 
2571 	vie->base_register = gpr_map[vie->rm];
2572 
2573 	switch (vie->mod) {
2574 	case VIE_MOD_INDIRECT_DISP8:
2575 		vie->disp_bytes = 1;
2576 		break;
2577 	case VIE_MOD_INDIRECT_DISP32:
2578 		vie->disp_bytes = 4;
2579 		break;
2580 	case VIE_MOD_INDIRECT:
2581 		if (vie->rm == VIE_RM_DISP32) {
2582 			vie->disp_bytes = 4;
2583 			/*
2584 			 * Table 2-7. RIP-Relative Addressing
2585 			 *
2586 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2587 			 * whereas in compatibility mode it just implies disp32.
2588 			 */
2589 
2590 			if (cpu_mode == CPU_MODE_64BIT)
2591 				vie->base_register = VM_REG_GUEST_RIP;
2592 			else
2593 				vie->base_register = VM_REG_LAST;
2594 		}
2595 		break;
2596 	}
2597 
2598 done:
2599 	vie_advance(vie);
2600 
2601 	return (0);
2602 }
2603 
2604 static int
2605 decode_sib(struct vie *vie)
2606 {
2607 	uint8_t x;
2608 
2609 	/* Proceed only if SIB byte is present */
2610 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2611 		return (0);
2612 
2613 	if (vie_peek(vie, &x))
2614 		return (-1);
2615 
2616 	/* De-construct the SIB byte */
2617 	vie->ss = (x >> 6) & 0x3;
2618 	vie->index = (x >> 3) & 0x7;
2619 	vie->base = (x >> 0) & 0x7;
2620 
2621 	/* Apply the REX prefix modifiers */
2622 	vie->index |= vie->rex_x << 3;
2623 	vie->base |= vie->rex_b << 3;
2624 
2625 	switch (vie->mod) {
2626 	case VIE_MOD_INDIRECT_DISP8:
2627 		vie->disp_bytes = 1;
2628 		break;
2629 	case VIE_MOD_INDIRECT_DISP32:
2630 		vie->disp_bytes = 4;
2631 		break;
2632 	}
2633 
2634 	if (vie->mod == VIE_MOD_INDIRECT &&
2635 	    (vie->base == 5 || vie->base == 13)) {
2636 		/*
2637 		 * Special case when base register is unused if mod = 0
2638 		 * and base = %rbp or %r13.
2639 		 *
2640 		 * Documented in:
2641 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2642 		 * Table 2-5: Special Cases of REX Encodings
2643 		 */
2644 		vie->disp_bytes = 4;
2645 	} else {
2646 		vie->base_register = gpr_map[vie->base];
2647 	}
2648 
2649 	/*
2650 	 * All encodings of 'index' are valid except for %rsp (4).
2651 	 *
2652 	 * Documented in:
2653 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2654 	 * Table 2-5: Special Cases of REX Encodings
2655 	 */
2656 	if (vie->index != 4)
2657 		vie->index_register = gpr_map[vie->index];
2658 
2659 	/* 'scale' makes sense only in the context of an index register */
2660 	if (vie->index_register < VM_REG_LAST)
2661 		vie->scale = 1 << vie->ss;
2662 
2663 	vie_advance(vie);
2664 
2665 	return (0);
2666 }
2667 
2668 static int
2669 decode_displacement(struct vie *vie)
2670 {
2671 	int n, i;
2672 	uint8_t x;
2673 
2674 	union {
2675 		char	buf[4];
2676 		int8_t	signed8;
2677 		int32_t	signed32;
2678 	} u;
2679 
2680 	if ((n = vie->disp_bytes) == 0)
2681 		return (0);
2682 
2683 	if (n != 1 && n != 4)
2684 		panic("decode_displacement: invalid disp_bytes %d", n);
2685 
2686 	for (i = 0; i < n; i++) {
2687 		if (vie_peek(vie, &x))
2688 			return (-1);
2689 
2690 		u.buf[i] = x;
2691 		vie_advance(vie);
2692 	}
2693 
2694 	if (n == 1)
2695 		vie->displacement = u.signed8;		/* sign-extended */
2696 	else
2697 		vie->displacement = u.signed32;		/* sign-extended */
2698 
2699 	return (0);
2700 }
2701 
2702 static int
2703 decode_immediate(struct vie *vie)
2704 {
2705 	int i, n;
2706 	uint8_t x;
2707 	union {
2708 		char	buf[4];
2709 		int8_t	signed8;
2710 		int16_t	signed16;
2711 		int32_t	signed32;
2712 	} u;
2713 
2714 	/* Figure out immediate operand size (if any) */
2715 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2716 		/*
2717 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2718 		 * In 64-bit mode the typical size of immediate operands
2719 		 * remains 32-bits. When the operand size if 64-bits, the
2720 		 * processor sign-extends all immediates to 64-bits prior
2721 		 * to their use.
2722 		 */
2723 		if (vie->opsize == 4 || vie->opsize == 8)
2724 			vie->imm_bytes = 4;
2725 		else
2726 			vie->imm_bytes = 2;
2727 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2728 		vie->imm_bytes = 1;
2729 	}
2730 
2731 	if ((n = vie->imm_bytes) == 0)
2732 		return (0);
2733 
2734 	KASSERT(n == 1 || n == 2 || n == 4,
2735 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2736 
2737 	for (i = 0; i < n; i++) {
2738 		if (vie_peek(vie, &x))
2739 			return (-1);
2740 
2741 		u.buf[i] = x;
2742 		vie_advance(vie);
2743 	}
2744 
2745 	/* sign-extend the immediate value before use */
2746 	if (n == 1)
2747 		vie->immediate = u.signed8;
2748 	else if (n == 2)
2749 		vie->immediate = u.signed16;
2750 	else
2751 		vie->immediate = u.signed32;
2752 
2753 	return (0);
2754 }
2755 
2756 static int
2757 decode_moffset(struct vie *vie)
2758 {
2759 	int i, n;
2760 	uint8_t x;
2761 	union {
2762 		char	buf[8];
2763 		uint64_t u64;
2764 	} u;
2765 
2766 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2767 		return (0);
2768 
2769 	/*
2770 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2771 	 * The memory offset size follows the address-size of the instruction.
2772 	 */
2773 	n = vie->addrsize;
2774 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2775 
2776 	u.u64 = 0;
2777 	for (i = 0; i < n; i++) {
2778 		if (vie_peek(vie, &x))
2779 			return (-1);
2780 
2781 		u.buf[i] = x;
2782 		vie_advance(vie);
2783 	}
2784 	vie->displacement = u.u64;
2785 	return (0);
2786 }
2787 
2788 #ifdef _KERNEL
2789 /*
2790  * Verify that the 'guest linear address' provided as collateral of the nested
2791  * page table fault matches with our instruction decoding.
2792  */
2793 static int
2794 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
2795     enum vm_cpu_mode cpu_mode)
2796 {
2797 	int error;
2798 	uint64_t base, segbase, idx, gla2;
2799 	enum vm_reg_name seg;
2800 	struct seg_desc desc;
2801 
2802 	/* Skip 'gla' verification */
2803 	if (gla == VIE_INVALID_GLA)
2804 		return (0);
2805 
2806 	base = 0;
2807 	if (vie->base_register != VM_REG_LAST) {
2808 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2809 		if (error) {
2810 			printf("verify_gla: error %d getting base reg %d\n",
2811 				error, vie->base_register);
2812 			return (-1);
2813 		}
2814 
2815 		/*
2816 		 * RIP-relative addressing starts from the following
2817 		 * instruction
2818 		 */
2819 		if (vie->base_register == VM_REG_GUEST_RIP)
2820 			base += vie->num_processed;
2821 	}
2822 
2823 	idx = 0;
2824 	if (vie->index_register != VM_REG_LAST) {
2825 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2826 		if (error) {
2827 			printf("verify_gla: error %d getting index reg %d\n",
2828 				error, vie->index_register);
2829 			return (-1);
2830 		}
2831 	}
2832 
2833 	/*
2834 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2835 	 *
2836 	 * In 64-bit mode, segmentation is generally (but not
2837 	 * completely) disabled.  The exceptions are the FS and GS
2838 	 * segments.
2839 	 *
2840 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2841 	 * as the base, the SS segment is the default segment.  For
2842 	 * other data references, except when relative to stack or
2843 	 * string destination the DS segment is the default.  These
2844 	 * can be overridden to allow other segments to be accessed.
2845 	 */
2846 	if (vie->segment_override)
2847 		seg = vie->segment_register;
2848 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2849 	    vie->base_register == VM_REG_GUEST_RBP)
2850 		seg = VM_REG_GUEST_SS;
2851 	else
2852 		seg = VM_REG_GUEST_DS;
2853 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2854 	    seg != VM_REG_GUEST_GS) {
2855 		segbase = 0;
2856 	} else {
2857 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
2858 		if (error) {
2859 			printf("verify_gla: error %d getting segment"
2860 			       " descriptor %d", error,
2861 			       vie->segment_register);
2862 			return (-1);
2863 		}
2864 		segbase = desc.base;
2865 	}
2866 
2867 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2868 	gla2 &= size2mask[vie->addrsize];
2869 	if (gla != gla2) {
2870 		printf("verify_gla mismatch: segbase(0x%0lx)"
2871 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2872 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2873 		       segbase, base, vie->scale, idx, vie->displacement,
2874 		       gla, gla2);
2875 		return (-1);
2876 	}
2877 
2878 	return (0);
2879 }
2880 #endif	/* _KERNEL */
2881 
2882 int
2883 #ifdef _KERNEL
2884 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2885 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2886 #else
2887 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2888 #endif
2889 {
2890 
2891 	if (decode_prefixes(vie, cpu_mode, cs_d))
2892 		return (-1);
2893 
2894 	if (decode_opcode(vie))
2895 		return (-1);
2896 
2897 	if (decode_modrm(vie, cpu_mode))
2898 		return (-1);
2899 
2900 	if (decode_sib(vie))
2901 		return (-1);
2902 
2903 	if (decode_displacement(vie))
2904 		return (-1);
2905 
2906 	if (decode_immediate(vie))
2907 		return (-1);
2908 
2909 	if (decode_moffset(vie))
2910 		return (-1);
2911 
2912 #ifdef _KERNEL
2913 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2914 		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
2915 			return (-1);
2916 	}
2917 #endif
2918 
2919 	vie->decoded = 1;	/* success */
2920 
2921 	return (0);
2922 }
2923