xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 6574b8ed19b093f0af09501d2c9676c28993cb97)
1 /*-
2  * Copyright (c) 2012 Sandvine, Inc.
3  * Copyright (c) 2012 NetApp, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #ifdef _KERNEL
34 #include <sys/param.h>
35 #include <sys/pcpu.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 
39 #include <vm/vm.h>
40 #include <vm/pmap.h>
41 
42 #include <machine/vmparam.h>
43 #include <machine/vmm.h>
44 #else	/* !_KERNEL */
45 #include <sys/types.h>
46 #include <sys/errno.h>
47 #include <sys/_iovec.h>
48 
49 #include <machine/vmm.h>
50 
51 #include <assert.h>
52 #include <vmmapi.h>
53 #define	KASSERT(exp,msg)	assert((exp))
54 #endif	/* _KERNEL */
55 
56 #include <machine/vmm_instruction_emul.h>
57 #include <x86/psl.h>
58 #include <x86/specialreg.h>
59 
60 /* struct vie_op.op_type */
61 enum {
62 	VIE_OP_TYPE_NONE = 0,
63 	VIE_OP_TYPE_MOV,
64 	VIE_OP_TYPE_MOVSX,
65 	VIE_OP_TYPE_MOVZX,
66 	VIE_OP_TYPE_AND,
67 	VIE_OP_TYPE_OR,
68 	VIE_OP_TYPE_SUB,
69 	VIE_OP_TYPE_TWO_BYTE,
70 	VIE_OP_TYPE_PUSH,
71 	VIE_OP_TYPE_CMP,
72 	VIE_OP_TYPE_LAST
73 };
74 
75 /* struct vie_op.op_flags */
76 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
77 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
78 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
79 #define	VIE_OP_F_NO_MODRM	(1 << 3)
80 
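/*
 * Opcodes reached via the 0x0F escape byte, indexed by the second opcode
 * byte. Entries left out default to VIE_OP_TYPE_NONE and are rejected by
 * the decoder.
 */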
81 static const struct vie_op two_byte_opcodes[256] = {
82 	[0xB6] = {
83 		.op_byte = 0xB6,
84 		.op_type = VIE_OP_TYPE_MOVZX,
85 	},
86 	[0xB7] = {
87 		.op_byte = 0xB7,
88 		.op_type = VIE_OP_TYPE_MOVZX,
89 	},
90 	[0xBE] = {
91 		.op_byte = 0xBE,
92 		.op_type = VIE_OP_TYPE_MOVSX,
93 	},
94 };
95 
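/*
 * Single-byte opcodes the emulator understands, indexed by the opcode
 * byte itself. 0x0F redirects to the two-byte opcode table above.
 */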
96 static const struct vie_op one_byte_opcodes[256] = {
97 	[0x0F] = {
98 		.op_byte = 0x0F,
99 		.op_type = VIE_OP_TYPE_TWO_BYTE
100 	},
101 	[0x2B] = {
102 		.op_byte = 0x2B,
103 		.op_type = VIE_OP_TYPE_SUB,
104 	},
105 	[0x3B] = {
106 		.op_byte = 0x3B,
107 		.op_type = VIE_OP_TYPE_CMP,
108 	},
109 	[0x88] = {
110 		.op_byte = 0x88,
111 		.op_type = VIE_OP_TYPE_MOV,
112 	},
113 	[0x89] = {
114 		.op_byte = 0x89,
115 		.op_type = VIE_OP_TYPE_MOV,
116 	},
117 	[0x8A] = {
118 		.op_byte = 0x8A,
119 		.op_type = VIE_OP_TYPE_MOV,
120 	},
121 	[0x8B] = {
122 		.op_byte = 0x8B,
123 		.op_type = VIE_OP_TYPE_MOV,
124 	},
125 	[0xA1] = {
126 		.op_byte = 0xA1,
127 		.op_type = VIE_OP_TYPE_MOV,
128 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
129 	},
130 	[0xA3] = {
131 		.op_byte = 0xA3,
132 		.op_type = VIE_OP_TYPE_MOV,
133 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
134 	},
135 	[0xC6] = {
136 		/* XXX Group 11 extended opcode - not just MOV */
137 		.op_byte = 0xC6,
138 		.op_type = VIE_OP_TYPE_MOV,
139 		.op_flags = VIE_OP_F_IMM8,
140 	},
141 	[0xC7] = {
142 		.op_byte = 0xC7,
143 		.op_type = VIE_OP_TYPE_MOV,
144 		.op_flags = VIE_OP_F_IMM,
145 	},
146 	[0x23] = {
147 		.op_byte = 0x23,
148 		.op_type = VIE_OP_TYPE_AND,
149 	},
150 	[0x81] = {
151 		/* XXX Group 1 extended opcode - not just AND */
152 		.op_byte = 0x81,
153 		.op_type = VIE_OP_TYPE_AND,
154 		.op_flags = VIE_OP_F_IMM,
155 	},
156 	[0x83] = {
157 		/* XXX Group 1 extended opcode - not just OR */
158 		.op_byte = 0x83,
159 		.op_type = VIE_OP_TYPE_OR,
160 		.op_flags = VIE_OP_F_IMM8,
161 	},
162 	[0xFF] = {
163 		/* XXX Group 5 extended opcode - not just PUSH */
164 		.op_byte = 0xFF,
165 		.op_type = VIE_OP_TYPE_PUSH,
166 	}
167 };
168 
169 /* struct vie.mod */
170 #define	VIE_MOD_INDIRECT		0
171 #define	VIE_MOD_INDIRECT_DISP8		1
172 #define	VIE_MOD_INDIRECT_DISP32		2
173 #define	VIE_MOD_DIRECT			3
174 
175 /* struct vie.rm */
176 #define	VIE_RM_SIB			4
177 #define	VIE_RM_DISP32			5
178 
179 #define	GB				(1024 * 1024 * 1024)
180 
181 static enum vm_reg_name gpr_map[16] = {
182 	VM_REG_GUEST_RAX,
183 	VM_REG_GUEST_RCX,
184 	VM_REG_GUEST_RDX,
185 	VM_REG_GUEST_RBX,
186 	VM_REG_GUEST_RSP,
187 	VM_REG_GUEST_RBP,
188 	VM_REG_GUEST_RSI,
189 	VM_REG_GUEST_RDI,
190 	VM_REG_GUEST_R8,
191 	VM_REG_GUEST_R9,
192 	VM_REG_GUEST_R10,
193 	VM_REG_GUEST_R11,
194 	VM_REG_GUEST_R12,
195 	VM_REG_GUEST_R13,
196 	VM_REG_GUEST_R14,
197 	VM_REG_GUEST_R15
198 };
199 
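/* Mask of the low-order bytes selected by an operand size of 1, 2, 4 or 8. */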
200 static uint64_t size2mask[] = {
201 	[1] = 0xff,
202 	[2] = 0xffff,
203 	[4] = 0xffffffff,
204 	[8] = 0xffffffffffffffff,
205 };
206 
207 static int
208 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
209 {
210 	int error;
211 
212 	error = vm_get_register(vm, vcpuid, reg, rval);
213 
214 	return (error);
215 }
216 
217 static void
218 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
219 {
220 	*lhbr = 0;
221 	*reg = gpr_map[vie->reg];
222 
223 	/*
224 	 * 64-bit mode imposes limitations on accessing legacy high byte
225 	 * registers (lhbr).
226 	 *
227 	 * The legacy high-byte registers cannot be addressed if the REX
228 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
229 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
230 	 *
231 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
232 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
233 	 * %ah, %ch, %dh and %bh respectively.
234 	 */
235 	if (!vie->rex_present) {
236 		if (vie->reg & 0x4) {
237 			*lhbr = 1;
238 			*reg = gpr_map[vie->reg & 0x3];
239 		}
240 	}
241 }
242 
243 static int
244 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
245 {
246 	uint64_t val;
247 	int error, lhbr;
248 	enum vm_reg_name reg;
249 
250 	vie_calc_bytereg(vie, &reg, &lhbr);
251 	error = vm_get_register(vm, vcpuid, reg, &val);
252 
253 	/*
254 	 * To obtain the value of a legacy high byte register shift the
255 	 * base register right by 8 bits (%ah = %rax >> 8).
256 	 */
257 	if (lhbr)
258 		*rval = val >> 8;
259 	else
260 		*rval = val;
261 	return (error);
262 }
263 
264 static int
265 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
266 {
267 	uint64_t origval, val, mask;
268 	int error, lhbr;
269 	enum vm_reg_name reg;
270 
271 	vie_calc_bytereg(vie, &reg, &lhbr);
272 	error = vm_get_register(vm, vcpuid, reg, &origval);
273 	if (error == 0) {
274 		val = byte;
275 		mask = 0xff;
276 		if (lhbr) {
277 			/*
278 			 * Shift left by 8 to store 'byte' in a legacy high
279 			 * byte register.
280 			 */
281 			val <<= 8;
282 			mask <<= 8;
283 		}
284 		val |= origval & ~mask;
285 		error = vm_set_register(vm, vcpuid, reg, val);
286 	}
287 	return (error);
288 }
289 
290 int
291 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
292 		    uint64_t val, int size)
293 {
294 	int error;
295 	uint64_t origval;
296 
297 	switch (size) {
298 	case 1:
299 	case 2:
300 		error = vie_read_register(vm, vcpuid, reg, &origval);
301 		if (error)
302 			return (error);
303 		val &= size2mask[size];
304 		val |= origval & ~size2mask[size];
305 		break;
306 	case 4:
307 		val &= 0xffffffffUL;
308 		break;
309 	case 8:
310 		break;
311 	default:
312 		return (EINVAL);
313 	}
314 
315 	error = vm_set_register(vm, vcpuid, reg, val);
316 	return (error);
317 }
318 
319 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
320 
321 /*
322  * Return the status flags that would result from doing (x - y).
323  */
324 #define	GETCC(sz)							\
325 static u_long								\
326 getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
327 {									\
328 	u_long rflags;							\
329 									\
330 	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
331 	    "=r" (rflags), "+r" (x) : "m" (y));				\
332 	return (rflags);						\
333 } struct __hack
334 
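/*
 * GETCC(8), for example, expands to a getcc8() function that subtracts
 * two uint8_t operands and returns the resulting %rflags. The trailing
 * 'struct __hack' merely consumes the semicolon at the macro call site.
 */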
335 GETCC(8);
336 GETCC(16);
337 GETCC(32);
338 GETCC(64);
339 
340 static u_long
341 getcc(int opsize, uint64_t x, uint64_t y)
342 {
343 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
344 	    ("getcc: invalid operand size %d", opsize));
345 
346 	if (opsize == 1)
347 		return (getcc8(x, y));
348 	else if (opsize == 2)
349 		return (getcc16(x, y));
350 	else if (opsize == 4)
351 		return (getcc32(x, y));
352 	else
353 		return (getcc64(x, y));
354 }
355 
356 static int
357 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
358 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
359 {
360 	int error, size;
361 	enum vm_reg_name reg;
362 	uint8_t byte;
363 	uint64_t val;
364 
365 	size = vie->opsize;
366 	error = EINVAL;
367 
368 	switch (vie->op.op_byte) {
369 	case 0x88:
370 		/*
371 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
372 		 * 88/r:	mov r/m8, r8
373 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
374 		 */
375 		size = 1;	/* override for byte operation */
376 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
377 		if (error == 0)
378 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
379 		break;
380 	case 0x89:
381 		/*
382 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
383 		 * 89/r:	mov r/m16, r16
384 		 * 89/r:	mov r/m32, r32
385 		 * REX.W + 89/r:	mov r/m64, r64
386 		 */
387 		reg = gpr_map[vie->reg];
388 		error = vie_read_register(vm, vcpuid, reg, &val);
389 		if (error == 0) {
390 			val &= size2mask[size];
391 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
392 		}
393 		break;
394 	case 0x8A:
395 		/*
396 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
397 		 * 8A/r:	mov r8, r/m8
398 		 * REX + 8A/r:	mov r8, r/m8
399 		 */
400 		size = 1;	/* override for byte operation */
401 		error = memread(vm, vcpuid, gpa, &val, size, arg);
402 		if (error == 0)
403 			error = vie_write_bytereg(vm, vcpuid, vie, val);
404 		break;
405 	case 0x8B:
406 		/*
407 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
408 		 * 8B/r:	mov r16, r/m16
409 		 * 8B/r:	mov r32, r/m32
410 		 * REX.W + 8B/r:	mov r64, r/m64
411 		 */
412 		error = memread(vm, vcpuid, gpa, &val, size, arg);
413 		if (error == 0) {
414 			reg = gpr_map[vie->reg];
415 			error = vie_update_register(vm, vcpuid, reg, val, size);
416 		}
417 		break;
418 	case 0xA1:
419 		/*
420 		 * MOV from seg:moffset to AX/EAX/RAX
421 		 * A1:		mov AX, moffs16
422 		 * A1:		mov EAX, moffs32
423 		 * REX.W + A1:	mov RAX, moffs64
424 		 */
425 		error = memread(vm, vcpuid, gpa, &val, size, arg);
426 		if (error == 0) {
427 			reg = VM_REG_GUEST_RAX;
428 			error = vie_update_register(vm, vcpuid, reg, val, size);
429 		}
430 		break;
431 	case 0xA3:
432 		/*
433 		 * MOV from AX/EAX/RAX to seg:moffset
434 		 * A3:		mov moffs16, AX
435 		 * A3:		mov moffs32, EAX
436 		 * REX.W + A3:	mov moffs64, RAX
437 		 */
438 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
439 		if (error == 0) {
440 			val &= size2mask[size];
441 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
442 		}
443 		break;
444 	case 0xC6:
445 		/*
446 		 * MOV from imm8 to mem (ModRM:r/m)
447 		 * C6/0		mov r/m8, imm8
448 		 * REX + C6/0	mov r/m8, imm8
449 		 */
450 		size = 1;	/* override for byte operation */
451 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
452 		break;
453 	case 0xC7:
454 		/*
455 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
456 		 * C7/0		mov r/m16, imm16
457 		 * C7/0		mov r/m32, imm32
458 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
459 		 */
460 		val = vie->immediate & size2mask[size];
461 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
462 		break;
463 	default:
464 		break;
465 	}
466 
467 	return (error);
468 }
469 
470 static int
471 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
472 	     mem_region_read_t memread, mem_region_write_t memwrite,
473 	     void *arg)
474 {
475 	int error, size;
476 	enum vm_reg_name reg;
477 	uint64_t val;
478 
479 	size = vie->opsize;
480 	error = EINVAL;
481 
482 	switch (vie->op.op_byte) {
483 	case 0xB6:
484 		/*
485 		 * MOV and zero extend byte from mem (ModRM:r/m) to
486 		 * reg (ModRM:reg).
487 		 *
488 		 * 0F B6/r		movzx r16, r/m8
489 		 * 0F B6/r		movzx r32, r/m8
490 		 * REX.W + 0F B6/r	movzx r64, r/m8
491 		 */
492 
493 		/* get the first operand */
494 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
495 		if (error)
496 			break;
497 
498 		/* get the second operand */
499 		reg = gpr_map[vie->reg];
500 
501 		/* zero-extend byte */
502 		val = (uint8_t)val;
503 
504 		/* write the result */
505 		error = vie_update_register(vm, vcpuid, reg, val, size);
506 		break;
507 	case 0xB7:
508 		/*
509 		 * MOV and zero extend word from mem (ModRM:r/m) to
510 		 * reg (ModRM:reg).
511 		 *
512 		 * 0F B7/r		movzx r32, r/m16
513 		 * REX.W + 0F B7/r	movzx r64, r/m16
514 		 */
515 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
516 		if (error)
517 			return (error);
518 
519 		reg = gpr_map[vie->reg];
520 
521 		/* zero-extend word */
522 		val = (uint16_t)val;
523 
524 		error = vie_update_register(vm, vcpuid, reg, val, size);
525 		break;
526 	case 0xBE:
527 		/*
528 		 * MOV and sign extend byte from mem (ModRM:r/m) to
529 		 * reg (ModRM:reg).
530 		 *
531 		 * 0F BE/r		movsx r16, r/m8
532 		 * 0F BE/r		movsx r32, r/m8
533 		 * REX.W + 0F BE/r	movsx r64, r/m8
534 		 */
535 
536 		/* get the first operand */
537 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
538 		if (error)
539 			break;
540 
541 		/* get the second operand */
542 		reg = gpr_map[vie->reg];
543 
544 		/* sign extend byte */
545 		val = (int8_t)val;
546 
547 		/* write the result */
548 		error = vie_update_register(vm, vcpuid, reg, val, size);
549 		break;
550 	default:
551 		break;
552 	}
553 	return (error);
554 }
555 
556 static int
557 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
558 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
559 {
560 	int error, size;
561 	enum vm_reg_name reg;
562 	uint64_t result, rflags, rflags2, val1, val2;
563 
564 	size = vie->opsize;
565 	error = EINVAL;
566 
567 	switch (vie->op.op_byte) {
568 	case 0x23:
569 		/*
570 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
571 		 * result in reg.
572 		 *
573 		 * 23/r		and r16, r/m16
574 		 * 23/r		and r32, r/m32
575 		 * REX.W + 23/r	and r64, r/m64
576 		 */
577 
578 		/* get the first operand */
579 		reg = gpr_map[vie->reg];
580 		error = vie_read_register(vm, vcpuid, reg, &val1);
581 		if (error)
582 			break;
583 
584 		/* get the second operand */
585 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
586 		if (error)
587 			break;
588 
589 		/* perform the operation and write the result */
590 		result = val1 & val2;
591 		error = vie_update_register(vm, vcpuid, reg, result, size);
592 		break;
593 	case 0x81:
594 		/*
595 		 * AND/OR mem (ModRM:r/m) with immediate and store the
596 		 * result in mem.
597 		 *
598 		 * AND: i = 4
599 		 * OR:  i = 1
600 		 * 81 /i		op r/m16, imm16
601 		 * 81 /i		op r/m32, imm32
602 		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
603 		 *
604 		 */
605 
606 		/* get the first operand */
607 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
608 		if (error)
609 			break;
610 
611 		/*
612 		 * perform the operation with the pre-fetched immediate
613 		 * operand and write the result
614 		 */
615 		switch (vie->reg & 7) {
616 		case 0x4:
617 			/* modrm:reg == b100, AND */
618 			result = val1 & vie->immediate;
619 			break;
620 		case 0x1:
621 			/* modrm:reg == b001, OR */
622 			result = val1 | vie->immediate;
623 			break;
624 		default:
625 			error = EINVAL;
626 			break;
627 		}
628 		if (error)
629 			break;
630 
631 		error = memwrite(vm, vcpuid, gpa, result, size, arg);
632 		break;
633 	default:
634 		break;
635 	}
636 	if (error)
637 		return (error);
638 
639 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
640 	if (error)
641 		return (error);
642 
643 	/*
644 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
645 	 * to the result; AF is undefined.
646 	 *
647 	 * The updated status flags are obtained by subtracting 0 from 'result'.
648 	 */
649 	rflags2 = getcc(size, result, 0);
650 	rflags &= ~RFLAGS_STATUS_BITS;
651 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
652 
653 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
654 	return (error);
655 }
656 
657 static int
658 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
659 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
660 {
661 	int error, size;
662 	uint64_t val1, result, rflags, rflags2;
663 
664 	size = vie->opsize;
665 	error = EINVAL;
666 
667 	switch (vie->op.op_byte) {
668 	case 0x83:
669 		/*
670 		 * OR mem (ModRM:r/m) with immediate and store the
671 		 * result in mem.
672 		 *
673 		 * 83 /1		OR r/m16, imm8 sign-extended to 16
674 		 * 83 /1		OR r/m32, imm8 sign-extended to 32
675 		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
676 		 *
677 		 * Currently, only the OR operation of the 0x83 opcode
678 		 * is implemented (ModRM:reg = b001).
679 		 */
680 		if ((vie->reg & 7) != 1)
681 			break;
682 
683 		/* get the first operand */
684 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
685 		if (error)
686 			break;
687 
688 		/*
689 		 * perform the operation with the pre-fetched immediate
690 		 * operand and write the result
691 		 */
692 		result = val1 | vie->immediate;
693 		error = memwrite(vm, vcpuid, gpa, result, size, arg);
694 		break;
695 	default:
696 		break;
697 	}
698 	if (error)
699 		return (error);
700 
701 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
702 	if (error)
703 		return (error);
704 
705 	/*
706 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
707 	 * to the result; AF is undefined.
708 	 *
709 	 * The updated status flags are obtained by subtracting 0 from 'result'.
710 	 */
711 	rflags2 = getcc(size, result, 0);
712 	rflags &= ~RFLAGS_STATUS_BITS;
713 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
714 
715 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
716 	return (error);
717 }
718 
719 static int
720 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
721 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
722 {
723 	int error, size;
724 	uint64_t op1, op2, rflags, rflags2;
725 	enum vm_reg_name reg;
726 
727 	size = vie->opsize;
728 	switch (vie->op.op_byte) {
729 	case 0x3B:
730 		/*
731 		 * 3B/r		CMP r16, r/m16
732 		 * 3B/r		CMP r32, r/m32
733 		 * REX.W + 3B/r	CMP r64, r/m64
734 		 *
735 		 * Compare first operand (reg) with second operand (r/m) and
736 		 * set status flags in EFLAGS register. The comparison is
737 		 * performed by subtracting the second operand from the first
738 		 * operand and then setting the status flags.
739 		 */
740 
741 		/* Get the first operand */
742 		reg = gpr_map[vie->reg];
743 		error = vie_read_register(vm, vcpuid, reg, &op1);
744 		if (error)
745 			return (error);
746 
747 		/* Get the second operand */
748 		error = memread(vm, vcpuid, gpa, &op2, size, arg);
749 		if (error)
750 			return (error);
751 
752 		break;
753 	default:
754 		return (EINVAL);
755 	}
756 	rflags2 = getcc(size, op1, op2);
757 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
758 	if (error)
759 		return (error);
760 	rflags &= ~RFLAGS_STATUS_BITS;
761 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
762 
763 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
764 	return (error);
765 }
766 
767 static int
768 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
769 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
770 {
771 	int error, size;
772 	uint64_t nval, rflags, rflags2, val1, val2;
773 	enum vm_reg_name reg;
774 
775 	size = vie->opsize;
776 	error = EINVAL;
777 
778 	switch (vie->op.op_byte) {
779 	case 0x2B:
780 		/*
781 		 * SUB r/m from r and store the result in r
782 		 *
783 		 * 2B/r            SUB r16, r/m16
784 		 * 2B/r            SUB r32, r/m32
785 		 * REX.W + 2B/r    SUB r64, r/m64
786 		 */
787 
788 		/* get the first operand */
789 		reg = gpr_map[vie->reg];
790 		error = vie_read_register(vm, vcpuid, reg, &val1);
791 		if (error)
792 			break;
793 
794 		/* get the second operand */
795 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
796 		if (error)
797 			break;
798 
799 		/* perform the operation and write the result */
800 		nval = val1 - val2;
801 		error = vie_update_register(vm, vcpuid, reg, nval, size);
802 		break;
803 	default:
804 		break;
805 	}
806 
807 	if (!error) {
808 		rflags2 = getcc(size, val1, val2);
809 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
810 		    &rflags);
811 		if (error)
812 			return (error);
813 
814 		rflags &= ~RFLAGS_STATUS_BITS;
815 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
816 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
817 		    rflags, 8);
818 	}
819 
820 	return (error);
821 }
822 
823 static int
824 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
825     struct vm_guest_paging *paging, mem_region_read_t memread,
826     mem_region_write_t memwrite, void *arg)
827 {
828 #ifdef _KERNEL
829 	struct vm_copyinfo copyinfo[2];
830 #else
831 	struct iovec copyinfo[2];
832 #endif
833 	struct seg_desc ss_desc;
834 	uint64_t cr0, rflags, rsp, stack_gla, val;
835 	int error, size, stackaddrsize;
836 
837 	/*
838 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
839 	 *
840 	 * PUSH is part of the group 5 extended opcodes and is identified
841 	 * by ModRM:reg = b110.
842 	 */
843 	if ((vie->reg & 7) != 6)
844 		return (EINVAL);
845 
846 	size = vie->opsize;
847 	/*
848 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
849 	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
850 	if (paging->cpu_mode == CPU_MODE_REAL) {
851 		stackaddrsize = 2;
852 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
853 		/*
854 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
855 		 * - Stack pointer size is always 64-bits.
856 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
857 		 * - 16-bit PUSH/POP is supported by using the operand size
858 		 *   override prefix (66H).
859 		 */
860 		stackaddrsize = 8;
861 		size = vie->opsize_override ? 2 : 8;
862 	} else {
863 		/*
864 		 * In protected or compatibility mode the 'B' flag in the
865 		 * stack-segment descriptor determines the size of the
866 		 * stack pointer.
867 		 */
868 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
869 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
870 		    __func__, error));
871 		if (SEG_DESC_DEF32(ss_desc.access))
872 			stackaddrsize = 4;
873 		else
874 			stackaddrsize = 2;
875 	}
876 
877 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
878 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
879 
880 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
881 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
882 
883 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
884 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
885 
886 	rsp -= size;
887 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
888 	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
889 		vm_inject_ss(vm, vcpuid, 0);
890 		return (0);
891 	}
892 
893 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
894 		vm_inject_ss(vm, vcpuid, 0);
895 		return (0);
896 	}
897 
898 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
899 		vm_inject_ac(vm, vcpuid, 0);
900 		return (0);
901 	}
902 
903 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
904 	    copyinfo, nitems(copyinfo));
905 	if (error == -1) {
906 		/*
907 		 * XXX cannot return a negative error value here because it
908 		 * ends up being the return value of the VM_RUN() ioctl and
909 		 * is interpreted as a pseudo-error (e.g. ERESTART).
910 		 */
911 		return (EFAULT);
912 	} else if (error == 1) {
913 		/* Resume guest execution to handle page fault */
914 		return (0);
915 	}
916 
917 	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
918 	if (error == 0) {
919 		vm_copyout(vm, vcpuid, &val, copyinfo, size);
920 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
921 		    stackaddrsize);
922 		KASSERT(error == 0, ("error %d updating rsp", error));
923 	}
924 #ifdef _KERNEL
925 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
926 #endif
927 	return (error);
928 }
929 
930 int
931 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
932     struct vm_guest_paging *paging, mem_region_read_t memread,
933     mem_region_write_t memwrite, void *memarg)
934 {
935 	int error;
936 
937 	if (!vie->decoded)
938 		return (EINVAL);
939 
940 	switch (vie->op.op_type) {
941 	case VIE_OP_TYPE_PUSH:
942 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
943 		    memwrite, memarg);
944 		break;
945 	case VIE_OP_TYPE_CMP:
946 		error = emulate_cmp(vm, vcpuid, gpa, vie,
947 				    memread, memwrite, memarg);
948 		break;
949 	case VIE_OP_TYPE_MOV:
950 		error = emulate_mov(vm, vcpuid, gpa, vie,
951 				    memread, memwrite, memarg);
952 		break;
953 	case VIE_OP_TYPE_MOVSX:
954 	case VIE_OP_TYPE_MOVZX:
955 		error = emulate_movx(vm, vcpuid, gpa, vie,
956 				     memread, memwrite, memarg);
957 		break;
958 	case VIE_OP_TYPE_AND:
959 		error = emulate_and(vm, vcpuid, gpa, vie,
960 				    memread, memwrite, memarg);
961 		break;
962 	case VIE_OP_TYPE_OR:
963 		error = emulate_or(vm, vcpuid, gpa, vie,
964 				    memread, memwrite, memarg);
965 		break;
966 	case VIE_OP_TYPE_SUB:
967 		error = emulate_sub(vm, vcpuid, gpa, vie,
968 				    memread, memwrite, memarg);
969 		break;
970 	default:
971 		error = EINVAL;
972 		break;
973 	}
974 
975 	return (error);
976 }
977 
978 int
979 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
980 {
981 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
982 	    ("%s: invalid size %d", __func__, size));
983 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
984 
985 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
986 		return (0);
987 
988 	return ((gla & (size - 1)) ? 1 : 0);
989 }
990 
991 int
992 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
993 {
994 	uint64_t mask;
995 
996 	if (cpu_mode != CPU_MODE_64BIT)
997 		return (0);
998 
999 	/*
1000 	 * The value of bit 47 in the 'gla' should be replicated in the
1001 	 * most significant 16 bits.
1002 	 */
1003 	mask = ~((1UL << 48) - 1);
1004 	if (gla & (1UL << 47))
1005 		return ((gla & mask) != mask);
1006 	else
1007 		return ((gla & mask) != 0);
1008 }
1009 
1010 uint64_t
1011 vie_size2mask(int size)
1012 {
1013 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1014 	    ("vie_size2mask: invalid size %d", size));
1015 	return (size2mask[size]);
1016 }
1017 
1018 int
1019 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1020     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1021     int prot, uint64_t *gla)
1022 {
1023 	uint64_t firstoff, low_limit, high_limit, segbase;
1024 	int glasize, type;
1025 
1026 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1027 	    ("%s: invalid segment %d", __func__, seg));
1028 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1029 	    ("%s: invalid operand size %d", __func__, length));
1030 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1031 	    ("%s: invalid prot %#x", __func__, prot));
1032 
1033 	firstoff = offset;
1034 	if (cpu_mode == CPU_MODE_64BIT) {
1035 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1036 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1037 		glasize = 8;
1038 	} else {
1039 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1040 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1041 		glasize = 4;
1042 		/*
1043 		 * If the segment selector is loaded with a NULL selector
1044 		 * then the descriptor is unusable and attempting to use
1045 		 * it results in a #GP(0).
1046 		 */
1047 		if (SEG_DESC_UNUSABLE(desc->access))
1048 			return (-1);
1049 
1050 		/*
1051 		 * The processor generates a #NP exception when a segment
1052 		 * register is loaded with a selector that points to a
1053 		 * descriptor that is not present. If this was the case then
1054 		 * it would have been checked before the VM-exit.
1055 		 */
1056 		KASSERT(SEG_DESC_PRESENT(desc->access),
1057 		    ("segment %d not present: %#x", seg, desc->access));
1058 
1059 		/*
1060 		 * The descriptor type must indicate a code/data segment.
1061 		 */
1062 		type = SEG_DESC_TYPE(desc->access);
1063 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1064 		    "descriptor type %#x", seg, type));
1065 
1066 		if (prot & PROT_READ) {
1067 			/* #GP on a read access to an exec-only code segment */
1068 			if ((type & 0xA) == 0x8)
1069 				return (-1);
1070 		}
1071 
1072 		if (prot & PROT_WRITE) {
1073 			/*
1074 			 * #GP on a write access to a code segment or a
1075 			 * read-only data segment.
1076 			 */
1077 			if (type & 0x8)			/* code segment */
1078 				return (-1);
1079 
1080 			if ((type & 0xA) == 0)		/* read-only data seg */
1081 				return (-1);
1082 		}
1083 
1084 		/*
1085 		 * 'desc->limit' is fully expanded taking granularity into
1086 		 * account.
1087 		 */
1088 		if ((type & 0xC) == 0x4) {
1089 			/* expand-down data segment */
1090 			low_limit = desc->limit + 1;
1091 			high_limit = SEG_DESC_DEF32(desc->access) ?
1092 			    0xffffffff : 0xffff;
1093 		} else {
1094 			/* code segment or expand-up data segment */
1095 			low_limit = 0;
1096 			high_limit = desc->limit;
1097 		}
1098 
1099 		while (length > 0) {
1100 			offset &= vie_size2mask(addrsize);
1101 			if (offset < low_limit || offset > high_limit)
1102 				return (-1);
1103 			offset++;
1104 			length--;
1105 		}
1106 	}
1107 
1108 	/*
1109 	 * In 64-bit mode all segments except %fs and %gs have a segment
1110 	 * base address of 0.
1111 	 */
1112 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1113 	    seg != VM_REG_GUEST_GS) {
1114 		segbase = 0;
1115 	} else {
1116 		segbase = desc->base;
1117 	}
1118 
1119 	/*
1120 	 * Truncate 'firstoff' to the effective address size before adding
1121 	 * it to the segment base.
1122 	 */
1123 	firstoff &= vie_size2mask(addrsize);
1124 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1125 	return (0);
1126 }
1127 
1128 #ifdef _KERNEL
1129 void
1130 vie_init(struct vie *vie)
1131 {
1132 
1133 	bzero(vie, sizeof(struct vie));
1134 
1135 	vie->base_register = VM_REG_LAST;
1136 	vie->index_register = VM_REG_LAST;
1137 }
1138 
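/*
 * Construct the error code that accompanies a page fault injected into the
 * guest (present, write, user, reserved-bit and instruction-fetch bits).
 */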
1139 static int
1140 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1141 {
1142 	int error_code = 0;
1143 
1144 	if (pte & PG_V)
1145 		error_code |= PGEX_P;
1146 	if (prot & VM_PROT_WRITE)
1147 		error_code |= PGEX_W;
1148 	if (usermode)
1149 		error_code |= PGEX_U;
1150 	if (rsvd)
1151 		error_code |= PGEX_RSV;
1152 	if (prot & VM_PROT_EXECUTE)
1153 		error_code |= PGEX_I;
1154 
1155 	return (error_code);
1156 }
1157 
1158 static void
1159 ptp_release(void **cookie)
1160 {
1161 	if (*cookie != NULL) {
1162 		vm_gpa_release(*cookie);
1163 		*cookie = NULL;
1164 	}
1165 }
1166 
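/*
 * Drop any page-table page held previously and wire the one at 'ptpphys'
 * for read/write access so the accessed/dirty bits can be updated.
 */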
1167 static void *
1168 ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
1169 {
1170 	void *ptr;
1171 
1172 	ptp_release(cookie);
1173 	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
1174 	return (ptr);
1175 }
1176 
1177 int
1178 vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1179     uint64_t gla, int prot, uint64_t *gpa)
1180 {
1181 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1182 	u_int retries;
1183 	uint64_t *ptpbase, ptpphys, pte, pgsize;
1184 	uint32_t *ptpbase32, pte32;
1185 	void *cookie;
1186 
1187 	usermode = (paging->cpl == 3 ? 1 : 0);
1188 	writable = prot & VM_PROT_WRITE;
1189 	cookie = NULL;
1190 	retval = 0;
1191 	retries = 0;
1192 restart:
1193 	ptpphys = paging->cr3;		/* root of the page tables */
1194 	ptp_release(&cookie);
1195 	if (retries++ > 0)
1196 		maybe_yield();
1197 
1198 	if (vie_canonical_check(paging->cpu_mode, gla)) {
1199 		/*
1200 		 * XXX assuming a non-stack reference otherwise a stack fault
1201 		 * should be generated.
1202 		 */
1203 		vm_inject_gp(vm, vcpuid);
1204 		goto fault;
1205 	}
1206 
1207 	if (paging->paging_mode == PAGING_MODE_FLAT) {
1208 		*gpa = gla;
1209 		goto done;
1210 	}
1211 
1212 	if (paging->paging_mode == PAGING_MODE_32) {
1213 		nlevels = 2;
1214 		while (--nlevels >= 0) {
1215 			/* Zero out the lower 12 bits. */
1216 			ptpphys &= ~0xfff;
1217 
1218 			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1219 
1220 			if (ptpbase32 == NULL)
1221 				goto error;
1222 
1223 			ptpshift = PAGE_SHIFT + nlevels * 10;
1224 			ptpindex = (gla >> ptpshift) & 0x3FF;
1225 			pgsize = 1UL << ptpshift;
1226 
1227 			pte32 = ptpbase32[ptpindex];
1228 
1229 			if ((pte32 & PG_V) == 0 ||
1230 			    (usermode && (pte32 & PG_U) == 0) ||
1231 			    (writable && (pte32 & PG_RW) == 0)) {
1232 				pfcode = pf_error_code(usermode, prot, 0,
1233 				    pte32);
1234 				vm_inject_pf(vm, vcpuid, pfcode, gla);
1235 				goto fault;
1236 			}
1237 
1238 			/*
1239 			 * Emulate the x86 MMU's management of the accessed
1240 			 * and dirty flags. While the accessed flag is set
1241 			 * at every level of the page table, the dirty flag
1242 			 * is only set at the last level providing the guest
1243 			 * physical address.
1244 			 */
1245 			if ((pte32 & PG_A) == 0) {
1246 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
1247 				    pte32, pte32 | PG_A) == 0) {
1248 					goto restart;
1249 				}
1250 			}
1251 
1252 			/* XXX must be ignored if CR4.PSE=0 */
1253 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
1254 				break;
1255 
1256 			ptpphys = pte32;
1257 		}
1258 
1259 		/* Set the dirty bit in the page table entry if necessary */
1260 		if (writable && (pte32 & PG_M) == 0) {
1261 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
1262 			    pte32, pte32 | PG_M) == 0) {
1263 				goto restart;
1264 			}
1265 		}
1266 
1267 		/* Zero out the lower 'ptpshift' bits */
1268 		pte32 >>= ptpshift; pte32 <<= ptpshift;
1269 		*gpa = pte32 | (gla & (pgsize - 1));
1270 		goto done;
1271 	}
1272 
1273 	if (paging->paging_mode == PAGING_MODE_PAE) {
1274 		/* Zero out the lower 5 bits and the upper 32 bits */
1275 		ptpphys &= 0xffffffe0UL;
1276 
1277 		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
1278 		if (ptpbase == NULL)
1279 			goto error;
1280 
1281 		ptpindex = (gla >> 30) & 0x3;
1282 
1283 		pte = ptpbase[ptpindex];
1284 
1285 		if ((pte & PG_V) == 0) {
1286 			pfcode = pf_error_code(usermode, prot, 0, pte);
1287 			vm_inject_pf(vm, vcpuid, pfcode, gla);
1288 			goto fault;
1289 		}
1290 
1291 		ptpphys = pte;
1292 
1293 		nlevels = 2;
1294 	} else
1295 		nlevels = 4;
1296 	while (--nlevels >= 0) {
1297 		/* Zero out the lower 12 bits and the upper 12 bits */
1298 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
1299 
1300 		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1301 		if (ptpbase == NULL)
1302 			goto error;
1303 
1304 		ptpshift = PAGE_SHIFT + nlevels * 9;
1305 		ptpindex = (gla >> ptpshift) & 0x1FF;
1306 		pgsize = 1UL << ptpshift;
1307 
1308 		pte = ptpbase[ptpindex];
1309 
1310 		if ((pte & PG_V) == 0 ||
1311 		    (usermode && (pte & PG_U) == 0) ||
1312 		    (writable && (pte & PG_RW) == 0)) {
1313 			pfcode = pf_error_code(usermode, prot, 0, pte);
1314 			vm_inject_pf(vm, vcpuid, pfcode, gla);
1315 			goto fault;
1316 		}
1317 
1318 		/* Set the accessed bit in the page table entry */
1319 		if ((pte & PG_A) == 0) {
1320 			if (atomic_cmpset_64(&ptpbase[ptpindex],
1321 			    pte, pte | PG_A) == 0) {
1322 				goto restart;
1323 			}
1324 		}
1325 
1326 		if (nlevels > 0 && (pte & PG_PS) != 0) {
1327 			if (pgsize > 1 * GB) {
1328 				pfcode = pf_error_code(usermode, prot, 1, pte);
1329 				vm_inject_pf(vm, vcpuid, pfcode, gla);
1330 				goto fault;
1331 			}
1332 			break;
1333 		}
1334 
1335 		ptpphys = pte;
1336 	}
1337 
1338 	/* Set the dirty bit in the page table entry if necessary */
1339 	if (writable && (pte & PG_M) == 0) {
1340 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
1341 			goto restart;
1342 	}
1343 
1344 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
1345 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
1346 	*gpa = pte | (gla & (pgsize - 1));
1347 done:
1348 	ptp_release(&cookie);
1349 	return (retval);
1350 error:
1351 	retval = -1;
1352 	goto done;
1353 fault:
1354 	retval = 1;
1355 	goto done;
1356 }
1357 
1358 int
1359 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1360     uint64_t rip, int inst_length, struct vie *vie)
1361 {
1362 	struct vm_copyinfo copyinfo[2];
1363 	int error, prot;
1364 
1365 	if (inst_length > VIE_INST_SIZE)
1366 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
1367 
1368 	prot = PROT_READ | PROT_EXEC;
1369 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
1370 	    copyinfo, nitems(copyinfo));
1371 	if (error == 0) {
1372 		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
1373 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1374 		vie->num_valid = inst_length;
1375 	}
1376 	return (error);
1377 }
1378 
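/*
 * Peek at the next unconsumed byte of the fetched instruction without
 * advancing the decode cursor; returns -1 if the buffer is exhausted.
 */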
1379 static int
1380 vie_peek(struct vie *vie, uint8_t *x)
1381 {
1382 
1383 	if (vie->num_processed < vie->num_valid) {
1384 		*x = vie->inst[vie->num_processed];
1385 		return (0);
1386 	} else
1387 		return (-1);
1388 }
1389 
1390 static void
1391 vie_advance(struct vie *vie)
1392 {
1393 
1394 	vie->num_processed++;
1395 }
1396 
1397 static int
1398 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
1399 {
1400 	uint8_t x;
1401 
1402 	while (1) {
1403 		if (vie_peek(vie, &x))
1404 			return (-1);
1405 
1406 		if (x == 0x66)
1407 			vie->opsize_override = 1;
1408 		else if (x == 0x67)
1409 			vie->addrsize_override = 1;
1410 		else
1411 			break;
1412 
1413 		vie_advance(vie);
1414 	}
1415 
1416 	/*
1417 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
1418 	 * - Only one REX prefix is allowed per instruction.
1419 	 * - The REX prefix must immediately precede the opcode byte or the
1420 	 *   escape opcode byte.
1421 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
1422 	 *   the mandatory prefix must come before the REX prefix.
1423 	 */
1424 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
1425 		vie->rex_present = 1;
1426 		vie->rex_w = x & 0x8 ? 1 : 0;
1427 		vie->rex_r = x & 0x4 ? 1 : 0;
1428 		vie->rex_x = x & 0x2 ? 1 : 0;
1429 		vie->rex_b = x & 0x1 ? 1 : 0;
1430 		vie_advance(vie);
1431 	}
1432 
1433 	/*
1434 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
1435 	 */
1436 	if (cpu_mode == CPU_MODE_64BIT) {
1437 		/*
1438 		 * Default address size is 64-bits and default operand size
1439 		 * is 32-bits.
1440 		 */
1441 		vie->addrsize = vie->addrsize_override ? 4 : 8;
1442 		if (vie->rex_w)
1443 			vie->opsize = 8;
1444 		else if (vie->opsize_override)
1445 			vie->opsize = 2;
1446 		else
1447 			vie->opsize = 4;
1448 	} else if (cs_d) {
1449 		/* Default address and operand sizes are 32-bits */
1450 		vie->addrsize = vie->addrsize_override ? 2 : 4;
1451 		vie->opsize = vie->opsize_override ? 2 : 4;
1452 	} else {
1453 		/* Default address and operand sizes are 16-bits */
1454 		vie->addrsize = vie->addrsize_override ? 4 : 2;
1455 		vie->opsize = vie->opsize_override ? 4 : 2;
1456 	}
1457 	return (0);
1458 }
1459 
1460 static int
1461 decode_two_byte_opcode(struct vie *vie)
1462 {
1463 	uint8_t x;
1464 
1465 	if (vie_peek(vie, &x))
1466 		return (-1);
1467 
1468 	vie->op = two_byte_opcodes[x];
1469 
1470 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1471 		return (-1);
1472 
1473 	vie_advance(vie);
1474 	return (0);
1475 }
1476 
1477 static int
1478 decode_opcode(struct vie *vie)
1479 {
1480 	uint8_t x;
1481 
1482 	if (vie_peek(vie, &x))
1483 		return (-1);
1484 
1485 	vie->op = one_byte_opcodes[x];
1486 
1487 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1488 		return (-1);
1489 
1490 	vie_advance(vie);
1491 
1492 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
1493 		return (decode_two_byte_opcode(vie));
1494 
1495 	return (0);
1496 }
1497 
1498 static int
1499 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
1500 {
1501 	uint8_t x;
1502 
1503 	if (cpu_mode == CPU_MODE_REAL)
1504 		return (-1);
1505 
1506 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
1507 		return (0);
1508 
1509 	if (vie_peek(vie, &x))
1510 		return (-1);
1511 
1512 	vie->mod = (x >> 6) & 0x3;
1513 	vie->rm =  (x >> 0) & 0x7;
1514 	vie->reg = (x >> 3) & 0x7;
1515 
1516 	/*
1517 	 * A direct addressing mode makes no sense in the context of an EPT
1518 	 * fault. There has to be a memory access involved to cause the
1519 	 * EPT fault.
1520 	 */
1521 	if (vie->mod == VIE_MOD_DIRECT)
1522 		return (-1);
1523 
1524 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
1525 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
1526 		/*
1527 		 * Table 2-5: Special Cases of REX Encodings
1528 		 *
1529 		 * mod=0, r/m=5 is used in the compatibility mode to
1530 		 * indicate a disp32 without a base register.
1531 		 *
1532 		 * mod!=3, r/m=4 is used in the compatibility mode to
1533 		 * indicate that the SIB byte is present.
1534 		 *
1535 		 * The 'b' bit in the REX prefix is don't care in
1536 		 * this case.
1537 		 */
1538 	} else {
1539 		vie->rm |= (vie->rex_b << 3);
1540 	}
1541 
1542 	vie->reg |= (vie->rex_r << 3);
1543 
1544 	/* SIB */
1545 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
1546 		goto done;
1547 
1548 	vie->base_register = gpr_map[vie->rm];
1549 
1550 	switch (vie->mod) {
1551 	case VIE_MOD_INDIRECT_DISP8:
1552 		vie->disp_bytes = 1;
1553 		break;
1554 	case VIE_MOD_INDIRECT_DISP32:
1555 		vie->disp_bytes = 4;
1556 		break;
1557 	case VIE_MOD_INDIRECT:
1558 		if (vie->rm == VIE_RM_DISP32) {
1559 			vie->disp_bytes = 4;
1560 			/*
1561 			 * Table 2-7. RIP-Relative Addressing
1562 			 *
1563 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
1564 			 * whereas in compatibility mode it just implies disp32.
1565 			 */
1566 
1567 			if (cpu_mode == CPU_MODE_64BIT)
1568 				vie->base_register = VM_REG_GUEST_RIP;
1569 			else
1570 				vie->base_register = VM_REG_LAST;
1571 		}
1572 		break;
1573 	}
1574 
1575 done:
1576 	vie_advance(vie);
1577 
1578 	return (0);
1579 }
1580 
1581 static int
1582 decode_sib(struct vie *vie)
1583 {
1584 	uint8_t x;
1585 
1586 	/* Proceed only if SIB byte is present */
1587 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
1588 		return (0);
1589 
1590 	if (vie_peek(vie, &x))
1591 		return (-1);
1592 
1593 	/* De-construct the SIB byte */
1594 	vie->ss = (x >> 6) & 0x3;
1595 	vie->index = (x >> 3) & 0x7;
1596 	vie->base = (x >> 0) & 0x7;
1597 
1598 	/* Apply the REX prefix modifiers */
1599 	vie->index |= vie->rex_x << 3;
1600 	vie->base |= vie->rex_b << 3;
1601 
1602 	switch (vie->mod) {
1603 	case VIE_MOD_INDIRECT_DISP8:
1604 		vie->disp_bytes = 1;
1605 		break;
1606 	case VIE_MOD_INDIRECT_DISP32:
1607 		vie->disp_bytes = 4;
1608 		break;
1609 	}
1610 
1611 	if (vie->mod == VIE_MOD_INDIRECT &&
1612 	    (vie->base == 5 || vie->base == 13)) {
1613 		/*
1614 		 * Special case when base register is unused if mod = 0
1615 		 * and base = %rbp or %r13.
1616 		 *
1617 		 * Documented in:
1618 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1619 		 * Table 2-5: Special Cases of REX Encodings
1620 		 */
1621 		vie->disp_bytes = 4;
1622 	} else {
1623 		vie->base_register = gpr_map[vie->base];
1624 	}
1625 
1626 	/*
1627 	 * All encodings of 'index' are valid except for %rsp (4).
1628 	 *
1629 	 * Documented in:
1630 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1631 	 * Table 2-5: Special Cases of REX Encodings
1632 	 */
1633 	if (vie->index != 4)
1634 		vie->index_register = gpr_map[vie->index];
1635 
1636 	/* 'scale' makes sense only in the context of an index register */
1637 	if (vie->index_register < VM_REG_LAST)
1638 		vie->scale = 1 << vie->ss;
1639 
1640 	vie_advance(vie);
1641 
1642 	return (0);
1643 }
1644 
1645 static int
1646 decode_displacement(struct vie *vie)
1647 {
1648 	int n, i;
1649 	uint8_t x;
1650 
1651 	union {
1652 		char	buf[4];
1653 		int8_t	signed8;
1654 		int32_t	signed32;
1655 	} u;
1656 
1657 	if ((n = vie->disp_bytes) == 0)
1658 		return (0);
1659 
1660 	if (n != 1 && n != 4)
1661 		panic("decode_displacement: invalid disp_bytes %d", n);
1662 
1663 	for (i = 0; i < n; i++) {
1664 		if (vie_peek(vie, &x))
1665 			return (-1);
1666 
1667 		u.buf[i] = x;
1668 		vie_advance(vie);
1669 	}
1670 
1671 	if (n == 1)
1672 		vie->displacement = u.signed8;		/* sign-extended */
1673 	else
1674 		vie->displacement = u.signed32;		/* sign-extended */
1675 
1676 	return (0);
1677 }
1678 
1679 static int
1680 decode_immediate(struct vie *vie)
1681 {
1682 	int i, n;
1683 	uint8_t x;
1684 	union {
1685 		char	buf[4];
1686 		int8_t	signed8;
1687 		int16_t	signed16;
1688 		int32_t	signed32;
1689 	} u;
1690 
1691 	/* Figure out immediate operand size (if any) */
1692 	if (vie->op.op_flags & VIE_OP_F_IMM) {
1693 		/*
1694 		 * Section 2.2.1.5 "Immediates", Intel SDM:
1695 		 * In 64-bit mode the typical size of immediate operands
1696 		 * remains 32-bits. When the operand size is 64-bits, the
1697 		 * processor sign-extends all immediates to 64-bits prior
1698 		 * to their use.
1699 		 */
1700 		if (vie->opsize == 4 || vie->opsize == 8)
1701 			vie->imm_bytes = 4;
1702 		else
1703 			vie->imm_bytes = 2;
1704 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
1705 		vie->imm_bytes = 1;
1706 	}
1707 
1708 	if ((n = vie->imm_bytes) == 0)
1709 		return (0);
1710 
1711 	KASSERT(n == 1 || n == 2 || n == 4,
1712 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
1713 
1714 	for (i = 0; i < n; i++) {
1715 		if (vie_peek(vie, &x))
1716 			return (-1);
1717 
1718 		u.buf[i] = x;
1719 		vie_advance(vie);
1720 	}
1721 
1722 	/* sign-extend the immediate value before use */
1723 	if (n == 1)
1724 		vie->immediate = u.signed8;
1725 	else if (n == 2)
1726 		vie->immediate = u.signed16;
1727 	else
1728 		vie->immediate = u.signed32;
1729 
1730 	return (0);
1731 }
1732 
1733 static int
1734 decode_moffset(struct vie *vie)
1735 {
1736 	int i, n;
1737 	uint8_t x;
1738 	union {
1739 		char	buf[8];
1740 		uint64_t u64;
1741 	} u;
1742 
1743 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
1744 		return (0);
1745 
1746 	/*
1747 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
1748 	 * The memory offset size follows the address-size of the instruction.
1749 	 */
1750 	n = vie->addrsize;
1751 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
1752 
1753 	u.u64 = 0;
1754 	for (i = 0; i < n; i++) {
1755 		if (vie_peek(vie, &x))
1756 			return (-1);
1757 
1758 		u.buf[i] = x;
1759 		vie_advance(vie);
1760 	}
1761 	vie->displacement = u.u64;
1762 	return (0);
1763 }
1764 
1765 /*
1766  * Verify that all the bytes in the instruction buffer were consumed.
1767  */
1768 static int
1769 verify_inst_length(struct vie *vie)
1770 {
1771 
1772 	if (vie->num_processed == vie->num_valid)
1773 		return (0);
1774 	else
1775 		return (-1);
1776 }
1777 
1778 /*
1779  * Verify that the 'guest linear address' provided as collateral of the nested
1780  * page table fault matches our instruction decoding.
1781  */
1782 static int
1783 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
1784 {
1785 	int error;
1786 	uint64_t base, idx, gla2;
1787 
1788 	/* Skip 'gla' verification */
1789 	if (gla == VIE_INVALID_GLA)
1790 		return (0);
1791 
1792 	base = 0;
1793 	if (vie->base_register != VM_REG_LAST) {
1794 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
1795 		if (error) {
1796 			printf("verify_gla: error %d getting base reg %d\n",
1797 				error, vie->base_register);
1798 			return (-1);
1799 		}
1800 
1801 		/*
1802 		 * RIP-relative addressing starts from the following
1803 		 * instruction
1804 		 */
1805 		if (vie->base_register == VM_REG_GUEST_RIP)
1806 			base += vie->num_valid;
1807 	}
1808 
1809 	idx = 0;
1810 	if (vie->index_register != VM_REG_LAST) {
1811 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
1812 		if (error) {
1813 			printf("verify_gla: error %d getting index reg %d\n",
1814 				error, vie->index_register);
1815 			return (-1);
1816 		}
1817 	}
1818 
1819 	/* XXX assuming that the base address of the segment is 0 */
1820 	gla2 = base + vie->scale * idx + vie->displacement;
1821 	gla2 &= size2mask[vie->addrsize];
1822 	if (gla != gla2) {
1823 		printf("verify_gla mismatch: "
1824 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
1825 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
1826 		       base, vie->scale, idx, vie->displacement, gla, gla2);
1827 		return (-1);
1828 	}
1829 
1830 	return (0);
1831 }
1832 
1833 int
1834 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
1835 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
1836 {
1837 
1838 	if (decode_prefixes(vie, cpu_mode, cs_d))
1839 		return (-1);
1840 
1841 	if (decode_opcode(vie))
1842 		return (-1);
1843 
1844 	if (decode_modrm(vie, cpu_mode))
1845 		return (-1);
1846 
1847 	if (decode_sib(vie))
1848 		return (-1);
1849 
1850 	if (decode_displacement(vie))
1851 		return (-1);
1852 
1853 	if (decode_immediate(vie))
1854 		return (-1);
1855 
1856 	if (decode_moffset(vie))
1857 		return (-1);
1858 
1859 	if (verify_inst_length(vie))
1860 		return (-1);
1861 
1862 	if (verify_gla(vm, cpuid, gla, vie))
1863 		return (-1);
1864 
1865 	vie->decoded = 1;	/* success */
1866 
1867 	return (0);
1868 }
1869 #endif	/* _KERNEL */
1870