xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 7e00348e7605b9906601438008341ffc37c00e2c)
1 /*-
2  * Copyright (c) 2012 Sandvine, Inc.
3  * Copyright (c) 2012 NetApp, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #ifdef _KERNEL
34 #include <sys/param.h>
35 #include <sys/pcpu.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 
39 #include <vm/vm.h>
40 #include <vm/pmap.h>
41 
42 #include <machine/vmparam.h>
43 #include <machine/vmm.h>
44 #else	/* !_KERNEL */
45 #include <sys/types.h>
46 #include <sys/errno.h>
47 #include <sys/_iovec.h>
48 
49 #include <machine/vmm.h>
50 
51 #include <assert.h>
52 #include <vmmapi.h>
53 #define	KASSERT(exp,msg)	assert((exp))
54 #endif	/* _KERNEL */
55 
56 #include <machine/vmm_instruction_emul.h>
57 #include <x86/psl.h>
58 #include <x86/specialreg.h>
59 
60 /* struct vie_op.op_type */
enum {
	/*
	 * Zero value.  Slots of the opcode tables below that have no
	 * designated initializer are zero-filled and so carry this type.
	 */
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	/* Assigned to the 0x0F entry of the one-byte opcode table. */
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	/* Presumably an end-of-list sentinel; not referenced in this part of the file. */
	VIE_OP_TYPE_LAST
};
75 
76 /* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
/*
 * Set together with VIE_OP_F_MOFFSET on the 0xA1/0xA3 MOV entries.
 * NOTE(review): presumably "instruction has no ModRM byte" - confirm
 * against the decoder.
 */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
81 
82 static const struct vie_op two_byte_opcodes[256] = {
83 	[0xB6] = {
84 		.op_byte = 0xB6,
85 		.op_type = VIE_OP_TYPE_MOVZX,
86 	},
87 	[0xB7] = {
88 		.op_byte = 0xB7,
89 		.op_type = VIE_OP_TYPE_MOVZX,
90 	},
91 	[0xBE] = {
92 		.op_byte = 0xBE,
93 		.op_type = VIE_OP_TYPE_MOVSX,
94 	},
95 };
96 
97 static const struct vie_op one_byte_opcodes[256] = {
98 	[0x0F] = {
99 		.op_byte = 0x0F,
100 		.op_type = VIE_OP_TYPE_TWO_BYTE
101 	},
102 	[0x2B] = {
103 		.op_byte = 0x2B,
104 		.op_type = VIE_OP_TYPE_SUB,
105 	},
106 	[0x3B] = {
107 		.op_byte = 0x3B,
108 		.op_type = VIE_OP_TYPE_CMP,
109 	},
110 	[0x88] = {
111 		.op_byte = 0x88,
112 		.op_type = VIE_OP_TYPE_MOV,
113 	},
114 	[0x89] = {
115 		.op_byte = 0x89,
116 		.op_type = VIE_OP_TYPE_MOV,
117 	},
118 	[0x8A] = {
119 		.op_byte = 0x8A,
120 		.op_type = VIE_OP_TYPE_MOV,
121 	},
122 	[0x8B] = {
123 		.op_byte = 0x8B,
124 		.op_type = VIE_OP_TYPE_MOV,
125 	},
126 	[0xA1] = {
127 		.op_byte = 0xA1,
128 		.op_type = VIE_OP_TYPE_MOV,
129 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
130 	},
131 	[0xA3] = {
132 		.op_byte = 0xA3,
133 		.op_type = VIE_OP_TYPE_MOV,
134 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
135 	},
136 	[0xC6] = {
137 		/* XXX Group 11 extended opcode - not just MOV */
138 		.op_byte = 0xC6,
139 		.op_type = VIE_OP_TYPE_MOV,
140 		.op_flags = VIE_OP_F_IMM8,
141 	},
142 	[0xC7] = {
143 		.op_byte = 0xC7,
144 		.op_type = VIE_OP_TYPE_MOV,
145 		.op_flags = VIE_OP_F_IMM,
146 	},
147 	[0x23] = {
148 		.op_byte = 0x23,
149 		.op_type = VIE_OP_TYPE_AND,
150 	},
151 	[0x81] = {
152 		/* XXX Group 1 extended opcode - not just AND */
153 		.op_byte = 0x81,
154 		.op_type = VIE_OP_TYPE_AND,
155 		.op_flags = VIE_OP_F_IMM,
156 	},
157 	[0x83] = {
158 		/* XXX Group 1 extended opcode - not just OR */
159 		.op_byte = 0x83,
160 		.op_type = VIE_OP_TYPE_OR,
161 		.op_flags = VIE_OP_F_IMM8,
162 	},
163 	[0x8F] = {
164 		/* XXX Group 1A extended opcode - not just POP */
165 		.op_byte = 0x8F,
166 		.op_type = VIE_OP_TYPE_POP,
167 	},
168 	[0xFF] = {
169 		/* XXX Group 5 extended opcode - not just PUSH */
170 		.op_byte = 0xFF,
171 		.op_type = VIE_OP_TYPE_PUSH,
172 	}
173 };
174 
175 /* struct vie.mod */
176 #define	VIE_MOD_INDIRECT		0
177 #define	VIE_MOD_INDIRECT_DISP8		1
178 #define	VIE_MOD_INDIRECT_DISP32		2
179 #define	VIE_MOD_DIRECT			3
180 
181 /* struct vie.rm */
182 #define	VIE_RM_SIB			4
183 #define	VIE_RM_DISP32			5
184 
185 #define	GB				(1024 * 1024 * 1024)
186 
187 static enum vm_reg_name gpr_map[16] = {
188 	VM_REG_GUEST_RAX,
189 	VM_REG_GUEST_RCX,
190 	VM_REG_GUEST_RDX,
191 	VM_REG_GUEST_RBX,
192 	VM_REG_GUEST_RSP,
193 	VM_REG_GUEST_RBP,
194 	VM_REG_GUEST_RSI,
195 	VM_REG_GUEST_RDI,
196 	VM_REG_GUEST_R8,
197 	VM_REG_GUEST_R9,
198 	VM_REG_GUEST_R10,
199 	VM_REG_GUEST_R11,
200 	VM_REG_GUEST_R12,
201 	VM_REG_GUEST_R13,
202 	VM_REG_GUEST_R14,
203 	VM_REG_GUEST_R15
204 };
205 
/*
 * Mask covering the low 'size' bytes of a 64-bit value, indexed by the
 * operand size in bytes.  Only indices 1, 2, 4 and 8 are populated; the
 * remaining slots are zero.
 */
static uint64_t size2mask[] = {
	[1] = 0x00000000000000ffUL,
	[2] = 0x000000000000ffffUL,
	[4] = 0x00000000ffffffffUL,
	[8] = 0xffffffffffffffffUL,
};
212 
213 static int
214 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
215 {
216 	int error;
217 
218 	error = vm_get_register(vm, vcpuid, reg, rval);
219 
220 	return (error);
221 }
222 
223 static void
224 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
225 {
226 	*lhbr = 0;
227 	*reg = gpr_map[vie->reg];
228 
229 	/*
230 	 * 64-bit mode imposes limitations on accessing legacy high byte
231 	 * registers (lhbr).
232 	 *
233 	 * The legacy high-byte registers cannot be addressed if the REX
234 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
235 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
236 	 *
237 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
238 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
239 	 * %ah, %ch, %dh and %bh respectively.
240 	 */
241 	if (!vie->rex_present) {
242 		if (vie->reg & 0x4) {
243 			*lhbr = 1;
244 			*reg = gpr_map[vie->reg & 0x3];
245 		}
246 	}
247 }
248 
249 static int
250 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
251 {
252 	uint64_t val;
253 	int error, lhbr;
254 	enum vm_reg_name reg;
255 
256 	vie_calc_bytereg(vie, &reg, &lhbr);
257 	error = vm_get_register(vm, vcpuid, reg, &val);
258 
259 	/*
260 	 * To obtain the value of a legacy high byte register shift the
261 	 * base register right by 8 bits (%ah = %rax >> 8).
262 	 */
263 	if (lhbr)
264 		*rval = val >> 8;
265 	else
266 		*rval = val;
267 	return (error);
268 }
269 
270 static int
271 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
272 {
273 	uint64_t origval, val, mask;
274 	int error, lhbr;
275 	enum vm_reg_name reg;
276 
277 	vie_calc_bytereg(vie, &reg, &lhbr);
278 	error = vm_get_register(vm, vcpuid, reg, &origval);
279 	if (error == 0) {
280 		val = byte;
281 		mask = 0xff;
282 		if (lhbr) {
283 			/*
284 			 * Shift left by 8 to store 'byte' in a legacy high
285 			 * byte register.
286 			 */
287 			val <<= 8;
288 			mask <<= 8;
289 		}
290 		val |= origval & ~mask;
291 		error = vm_set_register(vm, vcpuid, reg, val);
292 	}
293 	return (error);
294 }
295 
296 int
297 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
298 		    uint64_t val, int size)
299 {
300 	int error;
301 	uint64_t origval;
302 
303 	switch (size) {
304 	case 1:
305 	case 2:
306 		error = vie_read_register(vm, vcpuid, reg, &origval);
307 		if (error)
308 			return (error);
309 		val &= size2mask[size];
310 		val |= origval & ~size2mask[size];
311 		break;
312 	case 4:
313 		val &= 0xffffffffUL;
314 		break;
315 	case 8:
316 		break;
317 	default:
318 		return (EINVAL);
319 	}
320 
321 	error = vm_set_register(vm, vcpuid, reg, val);
322 	return (error);
323 }
324 
325 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
326 
/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) expands to a static function getcc<sz>() taking two
 * uint<sz>_t operands.  The function executes a 'sub' of 'y' from 'x' on
 * the host and then copies the host flags register out through
 * pushfq/popq.  The complete flags value is returned; callers mask out
 * the bits they are interested in.
 *
 * The trailing 'struct __hack' only exists so that each GETCC()
 * instantiation below can be terminated with a semicolon.
 *
 * NOTE(review): pushfq stores below the current stack pointer and the
 * asm statement does not declare that.  This looks like it could clobber
 * data if this file is ever built with a stack red zone enabled -
 * confirm against the build flags used for this file.
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
345 
346 static u_long
347 getcc(int opsize, uint64_t x, uint64_t y)
348 {
349 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
350 	    ("getcc: invalid operand size %d", opsize));
351 
352 	if (opsize == 1)
353 		return (getcc8(x, y));
354 	else if (opsize == 2)
355 		return (getcc16(x, y));
356 	else if (opsize == 4)
357 		return (getcc32(x, y));
358 	else
359 		return (getcc64(x, y));
360 }
361 
362 static int
363 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
364 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
365 {
366 	int error, size;
367 	enum vm_reg_name reg;
368 	uint8_t byte;
369 	uint64_t val;
370 
371 	size = vie->opsize;
372 	error = EINVAL;
373 
374 	switch (vie->op.op_byte) {
375 	case 0x88:
376 		/*
377 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
378 		 * 88/r:	mov r/m8, r8
379 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
380 		 */
381 		size = 1;	/* override for byte operation */
382 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
383 		if (error == 0)
384 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
385 		break;
386 	case 0x89:
387 		/*
388 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
389 		 * 89/r:	mov r/m16, r16
390 		 * 89/r:	mov r/m32, r32
391 		 * REX.W + 89/r	mov r/m64, r64
392 		 */
393 		reg = gpr_map[vie->reg];
394 		error = vie_read_register(vm, vcpuid, reg, &val);
395 		if (error == 0) {
396 			val &= size2mask[size];
397 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
398 		}
399 		break;
400 	case 0x8A:
401 		/*
402 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
403 		 * 8A/r:	mov r8, r/m8
404 		 * REX + 8A/r:	mov r8, r/m8
405 		 */
406 		size = 1;	/* override for byte operation */
407 		error = memread(vm, vcpuid, gpa, &val, size, arg);
408 		if (error == 0)
409 			error = vie_write_bytereg(vm, vcpuid, vie, val);
410 		break;
411 	case 0x8B:
412 		/*
413 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
414 		 * 8B/r:	mov r16, r/m16
415 		 * 8B/r:	mov r32, r/m32
416 		 * REX.W 8B/r:	mov r64, r/m64
417 		 */
418 		error = memread(vm, vcpuid, gpa, &val, size, arg);
419 		if (error == 0) {
420 			reg = gpr_map[vie->reg];
421 			error = vie_update_register(vm, vcpuid, reg, val, size);
422 		}
423 		break;
424 	case 0xA1:
425 		/*
426 		 * MOV from seg:moffset to AX/EAX/RAX
427 		 * A1:		mov AX, moffs16
428 		 * A1:		mov EAX, moffs32
429 		 * REX.W + A1:	mov RAX, moffs64
430 		 */
431 		error = memread(vm, vcpuid, gpa, &val, size, arg);
432 		if (error == 0) {
433 			reg = VM_REG_GUEST_RAX;
434 			error = vie_update_register(vm, vcpuid, reg, val, size);
435 		}
436 		break;
437 	case 0xA3:
438 		/*
439 		 * MOV from AX/EAX/RAX to seg:moffset
440 		 * A3:		mov moffs16, AX
441 		 * A3:		mov moffs32, EAX
442 		 * REX.W + A3:	mov moffs64, RAX
443 		 */
444 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
445 		if (error == 0) {
446 			val &= size2mask[size];
447 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
448 		}
449 		break;
450 	case 0xC6:
451 		/*
452 		 * MOV from imm8 to mem (ModRM:r/m)
453 		 * C6/0		mov r/m8, imm8
454 		 * REX + C6/0	mov r/m8, imm8
455 		 */
456 		size = 1;	/* override for byte operation */
457 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
458 		break;
459 	case 0xC7:
460 		/*
461 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
462 		 * C7/0		mov r/m16, imm16
463 		 * C7/0		mov r/m32, imm32
464 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
465 		 */
466 		val = vie->immediate & size2mask[size];
467 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
468 		break;
469 	default:
470 		break;
471 	}
472 
473 	return (error);
474 }
475 
476 static int
477 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
478 	     mem_region_read_t memread, mem_region_write_t memwrite,
479 	     void *arg)
480 {
481 	int error, size;
482 	enum vm_reg_name reg;
483 	uint64_t val;
484 
485 	size = vie->opsize;
486 	error = EINVAL;
487 
488 	switch (vie->op.op_byte) {
489 	case 0xB6:
490 		/*
491 		 * MOV and zero extend byte from mem (ModRM:r/m) to
492 		 * reg (ModRM:reg).
493 		 *
494 		 * 0F B6/r		movzx r16, r/m8
495 		 * 0F B6/r		movzx r32, r/m8
496 		 * REX.W + 0F B6/r	movzx r64, r/m8
497 		 */
498 
499 		/* get the first operand */
500 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
501 		if (error)
502 			break;
503 
504 		/* get the second operand */
505 		reg = gpr_map[vie->reg];
506 
507 		/* zero-extend byte */
508 		val = (uint8_t)val;
509 
510 		/* write the result */
511 		error = vie_update_register(vm, vcpuid, reg, val, size);
512 		break;
513 	case 0xB7:
514 		/*
515 		 * MOV and zero extend word from mem (ModRM:r/m) to
516 		 * reg (ModRM:reg).
517 		 *
518 		 * 0F B7/r		movzx r32, r/m16
519 		 * REX.W + 0F B7/r	movzx r64, r/m16
520 		 */
521 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
522 		if (error)
523 			return (error);
524 
525 		reg = gpr_map[vie->reg];
526 
527 		/* zero-extend word */
528 		val = (uint16_t)val;
529 
530 		error = vie_update_register(vm, vcpuid, reg, val, size);
531 		break;
532 	case 0xBE:
533 		/*
534 		 * MOV and sign extend byte from mem (ModRM:r/m) to
535 		 * reg (ModRM:reg).
536 		 *
537 		 * 0F BE/r		movsx r16, r/m8
538 		 * 0F BE/r		movsx r32, r/m8
539 		 * REX.W + 0F BE/r	movsx r64, r/m8
540 		 */
541 
542 		/* get the first operand */
543 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
544 		if (error)
545 			break;
546 
547 		/* get the second operand */
548 		reg = gpr_map[vie->reg];
549 
550 		/* sign extend byte */
551 		val = (int8_t)val;
552 
553 		/* write the result */
554 		error = vie_update_register(vm, vcpuid, reg, val, size);
555 		break;
556 	default:
557 		break;
558 	}
559 	return (error);
560 }
561 
562 static int
563 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
564 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
565 {
566 	int error, size;
567 	enum vm_reg_name reg;
568 	uint64_t result, rflags, rflags2, val1, val2;
569 
570 	size = vie->opsize;
571 	error = EINVAL;
572 
573 	switch (vie->op.op_byte) {
574 	case 0x23:
575 		/*
576 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
577 		 * result in reg.
578 		 *
579 		 * 23/r		and r16, r/m16
580 		 * 23/r		and r32, r/m32
581 		 * REX.W + 23/r	and r64, r/m64
582 		 */
583 
584 		/* get the first operand */
585 		reg = gpr_map[vie->reg];
586 		error = vie_read_register(vm, vcpuid, reg, &val1);
587 		if (error)
588 			break;
589 
590 		/* get the second operand */
591 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
592 		if (error)
593 			break;
594 
595 		/* perform the operation and write the result */
596 		result = val1 & val2;
597 		error = vie_update_register(vm, vcpuid, reg, result, size);
598 		break;
599 	case 0x81:
600 		/*
601 		 * AND/OR mem (ModRM:r/m) with immediate and store the
602 		 * result in mem.
603 		 *
604 		 * AND: i = 4
605 		 * OR:  i = 1
606 		 * 81 /i		op r/m16, imm16
607 		 * 81 /i		op r/m32, imm32
608 		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
609 		 *
610 		 */
611 
612 		/* get the first operand */
613                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
614                 if (error)
615 			break;
616 
617                 /*
618                  * perform the operation with the pre-fetched immediate
619                  * operand and write the result
620                  */
621 		switch (vie->reg & 7) {
622 		case 0x4:
623 			/* modrm:reg == b100, AND */
624 			result = val1 & vie->immediate;
625 			break;
626 		case 0x1:
627 			/* modrm:reg == b001, OR */
628 			result = val1 | vie->immediate;
629 			break;
630 		default:
631 			error = EINVAL;
632 			break;
633 		}
634 		if (error)
635 			break;
636 
637 		error = memwrite(vm, vcpuid, gpa, result, size, arg);
638 		break;
639 	default:
640 		break;
641 	}
642 	if (error)
643 		return (error);
644 
645 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
646 	if (error)
647 		return (error);
648 
649 	/*
650 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
651 	 * to the result; AF is undefined.
652 	 *
653 	 * The updated status flags are obtained by subtracting 0 from 'result'.
654 	 */
655 	rflags2 = getcc(size, result, 0);
656 	rflags &= ~RFLAGS_STATUS_BITS;
657 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
658 
659 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
660 	return (error);
661 }
662 
663 static int
664 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
665 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
666 {
667 	int error, size;
668 	uint64_t val1, result, rflags, rflags2;
669 
670 	size = vie->opsize;
671 	error = EINVAL;
672 
673 	switch (vie->op.op_byte) {
674 	case 0x83:
675 		/*
676 		 * OR mem (ModRM:r/m) with immediate and store the
677 		 * result in mem.
678 		 *
679 		 * 83 /1		OR r/m16, imm8 sign-extended to 16
680 		 * 83 /1		OR r/m32, imm8 sign-extended to 32
681 		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
682 		 *
683 		 * Currently, only the OR operation of the 0x83 opcode
684 		 * is implemented (ModRM:reg = b001).
685 		 */
686 		if ((vie->reg & 7) != 1)
687 			break;
688 
689 		/* get the first operand */
690                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
691                 if (error)
692 			break;
693 
694                 /*
695 		 * perform the operation with the pre-fetched immediate
696 		 * operand and write the result
697 		 */
698                 result = val1 | vie->immediate;
699                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
700 		break;
701 	default:
702 		break;
703 	}
704 	if (error)
705 		return (error);
706 
707 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
708 	if (error)
709 		return (error);
710 
711 	/*
712 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
713 	 * to the result; AF is undefined.
714 	 *
715 	 * The updated status flags are obtained by subtracting 0 from 'result'.
716 	 */
717 	rflags2 = getcc(size, result, 0);
718 	rflags &= ~RFLAGS_STATUS_BITS;
719 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
720 
721 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
722 	return (error);
723 }
724 
725 static int
726 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
727 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
728 {
729 	int error, size;
730 	uint64_t op1, op2, rflags, rflags2;
731 	enum vm_reg_name reg;
732 
733 	size = vie->opsize;
734 	switch (vie->op.op_byte) {
735 	case 0x3B:
736 		/*
737 		 * 3B/r		CMP r16, r/m16
738 		 * 3B/r		CMP r32, r/m32
739 		 * REX.W + 3B/r	CMP r64, r/m64
740 		 *
741 		 * Compare first operand (reg) with second operand (r/m) and
742 		 * set status flags in EFLAGS register. The comparison is
743 		 * performed by subtracting the second operand from the first
744 		 * operand and then setting the status flags.
745 		 */
746 
747 		/* Get the first operand */
748 		reg = gpr_map[vie->reg];
749 		error = vie_read_register(vm, vcpuid, reg, &op1);
750 		if (error)
751 			return (error);
752 
753 		/* Get the second operand */
754 		error = memread(vm, vcpuid, gpa, &op2, size, arg);
755 		if (error)
756 			return (error);
757 
758 		break;
759 	default:
760 		return (EINVAL);
761 	}
762 	rflags2 = getcc(size, op1, op2);
763 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
764 	if (error)
765 		return (error);
766 	rflags &= ~RFLAGS_STATUS_BITS;
767 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
768 
769 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
770 	return (error);
771 }
772 
773 static int
774 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
775 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
776 {
777 	int error, size;
778 	uint64_t nval, rflags, rflags2, val1, val2;
779 	enum vm_reg_name reg;
780 
781 	size = vie->opsize;
782 	error = EINVAL;
783 
784 	switch (vie->op.op_byte) {
785 	case 0x2B:
786 		/*
787 		 * SUB r/m from r and store the result in r
788 		 *
789 		 * 2B/r            SUB r16, r/m16
790 		 * 2B/r            SUB r32, r/m32
791 		 * REX.W + 2B/r    SUB r64, r/m64
792 		 */
793 
794 		/* get the first operand */
795 		reg = gpr_map[vie->reg];
796 		error = vie_read_register(vm, vcpuid, reg, &val1);
797 		if (error)
798 			break;
799 
800 		/* get the second operand */
801 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
802 		if (error)
803 			break;
804 
805 		/* perform the operation and write the result */
806 		nval = val1 - val2;
807 		error = vie_update_register(vm, vcpuid, reg, nval, size);
808 		break;
809 	default:
810 		break;
811 	}
812 
813 	if (!error) {
814 		rflags2 = getcc(size, val1, val2);
815 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
816 		    &rflags);
817 		if (error)
818 			return (error);
819 
820 		rflags &= ~RFLAGS_STATUS_BITS;
821 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
822 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
823 		    rflags, 8);
824 	}
825 
826 	return (error);
827 }
828 
829 static int
830 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
831     struct vm_guest_paging *paging, mem_region_read_t memread,
832     mem_region_write_t memwrite, void *arg)
833 {
834 #ifdef _KERNEL
835 	struct vm_copyinfo copyinfo[2];
836 #else
837 	struct iovec copyinfo[2];
838 #endif
839 	struct seg_desc ss_desc;
840 	uint64_t cr0, rflags, rsp, stack_gla, val;
841 	int error, size, stackaddrsize, pushop;
842 
843 	val = 0;
844 	size = vie->opsize;
845 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
846 
847 	/*
848 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
849 	 */
850 	if (paging->cpu_mode == CPU_MODE_REAL) {
851 		stackaddrsize = 2;
852 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
853 		/*
854 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
855 		 * - Stack pointer size is always 64-bits.
856 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
857 		 * - 16-bit PUSH/POP is supported by using the operand size
858 		 *   override prefix (66H).
859 		 */
860 		stackaddrsize = 8;
861 		size = vie->opsize_override ? 2 : 8;
862 	} else {
863 		/*
864 		 * In protected or compability mode the 'B' flag in the
865 		 * stack-segment descriptor determines the size of the
866 		 * stack pointer.
867 		 */
868 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
869 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
870 		    __func__, error));
871 		if (SEG_DESC_DEF32(ss_desc.access))
872 			stackaddrsize = 4;
873 		else
874 			stackaddrsize = 2;
875 	}
876 
877 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
878 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
879 
880 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
881 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
882 
883 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
884 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
885 	if (pushop) {
886 		rsp -= size;
887 	}
888 
889 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
890 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
891 	    &stack_gla)) {
892 		vm_inject_ss(vm, vcpuid, 0);
893 		return (0);
894 	}
895 
896 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
897 		vm_inject_ss(vm, vcpuid, 0);
898 		return (0);
899 	}
900 
901 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
902 		vm_inject_ac(vm, vcpuid, 0);
903 		return (0);
904 	}
905 
906 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
907 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
908 	if (error == -1) {
909 		/*
910 		 * XXX cannot return a negative error value here because it
911 		 * ends up being the return value of the VM_RUN() ioctl and
912 		 * is interpreted as a pseudo-error (for e.g. ERESTART).
913 		 */
914 		return (EFAULT);
915 	} else if (error == 1) {
916 		/* Resume guest execution to handle page fault */
917 		return (0);
918 	}
919 
920 	if (pushop) {
921 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
922 		if (error == 0)
923 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
924 	} else {
925 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
926 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
927 		rsp += size;
928 	}
929 #ifdef _KERNEL
930 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
931 #endif
932 
933 	if (error == 0) {
934 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
935 		    stackaddrsize);
936 		KASSERT(error == 0, ("error %d updating rsp", error));
937 	}
938 	return (error);
939 }
940 
941 static int
942 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
943     struct vm_guest_paging *paging, mem_region_read_t memread,
944     mem_region_write_t memwrite, void *arg)
945 {
946 	int error;
947 
948 	/*
949 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
950 	 *
951 	 * PUSH is part of the group 5 extended opcodes and is identified
952 	 * by ModRM:reg = b110.
953 	 */
954 	if ((vie->reg & 7) != 6)
955 		return (EINVAL);
956 
957 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
958 	    memwrite, arg);
959 	return (error);
960 }
961 
962 static int
963 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
964     struct vm_guest_paging *paging, mem_region_read_t memread,
965     mem_region_write_t memwrite, void *arg)
966 {
967 	int error;
968 
969 	/*
970 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
971 	 *
972 	 * POP is part of the group 1A extended opcodes and is identified
973 	 * by ModRM:reg = b000.
974 	 */
975 	if ((vie->reg & 7) != 0)
976 		return (EINVAL);
977 
978 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
979 	    memwrite, arg);
980 	return (error);
981 }
982 
983 int
984 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
985     struct vm_guest_paging *paging, mem_region_read_t memread,
986     mem_region_write_t memwrite, void *memarg)
987 {
988 	int error;
989 
990 	if (!vie->decoded)
991 		return (EINVAL);
992 
993 	switch (vie->op.op_type) {
994 	case VIE_OP_TYPE_POP:
995 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
996 		    memwrite, memarg);
997 		break;
998 	case VIE_OP_TYPE_PUSH:
999 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1000 		    memwrite, memarg);
1001 		break;
1002 	case VIE_OP_TYPE_CMP:
1003 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1004 				    memread, memwrite, memarg);
1005 		break;
1006 	case VIE_OP_TYPE_MOV:
1007 		error = emulate_mov(vm, vcpuid, gpa, vie,
1008 				    memread, memwrite, memarg);
1009 		break;
1010 	case VIE_OP_TYPE_MOVSX:
1011 	case VIE_OP_TYPE_MOVZX:
1012 		error = emulate_movx(vm, vcpuid, gpa, vie,
1013 				     memread, memwrite, memarg);
1014 		break;
1015 	case VIE_OP_TYPE_AND:
1016 		error = emulate_and(vm, vcpuid, gpa, vie,
1017 				    memread, memwrite, memarg);
1018 		break;
1019 	case VIE_OP_TYPE_OR:
1020 		error = emulate_or(vm, vcpuid, gpa, vie,
1021 				    memread, memwrite, memarg);
1022 		break;
1023 	case VIE_OP_TYPE_SUB:
1024 		error = emulate_sub(vm, vcpuid, gpa, vie,
1025 				    memread, memwrite, memarg);
1026 		break;
1027 	default:
1028 		error = EINVAL;
1029 		break;
1030 	}
1031 
1032 	return (error);
1033 }
1034 
1035 int
1036 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1037 {
1038 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1039 	    ("%s: invalid size %d", __func__, size));
1040 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1041 
1042 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1043 		return (0);
1044 
1045 	return ((gla & (size - 1)) ? 1 : 0);
1046 }
1047 
1048 int
1049 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1050 {
1051 	uint64_t mask;
1052 
1053 	if (cpu_mode != CPU_MODE_64BIT)
1054 		return (0);
1055 
1056 	/*
1057 	 * The value of the bit 47 in the 'gla' should be replicated in the
1058 	 * most significant 16 bits.
1059 	 */
1060 	mask = ~((1UL << 48) - 1);
1061 	if (gla & (1UL << 47))
1062 		return ((gla & mask) != mask);
1063 	else
1064 		return ((gla & mask) != 0);
1065 }
1066 
1067 uint64_t
1068 vie_size2mask(int size)
1069 {
1070 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1071 	    ("vie_size2mask: invalid size %d", size));
1072 	return (size2mask[size]);
1073 }
1074 
1075 int
1076 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1077     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1078     int prot, uint64_t *gla)
1079 {
1080 	uint64_t firstoff, low_limit, high_limit, segbase;
1081 	int glasize, type;
1082 
1083 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1084 	    ("%s: invalid segment %d", __func__, seg));
1085 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1086 	    ("%s: invalid operand size %d", __func__, length));
1087 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1088 	    ("%s: invalid prot %#x", __func__, prot));
1089 
1090 	firstoff = offset;
1091 	if (cpu_mode == CPU_MODE_64BIT) {
1092 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1093 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1094 		glasize = 8;
1095 	} else {
1096 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1097 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1098 		glasize = 4;
1099 		/*
1100 		 * If the segment selector is loaded with a NULL selector
1101 		 * then the descriptor is unusable and attempting to use
1102 		 * it results in a #GP(0).
1103 		 */
1104 		if (SEG_DESC_UNUSABLE(desc->access))
1105 			return (-1);
1106 
1107 		/*
1108 		 * The processor generates a #NP exception when a segment
1109 		 * register is loaded with a selector that points to a
1110 		 * descriptor that is not present. If this was the case then
1111 		 * it would have been checked before the VM-exit.
1112 		 */
1113 		KASSERT(SEG_DESC_PRESENT(desc->access),
1114 		    ("segment %d not present: %#x", seg, desc->access));
1115 
1116 		/*
1117 		 * The descriptor type must indicate a code/data segment.
1118 		 */
1119 		type = SEG_DESC_TYPE(desc->access);
1120 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1121 		    "descriptor type %#x", seg, type));
1122 
1123 		if (prot & PROT_READ) {
1124 			/* #GP on a read access to a exec-only code segment */
1125 			if ((type & 0xA) == 0x8)
1126 				return (-1);
1127 		}
1128 
1129 		if (prot & PROT_WRITE) {
1130 			/*
1131 			 * #GP on a write access to a code segment or a
1132 			 * read-only data segment.
1133 			 */
1134 			if (type & 0x8)			/* code segment */
1135 				return (-1);
1136 
1137 			if ((type & 0xA) == 0)		/* read-only data seg */
1138 				return (-1);
1139 		}
1140 
1141 		/*
1142 		 * 'desc->limit' is fully expanded taking granularity into
1143 		 * account.
1144 		 */
1145 		if ((type & 0xC) == 0x4) {
1146 			/* expand-down data segment */
1147 			low_limit = desc->limit + 1;
1148 			high_limit = SEG_DESC_DEF32(desc->access) ?
1149 			    0xffffffff : 0xffff;
1150 		} else {
1151 			/* code segment or expand-up data segment */
1152 			low_limit = 0;
1153 			high_limit = desc->limit;
1154 		}
1155 
1156 		while (length > 0) {
1157 			offset &= vie_size2mask(addrsize);
1158 			if (offset < low_limit || offset > high_limit)
1159 				return (-1);
1160 			offset++;
1161 			length--;
1162 		}
1163 	}
1164 
1165 	/*
1166 	 * In 64-bit mode all segments except %fs and %gs have a segment
1167 	 * base address of 0.
1168 	 */
1169 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1170 	    seg != VM_REG_GUEST_GS) {
1171 		segbase = 0;
1172 	} else {
1173 		segbase = desc->base;
1174 	}
1175 
1176 	/*
1177 	 * Truncate 'firstoff' to the effective address size before adding
1178 	 * it to the segment base.
1179 	 */
1180 	firstoff &= vie_size2mask(addrsize);
1181 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1182 	return (0);
1183 }
1184 
1185 #ifdef _KERNEL
1186 void
1187 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1188 {
1189 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1190 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1191 
1192 	bzero(vie, sizeof(struct vie));
1193 
1194 	vie->base_register = VM_REG_LAST;
1195 	vie->index_register = VM_REG_LAST;
1196 
1197 	if (inst_length) {
1198 		bcopy(inst_bytes, vie->inst, inst_length);
1199 		vie->num_valid = inst_length;
1200 	}
1201 }
1202 
1203 static int
1204 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1205 {
1206 	int error_code = 0;
1207 
1208 	if (pte & PG_V)
1209 		error_code |= PGEX_P;
1210 	if (prot & VM_PROT_WRITE)
1211 		error_code |= PGEX_W;
1212 	if (usermode)
1213 		error_code |= PGEX_U;
1214 	if (rsvd)
1215 		error_code |= PGEX_RSV;
1216 	if (prot & VM_PROT_EXECUTE)
1217 		error_code |= PGEX_I;
1218 
1219 	return (error_code);
1220 }
1221 
/*
 * Release the hold referenced by '*cookie', if there is one, and clear
 * the cookie so that a repeated call is harmless.
 */
static void
ptp_release(void **cookie)
{
	if (*cookie == NULL)
		return;

	vm_gpa_release(*cookie);
	*cookie = NULL;
}
1230 
1231 static void *
1232 ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
1233 {
1234 	void *ptr;
1235 
1236 	ptp_release(cookie);
1237 	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
1238 	return (ptr);
1239 }
1240 
1241 int
1242 vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1243     uint64_t gla, int prot, uint64_t *gpa)
1244 {
1245 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1246 	u_int retries;
1247 	uint64_t *ptpbase, ptpphys, pte, pgsize;
1248 	uint32_t *ptpbase32, pte32;
1249 	void *cookie;
1250 
1251 	usermode = (paging->cpl == 3 ? 1 : 0);
1252 	writable = prot & VM_PROT_WRITE;
1253 	cookie = NULL;
1254 	retval = 0;
1255 	retries = 0;
1256 restart:
1257 	ptpphys = paging->cr3;		/* root of the page tables */
1258 	ptp_release(&cookie);
1259 	if (retries++ > 0)
1260 		maybe_yield();
1261 
1262 	if (vie_canonical_check(paging->cpu_mode, gla)) {
1263 		/*
1264 		 * XXX assuming a non-stack reference otherwise a stack fault
1265 		 * should be generated.
1266 		 */
1267 		vm_inject_gp(vm, vcpuid);
1268 		goto fault;
1269 	}
1270 
1271 	if (paging->paging_mode == PAGING_MODE_FLAT) {
1272 		*gpa = gla;
1273 		goto done;
1274 	}
1275 
1276 	if (paging->paging_mode == PAGING_MODE_32) {
1277 		nlevels = 2;
1278 		while (--nlevels >= 0) {
1279 			/* Zero out the lower 12 bits. */
1280 			ptpphys &= ~0xfff;
1281 
1282 			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1283 
1284 			if (ptpbase32 == NULL)
1285 				goto error;
1286 
1287 			ptpshift = PAGE_SHIFT + nlevels * 10;
1288 			ptpindex = (gla >> ptpshift) & 0x3FF;
1289 			pgsize = 1UL << ptpshift;
1290 
1291 			pte32 = ptpbase32[ptpindex];
1292 
1293 			if ((pte32 & PG_V) == 0 ||
1294 			    (usermode && (pte32 & PG_U) == 0) ||
1295 			    (writable && (pte32 & PG_RW) == 0)) {
1296 				pfcode = pf_error_code(usermode, prot, 0,
1297 				    pte32);
1298 				vm_inject_pf(vm, vcpuid, pfcode, gla);
1299 				goto fault;
1300 			}
1301 
1302 			/*
1303 			 * Emulate the x86 MMU's management of the accessed
1304 			 * and dirty flags. While the accessed flag is set
1305 			 * at every level of the page table, the dirty flag
1306 			 * is only set at the last level providing the guest
1307 			 * physical address.
1308 			 */
1309 			if ((pte32 & PG_A) == 0) {
1310 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
1311 				    pte32, pte32 | PG_A) == 0) {
1312 					goto restart;
1313 				}
1314 			}
1315 
1316 			/* XXX must be ignored if CR4.PSE=0 */
1317 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
1318 				break;
1319 
1320 			ptpphys = pte32;
1321 		}
1322 
1323 		/* Set the dirty bit in the page table entry if necessary */
1324 		if (writable && (pte32 & PG_M) == 0) {
1325 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
1326 			    pte32, pte32 | PG_M) == 0) {
1327 				goto restart;
1328 			}
1329 		}
1330 
1331 		/* Zero out the lower 'ptpshift' bits */
1332 		pte32 >>= ptpshift; pte32 <<= ptpshift;
1333 		*gpa = pte32 | (gla & (pgsize - 1));
1334 		goto done;
1335 	}
1336 
1337 	if (paging->paging_mode == PAGING_MODE_PAE) {
1338 		/* Zero out the lower 5 bits and the upper 32 bits */
1339 		ptpphys &= 0xffffffe0UL;
1340 
1341 		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
1342 		if (ptpbase == NULL)
1343 			goto error;
1344 
1345 		ptpindex = (gla >> 30) & 0x3;
1346 
1347 		pte = ptpbase[ptpindex];
1348 
1349 		if ((pte & PG_V) == 0) {
1350 			pfcode = pf_error_code(usermode, prot, 0, pte);
1351 			vm_inject_pf(vm, vcpuid, pfcode, gla);
1352 			goto fault;
1353 		}
1354 
1355 		ptpphys = pte;
1356 
1357 		nlevels = 2;
1358 	} else
1359 		nlevels = 4;
1360 	while (--nlevels >= 0) {
1361 		/* Zero out the lower 12 bits and the upper 12 bits */
1362 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
1363 
1364 		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1365 		if (ptpbase == NULL)
1366 			goto error;
1367 
1368 		ptpshift = PAGE_SHIFT + nlevels * 9;
1369 		ptpindex = (gla >> ptpshift) & 0x1FF;
1370 		pgsize = 1UL << ptpshift;
1371 
1372 		pte = ptpbase[ptpindex];
1373 
1374 		if ((pte & PG_V) == 0 ||
1375 		    (usermode && (pte & PG_U) == 0) ||
1376 		    (writable && (pte & PG_RW) == 0)) {
1377 			pfcode = pf_error_code(usermode, prot, 0, pte);
1378 			vm_inject_pf(vm, vcpuid, pfcode, gla);
1379 			goto fault;
1380 		}
1381 
1382 		/* Set the accessed bit in the page table entry */
1383 		if ((pte & PG_A) == 0) {
1384 			if (atomic_cmpset_64(&ptpbase[ptpindex],
1385 			    pte, pte | PG_A) == 0) {
1386 				goto restart;
1387 			}
1388 		}
1389 
1390 		if (nlevels > 0 && (pte & PG_PS) != 0) {
1391 			if (pgsize > 1 * GB) {
1392 				pfcode = pf_error_code(usermode, prot, 1, pte);
1393 				vm_inject_pf(vm, vcpuid, pfcode, gla);
1394 				goto fault;
1395 			}
1396 			break;
1397 		}
1398 
1399 		ptpphys = pte;
1400 	}
1401 
1402 	/* Set the dirty bit in the page table entry if necessary */
1403 	if (writable && (pte & PG_M) == 0) {
1404 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
1405 			goto restart;
1406 	}
1407 
1408 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
1409 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
1410 	*gpa = pte | (gla & (pgsize - 1));
1411 done:
1412 	ptp_release(&cookie);
1413 	return (retval);
1414 error:
1415 	retval = -1;
1416 	goto done;
1417 fault:
1418 	retval = 1;
1419 	goto done;
1420 }
1421 
1422 int
1423 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1424     uint64_t rip, int inst_length, struct vie *vie)
1425 {
1426 	struct vm_copyinfo copyinfo[2];
1427 	int error, prot;
1428 
1429 	if (inst_length > VIE_INST_SIZE)
1430 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
1431 
1432 	prot = PROT_READ | PROT_EXEC;
1433 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
1434 	    copyinfo, nitems(copyinfo));
1435 	if (error == 0) {
1436 		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
1437 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1438 		vie->num_valid = inst_length;
1439 	}
1440 	return (error);
1441 }
1442 
1443 static int
1444 vie_peek(struct vie *vie, uint8_t *x)
1445 {
1446 
1447 	if (vie->num_processed < vie->num_valid) {
1448 		*x = vie->inst[vie->num_processed];
1449 		return (0);
1450 	} else
1451 		return (-1);
1452 }
1453 
1454 static void
1455 vie_advance(struct vie *vie)
1456 {
1457 
1458 	vie->num_processed++;
1459 }
1460 
1461 static int
1462 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
1463 {
1464 	uint8_t x;
1465 
1466 	while (1) {
1467 		if (vie_peek(vie, &x))
1468 			return (-1);
1469 
1470 		if (x == 0x66)
1471 			vie->opsize_override = 1;
1472 		else if (x == 0x67)
1473 			vie->addrsize_override = 1;
1474 		else
1475 			break;
1476 
1477 		vie_advance(vie);
1478 	}
1479 
1480 	/*
1481 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
1482 	 * - Only one REX prefix is allowed per instruction.
1483 	 * - The REX prefix must immediately precede the opcode byte or the
1484 	 *   escape opcode byte.
1485 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
1486 	 *   the mandatory prefix must come before the REX prefix.
1487 	 */
1488 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
1489 		vie->rex_present = 1;
1490 		vie->rex_w = x & 0x8 ? 1 : 0;
1491 		vie->rex_r = x & 0x4 ? 1 : 0;
1492 		vie->rex_x = x & 0x2 ? 1 : 0;
1493 		vie->rex_b = x & 0x1 ? 1 : 0;
1494 		vie_advance(vie);
1495 	}
1496 
1497 	/*
1498 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
1499 	 */
1500 	if (cpu_mode == CPU_MODE_64BIT) {
1501 		/*
1502 		 * Default address size is 64-bits and default operand size
1503 		 * is 32-bits.
1504 		 */
1505 		vie->addrsize = vie->addrsize_override ? 4 : 8;
1506 		if (vie->rex_w)
1507 			vie->opsize = 8;
1508 		else if (vie->opsize_override)
1509 			vie->opsize = 2;
1510 		else
1511 			vie->opsize = 4;
1512 	} else if (cs_d) {
1513 		/* Default address and operand sizes are 32-bits */
1514 		vie->addrsize = vie->addrsize_override ? 2 : 4;
1515 		vie->opsize = vie->opsize_override ? 2 : 4;
1516 	} else {
1517 		/* Default address and operand sizes are 16-bits */
1518 		vie->addrsize = vie->addrsize_override ? 4 : 2;
1519 		vie->opsize = vie->opsize_override ? 4 : 2;
1520 	}
1521 	return (0);
1522 }
1523 
1524 static int
1525 decode_two_byte_opcode(struct vie *vie)
1526 {
1527 	uint8_t x;
1528 
1529 	if (vie_peek(vie, &x))
1530 		return (-1);
1531 
1532 	vie->op = two_byte_opcodes[x];
1533 
1534 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1535 		return (-1);
1536 
1537 	vie_advance(vie);
1538 	return (0);
1539 }
1540 
1541 static int
1542 decode_opcode(struct vie *vie)
1543 {
1544 	uint8_t x;
1545 
1546 	if (vie_peek(vie, &x))
1547 		return (-1);
1548 
1549 	vie->op = one_byte_opcodes[x];
1550 
1551 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1552 		return (-1);
1553 
1554 	vie_advance(vie);
1555 
1556 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
1557 		return (decode_two_byte_opcode(vie));
1558 
1559 	return (0);
1560 }
1561 
1562 static int
1563 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
1564 {
1565 	uint8_t x;
1566 
1567 	if (cpu_mode == CPU_MODE_REAL)
1568 		return (-1);
1569 
1570 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
1571 		return (0);
1572 
1573 	if (vie_peek(vie, &x))
1574 		return (-1);
1575 
1576 	vie->mod = (x >> 6) & 0x3;
1577 	vie->rm =  (x >> 0) & 0x7;
1578 	vie->reg = (x >> 3) & 0x7;
1579 
1580 	/*
1581 	 * A direct addressing mode makes no sense in the context of an EPT
1582 	 * fault. There has to be a memory access involved to cause the
1583 	 * EPT fault.
1584 	 */
1585 	if (vie->mod == VIE_MOD_DIRECT)
1586 		return (-1);
1587 
1588 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
1589 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
1590 		/*
1591 		 * Table 2-5: Special Cases of REX Encodings
1592 		 *
1593 		 * mod=0, r/m=5 is used in the compatibility mode to
1594 		 * indicate a disp32 without a base register.
1595 		 *
1596 		 * mod!=3, r/m=4 is used in the compatibility mode to
1597 		 * indicate that the SIB byte is present.
1598 		 *
1599 		 * The 'b' bit in the REX prefix is don't care in
1600 		 * this case.
1601 		 */
1602 	} else {
1603 		vie->rm |= (vie->rex_b << 3);
1604 	}
1605 
1606 	vie->reg |= (vie->rex_r << 3);
1607 
1608 	/* SIB */
1609 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
1610 		goto done;
1611 
1612 	vie->base_register = gpr_map[vie->rm];
1613 
1614 	switch (vie->mod) {
1615 	case VIE_MOD_INDIRECT_DISP8:
1616 		vie->disp_bytes = 1;
1617 		break;
1618 	case VIE_MOD_INDIRECT_DISP32:
1619 		vie->disp_bytes = 4;
1620 		break;
1621 	case VIE_MOD_INDIRECT:
1622 		if (vie->rm == VIE_RM_DISP32) {
1623 			vie->disp_bytes = 4;
1624 			/*
1625 			 * Table 2-7. RIP-Relative Addressing
1626 			 *
1627 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
1628 			 * whereas in compatibility mode it just implies disp32.
1629 			 */
1630 
1631 			if (cpu_mode == CPU_MODE_64BIT)
1632 				vie->base_register = VM_REG_GUEST_RIP;
1633 			else
1634 				vie->base_register = VM_REG_LAST;
1635 		}
1636 		break;
1637 	}
1638 
1639 done:
1640 	vie_advance(vie);
1641 
1642 	return (0);
1643 }
1644 
1645 static int
1646 decode_sib(struct vie *vie)
1647 {
1648 	uint8_t x;
1649 
1650 	/* Proceed only if SIB byte is present */
1651 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
1652 		return (0);
1653 
1654 	if (vie_peek(vie, &x))
1655 		return (-1);
1656 
1657 	/* De-construct the SIB byte */
1658 	vie->ss = (x >> 6) & 0x3;
1659 	vie->index = (x >> 3) & 0x7;
1660 	vie->base = (x >> 0) & 0x7;
1661 
1662 	/* Apply the REX prefix modifiers */
1663 	vie->index |= vie->rex_x << 3;
1664 	vie->base |= vie->rex_b << 3;
1665 
1666 	switch (vie->mod) {
1667 	case VIE_MOD_INDIRECT_DISP8:
1668 		vie->disp_bytes = 1;
1669 		break;
1670 	case VIE_MOD_INDIRECT_DISP32:
1671 		vie->disp_bytes = 4;
1672 		break;
1673 	}
1674 
1675 	if (vie->mod == VIE_MOD_INDIRECT &&
1676 	    (vie->base == 5 || vie->base == 13)) {
1677 		/*
1678 		 * Special case when base register is unused if mod = 0
1679 		 * and base = %rbp or %r13.
1680 		 *
1681 		 * Documented in:
1682 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1683 		 * Table 2-5: Special Cases of REX Encodings
1684 		 */
1685 		vie->disp_bytes = 4;
1686 	} else {
1687 		vie->base_register = gpr_map[vie->base];
1688 	}
1689 
1690 	/*
1691 	 * All encodings of 'index' are valid except for %rsp (4).
1692 	 *
1693 	 * Documented in:
1694 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1695 	 * Table 2-5: Special Cases of REX Encodings
1696 	 */
1697 	if (vie->index != 4)
1698 		vie->index_register = gpr_map[vie->index];
1699 
1700 	/* 'scale' makes sense only in the context of an index register */
1701 	if (vie->index_register < VM_REG_LAST)
1702 		vie->scale = 1 << vie->ss;
1703 
1704 	vie_advance(vie);
1705 
1706 	return (0);
1707 }
1708 
1709 static int
1710 decode_displacement(struct vie *vie)
1711 {
1712 	int n, i;
1713 	uint8_t x;
1714 
1715 	union {
1716 		char	buf[4];
1717 		int8_t	signed8;
1718 		int32_t	signed32;
1719 	} u;
1720 
1721 	if ((n = vie->disp_bytes) == 0)
1722 		return (0);
1723 
1724 	if (n != 1 && n != 4)
1725 		panic("decode_displacement: invalid disp_bytes %d", n);
1726 
1727 	for (i = 0; i < n; i++) {
1728 		if (vie_peek(vie, &x))
1729 			return (-1);
1730 
1731 		u.buf[i] = x;
1732 		vie_advance(vie);
1733 	}
1734 
1735 	if (n == 1)
1736 		vie->displacement = u.signed8;		/* sign-extended */
1737 	else
1738 		vie->displacement = u.signed32;		/* sign-extended */
1739 
1740 	return (0);
1741 }
1742 
1743 static int
1744 decode_immediate(struct vie *vie)
1745 {
1746 	int i, n;
1747 	uint8_t x;
1748 	union {
1749 		char	buf[4];
1750 		int8_t	signed8;
1751 		int16_t	signed16;
1752 		int32_t	signed32;
1753 	} u;
1754 
1755 	/* Figure out immediate operand size (if any) */
1756 	if (vie->op.op_flags & VIE_OP_F_IMM) {
1757 		/*
1758 		 * Section 2.2.1.5 "Immediates", Intel SDM:
1759 		 * In 64-bit mode the typical size of immediate operands
1760 		 * remains 32-bits. When the operand size if 64-bits, the
1761 		 * processor sign-extends all immediates to 64-bits prior
1762 		 * to their use.
1763 		 */
1764 		if (vie->opsize == 4 || vie->opsize == 8)
1765 			vie->imm_bytes = 4;
1766 		else
1767 			vie->imm_bytes = 2;
1768 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
1769 		vie->imm_bytes = 1;
1770 	}
1771 
1772 	if ((n = vie->imm_bytes) == 0)
1773 		return (0);
1774 
1775 	KASSERT(n == 1 || n == 2 || n == 4,
1776 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
1777 
1778 	for (i = 0; i < n; i++) {
1779 		if (vie_peek(vie, &x))
1780 			return (-1);
1781 
1782 		u.buf[i] = x;
1783 		vie_advance(vie);
1784 	}
1785 
1786 	/* sign-extend the immediate value before use */
1787 	if (n == 1)
1788 		vie->immediate = u.signed8;
1789 	else if (n == 2)
1790 		vie->immediate = u.signed16;
1791 	else
1792 		vie->immediate = u.signed32;
1793 
1794 	return (0);
1795 }
1796 
1797 static int
1798 decode_moffset(struct vie *vie)
1799 {
1800 	int i, n;
1801 	uint8_t x;
1802 	union {
1803 		char	buf[8];
1804 		uint64_t u64;
1805 	} u;
1806 
1807 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
1808 		return (0);
1809 
1810 	/*
1811 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
1812 	 * The memory offset size follows the address-size of the instruction.
1813 	 */
1814 	n = vie->addrsize;
1815 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
1816 
1817 	u.u64 = 0;
1818 	for (i = 0; i < n; i++) {
1819 		if (vie_peek(vie, &x))
1820 			return (-1);
1821 
1822 		u.buf[i] = x;
1823 		vie_advance(vie);
1824 	}
1825 	vie->displacement = u.u64;
1826 	return (0);
1827 }
1828 
1829 /*
1830  * Verify that all the bytes in the instruction buffer were consumed.
1831  */
1832 static int
1833 verify_inst_length(struct vie *vie)
1834 {
1835 
1836 	if (vie->num_processed)
1837 		return (0);
1838 	else
1839 		return (-1);
1840 }
1841 
1842 /*
1843  * Verify that the 'guest linear address' provided as collateral of the nested
1844  * page table fault matches with our instruction decoding.
1845  */
1846 static int
1847 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
1848 {
1849 	int error;
1850 	uint64_t base, idx, gla2;
1851 
1852 	/* Skip 'gla' verification */
1853 	if (gla == VIE_INVALID_GLA)
1854 		return (0);
1855 
1856 	base = 0;
1857 	if (vie->base_register != VM_REG_LAST) {
1858 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
1859 		if (error) {
1860 			printf("verify_gla: error %d getting base reg %d\n",
1861 				error, vie->base_register);
1862 			return (-1);
1863 		}
1864 
1865 		/*
1866 		 * RIP-relative addressing starts from the following
1867 		 * instruction
1868 		 */
1869 		if (vie->base_register == VM_REG_GUEST_RIP)
1870 			base += vie->num_valid;
1871 	}
1872 
1873 	idx = 0;
1874 	if (vie->index_register != VM_REG_LAST) {
1875 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
1876 		if (error) {
1877 			printf("verify_gla: error %d getting index reg %d\n",
1878 				error, vie->index_register);
1879 			return (-1);
1880 		}
1881 	}
1882 
1883 	/* XXX assuming that the base address of the segment is 0 */
1884 	gla2 = base + vie->scale * idx + vie->displacement;
1885 	gla2 &= size2mask[vie->addrsize];
1886 	if (gla != gla2) {
1887 		printf("verify_gla mismatch: "
1888 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
1889 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
1890 		       base, vie->scale, idx, vie->displacement, gla, gla2);
1891 		return (-1);
1892 	}
1893 
1894 	return (0);
1895 }
1896 
1897 int
1898 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
1899 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
1900 {
1901 
1902 	if (decode_prefixes(vie, cpu_mode, cs_d))
1903 		return (-1);
1904 
1905 	if (decode_opcode(vie))
1906 		return (-1);
1907 
1908 	if (decode_modrm(vie, cpu_mode))
1909 		return (-1);
1910 
1911 	if (decode_sib(vie))
1912 		return (-1);
1913 
1914 	if (decode_displacement(vie))
1915 		return (-1);
1916 
1917 	if (decode_immediate(vie))
1918 		return (-1);
1919 
1920 	if (decode_moffset(vie))
1921 		return (-1);
1922 
1923 	if (verify_inst_length(vie))
1924 		return (-1);
1925 
1926 	if (verify_gla(vm, cpuid, gla, vie))
1927 		return (-1);
1928 
1929 	vie->decoded = 1;	/* success */
1930 
1931 	return (0);
1932 }
1933 #endif	/* _KERNEL */
1934