xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision 95d45410b5100e07f6f98450bcd841a8945d4726)
1 /*-
2  * Copyright (c) 2012 Sandvine, Inc.
3  * Copyright (c) 2012 NetApp, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #ifdef _KERNEL
34 #include <sys/param.h>
35 #include <sys/pcpu.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 
39 #include <vm/vm.h>
40 #include <vm/pmap.h>
41 
42 #include <machine/vmparam.h>
43 #include <machine/vmm.h>
44 #else	/* !_KERNEL */
45 #include <sys/types.h>
46 #include <sys/errno.h>
47 
48 #include <machine/vmm.h>
49 
50 #include <assert.h>
51 #include <vmmapi.h>
52 #define	KASSERT(exp,msg)	assert((exp))
53 #endif	/* _KERNEL */
54 
55 #include <machine/vmm_instruction_emul.h>
56 #include <x86/psl.h>
57 #include <x86/specialreg.h>
58 
59 /* struct vie_op.op_type */
60 enum {
61 	VIE_OP_TYPE_NONE = 0,
62 	VIE_OP_TYPE_MOV,
63 	VIE_OP_TYPE_MOVSX,
64 	VIE_OP_TYPE_MOVZX,
65 	VIE_OP_TYPE_AND,
66 	VIE_OP_TYPE_OR,
67 	VIE_OP_TYPE_TWO_BYTE,
68 	VIE_OP_TYPE_LAST
69 };
70 
71 /* struct vie_op.op_flags */
72 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
73 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
74 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
75 
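/*
 * Opcode dispatch tables, indexed by opcode byte. Entries not listed
 * below are zero-initialized, decode as VIE_OP_TYPE_NONE and are
 * rejected by the decoder.
 */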
76 static const struct vie_op two_byte_opcodes[256] = {
77 	[0xB6] = {
78 		.op_byte = 0xB6,
79 		.op_type = VIE_OP_TYPE_MOVZX,
80 	},
81 	[0xBE] = {
82 		.op_byte = 0xBE,
83 		.op_type = VIE_OP_TYPE_MOVSX,
84 	},
85 };
86 
87 static const struct vie_op one_byte_opcodes[256] = {
88 	[0x0F] = {
89 		.op_byte = 0x0F,
90 		.op_type = VIE_OP_TYPE_TWO_BYTE
91 	},
92 	[0x88] = {
93 		.op_byte = 0x88,
94 		.op_type = VIE_OP_TYPE_MOV,
95 	},
96 	[0x89] = {
97 		.op_byte = 0x89,
98 		.op_type = VIE_OP_TYPE_MOV,
99 	},
100 	[0x8A] = {
101 		.op_byte = 0x8A,
102 		.op_type = VIE_OP_TYPE_MOV,
103 	},
104 	[0x8B] = {
105 		.op_byte = 0x8B,
106 		.op_type = VIE_OP_TYPE_MOV,
107 	},
108 	[0xC6] = {
109 		/* XXX Group 11 extended opcode - not just MOV */
110 		.op_byte = 0xC6,
111 		.op_type = VIE_OP_TYPE_MOV,
112 		.op_flags = VIE_OP_F_IMM8,
113 	},
114 	[0xC7] = {
115 		.op_byte = 0xC7,
116 		.op_type = VIE_OP_TYPE_MOV,
117 		.op_flags = VIE_OP_F_IMM,
118 	},
119 	[0x23] = {
120 		.op_byte = 0x23,
121 		.op_type = VIE_OP_TYPE_AND,
122 	},
123 	[0x81] = {
124 		/* XXX Group 1 extended opcode - not just AND */
125 		.op_byte = 0x81,
126 		.op_type = VIE_OP_TYPE_AND,
127 		.op_flags = VIE_OP_F_IMM,
128 	},
129 	[0x83] = {
130 		/* XXX Group 1 extended opcode - not just OR */
131 		.op_byte = 0x83,
132 		.op_type = VIE_OP_TYPE_OR,
133 		.op_flags = VIE_OP_F_IMM8,
134 	},
135 };
136 
137 /* struct vie.mod */
138 #define	VIE_MOD_INDIRECT		0
139 #define	VIE_MOD_INDIRECT_DISP8		1
140 #define	VIE_MOD_INDIRECT_DISP32		2
141 #define	VIE_MOD_DIRECT			3
142 
143 /* struct vie.rm */
144 #define	VIE_RM_SIB			4
145 #define	VIE_RM_DISP32			5
146 
147 #define	GB				(1024 * 1024 * 1024)
148 
149 static enum vm_reg_name gpr_map[16] = {
150 	VM_REG_GUEST_RAX,
151 	VM_REG_GUEST_RCX,
152 	VM_REG_GUEST_RDX,
153 	VM_REG_GUEST_RBX,
154 	VM_REG_GUEST_RSP,
155 	VM_REG_GUEST_RBP,
156 	VM_REG_GUEST_RSI,
157 	VM_REG_GUEST_RDI,
158 	VM_REG_GUEST_R8,
159 	VM_REG_GUEST_R9,
160 	VM_REG_GUEST_R10,
161 	VM_REG_GUEST_R11,
162 	VM_REG_GUEST_R12,
163 	VM_REG_GUEST_R13,
164 	VM_REG_GUEST_R14,
165 	VM_REG_GUEST_R15
166 };
167 
168 static uint64_t size2mask[] = {
169 	[1] = 0xff,
170 	[2] = 0xffff,
171 	[4] = 0xffffffff,
172 	[8] = 0xffffffffffffffff,
173 };
174 
175 static int
176 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
177 {
178 	int error;
179 
180 	error = vm_get_register(vm, vcpuid, reg, rval);
181 
182 	return (error);
183 }
184 
185 static void
186 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
187 {
188 	*lhbr = 0;
189 	*reg = gpr_map[vie->reg];
190 
191 	/*
192 	 * 64-bit mode imposes limitations on accessing legacy high byte
193 	 * registers (lhbr).
194 	 *
195 	 * The legacy high-byte registers cannot be addressed if the REX
196 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
197 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
198 	 *
199 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
200 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
201 	 * %ah, %ch, %dh and %bh respectively.
202 	 */
203 	if (!vie->rex_present) {
204 		if (vie->reg & 0x4) {
205 			*lhbr = 1;
206 			*reg = gpr_map[vie->reg & 0x3];
207 		}
208 	}
209 }
210 
211 static int
212 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
213 {
214 	uint64_t val;
215 	int error, lhbr;
216 	enum vm_reg_name reg;
217 
218 	vie_calc_bytereg(vie, &reg, &lhbr);
219 	error = vm_get_register(vm, vcpuid, reg, &val);
220 
221 	/*
222 	 * To obtain the value of a legacy high byte register, shift the
223 	 * base register right by 8 bits (%ah = %rax >> 8).
224 	 */
225 	if (lhbr)
226 		*rval = val >> 8;
227 	else
228 		*rval = val;
229 	return (error);
230 }
231 
232 static int
233 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
234 {
235 	uint64_t origval, val, mask;
236 	int error, lhbr;
237 	enum vm_reg_name reg;
238 
239 	vie_calc_bytereg(vie, &reg, &lhbr);
240 	error = vm_get_register(vm, vcpuid, reg, &origval);
241 	if (error == 0) {
242 		val = byte;
243 		mask = 0xff;
244 		if (lhbr) {
245 			/*
246 			 * Shift left by 8 to store 'byte' in a legacy high
247 			 * byte register.
248 			 */
249 			val <<= 8;
250 			mask <<= 8;
251 		}
252 		val |= origval & ~mask;
253 		error = vm_set_register(vm, vcpuid, reg, val);
254 	}
255 	return (error);
256 }
257 
258 int
259 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
260 		    uint64_t val, int size)
261 {
262 	int error;
263 	uint64_t origval;
264 
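	/*
	 * Partial-width writes follow the x86-64 convention: 1- and 2-byte
	 * writes leave the upper bits of the destination register unchanged,
	 * while a 4-byte write zero-extends into the upper 32 bits.
	 */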
265 	switch (size) {
266 	case 1:
267 	case 2:
268 		error = vie_read_register(vm, vcpuid, reg, &origval);
269 		if (error)
270 			return (error);
271 		val &= size2mask[size];
272 		val |= origval & ~size2mask[size];
273 		break;
274 	case 4:
275 		val &= 0xffffffffUL;
276 		break;
277 	case 8:
278 		break;
279 	default:
280 		return (EINVAL);
281 	}
282 
283 	error = vm_set_register(vm, vcpuid, reg, val);
284 	return (error);
285 }
286 
287 static int
288 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
289 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
290 {
291 	int error, size;
292 	enum vm_reg_name reg;
293 	uint8_t byte;
294 	uint64_t val;
295 
296 	size = vie->opsize;
297 	error = EINVAL;
298 
299 	switch (vie->op.op_byte) {
300 	case 0x88:
301 		/*
302 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
303 		 * 88/r:	mov r/m8, r8
304 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
305 		 */
306 		size = 1;	/* override for byte operation */
307 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
308 		if (error == 0)
309 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
310 		break;
311 	case 0x89:
312 		/*
313 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
314 		 * 89/r:	mov r/m16, r16
315 		 * 89/r:	mov r/m32, r32
316 		 * REX.W + 89/r:	mov r/m64, r64
317 		 */
318 		reg = gpr_map[vie->reg];
319 		error = vie_read_register(vm, vcpuid, reg, &val);
320 		if (error == 0) {
321 			val &= size2mask[size];
322 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
323 		}
324 		break;
325 	case 0x8A:
326 		/*
327 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
328 		 * 8A/r:	mov r8, r/m8
329 		 * REX + 8A/r:	mov r8, r/m8
330 		 */
331 		size = 1;	/* override for byte operation */
332 		error = memread(vm, vcpuid, gpa, &val, size, arg);
333 		if (error == 0)
334 			error = vie_write_bytereg(vm, vcpuid, vie, val);
335 		break;
336 	case 0x8B:
337 		/*
338 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
339 		 * 8B/r:	mov r16, r/m16
340 		 * 8B/r:	mov r32, r/m32
341 		 * REX.W + 8B/r:	mov r64, r/m64
342 		 */
343 		error = memread(vm, vcpuid, gpa, &val, size, arg);
344 		if (error == 0) {
345 			reg = gpr_map[vie->reg];
346 			error = vie_update_register(vm, vcpuid, reg, val, size);
347 		}
348 		break;
349 	case 0xC6:
350 		/*
351 		 * MOV from imm8 to mem (ModRM:r/m)
352 		 * C6/0		mov r/m8, imm8
353 		 * REX + C6/0	mov r/m8, imm8
354 		 */
355 		size = 1;	/* override for byte operation */
356 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
357 		break;
358 	case 0xC7:
359 		/*
360 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
361 		 * C7/0		mov r/m16, imm16
362 		 * C7/0		mov r/m32, imm32
363 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
364 		 */
365 		val = vie->immediate & size2mask[size];
366 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
367 		break;
368 	default:
369 		break;
370 	}
371 
372 	return (error);
373 }
374 
375 static int
376 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
377 	     mem_region_read_t memread, mem_region_write_t memwrite,
378 	     void *arg)
379 {
380 	int error, size;
381 	enum vm_reg_name reg;
382 	uint64_t val;
383 
384 	size = vie->opsize;
385 	error = EINVAL;
386 
387 	switch (vie->op.op_byte) {
388 	case 0xB6:
389 		/*
390 		 * MOV and zero extend byte from mem (ModRM:r/m) to
391 		 * reg (ModRM:reg).
392 		 *
393 		 * 0F B6/r		movzx r16, r/m8
394 		 * 0F B6/r		movzx r32, r/m8
395 		 * REX.W + 0F B6/r	movzx r64, r/m8
396 		 */
397 
398 		/* get the first operand */
399 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
400 		if (error)
401 			break;
402 
403 		/* get the second operand */
404 		reg = gpr_map[vie->reg];
405 
406 		/* zero-extend byte */
407 		val = (uint8_t)val;
408 
409 		/* write the result */
410 		error = vie_update_register(vm, vcpuid, reg, val, size);
411 		break;
412 	case 0xBE:
413 		/*
414 		 * MOV and sign extend byte from mem (ModRM:r/m) to
415 		 * reg (ModRM:reg).
416 		 *
417 		 * 0F BE/r		movsx r16, r/m8
418 		 * 0F BE/r		movsx r32, r/m8
419 		 * REX.W + 0F BE/r	movsx r64, r/m8
420 		 */
421 
422 		/* get the first operand */
423 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
424 		if (error)
425 			break;
426 
427 		/* get the second operand */
428 		reg = gpr_map[vie->reg];
429 
430 		/* sign extend byte */
431 		val = (int8_t)val;
432 
433 		/* write the result */
434 		error = vie_update_register(vm, vcpuid, reg, val, size);
435 		break;
436 	default:
437 		break;
438 	}
439 	return (error);
440 }
441 
442 static int
443 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
444 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
445 {
446 	int error, size;
447 	enum vm_reg_name reg;
448 	uint64_t val1, val2;
449 
450 	size = vie->opsize;
451 	error = EINVAL;
452 
453 	switch (vie->op.op_byte) {
454 	case 0x23:
455 		/*
456 		 * AND reg (ModRM:reg) with mem (ModRM:r/m) and store the
457 		 * result in reg.
458 		 *
459 		 * 23/r		and r16, r/m16
460 		 * 23/r		and r32, r/m32
461 		 * REX.W + 23/r	and r64, r/m64
462 		 */
463 
464 		/* get the first operand */
465 		reg = gpr_map[vie->reg];
466 		error = vie_read_register(vm, vcpuid, reg, &val1);
467 		if (error)
468 			break;
469 
470 		/* get the second operand */
471 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
472 		if (error)
473 			break;
474 
475 		/* perform the operation and write the result */
476 		val1 &= val2;
477 		error = vie_update_register(vm, vcpuid, reg, val1, size);
478 		break;
479 	case 0x81:
480 		/*
481 		 * AND mem (ModRM:r/m) with immediate and store the
482 		 * result in mem.
483 		 *
484 		 * 81 /4		and r/m16, imm16
485 		 * 81 /4		and r/m32, imm32
486 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
487 		 *
488 		 * Currently, only the AND operation of the 0x81 opcode
489 		 * is implemented (ModRM:reg = b100).
490 		 */
491 		if ((vie->reg & 7) != 4)
492 			break;
493 
494 		/* get the first operand */
495 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
496 		if (error)
497 			break;
498 
499 		/*
500 		 * perform the operation with the pre-fetched immediate
501 		 * operand and write the result
502 		 */
503 		val1 &= vie->immediate;
504 		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
505 		break;
506 	default:
507 		break;
508 	}
509 	return (error);
510 }
511 
512 static int
513 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
514 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
515 {
516 	int error, size;
517 	uint64_t val1;
518 
519 	size = vie->opsize;
520 	error = EINVAL;
521 
522 	switch (vie->op.op_byte) {
523 	case 0x83:
524 		/*
525 		 * OR mem (ModRM:r/m) with immediate and store the
526 		 * result in mem.
527 		 *
528 		 * 83 /1		OR r/m16, imm8 sign-extended to 16
529 		 * 83 /1		OR r/m32, imm8 sign-extended to 32
530 		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
531 		 *
532 		 * Currently, only the OR operation of the 0x83 opcode
533 		 * is implemented (ModRM:reg = b001).
534 		 */
535 		if ((vie->reg & 7) != 1)
536 			break;
537 
538 		/* get the first operand */
539 		error = memread(vm, vcpuid, gpa, &val1, size, arg);
540 		if (error)
541 			break;
542 
543 		/*
544 		 * perform the operation with the pre-fetched immediate
545 		 * operand and write the result
546 		 */
547 		val1 |= vie->immediate;
548 		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
549 		break;
550 	default:
551 		break;
552 	}
553 	return (error);
554 }
555 
556 int
557 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
558 			mem_region_read_t memread, mem_region_write_t memwrite,
559 			void *memarg)
560 {
561 	int error;
562 
563 	if (!vie->decoded)
564 		return (EINVAL);
565 
566 	switch (vie->op.op_type) {
567 	case VIE_OP_TYPE_MOV:
568 		error = emulate_mov(vm, vcpuid, gpa, vie,
569 				    memread, memwrite, memarg);
570 		break;
571 	case VIE_OP_TYPE_MOVSX:
572 	case VIE_OP_TYPE_MOVZX:
573 		error = emulate_movx(vm, vcpuid, gpa, vie,
574 				     memread, memwrite, memarg);
575 		break;
576 	case VIE_OP_TYPE_AND:
577 		error = emulate_and(vm, vcpuid, gpa, vie,
578 				    memread, memwrite, memarg);
579 		break;
580 	case VIE_OP_TYPE_OR:
581 		error = emulate_or(vm, vcpuid, gpa, vie,
582 				    memread, memwrite, memarg);
583 		break;
584 	default:
585 		error = EINVAL;
586 		break;
587 	}
588 
589 	return (error);
590 }
591 
592 int
593 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
594 {
595 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
596 	    ("%s: invalid size %d", __func__, size));
597 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
598 
599 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
600 		return (0);
601 
602 	return ((gla & (size - 1)) ? 1 : 0);
603 }
604 
605 int
606 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
607 {
608 	uint64_t mask;
609 
610 	if (cpu_mode != CPU_MODE_64BIT)
611 		return (0);
612 
613 	/*
614 	 * The value of bit 47 in 'gla' must be replicated in the
615 	 * most significant 16 bits (i.e. the address must be canonical).
616 	 */
617 	mask = ~((1UL << 48) - 1);
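	/* mask == 0xffff000000000000; bits 63:48 must equal bit 47 */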
618 	if (gla & (1UL << 47))
619 		return ((gla & mask) != mask);
620 	else
621 		return ((gla & mask) != 0);
622 }
623 
624 uint64_t
625 vie_size2mask(int size)
626 {
627 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
628 	    ("vie_size2mask: invalid size %d", size));
629 	return (size2mask[size]);
630 }
631 
632 int
633 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
634     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
635     int prot, uint64_t *gla)
636 {
637 	uint64_t firstoff, low_limit, high_limit, segbase;
638 	int glasize, type;
639 
640 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
641 	    ("%s: invalid segment %d", __func__, seg));
642 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
643 	    ("%s: invalid operand size %d", __func__, length));
644 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
645 	    ("%s: invalid prot %#x", __func__, prot));
646 
647 	firstoff = offset;
648 	if (cpu_mode == CPU_MODE_64BIT) {
649 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
650 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
651 		glasize = 8;
652 	} else {
653 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
654 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
655 		glasize = 4;
656 		/*
657 		 * If the segment selector is loaded with a NULL selector
658 		 * then the descriptor is unusable and attempting to use
659 		 * it results in a #GP(0).
660 		 */
661 		if (SEG_DESC_UNUSABLE(desc->access))
662 			return (-1);
663 
664 		/*
665 		 * The processor generates a #NP exception when a segment
666 		 * register is loaded with a selector that points to a
667 		 * descriptor that is not present. If this was the case then
668 		 * it would have been checked before the VM-exit.
669 		 */
670 		KASSERT(SEG_DESC_PRESENT(desc->access),
671 		    ("segment %d not present: %#x", seg, desc->access));
672 
673 		/*
674 		 * The descriptor type must indicate a code/data segment.
675 		 */
676 		type = SEG_DESC_TYPE(desc->access);
677 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
678 		    "descriptor type %#x", seg, type));
679 
680 		if (prot & PROT_READ) {
681 			/* #GP on a read access to an execute-only code segment */
682 			if ((type & 0xA) == 0x8)
683 				return (-1);
684 		}
685 
686 		if (prot & PROT_WRITE) {
687 			/*
688 			 * #GP on a write access to a code segment or a
689 			 * read-only data segment.
690 			 */
691 			if (type & 0x8)			/* code segment */
692 				return (-1);
693 
694 			if ((type & 0xA) == 0)		/* read-only data seg */
695 				return (-1);
696 		}
697 
698 		/*
699 		 * 'desc->limit' is fully expanded taking granularity into
700 		 * account.
701 		 */
702 		if ((type & 0xC) == 0x4) {
703 			/* expand-down data segment */
704 			low_limit = desc->limit + 1;
705 			high_limit = SEG_DESC_DEF32(desc->access) ?
706 			    0xffffffff : 0xffff;
707 		} else {
708 			/* code segment or expand-up data segment */
709 			low_limit = 0;
710 			high_limit = desc->limit;
711 		}
712 
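		/*
		 * Check each byte of the access against the segment limits,
		 * letting the offset wrap at the address-size boundary.
		 */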
713 		while (length > 0) {
714 			offset &= vie_size2mask(addrsize);
715 			if (offset < low_limit || offset > high_limit)
716 				return (-1);
717 			offset++;
718 			length--;
719 		}
720 	}
721 
722 	/*
723 	 * In 64-bit mode all segments except %fs and %gs have a segment
724 	 * base address of 0.
725 	 */
726 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
727 	    seg != VM_REG_GUEST_GS) {
728 		segbase = 0;
729 	} else {
730 		segbase = desc->base;
731 	}
732 
733 	/*
734 	 * Truncate 'firstoff' to the effective address size before adding
735 	 * it to the segment base.
736 	 */
737 	firstoff &= vie_size2mask(addrsize);
738 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
739 	return (0);
740 }
741 
742 #ifdef _KERNEL
743 void
744 vie_init(struct vie *vie)
745 {
746 
747 	bzero(vie, sizeof(struct vie));
748 
749 	vie->base_register = VM_REG_LAST;
750 	vie->index_register = VM_REG_LAST;
751 }
752 
753 static int
754 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
755 {
756 	int error_code = 0;
757 
758 	if (pte & PG_V)
759 		error_code |= PGEX_P;
760 	if (prot & VM_PROT_WRITE)
761 		error_code |= PGEX_W;
762 	if (usermode)
763 		error_code |= PGEX_U;
764 	if (rsvd)
765 		error_code |= PGEX_RSV;
766 	if (prot & VM_PROT_EXECUTE)
767 		error_code |= PGEX_I;
768 
769 	return (error_code);
770 }
771 
772 static void
773 ptp_release(void **cookie)
774 {
775 	if (*cookie != NULL) {
776 		vm_gpa_release(*cookie);
777 		*cookie = NULL;
778 	}
779 }
780 
781 static void *
782 ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
783 {
784 	void *ptr;
785 
786 	ptp_release(cookie);
787 	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
788 	return (ptr);
789 }
790 
791 int
792 vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
793     uint64_t gla, int prot, uint64_t *gpa)
794 {
795 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
796 	u_int retries;
797 	uint64_t *ptpbase, ptpphys, pte, pgsize;
798 	uint32_t *ptpbase32, pte32;
799 	void *cookie;
800 
801 	usermode = (paging->cpl == 3 ? 1 : 0);
802 	writable = prot & VM_PROT_WRITE;
803 	cookie = NULL;
804 	retval = 0;
805 	retries = 0;
806 restart:
807 	ptpphys = paging->cr3;		/* root of the page tables */
808 	ptp_release(&cookie);
809 	if (retries++ > 0)
810 		maybe_yield();
811 
812 	if (vie_canonical_check(paging->cpu_mode, gla)) {
813 		/*
814 		 * XXX assuming a non-stack reference; otherwise a stack fault
815 		 * should be generated.
816 		 */
817 		vm_inject_gp(vm, vcpuid);
818 		goto fault;
819 	}
820 
821 	if (paging->paging_mode == PAGING_MODE_FLAT) {
822 		*gpa = gla;
823 		goto done;
824 	}
825 
826 	if (paging->paging_mode == PAGING_MODE_32) {
827 		nlevels = 2;
828 		while (--nlevels >= 0) {
829 			/* Zero out the lower 12 bits. */
830 			ptpphys &= ~0xfff;
831 
832 			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
833 
834 			if (ptpbase32 == NULL)
835 				goto error;
836 
837 			ptpshift = PAGE_SHIFT + nlevels * 10;
838 			ptpindex = (gla >> ptpshift) & 0x3FF;
839 			pgsize = 1UL << ptpshift;
840 
841 			pte32 = ptpbase32[ptpindex];
842 
843 			if ((pte32 & PG_V) == 0 ||
844 			    (usermode && (pte32 & PG_U) == 0) ||
845 			    (writable && (pte32 & PG_RW) == 0)) {
846 				pfcode = pf_error_code(usermode, prot, 0,
847 				    pte32);
848 				vm_inject_pf(vm, vcpuid, pfcode, gla);
849 				goto fault;
850 			}
851 
852 			/*
853 			 * Emulate the x86 MMU's management of the accessed
854 			 * and dirty flags. While the accessed flag is set
855 			 * at every level of the page table, the dirty flag
856 			 * is only set at the last level providing the guest
857 			 * physical address.
858 			 */
859 			if ((pte32 & PG_A) == 0) {
860 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
861 				    pte32, pte32 | PG_A) == 0) {
862 					goto restart;
863 				}
864 			}
865 
866 			/* XXX must be ignored if CR4.PSE=0 */
867 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
868 				break;
869 
870 			ptpphys = pte32;
871 		}
872 
873 		/* Set the dirty bit in the page table entry if necessary */
874 		if (writable && (pte32 & PG_M) == 0) {
875 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
876 			    pte32, pte32 | PG_M) == 0) {
877 				goto restart;
878 			}
879 		}
880 
881 		/* Zero out the lower 'ptpshift' bits */
882 		pte32 >>= ptpshift; pte32 <<= ptpshift;
883 		*gpa = pte32 | (gla & (pgsize - 1));
884 		goto done;
885 	}
886 
887 	if (paging->paging_mode == PAGING_MODE_PAE) {
888 		/* Zero out the lower 5 bits and the upper 32 bits */
889 		ptpphys &= 0xffffffe0UL;
890 
891 		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
892 		if (ptpbase == NULL)
893 			goto error;
894 
895 		ptpindex = (gla >> 30) & 0x3;
896 
897 		pte = ptpbase[ptpindex];
898 
899 		if ((pte & PG_V) == 0) {
900 			pfcode = pf_error_code(usermode, prot, 0, pte);
901 			vm_inject_pf(vm, vcpuid, pfcode, gla);
902 			goto fault;
903 		}
904 
905 		ptpphys = pte;
906 
907 		nlevels = 2;
908 	} else
909 		nlevels = 4;
910 	while (--nlevels >= 0) {
911 		/* Zero out the lower 12 bits and the upper 12 bits */
912 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
913 
914 		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
915 		if (ptpbase == NULL)
916 			goto error;
917 
918 		ptpshift = PAGE_SHIFT + nlevels * 9;
919 		ptpindex = (gla >> ptpshift) & 0x1FF;
920 		pgsize = 1UL << ptpshift;
921 
922 		pte = ptpbase[ptpindex];
923 
924 		if ((pte & PG_V) == 0 ||
925 		    (usermode && (pte & PG_U) == 0) ||
926 		    (writable && (pte & PG_RW) == 0)) {
927 			pfcode = pf_error_code(usermode, prot, 0, pte);
928 			vm_inject_pf(vm, vcpuid, pfcode, gla);
929 			goto fault;
930 		}
931 
932 		/* Set the accessed bit in the page table entry */
933 		if ((pte & PG_A) == 0) {
934 			if (atomic_cmpset_64(&ptpbase[ptpindex],
935 			    pte, pte | PG_A) == 0) {
936 				goto restart;
937 			}
938 		}
939 
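		/*
		 * PG_PS at this level maps a large page (2MB at the PDE,
		 * 1GB at the PDPTE). A page size larger than 1GB means
		 * PG_PS was set at the PML4E, which is a reserved-bit
		 * violation.
		 */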
940 		if (nlevels > 0 && (pte & PG_PS) != 0) {
941 			if (pgsize > 1 * GB) {
942 				pfcode = pf_error_code(usermode, prot, 1, pte);
943 				vm_inject_pf(vm, vcpuid, pfcode, gla);
944 				goto fault;
945 			}
946 			break;
947 		}
948 
949 		ptpphys = pte;
950 	}
951 
952 	/* Set the dirty bit in the page table entry if necessary */
953 	if (writable && (pte & PG_M) == 0) {
954 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
955 			goto restart;
956 	}
957 
958 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
959 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
960 	*gpa = pte | (gla & (pgsize - 1));
961 done:
962 	ptp_release(&cookie);
963 	return (retval);
964 error:
965 	retval = -1;
966 	goto done;
967 fault:
968 	retval = 1;
969 	goto done;
970 }
971 
972 int
973 vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
974     uint64_t rip, int inst_length, struct vie *vie)
975 {
976 	int n, error, prot;
977 	uint64_t gpa, off;
978 	void *hpa, *cookie;
979 
980 	/*
981 	 * XXX cache previously fetched instructions using 'rip' as the tag
982 	 */
983 
984 	prot = VM_PROT_READ | VM_PROT_EXECUTE;
985 	if (inst_length > VIE_INST_SIZE)
986 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
987 
988 	/* Copy the instruction into 'vie' */
989 	while (vie->num_valid < inst_length) {
990 		error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
991 		if (error)
992 			return (error);
993 
994 		off = gpa & PAGE_MASK;
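		/* Copy no more than what remains in the current page */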
995 		n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
996 
997 		if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
998 			break;
999 
1000 		bcopy(hpa, &vie->inst[vie->num_valid], n);
1001 
1002 		vm_gpa_release(cookie);
1003 
1004 		rip += n;
1005 		vie->num_valid += n;
1006 	}
1007 
1008 	if (vie->num_valid == inst_length)
1009 		return (0);
1010 	else
1011 		return (-1);
1012 }
1013 
1014 static int
1015 vie_peek(struct vie *vie, uint8_t *x)
1016 {
1017 
1018 	if (vie->num_processed < vie->num_valid) {
1019 		*x = vie->inst[vie->num_processed];
1020 		return (0);
1021 	} else
1022 		return (-1);
1023 }
1024 
1025 static void
1026 vie_advance(struct vie *vie)
1027 {
1028 
1029 	vie->num_processed++;
1030 }
1031 
1032 static int
1033 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
1034 {
1035 	uint8_t x;
1036 
1037 	while (1) {
1038 		if (vie_peek(vie, &x))
1039 			return (-1);
1040 
1041 		if (x == 0x66)
1042 			vie->opsize_override = 1;
1043 		else if (x == 0x67)
1044 			vie->addrsize_override = 1;
1045 		else
1046 			break;
1047 
1048 		vie_advance(vie);
1049 	}
1050 
1051 	/*
1052 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
1053 	 * - Only one REX prefix is allowed per instruction.
1054 	 * - The REX prefix must immediately precede the opcode byte or the
1055 	 *   escape opcode byte.
1056 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
1057 	 *   the mandatory prefix must come before the REX prefix.
1058 	 */
1059 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
1060 		vie->rex_present = 1;
1061 		vie->rex_w = x & 0x8 ? 1 : 0;
1062 		vie->rex_r = x & 0x4 ? 1 : 0;
1063 		vie->rex_x = x & 0x2 ? 1 : 0;
1064 		vie->rex_b = x & 0x1 ? 1 : 0;
1065 		vie_advance(vie);
1066 	}
1067 
1068 	/*
1069 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
1070 	 */
1071 	if (cpu_mode == CPU_MODE_64BIT) {
1072 		/*
1073 		 * Default address size is 64-bits and default operand size
1074 		 * is 32-bits.
1075 		 */
1076 		vie->addrsize = vie->addrsize_override ? 4 : 8;
1077 		if (vie->rex_w)
1078 			vie->opsize = 8;
1079 		else if (vie->opsize_override)
1080 			vie->opsize = 2;
1081 		else
1082 			vie->opsize = 4;
1083 	} else if (cs_d) {
1084 		/* Default address and operand sizes are 32-bits */
1085 		vie->addrsize = vie->addrsize_override ? 2 : 4;
1086 		vie->opsize = vie->opsize_override ? 2 : 4;
1087 	} else {
1088 		/* Default address and operand sizes are 16-bits */
1089 		vie->addrsize = vie->addrsize_override ? 4 : 2;
1090 		vie->opsize = vie->opsize_override ? 4 : 2;
1091 	}
1092 	return (0);
1093 }
1094 
1095 static int
1096 decode_two_byte_opcode(struct vie *vie)
1097 {
1098 	uint8_t x;
1099 
1100 	if (vie_peek(vie, &x))
1101 		return (-1);
1102 
1103 	vie->op = two_byte_opcodes[x];
1104 
1105 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1106 		return (-1);
1107 
1108 	vie_advance(vie);
1109 	return (0);
1110 }
1111 
1112 static int
1113 decode_opcode(struct vie *vie)
1114 {
1115 	uint8_t x;
1116 
1117 	if (vie_peek(vie, &x))
1118 		return (-1);
1119 
1120 	vie->op = one_byte_opcodes[x];
1121 
1122 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1123 		return (-1);
1124 
1125 	vie_advance(vie);
1126 
1127 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
1128 		return (decode_two_byte_opcode(vie));
1129 
1130 	return (0);
1131 }
1132 
1133 static int
1134 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
1135 {
1136 	uint8_t x;
1137 
1138 	if (cpu_mode == CPU_MODE_REAL)
1139 		return (-1);
1140 
1141 	if (vie_peek(vie, &x))
1142 		return (-1);
1143 
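	/* ModRM byte layout: mod in bits 7:6, reg in bits 5:3, r/m in bits 2:0 */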
1144 	vie->mod = (x >> 6) & 0x3;
1145 	vie->rm =  (x >> 0) & 0x7;
1146 	vie->reg = (x >> 3) & 0x7;
1147 
1148 	/*
1149 	 * A direct addressing mode makes no sense in the context of an EPT
1150 	 * fault. There has to be a memory access involved to cause the
1151 	 * EPT fault.
1152 	 */
1153 	if (vie->mod == VIE_MOD_DIRECT)
1154 		return (-1);
1155 
1156 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
1157 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
1158 		/*
1159 		 * Table 2-5: Special Cases of REX Encodings
1160 		 *
1161 		 * mod=0, r/m=5 is used in the compatibility mode to
1162 		 * indicate a disp32 without a base register.
1163 		 *
1164 		 * mod!=3, r/m=4 is used in the compatibility mode to
1165 		 * indicate that the SIB byte is present.
1166 		 *
1167 		 * The 'b' bit in the REX prefix is don't care in
1168 		 * this case.
1169 		 */
1170 	} else {
1171 		vie->rm |= (vie->rex_b << 3);
1172 	}
1173 
1174 	vie->reg |= (vie->rex_r << 3);
1175 
1176 	/* SIB */
1177 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
1178 		goto done;
1179 
1180 	vie->base_register = gpr_map[vie->rm];
1181 
1182 	switch (vie->mod) {
1183 	case VIE_MOD_INDIRECT_DISP8:
1184 		vie->disp_bytes = 1;
1185 		break;
1186 	case VIE_MOD_INDIRECT_DISP32:
1187 		vie->disp_bytes = 4;
1188 		break;
1189 	case VIE_MOD_INDIRECT:
1190 		if (vie->rm == VIE_RM_DISP32) {
1191 			vie->disp_bytes = 4;
1192 			/*
1193 			 * Table 2-7. RIP-Relative Addressing
1194 			 *
1195 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
1196 			 * whereas in compatibility mode it just implies disp32.
1197 			 */
1198 
1199 			if (cpu_mode == CPU_MODE_64BIT)
1200 				vie->base_register = VM_REG_GUEST_RIP;
1201 			else
1202 				vie->base_register = VM_REG_LAST;
1203 		}
1204 		break;
1205 	}
1206 
1207 done:
1208 	vie_advance(vie);
1209 
1210 	return (0);
1211 }
1212 
1213 static int
1214 decode_sib(struct vie *vie)
1215 {
1216 	uint8_t x;
1217 
1218 	/* Proceed only if SIB byte is present */
1219 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
1220 		return (0);
1221 
1222 	if (vie_peek(vie, &x))
1223 		return (-1);
1224 
1225 	/* De-construct the SIB byte */
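	/* scale in bits 7:6, index in bits 5:3, base in bits 2:0 */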
1226 	vie->ss = (x >> 6) & 0x3;
1227 	vie->index = (x >> 3) & 0x7;
1228 	vie->base = (x >> 0) & 0x7;
1229 
1230 	/* Apply the REX prefix modifiers */
1231 	vie->index |= vie->rex_x << 3;
1232 	vie->base |= vie->rex_b << 3;
1233 
1234 	switch (vie->mod) {
1235 	case VIE_MOD_INDIRECT_DISP8:
1236 		vie->disp_bytes = 1;
1237 		break;
1238 	case VIE_MOD_INDIRECT_DISP32:
1239 		vie->disp_bytes = 4;
1240 		break;
1241 	}
1242 
1243 	if (vie->mod == VIE_MOD_INDIRECT &&
1244 	    (vie->base == 5 || vie->base == 13)) {
1245 		/*
1246 		 * Special case when base register is unused if mod = 0
1247 		 * and base = %rbp or %r13.
1248 		 *
1249 		 * Documented in:
1250 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1251 		 * Table 2-5: Special Cases of REX Encodings
1252 		 */
1253 		vie->disp_bytes = 4;
1254 	} else {
1255 		vie->base_register = gpr_map[vie->base];
1256 	}
1257 
1258 	/*
1259 	 * All encodings of 'index' are valid except for %rsp (4).
1260 	 *
1261 	 * Documented in:
1262 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1263 	 * Table 2-5: Special Cases of REX Encodings
1264 	 */
1265 	if (vie->index != 4)
1266 		vie->index_register = gpr_map[vie->index];
1267 
1268 	/* 'scale' makes sense only in the context of an index register */
1269 	if (vie->index_register < VM_REG_LAST)
1270 		vie->scale = 1 << vie->ss;
1271 
1272 	vie_advance(vie);
1273 
1274 	return (0);
1275 }
1276 
1277 static int
1278 decode_displacement(struct vie *vie)
1279 {
1280 	int n, i;
1281 	uint8_t x;
1282 
1283 	union {
1284 		char	buf[4];
1285 		int8_t	signed8;
1286 		int32_t	signed32;
1287 	} u;
1288 
1289 	if ((n = vie->disp_bytes) == 0)
1290 		return (0);
1291 
1292 	if (n != 1 && n != 4)
1293 		panic("decode_displacement: invalid disp_bytes %d", n);
1294 
1295 	for (i = 0; i < n; i++) {
1296 		if (vie_peek(vie, &x))
1297 			return (-1);
1298 
1299 		u.buf[i] = x;
1300 		vie_advance(vie);
1301 	}
1302 
1303 	if (n == 1)
1304 		vie->displacement = u.signed8;		/* sign-extended */
1305 	else
1306 		vie->displacement = u.signed32;		/* sign-extended */
1307 
1308 	return (0);
1309 }
1310 
1311 static int
1312 decode_immediate(struct vie *vie)
1313 {
1314 	int i, n;
1315 	uint8_t x;
1316 	union {
1317 		char	buf[8];
1318 		int8_t	signed8;
1319 		int16_t	signed16;
1320 		int32_t	signed32;
1321 		int64_t	signed64;
1322 	} u;
1323 
1324 	/* Figure out immediate operand size (if any) */
1325 	if (vie->op.op_flags & VIE_OP_F_MOFFSET) {
1326 		/*
1327 		 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
1328 		 * The memory offset size follows the address-size of the
1329 		 * instruction. Although this is treated as an immediate
1330 		 * value during instruction decoding, it is interpreted as
1331 		 * a segment offset by the instruction emulation.
1332 		 */
1333 		vie->imm_bytes = vie->addrsize;
1334 	} else if (vie->op.op_flags & VIE_OP_F_IMM) {
1335 		/*
1336 		 * Section 2.2.1.5 "Immediates", Intel SDM:
1337 		 * In 64-bit mode the typical size of immediate operands
1338 		 * remains 32 bits. When the operand size is 64 bits, the
1339 		 * processor sign-extends all immediates to 64-bits prior
1340 		 * to their use.
1341 		 */
1342 		if (vie->opsize == 4 || vie->opsize == 8)
1343 			vie->imm_bytes = 4;
1344 		else
1345 			vie->imm_bytes = 2;
1346 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
1347 		vie->imm_bytes = 1;
1348 	}
1349 
1350 	if ((n = vie->imm_bytes) == 0)
1351 		return (0);
1352 
1353 	KASSERT(n == 1 || n == 2 || n == 4 || n == 8,
1354 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
1355 
1356 	for (i = 0; i < n; i++) {
1357 		if (vie_peek(vie, &x))
1358 			return (-1);
1359 
1360 		u.buf[i] = x;
1361 		vie_advance(vie);
1362 	}
1363 
1364 	/* sign-extend the immediate value before use */
1365 	if (n == 1)
1366 		vie->immediate = u.signed8;
1367 	else if (n == 2)
1368 		vie->immediate = u.signed16;
1369 	else if (n == 4)
1370 		vie->immediate = u.signed32;
1371 	else
1372 		vie->immediate = u.signed64;
1373 
1374 
1375 	if (vie->op.op_flags & VIE_OP_F_MOFFSET) {
1376 		/*
1377 		 * If the immediate value is going to be interpreted as a
1378 		 * segment offset then undo the sign-extension above.
1379 		 */
1380 		vie->immediate &= size2mask[n];
1381 	}
1382 
1383 	return (0);
1384 }
1385 
1386 /*
1387  * Verify that all the bytes in the instruction buffer were consumed.
1388  */
1389 static int
1390 verify_inst_length(struct vie *vie)
1391 {
1392 
1393 	if (vie->num_processed == vie->num_valid)
1394 		return (0);
1395 	else
1396 		return (-1);
1397 }
1398 
1399 /*
1400  * Verify that the 'guest linear address' provided as collateral of the nested
1401  * page table fault matches with our instruction decoding.
1402  */
1403 static int
1404 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
1405 {
1406 	int error;
1407 	uint64_t base, idx, gla2;
1408 
1409 	/* Skip 'gla' verification */
1410 	if (gla == VIE_INVALID_GLA)
1411 		return (0);
1412 
1413 	base = 0;
1414 	if (vie->base_register != VM_REG_LAST) {
1415 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
1416 		if (error) {
1417 			printf("verify_gla: error %d getting base reg %d\n",
1418 				error, vie->base_register);
1419 			return (-1);
1420 		}
1421 
1422 		/*
1423 		 * RIP-relative addressing starts from the following
1424 		 * instruction
1425 		 */
1426 		if (vie->base_register == VM_REG_GUEST_RIP)
1427 			base += vie->num_valid;
1428 	}
1429 
1430 	idx = 0;
1431 	if (vie->index_register != VM_REG_LAST) {
1432 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
1433 		if (error) {
1434 			printf("verify_gla: error %d getting index reg %d\n",
1435 				error, vie->index_register);
1436 			return (-1);
1437 		}
1438 	}
1439 
1440 	/* XXX assuming that the base address of the segment is 0 */
1441 	gla2 = base + vie->scale * idx + vie->displacement;
1442 	gla2 &= size2mask[vie->addrsize];
1443 	if (gla != gla2) {
1444 		printf("verify_gla mismatch: "
1445 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
1446 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
1447 		       base, vie->scale, idx, vie->displacement, gla, gla2);
1448 		return (-1);
1449 	}
1450 
1451 	return (0);
1452 }
1453 
1454 int
1455 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
1456 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
1457 {
1458 
1459 	if (decode_prefixes(vie, cpu_mode, cs_d))
1460 		return (-1);
1461 
1462 	if (decode_opcode(vie))
1463 		return (-1);
1464 
1465 	if (decode_modrm(vie, cpu_mode))
1466 		return (-1);
1467 
1468 	if (decode_sib(vie))
1469 		return (-1);
1470 
1471 	if (decode_displacement(vie))
1472 		return (-1);
1473 
1474 	if (decode_immediate(vie))
1475 		return (-1);
1476 
1477 	if (verify_inst_length(vie))
1478 		return (-1);
1479 
1480 	if (verify_gla(vm, cpuid, gla, vie))
1481 		return (-1);
1482 
1483 	vie->decoded = 1;	/* success */
1484 
1485 	return (0);
1486 }
1487 #endif	/* _KERNEL */
1488