xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_instruction_emul.c (revision b3783300013fa93b98278c901b855062f538f7e2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 /*
32  * This file and its contents are supplied under the terms of the
33  * Common Development and Distribution License ("CDDL"), version 1.0.
34  * You may only use this file in accordance with the terms of version
35  * 1.0 of the CDDL.
36  *
37  * A full copy of the text of the CDDL should have accompanied this
38  * source.  A copy of the CDDL is also available via the Internet at
39  * http://www.illumos.org/license/CDDL.
40  *
41  * Copyright 2015 Pluribus Networks Inc.
42  * Copyright 2018 Joyent, Inc.
43  * Copyright 2021 Oxide Computer Company
44  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
45  */
46 
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49 
50 #include <sys/param.h>
51 #include <sys/pcpu.h>
52 #include <sys/systm.h>
53 #include <sys/proc.h>
54 
55 #include <machine/vmparam.h>
56 #include <machine/vmm.h>
57 #include <sys/vmm_kernel.h>
58 #include <sys/vmm_vm.h>
59 
60 #include <sys/vmm_instruction_emul.h>
61 #include <x86/psl.h>
62 #include <x86/specialreg.h>
63 
64 #include "vmm_ioport.h"
65 
/*
 * State flags kept in vie->status.  Every value occupies its own bit, so
 * several may be set at the same time; vie_repeat(), for example, ORs in
 * VIES_REPEAT while testing for VIES_MMIO or VIES_INOUT.
 */
enum vie_status {
	VIES_INIT		= (1U << 0),
	VIES_MMIO		= (1U << 1),
	VIES_INOUT		= (1U << 2),
	VIES_OTHER		= (1U << 3),
	VIES_INST_FETCH		= (1U << 4),
	VIES_INST_DECODE	= (1U << 5),
	VIES_PENDING_MMIO	= (1U << 6),
	VIES_PENDING_INOUT	= (1U << 7),
	VIES_REPEAT		= (1U << 8),
	VIES_USER_FALLBACK	= (1U << 9),
	VIES_COMPLETE		= (1U << 10),
};
79 
/*
 * State of request to perform emulated access (inout or MMIO).
 *
 * vie_repeat() puts the relevant request(s) back to VR_NONE before a
 * rep-prefixed instruction is run again.
 */
enum vie_req {
	VR_NONE,
	VR_PENDING,
	VR_DONE,
};
86 
/*
 * Bookkeeping for one emulated MMIO access.  struct vie embeds two of
 * these: mmio_req_read and mmio_req_write.
 */
struct vie_mmio {
	uint64_t		data;	/* presumably the value read/written */
	uint64_t		gpa;	/* presumably the guest-physical addr */
	uint8_t			bytes;	/* presumably the access width */
	enum vie_req		state;	/* progress of this request */
};
93 
/*
 * Description of one opcode known to the emulator.  The *_opcodes[] tables
 * in this file are arrays of these, indexed by opcode byte.
 */
struct vie_op {
	uint8_t		op_byte;	/* actual opcode byte */
	uint8_t		op_type;	/* type of operation (e.g. MOV) */
	uint16_t	op_flags;	/* VIE_OP_F_* bits */
};
99 
/* Capacity of vie->inst[], in bytes */
#define	VIE_INST_SIZE	15

/*
 * Per-instruction emulation context: the raw instruction bytes, the fields
 * decoded from them (prefixes, ModRM, SIB, displacement, immediate), the
 * matched opcode description, and the state of any MMIO or in/out request
 * issued on behalf of the instruction.
 *
 * The 'reg', 'rm', 'index' and 'base' fields are four bits wide so that they
 * can index the 16-entry register maps (gpr_map[], cr_map[]) directly.
 */
struct vie {
	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
	uint8_t		num_valid;		/* size of the instruction */
	/* reset to 0 by vie_emulate_mov_cr() when it injects an exception */
	uint8_t		num_processed;

	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
	uint8_t		rex_w:1,		/* REX prefix */
			rex_r:1,
			rex_x:1,
			rex_b:1,
			rex_present:1,
			repz_present:1,		/* REP/REPE/REPZ prefix */
			repnz_present:1,	/* REPNE/REPNZ prefix */
			opsize_override:1,	/* Operand size override */
			addrsize_override:1,	/* Address size override */
			segment_override:1;	/* Segment override */

	uint8_t		mod:2,			/* ModRM byte */
			reg:4,
			rm:4;

	uint8_t		ss:2,			/* SIB byte */
			vex_present:1,		/* VEX prefixed */
			vex_l:1,		/* L bit */
			index:4,		/* SIB byte */
			base:4;			/* SIB byte */

	uint8_t		disp_bytes;
	uint8_t		imm_bytes;

	uint8_t		scale;

	uint8_t		vex_reg:4,	/* vvvv: first source reg specifier */
			vex_pp:2,	/* pp */
			_sparebits:2;

	uint8_t		_sparebytes[2];

	int		base_register;		/* VM_REG_GUEST_xyz */
	int		index_register;		/* VM_REG_GUEST_xyz */
	int		segment_register;	/* VM_REG_GUEST_xyz */

	int64_t		displacement;		/* optional addr displacement */
	int64_t		immediate;		/* optional immediate operand */

	struct vie_op	op;			/* opcode description */

	enum vie_status	status;			/* VIES_* bits */

	struct vm_guest_paging paging;		/* guest paging state */

	uint64_t	mmio_gpa;		/* faulting GPA */
	struct vie_mmio	mmio_req_read;
	struct vie_mmio	mmio_req_write;

	struct vm_inout	inout;			/* active in/out op */
	enum vie_req	inout_req_state;
	uint32_t	inout_req_val;		/* value from userspace */
};
160 
161 
/*
 * struct vie_op.op_type
 *
 * VIE_OP_TYPE_NONE is deliberately 0: opcode-table slots that have no
 * explicit initializer are zero-filled and therefore carry this type.
 */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_MOV_CR,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_CLTS,
	VIE_OP_TYPE_MUL,
	VIE_OP_TYPE_LAST	/* presumably a count/sentinel, keep last */
};
188 
/*
 * struct vie_op.op_flags
 *
 * Independent bits that may be ORed together in an opcode-table entry.
 */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)  /* no ModRM byte, going by the name */
#define	VIE_OP_F_NO_GLA_VERIFICATION	(1 << 4)
#define	VIE_OP_F_REG_REG	(1 << 5)  /* special-case for mov-cr */
196 
197 static const struct vie_op three_byte_opcodes_0f38[256] = {
198 	[0xF7] = {
199 		.op_byte = 0xF7,
200 		.op_type = VIE_OP_TYPE_BEXTR,
201 	},
202 };
203 
204 static const struct vie_op two_byte_opcodes[256] = {
205 	[0x06] = {
206 		.op_byte = 0x06,
207 		.op_type = VIE_OP_TYPE_CLTS,
208 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
209 	},
210 	[0x20] = {
211 		.op_byte = 0x20,
212 		.op_type = VIE_OP_TYPE_MOV_CR,
213 		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
214 	},
215 	[0x22] = {
216 		.op_byte = 0x22,
217 		.op_type = VIE_OP_TYPE_MOV_CR,
218 		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
219 	},
220 	[0xAE] = {
221 		.op_byte = 0xAE,
222 		.op_type = VIE_OP_TYPE_TWOB_GRP15,
223 	},
224 	[0xAF] = {
225 		.op_byte = 0xAF,
226 		.op_type = VIE_OP_TYPE_MUL,
227 	},
228 	[0xB6] = {
229 		.op_byte = 0xB6,
230 		.op_type = VIE_OP_TYPE_MOVZX,
231 	},
232 	[0xB7] = {
233 		.op_byte = 0xB7,
234 		.op_type = VIE_OP_TYPE_MOVZX,
235 	},
236 	[0xBA] = {
237 		.op_byte = 0xBA,
238 		.op_type = VIE_OP_TYPE_BITTEST,
239 		.op_flags = VIE_OP_F_IMM8,
240 	},
241 	[0xBE] = {
242 		.op_byte = 0xBE,
243 		.op_type = VIE_OP_TYPE_MOVSX,
244 	},
245 };
246 
247 static const struct vie_op one_byte_opcodes[256] = {
248 	[0x03] = {
249 		.op_byte = 0x03,
250 		.op_type = VIE_OP_TYPE_ADD,
251 	},
252 	[0x0F] = {
253 		.op_byte = 0x0F,
254 		.op_type = VIE_OP_TYPE_TWO_BYTE
255 	},
256 	[0x0B] = {
257 		.op_byte = 0x0B,
258 		.op_type = VIE_OP_TYPE_OR,
259 	},
260 	[0x2B] = {
261 		.op_byte = 0x2B,
262 		.op_type = VIE_OP_TYPE_SUB,
263 	},
264 	[0x39] = {
265 		.op_byte = 0x39,
266 		.op_type = VIE_OP_TYPE_CMP,
267 	},
268 	[0x3B] = {
269 		.op_byte = 0x3B,
270 		.op_type = VIE_OP_TYPE_CMP,
271 	},
272 	[0x88] = {
273 		.op_byte = 0x88,
274 		.op_type = VIE_OP_TYPE_MOV,
275 	},
276 	[0x89] = {
277 		.op_byte = 0x89,
278 		.op_type = VIE_OP_TYPE_MOV,
279 	},
280 	[0x8A] = {
281 		.op_byte = 0x8A,
282 		.op_type = VIE_OP_TYPE_MOV,
283 	},
284 	[0x8B] = {
285 		.op_byte = 0x8B,
286 		.op_type = VIE_OP_TYPE_MOV,
287 	},
288 	[0xA1] = {
289 		.op_byte = 0xA1,
290 		.op_type = VIE_OP_TYPE_MOV,
291 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
292 	},
293 	[0xA3] = {
294 		.op_byte = 0xA3,
295 		.op_type = VIE_OP_TYPE_MOV,
296 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
297 	},
298 	[0xA4] = {
299 		.op_byte = 0xA4,
300 		.op_type = VIE_OP_TYPE_MOVS,
301 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
302 	},
303 	[0xA5] = {
304 		.op_byte = 0xA5,
305 		.op_type = VIE_OP_TYPE_MOVS,
306 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
307 	},
308 	[0xAA] = {
309 		.op_byte = 0xAA,
310 		.op_type = VIE_OP_TYPE_STOS,
311 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
312 	},
313 	[0xAB] = {
314 		.op_byte = 0xAB,
315 		.op_type = VIE_OP_TYPE_STOS,
316 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
317 	},
318 	[0xC6] = {
319 		/* XXX Group 11 extended opcode - not just MOV */
320 		.op_byte = 0xC6,
321 		.op_type = VIE_OP_TYPE_MOV,
322 		.op_flags = VIE_OP_F_IMM8,
323 	},
324 	[0xC7] = {
325 		.op_byte = 0xC7,
326 		.op_type = VIE_OP_TYPE_MOV,
327 		.op_flags = VIE_OP_F_IMM,
328 	},
329 	[0x23] = {
330 		.op_byte = 0x23,
331 		.op_type = VIE_OP_TYPE_AND,
332 	},
333 	[0x80] = {
334 		/* Group 1 extended opcode */
335 		.op_byte = 0x80,
336 		.op_type = VIE_OP_TYPE_GROUP1,
337 		.op_flags = VIE_OP_F_IMM8,
338 	},
339 	[0x81] = {
340 		/* Group 1 extended opcode */
341 		.op_byte = 0x81,
342 		.op_type = VIE_OP_TYPE_GROUP1,
343 		.op_flags = VIE_OP_F_IMM,
344 	},
345 	[0x83] = {
346 		/* Group 1 extended opcode */
347 		.op_byte = 0x83,
348 		.op_type = VIE_OP_TYPE_GROUP1,
349 		.op_flags = VIE_OP_F_IMM8,
350 	},
351 	[0x8F] = {
352 		/* XXX Group 1A extended opcode - not just POP */
353 		.op_byte = 0x8F,
354 		.op_type = VIE_OP_TYPE_POP,
355 	},
356 	[0xF6] = {
357 		/* XXX Group 3 extended opcode - not just TEST */
358 		.op_byte = 0xF6,
359 		.op_type = VIE_OP_TYPE_TEST,
360 		.op_flags = VIE_OP_F_IMM8,
361 	},
362 	[0xF7] = {
363 		/* XXX Group 3 extended opcode - not just TEST */
364 		.op_byte = 0xF7,
365 		.op_type = VIE_OP_TYPE_TEST,
366 		.op_flags = VIE_OP_F_IMM,
367 	},
368 	[0xFF] = {
369 		/* XXX Group 5 extended opcode - not just PUSH */
370 		.op_byte = 0xFF,
371 		.op_type = VIE_OP_TYPE_PUSH,
372 	}
373 };
374 
/* struct vie.mod: values of the two-bit ModRM 'mod' field */
#define	VIE_MOD_INDIRECT		0
#define	VIE_MOD_INDIRECT_DISP8		1
#define	VIE_MOD_INDIRECT_DISP32		2
#define	VIE_MOD_DIRECT			3

/* struct vie.rm: 'r/m' encodings that are given names here */
#define	VIE_RM_SIB			4
#define	VIE_RM_DISP32			5

/* 2^30, as an 'int' constant expression */
#define	GB				(1024 * 1024 * 1024)


/*
 * Paging defines, previously pulled in from machine/pmap.h
 */
#define	PG_V	(1 << 0) /* Present */
#define	PG_RW	(1 << 1) /* Read/Write */
#define	PG_U	(1 << 2) /* User/Supervisor */
#define	PG_A	(1 << 5) /* Accessed */
#define	PG_M	(1 << 6) /* Dirty */
#define	PG_PS	(1 << 7) /* Largepage */

/*
 * Page-fault exception (error code) defines, previously pulled in from
 * machine/pmap.h
 */
#define	PGEX_P		(1 << 0) /* Non-present/Protection */
#define	PGEX_W		(1 << 1) /* Read/Write */
#define	PGEX_U		(1 << 2) /* User/Supervisor */
#define	PGEX_RSV	(1 << 3) /* (Non-)Reserved */
#define	PGEX_I		(1 << 4) /* Instruction */
406 
407 
408 static enum vm_reg_name gpr_map[16] = {
409 	VM_REG_GUEST_RAX,
410 	VM_REG_GUEST_RCX,
411 	VM_REG_GUEST_RDX,
412 	VM_REG_GUEST_RBX,
413 	VM_REG_GUEST_RSP,
414 	VM_REG_GUEST_RBP,
415 	VM_REG_GUEST_RSI,
416 	VM_REG_GUEST_RDI,
417 	VM_REG_GUEST_R8,
418 	VM_REG_GUEST_R9,
419 	VM_REG_GUEST_R10,
420 	VM_REG_GUEST_R11,
421 	VM_REG_GUEST_R12,
422 	VM_REG_GUEST_R13,
423 	VM_REG_GUEST_R14,
424 	VM_REG_GUEST_R15
425 };
426 
/*
 * Printable register names, indexed first by operand size in bytes
 * (1, 2, 4 or 8; the other rows are unused and NULL) and then by register
 * number.  For size 1 the first four entries cover both the low and the
 * legacy high byte register ("a[hl]" etc.).
 *
 * Both the strings and the table itself are read-only.
 */
static const char *const gpr_name_map[][16] = {
	[1] = {
		"a[hl]", "c[hl]", "d[hl]", "b[hl]", "spl", "bpl", "sil", "dil",
		"r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b",
	},
	[2] = {
		"ax", "cx", "dx", "bx", "sp", "bp", "si", "di",
		"r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w",
	},
	[4] = {
		"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
		"r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d",
	},
	[8] = {
		"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
		"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	},
};
445 
446 static enum vm_reg_name cr_map[16] = {
447 	VM_REG_GUEST_CR0,
448 	VM_REG_LAST,
449 	VM_REG_GUEST_CR2,
450 	VM_REG_GUEST_CR3,
451 	VM_REG_GUEST_CR4,
452 	VM_REG_LAST,
453 	VM_REG_LAST,
454 	VM_REG_LAST,
455 	VM_REG_LAST,
456 	VM_REG_LAST,
457 	VM_REG_LAST,
458 	VM_REG_LAST,
459 	VM_REG_LAST,
460 	VM_REG_LAST,
461 	VM_REG_LAST,
462 	VM_REG_LAST
463 };
464 
/*
 * Value mask for an operand of the given size in bytes.  Only indices
 * 1, 2, 4 and 8 are meaningful; the remaining slots are zero.  The table is
 * read-only, hence 'const'.
 */
static const uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};
471 
472 
/*
 * Forward declarations for file-local helpers that the emulation routines
 * below call before their definitions appear.
 */
static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t *rval, int bytes);
static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t wval, int bytes);
static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla);
static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf,
    uint64_t gla);
static uint64_t vie_size2mask(int size);
484 
485 struct vie *
486 vie_alloc()
487 {
488 	return (kmem_zalloc(sizeof (struct vie), KM_SLEEP));
489 }
490 
491 void
492 vie_free(struct vie *vie)
493 {
494 	kmem_free(vie, sizeof (struct vie));
495 }
496 
497 enum vm_reg_name
498 vie_regnum_map(uint8_t regnum)
499 {
500 	VERIFY3U(regnum, <, 16);
501 	return (gpr_map[regnum]);
502 }
503 
504 const char *
505 vie_regnum_name(uint8_t regnum, uint8_t size)
506 {
507 	VERIFY3U(regnum, <, 16);
508 	VERIFY(size == 1 || size == 2 || size == 4 || size == 8);
509 	return (gpr_name_map[size][regnum]);
510 }
511 
512 static void
513 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
514 {
515 	*lhbr = 0;
516 	*reg = gpr_map[vie->reg];
517 
518 	/*
519 	 * 64-bit mode imposes limitations on accessing legacy high byte
520 	 * registers (lhbr).
521 	 *
522 	 * The legacy high-byte registers cannot be addressed if the REX
523 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
524 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
525 	 *
526 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
527 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
528 	 * %ah, %ch, %dh and %bh respectively.
529 	 */
530 	if (!vie->rex_present) {
531 		if (vie->reg & 0x4) {
532 			*lhbr = 1;
533 			*reg = gpr_map[vie->reg & 0x3];
534 		}
535 	}
536 }
537 
538 static int
539 vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
540 {
541 	uint64_t val;
542 	int error, lhbr;
543 	enum vm_reg_name reg;
544 
545 	vie_calc_bytereg(vie, &reg, &lhbr);
546 	error = vm_get_register(vm, vcpuid, reg, &val);
547 
548 	/*
549 	 * To obtain the value of a legacy high byte register shift the
550 	 * base register right by 8 bits (%ah = %rax >> 8).
551 	 */
552 	if (lhbr)
553 		*rval = val >> 8;
554 	else
555 		*rval = val;
556 	return (error);
557 }
558 
559 static int
560 vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
561 {
562 	uint64_t origval, val, mask;
563 	int error, lhbr;
564 	enum vm_reg_name reg;
565 
566 	vie_calc_bytereg(vie, &reg, &lhbr);
567 	error = vm_get_register(vm, vcpuid, reg, &origval);
568 	if (error == 0) {
569 		val = byte;
570 		mask = 0xff;
571 		if (lhbr) {
572 			/*
573 			 * Shift left by 8 to store 'byte' in a legacy high
574 			 * byte register.
575 			 */
576 			val <<= 8;
577 			mask <<= 8;
578 		}
579 		val |= origval & ~mask;
580 		error = vm_set_register(vm, vcpuid, reg, val);
581 	}
582 	return (error);
583 }
584 
585 static int
586 vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
587     uint64_t val, int size)
588 {
589 	int error;
590 	uint64_t origval;
591 
592 	switch (size) {
593 	case 1:
594 	case 2:
595 		error = vm_get_register(vm, vcpuid, reg, &origval);
596 		if (error)
597 			return (error);
598 		val &= size2mask[size];
599 		val |= origval & ~size2mask[size];
600 		break;
601 	case 4:
602 		val &= 0xffffffffUL;
603 		break;
604 	case 8:
605 		break;
606 	default:
607 		return (EINVAL);
608 	}
609 
610 	error = vm_set_register(vm, vcpuid, reg, val);
611 	return (error);
612 }
613 
614 static int
615 vie_repeat(struct vie *vie)
616 {
617 	vie->status |= VIES_REPEAT;
618 
619 	/*
620 	 * Clear out any cached operation values so the repeated instruction can
621 	 * begin without using that stale state.  Other state, such as the
622 	 * decoding results, are kept around as it will not vary between
623 	 * iterations of a rep-prefixed instruction.
624 	 */
625 	if ((vie->status & VIES_MMIO) != 0) {
626 		vie->mmio_req_read.state = VR_NONE;
627 		vie->mmio_req_write.state = VR_NONE;
628 	} else if ((vie->status & VIES_INOUT) != 0) {
629 		vie->inout_req_state = VR_NONE;
630 	} else {
631 		panic("unexpected emulation state");
632 	}
633 
634 	return (EAGAIN);
635 }
636 
/*
 * Mask covering the arithmetic status flags in %rflags: carry, parity,
 * auxiliary carry, zero, sign (PSL_N) and overflow (PSL_V).
 */
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) defines getcc<sz>(), which runs a host 'sub' on copies of its
 * <sz>-bit arguments and then captures the host %rflags via pushfq/popq.
 * The whole flags register is returned; nothing is masked here.
 *
 * The macro body ends in 'struct __hack' so that the ';' written after each
 * GETCC(n) instantiation completes a harmless struct declaration.
 */
/* BEGIN CSTYLED */
#define	GETCC(sz)							\
static ulong_t								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
659 
660 static ulong_t
661 getcc(int opsize, uint64_t x, uint64_t y)
662 {
663 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
664 	    ("getcc: invalid operand size %d", opsize));
665 
666 	if (opsize == 1)
667 		return (getcc8(x, y));
668 	else if (opsize == 2)
669 		return (getcc16(x, y));
670 	else if (opsize == 4)
671 		return (getcc32(x, y));
672 	else
673 		return (getcc64(x, y));
674 }
675 
/*
 * Macro creation of functions getaddflags{8,16,32,64}
 *
 * Each generated function runs a host 'add' on copies of its <sz>-bit
 * arguments and returns the resulting host %rflags (captured with
 * pushfq/popq), i.e. the flags that (x + y) would produce.
 *
 * The trailing 'struct __hack' absorbs the ';' that follows each
 * instantiation.
 */
/* BEGIN CSTYLED */
#define	GETADDFLAGS(sz)							\
static ulong_t								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);
696 
697 static ulong_t
698 getaddflags(int opsize, uint64_t x, uint64_t y)
699 {
700 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
701 	    ("getaddflags: invalid operand size %d", opsize));
702 
703 	if (opsize == 1)
704 		return (getaddflags8(x, y));
705 	else if (opsize == 2)
706 		return (getaddflags16(x, y));
707 	else if (opsize == 4)
708 		return (getaddflags32(x, y));
709 	else
710 		return (getaddflags64(x, y));
711 }
712 
/*
 * Macro creation of functions getimulflags{16,32,64}
 *
 * Each generated function runs a host two-operand 'imul' on copies of its
 * <sz>-bit arguments and returns the resulting host %rflags (captured with
 * pushfq/popq).  No 8-bit variant is instantiated.
 *
 * The trailing 'struct __hack' absorbs the ';' that follows each
 * instantiation.
 */
/* BEGIN CSTYLED */
#define	GETIMULFLAGS(sz)						\
static ulong_t								\
getimulflags##sz(uint##sz##_t x, uint##sz##_t y)			\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("imul %2,%1; pushfq; popq %0" :		\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETIMULFLAGS(16);
GETIMULFLAGS(32);
GETIMULFLAGS(64);
732 
733 static ulong_t
734 getimulflags(int opsize, uint64_t x, uint64_t y)
735 {
736 	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
737 	    ("getimulflags: invalid operand size %d", opsize));
738 
739 	if (opsize == 2)
740 		return (getimulflags16(x, y));
741 	else if (opsize == 4)
742 		return (getimulflags32(x, y));
743 	else
744 		return (getimulflags64(x, y));
745 }
746 
/*
 * Return the status flags that would result from doing (x & y).
 *
 * GETANDFLAGS(sz) defines getandflags<sz>(), which runs a host 'and' on
 * copies of its <sz>-bit arguments and returns the resulting host %rflags
 * (captured with pushfq/popq).
 *
 * The trailing 'struct __hack' absorbs the ';' that follows each
 * instantiation.
 */
/* BEGIN CSTYLED */
#define	GETANDFLAGS(sz)							\
static ulong_t								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);
767 
768 static ulong_t
769 getandflags(int opsize, uint64_t x, uint64_t y)
770 {
771 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
772 	    ("getandflags: invalid operand size %d", opsize));
773 
774 	if (opsize == 1)
775 		return (getandflags8(x, y));
776 	else if (opsize == 2)
777 		return (getandflags16(x, y));
778 	else if (opsize == 4)
779 		return (getandflags32(x, y));
780 	else
781 		return (getandflags64(x, y));
782 }
783 
784 static int
785 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid)
786 {
787 	uint64_t val;
788 	int err;
789 	enum vm_reg_name gpr = gpr_map[vie->rm];
790 	enum vm_reg_name cr = cr_map[vie->reg];
791 
792 	uint_t size = 4;
793 	if (vie->paging.cpu_mode == CPU_MODE_64BIT) {
794 		size = 8;
795 	}
796 
797 	switch (vie->op.op_byte) {
798 	case 0x20:
799 		/*
800 		 * MOV control register (ModRM:reg) to reg (ModRM:r/m)
801 		 * 20/r:	mov r32, CR0-CR7
802 		 * 20/r:	mov r64, CR0-CR7
803 		 * REX.R + 20/0:	mov r64, CR8
804 		 */
805 		if (vie->paging.cpl != 0) {
806 			vm_inject_gp(vm, vcpuid);
807 			vie->num_processed = 0;
808 			return (0);
809 		}
810 		err = vm_get_register(vm, vcpuid, cr, &val);
811 		if (err != 0) {
812 			/* #UD for access to non-existent CRs */
813 			vm_inject_ud(vm, vcpuid);
814 			vie->num_processed = 0;
815 			return (0);
816 		}
817 		err = vie_update_register(vm, vcpuid, gpr, val, size);
818 		break;
819 	case 0x22: {
820 		/*
821 		 * MOV reg (ModRM:r/m) to control register (ModRM:reg)
822 		 * 22/r:	mov CR0-CR7, r32
823 		 * 22/r:	mov CR0-CR7, r64
824 		 * REX.R + 22/0:	mov CR8, r64
825 		 */
826 		uint64_t old, diff;
827 
828 		if (vie->paging.cpl != 0) {
829 			vm_inject_gp(vm, vcpuid);
830 			vie->num_processed = 0;
831 			return (0);
832 		}
833 		err = vm_get_register(vm, vcpuid, cr, &old);
834 		if (err != 0) {
835 			/* #UD for access to non-existent CRs */
836 			vm_inject_ud(vm, vcpuid);
837 			vie->num_processed = 0;
838 			return (0);
839 		}
840 		err = vm_get_register(vm, vcpuid, gpr, &val);
841 		VERIFY0(err);
842 		val &= size2mask[size];
843 		diff = old ^ val;
844 
845 		switch (cr) {
846 		case VM_REG_GUEST_CR0:
847 			if ((diff & CR0_PG) != 0) {
848 				uint64_t efer;
849 
850 				err = vm_get_register(vm, vcpuid,
851 				    VM_REG_GUEST_EFER, &efer);
852 				VERIFY0(err);
853 
854 				/* Keep the long-mode state in EFER in sync */
855 				if ((val & CR0_PG) != 0 &&
856 				    (efer & EFER_LME) != 0) {
857 					efer |= EFER_LMA;
858 				}
859 				if ((val & CR0_PG) == 0 &&
860 				    (efer & EFER_LME) != 0) {
861 					efer &= ~EFER_LMA;
862 				}
863 
864 				err = vm_set_register(vm, vcpuid,
865 				    VM_REG_GUEST_EFER, efer);
866 				VERIFY0(err);
867 			}
868 			/* TODO: enforce more of the #GP checks */
869 			err = vm_set_register(vm, vcpuid, cr, val);
870 			VERIFY0(err);
871 			break;
872 		case VM_REG_GUEST_CR2:
873 		case VM_REG_GUEST_CR3:
874 		case VM_REG_GUEST_CR4:
875 			/* TODO: enforce more of the #GP checks */
876 			err = vm_set_register(vm, vcpuid, cr, val);
877 			break;
878 		default:
879 			/* The cr_map mapping should prevent this */
880 			panic("invalid cr %d", cr);
881 		}
882 		break;
883 	}
884 	default:
885 		return (EINVAL);
886 	}
887 	return (err);
888 }
889 
890 static int
891 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
892 {
893 	int error, size;
894 	enum vm_reg_name reg;
895 	uint8_t byte;
896 	uint64_t val;
897 
898 	size = vie->opsize;
899 	error = EINVAL;
900 
901 	switch (vie->op.op_byte) {
902 	case 0x88:
903 		/*
904 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
905 		 * 88/r:	mov r/m8, r8
906 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
907 		 */
908 		size = 1;	/* override for byte operation */
909 		error = vie_read_bytereg(vie, vm, vcpuid, &byte);
910 		if (error == 0) {
911 			error = vie_mmio_write(vie, vm, vcpuid, gpa, byte,
912 			    size);
913 		}
914 		break;
915 	case 0x89:
916 		/*
917 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
918 		 * 89/r:	mov r/m16, r16
919 		 * 89/r:	mov r/m32, r32
920 		 * REX.W + 89/r	mov r/m64, r64
921 		 */
922 		reg = gpr_map[vie->reg];
923 		error = vm_get_register(vm, vcpuid, reg, &val);
924 		if (error == 0) {
925 			val &= size2mask[size];
926 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
927 		}
928 		break;
929 	case 0x8A:
930 		/*
931 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
932 		 * 8A/r:	mov r8, r/m8
933 		 * REX + 8A/r:	mov r8, r/m8
934 		 */
935 		size = 1;	/* override for byte operation */
936 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
937 		if (error == 0)
938 			error = vie_write_bytereg(vie, vm, vcpuid, val);
939 		break;
940 	case 0x8B:
941 		/*
942 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
943 		 * 8B/r:	mov r16, r/m16
944 		 * 8B/r:	mov r32, r/m32
945 		 * REX.W 8B/r:	mov r64, r/m64
946 		 */
947 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
948 		if (error == 0) {
949 			reg = gpr_map[vie->reg];
950 			error = vie_update_register(vm, vcpuid, reg, val, size);
951 		}
952 		break;
953 	case 0xA1:
954 		/*
955 		 * MOV from seg:moffset to AX/EAX/RAX
956 		 * A1:		mov AX, moffs16
957 		 * A1:		mov EAX, moffs32
958 		 * REX.W + A1:	mov RAX, moffs64
959 		 */
960 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
961 		if (error == 0) {
962 			reg = VM_REG_GUEST_RAX;
963 			error = vie_update_register(vm, vcpuid, reg, val, size);
964 		}
965 		break;
966 	case 0xA3:
967 		/*
968 		 * MOV from AX/EAX/RAX to seg:moffset
969 		 * A3:		mov moffs16, AX
970 		 * A3:		mov moffs32, EAX
971 		 * REX.W + A3:	mov moffs64, RAX
972 		 */
973 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
974 		if (error == 0) {
975 			val &= size2mask[size];
976 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
977 		}
978 		break;
979 	case 0xC6:
980 		/*
981 		 * MOV from imm8 to mem (ModRM:r/m)
982 		 * C6/0		mov r/m8, imm8
983 		 * REX + C6/0	mov r/m8, imm8
984 		 */
985 		size = 1;	/* override for byte operation */
986 		val = vie->immediate;
987 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
988 		break;
989 	case 0xC7:
990 		/*
991 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
992 		 * C7/0		mov r/m16, imm16
993 		 * C7/0		mov r/m32, imm32
994 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
995 		 */
996 		val = vie->immediate & size2mask[size];
997 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
998 		break;
999 	default:
1000 		break;
1001 	}
1002 
1003 	return (error);
1004 }
1005 
1006 static int
1007 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1008 {
1009 	int error, size;
1010 	enum vm_reg_name reg;
1011 	uint64_t val;
1012 
1013 	size = vie->opsize;
1014 	error = EINVAL;
1015 
1016 	switch (vie->op.op_byte) {
1017 	case 0xB6:
1018 		/*
1019 		 * MOV and zero extend byte from mem (ModRM:r/m) to
1020 		 * reg (ModRM:reg).
1021 		 *
1022 		 * 0F B6/r		movzx r16, r/m8
1023 		 * 0F B6/r		movzx r32, r/m8
1024 		 * REX.W + 0F B6/r	movzx r64, r/m8
1025 		 */
1026 
1027 		/* get the first operand */
1028 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1029 		if (error)
1030 			break;
1031 
1032 		/* get the second operand */
1033 		reg = gpr_map[vie->reg];
1034 
1035 		/* zero-extend byte */
1036 		val = (uint8_t)val;
1037 
1038 		/* write the result */
1039 		error = vie_update_register(vm, vcpuid, reg, val, size);
1040 		break;
1041 	case 0xB7:
1042 		/*
1043 		 * MOV and zero extend word from mem (ModRM:r/m) to
1044 		 * reg (ModRM:reg).
1045 		 *
1046 		 * 0F B7/r		movzx r32, r/m16
1047 		 * REX.W + 0F B7/r	movzx r64, r/m16
1048 		 */
1049 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2);
1050 		if (error)
1051 			return (error);
1052 
1053 		reg = gpr_map[vie->reg];
1054 
1055 		/* zero-extend word */
1056 		val = (uint16_t)val;
1057 
1058 		error = vie_update_register(vm, vcpuid, reg, val, size);
1059 		break;
1060 	case 0xBE:
1061 		/*
1062 		 * MOV and sign extend byte from mem (ModRM:r/m) to
1063 		 * reg (ModRM:reg).
1064 		 *
1065 		 * 0F BE/r		movsx r16, r/m8
1066 		 * 0F BE/r		movsx r32, r/m8
1067 		 * REX.W + 0F BE/r	movsx r64, r/m8
1068 		 */
1069 
1070 		/* get the first operand */
1071 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1072 		if (error)
1073 			break;
1074 
1075 		/* get the second operand */
1076 		reg = gpr_map[vie->reg];
1077 
1078 		/* sign extend byte */
1079 		val = (int8_t)val;
1080 
1081 		/* write the result */
1082 		error = vie_update_register(vm, vcpuid, reg, val, size);
1083 		break;
1084 	default:
1085 		break;
1086 	}
1087 	return (error);
1088 }
1089 
1090 /*
1091  * Helper function to calculate and validate a linear address.
1092  */
1093 static int
1094 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize,
1095     int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr,
1096     uint64_t *gla)
1097 {
1098 	struct seg_desc desc;
1099 	uint64_t cr0, val, rflags;
1100 	int error;
1101 	struct vm_guest_paging *paging;
1102 
1103 	paging = &vie->paging;
1104 
1105 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1106 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1107 
1108 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1109 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1110 
1111 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
1112 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
1113 	    __func__, error, seg));
1114 
1115 	error = vm_get_register(vm, vcpuid, gpr, &val);
1116 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
1117 	    error, gpr));
1118 
1119 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
1120 	    addrsize, prot, gla)) {
1121 		if (seg == VM_REG_GUEST_SS)
1122 			vm_inject_ss(vm, vcpuid, 0);
1123 		else
1124 			vm_inject_gp(vm, vcpuid);
1125 		return (-1);
1126 	}
1127 
1128 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
1129 		if (seg == VM_REG_GUEST_SS)
1130 			vm_inject_ss(vm, vcpuid, 0);
1131 		else
1132 			vm_inject_gp(vm, vcpuid);
1133 		return (-1);
1134 	}
1135 
1136 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
1137 		vm_inject_ac(vm, vcpuid, 0);
1138 		return (-1);
1139 	}
1140 
1141 	return (0);
1142 }
1143 
1144 static int
1145 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1146 {
1147 	struct vm_copyinfo copyinfo[2];
1148 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
1149 	uint64_t rcx, rdi, rsi, rflags;
1150 	int error, fault, opsize, seg, repeat;
1151 	struct vm_guest_paging *paging;
1152 
1153 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
1154 	val = 0;
1155 	error = 0;
1156 	paging = &vie->paging;
1157 
1158 	/*
1159 	 * XXX although the MOVS instruction is only supposed to be used with
1160 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
1161 	 *
1162 	 * Empirically the "repnz" prefix has identical behavior to "rep"
1163 	 * and the zero flag does not make a difference.
1164 	 */
1165 	repeat = vie->repz_present | vie->repnz_present;
1166 
1167 	if (repeat) {
1168 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1169 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1170 
1171 		/*
1172 		 * The count register is %rcx, %ecx or %cx depending on the
1173 		 * address size of the instruction.
1174 		 */
1175 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
1176 			error = 0;
1177 			goto done;
1178 		}
1179 	}
1180 
1181 	/*
1182 	 *	Source		Destination	Comments
1183 	 *	--------------------------------------------
1184 	 * (1)  memory		memory		n/a
1185 	 * (2)  memory		mmio		emulated
1186 	 * (3)  mmio		memory		emulated
1187 	 * (4)  mmio		mmio		emulated
1188 	 *
1189 	 * At this point we don't have sufficient information to distinguish
1190 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
1191 	 * out because it will succeed only when operating on regular memory.
1192 	 *
1193 	 * XXX the emulation doesn't properly handle the case where 'gpa'
1194 	 * is straddling the boundary between the normal memory and MMIO.
1195 	 */
1196 
1197 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
1198 	if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg,
1199 	    VM_REG_GUEST_RSI, &srcaddr) != 0) {
1200 		goto done;
1201 	}
1202 
1203 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
1204 	    copyinfo, nitems(copyinfo), &fault);
1205 	if (error == 0) {
1206 		if (fault)
1207 			goto done;	/* Resume guest to handle fault */
1208 
1209 		/*
1210 		 * case (2): read from system memory and write to mmio.
1211 		 */
1212 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
1213 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1214 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1215 		if (error)
1216 			goto done;
1217 	} else {
1218 		/*
1219 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
1220 		 * if 'srcaddr' is in the mmio space.
1221 		 */
1222 
1223 		if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize,
1224 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI,
1225 		    &dstaddr) != 0) {
1226 			goto done;
1227 		}
1228 
1229 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
1230 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
1231 		if (error == 0) {
1232 			if (fault)
1233 				goto done;    /* Resume guest to handle fault */
1234 
1235 			/*
1236 			 * case (3): read from MMIO and write to system memory.
1237 			 *
1238 			 * A MMIO read can have side-effects so we
1239 			 * commit to it only after vm_copy_setup() is
1240 			 * successful. If a page-fault needs to be
1241 			 * injected into the guest then it will happen
1242 			 * before the MMIO read is attempted.
1243 			 */
1244 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &val,
1245 			    opsize);
1246 
1247 			if (error == 0) {
1248 				vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
1249 			}
1250 			/*
1251 			 * Regardless of whether the MMIO read was successful or
1252 			 * not, the copy resources must be cleaned up.
1253 			 */
1254 			vm_copy_teardown(vm, vcpuid, copyinfo,
1255 			    nitems(copyinfo));
1256 			if (error != 0) {
1257 				goto done;
1258 			}
1259 		} else {
1260 			/*
1261 			 * Case (4): read from and write to mmio.
1262 			 *
1263 			 * Commit to the MMIO read/write (with potential
1264 			 * side-effects) only after we are sure that the
1265 			 * instruction is not going to be restarted due
1266 			 * to address translation faults.
1267 			 */
1268 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
1269 			    PROT_READ, &srcgpa, &fault);
1270 			if (error || fault)
1271 				goto done;
1272 
1273 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
1274 			    PROT_WRITE, &dstgpa, &fault);
1275 			if (error || fault)
1276 				goto done;
1277 
1278 			error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val,
1279 			    opsize);
1280 			if (error)
1281 				goto done;
1282 
1283 			error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val,
1284 			    opsize);
1285 			if (error)
1286 				goto done;
1287 		}
1288 	}
1289 
1290 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
1291 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
1292 
1293 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1294 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1295 
1296 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1297 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1298 
1299 	if (rflags & PSL_D) {
1300 		rsi -= opsize;
1301 		rdi -= opsize;
1302 	} else {
1303 		rsi += opsize;
1304 		rdi += opsize;
1305 	}
1306 
1307 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
1308 	    vie->addrsize);
1309 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
1310 
1311 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1312 	    vie->addrsize);
1313 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1314 
1315 	if (repeat) {
1316 		rcx = rcx - 1;
1317 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1318 		    rcx, vie->addrsize);
1319 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1320 
1321 		/*
1322 		 * Repeat the instruction if the count register is not zero.
1323 		 */
1324 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1325 			return (vie_repeat(vie));
1326 	}
1327 done:
1328 	return (error);
1329 }
1330 
1331 static int
1332 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1333 {
1334 	int error, opsize, repeat;
1335 	uint64_t val;
1336 	uint64_t rcx, rdi, rflags;
1337 
1338 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
1339 	repeat = vie->repz_present | vie->repnz_present;
1340 
1341 	if (repeat) {
1342 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1343 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1344 
1345 		/*
1346 		 * The count register is %rcx, %ecx or %cx depending on the
1347 		 * address size of the instruction.
1348 		 */
1349 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
1350 			return (0);
1351 	}
1352 
1353 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
1354 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
1355 
1356 	error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1357 	if (error)
1358 		return (error);
1359 
1360 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1361 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1362 
1363 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1364 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1365 
1366 	if (rflags & PSL_D)
1367 		rdi -= opsize;
1368 	else
1369 		rdi += opsize;
1370 
1371 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1372 	    vie->addrsize);
1373 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1374 
1375 	if (repeat) {
1376 		rcx = rcx - 1;
1377 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1378 		    rcx, vie->addrsize);
1379 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1380 
1381 		/*
1382 		 * Repeat the instruction if the count register is not zero.
1383 		 */
1384 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1385 			return (vie_repeat(vie));
1386 	}
1387 
1388 	return (0);
1389 }
1390 
1391 static int
1392 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1393 {
1394 	int error, size;
1395 	enum vm_reg_name reg;
1396 	uint64_t result, rflags, rflags2, val1, val2;
1397 
1398 	size = vie->opsize;
1399 	error = EINVAL;
1400 
1401 	switch (vie->op.op_byte) {
1402 	case 0x23:
1403 		/*
1404 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1405 		 * result in reg.
1406 		 *
1407 		 * 23/r		and r16, r/m16
1408 		 * 23/r		and r32, r/m32
1409 		 * REX.W + 23/r	and r64, r/m64
1410 		 */
1411 
1412 		/* get the first operand */
1413 		reg = gpr_map[vie->reg];
1414 		error = vm_get_register(vm, vcpuid, reg, &val1);
1415 		if (error)
1416 			break;
1417 
1418 		/* get the second operand */
1419 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1420 		if (error)
1421 			break;
1422 
1423 		/* perform the operation and write the result */
1424 		result = val1 & val2;
1425 		error = vie_update_register(vm, vcpuid, reg, result, size);
1426 		break;
1427 	case 0x81:
1428 	case 0x83:
1429 		/*
1430 		 * AND mem (ModRM:r/m) with immediate and store the
1431 		 * result in mem.
1432 		 *
1433 		 * 81 /4		and r/m16, imm16
1434 		 * 81 /4		and r/m32, imm32
1435 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1436 		 *
1437 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1438 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1439 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1440 		 */
1441 
1442 		/* get the first operand */
1443 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1444 		if (error)
1445 			break;
1446 
1447 		/*
1448 		 * perform the operation with the pre-fetched immediate
1449 		 * operand and write the result
1450 		 */
1451 		result = val1 & vie->immediate;
1452 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1453 		break;
1454 	default:
1455 		break;
1456 	}
1457 	if (error)
1458 		return (error);
1459 
1460 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1461 	if (error)
1462 		return (error);
1463 
1464 	/*
1465 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1466 	 * to the result; AF is undefined.
1467 	 *
1468 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1469 	 */
1470 	rflags2 = getcc(size, result, 0);
1471 	rflags &= ~RFLAGS_STATUS_BITS;
1472 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1473 
1474 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1475 	return (error);
1476 }
1477 
1478 static int
1479 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1480 {
1481 	int error, size;
1482 	enum vm_reg_name reg;
1483 	uint64_t result, rflags, rflags2, val1, val2;
1484 
1485 	size = vie->opsize;
1486 	error = EINVAL;
1487 
1488 	switch (vie->op.op_byte) {
1489 	case 0x0B:
1490 		/*
1491 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1492 		 * result in reg.
1493 		 *
1494 		 * 0b/r		or r16, r/m16
1495 		 * 0b/r		or r32, r/m32
1496 		 * REX.W + 0b/r	or r64, r/m64
1497 		 */
1498 
1499 		/* get the first operand */
1500 		reg = gpr_map[vie->reg];
1501 		error = vm_get_register(vm, vcpuid, reg, &val1);
1502 		if (error)
1503 			break;
1504 
1505 		/* get the second operand */
1506 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1507 		if (error)
1508 			break;
1509 
1510 		/* perform the operation and write the result */
1511 		result = val1 | val2;
1512 		error = vie_update_register(vm, vcpuid, reg, result, size);
1513 		break;
1514 	case 0x81:
1515 	case 0x83:
1516 		/*
1517 		 * OR mem (ModRM:r/m) with immediate and store the
1518 		 * result in mem.
1519 		 *
1520 		 * 81 /1		or r/m16, imm16
1521 		 * 81 /1		or r/m32, imm32
1522 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1523 		 *
1524 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1525 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1526 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1527 		 */
1528 
1529 		/* get the first operand */
1530 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1531 		if (error)
1532 			break;
1533 
1534 		/*
1535 		 * perform the operation with the pre-fetched immediate
1536 		 * operand and write the result
1537 		 */
1538 		result = val1 | vie->immediate;
1539 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1540 		break;
1541 	default:
1542 		break;
1543 	}
1544 	if (error)
1545 		return (error);
1546 
1547 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1548 	if (error)
1549 		return (error);
1550 
1551 	/*
1552 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1553 	 * to the result; AF is undefined.
1554 	 *
1555 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1556 	 */
1557 	rflags2 = getcc(size, result, 0);
1558 	rflags &= ~RFLAGS_STATUS_BITS;
1559 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1560 
1561 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1562 	return (error);
1563 }
1564 
1565 static int
1566 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1567 {
1568 	int error, size;
1569 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1570 	enum vm_reg_name reg;
1571 
1572 	size = vie->opsize;
1573 	switch (vie->op.op_byte) {
1574 	case 0x39:
1575 	case 0x3B:
1576 		/*
1577 		 * 39/r		CMP r/m16, r16
1578 		 * 39/r		CMP r/m32, r32
1579 		 * REX.W 39/r	CMP r/m64, r64
1580 		 *
1581 		 * 3B/r		CMP r16, r/m16
1582 		 * 3B/r		CMP r32, r/m32
1583 		 * REX.W + 3B/r	CMP r64, r/m64
1584 		 *
1585 		 * Compare the first operand with the second operand and
1586 		 * set status flags in EFLAGS register. The comparison is
1587 		 * performed by subtracting the second operand from the first
1588 		 * operand and then setting the status flags.
1589 		 */
1590 
1591 		/* Get the register operand */
1592 		reg = gpr_map[vie->reg];
1593 		error = vm_get_register(vm, vcpuid, reg, &regop);
1594 		if (error)
1595 			return (error);
1596 
1597 		/* Get the memory operand */
1598 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
1599 		if (error)
1600 			return (error);
1601 
1602 		if (vie->op.op_byte == 0x3B) {
1603 			op1 = regop;
1604 			op2 = memop;
1605 		} else {
1606 			op1 = memop;
1607 			op2 = regop;
1608 		}
1609 		rflags2 = getcc(size, op1, op2);
1610 		break;
1611 	case 0x80:
1612 	case 0x81:
1613 	case 0x83:
1614 		/*
1615 		 * 80 /7		cmp r/m8, imm8
1616 		 * REX + 80 /7		cmp r/m8, imm8
1617 		 *
1618 		 * 81 /7		cmp r/m16, imm16
1619 		 * 81 /7		cmp r/m32, imm32
1620 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1621 		 *
1622 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1623 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1624 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1625 		 *
1626 		 * Compare mem (ModRM:r/m) with immediate and set
1627 		 * status flags according to the results.  The
1628 		 * comparison is performed by subtracting the
1629 		 * immediate from the first operand and then setting
1630 		 * the status flags.
1631 		 *
1632 		 */
1633 		if (vie->op.op_byte == 0x80)
1634 			size = 1;
1635 
1636 		/* get the first operand */
1637 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1638 		if (error)
1639 			return (error);
1640 
1641 		rflags2 = getcc(size, op1, vie->immediate);
1642 		break;
1643 	default:
1644 		return (EINVAL);
1645 	}
1646 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1647 	if (error)
1648 		return (error);
1649 	rflags &= ~RFLAGS_STATUS_BITS;
1650 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1651 
1652 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1653 	return (error);
1654 }
1655 
1656 static int
1657 vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1658 {
1659 	int error, size;
1660 	uint64_t op1, rflags, rflags2;
1661 
1662 	size = vie->opsize;
1663 	error = EINVAL;
1664 
1665 	switch (vie->op.op_byte) {
1666 	case 0xF6:
1667 		/*
1668 		 * F6 /0		test r/m8, imm8
1669 		 *
1670 		 * Test mem (ModRM:r/m) with immediate and set status
1671 		 * flags according to the results.  The comparison is
1672 		 * performed by anding the immediate from the first
1673 		 * operand and then setting the status flags.
1674 		 */
1675 		if ((vie->reg & 7) != 0)
1676 			return (EINVAL);
1677 
1678 		size = 1;	/* override for byte operation */
1679 
1680 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1681 		if (error)
1682 			return (error);
1683 
1684 		rflags2 = getandflags(size, op1, vie->immediate);
1685 		break;
1686 	case 0xF7:
1687 		/*
1688 		 * F7 /0		test r/m16, imm16
1689 		 * F7 /0		test r/m32, imm32
1690 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1691 		 *
1692 		 * Test mem (ModRM:r/m) with immediate and set status
1693 		 * flags according to the results.  The comparison is
1694 		 * performed by anding the immediate from the first
1695 		 * operand and then setting the status flags.
1696 		 */
1697 		if ((vie->reg & 7) != 0)
1698 			return (EINVAL);
1699 
1700 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1701 		if (error)
1702 			return (error);
1703 
1704 		rflags2 = getandflags(size, op1, vie->immediate);
1705 		break;
1706 	default:
1707 		return (EINVAL);
1708 	}
1709 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1710 	if (error)
1711 		return (error);
1712 
1713 	/*
1714 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1715 	 * to the result; AF is undefined.
1716 	 */
1717 	rflags &= ~RFLAGS_STATUS_BITS;
1718 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1719 
1720 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1721 	return (error);
1722 }
1723 
1724 static int
1725 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1726 {
1727 	uint64_t src1, src2, dst, rflags;
1728 	unsigned start, len, size;
1729 	int error;
1730 	struct vm_guest_paging *paging;
1731 
1732 	size = vie->opsize;
1733 	error = EINVAL;
1734 	paging = &vie->paging;
1735 
1736 	/*
1737 	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
1738 	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
1739 	 *
1740 	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
1741 	 * Vex.vvvv.
1742 	 *
1743 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1744 	 */
1745 	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1746 		size = 4;
1747 
1748 	/*
1749 	 * Extracts contiguous bits from the first /source/ operand (second
1750 	 * operand) using an index and length specified in the second /source/
1751 	 * operand (third operand).
1752 	 */
1753 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size);
1754 	if (error)
1755 		return (error);
1756 	error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
1757 	if (error)
1758 		return (error);
1759 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1760 	if (error)
1761 		return (error);
1762 
1763 	start = (src2 & 0xff);
1764 	len = (src2 & 0xff00) >> 8;
1765 
1766 	/* If no bits are extracted, the destination register is cleared. */
1767 	dst = 0;
1768 
1769 	/* If START exceeds the operand size, no bits are extracted. */
1770 	if (start > size * 8)
1771 		goto done;
1772 	/* Length is bounded by both the destination size and start offset. */
1773 	if (start + len > size * 8)
1774 		len = (size * 8) - start;
1775 	if (len == 0)
1776 		goto done;
1777 
1778 	if (start > 0)
1779 		src1 = (src1 >> start);
1780 	if (len < 64)
1781 		src1 = src1 & ((1ull << len) - 1);
1782 	dst = src1;
1783 
1784 done:
1785 	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
1786 	if (error)
1787 		return (error);
1788 
1789 	/*
1790 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1791 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1792 	 */
1793 	rflags &= ~RFLAGS_STATUS_BITS;
1794 	if (dst == 0)
1795 		rflags |= PSL_Z;
1796 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
1797 	    8);
1798 	return (error);
1799 }
1800 
1801 static int
1802 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1803 {
1804 	int error, size;
1805 	uint64_t nval, rflags, rflags2, val1, val2;
1806 	enum vm_reg_name reg;
1807 
1808 	size = vie->opsize;
1809 	error = EINVAL;
1810 
1811 	switch (vie->op.op_byte) {
1812 	case 0x03:
1813 		/*
1814 		 * ADD r/m to r and store the result in r
1815 		 *
1816 		 * 03/r			ADD r16, r/m16
1817 		 * 03/r			ADD r32, r/m32
1818 		 * REX.W + 03/r		ADD r64, r/m64
1819 		 */
1820 
1821 		/* get the first operand */
1822 		reg = gpr_map[vie->reg];
1823 		error = vm_get_register(vm, vcpuid, reg, &val1);
1824 		if (error)
1825 			break;
1826 
1827 		/* get the second operand */
1828 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1829 		if (error)
1830 			break;
1831 
1832 		/* perform the operation and write the result */
1833 		nval = val1 + val2;
1834 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1835 		break;
1836 	default:
1837 		break;
1838 	}
1839 
1840 	if (!error) {
1841 		rflags2 = getaddflags(size, val1, val2);
1842 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1843 		    &rflags);
1844 		if (error)
1845 			return (error);
1846 
1847 		rflags &= ~RFLAGS_STATUS_BITS;
1848 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1849 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1850 		    rflags, 8);
1851 	}
1852 
1853 	return (error);
1854 }
1855 
1856 static int
1857 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1858 {
1859 	int error, size;
1860 	uint64_t nval, rflags, rflags2, val1, val2;
1861 	enum vm_reg_name reg;
1862 
1863 	size = vie->opsize;
1864 	error = EINVAL;
1865 
1866 	switch (vie->op.op_byte) {
1867 	case 0x2B:
1868 		/*
1869 		 * SUB r/m from r and store the result in r
1870 		 *
1871 		 * 2B/r		SUB r16, r/m16
1872 		 * 2B/r		SUB r32, r/m32
1873 		 * REX.W + 2B/r	SUB r64, r/m64
1874 		 */
1875 
1876 		/* get the first operand */
1877 		reg = gpr_map[vie->reg];
1878 		error = vm_get_register(vm, vcpuid, reg, &val1);
1879 		if (error)
1880 			break;
1881 
1882 		/* get the second operand */
1883 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1884 		if (error)
1885 			break;
1886 
1887 		/* perform the operation and write the result */
1888 		nval = val1 - val2;
1889 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1890 		break;
1891 	default:
1892 		break;
1893 	}
1894 
1895 	if (!error) {
1896 		rflags2 = getcc(size, val1, val2);
1897 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1898 		    &rflags);
1899 		if (error)
1900 			return (error);
1901 
1902 		rflags &= ~RFLAGS_STATUS_BITS;
1903 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1904 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1905 		    rflags, 8);
1906 	}
1907 
1908 	return (error);
1909 }
1910 
1911 static int
1912 vie_emulate_mul(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1913 {
1914 	int error, size;
1915 	uint64_t rflags, rflags2, val1, val2;
1916 	__int128_t nval;
1917 	enum vm_reg_name reg;
1918 	ulong_t (*getflags)(int, uint64_t, uint64_t) = NULL;
1919 
1920 	size = vie->opsize;
1921 	error = EINVAL;
1922 
1923 	switch (vie->op.op_byte) {
1924 	case 0xAF:
1925 		/*
1926 		 * Multiply the contents of a destination register by
1927 		 * the contents of a register or memory operand and
1928 		 * put the signed result in the destination register.
1929 		 *
1930 		 * AF/r		IMUL r16, r/m16
1931 		 * AF/r		IMUL r32, r/m32
1932 		 * REX.W + AF/r	IMUL r64, r/m64
1933 		 */
1934 
1935 		getflags = getimulflags;
1936 
1937 		/* get the first operand */
1938 		reg = gpr_map[vie->reg];
1939 		error = vm_get_register(vm, vcpuid, reg, &val1);
1940 		if (error != 0)
1941 			break;
1942 
1943 		/* get the second operand */
1944 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1945 		if (error != 0)
1946 			break;
1947 
1948 		/* perform the operation and write the result */
1949 		nval = (int64_t)val1 * (int64_t)val2;
1950 
1951 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1952 
1953 		DTRACE_PROBE4(vie__imul,
1954 		    const char *, vie_regnum_name(vie->reg, size),
1955 		    uint64_t, val1, uint64_t, val2, __uint128_t, nval);
1956 
1957 		break;
1958 	default:
1959 		break;
1960 	}
1961 
1962 	if (error == 0) {
1963 		rflags2 = getflags(size, val1, val2);
1964 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1965 		    &rflags);
1966 		if (error)
1967 			return (error);
1968 
1969 		rflags &= ~RFLAGS_STATUS_BITS;
1970 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1971 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1972 		    rflags, 8);
1973 
1974 		DTRACE_PROBE2(vie__imul__rflags,
1975 		    uint64_t, rflags, uint64_t, rflags2);
1976 	}
1977 
1978 	return (error);
1979 }
1980 
1981 static int
1982 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1983 {
1984 	struct vm_copyinfo copyinfo[2];
1985 	struct seg_desc ss_desc;
1986 	uint64_t cr0, rflags, rsp, stack_gla, val;
1987 	int error, fault, size, stackaddrsize, pushop;
1988 	struct vm_guest_paging *paging;
1989 
1990 	val = 0;
1991 	size = vie->opsize;
1992 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1993 	paging = &vie->paging;
1994 
1995 	/*
1996 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1997 	 */
1998 	if (paging->cpu_mode == CPU_MODE_REAL) {
1999 		stackaddrsize = 2;
2000 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
2001 		/*
2002 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
2003 		 * - Stack pointer size is always 64-bits.
2004 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
2005 		 * - 16-bit PUSH/POP is supported by using the operand size
2006 		 *   override prefix (66H).
2007 		 */
2008 		stackaddrsize = 8;
2009 		size = vie->opsize_override ? 2 : 8;
2010 	} else {
2011 		/*
2012 		 * In protected or compatibility mode the 'B' flag in the
2013 		 * stack-segment descriptor determines the size of the
2014 		 * stack pointer.
2015 		 */
2016 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
2017 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
2018 		    __func__, error));
2019 		if (SEG_DESC_DEF32(ss_desc.access))
2020 			stackaddrsize = 4;
2021 		else
2022 			stackaddrsize = 2;
2023 	}
2024 
2025 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
2026 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
2027 
2028 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2029 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2030 
2031 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
2032 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
2033 	if (pushop) {
2034 		rsp -= size;
2035 	}
2036 
2037 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
2038 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
2039 	    &stack_gla)) {
2040 		vm_inject_ss(vm, vcpuid, 0);
2041 		return (0);
2042 	}
2043 
2044 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
2045 		vm_inject_ss(vm, vcpuid, 0);
2046 		return (0);
2047 	}
2048 
2049 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
2050 		vm_inject_ac(vm, vcpuid, 0);
2051 		return (0);
2052 	}
2053 
2054 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
2055 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
2056 	    &fault);
2057 	if (error || fault)
2058 		return (error);
2059 
2060 	if (pushop) {
2061 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
2062 		if (error == 0)
2063 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
2064 	} else {
2065 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
2066 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
2067 		rsp += size;
2068 	}
2069 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2070 
2071 	if (error == 0) {
2072 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
2073 		    stackaddrsize);
2074 		KASSERT(error == 0, ("error %d updating rsp", error));
2075 	}
2076 	return (error);
2077 }
2078 
2079 static int
2080 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2081 {
2082 	int error;
2083 
2084 	/*
2085 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
2086 	 *
2087 	 * PUSH is part of the group 5 extended opcodes and is identified
2088 	 * by ModRM:reg = b110.
2089 	 */
2090 	if ((vie->reg & 7) != 6)
2091 		return (EINVAL);
2092 
2093 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
2094 	return (error);
2095 }
2096 
2097 static int
2098 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2099 {
2100 	int error;
2101 
2102 	/*
2103 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
2104 	 *
2105 	 * POP is part of the group 1A extended opcodes and is identified
2106 	 * by ModRM:reg = b000.
2107 	 */
2108 	if ((vie->reg & 7) != 0)
2109 		return (EINVAL);
2110 
2111 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
2112 	return (error);
2113 }
2114 
2115 static int
2116 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2117 {
2118 	int error;
2119 
2120 	switch (vie->reg & 7) {
2121 	case 0x1:	/* OR */
2122 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
2123 		break;
2124 	case 0x4:	/* AND */
2125 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
2126 		break;
2127 	case 0x7:	/* CMP */
2128 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2129 		break;
2130 	default:
2131 		error = EINVAL;
2132 		break;
2133 	}
2134 
2135 	return (error);
2136 }
2137 
2138 static int
2139 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2140 {
2141 	uint64_t val, rflags;
2142 	int error, bitmask, bitoff;
2143 
2144 	/*
2145 	 * 0F BA is a Group 8 extended opcode.
2146 	 *
2147 	 * Currently we only emulate the 'Bit Test' instruction which is
2148 	 * identified by a ModR/M:reg encoding of 100b.
2149 	 */
2150 	if ((vie->reg & 7) != 4)
2151 		return (EINVAL);
2152 
2153 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2154 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2155 
2156 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize);
2157 	if (error)
2158 		return (error);
2159 
2160 	/*
2161 	 * Intel SDM, Vol 2, Table 3-2:
2162 	 * "Range of Bit Positions Specified by Bit Offset Operands"
2163 	 */
2164 	bitmask = vie->opsize * 8 - 1;
2165 	bitoff = vie->immediate & bitmask;
2166 
2167 	/* Copy the bit into the Carry flag in %rflags */
2168 	if (val & (1UL << bitoff))
2169 		rflags |= PSL_C;
2170 	else
2171 		rflags &= ~PSL_C;
2172 
2173 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
2174 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
2175 
2176 	return (0);
2177 }
2178 
2179 static int
2180 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid,
2181     uint64_t gpa)
2182 {
2183 	int error;
2184 	uint64_t buf;
2185 
2186 	switch (vie->reg & 7) {
2187 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
2188 		if (vie->mod == 0x3) {
2189 			/*
2190 			 * SFENCE.  Ignore it, VM exit provides enough
2191 			 * barriers on its own.
2192 			 */
2193 			error = 0;
2194 		} else {
2195 			/*
2196 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
2197 			 * rights.
2198 			 */
2199 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1);
2200 		}
2201 		break;
2202 	default:
2203 		error = EINVAL;
2204 		break;
2205 	}
2206 
2207 	return (error);
2208 }
2209 
2210 static int
2211 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid)
2212 {
2213 	uint64_t val;
2214 	int error __maybe_unused;
2215 
2216 	if (vie->paging.cpl != 0) {
2217 		vm_inject_gp(vm, vcpuid);
2218 		vie->num_processed = 0;
2219 		return (0);
2220 	}
2221 
2222 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val);
2223 	ASSERT(error == 0);
2224 
2225 	/* Clear %cr0.TS */
2226 	val &= ~CR0_TS;
2227 
2228 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val);
2229 	ASSERT(error == 0);
2230 
2231 	return (0);
2232 }
2233 
2234 static int
2235 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2236     uint64_t *rval, int bytes)
2237 {
2238 	int err;
2239 
2240 	if (vie->mmio_req_read.state == VR_DONE) {
2241 		ASSERT(vie->mmio_req_read.bytes == bytes);
2242 		ASSERT(vie->mmio_req_read.gpa == gpa);
2243 
2244 		*rval = vie->mmio_req_read.data;
2245 		return (0);
2246 	}
2247 
2248 	err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes);
2249 	if (err == 0) {
2250 		/*
2251 		 * A successful read from an in-kernel-emulated device may come
2252 		 * with side effects, so stash the result in case it's used for
2253 		 * an instruction which subsequently needs to issue an MMIO
2254 		 * write to userspace.
2255 		 */
2256 		ASSERT(vie->mmio_req_read.state == VR_NONE);
2257 
2258 		vie->mmio_req_read.bytes = bytes;
2259 		vie->mmio_req_read.gpa = gpa;
2260 		vie->mmio_req_read.data = *rval;
2261 		vie->mmio_req_read.state = VR_DONE;
2262 
2263 	} else if (err == ESRCH) {
2264 		/* Hope that userspace emulation can fulfill this read */
2265 		vie->mmio_req_read.bytes = bytes;
2266 		vie->mmio_req_read.gpa = gpa;
2267 		vie->mmio_req_read.state = VR_PENDING;
2268 		vie->status |= VIES_PENDING_MMIO;
2269 	} else if (err < 0) {
2270 		/*
2271 		 * The MMIO read failed in such a way that fallback to handling
2272 		 * in userspace is required.
2273 		 */
2274 		vie->status |= VIES_USER_FALLBACK;
2275 	}
2276 	return (err);
2277 }
2278 
2279 static int
2280 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2281     uint64_t wval, int bytes)
2282 {
2283 	int err;
2284 
2285 	if (vie->mmio_req_write.state == VR_DONE) {
2286 		ASSERT(vie->mmio_req_write.bytes == bytes);
2287 		ASSERT(vie->mmio_req_write.gpa == gpa);
2288 
2289 		return (0);
2290 	}
2291 
2292 	err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes);
2293 	if (err == 0) {
2294 		/*
2295 		 * A successful write to an in-kernel-emulated device probably
2296 		 * results in side effects, so stash the fact that such a write
2297 		 * succeeded in case the operation requires other work.
2298 		 */
2299 		vie->mmio_req_write.bytes = bytes;
2300 		vie->mmio_req_write.gpa = gpa;
2301 		vie->mmio_req_write.data = wval;
2302 		vie->mmio_req_write.state = VR_DONE;
2303 	} else if (err == ESRCH) {
2304 		/* Hope that userspace emulation can fulfill this write */
2305 		vie->mmio_req_write.bytes = bytes;
2306 		vie->mmio_req_write.gpa = gpa;
2307 		vie->mmio_req_write.data = wval;
2308 		vie->mmio_req_write.state = VR_PENDING;
2309 		vie->status |= VIES_PENDING_MMIO;
2310 	} else if (err < 0) {
2311 		/*
2312 		 * The MMIO write failed in such a way that fallback to handling
2313 		 * in userspace is required.
2314 		 */
2315 		vie->status |= VIES_USER_FALLBACK;
2316 	}
2317 	return (err);
2318 }
2319 
2320 int
2321 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid)
2322 {
2323 	int error;
2324 	uint64_t gpa;
2325 
2326 	if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) !=
2327 	    (VIES_INST_DECODE | VIES_MMIO)) {
2328 		return (EINVAL);
2329 	}
2330 
2331 	gpa = vie->mmio_gpa;
2332 
2333 	switch (vie->op.op_type) {
2334 	case VIE_OP_TYPE_GROUP1:
2335 		error = vie_emulate_group1(vie, vm, vcpuid, gpa);
2336 		break;
2337 	case VIE_OP_TYPE_POP:
2338 		error = vie_emulate_pop(vie, vm, vcpuid, gpa);
2339 		break;
2340 	case VIE_OP_TYPE_PUSH:
2341 		error = vie_emulate_push(vie, vm, vcpuid, gpa);
2342 		break;
2343 	case VIE_OP_TYPE_CMP:
2344 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2345 		break;
2346 	case VIE_OP_TYPE_MOV:
2347 		error = vie_emulate_mov(vie, vm, vcpuid, gpa);
2348 		break;
2349 	case VIE_OP_TYPE_MOVSX:
2350 	case VIE_OP_TYPE_MOVZX:
2351 		error = vie_emulate_movx(vie, vm, vcpuid, gpa);
2352 		break;
2353 	case VIE_OP_TYPE_MOVS:
2354 		error = vie_emulate_movs(vie, vm, vcpuid, gpa);
2355 		break;
2356 	case VIE_OP_TYPE_STOS:
2357 		error = vie_emulate_stos(vie, vm, vcpuid, gpa);
2358 		break;
2359 	case VIE_OP_TYPE_AND:
2360 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
2361 		break;
2362 	case VIE_OP_TYPE_OR:
2363 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
2364 		break;
2365 	case VIE_OP_TYPE_SUB:
2366 		error = vie_emulate_sub(vie, vm, vcpuid, gpa);
2367 		break;
2368 	case VIE_OP_TYPE_BITTEST:
2369 		error = vie_emulate_bittest(vie, vm, vcpuid, gpa);
2370 		break;
2371 	case VIE_OP_TYPE_TWOB_GRP15:
2372 		error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa);
2373 		break;
2374 	case VIE_OP_TYPE_ADD:
2375 		error = vie_emulate_add(vie, vm, vcpuid, gpa);
2376 		break;
2377 	case VIE_OP_TYPE_TEST:
2378 		error = vie_emulate_test(vie, vm, vcpuid, gpa);
2379 		break;
2380 	case VIE_OP_TYPE_BEXTR:
2381 		error = vie_emulate_bextr(vie, vm, vcpuid, gpa);
2382 		break;
2383 	case VIE_OP_TYPE_MUL:
2384 		error = vie_emulate_mul(vie, vm, vcpuid, gpa);
2385 		break;
2386 	default:
2387 		error = EINVAL;
2388 		break;
2389 	}
2390 
2391 	if (error == ESRCH) {
2392 		/* Return to userspace with the mmio request */
2393 		return (-1);
2394 	}
2395 
2396 	return (error);
2397 }
2398 
2399 static int
2400 vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid,
2401     uint32_t *eax)
2402 {
2403 	uint32_t mask, val;
2404 	bool in;
2405 	int err;
2406 
2407 	mask = vie_size2mask(vie->inout.bytes);
2408 	in = (vie->inout.flags & INOUT_IN) != 0;
2409 
2410 	if (!in) {
2411 		val = *eax & mask;
2412 	}
2413 
2414 	if (vie->inout_req_state != VR_DONE) {
2415 		err = vm_ioport_access(vm, vcpuid, in, vie->inout.port,
2416 		    vie->inout.bytes, &val);
2417 		val &= mask;
2418 	} else {
2419 		/*
2420 		 * This port access was handled in userspace and the result was
2421 		 * injected in to be handled now.
2422 		 */
2423 		val = vie->inout_req_val & mask;
2424 		vie->inout_req_state = VR_NONE;
2425 		err = 0;
2426 	}
2427 
2428 	if (err == ESRCH) {
2429 		vie->status |= VIES_PENDING_INOUT;
2430 		vie->inout_req_state = VR_PENDING;
2431 		return (err);
2432 	} else if (err != 0) {
2433 		return (err);
2434 	}
2435 
2436 	if (in) {
2437 		*eax = (*eax & ~mask) | val;
2438 	}
2439 	return (0);
2440 }
2441 
2442 static enum vm_reg_name
2443 vie_inout_segname(const struct vie *vie)
2444 {
2445 	uint8_t segidx = vie->inout.segment;
2446 	const enum vm_reg_name segmap[] = {
2447 		VM_REG_GUEST_ES,
2448 		VM_REG_GUEST_CS,
2449 		VM_REG_GUEST_SS,
2450 		VM_REG_GUEST_DS,
2451 		VM_REG_GUEST_FS,
2452 		VM_REG_GUEST_GS,
2453 	};
2454 	const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));
2455 
2456 	if (segidx >= maxidx) {
2457 		panic("unexpected segment index %u", segidx);
2458 	}
2459 	return (segmap[segidx]);
2460 }
2461 
2462 static int
2463 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
2464 {
2465 	uint8_t bytes, addrsize;
2466 	uint64_t index, count = 0, gla, rflags;
2467 	int prot, err, fault;
2468 	bool in, repeat;
2469 	enum vm_reg_name seg_reg, idx_reg;
2470 	struct vm_copyinfo copyinfo[2];
2471 
2472 	in = (vie->inout.flags & INOUT_IN) != 0;
2473 	bytes = vie->inout.bytes;
2474 	addrsize = vie->inout.addrsize;
2475 	prot = in ? PROT_WRITE : PROT_READ;
2476 
2477 	ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
2478 	ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);
2479 
2480 	idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
2481 	seg_reg = vie_inout_segname(vie);
2482 	err = vm_get_register(vm, vcpuid, idx_reg, &index);
2483 	ASSERT(err == 0);
2484 	index = index & vie_size2mask(addrsize);
2485 
2486 	repeat = (vie->inout.flags & INOUT_REP) != 0;
2487 
2488 	/* Count register */
2489 	if (repeat) {
2490 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
2491 		count &= vie_size2mask(addrsize);
2492 
2493 		if (count == 0) {
2494 			/*
2495 			 * If we were asked to emulate a REP INS/OUTS when the
2496 			 * count register is zero, no further work is required.
2497 			 */
2498 			return (0);
2499 		}
2500 	} else {
2501 		count = 1;
2502 	}
2503 
2504 	gla = 0;
2505 	if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
2506 	    idx_reg, &gla) != 0) {
2507 		/* vie_get_gla() already injected the appropriate fault */
2508 		return (0);
2509 	}
2510 
2511 	/*
2512 	 * The INS/OUTS emulate currently assumes that the memory target resides
2513 	 * within the guest system memory, rather than a device MMIO region.  If
2514 	 * such a case becomes a necessity, that additional handling could be
2515 	 * put in place.
2516 	 */
2517 	err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot,
2518 	    copyinfo, nitems(copyinfo), &fault);
2519 
2520 	if (err) {
2521 		/* Unrecoverable error */
2522 		return (err);
2523 	} else if (fault) {
2524 		/* Resume guest to handle fault */
2525 		return (0);
2526 	}
2527 
2528 	if (!in) {
2529 		vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes);
2530 	}
2531 
2532 	err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2533 
2534 	if (err == 0 && in) {
2535 		vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes);
2536 	}
2537 
2538 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2539 
2540 	if (err == 0) {
2541 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
2542 		    &rflags);
2543 		ASSERT(err == 0);
2544 
2545 		/* Update index */
2546 		if (rflags & PSL_D) {
2547 			index -= bytes;
2548 		} else {
2549 			index += bytes;
2550 		}
2551 
2552 		/* Update index register */
2553 		err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize);
2554 		ASSERT(err == 0);
2555 
2556 		/*
2557 		 * Update count register only if the instruction had a repeat
2558 		 * prefix.
2559 		 */
2560 		if ((vie->inout.flags & INOUT_REP) != 0) {
2561 			count--;
2562 			err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
2563 			    count, addrsize);
2564 			ASSERT(err == 0);
2565 
2566 			if (count != 0) {
2567 				return (vie_repeat(vie));
2568 			}
2569 		}
2570 	}
2571 
2572 	return (err);
2573 }
2574 
2575 int
2576 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid)
2577 {
2578 	int err = 0;
2579 
2580 	if ((vie->status & VIES_INOUT) == 0) {
2581 		return (EINVAL);
2582 	}
2583 
2584 	if ((vie->inout.flags & INOUT_STR) == 0) {
2585 		/*
2586 		 * For now, using the 'rep' prefixes with plain (non-string)
2587 		 * in/out is not supported.
2588 		 */
2589 		if ((vie->inout.flags & INOUT_REP) != 0) {
2590 			return (EINVAL);
2591 		}
2592 
2593 		err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2594 		if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) {
2595 			/*
2596 			 * With the inX access now a success, the result needs
2597 			 * to be stored in the guest %rax.
2598 			 */
2599 			err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2600 			    vie->inout.eax);
2601 			VERIFY0(err);
2602 		}
2603 	} else {
2604 		vie->status &= ~VIES_REPEAT;
2605 		err = vie_emulate_inout_str(vie, vm, vcpuid);
2606 
2607 	}
2608 	if (err < 0) {
2609 		/*
2610 		 * Access to an I/O port failed in such a way that fallback to
2611 		 * handling in userspace is required.
2612 		 */
2613 		vie->status |= VIES_USER_FALLBACK;
2614 	} else if (err == ESRCH) {
2615 		ASSERT(vie->status & VIES_PENDING_INOUT);
2616 		/* Return to userspace with the in/out request */
2617 		err = -1;
2618 	}
2619 
2620 	return (err);
2621 }
2622 
2623 int
2624 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid)
2625 {
2626 	int error;
2627 
2628 	if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) !=
2629 	    (VIES_INST_DECODE | VIES_OTHER)) {
2630 		return (EINVAL);
2631 	}
2632 
2633 	switch (vie->op.op_type) {
2634 	case VIE_OP_TYPE_CLTS:
2635 		error = vie_emulate_clts(vie, vm, vcpuid);
2636 		break;
2637 	case VIE_OP_TYPE_MOV_CR:
2638 		error = vie_emulate_mov_cr(vie, vm, vcpuid);
2639 		break;
2640 	default:
2641 		error = EINVAL;
2642 		break;
2643 	}
2644 
2645 	return (error);
2646 }
2647 
2648 void
2649 vie_reset(struct vie *vie)
2650 {
2651 	vie->status = 0;
2652 	vie->num_processed = vie->num_valid = 0;
2653 }
2654 
2655 void
2656 vie_advance_pc(struct vie *vie, uint64_t *nextrip)
2657 {
2658 	VERIFY((vie->status & VIES_REPEAT) == 0);
2659 
2660 	*nextrip += vie->num_processed;
2661 	vie_reset(vie);
2662 }
2663 
2664 void
2665 vie_exitinfo(const struct vie *vie, struct vm_exit *vme)
2666 {
2667 	if (vie->status & VIES_USER_FALLBACK) {
2668 		/*
2669 		 * Despite the fact that the instruction was successfully
2670 		 * decoded, some aspect of the emulation failed in such a way
2671 		 * that it is left up to userspace to complete the operation.
2672 		 */
2673 		vie_fallback_exitinfo(vie, vme);
2674 	} else if (vie->status & VIES_MMIO) {
2675 		vme->exitcode = VM_EXITCODE_MMIO;
2676 		if (vie->mmio_req_read.state == VR_PENDING) {
2677 			vme->u.mmio.gpa = vie->mmio_req_read.gpa;
2678 			vme->u.mmio.data = 0;
2679 			vme->u.mmio.bytes = vie->mmio_req_read.bytes;
2680 			vme->u.mmio.read = 1;
2681 		} else if (vie->mmio_req_write.state == VR_PENDING) {
2682 			vme->u.mmio.gpa = vie->mmio_req_write.gpa;
2683 			vme->u.mmio.data = vie->mmio_req_write.data &
2684 			    vie_size2mask(vie->mmio_req_write.bytes);
2685 			vme->u.mmio.bytes = vie->mmio_req_write.bytes;
2686 			vme->u.mmio.read = 0;
2687 		} else {
2688 			panic("bad pending MMIO state");
2689 		}
2690 	} else if (vie->status & VIES_INOUT) {
2691 		vme->exitcode = VM_EXITCODE_INOUT;
2692 		vme->u.inout.port = vie->inout.port;
2693 		vme->u.inout.bytes = vie->inout.bytes;
2694 		if ((vie->inout.flags & INOUT_IN) != 0) {
2695 			vme->u.inout.flags = INOUT_IN;
2696 			vme->u.inout.eax = 0;
2697 		} else {
2698 			vme->u.inout.flags = 0;
2699 			vme->u.inout.eax = vie->inout.eax &
2700 			    vie_size2mask(vie->inout.bytes);
2701 		}
2702 	} else {
2703 		panic("no pending operation");
2704 	}
2705 }
2706 
2707 /*
2708  * In the case of a decoding or verification failure, bailing out to userspace
2709  * to do the instruction emulation is our only option for now.
2710  */
2711 void
2712 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme)
2713 {
2714 	if ((vie->status & VIES_INST_FETCH) == 0) {
2715 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
2716 	} else {
2717 		ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst));
2718 
2719 		bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst));
2720 		vme->u.inst_emul.num_valid = vie->num_valid;
2721 	}
2722 	vme->exitcode = VM_EXITCODE_INST_EMUL;
2723 }
2724 
2725 void
2726 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base,
2727     int *cs_d)
2728 {
2729 	struct seg_desc cs_desc;
2730 	int error __maybe_unused;
2731 
2732 	error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc);
2733 	ASSERT(error == 0);
2734 
2735 	/* Initialization required for the paging info to be populated */
2736 	VERIFY(vie->status & VIES_INIT);
2737 	switch (vie->paging.cpu_mode) {
2738 	case CPU_MODE_REAL:
2739 		*cs_base = cs_desc.base;
2740 		*cs_d = 0;
2741 		break;
2742 	case CPU_MODE_PROTECTED:
2743 	case CPU_MODE_COMPATIBILITY:
2744 		*cs_base = cs_desc.base;
2745 		*cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0;
2746 		break;
2747 	default:
2748 		*cs_base = 0;
2749 		*cs_d = 0;
2750 		break;
2751 	}
2752 }
2753 
2754 bool
2755 vie_pending(const struct vie *vie)
2756 {
2757 	/*
2758 	 * These VIE status bits indicate conditions which must be addressed
2759 	 * through either device IO fulfillment (with corresponding
2760 	 * vie_fulfill_*()) or complete userspace emulation (followed by a
2761 	 * vie_reset()).
2762 	 */
2763 	const enum vie_status of_interest =
2764 	    VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK;
2765 
2766 	return ((vie->status & of_interest) != 0);
2767 }
2768 
2769 bool
2770 vie_needs_fetch(const struct vie *vie)
2771 {
2772 	if (vie->status & VIES_INST_FETCH) {
2773 		ASSERT(vie->num_valid != 0);
2774 		return (false);
2775 	}
2776 	return (true);
2777 }
2778 
2779 static int
2780 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
2781 {
2782 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2783 	    ("%s: invalid size %d", __func__, size));
2784 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
2785 
2786 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
2787 		return (0);
2788 
2789 	return ((gla & (size - 1)) ? 1 : 0);
2790 }
2791 
2792 static int
2793 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
2794 {
2795 	uint64_t mask;
2796 
2797 	if (cpu_mode != CPU_MODE_64BIT)
2798 		return (0);
2799 
2800 	/*
2801 	 * The value of the bit 47 in the 'gla' should be replicated in the
2802 	 * most significant 16 bits.
2803 	 */
2804 	mask = ~((1UL << 48) - 1);
2805 	if (gla & (1UL << 47))
2806 		return ((gla & mask) != mask);
2807 	else
2808 		return ((gla & mask) != 0);
2809 }
2810 
2811 static uint64_t
2812 vie_size2mask(int size)
2813 {
2814 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2815 	    ("vie_size2mask: invalid size %d", size));
2816 	return (size2mask[size]);
2817 }
2818 
/*
 * Calculate the guest linear address for an access of 'length' bytes at
 * 'offset' within the segment 'seg' (described by 'desc').
 *
 * Outside of 64-bit mode, the segment descriptor is checked for usability,
 * for access rights compatible with 'prot', and every byte of the access is
 * checked against the segment limit (with the offset wrapping at the
 * address size).  No such checks are made in 64-bit mode, where the segment
 * base is additionally treated as zero for all segments but %fs and %gs.
 *
 * Returns 0 and stores the result in '*gla' on success, or -1 if one of the
 * segment checks failed (in which case '*gla' is left untouched).
 */
static int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %x", __func__, prot));

	/*
	 * Keep the starting offset, since 'offset' itself is advanced by the
	 * per-byte limit checks below.
	 */
	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to a exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		/*
		 * Check each byte of the access against the limits, wrapping
		 * the offset at the address size as it is advanced.
		 */
		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}
2928 
2929 void
2930 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length,
2931     const struct vm_guest_paging *paging, uint64_t gpa)
2932 {
2933 	KASSERT(inst_length <= VIE_INST_SIZE,
2934 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
2935 
2936 	bzero(vie, sizeof (struct vie));
2937 
2938 	vie->base_register = VM_REG_LAST;
2939 	vie->index_register = VM_REG_LAST;
2940 	vie->segment_register = VM_REG_LAST;
2941 	vie->status = VIES_INIT | VIES_MMIO;
2942 
2943 	if (inst_length != 0) {
2944 		bcopy(inst_bytes, vie->inst, inst_length);
2945 		vie->num_valid = inst_length;
2946 		vie->status |= VIES_INST_FETCH;
2947 	}
2948 
2949 	vie->paging = *paging;
2950 	vie->mmio_gpa = gpa;
2951 }
2952 
2953 void
2954 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len,
2955     const struct vm_guest_paging *paging)
2956 {
2957 	bzero(vie, sizeof (struct vie));
2958 
2959 	vie->status = VIES_INIT | VIES_INOUT;
2960 
2961 	vie->inout = *inout;
2962 	vie->paging = *paging;
2963 
2964 	/*
2965 	 * Since VMX/SVM assists already decoded the nature of the in/out
2966 	 * instruction, let the status reflect that.
2967 	 */
2968 	vie->status |= VIES_INST_FETCH | VIES_INST_DECODE;
2969 	vie->num_processed = inst_len;
2970 }
2971 
2972 void
2973 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging)
2974 {
2975 	bzero(vie, sizeof (struct vie));
2976 
2977 	vie->base_register = VM_REG_LAST;
2978 	vie->index_register = VM_REG_LAST;
2979 	vie->segment_register = VM_REG_LAST;
2980 	vie->status = VIES_INIT | VIES_OTHER;
2981 
2982 	vie->paging = *paging;
2983 }
2984 
2985 int
2986 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result)
2987 {
2988 	struct vie_mmio *pending;
2989 
2990 	if ((vie->status & VIES_MMIO) == 0 ||
2991 	    (vie->status & VIES_PENDING_MMIO) == 0) {
2992 		return (EINVAL);
2993 	}
2994 
2995 	if (result->read) {
2996 		pending = &vie->mmio_req_read;
2997 	} else {
2998 		pending = &vie->mmio_req_write;
2999 	}
3000 
3001 	if (pending->state != VR_PENDING ||
3002 	    pending->bytes != result->bytes || pending->gpa != result->gpa) {
3003 		return (EINVAL);
3004 	}
3005 
3006 	if (result->read) {
3007 		pending->data = result->data & vie_size2mask(pending->bytes);
3008 	}
3009 	pending->state = VR_DONE;
3010 	vie->status &= ~VIES_PENDING_MMIO;
3011 
3012 	return (0);
3013 }
3014 
3015 int
3016 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result)
3017 {
3018 	if ((vie->status & VIES_INOUT) == 0 ||
3019 	    (vie->status & VIES_PENDING_INOUT) == 0) {
3020 		return (EINVAL);
3021 	}
3022 	if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) ||
3023 	    vie->inout.bytes != result->bytes ||
3024 	    vie->inout.port != result->port) {
3025 		return (EINVAL);
3026 	}
3027 
3028 	if (result->flags & INOUT_IN) {
3029 		vie->inout_req_val = result->eax &
3030 		    vie_size2mask(vie->inout.bytes);
3031 	}
3032 	vie->inout_req_state = VR_DONE;
3033 	vie->status &= ~(VIES_PENDING_INOUT);
3034 
3035 	return (0);
3036 }
3037 
3038 uint64_t
3039 vie_mmio_gpa(const struct vie *vie)
3040 {
3041 	return (vie->mmio_gpa);
3042 }
3043 
3044 static int
3045 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
3046 {
3047 	int error_code = 0;
3048 
3049 	if (pte & PG_V)
3050 		error_code |= PGEX_P;
3051 	if (prot & PROT_WRITE)
3052 		error_code |= PGEX_W;
3053 	if (usermode)
3054 		error_code |= PGEX_U;
3055 	if (rsvd)
3056 		error_code |= PGEX_RSV;
3057 	if (prot & PROT_EXEC)
3058 		error_code |= PGEX_I;
3059 
3060 	return (error_code);
3061 }
3062 
3063 static void
3064 ptp_release(vm_page_t **vmp)
3065 {
3066 	if (*vmp != NULL) {
3067 		(void) vmp_release(*vmp);
3068 		*vmp = NULL;
3069 	}
3070 }
3071 
3072 static void *
3073 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp)
3074 {
3075 	vm_client_t *vmc = vm_get_vmclient(vm, vcpu);
3076 	const uintptr_t hold_gpa = gpa & PAGEMASK;
3077 
3078 	/* Hold must not cross a page boundary */
3079 	VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE);
3080 
3081 	if (*vmp != NULL) {
3082 		(void) vmp_release(*vmp);
3083 	}
3084 
3085 	*vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE);
3086 	if (*vmp == NULL) {
3087 		return (NULL);
3088 	}
3089 
3090 	return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa));
3091 }
3092 
/*
 * Translate the guest linear address 'gla' into a guest physical address by
 * walking the guest page tables rooted at paging->cr3, according to the guest
 * paging mode (flat, 32-bit, PAE, or 4-level).
 *
 * Return values:
 *   0 with *guest_fault == 0: translation succeeded and '*gpa' is valid
 *   0 with *guest_fault == 1: the access would fault in the guest.  Unless
 *	'check_only' is set, the corresponding exception (#GP for a
 *	non-canonical address, #PF otherwise) has been injected.
 *   EFAULT: a page holding part of the page tables could not be held
 *
 * Unless 'check_only' is set, the accessed bit (and the dirty bit in the final
 * entry, for writes) is set in the guest page table entries.  Those updates
 * are made with compare-and-set operations, and the entire walk is restarted
 * should one of them fail.
 */
static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode;
	int ptpshift = 0, ptpindex = 0;
	uint64_t ptpphys;
	uint64_t *ptpbase = NULL, pte = 0, pgsize = 0;
	vm_page_t *cookie = NULL;
	const bool usermode = paging->cpl == 3;
	const bool writable = (prot & PROT_WRITE) != 0;

	*guest_fault = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	/* Drop any page hold left over from a walk which was restarted */
	ptp_release(&cookie);

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vm, vcpuid);
		*guest_fault = 1;
		return (0);
	}

	/* Without paging, linear and physical addresses are identical */
	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		return (0);
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		uint32_t *ptpbase32, pte32;

		/* Two levels of tables, each indexed by 10 bits of the gla */
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL) {
				return (EFAULT);
			}

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			/*
			 * Fault if the entry is not valid, or lacks the
			 * permissions required for this access.
			 */
			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    0, pte32);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}

				ptp_release(&cookie);
				*guest_fault = 1;
				return (0);
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		ptp_release(&cookie);
		return (0);
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		/* The top-level table consists of only four entries */
		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL) {
			return (EFAULT);
		}

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}

			ptp_release(&cookie);
			*guest_fault = 1;
			return (0);
		}

		ptpphys = pte;

		/* Two more levels remain to be walked by the loop below */
		nlevels = 2;
	} else {
		nlevels = 4;
	}

	/* Walk the remaining levels, each indexed by 9 bits of the gla */
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys &= 0x000ffffffffff000UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL) {
			return (EFAULT);
		}

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		/*
		 * Fault if the entry is not valid, or lacks the permissions
		 * required for this access.
		 */
		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}

			ptp_release(&cookie);
			*guest_fault = 1;
			return (0);
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			/*
			 * A large page bit on a mapping bigger than 1GB is
			 * reported as a reserved-bit page fault.
			 */
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    1, pte);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}

				ptp_release(&cookie);
				*guest_fault = 1;
				return (0);
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}
	ptp_release(&cookie);

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
	return (0);
}
3295 
/*
 * Translate a guest linear address to a guest physical address via
 * _vm_gla2gpa() with 'check_only' disabled, so that any exception resulting
 * from the translation is injected into the guest.
 */
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = false;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
3304 
/*
 * Translate a guest linear address to a guest physical address via
 * _vm_gla2gpa() with 'check_only' enabled, so that a failed translation is
 * reported through 'guest_fault' without any exception being injected.
 */
int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = true;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
3313 
3314 int
3315 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip,
3316     int *faultptr)
3317 {
3318 	struct vm_copyinfo copyinfo[2];
3319 	int error, prot;
3320 
3321 	if ((vie->status & VIES_INIT) == 0) {
3322 		return (EINVAL);
3323 	}
3324 
3325 	prot = PROT_READ | PROT_EXEC;
3326 	error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE,
3327 	    prot, copyinfo, nitems(copyinfo), faultptr);
3328 	if (error || *faultptr)
3329 		return (error);
3330 
3331 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE);
3332 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
3333 	vie->num_valid = VIE_INST_SIZE;
3334 	vie->status |= VIES_INST_FETCH;
3335 	return (0);
3336 }
3337 
3338 static int
3339 vie_peek(struct vie *vie, uint8_t *x)
3340 {
3341 
3342 	if (vie->num_processed < vie->num_valid) {
3343 		*x = vie->inst[vie->num_processed];
3344 		return (0);
3345 	} else
3346 		return (-1);
3347 }
3348 
3349 static void
3350 vie_advance(struct vie *vie)
3351 {
3352 
3353 	vie->num_processed++;
3354 }
3355 
3356 static bool
3357 segment_override(uint8_t x, int *seg)
3358 {
3359 
3360 	switch (x) {
3361 	case 0x2E:
3362 		*seg = VM_REG_GUEST_CS;
3363 		break;
3364 	case 0x36:
3365 		*seg = VM_REG_GUEST_SS;
3366 		break;
3367 	case 0x3E:
3368 		*seg = VM_REG_GUEST_DS;
3369 		break;
3370 	case 0x26:
3371 		*seg = VM_REG_GUEST_ES;
3372 		break;
3373 	case 0x64:
3374 		*seg = VM_REG_GUEST_FS;
3375 		break;
3376 	case 0x65:
3377 		*seg = VM_REG_GUEST_GS;
3378 		break;
3379 	default:
3380 		return (false);
3381 	}
3382 	return (true);
3383 }
3384 
3385 static int
3386 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
3387 {
3388 	uint8_t x;
3389 
3390 	while (1) {
3391 		if (vie_peek(vie, &x))
3392 			return (-1);
3393 
3394 		if (x == 0x66)
3395 			vie->opsize_override = 1;
3396 		else if (x == 0x67)
3397 			vie->addrsize_override = 1;
3398 		else if (x == 0xF3)
3399 			vie->repz_present = 1;
3400 		else if (x == 0xF2)
3401 			vie->repnz_present = 1;
3402 		else if (segment_override(x, &vie->segment_register))
3403 			vie->segment_override = 1;
3404 		else
3405 			break;
3406 
3407 		vie_advance(vie);
3408 	}
3409 
3410 	/*
3411 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
3412 	 * - Only one REX prefix is allowed per instruction.
3413 	 * - The REX prefix must immediately precede the opcode byte or the
3414 	 *   escape opcode byte.
3415 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
3416 	 *   the mandatory prefix must come before the REX prefix.
3417 	 */
3418 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
3419 		vie->rex_present = 1;
3420 		vie->rex_w = x & 0x8 ? 1 : 0;
3421 		vie->rex_r = x & 0x4 ? 1 : 0;
3422 		vie->rex_x = x & 0x2 ? 1 : 0;
3423 		vie->rex_b = x & 0x1 ? 1 : 0;
3424 		vie_advance(vie);
3425 	}
3426 
3427 	/*
3428 	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
3429 	 */
3430 	if ((cpu_mode == CPU_MODE_64BIT ||
3431 	    cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) {
3432 		const struct vie_op *optab;
3433 
3434 		/* 3-byte VEX prefix. */
3435 		vie->vex_present = 1;
3436 
3437 		vie_advance(vie);
3438 		if (vie_peek(vie, &x))
3439 			return (-1);
3440 
3441 		/*
3442 		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
3443 		 * relative to REX encoding.
3444 		 */
3445 		vie->rex_r = x & 0x80 ? 0 : 1;
3446 		vie->rex_x = x & 0x40 ? 0 : 1;
3447 		vie->rex_b = x & 0x20 ? 0 : 1;
3448 
3449 		switch (x & 0x1F) {
3450 		case 0x2:
3451 			/* 0F 38. */
3452 			optab = three_byte_opcodes_0f38;
3453 			break;
3454 		case 0x1:
3455 			/* 0F class - nothing handled here yet. */
3456 			/* FALLTHROUGH */
3457 		case 0x3:
3458 			/* 0F 3A class - nothing handled here yet. */
3459 			/* FALLTHROUGH */
3460 		default:
3461 			/* Reserved (#UD). */
3462 			return (-1);
3463 		}
3464 
3465 		vie_advance(vie);
3466 		if (vie_peek(vie, &x))
3467 			return (-1);
3468 
3469 		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
3470 		vie->rex_w = x & 0x80 ? 1 : 0;
3471 
3472 		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
3473 		vie->vex_l = !!(x & 0x4);
3474 		vie->vex_pp = (x & 0x3);
3475 
3476 		/* PP: 1=66 2=F3 3=F2 prefixes. */
3477 		switch (vie->vex_pp) {
3478 		case 0x1:
3479 			vie->opsize_override = 1;
3480 			break;
3481 		case 0x2:
3482 			vie->repz_present = 1;
3483 			break;
3484 		case 0x3:
3485 			vie->repnz_present = 1;
3486 			break;
3487 		}
3488 
3489 		vie_advance(vie);
3490 
3491 		/* Opcode, sans literal prefix prefix. */
3492 		if (vie_peek(vie, &x))
3493 			return (-1);
3494 
3495 		vie->op = optab[x];
3496 		if (vie->op.op_type == VIE_OP_TYPE_NONE)
3497 			return (-1);
3498 
3499 		vie_advance(vie);
3500 	}
3501 
3502 	/*
3503 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
3504 	 */
3505 	if (cpu_mode == CPU_MODE_64BIT) {
3506 		/*
3507 		 * Default address size is 64-bits and default operand size
3508 		 * is 32-bits.
3509 		 */
3510 		vie->addrsize = vie->addrsize_override ? 4 : 8;
3511 		if (vie->rex_w)
3512 			vie->opsize = 8;
3513 		else if (vie->opsize_override)
3514 			vie->opsize = 2;
3515 		else
3516 			vie->opsize = 4;
3517 	} else if (cs_d) {
3518 		/* Default address and operand sizes are 32-bits */
3519 		vie->addrsize = vie->addrsize_override ? 2 : 4;
3520 		vie->opsize = vie->opsize_override ? 2 : 4;
3521 	} else {
3522 		/* Default address and operand sizes are 16-bits */
3523 		vie->addrsize = vie->addrsize_override ? 4 : 2;
3524 		vie->opsize = vie->opsize_override ? 4 : 2;
3525 	}
3526 	return (0);
3527 }
3528 
3529 static int
3530 decode_two_byte_opcode(struct vie *vie)
3531 {
3532 	uint8_t x;
3533 
3534 	if (vie_peek(vie, &x))
3535 		return (-1);
3536 
3537 	vie->op = two_byte_opcodes[x];
3538 
3539 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3540 		return (-1);
3541 
3542 	vie_advance(vie);
3543 	return (0);
3544 }
3545 
3546 static int
3547 decode_opcode(struct vie *vie)
3548 {
3549 	uint8_t x;
3550 
3551 	if (vie_peek(vie, &x))
3552 		return (-1);
3553 
3554 	/* Already did this via VEX prefix. */
3555 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
3556 		return (0);
3557 
3558 	vie->op = one_byte_opcodes[x];
3559 
3560 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3561 		return (-1);
3562 
3563 	vie_advance(vie);
3564 
3565 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
3566 		return (decode_two_byte_opcode(vie));
3567 
3568 	return (0);
3569 }
3570 
3571 static int
3572 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
3573 {
3574 	uint8_t x;
3575 	/*
3576 	 * Handling mov-to/from-cr is special since it is not issuing
3577 	 * mmio/pio requests and can be done in real mode.  We must bypass some
3578 	 * of the other existing decoding restrictions for it.
3579 	 */
3580 	const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0);
3581 
3582 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
3583 		return (0);
3584 
3585 	if (cpu_mode == CPU_MODE_REAL && !is_movcr)
3586 		return (-1);
3587 
3588 	if (vie_peek(vie, &x))
3589 		return (-1);
3590 
3591 	vie->mod = (x >> 6) & 0x3;
3592 	vie->rm =  (x >> 0) & 0x7;
3593 	vie->reg = (x >> 3) & 0x7;
3594 
3595 	/*
3596 	 * A direct addressing mode makes no sense in the context of an EPT
3597 	 * fault. There has to be a memory access involved to cause the
3598 	 * EPT fault.
3599 	 */
3600 	if (vie->mod == VIE_MOD_DIRECT && !is_movcr)
3601 		return (-1);
3602 
3603 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
3604 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
3605 		/*
3606 		 * Table 2-5: Special Cases of REX Encodings
3607 		 *
3608 		 * mod=0, r/m=5 is used in the compatibility mode to
3609 		 * indicate a disp32 without a base register.
3610 		 *
3611 		 * mod!=3, r/m=4 is used in the compatibility mode to
3612 		 * indicate that the SIB byte is present.
3613 		 *
3614 		 * The 'b' bit in the REX prefix is don't care in
3615 		 * this case.
3616 		 */
3617 	} else {
3618 		vie->rm |= (vie->rex_b << 3);
3619 	}
3620 
3621 	vie->reg |= (vie->rex_r << 3);
3622 
3623 	/* SIB */
3624 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
3625 		goto done;
3626 
3627 	vie->base_register = gpr_map[vie->rm];
3628 
3629 	switch (vie->mod) {
3630 	case VIE_MOD_INDIRECT_DISP8:
3631 		vie->disp_bytes = 1;
3632 		break;
3633 	case VIE_MOD_INDIRECT_DISP32:
3634 		vie->disp_bytes = 4;
3635 		break;
3636 	case VIE_MOD_INDIRECT:
3637 		if (vie->rm == VIE_RM_DISP32) {
3638 			vie->disp_bytes = 4;
3639 			/*
3640 			 * Table 2-7. RIP-Relative Addressing
3641 			 *
3642 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
3643 			 * whereas in compatibility mode it just implies disp32.
3644 			 */
3645 
3646 			if (cpu_mode == CPU_MODE_64BIT)
3647 				vie->base_register = VM_REG_GUEST_RIP;
3648 			else
3649 				vie->base_register = VM_REG_LAST;
3650 		}
3651 		break;
3652 	}
3653 
3654 done:
3655 	vie_advance(vie);
3656 
3657 	return (0);
3658 }
3659 
3660 static int
3661 decode_sib(struct vie *vie)
3662 {
3663 	uint8_t x;
3664 
3665 	/* Proceed only if SIB byte is present */
3666 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
3667 		return (0);
3668 
3669 	if (vie_peek(vie, &x))
3670 		return (-1);
3671 
3672 	/* De-construct the SIB byte */
3673 	vie->ss = (x >> 6) & 0x3;
3674 	vie->index = (x >> 3) & 0x7;
3675 	vie->base = (x >> 0) & 0x7;
3676 
3677 	/* Apply the REX prefix modifiers */
3678 	vie->index |= vie->rex_x << 3;
3679 	vie->base |= vie->rex_b << 3;
3680 
3681 	switch (vie->mod) {
3682 	case VIE_MOD_INDIRECT_DISP8:
3683 		vie->disp_bytes = 1;
3684 		break;
3685 	case VIE_MOD_INDIRECT_DISP32:
3686 		vie->disp_bytes = 4;
3687 		break;
3688 	}
3689 
3690 	if (vie->mod == VIE_MOD_INDIRECT &&
3691 	    (vie->base == 5 || vie->base == 13)) {
3692 		/*
3693 		 * Special case when base register is unused if mod = 0
3694 		 * and base = %rbp or %r13.
3695 		 *
3696 		 * Documented in:
3697 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3698 		 * Table 2-5: Special Cases of REX Encodings
3699 		 */
3700 		vie->disp_bytes = 4;
3701 	} else {
3702 		vie->base_register = gpr_map[vie->base];
3703 	}
3704 
3705 	/*
3706 	 * All encodings of 'index' are valid except for %rsp (4).
3707 	 *
3708 	 * Documented in:
3709 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3710 	 * Table 2-5: Special Cases of REX Encodings
3711 	 */
3712 	if (vie->index != 4)
3713 		vie->index_register = gpr_map[vie->index];
3714 
3715 	/* 'scale' makes sense only in the context of an index register */
3716 	if (vie->index_register < VM_REG_LAST)
3717 		vie->scale = 1 << vie->ss;
3718 
3719 	vie_advance(vie);
3720 
3721 	return (0);
3722 }
3723 
3724 static int
3725 decode_displacement(struct vie *vie)
3726 {
3727 	int n, i;
3728 	uint8_t x;
3729 
3730 	union {
3731 		char	buf[4];
3732 		int8_t	signed8;
3733 		int32_t	signed32;
3734 	} u;
3735 
3736 	if ((n = vie->disp_bytes) == 0)
3737 		return (0);
3738 
3739 	if (n != 1 && n != 4)
3740 		panic("decode_displacement: invalid disp_bytes %d", n);
3741 
3742 	for (i = 0; i < n; i++) {
3743 		if (vie_peek(vie, &x))
3744 			return (-1);
3745 
3746 		u.buf[i] = x;
3747 		vie_advance(vie);
3748 	}
3749 
3750 	if (n == 1)
3751 		vie->displacement = u.signed8;		/* sign-extended */
3752 	else
3753 		vie->displacement = u.signed32;		/* sign-extended */
3754 
3755 	return (0);
3756 }
3757 
3758 static int
3759 decode_immediate(struct vie *vie)
3760 {
3761 	int i, n;
3762 	uint8_t x;
3763 	union {
3764 		char	buf[4];
3765 		int8_t	signed8;
3766 		int16_t	signed16;
3767 		int32_t	signed32;
3768 	} u;
3769 
3770 	/* Figure out immediate operand size (if any) */
3771 	if (vie->op.op_flags & VIE_OP_F_IMM) {
3772 		/*
3773 		 * Section 2.2.1.5 "Immediates", Intel SDM:
3774 		 * In 64-bit mode the typical size of immediate operands
3775 		 * remains 32-bits. When the operand size if 64-bits, the
3776 		 * processor sign-extends all immediates to 64-bits prior
3777 		 * to their use.
3778 		 */
3779 		if (vie->opsize == 4 || vie->opsize == 8)
3780 			vie->imm_bytes = 4;
3781 		else
3782 			vie->imm_bytes = 2;
3783 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
3784 		vie->imm_bytes = 1;
3785 	}
3786 
3787 	if ((n = vie->imm_bytes) == 0)
3788 		return (0);
3789 
3790 	KASSERT(n == 1 || n == 2 || n == 4,
3791 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
3792 
3793 	for (i = 0; i < n; i++) {
3794 		if (vie_peek(vie, &x))
3795 			return (-1);
3796 
3797 		u.buf[i] = x;
3798 		vie_advance(vie);
3799 	}
3800 
3801 	/* sign-extend the immediate value before use */
3802 	if (n == 1)
3803 		vie->immediate = u.signed8;
3804 	else if (n == 2)
3805 		vie->immediate = u.signed16;
3806 	else
3807 		vie->immediate = u.signed32;
3808 
3809 	return (0);
3810 }
3811 
3812 static int
3813 decode_moffset(struct vie *vie)
3814 {
3815 	int i, n;
3816 	uint8_t x;
3817 	union {
3818 		char	buf[8];
3819 		uint64_t u64;
3820 	} u;
3821 
3822 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
3823 		return (0);
3824 
3825 	/*
3826 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
3827 	 * The memory offset size follows the address-size of the instruction.
3828 	 */
3829 	n = vie->addrsize;
3830 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
3831 
3832 	u.u64 = 0;
3833 	for (i = 0; i < n; i++) {
3834 		if (vie_peek(vie, &x))
3835 			return (-1);
3836 
3837 		u.buf[i] = x;
3838 		vie_advance(vie);
3839 	}
3840 	vie->displacement = u.u64;
3841 	return (0);
3842 }
3843 
3844 /*
3845  * Verify that the 'guest linear address' provided as collateral of the nested
3846  * page table fault matches with our instruction decoding.
3847  */
3848 int
3849 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla)
3850 {
3851 	int error;
3852 	uint64_t base, segbase, idx, gla2;
3853 	enum vm_reg_name seg;
3854 	struct seg_desc desc;
3855 
3856 	ASSERT((vie->status & VIES_INST_DECODE) != 0);
3857 
3858 	/*
3859 	 * If there was no valid GLA context with the exit, or the decoded
3860 	 * instruction acts on more than one address, verification is done.
3861 	 */
3862 	if (gla == VIE_INVALID_GLA ||
3863 	    (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) {
3864 		return (0);
3865 	}
3866 
3867 	base = 0;
3868 	if (vie->base_register != VM_REG_LAST) {
3869 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
3870 		if (error) {
3871 			printf("verify_gla: error %d getting base reg %d\n",
3872 			    error, vie->base_register);
3873 			return (-1);
3874 		}
3875 
3876 		/*
3877 		 * RIP-relative addressing starts from the following
3878 		 * instruction
3879 		 */
3880 		if (vie->base_register == VM_REG_GUEST_RIP)
3881 			base += vie->num_processed;
3882 	}
3883 
3884 	idx = 0;
3885 	if (vie->index_register != VM_REG_LAST) {
3886 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
3887 		if (error) {
3888 			printf("verify_gla: error %d getting index reg %d\n",
3889 			    error, vie->index_register);
3890 			return (-1);
3891 		}
3892 	}
3893 
3894 	/*
3895 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
3896 	 *
3897 	 * In 64-bit mode, segmentation is generally (but not
3898 	 * completely) disabled.  The exceptions are the FS and GS
3899 	 * segments.
3900 	 *
3901 	 * In legacy IA-32 mode, when the ESP or EBP register is used
3902 	 * as the base, the SS segment is the default segment.  For
3903 	 * other data references, except when relative to stack or
3904 	 * string destination the DS segment is the default.  These
3905 	 * can be overridden to allow other segments to be accessed.
3906 	 */
3907 	if (vie->segment_override) {
3908 		seg = vie->segment_register;
3909 	} else if (vie->base_register == VM_REG_GUEST_RSP ||
3910 	    vie->base_register == VM_REG_GUEST_RBP) {
3911 		seg = VM_REG_GUEST_SS;
3912 	} else {
3913 		seg = VM_REG_GUEST_DS;
3914 	}
3915 	if (vie->paging.cpu_mode == CPU_MODE_64BIT &&
3916 	    seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) {
3917 		segbase = 0;
3918 	} else {
3919 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
3920 		if (error) {
3921 			printf("verify_gla: error %d getting segment"
3922 			    " descriptor %d", error, vie->segment_register);
3923 			return (-1);
3924 		}
3925 		segbase = desc.base;
3926 	}
3927 
3928 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
3929 	gla2 &= size2mask[vie->addrsize];
3930 	if (gla != gla2) {
3931 		printf("verify_gla mismatch: segbase(0x%0lx)"
3932 		    "base(0x%0lx), scale(%d), index(0x%0lx), "
3933 		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
3934 		    segbase, base, vie->scale, idx, vie->displacement,
3935 		    gla, gla2);
3936 		return (-1);
3937 	}
3938 
3939 	return (0);
3940 }
3941 
3942 int
3943 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d)
3944 {
3945 	enum vm_cpu_mode cpu_mode;
3946 
3947 	if ((vie->status & VIES_INST_FETCH) == 0) {
3948 		return (EINVAL);
3949 	}
3950 
3951 	cpu_mode = vie->paging.cpu_mode;
3952 
3953 	if (decode_prefixes(vie, cpu_mode, cs_d))
3954 		return (-1);
3955 
3956 	if (decode_opcode(vie))
3957 		return (-1);
3958 
3959 	if (decode_modrm(vie, cpu_mode))
3960 		return (-1);
3961 
3962 	if (decode_sib(vie))
3963 		return (-1);
3964 
3965 	if (decode_displacement(vie))
3966 		return (-1);
3967 
3968 	if (decode_immediate(vie))
3969 		return (-1);
3970 
3971 	if (decode_moffset(vie))
3972 		return (-1);
3973 
3974 	vie->status |= VIES_INST_DECODE;
3975 
3976 	return (0);
3977 }
3978