xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_instruction_emul.c (revision badf94ff3599fab15963f6c532929e9bc411757a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 /*
32  * This file and its contents are supplied under the terms of the
33  * Common Development and Distribution License ("CDDL"), version 1.0.
34  * You may only use this file in accordance with the terms of version
35  * 1.0 of the CDDL.
36  *
37  * A full copy of the text of the CDDL should have accompanied this
38  * source.  A copy of the CDDL is also available via the Internet at
39  * http://www.illumos.org/license/CDDL.
40  *
41  * Copyright 2015 Pluribus Networks Inc.
42  * Copyright 2018 Joyent, Inc.
43  * Copyright 2021 Oxide Computer Company
44  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
45  */
46 
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49 
50 #include <sys/param.h>
51 #include <sys/pcpu.h>
52 #include <sys/systm.h>
53 #include <sys/proc.h>
54 
55 #include <machine/vmparam.h>
56 #include <machine/vmm.h>
57 #include <sys/vmm_kernel.h>
58 #include <sys/vmm_vm.h>
59 
60 #include <sys/vmm_instruction_emul.h>
61 #include <x86/psl.h>
62 #include <x86/specialreg.h>
63 
64 #include "vmm_ioport.h"
65 #include "vmm_ktr.h"
66 
/*
 * Bit flags describing the kind and progress of an instruction emulation.
 * They are OR-ed together in the 'status' field of struct vie.
 */
enum vie_status {
	VIES_INIT		= (1U << 0),
	VIES_MMIO		= (1U << 1),	/* checked by vie_repeat() */
	VIES_INOUT		= (1U << 2),	/* checked by vie_repeat() */
	VIES_OTHER		= (1U << 3),
	VIES_INST_FETCH		= (1U << 4),
	VIES_INST_DECODE	= (1U << 5),
	VIES_PENDING_MMIO	= (1U << 6),
	VIES_PENDING_INOUT	= (1U << 7),
	VIES_REPEAT		= (1U << 8),	/* set by vie_repeat() */
	VIES_USER_FALLBACK	= (1U << 9),
	VIES_COMPLETE		= (1U << 10),
};
80 
/*
 * State of request to perform emulated access (inout or MMIO).
 * vie_repeat() resets the cached request state(s) back to VR_NONE so a
 * repeated instruction does not reuse a stale result.
 */
enum vie_req {
	VR_NONE,
	VR_PENDING,
	VR_DONE,
};
87 
/*
 * Cached state for a single emulated MMIO access.  struct vie carries one of
 * these for reads (mmio_req_read) and one for writes (mmio_req_write).
 */
struct vie_mmio {
	uint64_t		data;	/* NOTE(review): presumably value read/written */
	uint64_t		gpa;	/* NOTE(review): presumably guest-physical addr */
	uint8_t			bytes;	/* NOTE(review): presumably access width */
	enum vie_req		state;	/* VR_NONE, VR_PENDING or VR_DONE */
};
94 
/*
 * Description of one supported opcode; the opcode tables below are arrays of
 * these, indexed by opcode byte.
 */
struct vie_op {
	uint8_t		op_byte;	/* actual opcode byte */
	uint8_t		op_type;	/* type of operation (e.g. MOV) */
	uint16_t	op_flags;	/* VIE_OP_F_* bits */
};
100 
/*
 * Instruction emulation context: the raw instruction bytes, the results of
 * decoding them, and the state of any MMIO or in/out request made on behalf
 * of the instruction.
 *
 * VIE_INST_SIZE is the capacity of the instruction byte buffer (15 bytes is
 * the architectural maximum length of an x86 instruction).
 */
#define	VIE_INST_SIZE	15
struct vie {
	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
	uint8_t		num_valid;		/* size of the instruction */
	uint8_t		num_processed;

	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
	uint8_t		rex_w:1,		/* REX prefix */
			rex_r:1,
			rex_x:1,
			rex_b:1,
			rex_present:1,
			repz_present:1,		/* REP/REPE/REPZ prefix */
			repnz_present:1,	/* REPNE/REPNZ prefix */
			opsize_override:1,	/* Operand size override */
			addrsize_override:1,	/* Address size override */
			segment_override:1;	/* Segment override */

	/*
	 * 'reg' and 'rm' are 4 bits wide; they are used directly as indices
	 * into the 16-entry gpr_map[] and cr_map[] tables.
	 */
	uint8_t		mod:2,			/* ModRM byte */
			reg:4,
			rm:4;

	uint8_t		ss:2,			/* SIB byte */
			vex_present:1,		/* VEX prefixed */
			vex_l:1,		/* L bit */
			index:4,		/* SIB byte */
			base:4;			/* SIB byte */

	uint8_t		disp_bytes;
	uint8_t		imm_bytes;

	uint8_t		scale;

	uint8_t		vex_reg:4,	/* vvvv: first source reg specifier */
			vex_pp:2,	/* pp */
			_sparebits:2;

	uint8_t		_sparebytes[2];

	int		base_register;		/* VM_REG_GUEST_xyz */
	int		index_register;		/* VM_REG_GUEST_xyz */
	int		segment_register;	/* VM_REG_GUEST_xyz */

	int64_t		displacement;		/* optional addr displacement */
	int64_t		immediate;		/* optional immediate operand */

	struct vie_op	op;			/* opcode description */

	enum vie_status	status;			/* VIES_* bit flags */

	struct vm_guest_paging paging;		/* guest paging state */

	uint64_t	mmio_gpa;		/* faulting GPA */
	struct vie_mmio	mmio_req_read;
	struct vie_mmio	mmio_req_write;

	struct vm_inout	inout;			/* active in/out op */
	enum vie_req	inout_req_state;
	uint32_t	inout_req_val;		/* value from userspace */
};
161 
162 
/*
 * struct vie_op.op_type
 *
 * VIE_OP_TYPE_NONE is 0, so opcode table slots that are not explicitly
 * initialized carry this type.
 */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_MOV_CR,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_CLTS,
	VIE_OP_TYPE_LAST
};
188 
/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)  /* opcode has no ModRM byte */
/*
 * NOTE(review): presumably tells the decoder not to verify the computed
 * guest linear address; the consumer of this flag is not visible here.
 */
#define	VIE_OP_F_NO_GLA_VERIFICATION	(1 << 4)
#define	VIE_OP_F_REG_REG	(1 << 5)  /* special-case for mov-cr */
196 
/*
 * Descriptors for the supported three-byte opcodes.  Going by the table name
 * these are the opcodes behind the 0F 38 escape; entries not listed are
 * zero-filled (VIE_OP_TYPE_NONE).
 */
static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};
203 
/*
 * Descriptors for the supported two-byte opcodes, indexed by the opcode byte
 * that follows the 0x0F escape (see the VIE_OP_TYPE_TWO_BYTE entry in
 * one_byte_opcodes[]).  Entries not listed are zero-filled
 * (VIE_OP_TYPE_NONE).
 */
static const struct vie_op two_byte_opcodes[256] = {
	[0x06] = {
		.op_byte = 0x06,
		.op_type = VIE_OP_TYPE_CLTS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0x20] = {
		.op_byte = 0x20,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0x22] = {
		.op_byte = 0x22,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};
242 
/*
 * Descriptors for the supported one-byte opcodes, indexed by opcode byte.
 * Entries not listed are zero-filled (VIE_OP_TYPE_NONE).  The initializers
 * are designated, so their textual order does not matter.
 */
static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		/* Escape byte: the real opcode is in two_byte_opcodes[] */
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF6] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF6,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};
370 
/* struct vie.mod: values of the 2-bit ModRM 'mod' field */
#define	VIE_MOD_INDIRECT		0
#define	VIE_MOD_INDIRECT_DISP8		1
#define	VIE_MOD_INDIRECT_DISP32		2
#define	VIE_MOD_DIRECT			3

/* struct vie.rm: special values of the ModRM 'r/m' field */
#define	VIE_RM_SIB			4
#define	VIE_RM_DISP32			5

/* One gibibyte, as an int-typed constant expression */
#define	GB				(1024 * 1024 * 1024)
382 
383 
/*
 * Page table entry bits, previously pulled in from machine/pmap.h
 */
#define	PG_V	(1 << 0) /* Present */
#define	PG_RW	(1 << 1) /* Read/Write */
#define	PG_U	(1 << 2) /* User/Supervisor */
#define	PG_A	(1 << 5) /* Accessed */
#define	PG_M	(1 << 6) /* Dirty */
#define	PG_PS	(1 << 7) /* Largepage */

/*
 * Page-fault exception error code bits, previously pulled in from
 * machine/pmap.h
 */
#define	PGEX_P		(1 << 0) /* Non-present/Protection */
#define	PGEX_W		(1 << 1) /* Read/Write */
#define	PGEX_U		(1 << 2) /* User/Supervisor */
#define	PGEX_RSV	(1 << 3) /* (Non-)Reserved */
#define	PGEX_I		(1 << 4) /* Instruction */
402 
403 
/*
 * Maps a 4-bit general purpose register number, as found in the ModRM
 * 'reg' and 'r/m' fields, to the corresponding VM_REG_GUEST_* identifier.
 */
static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};
422 
/*
 * Maps a 4-bit control register number to its VM_REG_GUEST_* identifier.
 * Only CR0, CR2, CR3 and CR4 are mapped; every other slot holds VM_REG_LAST.
 * vie_emulate_mov_cr() injects #UD when reading the mapped register fails,
 * which it expects for these unmapped slots.
 */
static enum vm_reg_name cr_map[16] = {
	VM_REG_GUEST_CR0,
	VM_REG_LAST,
	VM_REG_GUEST_CR2,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST
};
441 
/*
 * Maps an operand size in bytes (1, 2, 4 or 8) to a mask covering that many
 * low-order bytes.  The array has 9 elements; the slots for other sizes
 * (0, 3, 5, 6, 7) are implicitly zero, and indices above 8 are out of bounds.
 */
static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};
448 
449 
/*
 * Forward declarations of file-local helpers that are used by the emulation
 * routines below before their definitions appear.
 */
static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t *rval, int bytes);
static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t wval, int bytes);
static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla);
static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf,
    uint64_t gla);
static uint64_t vie_size2mask(int size);
461 
462 struct vie *
463 vie_alloc()
464 {
465 	return (kmem_zalloc(sizeof (struct vie), KM_SLEEP));
466 }
467 
468 void
469 vie_free(struct vie *vie)
470 {
471 	kmem_free(vie, sizeof (struct vie));
472 }
473 
474 enum vm_reg_name
475 vie_regnum_map(uint8_t regnum)
476 {
477 	VERIFY3U(regnum, <, 16);
478 	return (gpr_map[regnum]);
479 }
480 
481 static void
482 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
483 {
484 	*lhbr = 0;
485 	*reg = gpr_map[vie->reg];
486 
487 	/*
488 	 * 64-bit mode imposes limitations on accessing legacy high byte
489 	 * registers (lhbr).
490 	 *
491 	 * The legacy high-byte registers cannot be addressed if the REX
492 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
493 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
494 	 *
495 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
496 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
497 	 * %ah, %ch, %dh and %bh respectively.
498 	 */
499 	if (!vie->rex_present) {
500 		if (vie->reg & 0x4) {
501 			*lhbr = 1;
502 			*reg = gpr_map[vie->reg & 0x3];
503 		}
504 	}
505 }
506 
507 static int
508 vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
509 {
510 	uint64_t val;
511 	int error, lhbr;
512 	enum vm_reg_name reg;
513 
514 	vie_calc_bytereg(vie, &reg, &lhbr);
515 	error = vm_get_register(vm, vcpuid, reg, &val);
516 
517 	/*
518 	 * To obtain the value of a legacy high byte register shift the
519 	 * base register right by 8 bits (%ah = %rax >> 8).
520 	 */
521 	if (lhbr)
522 		*rval = val >> 8;
523 	else
524 		*rval = val;
525 	return (error);
526 }
527 
528 static int
529 vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
530 {
531 	uint64_t origval, val, mask;
532 	int error, lhbr;
533 	enum vm_reg_name reg;
534 
535 	vie_calc_bytereg(vie, &reg, &lhbr);
536 	error = vm_get_register(vm, vcpuid, reg, &origval);
537 	if (error == 0) {
538 		val = byte;
539 		mask = 0xff;
540 		if (lhbr) {
541 			/*
542 			 * Shift left by 8 to store 'byte' in a legacy high
543 			 * byte register.
544 			 */
545 			val <<= 8;
546 			mask <<= 8;
547 		}
548 		val |= origval & ~mask;
549 		error = vm_set_register(vm, vcpuid, reg, val);
550 	}
551 	return (error);
552 }
553 
554 static int
555 vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
556     uint64_t val, int size)
557 {
558 	int error;
559 	uint64_t origval;
560 
561 	switch (size) {
562 	case 1:
563 	case 2:
564 		error = vm_get_register(vm, vcpuid, reg, &origval);
565 		if (error)
566 			return (error);
567 		val &= size2mask[size];
568 		val |= origval & ~size2mask[size];
569 		break;
570 	case 4:
571 		val &= 0xffffffffUL;
572 		break;
573 	case 8:
574 		break;
575 	default:
576 		return (EINVAL);
577 	}
578 
579 	error = vm_set_register(vm, vcpuid, reg, val);
580 	return (error);
581 }
582 
583 static int
584 vie_repeat(struct vie *vie)
585 {
586 	vie->status |= VIES_REPEAT;
587 
588 	/*
589 	 * Clear out any cached operation values so the repeated instruction can
590 	 * begin without using that stale state.  Other state, such as the
591 	 * decoding results, are kept around as it will not vary between
592 	 * iterations of a rep-prefixed instruction.
593 	 */
594 	if ((vie->status & VIES_MMIO) != 0) {
595 		vie->mmio_req_read.state = VR_NONE;
596 		vie->mmio_req_write.state = VR_NONE;
597 	} else if ((vie->status & VIES_INOUT) != 0) {
598 		vie->inout_req_state = VR_NONE;
599 	} else {
600 		panic("unexpected emulation state");
601 	}
602 
603 	return (EAGAIN);
604 }
605 
/* The six arithmetic status flags in RFLAGS (the PSL_* bits listed here) */
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
607 
/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) defines getcc<sz>(), which executes a 'sub' on sz-bit operands
 * and then captures the resulting flags register with pushfq/popq.  The
 * subtraction result itself (left in 'x') is discarded.  The trailing
 * 'struct __hack' lets each instantiation end with a semicolon.
 */
/* BEGIN CSTYLED */
#define	GETCC(sz)							\
static ulong_t								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
628 
629 static ulong_t
630 getcc(int opsize, uint64_t x, uint64_t y)
631 {
632 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
633 	    ("getcc: invalid operand size %d", opsize));
634 
635 	if (opsize == 1)
636 		return (getcc8(x, y));
637 	else if (opsize == 2)
638 		return (getcc16(x, y));
639 	else if (opsize == 4)
640 		return (getcc32(x, y));
641 	else
642 		return (getcc64(x, y));
643 }
644 
/*
 * Macro creation of functions getaddflags{8,16,32,64}
 *
 * Each generated function executes an 'add' on operands of the given width
 * and then captures the resulting flags register with pushfq/popq; the sum
 * itself (left in 'x') is discarded.  The trailing 'struct __hack' lets each
 * instantiation end with a semicolon.
 */
/* BEGIN CSTYLED */
#define	GETADDFLAGS(sz)							\
static ulong_t								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);
665 
666 static ulong_t
667 getaddflags(int opsize, uint64_t x, uint64_t y)
668 {
669 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
670 	    ("getaddflags: invalid operand size %d", opsize));
671 
672 	if (opsize == 1)
673 		return (getaddflags8(x, y));
674 	else if (opsize == 2)
675 		return (getaddflags16(x, y));
676 	else if (opsize == 4)
677 		return (getaddflags32(x, y));
678 	else
679 		return (getaddflags64(x, y));
680 }
681 
/*
 * Return the status flags that would result from doing (x & y).
 *
 * GETANDFLAGS(sz) defines getandflags<sz>(), which executes an 'and' on
 * sz-bit operands and then captures the resulting flags register with
 * pushfq/popq; the result itself (left in 'x') is discarded.  The trailing
 * 'struct __hack' lets each instantiation end with a semicolon.
 */
/* BEGIN CSTYLED */
#define	GETANDFLAGS(sz)							\
static ulong_t								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);
702 
703 static ulong_t
704 getandflags(int opsize, uint64_t x, uint64_t y)
705 {
706 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
707 	    ("getandflags: invalid operand size %d", opsize));
708 
709 	if (opsize == 1)
710 		return (getandflags8(x, y));
711 	else if (opsize == 2)
712 		return (getandflags16(x, y));
713 	else if (opsize == 4)
714 		return (getandflags32(x, y));
715 	else
716 		return (getandflags64(x, y));
717 }
718 
719 static int
720 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid)
721 {
722 	uint64_t val;
723 	int err;
724 	enum vm_reg_name gpr = gpr_map[vie->rm];
725 	enum vm_reg_name cr = cr_map[vie->reg];
726 
727 	uint_t size = 4;
728 	if (vie->paging.cpu_mode == CPU_MODE_64BIT) {
729 		size = 8;
730 	}
731 
732 	switch (vie->op.op_byte) {
733 	case 0x20:
734 		/*
735 		 * MOV control register (ModRM:reg) to reg (ModRM:r/m)
736 		 * 20/r:	mov r32, CR0-CR7
737 		 * 20/r:	mov r64, CR0-CR7
738 		 * REX.R + 20/0:	mov r64, CR8
739 		 */
740 		if (vie->paging.cpl != 0) {
741 			vm_inject_gp(vm, vcpuid);
742 			vie->num_processed = 0;
743 			return (0);
744 		}
745 		err = vm_get_register(vm, vcpuid, cr, &val);
746 		if (err != 0) {
747 			/* #UD for access to non-existent CRs */
748 			vm_inject_ud(vm, vcpuid);
749 			vie->num_processed = 0;
750 			return (0);
751 		}
752 		err = vie_update_register(vm, vcpuid, gpr, val, size);
753 		break;
754 	case 0x22: {
755 		/*
756 		 * MOV reg (ModRM:r/m) to control register (ModRM:reg)
757 		 * 22/r:	mov CR0-CR7, r32
758 		 * 22/r:	mov CR0-CR7, r64
759 		 * REX.R + 22/0:	mov CR8, r64
760 		 */
761 		uint64_t old, diff;
762 
763 		if (vie->paging.cpl != 0) {
764 			vm_inject_gp(vm, vcpuid);
765 			vie->num_processed = 0;
766 			return (0);
767 		}
768 		err = vm_get_register(vm, vcpuid, cr, &old);
769 		if (err != 0) {
770 			/* #UD for access to non-existent CRs */
771 			vm_inject_ud(vm, vcpuid);
772 			vie->num_processed = 0;
773 			return (0);
774 		}
775 		err = vm_get_register(vm, vcpuid, gpr, &val);
776 		VERIFY0(err);
777 		val &= size2mask[size];
778 		diff = old ^ val;
779 
780 		switch (cr) {
781 		case VM_REG_GUEST_CR0:
782 			if ((diff & CR0_PG) != 0) {
783 				uint64_t efer;
784 
785 				err = vm_get_register(vm, vcpuid,
786 				    VM_REG_GUEST_EFER, &efer);
787 				VERIFY0(err);
788 
789 				/* Keep the long-mode state in EFER in sync */
790 				if ((val & CR0_PG) != 0 &&
791 				    (efer & EFER_LME) != 0) {
792 					efer |= EFER_LMA;
793 				}
794 				if ((val & CR0_PG) == 0 &&
795 				    (efer & EFER_LME) != 0) {
796 					efer &= ~EFER_LMA;
797 				}
798 
799 				err = vm_set_register(vm, vcpuid,
800 				    VM_REG_GUEST_EFER, efer);
801 				VERIFY0(err);
802 			}
803 			/* TODO: enforce more of the #GP checks */
804 			err = vm_set_register(vm, vcpuid, cr, val);
805 			VERIFY0(err);
806 			break;
807 		case VM_REG_GUEST_CR2:
808 		case VM_REG_GUEST_CR3:
809 		case VM_REG_GUEST_CR4:
810 			/* TODO: enforce more of the #GP checks */
811 			err = vm_set_register(vm, vcpuid, cr, val);
812 			break;
813 		default:
814 			/* The cr_map mapping should prevent this */
815 			panic("invalid cr %d", cr);
816 		}
817 		break;
818 	}
819 	default:
820 		return (EINVAL);
821 	}
822 	return (err);
823 }
824 
825 static int
826 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
827 {
828 	int error, size;
829 	enum vm_reg_name reg;
830 	uint8_t byte;
831 	uint64_t val;
832 
833 	size = vie->opsize;
834 	error = EINVAL;
835 
836 	switch (vie->op.op_byte) {
837 	case 0x88:
838 		/*
839 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
840 		 * 88/r:	mov r/m8, r8
841 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
842 		 */
843 		size = 1;	/* override for byte operation */
844 		error = vie_read_bytereg(vie, vm, vcpuid, &byte);
845 		if (error == 0) {
846 			error = vie_mmio_write(vie, vm, vcpuid, gpa, byte,
847 			    size);
848 		}
849 		break;
850 	case 0x89:
851 		/*
852 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
853 		 * 89/r:	mov r/m16, r16
854 		 * 89/r:	mov r/m32, r32
855 		 * REX.W + 89/r	mov r/m64, r64
856 		 */
857 		reg = gpr_map[vie->reg];
858 		error = vm_get_register(vm, vcpuid, reg, &val);
859 		if (error == 0) {
860 			val &= size2mask[size];
861 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
862 		}
863 		break;
864 	case 0x8A:
865 		/*
866 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
867 		 * 8A/r:	mov r8, r/m8
868 		 * REX + 8A/r:	mov r8, r/m8
869 		 */
870 		size = 1;	/* override for byte operation */
871 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
872 		if (error == 0)
873 			error = vie_write_bytereg(vie, vm, vcpuid, val);
874 		break;
875 	case 0x8B:
876 		/*
877 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
878 		 * 8B/r:	mov r16, r/m16
879 		 * 8B/r:	mov r32, r/m32
880 		 * REX.W 8B/r:	mov r64, r/m64
881 		 */
882 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
883 		if (error == 0) {
884 			reg = gpr_map[vie->reg];
885 			error = vie_update_register(vm, vcpuid, reg, val, size);
886 		}
887 		break;
888 	case 0xA1:
889 		/*
890 		 * MOV from seg:moffset to AX/EAX/RAX
891 		 * A1:		mov AX, moffs16
892 		 * A1:		mov EAX, moffs32
893 		 * REX.W + A1:	mov RAX, moffs64
894 		 */
895 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
896 		if (error == 0) {
897 			reg = VM_REG_GUEST_RAX;
898 			error = vie_update_register(vm, vcpuid, reg, val, size);
899 		}
900 		break;
901 	case 0xA3:
902 		/*
903 		 * MOV from AX/EAX/RAX to seg:moffset
904 		 * A3:		mov moffs16, AX
905 		 * A3:		mov moffs32, EAX
906 		 * REX.W + A3:	mov moffs64, RAX
907 		 */
908 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
909 		if (error == 0) {
910 			val &= size2mask[size];
911 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
912 		}
913 		break;
914 	case 0xC6:
915 		/*
916 		 * MOV from imm8 to mem (ModRM:r/m)
917 		 * C6/0		mov r/m8, imm8
918 		 * REX + C6/0	mov r/m8, imm8
919 		 */
920 		size = 1;	/* override for byte operation */
921 		val = vie->immediate;
922 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
923 		break;
924 	case 0xC7:
925 		/*
926 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
927 		 * C7/0		mov r/m16, imm16
928 		 * C7/0		mov r/m32, imm32
929 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
930 		 */
931 		val = vie->immediate & size2mask[size];
932 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
933 		break;
934 	default:
935 		break;
936 	}
937 
938 	return (error);
939 }
940 
941 static int
942 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
943 {
944 	int error, size;
945 	enum vm_reg_name reg;
946 	uint64_t val;
947 
948 	size = vie->opsize;
949 	error = EINVAL;
950 
951 	switch (vie->op.op_byte) {
952 	case 0xB6:
953 		/*
954 		 * MOV and zero extend byte from mem (ModRM:r/m) to
955 		 * reg (ModRM:reg).
956 		 *
957 		 * 0F B6/r		movzx r16, r/m8
958 		 * 0F B6/r		movzx r32, r/m8
959 		 * REX.W + 0F B6/r	movzx r64, r/m8
960 		 */
961 
962 		/* get the first operand */
963 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
964 		if (error)
965 			break;
966 
967 		/* get the second operand */
968 		reg = gpr_map[vie->reg];
969 
970 		/* zero-extend byte */
971 		val = (uint8_t)val;
972 
973 		/* write the result */
974 		error = vie_update_register(vm, vcpuid, reg, val, size);
975 		break;
976 	case 0xB7:
977 		/*
978 		 * MOV and zero extend word from mem (ModRM:r/m) to
979 		 * reg (ModRM:reg).
980 		 *
981 		 * 0F B7/r		movzx r32, r/m16
982 		 * REX.W + 0F B7/r	movzx r64, r/m16
983 		 */
984 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2);
985 		if (error)
986 			return (error);
987 
988 		reg = gpr_map[vie->reg];
989 
990 		/* zero-extend word */
991 		val = (uint16_t)val;
992 
993 		error = vie_update_register(vm, vcpuid, reg, val, size);
994 		break;
995 	case 0xBE:
996 		/*
997 		 * MOV and sign extend byte from mem (ModRM:r/m) to
998 		 * reg (ModRM:reg).
999 		 *
1000 		 * 0F BE/r		movsx r16, r/m8
1001 		 * 0F BE/r		movsx r32, r/m8
1002 		 * REX.W + 0F BE/r	movsx r64, r/m8
1003 		 */
1004 
1005 		/* get the first operand */
1006 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1007 		if (error)
1008 			break;
1009 
1010 		/* get the second operand */
1011 		reg = gpr_map[vie->reg];
1012 
1013 		/* sign extend byte */
1014 		val = (int8_t)val;
1015 
1016 		/* write the result */
1017 		error = vie_update_register(vm, vcpuid, reg, val, size);
1018 		break;
1019 	default:
1020 		break;
1021 	}
1022 	return (error);
1023 }
1024 
1025 /*
1026  * Helper function to calculate and validate a linear address.
1027  */
1028 static int
1029 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize,
1030     int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr,
1031     uint64_t *gla)
1032 {
1033 	struct seg_desc desc;
1034 	uint64_t cr0, val, rflags;
1035 	int error;
1036 	struct vm_guest_paging *paging;
1037 
1038 	paging = &vie->paging;
1039 
1040 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1041 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1042 
1043 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1044 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1045 
1046 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
1047 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
1048 	    __func__, error, seg));
1049 
1050 	error = vm_get_register(vm, vcpuid, gpr, &val);
1051 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
1052 	    error, gpr));
1053 
1054 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
1055 	    addrsize, prot, gla)) {
1056 		if (seg == VM_REG_GUEST_SS)
1057 			vm_inject_ss(vm, vcpuid, 0);
1058 		else
1059 			vm_inject_gp(vm, vcpuid);
1060 		return (-1);
1061 	}
1062 
1063 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
1064 		if (seg == VM_REG_GUEST_SS)
1065 			vm_inject_ss(vm, vcpuid, 0);
1066 		else
1067 			vm_inject_gp(vm, vcpuid);
1068 		return (-1);
1069 	}
1070 
1071 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
1072 		vm_inject_ac(vm, vcpuid, 0);
1073 		return (-1);
1074 	}
1075 
1076 	return (0);
1077 }
1078 
1079 static int
1080 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1081 {
1082 	struct vm_copyinfo copyinfo[2];
1083 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
1084 	uint64_t rcx, rdi, rsi, rflags;
1085 	int error, fault, opsize, seg, repeat;
1086 	struct vm_guest_paging *paging;
1087 
1088 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
1089 	val = 0;
1090 	error = 0;
1091 	paging = &vie->paging;
1092 
1093 	/*
1094 	 * XXX although the MOVS instruction is only supposed to be used with
1095 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
1096 	 *
1097 	 * Empirically the "repnz" prefix has identical behavior to "rep"
1098 	 * and the zero flag does not make a difference.
1099 	 */
1100 	repeat = vie->repz_present | vie->repnz_present;
1101 
1102 	if (repeat) {
1103 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1104 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1105 
1106 		/*
1107 		 * The count register is %rcx, %ecx or %cx depending on the
1108 		 * address size of the instruction.
1109 		 */
1110 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
1111 			error = 0;
1112 			goto done;
1113 		}
1114 	}
1115 
1116 	/*
1117 	 *	Source		Destination	Comments
1118 	 *	--------------------------------------------
1119 	 * (1)  memory		memory		n/a
1120 	 * (2)  memory		mmio		emulated
1121 	 * (3)  mmio		memory		emulated
1122 	 * (4)  mmio		mmio		emulated
1123 	 *
1124 	 * At this point we don't have sufficient information to distinguish
1125 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
1126 	 * out because it will succeed only when operating on regular memory.
1127 	 *
1128 	 * XXX the emulation doesn't properly handle the case where 'gpa'
1129 	 * is straddling the boundary between the normal memory and MMIO.
1130 	 */
1131 
1132 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
1133 	if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg,
1134 	    VM_REG_GUEST_RSI, &srcaddr) != 0) {
1135 		goto done;
1136 	}
1137 
1138 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
1139 	    copyinfo, nitems(copyinfo), &fault);
1140 	if (error == 0) {
1141 		if (fault)
1142 			goto done;	/* Resume guest to handle fault */
1143 
1144 		/*
1145 		 * case (2): read from system memory and write to mmio.
1146 		 */
1147 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
1148 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1149 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1150 		if (error)
1151 			goto done;
1152 	} else {
1153 		/*
1154 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
1155 		 * if 'srcaddr' is in the mmio space.
1156 		 */
1157 
1158 		if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize,
1159 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI,
1160 		    &dstaddr) != 0) {
1161 			goto done;
1162 		}
1163 
1164 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
1165 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
1166 		if (error == 0) {
1167 			if (fault)
1168 				goto done;    /* Resume guest to handle fault */
1169 
1170 			/*
1171 			 * case (3): read from MMIO and write to system memory.
1172 			 *
1173 			 * A MMIO read can have side-effects so we
1174 			 * commit to it only after vm_copy_setup() is
1175 			 * successful. If a page-fault needs to be
1176 			 * injected into the guest then it will happen
1177 			 * before the MMIO read is attempted.
1178 			 */
1179 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &val,
1180 			    opsize);
1181 
1182 			if (error == 0) {
1183 				vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
1184 			}
1185 			/*
1186 			 * Regardless of whether the MMIO read was successful or
1187 			 * not, the copy resources must be cleaned up.
1188 			 */
1189 			vm_copy_teardown(vm, vcpuid, copyinfo,
1190 			    nitems(copyinfo));
1191 			if (error != 0) {
1192 				goto done;
1193 			}
1194 		} else {
1195 			/*
1196 			 * Case (4): read from and write to mmio.
1197 			 *
1198 			 * Commit to the MMIO read/write (with potential
1199 			 * side-effects) only after we are sure that the
1200 			 * instruction is not going to be restarted due
1201 			 * to address translation faults.
1202 			 */
1203 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
1204 			    PROT_READ, &srcgpa, &fault);
1205 			if (error || fault)
1206 				goto done;
1207 
1208 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
1209 			    PROT_WRITE, &dstgpa, &fault);
1210 			if (error || fault)
1211 				goto done;
1212 
1213 			error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val,
1214 			    opsize);
1215 			if (error)
1216 				goto done;
1217 
1218 			error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val,
1219 			    opsize);
1220 			if (error)
1221 				goto done;
1222 		}
1223 	}
1224 
1225 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
1226 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
1227 
1228 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1229 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1230 
1231 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1232 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1233 
1234 	if (rflags & PSL_D) {
1235 		rsi -= opsize;
1236 		rdi -= opsize;
1237 	} else {
1238 		rsi += opsize;
1239 		rdi += opsize;
1240 	}
1241 
1242 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
1243 	    vie->addrsize);
1244 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
1245 
1246 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1247 	    vie->addrsize);
1248 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1249 
1250 	if (repeat) {
1251 		rcx = rcx - 1;
1252 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1253 		    rcx, vie->addrsize);
1254 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1255 
1256 		/*
1257 		 * Repeat the instruction if the count register is not zero.
1258 		 */
1259 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1260 			return (vie_repeat(vie));
1261 	}
1262 done:
1263 	return (error);
1264 }
1265 
1266 static int
1267 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1268 {
1269 	int error, opsize, repeat;
1270 	uint64_t val;
1271 	uint64_t rcx, rdi, rflags;
1272 
1273 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
1274 	repeat = vie->repz_present | vie->repnz_present;
1275 
1276 	if (repeat) {
1277 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1278 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1279 
1280 		/*
1281 		 * The count register is %rcx, %ecx or %cx depending on the
1282 		 * address size of the instruction.
1283 		 */
1284 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
1285 			return (0);
1286 	}
1287 
1288 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
1289 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
1290 
1291 	error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1292 	if (error)
1293 		return (error);
1294 
1295 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1296 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1297 
1298 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1299 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1300 
1301 	if (rflags & PSL_D)
1302 		rdi -= opsize;
1303 	else
1304 		rdi += opsize;
1305 
1306 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1307 	    vie->addrsize);
1308 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1309 
1310 	if (repeat) {
1311 		rcx = rcx - 1;
1312 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1313 		    rcx, vie->addrsize);
1314 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1315 
1316 		/*
1317 		 * Repeat the instruction if the count register is not zero.
1318 		 */
1319 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1320 			return (vie_repeat(vie));
1321 	}
1322 
1323 	return (0);
1324 }
1325 
1326 static int
1327 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1328 {
1329 	int error, size;
1330 	enum vm_reg_name reg;
1331 	uint64_t result, rflags, rflags2, val1, val2;
1332 
1333 	size = vie->opsize;
1334 	error = EINVAL;
1335 
1336 	switch (vie->op.op_byte) {
1337 	case 0x23:
1338 		/*
1339 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1340 		 * result in reg.
1341 		 *
1342 		 * 23/r		and r16, r/m16
1343 		 * 23/r		and r32, r/m32
1344 		 * REX.W + 23/r	and r64, r/m64
1345 		 */
1346 
1347 		/* get the first operand */
1348 		reg = gpr_map[vie->reg];
1349 		error = vm_get_register(vm, vcpuid, reg, &val1);
1350 		if (error)
1351 			break;
1352 
1353 		/* get the second operand */
1354 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1355 		if (error)
1356 			break;
1357 
1358 		/* perform the operation and write the result */
1359 		result = val1 & val2;
1360 		error = vie_update_register(vm, vcpuid, reg, result, size);
1361 		break;
1362 	case 0x81:
1363 	case 0x83:
1364 		/*
1365 		 * AND mem (ModRM:r/m) with immediate and store the
1366 		 * result in mem.
1367 		 *
1368 		 * 81 /4		and r/m16, imm16
1369 		 * 81 /4		and r/m32, imm32
1370 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1371 		 *
1372 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1373 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1374 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1375 		 */
1376 
1377 		/* get the first operand */
1378 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1379 		if (error)
1380 			break;
1381 
1382 		/*
1383 		 * perform the operation with the pre-fetched immediate
1384 		 * operand and write the result
1385 		 */
1386 		result = val1 & vie->immediate;
1387 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1388 		break;
1389 	default:
1390 		break;
1391 	}
1392 	if (error)
1393 		return (error);
1394 
1395 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1396 	if (error)
1397 		return (error);
1398 
1399 	/*
1400 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1401 	 * to the result; AF is undefined.
1402 	 *
1403 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1404 	 */
1405 	rflags2 = getcc(size, result, 0);
1406 	rflags &= ~RFLAGS_STATUS_BITS;
1407 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1408 
1409 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1410 	return (error);
1411 }
1412 
1413 static int
1414 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1415 {
1416 	int error, size;
1417 	enum vm_reg_name reg;
1418 	uint64_t result, rflags, rflags2, val1, val2;
1419 
1420 	size = vie->opsize;
1421 	error = EINVAL;
1422 
1423 	switch (vie->op.op_byte) {
1424 	case 0x0B:
1425 		/*
1426 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1427 		 * result in reg.
1428 		 *
1429 		 * 0b/r		or r16, r/m16
1430 		 * 0b/r		or r32, r/m32
1431 		 * REX.W + 0b/r	or r64, r/m64
1432 		 */
1433 
1434 		/* get the first operand */
1435 		reg = gpr_map[vie->reg];
1436 		error = vm_get_register(vm, vcpuid, reg, &val1);
1437 		if (error)
1438 			break;
1439 
1440 		/* get the second operand */
1441 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1442 		if (error)
1443 			break;
1444 
1445 		/* perform the operation and write the result */
1446 		result = val1 | val2;
1447 		error = vie_update_register(vm, vcpuid, reg, result, size);
1448 		break;
1449 	case 0x81:
1450 	case 0x83:
1451 		/*
1452 		 * OR mem (ModRM:r/m) with immediate and store the
1453 		 * result in mem.
1454 		 *
1455 		 * 81 /1		or r/m16, imm16
1456 		 * 81 /1		or r/m32, imm32
1457 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1458 		 *
1459 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1460 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1461 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1462 		 */
1463 
1464 		/* get the first operand */
1465 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1466 		if (error)
1467 			break;
1468 
1469 		/*
1470 		 * perform the operation with the pre-fetched immediate
1471 		 * operand and write the result
1472 		 */
1473 		result = val1 | vie->immediate;
1474 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1475 		break;
1476 	default:
1477 		break;
1478 	}
1479 	if (error)
1480 		return (error);
1481 
1482 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1483 	if (error)
1484 		return (error);
1485 
1486 	/*
1487 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1488 	 * to the result; AF is undefined.
1489 	 *
1490 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1491 	 */
1492 	rflags2 = getcc(size, result, 0);
1493 	rflags &= ~RFLAGS_STATUS_BITS;
1494 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1495 
1496 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1497 	return (error);
1498 }
1499 
1500 static int
1501 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1502 {
1503 	int error, size;
1504 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1505 	enum vm_reg_name reg;
1506 
1507 	size = vie->opsize;
1508 	switch (vie->op.op_byte) {
1509 	case 0x39:
1510 	case 0x3B:
1511 		/*
1512 		 * 39/r		CMP r/m16, r16
1513 		 * 39/r		CMP r/m32, r32
1514 		 * REX.W 39/r	CMP r/m64, r64
1515 		 *
1516 		 * 3B/r		CMP r16, r/m16
1517 		 * 3B/r		CMP r32, r/m32
1518 		 * REX.W + 3B/r	CMP r64, r/m64
1519 		 *
1520 		 * Compare the first operand with the second operand and
1521 		 * set status flags in EFLAGS register. The comparison is
1522 		 * performed by subtracting the second operand from the first
1523 		 * operand and then setting the status flags.
1524 		 */
1525 
1526 		/* Get the register operand */
1527 		reg = gpr_map[vie->reg];
1528 		error = vm_get_register(vm, vcpuid, reg, &regop);
1529 		if (error)
1530 			return (error);
1531 
1532 		/* Get the memory operand */
1533 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
1534 		if (error)
1535 			return (error);
1536 
1537 		if (vie->op.op_byte == 0x3B) {
1538 			op1 = regop;
1539 			op2 = memop;
1540 		} else {
1541 			op1 = memop;
1542 			op2 = regop;
1543 		}
1544 		rflags2 = getcc(size, op1, op2);
1545 		break;
1546 	case 0x80:
1547 	case 0x81:
1548 	case 0x83:
1549 		/*
1550 		 * 80 /7		cmp r/m8, imm8
1551 		 * REX + 80 /7		cmp r/m8, imm8
1552 		 *
1553 		 * 81 /7		cmp r/m16, imm16
1554 		 * 81 /7		cmp r/m32, imm32
1555 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1556 		 *
1557 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1558 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1559 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1560 		 *
1561 		 * Compare mem (ModRM:r/m) with immediate and set
1562 		 * status flags according to the results.  The
1563 		 * comparison is performed by subtracting the
1564 		 * immediate from the first operand and then setting
1565 		 * the status flags.
1566 		 *
1567 		 */
1568 		if (vie->op.op_byte == 0x80)
1569 			size = 1;
1570 
1571 		/* get the first operand */
1572 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1573 		if (error)
1574 			return (error);
1575 
1576 		rflags2 = getcc(size, op1, vie->immediate);
1577 		break;
1578 	default:
1579 		return (EINVAL);
1580 	}
1581 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1582 	if (error)
1583 		return (error);
1584 	rflags &= ~RFLAGS_STATUS_BITS;
1585 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1586 
1587 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1588 	return (error);
1589 }
1590 
1591 static int
1592 vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1593 {
1594 	int error, size;
1595 	uint64_t op1, rflags, rflags2;
1596 
1597 	size = vie->opsize;
1598 	error = EINVAL;
1599 
1600 	switch (vie->op.op_byte) {
1601 	case 0xF6:
1602 		/*
1603 		 * F6 /0		test r/m8, imm8
1604 		 *
1605 		 * Test mem (ModRM:r/m) with immediate and set status
1606 		 * flags according to the results.  The comparison is
1607 		 * performed by anding the immediate from the first
1608 		 * operand and then setting the status flags.
1609 		 */
1610 		if ((vie->reg & 7) != 0)
1611 			return (EINVAL);
1612 
1613 		size = 1;	/* override for byte operation */
1614 
1615 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1616 		if (error)
1617 			return (error);
1618 
1619 		rflags2 = getandflags(size, op1, vie->immediate);
1620 		break;
1621 	case 0xF7:
1622 		/*
1623 		 * F7 /0		test r/m16, imm16
1624 		 * F7 /0		test r/m32, imm32
1625 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1626 		 *
1627 		 * Test mem (ModRM:r/m) with immediate and set status
1628 		 * flags according to the results.  The comparison is
1629 		 * performed by anding the immediate from the first
1630 		 * operand and then setting the status flags.
1631 		 */
1632 		if ((vie->reg & 7) != 0)
1633 			return (EINVAL);
1634 
1635 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1636 		if (error)
1637 			return (error);
1638 
1639 		rflags2 = getandflags(size, op1, vie->immediate);
1640 		break;
1641 	default:
1642 		return (EINVAL);
1643 	}
1644 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1645 	if (error)
1646 		return (error);
1647 
1648 	/*
1649 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1650 	 * to the result; AF is undefined.
1651 	 */
1652 	rflags &= ~RFLAGS_STATUS_BITS;
1653 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1654 
1655 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1656 	return (error);
1657 }
1658 
1659 static int
1660 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1661 {
1662 	uint64_t src1, src2, dst, rflags;
1663 	unsigned start, len;
1664 	int error, size;
1665 	struct vm_guest_paging *paging;
1666 
1667 	size = vie->opsize;
1668 	error = EINVAL;
1669 	paging = &vie->paging;
1670 
1671 	/*
1672 	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
1673 	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
1674 	 *
1675 	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
1676 	 * Vex.vvvv.
1677 	 *
1678 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1679 	 */
1680 	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1681 		size = 4;
1682 
1683 	/*
1684 	 * Extracts contiguous bits from the first /source/ operand (second
1685 	 * operand) using an index and length specified in the second /source/
1686 	 * operand (third operand).
1687 	 */
1688 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size);
1689 	if (error)
1690 		return (error);
1691 	error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
1692 	if (error)
1693 		return (error);
1694 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1695 	if (error)
1696 		return (error);
1697 
1698 	start = (src2 & 0xff);
1699 	len = (src2 & 0xff00) >> 8;
1700 
1701 	/* If no bits are extracted, the destination register is cleared. */
1702 	dst = 0;
1703 
1704 	/* If START exceeds the operand size, no bits are extracted. */
1705 	if (start > size * 8)
1706 		goto done;
1707 	/* Length is bounded by both the destination size and start offset. */
1708 	if (start + len > size * 8)
1709 		len = (size * 8) - start;
1710 	if (len == 0)
1711 		goto done;
1712 
1713 	if (start > 0)
1714 		src1 = (src1 >> start);
1715 	if (len < 64)
1716 		src1 = src1 & ((1ull << len) - 1);
1717 	dst = src1;
1718 
1719 done:
1720 	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
1721 	if (error)
1722 		return (error);
1723 
1724 	/*
1725 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1726 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1727 	 */
1728 	rflags &= ~RFLAGS_STATUS_BITS;
1729 	if (dst == 0)
1730 		rflags |= PSL_Z;
1731 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
1732 	    8);
1733 	return (error);
1734 }
1735 
1736 static int
1737 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1738 {
1739 	int error, size;
1740 	uint64_t nval, rflags, rflags2, val1, val2;
1741 	enum vm_reg_name reg;
1742 
1743 	size = vie->opsize;
1744 	error = EINVAL;
1745 
1746 	switch (vie->op.op_byte) {
1747 	case 0x03:
1748 		/*
1749 		 * ADD r/m to r and store the result in r
1750 		 *
1751 		 * 03/r			ADD r16, r/m16
1752 		 * 03/r			ADD r32, r/m32
1753 		 * REX.W + 03/r		ADD r64, r/m64
1754 		 */
1755 
1756 		/* get the first operand */
1757 		reg = gpr_map[vie->reg];
1758 		error = vm_get_register(vm, vcpuid, reg, &val1);
1759 		if (error)
1760 			break;
1761 
1762 		/* get the second operand */
1763 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1764 		if (error)
1765 			break;
1766 
1767 		/* perform the operation and write the result */
1768 		nval = val1 + val2;
1769 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1770 		break;
1771 	default:
1772 		break;
1773 	}
1774 
1775 	if (!error) {
1776 		rflags2 = getaddflags(size, val1, val2);
1777 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1778 		    &rflags);
1779 		if (error)
1780 			return (error);
1781 
1782 		rflags &= ~RFLAGS_STATUS_BITS;
1783 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1784 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1785 		    rflags, 8);
1786 	}
1787 
1788 	return (error);
1789 }
1790 
1791 static int
1792 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1793 {
1794 	int error, size;
1795 	uint64_t nval, rflags, rflags2, val1, val2;
1796 	enum vm_reg_name reg;
1797 
1798 	size = vie->opsize;
1799 	error = EINVAL;
1800 
1801 	switch (vie->op.op_byte) {
1802 	case 0x2B:
1803 		/*
1804 		 * SUB r/m from r and store the result in r
1805 		 *
1806 		 * 2B/r		SUB r16, r/m16
1807 		 * 2B/r		SUB r32, r/m32
1808 		 * REX.W + 2B/r	SUB r64, r/m64
1809 		 */
1810 
1811 		/* get the first operand */
1812 		reg = gpr_map[vie->reg];
1813 		error = vm_get_register(vm, vcpuid, reg, &val1);
1814 		if (error)
1815 			break;
1816 
1817 		/* get the second operand */
1818 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1819 		if (error)
1820 			break;
1821 
1822 		/* perform the operation and write the result */
1823 		nval = val1 - val2;
1824 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1825 		break;
1826 	default:
1827 		break;
1828 	}
1829 
1830 	if (!error) {
1831 		rflags2 = getcc(size, val1, val2);
1832 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1833 		    &rflags);
1834 		if (error)
1835 			return (error);
1836 
1837 		rflags &= ~RFLAGS_STATUS_BITS;
1838 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1839 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1840 		    rflags, 8);
1841 	}
1842 
1843 	return (error);
1844 }
1845 
1846 static int
1847 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1848 {
1849 	struct vm_copyinfo copyinfo[2];
1850 	struct seg_desc ss_desc;
1851 	uint64_t cr0, rflags, rsp, stack_gla, val;
1852 	int error, fault, size, stackaddrsize, pushop;
1853 	struct vm_guest_paging *paging;
1854 
1855 	val = 0;
1856 	size = vie->opsize;
1857 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1858 	paging = &vie->paging;
1859 
1860 	/*
1861 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1862 	 */
1863 	if (paging->cpu_mode == CPU_MODE_REAL) {
1864 		stackaddrsize = 2;
1865 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1866 		/*
1867 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1868 		 * - Stack pointer size is always 64-bits.
1869 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1870 		 * - 16-bit PUSH/POP is supported by using the operand size
1871 		 *   override prefix (66H).
1872 		 */
1873 		stackaddrsize = 8;
1874 		size = vie->opsize_override ? 2 : 8;
1875 	} else {
1876 		/*
1877 		 * In protected or compatibility mode the 'B' flag in the
1878 		 * stack-segment descriptor determines the size of the
1879 		 * stack pointer.
1880 		 */
1881 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1882 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1883 		    __func__, error));
1884 		if (SEG_DESC_DEF32(ss_desc.access))
1885 			stackaddrsize = 4;
1886 		else
1887 			stackaddrsize = 2;
1888 	}
1889 
1890 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1891 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1892 
1893 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1894 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1895 
1896 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1897 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1898 	if (pushop) {
1899 		rsp -= size;
1900 	}
1901 
1902 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1903 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1904 	    &stack_gla)) {
1905 		vm_inject_ss(vm, vcpuid, 0);
1906 		return (0);
1907 	}
1908 
1909 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1910 		vm_inject_ss(vm, vcpuid, 0);
1911 		return (0);
1912 	}
1913 
1914 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1915 		vm_inject_ac(vm, vcpuid, 0);
1916 		return (0);
1917 	}
1918 
1919 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1920 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1921 	    &fault);
1922 	if (error || fault)
1923 		return (error);
1924 
1925 	if (pushop) {
1926 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
1927 		if (error == 0)
1928 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1929 	} else {
1930 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1931 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
1932 		rsp += size;
1933 	}
1934 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1935 
1936 	if (error == 0) {
1937 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1938 		    stackaddrsize);
1939 		KASSERT(error == 0, ("error %d updating rsp", error));
1940 	}
1941 	return (error);
1942 }
1943 
1944 static int
1945 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1946 {
1947 	int error;
1948 
1949 	/*
1950 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1951 	 *
1952 	 * PUSH is part of the group 5 extended opcodes and is identified
1953 	 * by ModRM:reg = b110.
1954 	 */
1955 	if ((vie->reg & 7) != 6)
1956 		return (EINVAL);
1957 
1958 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
1959 	return (error);
1960 }
1961 
1962 static int
1963 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1964 {
1965 	int error;
1966 
1967 	/*
1968 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1969 	 *
1970 	 * POP is part of the group 1A extended opcodes and is identified
1971 	 * by ModRM:reg = b000.
1972 	 */
1973 	if ((vie->reg & 7) != 0)
1974 		return (EINVAL);
1975 
1976 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
1977 	return (error);
1978 }
1979 
1980 static int
1981 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1982 {
1983 	int error;
1984 
1985 	switch (vie->reg & 7) {
1986 	case 0x1:	/* OR */
1987 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
1988 		break;
1989 	case 0x4:	/* AND */
1990 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
1991 		break;
1992 	case 0x7:	/* CMP */
1993 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
1994 		break;
1995 	default:
1996 		error = EINVAL;
1997 		break;
1998 	}
1999 
2000 	return (error);
2001 }
2002 
2003 static int
2004 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2005 {
2006 	uint64_t val, rflags;
2007 	int error, bitmask, bitoff;
2008 
2009 	/*
2010 	 * 0F BA is a Group 8 extended opcode.
2011 	 *
2012 	 * Currently we only emulate the 'Bit Test' instruction which is
2013 	 * identified by a ModR/M:reg encoding of 100b.
2014 	 */
2015 	if ((vie->reg & 7) != 4)
2016 		return (EINVAL);
2017 
2018 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2019 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2020 
2021 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize);
2022 	if (error)
2023 		return (error);
2024 
2025 	/*
2026 	 * Intel SDM, Vol 2, Table 3-2:
2027 	 * "Range of Bit Positions Specified by Bit Offset Operands"
2028 	 */
2029 	bitmask = vie->opsize * 8 - 1;
2030 	bitoff = vie->immediate & bitmask;
2031 
2032 	/* Copy the bit into the Carry flag in %rflags */
2033 	if (val & (1UL << bitoff))
2034 		rflags |= PSL_C;
2035 	else
2036 		rflags &= ~PSL_C;
2037 
2038 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
2039 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
2040 
2041 	return (0);
2042 }
2043 
2044 static int
2045 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid,
2046     uint64_t gpa)
2047 {
2048 	int error;
2049 	uint64_t buf;
2050 
2051 	switch (vie->reg & 7) {
2052 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
2053 		if (vie->mod == 0x3) {
2054 			/*
2055 			 * SFENCE.  Ignore it, VM exit provides enough
2056 			 * barriers on its own.
2057 			 */
2058 			error = 0;
2059 		} else {
2060 			/*
2061 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
2062 			 * rights.
2063 			 */
2064 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1);
2065 		}
2066 		break;
2067 	default:
2068 		error = EINVAL;
2069 		break;
2070 	}
2071 
2072 	return (error);
2073 }
2074 
2075 static int
2076 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid)
2077 {
2078 	uint64_t val;
2079 	int error;
2080 
2081 	if (vie->paging.cpl != 0) {
2082 		vm_inject_gp(vm, vcpuid);
2083 		vie->num_processed = 0;
2084 		return (0);
2085 	}
2086 
2087 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val);
2088 	ASSERT(error == 0);
2089 
2090 	/* Clear %cr0.TS */
2091 	val &= ~CR0_TS;
2092 
2093 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val);
2094 	ASSERT(error == 0);
2095 
2096 	return (0);
2097 }
2098 
2099 static int
2100 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2101     uint64_t *rval, int bytes)
2102 {
2103 	int err;
2104 
2105 	if (vie->mmio_req_read.state == VR_DONE) {
2106 		ASSERT(vie->mmio_req_read.bytes == bytes);
2107 		ASSERT(vie->mmio_req_read.gpa == gpa);
2108 
2109 		*rval = vie->mmio_req_read.data;
2110 		return (0);
2111 	}
2112 
2113 	err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes);
2114 	if (err == 0) {
2115 		/*
2116 		 * A successful read from an in-kernel-emulated device may come
2117 		 * with side effects, so stash the result in case it's used for
2118 		 * an instruction which subsequently needs to issue an MMIO
2119 		 * write to userspace.
2120 		 */
2121 		ASSERT(vie->mmio_req_read.state == VR_NONE);
2122 
2123 		vie->mmio_req_read.bytes = bytes;
2124 		vie->mmio_req_read.gpa = gpa;
2125 		vie->mmio_req_read.data = *rval;
2126 		vie->mmio_req_read.state = VR_DONE;
2127 
2128 	} else if (err == ESRCH) {
2129 		/* Hope that userspace emulation can fulfill this read */
2130 		vie->mmio_req_read.bytes = bytes;
2131 		vie->mmio_req_read.gpa = gpa;
2132 		vie->mmio_req_read.state = VR_PENDING;
2133 		vie->status |= VIES_PENDING_MMIO;
2134 	} else if (err < 0) {
2135 		/*
2136 		 * The MMIO read failed in such a way that fallback to handling
2137 		 * in userspace is required.
2138 		 */
2139 		vie->status |= VIES_USER_FALLBACK;
2140 	}
2141 	return (err);
2142 }
2143 
2144 static int
2145 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2146     uint64_t wval, int bytes)
2147 {
2148 	int err;
2149 
2150 	if (vie->mmio_req_write.state == VR_DONE) {
2151 		ASSERT(vie->mmio_req_write.bytes == bytes);
2152 		ASSERT(vie->mmio_req_write.gpa == gpa);
2153 
2154 		return (0);
2155 	}
2156 
2157 	err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes);
2158 	if (err == 0) {
2159 		/*
2160 		 * A successful write to an in-kernel-emulated device probably
2161 		 * results in side effects, so stash the fact that such a write
2162 		 * succeeded in case the operation requires other work.
2163 		 */
2164 		vie->mmio_req_write.bytes = bytes;
2165 		vie->mmio_req_write.gpa = gpa;
2166 		vie->mmio_req_write.data = wval;
2167 		vie->mmio_req_write.state = VR_DONE;
2168 	} else if (err == ESRCH) {
2169 		/* Hope that userspace emulation can fulfill this write */
2170 		vie->mmio_req_write.bytes = bytes;
2171 		vie->mmio_req_write.gpa = gpa;
2172 		vie->mmio_req_write.data = wval;
2173 		vie->mmio_req_write.state = VR_PENDING;
2174 		vie->status |= VIES_PENDING_MMIO;
2175 	} else if (err < 0) {
2176 		/*
2177 		 * The MMIO write failed in such a way that fallback to handling
2178 		 * in userspace is required.
2179 		 */
2180 		vie->status |= VIES_USER_FALLBACK;
2181 	}
2182 	return (err);
2183 }
2184 
2185 int
2186 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid)
2187 {
2188 	int error;
2189 	uint64_t gpa;
2190 
2191 	if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) !=
2192 	    (VIES_INST_DECODE | VIES_MMIO)) {
2193 		return (EINVAL);
2194 	}
2195 
2196 	gpa = vie->mmio_gpa;
2197 
2198 	switch (vie->op.op_type) {
2199 	case VIE_OP_TYPE_GROUP1:
2200 		error = vie_emulate_group1(vie, vm, vcpuid, gpa);
2201 		break;
2202 	case VIE_OP_TYPE_POP:
2203 		error = vie_emulate_pop(vie, vm, vcpuid, gpa);
2204 		break;
2205 	case VIE_OP_TYPE_PUSH:
2206 		error = vie_emulate_push(vie, vm, vcpuid, gpa);
2207 		break;
2208 	case VIE_OP_TYPE_CMP:
2209 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2210 		break;
2211 	case VIE_OP_TYPE_MOV:
2212 		error = vie_emulate_mov(vie, vm, vcpuid, gpa);
2213 		break;
2214 	case VIE_OP_TYPE_MOVSX:
2215 	case VIE_OP_TYPE_MOVZX:
2216 		error = vie_emulate_movx(vie, vm, vcpuid, gpa);
2217 		break;
2218 	case VIE_OP_TYPE_MOVS:
2219 		error = vie_emulate_movs(vie, vm, vcpuid, gpa);
2220 		break;
2221 	case VIE_OP_TYPE_STOS:
2222 		error = vie_emulate_stos(vie, vm, vcpuid, gpa);
2223 		break;
2224 	case VIE_OP_TYPE_AND:
2225 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
2226 		break;
2227 	case VIE_OP_TYPE_OR:
2228 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
2229 		break;
2230 	case VIE_OP_TYPE_SUB:
2231 		error = vie_emulate_sub(vie, vm, vcpuid, gpa);
2232 		break;
2233 	case VIE_OP_TYPE_BITTEST:
2234 		error = vie_emulate_bittest(vie, vm, vcpuid, gpa);
2235 		break;
2236 	case VIE_OP_TYPE_TWOB_GRP15:
2237 		error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa);
2238 		break;
2239 	case VIE_OP_TYPE_ADD:
2240 		error = vie_emulate_add(vie, vm, vcpuid, gpa);
2241 		break;
2242 	case VIE_OP_TYPE_TEST:
2243 		error = vie_emulate_test(vie, vm, vcpuid, gpa);
2244 		break;
2245 	case VIE_OP_TYPE_BEXTR:
2246 		error = vie_emulate_bextr(vie, vm, vcpuid, gpa);
2247 		break;
2248 	default:
2249 		error = EINVAL;
2250 		break;
2251 	}
2252 
2253 	if (error == ESRCH) {
2254 		/* Return to userspace with the mmio request */
2255 		return (-1);
2256 	}
2257 
2258 	return (error);
2259 }
2260 
2261 static int
2262 vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid,
2263     uint32_t *eax)
2264 {
2265 	uint32_t mask, val;
2266 	bool in;
2267 	int err;
2268 
2269 	mask = vie_size2mask(vie->inout.bytes);
2270 	in = (vie->inout.flags & INOUT_IN) != 0;
2271 
2272 	if (!in) {
2273 		val = *eax & mask;
2274 	}
2275 
2276 	if (vie->inout_req_state != VR_DONE) {
2277 		err = vm_ioport_access(vm, vcpuid, in, vie->inout.port,
2278 		    vie->inout.bytes, &val);
2279 		val &= mask;
2280 	} else {
2281 		/*
2282 		 * This port access was handled in userspace and the result was
2283 		 * injected in to be handled now.
2284 		 */
2285 		val = vie->inout_req_val & mask;
2286 		vie->inout_req_state = VR_NONE;
2287 		err = 0;
2288 	}
2289 
2290 	if (err == ESRCH) {
2291 		vie->status |= VIES_PENDING_INOUT;
2292 		vie->inout_req_state = VR_PENDING;
2293 		return (err);
2294 	} else if (err != 0) {
2295 		return (err);
2296 	}
2297 
2298 	if (in) {
2299 		*eax = (*eax & ~mask) | val;
2300 	}
2301 	return (0);
2302 }
2303 
2304 static enum vm_reg_name
2305 vie_inout_segname(const struct vie *vie)
2306 {
2307 	uint8_t segidx = vie->inout.segment;
2308 	const enum vm_reg_name segmap[] = {
2309 		VM_REG_GUEST_ES,
2310 		VM_REG_GUEST_CS,
2311 		VM_REG_GUEST_SS,
2312 		VM_REG_GUEST_DS,
2313 		VM_REG_GUEST_FS,
2314 		VM_REG_GUEST_GS,
2315 	};
2316 	const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));
2317 
2318 	if (segidx >= maxidx) {
2319 		panic("unexpected segment index %u", segidx);
2320 	}
2321 	return (segmap[segidx]);
2322 }
2323 
2324 static int
2325 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
2326 {
2327 	uint8_t bytes, addrsize;
2328 	uint64_t index, count = 0, gla, rflags;
2329 	int prot, err, fault;
2330 	bool in, repeat;
2331 	enum vm_reg_name seg_reg, idx_reg;
2332 	struct vm_copyinfo copyinfo[2];
2333 
2334 	in = (vie->inout.flags & INOUT_IN) != 0;
2335 	bytes = vie->inout.bytes;
2336 	addrsize = vie->inout.addrsize;
2337 	prot = in ? PROT_WRITE : PROT_READ;
2338 
2339 	ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
2340 	ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);
2341 
2342 	idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
2343 	seg_reg = vie_inout_segname(vie);
2344 	err = vm_get_register(vm, vcpuid, idx_reg, &index);
2345 	ASSERT(err == 0);
2346 	index = index & vie_size2mask(addrsize);
2347 
2348 	repeat = (vie->inout.flags & INOUT_REP) != 0;
2349 
2350 	/* Count register */
2351 	if (repeat) {
2352 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
2353 		count &= vie_size2mask(addrsize);
2354 
2355 		if (count == 0) {
2356 			/*
2357 			 * If we were asked to emulate a REP INS/OUTS when the
2358 			 * count register is zero, no further work is required.
2359 			 */
2360 			return (0);
2361 		}
2362 	} else {
2363 		count = 1;
2364 	}
2365 
2366 	gla = 0;
2367 	if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
2368 	    idx_reg, &gla) != 0) {
2369 		/* vie_get_gla() already injected the appropriate fault */
2370 		return (0);
2371 	}
2372 
2373 	/*
2374 	 * The INS/OUTS emulate currently assumes that the memory target resides
2375 	 * within the guest system memory, rather than a device MMIO region.  If
2376 	 * such a case becomes a necessity, that additional handling could be
2377 	 * put in place.
2378 	 */
2379 	err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot,
2380 	    copyinfo, nitems(copyinfo), &fault);
2381 
2382 	if (err) {
2383 		/* Unrecoverable error */
2384 		return (err);
2385 	} else if (fault) {
2386 		/* Resume guest to handle fault */
2387 		return (0);
2388 	}
2389 
2390 	if (!in) {
2391 		vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes);
2392 	}
2393 
2394 	err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2395 
2396 	if (err == 0 && in) {
2397 		vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes);
2398 	}
2399 
2400 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2401 
2402 	if (err == 0) {
2403 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
2404 		    &rflags);
2405 		ASSERT(err == 0);
2406 
2407 		/* Update index */
2408 		if (rflags & PSL_D) {
2409 			index -= bytes;
2410 		} else {
2411 			index += bytes;
2412 		}
2413 
2414 		/* Update index register */
2415 		err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize);
2416 		ASSERT(err == 0);
2417 
2418 		/*
2419 		 * Update count register only if the instruction had a repeat
2420 		 * prefix.
2421 		 */
2422 		if ((vie->inout.flags & INOUT_REP) != 0) {
2423 			count--;
2424 			err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
2425 			    count, addrsize);
2426 			ASSERT(err == 0);
2427 
2428 			if (count != 0) {
2429 				return (vie_repeat(vie));
2430 			}
2431 		}
2432 	}
2433 
2434 	return (err);
2435 }
2436 
2437 int
2438 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid)
2439 {
2440 	int err = 0;
2441 
2442 	if ((vie->status & VIES_INOUT) == 0) {
2443 		return (EINVAL);
2444 	}
2445 
2446 	if ((vie->inout.flags & INOUT_STR) == 0) {
2447 		/*
2448 		 * For now, using the 'rep' prefixes with plain (non-string)
2449 		 * in/out is not supported.
2450 		 */
2451 		if ((vie->inout.flags & INOUT_REP) != 0) {
2452 			return (EINVAL);
2453 		}
2454 
2455 		err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2456 		if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) {
2457 			/*
2458 			 * With the inX access now a success, the result needs
2459 			 * to be stored in the guest %rax.
2460 			 */
2461 			err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2462 			    vie->inout.eax);
2463 			VERIFY0(err);
2464 		}
2465 	} else {
2466 		vie->status &= ~VIES_REPEAT;
2467 		err = vie_emulate_inout_str(vie, vm, vcpuid);
2468 
2469 	}
2470 	if (err < 0) {
2471 		/*
2472 		 * Access to an I/O port failed in such a way that fallback to
2473 		 * handling in userspace is required.
2474 		 */
2475 		vie->status |= VIES_USER_FALLBACK;
2476 	} else if (err == ESRCH) {
2477 		ASSERT(vie->status & VIES_PENDING_INOUT);
2478 		/* Return to userspace with the in/out request */
2479 		err = -1;
2480 	}
2481 
2482 	return (err);
2483 }
2484 
2485 int
2486 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid)
2487 {
2488 	int error;
2489 
2490 	if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) !=
2491 	    (VIES_INST_DECODE | VIES_OTHER)) {
2492 		return (EINVAL);
2493 	}
2494 
2495 	switch (vie->op.op_type) {
2496 	case VIE_OP_TYPE_CLTS:
2497 		error = vie_emulate_clts(vie, vm, vcpuid);
2498 		break;
2499 	case VIE_OP_TYPE_MOV_CR:
2500 		error = vie_emulate_mov_cr(vie, vm, vcpuid);
2501 		break;
2502 	default:
2503 		error = EINVAL;
2504 		break;
2505 	}
2506 
2507 	return (error);
2508 }
2509 
2510 void
2511 vie_reset(struct vie *vie)
2512 {
2513 	vie->status = 0;
2514 	vie->num_processed = vie->num_valid = 0;
2515 }
2516 
2517 void
2518 vie_advance_pc(struct vie *vie, uint64_t *nextrip)
2519 {
2520 	VERIFY((vie->status & VIES_REPEAT) == 0);
2521 
2522 	*nextrip += vie->num_processed;
2523 	vie_reset(vie);
2524 }
2525 
2526 void
2527 vie_exitinfo(const struct vie *vie, struct vm_exit *vme)
2528 {
2529 	if (vie->status & VIES_USER_FALLBACK) {
2530 		/*
2531 		 * Despite the fact that the instruction was successfully
2532 		 * decoded, some aspect of the emulation failed in such a way
2533 		 * that it is left up to userspace to complete the operation.
2534 		 */
2535 		vie_fallback_exitinfo(vie, vme);
2536 	} else if (vie->status & VIES_MMIO) {
2537 		vme->exitcode = VM_EXITCODE_MMIO;
2538 		if (vie->mmio_req_read.state == VR_PENDING) {
2539 			vme->u.mmio.gpa = vie->mmio_req_read.gpa;
2540 			vme->u.mmio.data = 0;
2541 			vme->u.mmio.bytes = vie->mmio_req_read.bytes;
2542 			vme->u.mmio.read = 1;
2543 		} else if (vie->mmio_req_write.state == VR_PENDING) {
2544 			vme->u.mmio.gpa = vie->mmio_req_write.gpa;
2545 			vme->u.mmio.data = vie->mmio_req_write.data &
2546 			    vie_size2mask(vie->mmio_req_write.bytes);
2547 			vme->u.mmio.bytes = vie->mmio_req_write.bytes;
2548 			vme->u.mmio.read = 0;
2549 		} else {
2550 			panic("bad pending MMIO state");
2551 		}
2552 	} else if (vie->status & VIES_INOUT) {
2553 		vme->exitcode = VM_EXITCODE_INOUT;
2554 		vme->u.inout.port = vie->inout.port;
2555 		vme->u.inout.bytes = vie->inout.bytes;
2556 		if ((vie->inout.flags & INOUT_IN) != 0) {
2557 			vme->u.inout.flags = INOUT_IN;
2558 			vme->u.inout.eax = 0;
2559 		} else {
2560 			vme->u.inout.flags = 0;
2561 			vme->u.inout.eax = vie->inout.eax &
2562 			    vie_size2mask(vie->inout.bytes);
2563 		}
2564 	} else {
2565 		panic("no pending operation");
2566 	}
2567 }
2568 
2569 /*
2570  * In the case of a decoding or verification failure, bailing out to userspace
2571  * to do the instruction emulation is our only option for now.
2572  */
2573 void
2574 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme)
2575 {
2576 	if ((vie->status & VIES_INST_FETCH) == 0) {
2577 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
2578 	} else {
2579 		ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst));
2580 
2581 		bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst));
2582 		vme->u.inst_emul.num_valid = vie->num_valid;
2583 	}
2584 	vme->exitcode = VM_EXITCODE_INST_EMUL;
2585 }
2586 
2587 void
2588 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base,
2589     int *cs_d)
2590 {
2591 	struct seg_desc cs_desc;
2592 	int error;
2593 
2594 	error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc);
2595 	ASSERT(error == 0);
2596 
2597 	/* Initialization required for the paging info to be populated */
2598 	VERIFY(vie->status & VIES_INIT);
2599 	switch (vie->paging.cpu_mode) {
2600 	case CPU_MODE_REAL:
2601 		*cs_base = cs_desc.base;
2602 		*cs_d = 0;
2603 		break;
2604 	case CPU_MODE_PROTECTED:
2605 	case CPU_MODE_COMPATIBILITY:
2606 		*cs_base = cs_desc.base;
2607 		*cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0;
2608 		break;
2609 	default:
2610 		*cs_base = 0;
2611 		*cs_d = 0;
2612 		break;
2613 	}
2614 }
2615 
2616 bool
2617 vie_pending(const struct vie *vie)
2618 {
2619 	/*
2620 	 * These VIE status bits indicate conditions which must be addressed
2621 	 * through either device IO fulfillment (with corresponding
2622 	 * vie_fulfill_*()) or complete userspace emulation (followed by a
2623 	 * vie_reset()).
2624 	 */
2625 	const enum vie_status of_interest =
2626 	    VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK;
2627 
2628 	return ((vie->status & of_interest) != 0);
2629 }
2630 
2631 bool
2632 vie_needs_fetch(const struct vie *vie)
2633 {
2634 	if (vie->status & VIES_INST_FETCH) {
2635 		ASSERT(vie->num_valid != 0);
2636 		return (false);
2637 	}
2638 	return (true);
2639 }
2640 
2641 static int
2642 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
2643 {
2644 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2645 	    ("%s: invalid size %d", __func__, size));
2646 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
2647 
2648 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
2649 		return (0);
2650 
2651 	return ((gla & (size - 1)) ? 1 : 0);
2652 }
2653 
2654 static int
2655 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
2656 {
2657 	uint64_t mask;
2658 
2659 	if (cpu_mode != CPU_MODE_64BIT)
2660 		return (0);
2661 
2662 	/*
2663 	 * The value of the bit 47 in the 'gla' should be replicated in the
2664 	 * most significant 16 bits.
2665 	 */
2666 	mask = ~((1UL << 48) - 1);
2667 	if (gla & (1UL << 47))
2668 		return ((gla & mask) != mask);
2669 	else
2670 		return ((gla & mask) != 0);
2671 }
2672 
2673 static uint64_t
2674 vie_size2mask(int size)
2675 {
2676 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2677 	    ("vie_size2mask: invalid size %d", size));
2678 	return (size2mask[size]);
2679 }
2680 
/*
 * Compute the guest linear address for an access of 'length' bytes at
 * 'offset' within segment 'seg' (described by 'desc'), using an effective
 * address size of 'addrsize' bytes.
 *
 * Outside of 64-bit mode the segment is validated first: it must be usable,
 * its type must permit the access requested in 'prot' (PROT_READ and/or
 * PROT_WRITE), and every byte of the access must fall within the segment
 * limits.  In 64-bit mode none of those checks are performed.
 *
 * Returns 0 with the result stored in '*gla' on success, or -1 if one of the
 * segment checks failed ('*gla' is left untouched in that case).
 */
static int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %x", __func__, prot));

	/* Remember the starting offset; 'offset' is consumed by the checks */
	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		/*
		 * Check each byte of the access individually, wrapping the
		 * offset at the effective address size as it advances.
		 */
		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	/* The sum is itself truncated to the linear address width */
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}
2790 
2791 void
2792 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length,
2793     const struct vm_guest_paging *paging, uint64_t gpa)
2794 {
2795 	KASSERT(inst_length <= VIE_INST_SIZE,
2796 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
2797 
2798 	bzero(vie, sizeof (struct vie));
2799 
2800 	vie->base_register = VM_REG_LAST;
2801 	vie->index_register = VM_REG_LAST;
2802 	vie->segment_register = VM_REG_LAST;
2803 	vie->status = VIES_INIT | VIES_MMIO;
2804 
2805 	if (inst_length != 0) {
2806 		bcopy(inst_bytes, vie->inst, inst_length);
2807 		vie->num_valid = inst_length;
2808 		vie->status |= VIES_INST_FETCH;
2809 	}
2810 
2811 	vie->paging = *paging;
2812 	vie->mmio_gpa = gpa;
2813 }
2814 
2815 void
2816 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len,
2817     const struct vm_guest_paging *paging)
2818 {
2819 	bzero(vie, sizeof (struct vie));
2820 
2821 	vie->status = VIES_INIT | VIES_INOUT;
2822 
2823 	vie->inout = *inout;
2824 	vie->paging = *paging;
2825 
2826 	/*
2827 	 * Since VMX/SVM assists already decoded the nature of the in/out
2828 	 * instruction, let the status reflect that.
2829 	 */
2830 	vie->status |= VIES_INST_FETCH | VIES_INST_DECODE;
2831 	vie->num_processed = inst_len;
2832 }
2833 
2834 void
2835 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging)
2836 {
2837 	bzero(vie, sizeof (struct vie));
2838 
2839 	vie->base_register = VM_REG_LAST;
2840 	vie->index_register = VM_REG_LAST;
2841 	vie->segment_register = VM_REG_LAST;
2842 	vie->status = VIES_INIT | VIES_OTHER;
2843 
2844 	vie->paging = *paging;
2845 }
2846 
2847 int
2848 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result)
2849 {
2850 	struct vie_mmio *pending;
2851 
2852 	if ((vie->status & VIES_MMIO) == 0 ||
2853 	    (vie->status & VIES_PENDING_MMIO) == 0) {
2854 		return (EINVAL);
2855 	}
2856 
2857 	if (result->read) {
2858 		pending = &vie->mmio_req_read;
2859 	} else {
2860 		pending = &vie->mmio_req_write;
2861 	}
2862 
2863 	if (pending->state != VR_PENDING ||
2864 	    pending->bytes != result->bytes || pending->gpa != result->gpa) {
2865 		return (EINVAL);
2866 	}
2867 
2868 	if (result->read) {
2869 		pending->data = result->data & vie_size2mask(pending->bytes);
2870 	}
2871 	pending->state = VR_DONE;
2872 	vie->status &= ~VIES_PENDING_MMIO;
2873 
2874 	return (0);
2875 }
2876 
2877 int
2878 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result)
2879 {
2880 	if ((vie->status & VIES_INOUT) == 0 ||
2881 	    (vie->status & VIES_PENDING_INOUT) == 0) {
2882 		return (EINVAL);
2883 	}
2884 	if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) ||
2885 	    vie->inout.bytes != result->bytes ||
2886 	    vie->inout.port != result->port) {
2887 		return (EINVAL);
2888 	}
2889 
2890 	if (result->flags & INOUT_IN) {
2891 		vie->inout_req_val = result->eax &
2892 		    vie_size2mask(vie->inout.bytes);
2893 	}
2894 	vie->inout_req_state = VR_DONE;
2895 	vie->status &= ~(VIES_PENDING_INOUT);
2896 
2897 	return (0);
2898 }
2899 
2900 uint64_t
2901 vie_mmio_gpa(const struct vie *vie)
2902 {
2903 	return (vie->mmio_gpa);
2904 }
2905 
2906 static int
2907 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2908 {
2909 	int error_code = 0;
2910 
2911 	if (pte & PG_V)
2912 		error_code |= PGEX_P;
2913 	if (prot & PROT_WRITE)
2914 		error_code |= PGEX_W;
2915 	if (usermode)
2916 		error_code |= PGEX_U;
2917 	if (rsvd)
2918 		error_code |= PGEX_RSV;
2919 	if (prot & PROT_EXEC)
2920 		error_code |= PGEX_I;
2921 
2922 	return (error_code);
2923 }
2924 
2925 static void
2926 ptp_release(vm_page_t **vmp)
2927 {
2928 	if (*vmp != NULL) {
2929 		(void) vmp_release(*vmp);
2930 		*vmp = NULL;
2931 	}
2932 }
2933 
2934 static void *
2935 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp)
2936 {
2937 	vm_client_t *vmc = vm_get_vmclient(vm, vcpu);
2938 	const uintptr_t hold_gpa = gpa & PAGEMASK;
2939 
2940 	/* Hold must not cross a page boundary */
2941 	VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE);
2942 
2943 	if (*vmp != NULL) {
2944 		(void) vmp_release(*vmp);
2945 	}
2946 
2947 	*vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE);
2948 	if (*vmp == NULL) {
2949 		return (NULL);
2950 	}
2951 
2952 	return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa));
2953 }
2954 
/*
 * Translate guest linear address 'gla' into a guest physical address by
 * walking the guest page tables described by 'paging'.
 *
 * Outcomes:
 *  - 0 with '*guest_fault' == 0: translation succeeded, result in '*gpa'
 *  - 0 with '*guest_fault' == 1: the translation faults in the guest
 *    (non-canonical address, or a not-present/permission/reserved-bit
 *    failure in the page tables)
 *  - EFAULT: a page-table page could not be held
 *
 * When 'check_only' is false, the corresponding exception (#GP or #PF) is
 * injected into the vCPU on a guest fault, and the accessed/dirty bits in
 * the guest page tables are updated as the walk proceeds.  When 'check_only'
 * is true, neither of those side effects occurs.
 */
static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode;
	int ptpshift = 0, ptpindex = 0;
	uint64_t ptpphys;
	uint64_t *ptpbase = NULL, pte = 0, pgsize = 0;
	/* Tracks the currently held page-table page, if any */
	vm_page_t *cookie = NULL;
	const bool usermode = paging->cpl == 3;
	const bool writable = (prot & PROT_WRITE) != 0;

	*guest_fault = 0;
	/*
	 * The walk is restarted from the top whenever an atomic update of an
	 * accessed/dirty bit fails because the entry changed underneath us.
	 */
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vm, vcpuid);
		*guest_fault = 1;
		return (0);
	}

	/* Without paging, linear and physical addresses are identical */
	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		return (0);
	}

	/* Two-level walk over 32-bit entries */
	if (paging->paging_mode == PAGING_MODE_32) {
		uint32_t *ptpbase32, pte32;

		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL) {
				return (EFAULT);
			}

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    0, pte32);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}

				ptp_release(&cookie);
				*guest_fault = 1;
				return (0);
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		ptp_release(&cookie);
		return (0);
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		/* The top-level PAE table consists of just four entries */
		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL) {
			return (EFAULT);
		}

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		/* Only the valid bit is checked for this top-level entry */
		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}

			ptp_release(&cookie);
			*guest_fault = 1;
			return (0);
		}

		ptpphys = pte;

		/* The remaining PAE levels use the common 64-bit walk below */
		nlevels = 2;
	} else {
		nlevels = 4;
	}

	/* Walk over 64-bit entries, 9 bits of index per level */
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys &= 0x000ffffffffff000UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL) {
			return (EFAULT);
		}

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}

			ptp_release(&cookie);
			*guest_fault = 1;
			return (0);
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			/*
			 * A large-page entry mapping more than 1GB is
			 * reported as a reserved-bit page fault.
			 */
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot,
					    1, pte);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}

				ptp_release(&cookie);
				*guest_fault = 1;
				return (0);
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}
	ptp_release(&cookie);

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
	return (0);
}
3157 
/*
 * Translate a guest linear address to a guest physical address with full
 * side effects: _vm_gla2gpa() is invoked with 'check_only' disabled.
 */
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = false;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
3166 
/*
 * Translate a guest linear address to a guest physical address without side
 * effects on the guest: _vm_gla2gpa() is invoked with 'check_only' enabled.
 */
int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	const bool check_only = true;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
3175 
3176 int
3177 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip,
3178     int *faultptr)
3179 {
3180 	struct vm_copyinfo copyinfo[2];
3181 	int error, prot;
3182 
3183 	if ((vie->status & VIES_INIT) == 0) {
3184 		return (EINVAL);
3185 	}
3186 
3187 	prot = PROT_READ | PROT_EXEC;
3188 	error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE,
3189 	    prot, copyinfo, nitems(copyinfo), faultptr);
3190 	if (error || *faultptr)
3191 		return (error);
3192 
3193 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE);
3194 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
3195 	vie->num_valid = VIE_INST_SIZE;
3196 	vie->status |= VIES_INST_FETCH;
3197 	return (0);
3198 }
3199 
3200 static int
3201 vie_peek(struct vie *vie, uint8_t *x)
3202 {
3203 
3204 	if (vie->num_processed < vie->num_valid) {
3205 		*x = vie->inst[vie->num_processed];
3206 		return (0);
3207 	} else
3208 		return (-1);
3209 }
3210 
3211 static void
3212 vie_advance(struct vie *vie)
3213 {
3214 
3215 	vie->num_processed++;
3216 }
3217 
3218 static bool
3219 segment_override(uint8_t x, int *seg)
3220 {
3221 
3222 	switch (x) {
3223 	case 0x2E:
3224 		*seg = VM_REG_GUEST_CS;
3225 		break;
3226 	case 0x36:
3227 		*seg = VM_REG_GUEST_SS;
3228 		break;
3229 	case 0x3E:
3230 		*seg = VM_REG_GUEST_DS;
3231 		break;
3232 	case 0x26:
3233 		*seg = VM_REG_GUEST_ES;
3234 		break;
3235 	case 0x64:
3236 		*seg = VM_REG_GUEST_FS;
3237 		break;
3238 	case 0x65:
3239 		*seg = VM_REG_GUEST_GS;
3240 		break;
3241 	default:
3242 		return (false);
3243 	}
3244 	return (true);
3245 }
3246 
/*
 * Decode the prefixes at the head of the instruction bytes held in the vie:
 * legacy prefixes (operand/address size overrides, REP/REPNZ, segment
 * overrides), then a REX prefix (64-bit mode only) or a 3-byte VEX prefix
 * (64-bit and compatibility modes).  When a VEX prefix is found, the opcode
 * byte following it is decoded here as well.
 *
 * On completion, the effective address and operand sizes are derived from
 * the CPU mode, the 'cs_d' default-size flag and the prefixes encountered.
 *
 * Returns 0 on success, or -1 if the instruction bytes run out or the VEX
 * encoding is not one which can be handled.
 */
static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	/* Consume legacy prefixes until a non-prefix byte is seen */
	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
	 *
	 * Note that 'x' still holds the byte examined above, so this branch
	 * is not taken when a REX prefix was just consumed.
	 */
	if ((cpu_mode == CPU_MODE_64BIT ||
	    cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) {
		const struct vie_op *optab;

		/* 3-byte VEX prefix. */
		vie->vex_present = 1;

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/*
		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
		 * relative to REX encoding.
		 */
		vie->rex_r = x & 0x80 ? 0 : 1;
		vie->rex_x = x & 0x40 ? 0 : 1;
		vie->rex_b = x & 0x20 ? 0 : 1;

		/* The mmmmm field selects the opcode map */
		switch (x & 0x1F) {
		case 0x2:
			/* 0F 38. */
			optab = three_byte_opcodes_0f38;
			break;
		case 0x1:
			/* 0F class - nothing handled here yet. */
			/* FALLTHROUGH */
		case 0x3:
			/* 0F 3A class - nothing handled here yet. */
			/* FALLTHROUGH */
		default:
			/* Reserved (#UD). */
			return (-1);
		}

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
		vie->rex_w = x & 0x80 ? 1 : 0;

		/* The vvvv register specifier is stored inverted */
		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
		vie->vex_l = !!(x & 0x4);
		vie->vex_pp = (x & 0x3);

		/* PP: 1=66 2=F3 3=F2 prefixes. */
		switch (vie->vex_pp) {
		case 0x1:
			vie->opsize_override = 1;
			break;
		case 0x2:
			vie->repz_present = 1;
			break;
		case 0x3:
			vie->repnz_present = 1;
			break;
		}

		vie_advance(vie);

		/* Opcode, sans literal prefix. */
		if (vie_peek(vie, &x))
			return (-1);

		vie->op = optab[x];
		if (vie->op.op_type == VIE_OP_TYPE_NONE)
			return (-1);

		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}
3390 
3391 static int
3392 decode_two_byte_opcode(struct vie *vie)
3393 {
3394 	uint8_t x;
3395 
3396 	if (vie_peek(vie, &x))
3397 		return (-1);
3398 
3399 	vie->op = two_byte_opcodes[x];
3400 
3401 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3402 		return (-1);
3403 
3404 	vie_advance(vie);
3405 	return (0);
3406 }
3407 
3408 static int
3409 decode_opcode(struct vie *vie)
3410 {
3411 	uint8_t x;
3412 
3413 	if (vie_peek(vie, &x))
3414 		return (-1);
3415 
3416 	/* Already did this via VEX prefix. */
3417 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
3418 		return (0);
3419 
3420 	vie->op = one_byte_opcodes[x];
3421 
3422 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3423 		return (-1);
3424 
3425 	vie_advance(vie);
3426 
3427 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
3428 		return (decode_two_byte_opcode(vie));
3429 
3430 	return (0);
3431 }
3432 
3433 static int
3434 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
3435 {
3436 	uint8_t x;
3437 	/*
3438 	 * Handling mov-to/from-cr is special since it is not issuing
3439 	 * mmio/pio requests and can be done in real mode.  We must bypass some
3440 	 * of the other existing decoding restrictions for it.
3441 	 */
3442 	const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0);
3443 
3444 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
3445 		return (0);
3446 
3447 	if (cpu_mode == CPU_MODE_REAL && !is_movcr)
3448 		return (-1);
3449 
3450 	if (vie_peek(vie, &x))
3451 		return (-1);
3452 
3453 	vie->mod = (x >> 6) & 0x3;
3454 	vie->rm =  (x >> 0) & 0x7;
3455 	vie->reg = (x >> 3) & 0x7;
3456 
3457 	/*
3458 	 * A direct addressing mode makes no sense in the context of an EPT
3459 	 * fault. There has to be a memory access involved to cause the
3460 	 * EPT fault.
3461 	 */
3462 	if (vie->mod == VIE_MOD_DIRECT && !is_movcr)
3463 		return (-1);
3464 
3465 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
3466 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
3467 		/*
3468 		 * Table 2-5: Special Cases of REX Encodings
3469 		 *
3470 		 * mod=0, r/m=5 is used in the compatibility mode to
3471 		 * indicate a disp32 without a base register.
3472 		 *
3473 		 * mod!=3, r/m=4 is used in the compatibility mode to
3474 		 * indicate that the SIB byte is present.
3475 		 *
3476 		 * The 'b' bit in the REX prefix is don't care in
3477 		 * this case.
3478 		 */
3479 	} else {
3480 		vie->rm |= (vie->rex_b << 3);
3481 	}
3482 
3483 	vie->reg |= (vie->rex_r << 3);
3484 
3485 	/* SIB */
3486 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
3487 		goto done;
3488 
3489 	vie->base_register = gpr_map[vie->rm];
3490 
3491 	switch (vie->mod) {
3492 	case VIE_MOD_INDIRECT_DISP8:
3493 		vie->disp_bytes = 1;
3494 		break;
3495 	case VIE_MOD_INDIRECT_DISP32:
3496 		vie->disp_bytes = 4;
3497 		break;
3498 	case VIE_MOD_INDIRECT:
3499 		if (vie->rm == VIE_RM_DISP32) {
3500 			vie->disp_bytes = 4;
3501 			/*
3502 			 * Table 2-7. RIP-Relative Addressing
3503 			 *
3504 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
3505 			 * whereas in compatibility mode it just implies disp32.
3506 			 */
3507 
3508 			if (cpu_mode == CPU_MODE_64BIT)
3509 				vie->base_register = VM_REG_GUEST_RIP;
3510 			else
3511 				vie->base_register = VM_REG_LAST;
3512 		}
3513 		break;
3514 	}
3515 
3516 done:
3517 	vie_advance(vie);
3518 
3519 	return (0);
3520 }
3521 
3522 static int
3523 decode_sib(struct vie *vie)
3524 {
3525 	uint8_t x;
3526 
3527 	/* Proceed only if SIB byte is present */
3528 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
3529 		return (0);
3530 
3531 	if (vie_peek(vie, &x))
3532 		return (-1);
3533 
3534 	/* De-construct the SIB byte */
3535 	vie->ss = (x >> 6) & 0x3;
3536 	vie->index = (x >> 3) & 0x7;
3537 	vie->base = (x >> 0) & 0x7;
3538 
3539 	/* Apply the REX prefix modifiers */
3540 	vie->index |= vie->rex_x << 3;
3541 	vie->base |= vie->rex_b << 3;
3542 
3543 	switch (vie->mod) {
3544 	case VIE_MOD_INDIRECT_DISP8:
3545 		vie->disp_bytes = 1;
3546 		break;
3547 	case VIE_MOD_INDIRECT_DISP32:
3548 		vie->disp_bytes = 4;
3549 		break;
3550 	}
3551 
3552 	if (vie->mod == VIE_MOD_INDIRECT &&
3553 	    (vie->base == 5 || vie->base == 13)) {
3554 		/*
3555 		 * Special case when base register is unused if mod = 0
3556 		 * and base = %rbp or %r13.
3557 		 *
3558 		 * Documented in:
3559 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3560 		 * Table 2-5: Special Cases of REX Encodings
3561 		 */
3562 		vie->disp_bytes = 4;
3563 	} else {
3564 		vie->base_register = gpr_map[vie->base];
3565 	}
3566 
3567 	/*
3568 	 * All encodings of 'index' are valid except for %rsp (4).
3569 	 *
3570 	 * Documented in:
3571 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3572 	 * Table 2-5: Special Cases of REX Encodings
3573 	 */
3574 	if (vie->index != 4)
3575 		vie->index_register = gpr_map[vie->index];
3576 
3577 	/* 'scale' makes sense only in the context of an index register */
3578 	if (vie->index_register < VM_REG_LAST)
3579 		vie->scale = 1 << vie->ss;
3580 
3581 	vie_advance(vie);
3582 
3583 	return (0);
3584 }
3585 
3586 static int
3587 decode_displacement(struct vie *vie)
3588 {
3589 	int n, i;
3590 	uint8_t x;
3591 
3592 	union {
3593 		char	buf[4];
3594 		int8_t	signed8;
3595 		int32_t	signed32;
3596 	} u;
3597 
3598 	if ((n = vie->disp_bytes) == 0)
3599 		return (0);
3600 
3601 	if (n != 1 && n != 4)
3602 		panic("decode_displacement: invalid disp_bytes %d", n);
3603 
3604 	for (i = 0; i < n; i++) {
3605 		if (vie_peek(vie, &x))
3606 			return (-1);
3607 
3608 		u.buf[i] = x;
3609 		vie_advance(vie);
3610 	}
3611 
3612 	if (n == 1)
3613 		vie->displacement = u.signed8;		/* sign-extended */
3614 	else
3615 		vie->displacement = u.signed32;		/* sign-extended */
3616 
3617 	return (0);
3618 }
3619 
3620 static int
3621 decode_immediate(struct vie *vie)
3622 {
3623 	int i, n;
3624 	uint8_t x;
3625 	union {
3626 		char	buf[4];
3627 		int8_t	signed8;
3628 		int16_t	signed16;
3629 		int32_t	signed32;
3630 	} u;
3631 
3632 	/* Figure out immediate operand size (if any) */
3633 	if (vie->op.op_flags & VIE_OP_F_IMM) {
3634 		/*
3635 		 * Section 2.2.1.5 "Immediates", Intel SDM:
3636 		 * In 64-bit mode the typical size of immediate operands
3637 		 * remains 32-bits. When the operand size if 64-bits, the
3638 		 * processor sign-extends all immediates to 64-bits prior
3639 		 * to their use.
3640 		 */
3641 		if (vie->opsize == 4 || vie->opsize == 8)
3642 			vie->imm_bytes = 4;
3643 		else
3644 			vie->imm_bytes = 2;
3645 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
3646 		vie->imm_bytes = 1;
3647 	}
3648 
3649 	if ((n = vie->imm_bytes) == 0)
3650 		return (0);
3651 
3652 	KASSERT(n == 1 || n == 2 || n == 4,
3653 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
3654 
3655 	for (i = 0; i < n; i++) {
3656 		if (vie_peek(vie, &x))
3657 			return (-1);
3658 
3659 		u.buf[i] = x;
3660 		vie_advance(vie);
3661 	}
3662 
3663 	/* sign-extend the immediate value before use */
3664 	if (n == 1)
3665 		vie->immediate = u.signed8;
3666 	else if (n == 2)
3667 		vie->immediate = u.signed16;
3668 	else
3669 		vie->immediate = u.signed32;
3670 
3671 	return (0);
3672 }
3673 
3674 static int
3675 decode_moffset(struct vie *vie)
3676 {
3677 	int i, n;
3678 	uint8_t x;
3679 	union {
3680 		char	buf[8];
3681 		uint64_t u64;
3682 	} u;
3683 
3684 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
3685 		return (0);
3686 
3687 	/*
3688 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
3689 	 * The memory offset size follows the address-size of the instruction.
3690 	 */
3691 	n = vie->addrsize;
3692 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
3693 
3694 	u.u64 = 0;
3695 	for (i = 0; i < n; i++) {
3696 		if (vie_peek(vie, &x))
3697 			return (-1);
3698 
3699 		u.buf[i] = x;
3700 		vie_advance(vie);
3701 	}
3702 	vie->displacement = u.u64;
3703 	return (0);
3704 }
3705 
3706 /*
3707  * Verify that the 'guest linear address' provided as collateral of the nested
3708  * page table fault matches with our instruction decoding.
3709  */
3710 int
3711 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla)
3712 {
3713 	int error;
3714 	uint64_t base, segbase, idx, gla2;
3715 	enum vm_reg_name seg;
3716 	struct seg_desc desc;
3717 
3718 	ASSERT((vie->status & VIES_INST_DECODE) != 0);
3719 
3720 	/*
3721 	 * If there was no valid GLA context with the exit, or the decoded
3722 	 * instruction acts on more than one address, verification is done.
3723 	 */
3724 	if (gla == VIE_INVALID_GLA ||
3725 	    (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) {
3726 		return (0);
3727 	}
3728 
3729 	base = 0;
3730 	if (vie->base_register != VM_REG_LAST) {
3731 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
3732 		if (error) {
3733 			printf("verify_gla: error %d getting base reg %d\n",
3734 			    error, vie->base_register);
3735 			return (-1);
3736 		}
3737 
3738 		/*
3739 		 * RIP-relative addressing starts from the following
3740 		 * instruction
3741 		 */
3742 		if (vie->base_register == VM_REG_GUEST_RIP)
3743 			base += vie->num_processed;
3744 	}
3745 
3746 	idx = 0;
3747 	if (vie->index_register != VM_REG_LAST) {
3748 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
3749 		if (error) {
3750 			printf("verify_gla: error %d getting index reg %d\n",
3751 			    error, vie->index_register);
3752 			return (-1);
3753 		}
3754 	}
3755 
3756 	/*
3757 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
3758 	 *
3759 	 * In 64-bit mode, segmentation is generally (but not
3760 	 * completely) disabled.  The exceptions are the FS and GS
3761 	 * segments.
3762 	 *
3763 	 * In legacy IA-32 mode, when the ESP or EBP register is used
3764 	 * as the base, the SS segment is the default segment.  For
3765 	 * other data references, except when relative to stack or
3766 	 * string destination the DS segment is the default.  These
3767 	 * can be overridden to allow other segments to be accessed.
3768 	 */
3769 	if (vie->segment_override) {
3770 		seg = vie->segment_register;
3771 	} else if (vie->base_register == VM_REG_GUEST_RSP ||
3772 	    vie->base_register == VM_REG_GUEST_RBP) {
3773 		seg = VM_REG_GUEST_SS;
3774 	} else {
3775 		seg = VM_REG_GUEST_DS;
3776 	}
3777 	if (vie->paging.cpu_mode == CPU_MODE_64BIT &&
3778 	    seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) {
3779 		segbase = 0;
3780 	} else {
3781 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
3782 		if (error) {
3783 			printf("verify_gla: error %d getting segment"
3784 			    " descriptor %d", error, vie->segment_register);
3785 			return (-1);
3786 		}
3787 		segbase = desc.base;
3788 	}
3789 
3790 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
3791 	gla2 &= size2mask[vie->addrsize];
3792 	if (gla != gla2) {
3793 		printf("verify_gla mismatch: segbase(0x%0lx)"
3794 		    "base(0x%0lx), scale(%d), index(0x%0lx), "
3795 		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
3796 		    segbase, base, vie->scale, idx, vie->displacement,
3797 		    gla, gla2);
3798 		return (-1);
3799 	}
3800 
3801 	return (0);
3802 }
3803 
3804 int
3805 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d)
3806 {
3807 	enum vm_cpu_mode cpu_mode;
3808 
3809 	if ((vie->status & VIES_INST_FETCH) == 0) {
3810 		return (EINVAL);
3811 	}
3812 
3813 	cpu_mode = vie->paging.cpu_mode;
3814 
3815 	if (decode_prefixes(vie, cpu_mode, cs_d))
3816 		return (-1);
3817 
3818 	if (decode_opcode(vie))
3819 		return (-1);
3820 
3821 	if (decode_modrm(vie, cpu_mode))
3822 		return (-1);
3823 
3824 	if (decode_sib(vie))
3825 		return (-1);
3826 
3827 	if (decode_displacement(vie))
3828 		return (-1);
3829 
3830 	if (decode_immediate(vie))
3831 		return (-1);
3832 
3833 	if (decode_moffset(vie))
3834 		return (-1);
3835 
3836 	vie->status |= VIES_INST_DECODE;
3837 
3838 	return (0);
3839 }
3840