xref: /freebsd/sys/amd64/vmm/vmm_instruction_emul.c (revision c18c521c79b6160ce43bb2ca4c2eb42ccf7e6e57)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 #ifdef _KERNEL
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/proc.h>
36 
37 #include <vm/vm.h>
38 #include <vm/pmap.h>
39 
40 #include <machine/vmparam.h>
41 #include <machine/vmm.h>
42 
43 #include <dev/vmm/vmm_mem.h>
44 #else	/* !_KERNEL */
45 #include <sys/types.h>
46 #include <sys/errno.h>
47 #include <sys/_iovec.h>
48 
49 #include <machine/vmm.h>
50 
51 #include <err.h>
52 #include <assert.h>
53 #include <stdbool.h>
54 #include <stddef.h>
55 #include <stdio.h>
56 #include <string.h>
57 #include <strings.h>
58 #include <vmmapi.h>
59 #define	__diagused
60 #define	KASSERT(exp,msg)	assert((exp))
61 #define	panic(...)		errx(4, __VA_ARGS__)
62 #endif	/* _KERNEL */
63 
64 #include <machine/vmm_instruction_emul.h>
65 #include <x86/psl.h>
66 #include <x86/specialreg.h>
67 
68 /* struct vie_op.op_flags */
69 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
70 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
71 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
72 #define	VIE_OP_F_NO_MODRM	(1 << 3)  /* no ModRM byte after the opcode */
73 #define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)  /* skip guest linear address check */
74 
75 static const struct vie_op three_byte_opcodes_0f38[256] = {
76 	[0xF7] = {
77 		.op_byte = 0xF7,
78 		.op_type = VIE_OP_TYPE_BEXTR,
79 	},
80 };
81 
82 static const struct vie_op two_byte_opcodes[256] = {
83 	[0xAE] = {
84 		.op_byte = 0xAE,
85 		.op_type = VIE_OP_TYPE_TWOB_GRP15,
86 	},
87 	[0xB6] = {
88 		.op_byte = 0xB6,
89 		.op_type = VIE_OP_TYPE_MOVZX,
90 	},
91 	[0xB7] = {
92 		.op_byte = 0xB7,
93 		.op_type = VIE_OP_TYPE_MOVZX,
94 	},
95 	[0xBA] = {
96 		.op_byte = 0xBA,
97 		.op_type = VIE_OP_TYPE_BITTEST,
98 		.op_flags = VIE_OP_F_IMM8,
99 	},
100 	[0xBE] = {
101 		.op_byte = 0xBE,
102 		.op_type = VIE_OP_TYPE_MOVSX,
103 	},
104 };
105 
106 static const struct vie_op one_byte_opcodes[256] = {
107 	[0x03] = {
108 		.op_byte = 0x03,
109 		.op_type = VIE_OP_TYPE_ADD,
110 	},
111 	[0x0F] = {
112 		.op_byte = 0x0F,
113 		.op_type = VIE_OP_TYPE_TWO_BYTE
114 	},
115 	[0x0B] = {
116 		.op_byte = 0x0B,
117 		.op_type = VIE_OP_TYPE_OR,
118 	},
119 	[0x2B] = {
120 		.op_byte = 0x2B,
121 		.op_type = VIE_OP_TYPE_SUB,
122 	},
123 	[0x39] = {
124 		.op_byte = 0x39,
125 		.op_type = VIE_OP_TYPE_CMP,
126 	},
127 	[0x3B] = {
128 		.op_byte = 0x3B,
129 		.op_type = VIE_OP_TYPE_CMP,
130 	},
131 	[0x6E] = {
132 		.op_byte = 0x6E,
133 		.op_type = VIE_OP_TYPE_OUTS,
134 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
135 	},
136 	[0x6F] = {
137 		.op_byte = 0x6F,
138 		.op_type = VIE_OP_TYPE_OUTS,
139 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
140 	},
141 	[0x88] = {
142 		.op_byte = 0x88,
143 		.op_type = VIE_OP_TYPE_MOV,
144 	},
145 	[0x89] = {
146 		.op_byte = 0x89,
147 		.op_type = VIE_OP_TYPE_MOV,
148 	},
149 	[0x8A] = {
150 		.op_byte = 0x8A,
151 		.op_type = VIE_OP_TYPE_MOV,
152 	},
153 	[0x8B] = {
154 		.op_byte = 0x8B,
155 		.op_type = VIE_OP_TYPE_MOV,
156 	},
157 	[0xA1] = {
158 		.op_byte = 0xA1,
159 		.op_type = VIE_OP_TYPE_MOV,
160 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
161 	},
162 	[0xA3] = {
163 		.op_byte = 0xA3,
164 		.op_type = VIE_OP_TYPE_MOV,
165 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
166 	},
167 	[0xA4] = {
168 		.op_byte = 0xA4,
169 		.op_type = VIE_OP_TYPE_MOVS,
170 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
171 	},
172 	[0xA5] = {
173 		.op_byte = 0xA5,
174 		.op_type = VIE_OP_TYPE_MOVS,
175 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
176 	},
177 	[0xAA] = {
178 		.op_byte = 0xAA,
179 		.op_type = VIE_OP_TYPE_STOS,
180 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
181 	},
182 	[0xAB] = {
183 		.op_byte = 0xAB,
184 		.op_type = VIE_OP_TYPE_STOS,
185 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
186 	},
187 	[0xC6] = {
188 		/* XXX Group 11 extended opcode - not just MOV */
189 		.op_byte = 0xC6,
190 		.op_type = VIE_OP_TYPE_MOV,
191 		.op_flags = VIE_OP_F_IMM8,
192 	},
193 	[0xC7] = {
194 		.op_byte = 0xC7,
195 		.op_type = VIE_OP_TYPE_MOV,
196 		.op_flags = VIE_OP_F_IMM,
197 	},
198 	[0x23] = {
199 		.op_byte = 0x23,
200 		.op_type = VIE_OP_TYPE_AND,
201 	},
202 	[0x80] = {
203 		/* Group 1 extended opcode */
204 		.op_byte = 0x80,
205 		.op_type = VIE_OP_TYPE_GROUP1,
206 		.op_flags = VIE_OP_F_IMM8,
207 	},
208 	[0x81] = {
209 		/* Group 1 extended opcode */
210 		.op_byte = 0x81,
211 		.op_type = VIE_OP_TYPE_GROUP1,
212 		.op_flags = VIE_OP_F_IMM,
213 	},
214 	[0x83] = {
215 		/* Group 1 extended opcode */
216 		.op_byte = 0x83,
217 		.op_type = VIE_OP_TYPE_GROUP1,
218 		.op_flags = VIE_OP_F_IMM8,
219 	},
220 	[0x8F] = {
221 		/* XXX Group 1A extended opcode - not just POP */
222 		.op_byte = 0x8F,
223 		.op_type = VIE_OP_TYPE_POP,
224 	},
225 	[0xF6] = {
226 		/* XXX Group 3 extended opcode - not just TEST */
227 		.op_byte = 0xF6,
228 		.op_type = VIE_OP_TYPE_TEST,
229 		.op_flags = VIE_OP_F_IMM8,
230 	},
231 	[0xF7] = {
232 		/* XXX Group 3 extended opcode - not just TEST */
233 		.op_byte = 0xF7,
234 		.op_type = VIE_OP_TYPE_TEST,
235 		.op_flags = VIE_OP_F_IMM,
236 	},
237 	[0xFF] = {
238 		/* XXX Group 5 extended opcode - not just PUSH */
239 		.op_byte = 0xFF,
240 		.op_type = VIE_OP_TYPE_PUSH,
241 	}
242 };
243 
244 /* struct vie.mod */
245 #define	VIE_MOD_INDIRECT		0
246 #define	VIE_MOD_INDIRECT_DISP8		1
247 #define	VIE_MOD_INDIRECT_DISP32		2
248 #define	VIE_MOD_DIRECT			3
249 
250 /* struct vie.rm */
251 #define	VIE_RM_SIB			4
252 #define	VIE_RM_DISP32			5
253 
254 #define	GB				(1024 * 1024 * 1024)
255 
256 static enum vm_reg_name gpr_map[16] = {
257 	VM_REG_GUEST_RAX,
258 	VM_REG_GUEST_RCX,
259 	VM_REG_GUEST_RDX,
260 	VM_REG_GUEST_RBX,
261 	VM_REG_GUEST_RSP,
262 	VM_REG_GUEST_RBP,
263 	VM_REG_GUEST_RSI,
264 	VM_REG_GUEST_RDI,
265 	VM_REG_GUEST_R8,
266 	VM_REG_GUEST_R9,
267 	VM_REG_GUEST_R10,
268 	VM_REG_GUEST_R11,
269 	VM_REG_GUEST_R12,
270 	VM_REG_GUEST_R13,
271 	VM_REG_GUEST_R14,
272 	VM_REG_GUEST_R15
273 };
274 
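/* Mask covering the low-order 1, 2, 4 or 8 bytes, indexed by operand size. */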
275 static uint64_t size2mask[] = {
276 	[1] = 0xff,
277 	[2] = 0xffff,
278 	[4] = 0xffffffff,
279 	[8] = 0xffffffffffffffff,
280 };
281 
282 static int
283 vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval)
284 {
285 	int error;
286 
287 	error = vm_get_register(vcpu, reg, rval);
288 
289 	return (error);
290 }
291 
292 static void
293 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
294 {
295 	*lhbr = 0;
296 	*reg = gpr_map[vie->reg];
297 
298 	/*
299 	 * 64-bit mode imposes limitations on accessing legacy high byte
300 	 * registers (lhbr).
301 	 *
302 	 * The legacy high-byte registers cannot be addressed if the REX
303 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
304 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
305 	 *
306 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
307 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
308 	 * %ah, %ch, %dh and %bh respectively.
309 	 */
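	/* e.g. without REX, 'ModRM:reg' = 7 selects %bh, i.e. bits 15:8 of %rbx. */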
310 	if (!vie->rex_present) {
311 		if (vie->reg & 0x4) {
312 			*lhbr = 1;
313 			*reg = gpr_map[vie->reg & 0x3];
314 		}
315 	}
316 }
317 
318 static int
319 vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval)
320 {
321 	uint64_t val;
322 	int error, lhbr;
323 	enum vm_reg_name reg;
324 
325 	vie_calc_bytereg(vie, &reg, &lhbr);
326 	error = vm_get_register(vcpu, reg, &val);
327 
328 	/*
329 	 * To obtain the value of a legacy high byte register shift the
330 	 * base register right by 8 bits (%ah = %rax >> 8).
331 	 */
332 	if (lhbr)
333 		*rval = val >> 8;
334 	else
335 		*rval = val;
336 	return (error);
337 }
338 
339 static int
340 vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte)
341 {
342 	uint64_t origval, val, mask;
343 	int error, lhbr;
344 	enum vm_reg_name reg;
345 
346 	vie_calc_bytereg(vie, &reg, &lhbr);
347 	error = vm_get_register(vcpu, reg, &origval);
348 	if (error == 0) {
349 		val = byte;
350 		mask = 0xff;
351 		if (lhbr) {
352 			/*
353 			 * Shift left by 8 to store 'byte' in a legacy high
354 			 * byte register.
355 			 */
356 			val <<= 8;
357 			mask <<= 8;
358 		}
359 		val |= origval & ~mask;
360 		error = vm_set_register(vcpu, reg, val);
361 	}
362 	return (error);
363 }
364 
365 int
366 vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
367 		    uint64_t val, int size)
368 {
369 	int error;
370 	uint64_t origval;
371 
372 	switch (size) {
373 	case 1:
374 	case 2:
375 		error = vie_read_register(vcpu, reg, &origval);
376 		if (error)
377 			return (error);
378 		val &= size2mask[size];
379 		val |= origval & ~size2mask[size];
380 		break;
381 	case 4:
382 		val &= 0xffffffffUL;
383 		break;
384 	case 8:
385 		break;
386 	default:
387 		return (EINVAL);
388 	}
389 
390 	error = vm_set_register(vcpu, reg, val);
391 	return (error);
392 }
393 
394 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
395 
396 /*
397  * Return the status flags that would result from doing (x - y).
398  */
399 #define	GETCC(sz)							\
400 static u_long								\
401 getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
402 {									\
403 	u_long rflags;							\
404 									\
405 	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
406 	    "=r" (rflags), "+r" (x) : "m" (y));				\
407 	return (rflags);						\
408 } struct __hack
409 
410 GETCC(8);
411 GETCC(16);
412 GETCC(32);
413 GETCC(64);
414 
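/*
 * getcc() dispatches to the width-specific helper above; for example,
 * emulate_cmp() derives the flags of a 32-bit "cmp op1, op2" from
 * getcc(4, op1, op2).
 */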
415 static u_long
416 getcc(int opsize, uint64_t x, uint64_t y)
417 {
418 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
419 	    ("getcc: invalid operand size %d", opsize));
420 
421 	if (opsize == 1)
422 		return (getcc8(x, y));
423 	else if (opsize == 2)
424 		return (getcc16(x, y));
425 	else if (opsize == 4)
426 		return (getcc32(x, y));
427 	else
428 		return (getcc64(x, y));
429 }
430 
431 /*
432  * Macro creation of functions getaddflags{8,16,32,64}
433  */
434 #define	GETADDFLAGS(sz)							\
435 static u_long								\
436 getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
437 {									\
438 	u_long rflags;							\
439 									\
440 	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
441 	    "=r" (rflags), "+r" (x) : "m" (y));				\
442 	return (rflags);						\
443 } struct __hack
444 
445 GETADDFLAGS(8);
446 GETADDFLAGS(16);
447 GETADDFLAGS(32);
448 GETADDFLAGS(64);
449 
450 static u_long
451 getaddflags(int opsize, uint64_t x, uint64_t y)
452 {
453 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
454 	    ("getaddflags: invalid operand size %d", opsize));
455 
456 	if (opsize == 1)
457 		return (getaddflags8(x, y));
458 	else if (opsize == 2)
459 		return (getaddflags16(x, y));
460 	else if (opsize == 4)
461 		return (getaddflags32(x, y));
462 	else
463 		return (getaddflags64(x, y));
464 }
465 
466 /*
467  * Return the status flags that would result from doing (x & y).
468  */
469 #define	GETANDFLAGS(sz)							\
470 static u_long								\
471 getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
472 {									\
473 	u_long rflags;							\
474 									\
475 	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
476 	    "=r" (rflags), "+r" (x) : "m" (y));				\
477 	return (rflags);						\
478 } struct __hack
479 
480 GETANDFLAGS(8);
481 GETANDFLAGS(16);
482 GETANDFLAGS(32);
483 GETANDFLAGS(64);
484 
485 static u_long
486 getandflags(int opsize, uint64_t x, uint64_t y)
487 {
488 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
489 	    ("getandflags: invalid operand size %d", opsize));
490 
491 	if (opsize == 1)
492 		return (getandflags8(x, y));
493 	else if (opsize == 2)
494 		return (getandflags16(x, y));
495 	else if (opsize == 4)
496 		return (getandflags32(x, y));
497 	else
498 		return (getandflags64(x, y));
499 }
500 
501 static int
502 emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
503 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
504 {
505 	int error, size;
506 	enum vm_reg_name reg;
507 	uint8_t byte;
508 	uint64_t val;
509 
510 	size = vie->opsize;
511 	error = EINVAL;
512 
513 	switch (vie->op.op_byte) {
514 	case 0x88:
515 		/*
516 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
517 		 * 88/r:	mov r/m8, r8
518 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
519 		 */
520 		size = 1;	/* override for byte operation */
521 		error = vie_read_bytereg(vcpu, vie, &byte);
522 		if (error == 0)
523 			error = memwrite(vcpu, gpa, byte, size, arg);
524 		break;
525 	case 0x89:
526 		/*
527 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
528 		 * 89/r:	mov r/m16, r16
529 		 * 89/r:	mov r/m32, r32
530 		 * REX.W + 89/r	mov r/m64, r64
531 		 */
532 		reg = gpr_map[vie->reg];
533 		error = vie_read_register(vcpu, reg, &val);
534 		if (error == 0) {
535 			val &= size2mask[size];
536 			error = memwrite(vcpu, gpa, val, size, arg);
537 		}
538 		break;
539 	case 0x8A:
540 		/*
541 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
542 		 * 8A/r:	mov r8, r/m8
543 		 * REX + 8A/r:	mov r8, r/m8
544 		 */
545 		size = 1;	/* override for byte operation */
546 		error = memread(vcpu, gpa, &val, size, arg);
547 		if (error == 0)
548 			error = vie_write_bytereg(vcpu, vie, val);
549 		break;
550 	case 0x8B:
551 		/*
552 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
553 		 * 8B/r:	mov r16, r/m16
554 		 * 8B/r:	mov r32, r/m32
555 		 * REX.W 8B/r:	mov r64, r/m64
556 		 */
557 		error = memread(vcpu, gpa, &val, size, arg);
558 		if (error == 0) {
559 			reg = gpr_map[vie->reg];
560 			error = vie_update_register(vcpu, reg, val, size);
561 		}
562 		break;
563 	case 0xA1:
564 		/*
565 		 * MOV from seg:moffset to AX/EAX/RAX
566 		 * A1:		mov AX, moffs16
567 		 * A1:		mov EAX, moffs32
568 		 * REX.W + A1:	mov RAX, moffs64
569 		 */
570 		error = memread(vcpu, gpa, &val, size, arg);
571 		if (error == 0) {
572 			reg = VM_REG_GUEST_RAX;
573 			error = vie_update_register(vcpu, reg, val, size);
574 		}
575 		break;
576 	case 0xA3:
577 		/*
578 		 * MOV from AX/EAX/RAX to seg:moffset
579 		 * A3:		mov moffs16, AX
580 		 * A3:		mov moffs32, EAX
581 		 * REX.W + A3:	mov moffs64, RAX
582 		 */
583 		error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
584 		if (error == 0) {
585 			val &= size2mask[size];
586 			error = memwrite(vcpu, gpa, val, size, arg);
587 		}
588 		break;
589 	case 0xC6:
590 		/*
591 		 * MOV from imm8 to mem (ModRM:r/m)
592 		 * C6/0		mov r/m8, imm8
593 		 * REX + C6/0	mov r/m8, imm8
594 		 */
595 		size = 1;	/* override for byte operation */
596 		error = memwrite(vcpu, gpa, vie->immediate, size, arg);
597 		break;
598 	case 0xC7:
599 		/*
600 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
601 		 * C7/0		mov r/m16, imm16
602 		 * C7/0		mov r/m32, imm32
603 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
604 		 */
605 		val = vie->immediate & size2mask[size];
606 		error = memwrite(vcpu, gpa, val, size, arg);
607 		break;
608 	default:
609 		break;
610 	}
611 
612 	return (error);
613 }
614 
615 static int
616 emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
617     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
618 {
619 	int error, size;
620 	enum vm_reg_name reg;
621 	uint64_t val;
622 
623 	size = vie->opsize;
624 	error = EINVAL;
625 
626 	switch (vie->op.op_byte) {
627 	case 0xB6:
628 		/*
629 		 * MOV and zero extend byte from mem (ModRM:r/m) to
630 		 * reg (ModRM:reg).
631 		 *
632 		 * 0F B6/r		movzx r16, r/m8
633 		 * 0F B6/r		movzx r32, r/m8
634 		 * REX.W + 0F B6/r	movzx r64, r/m8
635 		 */
636 
637 		/* get the first operand */
638 		error = memread(vcpu, gpa, &val, 1, arg);
639 		if (error)
640 			break;
641 
642 		/* get the second operand */
643 		reg = gpr_map[vie->reg];
644 
645 		/* zero-extend byte */
646 		val = (uint8_t)val;
647 
648 		/* write the result */
649 		error = vie_update_register(vcpu, reg, val, size);
650 		break;
651 	case 0xB7:
652 		/*
653 		 * MOV and zero extend word from mem (ModRM:r/m) to
654 		 * reg (ModRM:reg).
655 		 *
656 		 * 0F B7/r		movzx r32, r/m16
657 		 * REX.W + 0F B7/r	movzx r64, r/m16
658 		 */
659 		error = memread(vcpu, gpa, &val, 2, arg);
660 		if (error)
661 			return (error);
662 
663 		reg = gpr_map[vie->reg];
664 
665 		/* zero-extend word */
666 		val = (uint16_t)val;
667 
668 		error = vie_update_register(vcpu, reg, val, size);
669 		break;
670 	case 0xBE:
671 		/*
672 		 * MOV and sign extend byte from mem (ModRM:r/m) to
673 		 * reg (ModRM:reg).
674 		 *
675 		 * 0F BE/r		movsx r16, r/m8
676 		 * 0F BE/r		movsx r32, r/m8
677 		 * REX.W + 0F BE/r	movsx r64, r/m8
678 		 */
679 
680 		/* get the first operand */
681 		error = memread(vcpu, gpa, &val, 1, arg);
682 		if (error)
683 			break;
684 
685 		/* get the second operand */
686 		reg = gpr_map[vie->reg];
687 
688 		/* sign extend byte */
689 		val = (int8_t)val;
690 
691 		/* write the result */
692 		error = vie_update_register(vcpu, reg, val, size);
693 		break;
694 	default:
695 		break;
696 	}
697 	return (error);
698 }
699 
700 /*
701  * Helper function to calculate and validate a linear address.
702  */
703 static int
704 get_gla(struct vcpu *vcpu, struct vie *vie __unused,
705     struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
706     enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
707 {
708 	struct seg_desc desc;
709 	uint64_t cr0, val, rflags;
710 	int error __diagused;
711 
712 	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
713 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
714 
715 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
716 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
717 
718 	error = vm_get_seg_desc(vcpu, seg, &desc);
719 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
720 	    __func__, error, seg));
721 
722 	error = vie_read_register(vcpu, gpr, &val);
723 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
724 	    error, gpr));
725 
726 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
727 	    addrsize, prot, gla)) {
728 		if (seg == VM_REG_GUEST_SS)
729 			vm_inject_ss(vcpu, 0);
730 		else
731 			vm_inject_gp(vcpu);
732 		goto guest_fault;
733 	}
734 
735 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
736 		if (seg == VM_REG_GUEST_SS)
737 			vm_inject_ss(vcpu, 0);
738 		else
739 			vm_inject_gp(vcpu);
740 		goto guest_fault;
741 	}
742 
743 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
744 		vm_inject_ac(vcpu, 0);
745 		goto guest_fault;
746 	}
747 
748 	*fault = 0;
749 	return (0);
750 
751 guest_fault:
752 	*fault = 1;
753 	return (0);
754 }
755 
756 static int
757 emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
758     struct vm_guest_paging *paging, mem_region_read_t memread,
759     mem_region_write_t memwrite, void *arg)
760 {
761 #ifdef _KERNEL
762 	struct vm_copyinfo copyinfo[2];
763 #else
764 	struct iovec copyinfo[2];
765 #endif
766 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
767 	uint64_t rcx, rdi, rsi, rflags;
768 	int error, fault, opsize, seg, repeat;
769 
770 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
771 	val = 0;
772 	error = 0;
773 
774 	/*
775 	 * XXX although the MOVS instruction is only supposed to be used with
776 	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
777 	 *
778 	 * Empirically the "repnz" prefix has identical behavior to "rep"
779 	 * and the zero flag does not make a difference.
780 	 */
781 	repeat = vie->repz_present | vie->repnz_present;
782 
783 	if (repeat) {
784 		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
785 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
786 
787 		/*
788 		 * The count register is %rcx, %ecx or %cx depending on the
789 		 * address size of the instruction.
790 		 */
791 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
792 			error = 0;
793 			goto done;
794 		}
795 	}
796 
797 	/*
798 	 *	Source		Destination	Comments
799 	 *	--------------------------------------------
800 	 * (1)  memory		memory		n/a
801 	 * (2)  memory		mmio		emulated
802 	 * (3)  mmio		memory		emulated
803 	 * (4)  mmio		mmio		emulated
804 	 *
805 	 * At this point we don't have sufficient information to distinguish
806 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
807 	 * out because it will succeed only when operating on regular memory.
808 	 *
809 	 * XXX the emulation doesn't properly handle the case where 'gpa'
810 	 * is straddling the boundary between the normal memory and MMIO.
811 	 */
812 
813 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
814 	error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
815 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
816 	if (error || fault)
817 		goto done;
818 
819 	error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ,
820 	    copyinfo, nitems(copyinfo), &fault);
821 	if (error == 0) {
822 		if (fault)
823 			goto done;	/* Resume guest to handle fault */
824 
825 		/*
826 		 * case (2): read from system memory and write to mmio.
827 		 */
828 		vm_copyin(copyinfo, &val, opsize);
829 		vm_copy_teardown(copyinfo, nitems(copyinfo));
830 		error = memwrite(vcpu, gpa, val, opsize, arg);
831 		if (error)
832 			goto done;
833 	} else {
834 		/*
835 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
836 		 * if 'srcaddr' is in the mmio space.
837 		 */
838 
839 		error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
840 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
841 		    &fault);
842 		if (error || fault)
843 			goto done;
844 
845 		error = vm_copy_setup(vcpu, paging, dstaddr, opsize,
846 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
847 		if (error == 0) {
848 			if (fault)
849 				goto done;    /* Resume guest to handle fault */
850 
851 			/*
852 			 * case (3): read from MMIO and write to system memory.
853 			 *
854 			 * An MMIO read can have side-effects so we
855 			 * commit to it only after vm_copy_setup() is
856 			 * successful. If a page-fault needs to be
857 			 * injected into the guest then it will happen
858 			 * before the MMIO read is attempted.
859 			 */
860 			error = memread(vcpu, gpa, &val, opsize, arg);
861 			if (error)
862 				goto done;
863 
864 			vm_copyout(&val, copyinfo, opsize);
865 			vm_copy_teardown(copyinfo, nitems(copyinfo));
866 		} else {
867 			/*
868 			 * Case (4): read from and write to mmio.
869 			 *
870 			 * Commit to the MMIO read/write (with potential
871 			 * side-effects) only after we are sure that the
872 			 * instruction is not going to be restarted due
873 			 * to address translation faults.
874 			 */
875 			error = vm_gla2gpa(vcpu, paging, srcaddr,
876 			    PROT_READ, &srcgpa, &fault);
877 			if (error || fault)
878 				goto done;
879 
880 			error = vm_gla2gpa(vcpu, paging, dstaddr,
881 			   PROT_WRITE, &dstgpa, &fault);
882 			if (error || fault)
883 				goto done;
884 
885 			error = memread(vcpu, srcgpa, &val, opsize, arg);
886 			if (error)
887 				goto done;
888 
889 			error = memwrite(vcpu, dstgpa, val, opsize, arg);
890 			if (error)
891 				goto done;
892 		}
893 	}
894 
895 	error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi);
896 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
897 
898 	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
899 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
900 
901 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
902 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
903 
904 	if (rflags & PSL_D) {
905 		rsi -= opsize;
906 		rdi -= opsize;
907 	} else {
908 		rsi += opsize;
909 		rdi += opsize;
910 	}
911 
912 	error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi,
913 	    vie->addrsize);
914 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
915 
916 	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
917 	    vie->addrsize);
918 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
919 
920 	if (repeat) {
921 		rcx = rcx - 1;
922 		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
923 		    rcx, vie->addrsize);
924 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
925 
926 		/*
927 		 * Repeat the instruction if the count register is not zero.
928 		 */
929 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
930 			vm_restart_instruction(vcpu);
931 	}
932 done:
933 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
934 	    __func__, error));
935 	return (error);
936 }
937 
938 static int
939 emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
940     struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused,
941     mem_region_write_t memwrite, void *arg)
942 {
943 	int error, opsize, repeat;
944 	uint64_t val;
945 	uint64_t rcx, rdi, rflags;
946 
947 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
948 	repeat = vie->repz_present | vie->repnz_present;
949 
950 	if (repeat) {
951 		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
952 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
953 
954 		/*
955 		 * The count register is %rcx, %ecx or %cx depending on the
956 		 * address size of the instruction.
957 		 */
958 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
959 			return (0);
960 	}
961 
962 	error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
963 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
964 
965 	error = memwrite(vcpu, gpa, val, opsize, arg);
966 	if (error)
967 		return (error);
968 
969 	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
970 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
971 
972 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
973 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
974 
975 	if (rflags & PSL_D)
976 		rdi -= opsize;
977 	else
978 		rdi += opsize;
979 
980 	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
981 	    vie->addrsize);
982 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
983 
984 	if (repeat) {
985 		rcx = rcx - 1;
986 		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
987 		    rcx, vie->addrsize);
988 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
989 
990 		/*
991 		 * Repeat the instruction if the count register is not zero.
992 		 */
993 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
994 			vm_restart_instruction(vcpu);
995 	}
996 
997 	return (0);
998 }
999 
1000 static int
1001 emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1002 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1003 {
1004 	int error, size;
1005 	enum vm_reg_name reg;
1006 	uint64_t result, rflags, rflags2, val1, val2;
1007 
1008 	size = vie->opsize;
1009 	error = EINVAL;
1010 
1011 	switch (vie->op.op_byte) {
1012 	case 0x23:
1013 		/*
1014 		 * AND reg (ModRM:reg) with mem (ModRM:r/m) and store the
1015 		 * result in reg.
1016 		 *
1017 		 * 23/r		and r16, r/m16
1018 		 * 23/r		and r32, r/m32
1019 		 * REX.W + 23/r	and r64, r/m64
1020 		 */
1021 
1022 		/* get the first operand */
1023 		reg = gpr_map[vie->reg];
1024 		error = vie_read_register(vcpu, reg, &val1);
1025 		if (error)
1026 			break;
1027 
1028 		/* get the second operand */
1029 		error = memread(vcpu, gpa, &val2, size, arg);
1030 		if (error)
1031 			break;
1032 
1033 		/* perform the operation and write the result */
1034 		result = val1 & val2;
1035 		error = vie_update_register(vcpu, reg, result, size);
1036 		break;
1037 	case 0x81:
1038 	case 0x83:
1039 		/*
1040 		 * AND mem (ModRM:r/m) with immediate and store the
1041 		 * result in mem.
1042 		 *
1043 		 * 81 /4		and r/m16, imm16
1044 		 * 81 /4		and r/m32, imm32
1045 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1046 		 *
1047 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1048 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1049 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1050 		 */
1051 
1052 		/* get the first operand */
1053 		error = memread(vcpu, gpa, &val1, size, arg);
1054 		if (error)
1055 			break;
1056 
1057 		/*
1058 		 * perform the operation with the pre-fetched immediate
1059 		 * operand and write the result
1060 		 */
1061 		result = val1 & vie->immediate;
1062 		error = memwrite(vcpu, gpa, result, size, arg);
1063 		break;
1064 	default:
1065 		break;
1066 	}
1067 	if (error)
1068 		return (error);
1069 
1070 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1071 	if (error)
1072 		return (error);
1073 
1074 	/*
1075 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1076 	 * to the result; AF is undefined.
1077 	 *
1078 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1079 	 */
1080 	rflags2 = getcc(size, result, 0);
1081 	rflags &= ~RFLAGS_STATUS_BITS;
1082 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1083 
1084 	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1085 	return (error);
1086 }
1087 
1088 static int
1089 emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1090 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1091 {
1092 	int error, size;
1093 	enum vm_reg_name reg;
1094 	uint64_t result, rflags, rflags2, val1, val2;
1095 
1096 	size = vie->opsize;
1097 	error = EINVAL;
1098 
1099 	switch (vie->op.op_byte) {
1100 	case 0x0B:
1101 		/*
1102 		 * OR reg (ModRM:reg) with mem (ModRM:r/m) and store the
1103 		 * result in reg.
1104 		 *
1105 		 * 0b/r         or r16, r/m16
1106 		 * 0b/r         or r32, r/m32
1107 		 * REX.W + 0b/r or r64, r/m64
1108 		 */
1109 
1110 		/* get the first operand */
1111 		reg = gpr_map[vie->reg];
1112 		error = vie_read_register(vcpu, reg, &val1);
1113 		if (error)
1114 			break;
1115 
1116 		/* get the second operand */
1117 		error = memread(vcpu, gpa, &val2, size, arg);
1118 		if (error)
1119 			break;
1120 
1121 		/* perform the operation and write the result */
1122 		result = val1 | val2;
1123 		error = vie_update_register(vcpu, reg, result, size);
1124 		break;
1125 	case 0x81:
1126 	case 0x83:
1127 		/*
1128 		 * OR mem (ModRM:r/m) with immediate and store the
1129 		 * result in mem.
1130 		 *
1131 		 * 81 /1		or r/m16, imm16
1132 		 * 81 /1		or r/m32, imm32
1133 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1134 		 *
1135 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1136 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1137 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1138 		 */
1139 
1140 		/* get the first operand */
1141 		error = memread(vcpu, gpa, &val1, size, arg);
1142 		if (error)
1143 			break;
1144 
1145 		/*
1146 		 * perform the operation with the pre-fetched immediate
1147 		 * operand and write the result
1148 		 */
1149 		result = val1 | vie->immediate;
1150 		error = memwrite(vcpu, gpa, result, size, arg);
1151 		break;
1152 	default:
1153 		break;
1154 	}
1155 	if (error)
1156 		return (error);
1157 
1158 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1159 	if (error)
1160 		return (error);
1161 
1162 	/*
1163 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1164 	 * to the result; AF is undefined.
1165 	 *
1166 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1167 	 */
1168 	rflags2 = getcc(size, result, 0);
1169 	rflags &= ~RFLAGS_STATUS_BITS;
1170 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1171 
1172 	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1173 	return (error);
1174 }
1175 
1176 static int
1177 emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1178     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1179 {
1180 	int error, size;
1181 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1182 	enum vm_reg_name reg;
1183 
1184 	size = vie->opsize;
1185 	switch (vie->op.op_byte) {
1186 	case 0x39:
1187 	case 0x3B:
1188 		/*
1189 		 * 39/r		CMP r/m16, r16
1190 		 * 39/r		CMP r/m32, r32
1191 		 * REX.W 39/r	CMP r/m64, r64
1192 		 *
1193 		 * 3B/r		CMP r16, r/m16
1194 		 * 3B/r		CMP r32, r/m32
1195 		 * REX.W + 3B/r	CMP r64, r/m64
1196 		 *
1197 		 * Compare the first operand with the second operand and
1198 		 * set status flags in EFLAGS register. The comparison is
1199 		 * performed by subtracting the second operand from the first
1200 		 * operand and then setting the status flags.
1201 		 */
1202 
1203 		/* Get the register operand */
1204 		reg = gpr_map[vie->reg];
1205 		error = vie_read_register(vcpu, reg, &regop);
1206 		if (error)
1207 			return (error);
1208 
1209 		/* Get the memory operand */
1210 		error = memread(vcpu, gpa, &memop, size, arg);
1211 		if (error)
1212 			return (error);
1213 
1214 		if (vie->op.op_byte == 0x3B) {
1215 			op1 = regop;
1216 			op2 = memop;
1217 		} else {
1218 			op1 = memop;
1219 			op2 = regop;
1220 		}
1221 		rflags2 = getcc(size, op1, op2);
1222 		break;
1223 	case 0x80:
1224 	case 0x81:
1225 	case 0x83:
1226 		/*
1227 		 * 80 /7		cmp r/m8, imm8
1228 		 * REX + 80 /7		cmp r/m8, imm8
1229 		 *
1230 		 * 81 /7		cmp r/m16, imm16
1231 		 * 81 /7		cmp r/m32, imm32
1232 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1233 		 *
1234 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1235 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1236 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1237 		 *
1238 		 * Compare mem (ModRM:r/m) with immediate and set
1239 		 * status flags according to the results.  The
1240 		 * comparison is performed by subtracting the
1241 		 * immediate from the first operand and then setting
1242 		 * the status flags.
1243 		 *
1244 		 */
1245 		if (vie->op.op_byte == 0x80)
1246 			size = 1;
1247 
1248 		/* get the first operand */
1249 		error = memread(vcpu, gpa, &op1, size, arg);
1250 		if (error)
1251 			return (error);
1252 
1253 		rflags2 = getcc(size, op1, vie->immediate);
1254 		break;
1255 	default:
1256 		return (EINVAL);
1257 	}
1258 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1259 	if (error)
1260 		return (error);
1261 	rflags &= ~RFLAGS_STATUS_BITS;
1262 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1263 
1264 	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1265 	return (error);
1266 }
1267 
1268 static int
1269 emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1270     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1271 {
1272 	int error, size;
1273 	uint64_t op1, rflags, rflags2;
1274 
1275 	size = vie->opsize;
1276 	error = EINVAL;
1277 
1278 	switch (vie->op.op_byte) {
1279 	case 0xF6:
1280 		/*
1281 		 * F6 /0		test r/m8, imm8
1282 		 */
1283 		size = 1;	/* override for byte operation */
1284 		/* FALLTHROUGH */
1285 	case 0xF7:
1286 		/*
1287 		 * F7 /0		test r/m16, imm16
1288 		 * F7 /0		test r/m32, imm32
1289 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1290 		 *
1291 		 * Test mem (ModRM:r/m) with immediate and set status
1292 		 * flags according to the results.  The comparison is
1293 		 * performed by ANDing the immediate with the first
1294 		 * operand and then setting the status flags.
1295 		 */
1296 		if ((vie->reg & 7) != 0)
1297 			return (EINVAL);
1298 
1299 		error = memread(vcpu, gpa, &op1, size, arg);
1300 		if (error)
1301 			return (error);
1302 
1303 		rflags2 = getandflags(size, op1, vie->immediate);
1304 		break;
1305 	default:
1306 		return (EINVAL);
1307 	}
1308 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1309 	if (error)
1310 		return (error);
1311 
1312 	/*
1313 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1314 	 * to the result; AF is undefined.
1315 	 */
1316 	rflags &= ~RFLAGS_STATUS_BITS;
1317 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1318 
1319 	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1320 	return (error);
1321 }
1322 
1323 static int
1324 emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1325     struct vm_guest_paging *paging, mem_region_read_t memread,
1326     mem_region_write_t memwrite __unused, void *arg)
1327 {
1328 	uint64_t src1, src2, dst, rflags;
1329 	unsigned start, len, size;
1330 	int error;
1331 
1332 	size = vie->opsize;
1333 	error = EINVAL;
1334 
1335 	/*
1336 	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
1337 	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
1338 	 *
1339 	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
1340 	 * Vex.vvvv.
1341 	 *
1342 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1343 	 */
1344 	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1345 		size = 4;
1346 
1347 	/*
1348 	 * Extracts contiguous bits from the first /source/ operand (second
1349 	 * operand) using an index and length specified in the second /source/
1350 	 * operand (third operand).
1351 	 */
1352 	error = memread(vcpu, gpa, &src1, size, arg);
1353 	if (error)
1354 		return (error);
1355 	error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2);
1356 	if (error)
1357 		return (error);
1358 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1359 	if (error)
1360 		return (error);
1361 
1362 	start = (src2 & 0xff);
1363 	len = (src2 & 0xff00) >> 8;
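	/* e.g. src2 = 0x0804 extracts len = 8 bits starting at bit start = 4. */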
1364 
1365 	/* If no bits are extracted, the destination register is cleared. */
1366 	dst = 0;
1367 
1368 	/* If START exceeds the operand size, no bits are extracted. */
1369 	if (start > size * 8)
1370 		goto done;
1371 	/* Length is bounded by both the destination size and start offset. */
1372 	if (start + len > size * 8)
1373 		len = (size * 8) - start;
1374 	if (len == 0)
1375 		goto done;
1376 
1377 	if (start > 0)
1378 		src1 = (src1 >> start);
1379 	if (len < 64)
1380 		src1 = src1 & ((1ull << len) - 1);
1381 	dst = src1;
1382 
1383 done:
1384 	error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size);
1385 	if (error)
1386 		return (error);
1387 
1388 	/*
1389 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1390 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1391 	 */
1392 	rflags &= ~RFLAGS_STATUS_BITS;
1393 	if (dst == 0)
1394 		rflags |= PSL_Z;
1395 	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags,
1396 	    8);
1397 	return (error);
1398 }
1399 
1400 static int
1401 emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1402     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1403 {
1404 	int error, size;
1405 	uint64_t nval, rflags, rflags2, val1, val2;
1406 	enum vm_reg_name reg;
1407 
1408 	size = vie->opsize;
1409 	error = EINVAL;
1410 
1411 	switch (vie->op.op_byte) {
1412 	case 0x03:
1413 		/*
1414 		 * ADD r/m to r and store the result in r
1415 		 *
1416 		 * 03/r            ADD r16, r/m16
1417 		 * 03/r            ADD r32, r/m32
1418 		 * REX.W + 03/r    ADD r64, r/m64
1419 		 */
1420 
1421 		/* get the first operand */
1422 		reg = gpr_map[vie->reg];
1423 		error = vie_read_register(vcpu, reg, &val1);
1424 		if (error)
1425 			break;
1426 
1427 		/* get the second operand */
1428 		error = memread(vcpu, gpa, &val2, size, arg);
1429 		if (error)
1430 			break;
1431 
1432 		/* perform the operation and write the result */
1433 		nval = val1 + val2;
1434 		error = vie_update_register(vcpu, reg, nval, size);
1435 		break;
1436 	default:
1437 		break;
1438 	}
1439 
1440 	if (!error) {
1441 		rflags2 = getaddflags(size, val1, val2);
1442 		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
1443 		    &rflags);
1444 		if (error)
1445 			return (error);
1446 
1447 		rflags &= ~RFLAGS_STATUS_BITS;
1448 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1449 		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
1450 		    rflags, 8);
1451 	}
1452 
1453 	return (error);
1454 }
1455 
1456 static int
1457 emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1458     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1459 {
1460 	int error, size;
1461 	uint64_t nval, rflags, rflags2, val1, val2;
1462 	enum vm_reg_name reg;
1463 
1464 	size = vie->opsize;
1465 	error = EINVAL;
1466 
1467 	switch (vie->op.op_byte) {
1468 	case 0x2B:
1469 		/*
1470 		 * SUB r/m from r and store the result in r
1471 		 *
1472 		 * 2B/r            SUB r16, r/m16
1473 		 * 2B/r            SUB r32, r/m32
1474 		 * REX.W + 2B/r    SUB r64, r/m64
1475 		 */
1476 
1477 		/* get the first operand */
1478 		reg = gpr_map[vie->reg];
1479 		error = vie_read_register(vcpu, reg, &val1);
1480 		if (error)
1481 			break;
1482 
1483 		/* get the second operand */
1484 		error = memread(vcpu, gpa, &val2, size, arg);
1485 		if (error)
1486 			break;
1487 
1488 		/* perform the operation and write the result */
1489 		nval = val1 - val2;
1490 		error = vie_update_register(vcpu, reg, nval, size);
1491 		break;
1492 	default:
1493 		break;
1494 	}
1495 
1496 	if (!error) {
1497 		rflags2 = getcc(size, val1, val2);
1498 		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
1499 		    &rflags);
1500 		if (error)
1501 			return (error);
1502 
1503 		rflags &= ~RFLAGS_STATUS_BITS;
1504 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1505 		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
1506 		    rflags, 8);
1507 	}
1508 
1509 	return (error);
1510 }
1511 
1512 static int
1513 emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1514     struct vm_guest_paging *paging, mem_region_read_t memread,
1515     mem_region_write_t memwrite, void *arg)
1516 {
1517 #ifdef _KERNEL
1518 	struct vm_copyinfo copyinfo[2];
1519 #else
1520 	struct iovec copyinfo[2];
1521 #endif
1522 	struct seg_desc ss_desc;
1523 	uint64_t cr0, rflags, rsp, stack_gla, val;
1524 	int error, fault, size, stackaddrsize, pushop;
1525 
1526 	val = 0;
1527 	size = vie->opsize;
1528 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1529 
1530 	/*
1531 	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1532 	 */
1533 	if (paging->cpu_mode == CPU_MODE_REAL) {
1534 		stackaddrsize = 2;
1535 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1536 		/*
1537 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1538 		 * - Stack pointer size is always 64-bits.
1539 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1540 		 * - 16-bit PUSH/POP is supported by using the operand size
1541 		 *   override prefix (66H).
1542 		 */
1543 		stackaddrsize = 8;
1544 		size = vie->opsize_override ? 2 : 8;
1545 	} else {
1546 		/*
1547 		 * In protected or compatibility mode the 'B' flag in the
1548 		 * stack-segment descriptor determines the size of the
1549 		 * stack pointer.
1550 		 */
1551 		error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc);
1552 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1553 		    __func__, error));
1554 		if (SEG_DESC_DEF32(ss_desc.access))
1555 			stackaddrsize = 4;
1556 		else
1557 			stackaddrsize = 2;
1558 	}
1559 
1560 	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
1561 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1562 
1563 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1564 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1565 
1566 	error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp);
1567 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1568 	if (pushop) {
1569 		rsp -= size;
1570 	}
1571 
1572 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1573 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1574 	    &stack_gla)) {
1575 		vm_inject_ss(vcpu, 0);
1576 		return (0);
1577 	}
1578 
1579 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1580 		vm_inject_ss(vcpu, 0);
1581 		return (0);
1582 	}
1583 
1584 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1585 		vm_inject_ac(vcpu, 0);
1586 		return (0);
1587 	}
1588 
1589 	error = vm_copy_setup(vcpu, paging, stack_gla, size,
1590 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1591 	    &fault);
1592 	if (error || fault)
1593 		return (error);
1594 
1595 	if (pushop) {
1596 		error = memread(vcpu, mmio_gpa, &val, size, arg);
1597 		if (error == 0)
1598 			vm_copyout(&val, copyinfo, size);
1599 	} else {
1600 		vm_copyin(copyinfo, &val, size);
1601 		error = memwrite(vcpu, mmio_gpa, val, size, arg);
1602 		rsp += size;
1603 	}
1604 	vm_copy_teardown(copyinfo, nitems(copyinfo));
1605 
1606 	if (error == 0) {
1607 		error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp,
1608 		    stackaddrsize);
1609 		KASSERT(error == 0, ("error %d updating rsp", error));
1610 	}
1611 	return (error);
1612 }
1613 
1614 static int
1615 emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1616     struct vm_guest_paging *paging, mem_region_read_t memread,
1617     mem_region_write_t memwrite, void *arg)
1618 {
1619 	int error;
1620 
1621 	/*
1622 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1623 	 *
1624 	 * PUSH is part of the group 5 extended opcodes and is identified
1625 	 * by ModRM:reg = b110.
1626 	 */
1627 	if ((vie->reg & 7) != 6)
1628 		return (EINVAL);
1629 
1630 	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
1631 	    memwrite, arg);
1632 	return (error);
1633 }
1634 
1635 static int
1636 emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1637     struct vm_guest_paging *paging, mem_region_read_t memread,
1638     mem_region_write_t memwrite, void *arg)
1639 {
1640 	int error;
1641 
1642 	/*
1643 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1644 	 *
1645 	 * POP is part of the group 1A extended opcodes and is identified
1646 	 * by ModRM:reg = b000.
1647 	 */
1648 	if ((vie->reg & 7) != 0)
1649 		return (EINVAL);
1650 
1651 	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
1652 	    memwrite, arg);
1653 	return (error);
1654 }
1655 
1656 static int
1657 emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1658     struct vm_guest_paging *paging __unused, mem_region_read_t memread,
1659     mem_region_write_t memwrite, void *memarg)
1660 {
1661 	int error;
1662 
1663 	switch (vie->reg & 7) {
1664 	case 0x1:	/* OR */
1665 		error = emulate_or(vcpu, gpa, vie,
1666 		    memread, memwrite, memarg);
1667 		break;
1668 	case 0x4:	/* AND */
1669 		error = emulate_and(vcpu, gpa, vie,
1670 		    memread, memwrite, memarg);
1671 		break;
1672 	case 0x7:	/* CMP */
1673 		error = emulate_cmp(vcpu, gpa, vie,
1674 		    memread, memwrite, memarg);
1675 		break;
1676 	default:
1677 		error = EINVAL;
1678 		break;
1679 	}
1680 
1681 	return (error);
1682 }
1683 
1684 static int
1685 emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1686     mem_region_read_t memread, mem_region_write_t memwrite __unused,
1687     void *memarg)
1688 {
1689 	uint64_t val, rflags;
1690 	int error, bitmask, bitoff;
1691 
1692 	/*
1693 	 * 0F BA is a Group 8 extended opcode.
1694 	 *
1695 	 * Currently we only emulate the 'Bit Test' instruction which is
1696 	 * identified by a ModR/M:reg encoding of 100b.
1697 	 */
1698 	if ((vie->reg & 7) != 4)
1699 		return (EINVAL);
1700 
1701 	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1702 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1703 
1704 	error = memread(vcpu, gpa, &val, vie->opsize, memarg);
1705 	if (error)
1706 		return (error);
1707 
1708 	/*
1709 	 * Intel SDM, Vol 2, Table 3-2:
1710 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1711 	 */
1712 	bitmask = vie->opsize * 8 - 1;
1713 	bitoff = vie->immediate & bitmask;
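	/* e.g. a 4-byte operand limits the bit offset to positions 0-31. */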
1714 
1715 	/* Copy the bit into the Carry flag in %rflags */
1716 	if (val & (1UL << bitoff))
1717 		rflags |= PSL_C;
1718 	else
1719 		rflags &= ~PSL_C;
1720 
1721 	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1722 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1723 
1724 	return (0);
1725 }
1726 
1727 static int
1728 emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1729     mem_region_read_t memread, mem_region_write_t memwrite __unused,
1730     void *memarg)
1731 {
1732 	int error;
1733 	uint64_t buf;
1734 
1735 	switch (vie->reg & 7) {
1736 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1737 		if (vie->mod == 0x3) {
1738 			/*
1739 			 * SFENCE.  Ignore it; the VM exit provides enough
1740 			 * barriers on its own.
1741 			 */
1742 			error = 0;
1743 		} else {
1744 			/*
1745 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1746 			 * rights.
1747 			 */
1748 			error = memread(vcpu, gpa, &buf, 1, memarg);
1749 		}
1750 		break;
1751 	default:
1752 		error = EINVAL;
1753 		break;
1754 	}
1755 
1756 	return (error);
1757 }
1758 
1759 int
1760 vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1761     struct vm_guest_paging *paging, mem_region_read_t memread,
1762     mem_region_write_t memwrite, void *memarg)
1763 {
1764 	int error;
1765 
1766 	if (!vie->decoded)
1767 		return (EINVAL);
1768 
1769 	switch (vie->op.op_type) {
1770 	case VIE_OP_TYPE_GROUP1:
1771 		error = emulate_group1(vcpu, gpa, vie, paging, memread,
1772 		    memwrite, memarg);
1773 		break;
1774 	case VIE_OP_TYPE_POP:
1775 		error = emulate_pop(vcpu, gpa, vie, paging, memread,
1776 		    memwrite, memarg);
1777 		break;
1778 	case VIE_OP_TYPE_PUSH:
1779 		error = emulate_push(vcpu, gpa, vie, paging, memread,
1780 		    memwrite, memarg);
1781 		break;
1782 	case VIE_OP_TYPE_CMP:
1783 		error = emulate_cmp(vcpu, gpa, vie,
1784 				    memread, memwrite, memarg);
1785 		break;
1786 	case VIE_OP_TYPE_MOV:
1787 		error = emulate_mov(vcpu, gpa, vie,
1788 				    memread, memwrite, memarg);
1789 		break;
1790 	case VIE_OP_TYPE_MOVSX:
1791 	case VIE_OP_TYPE_MOVZX:
1792 		error = emulate_movx(vcpu, gpa, vie,
1793 				     memread, memwrite, memarg);
1794 		break;
1795 	case VIE_OP_TYPE_MOVS:
1796 		error = emulate_movs(vcpu, gpa, vie, paging, memread,
1797 		    memwrite, memarg);
1798 		break;
1799 	case VIE_OP_TYPE_STOS:
1800 		error = emulate_stos(vcpu, gpa, vie, paging, memread,
1801 		    memwrite, memarg);
1802 		break;
1803 	case VIE_OP_TYPE_AND:
1804 		error = emulate_and(vcpu, gpa, vie,
1805 				    memread, memwrite, memarg);
1806 		break;
1807 	case VIE_OP_TYPE_OR:
1808 		error = emulate_or(vcpu, gpa, vie,
1809 				    memread, memwrite, memarg);
1810 		break;
1811 	case VIE_OP_TYPE_SUB:
1812 		error = emulate_sub(vcpu, gpa, vie,
1813 				    memread, memwrite, memarg);
1814 		break;
1815 	case VIE_OP_TYPE_BITTEST:
1816 		error = emulate_bittest(vcpu, gpa, vie,
1817 		    memread, memwrite, memarg);
1818 		break;
1819 	case VIE_OP_TYPE_TWOB_GRP15:
1820 		error = emulate_twob_group15(vcpu, gpa, vie,
1821 		    memread, memwrite, memarg);
1822 		break;
1823 	case VIE_OP_TYPE_ADD:
1824 		error = emulate_add(vcpu, gpa, vie, memread,
1825 		    memwrite, memarg);
1826 		break;
1827 	case VIE_OP_TYPE_TEST:
1828 		error = emulate_test(vcpu, gpa, vie,
1829 		    memread, memwrite, memarg);
1830 		break;
1831 	case VIE_OP_TYPE_BEXTR:
1832 		error = emulate_bextr(vcpu, gpa, vie, paging,
1833 		    memread, memwrite, memarg);
1834 		break;
1835 	default:
1836 		error = EINVAL;
1837 		break;
1838 	}
1839 
1840 	return (error);
1841 }
1842 
1843 int
1844 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1845 {
1846 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1847 	    ("%s: invalid size %d", __func__, size));
1848 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1849 
1850 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1851 		return (0);
1852 
1853 	return ((gla & (size - 1)) ? 1 : 0);
1854 }
1855 
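/*
 * Canonical form check for 64-bit mode: bits 63:48 must equal bit 47.
 * For example, 0x00007fffffffffff and 0xffff800000000000 are canonical,
 * while 0x0000800000000000 is not.  Non 64-bit modes always pass.
 */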
1856 int
1857 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1858 {
1859 	uint64_t mask;
1860 
1861 	if (cpu_mode != CPU_MODE_64BIT)
1862 		return (0);
1863 
1864 	/*
1865 	 * The value of bit 47 in the 'gla' should be replicated in the
1866 	 * most significant 16 bits.
1867 	 */
1868 	mask = ~((1UL << 48) - 1);
1869 	if (gla & (1UL << 47))
1870 		return ((gla & mask) != mask);
1871 	else
1872 		return ((gla & mask) != 0);
1873 }
1874 
1875 uint64_t
1876 vie_size2mask(int size)
1877 {
1878 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1879 	    ("vie_size2mask: invalid size %d", size));
1880 	return (size2mask[size]);
1881 }
1882 
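/*
 * Compute the guest linear address for a segment:offset reference.  In
 * protected/compatibility mode the segment descriptor is validated
 * (usability, type, access rights) and the offset is checked against the
 * segment limit; the resulting GLA is (segment base + offset) truncated
 * to the effective address size.  Returns -1 if the access would fault.
 */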
1883 int
1884 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1885     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1886     int prot, uint64_t *gla)
1887 {
1888 	uint64_t firstoff, low_limit, high_limit, segbase;
1889 	int glasize, type;
1890 
1891 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1892 	    ("%s: invalid segment %d", __func__, seg));
1893 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1894 	    ("%s: invalid operand size %d", __func__, length));
1895 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1896 	    ("%s: invalid prot %#x", __func__, prot));
1897 
1898 	firstoff = offset;
1899 	if (cpu_mode == CPU_MODE_64BIT) {
1900 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1901 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1902 		glasize = 8;
1903 	} else {
1904 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1905 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1906 		glasize = 4;
1907 		/*
1908 		 * If the segment selector is loaded with a NULL selector
1909 		 * then the descriptor is unusable and attempting to use
1910 		 * it results in a #GP(0).
1911 		 */
1912 		if (SEG_DESC_UNUSABLE(desc->access))
1913 			return (-1);
1914 
1915 		/*
1916 		 * The processor generates a #NP exception when a segment
1917 		 * register is loaded with a selector that points to a
1918 		 * descriptor that is not present. If this was the case then
1919 		 * it would have been checked before the VM-exit.
1920 		 */
1921 		KASSERT(SEG_DESC_PRESENT(desc->access),
1922 		    ("segment %d not present: %#x", seg, desc->access));
1923 
1924 		/*
1925 		 * The descriptor type must indicate a code/data segment.
1926 		 */
1927 		type = SEG_DESC_TYPE(desc->access);
1928 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1929 		    "descriptor type %#x", seg, type));
1930 
1931 		if (prot & PROT_READ) {
1932 			/* #GP on a read access to an exec-only code segment */
1933 			if ((type & 0xA) == 0x8)
1934 				return (-1);
1935 		}
1936 
1937 		if (prot & PROT_WRITE) {
1938 			/*
1939 			 * #GP on a write access to a code segment or a
1940 			 * read-only data segment.
1941 			 */
1942 			if (type & 0x8)			/* code segment */
1943 				return (-1);
1944 
1945 			if ((type & 0xA) == 0)		/* read-only data seg */
1946 				return (-1);
1947 		}
1948 
1949 		/*
1950 		 * 'desc->limit' is fully expanded taking granularity into
1951 		 * account.
1952 		 */
1953 		if ((type & 0xC) == 0x4) {
1954 			/* expand-down data segment */
1955 			low_limit = desc->limit + 1;
1956 			high_limit = SEG_DESC_DEF32(desc->access) ?
1957 			    0xffffffff : 0xffff;
1958 		} else {
1959 			/* code segment or expand-up data segment */
1960 			low_limit = 0;
1961 			high_limit = desc->limit;
1962 		}
1963 
1964 		while (length > 0) {
1965 			offset &= vie_size2mask(addrsize);
1966 			if (offset < low_limit || offset > high_limit)
1967 				return (-1);
1968 			offset++;
1969 			length--;
1970 		}
1971 	}
1972 
1973 	/*
1974 	 * In 64-bit mode all segments except %fs and %gs have a segment
1975 	 * base address of 0.
1976 	 */
1977 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1978 	    seg != VM_REG_GUEST_GS) {
1979 		segbase = 0;
1980 	} else {
1981 		segbase = desc->base;
1982 	}
1983 
1984 	/*
1985 	 * Truncate 'firstoff' to the effective address size before adding
1986 	 * it to the segment base.
1987 	 */
1988 	firstoff &= vie_size2mask(addrsize);
1989 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1990 	return (0);
1991 }
1992 
1993 /*
1994  * Prepare a partially decoded vie for a 2nd attempt.
1995  */
1996 void
1997 vie_restart(struct vie *vie)
1998 {
1999 	_Static_assert(
2000 	    offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
2001 	    offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
2002 	    "restart should not erase instruction length or contents");
2003 
2004 	memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
2005 	    sizeof(*vie) - offsetof(struct vie, vie_startzero));
2006 
2007 	vie->base_register = VM_REG_LAST;
2008 	vie->index_register = VM_REG_LAST;
2009 	vie->segment_register = VM_REG_LAST;
2010 }
2011 
2012 void
2013 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
2014 {
2015 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
2016 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
2017 
2018 	vie_restart(vie);
2019 	memset(vie->inst, 0, sizeof(vie->inst));
2020 	if (inst_length != 0)
2021 		memcpy(vie->inst, inst_bytes, inst_length);
2022 	vie->num_valid = inst_length;
2023 }
2024 
2025 #ifdef _KERNEL
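/*
 * Construct the x86 page-fault error code (PGEX_* bits) corresponding to
 * the faulting access: present, write, user, reserved-bit and instruction
 * fetch.
 */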
2026 static int
2027 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2028 {
2029 	int error_code = 0;
2030 
2031 	if (pte & PG_V)
2032 		error_code |= PGEX_P;
2033 	if (prot & VM_PROT_WRITE)
2034 		error_code |= PGEX_W;
2035 	if (usermode)
2036 		error_code |= PGEX_U;
2037 	if (rsvd)
2038 		error_code |= PGEX_RSV;
2039 	if (prot & VM_PROT_EXECUTE)
2040 		error_code |= PGEX_I;
2041 
2042 	return (error_code);
2043 }
2044 
2045 static void
2046 ptp_release(void **cookie)
2047 {
2048 	if (*cookie != NULL) {
2049 		vm_gpa_release(*cookie);
2050 		*cookie = NULL;
2051 	}
2052 }
2053 
2054 static void *
2055 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
2056 {
2057 	void *ptr;
2058 
2059 	ptp_release(cookie);
2060 	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
2061 	return (ptr);
2062 }
2063 
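/*
 * Software walk of the guest page tables rooted at 'paging->cr3'.  The
 * walk handles flat, 32-bit, PAE and 4/5-level long mode paging, sets the
 * accessed and dirty bits with atomic compare-and-swap (restarting the
 * walk if the PTE changed underneath), and injects #GP/#PF into the guest
 * unless 'check_only' is set.  EFAULT is returned only if a page-table
 * page could not be held.
 */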
2064 static int
2065 _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2066     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
2067 {
2068 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
2069 	u_int retries;
2070 	uint64_t *ptpbase, ptpphys, pte, pgsize;
2071 	uint32_t *ptpbase32, pte32;
2072 	void *cookie;
2073 
2074 	*guest_fault = 0;
2075 
2076 	usermode = (paging->cpl == 3 ? 1 : 0);
2077 	writable = prot & VM_PROT_WRITE;
2078 	cookie = NULL;
2079 	retval = 0;
2080 	retries = 0;
2081 restart:
2082 	ptpphys = paging->cr3;		/* root of the page tables */
2083 	ptp_release(&cookie);
2084 	if (retries++ > 0)
2085 		maybe_yield();
2086 
2087 	if (vie_canonical_check(paging->cpu_mode, gla)) {
2088 		/*
2089 		 * XXX assuming a non-stack reference; otherwise a stack fault
2090 		 * should be generated.
2091 		 */
2092 		if (!check_only)
2093 			vm_inject_gp(vcpu);
2094 		goto fault;
2095 	}
2096 
2097 	if (paging->paging_mode == PAGING_MODE_FLAT) {
2098 		*gpa = gla;
2099 		goto done;
2100 	}
2101 
2102 	if (paging->paging_mode == PAGING_MODE_32) {
2103 		nlevels = 2;
2104 		while (--nlevels >= 0) {
2105 			/* Zero out the lower 12 bits. */
2106 			ptpphys &= ~0xfff;
2107 
2108 			ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
2109 			    &cookie);
2110 
2111 			if (ptpbase32 == NULL)
2112 				goto error;
2113 
2114 			ptpshift = PAGE_SHIFT + nlevels * 10;
2115 			ptpindex = (gla >> ptpshift) & 0x3FF;
2116 			pgsize = 1UL << ptpshift;
2117 
2118 			pte32 = ptpbase32[ptpindex];
2119 
2120 			if ((pte32 & PG_V) == 0 ||
2121 			    (usermode && (pte32 & PG_U) == 0) ||
2122 			    (writable && (pte32 & PG_RW) == 0)) {
2123 				if (!check_only) {
2124 					pfcode = pf_error_code(usermode, prot, 0,
2125 					    pte32);
2126 					vm_inject_pf(vcpu, pfcode, gla);
2127 				}
2128 				goto fault;
2129 			}
2130 
2131 			/*
2132 			 * Emulate the x86 MMU's management of the accessed
2133 			 * and dirty flags. While the accessed flag is set
2134 			 * at every level of the page table, the dirty flag
2135 			 * is only set at the last level providing the guest
2136 			 * physical address.
2137 			 */
2138 			if (!check_only && (pte32 & PG_A) == 0) {
2139 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
2140 				    pte32, pte32 | PG_A) == 0) {
2141 					goto restart;
2142 				}
2143 			}
2144 
2145 			/* XXX must be ignored if CR4.PSE=0 */
2146 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
2147 				break;
2148 
2149 			ptpphys = pte32;
2150 		}
2151 
2152 		/* Set the dirty bit in the page table entry if necessary */
2153 		if (!check_only && writable && (pte32 & PG_M) == 0) {
2154 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
2155 			    pte32, pte32 | PG_M) == 0) {
2156 				goto restart;
2157 			}
2158 		}
2159 
2160 		/* Zero out the lower 'ptpshift' bits */
2161 		pte32 >>= ptpshift; pte32 <<= ptpshift;
2162 		*gpa = pte32 | (gla & (pgsize - 1));
2163 		goto done;
2164 	}
2165 
2166 	if (paging->paging_mode == PAGING_MODE_PAE) {
2167 		/* Zero out the lower 5 bits and the upper 32 bits */
2168 		ptpphys &= 0xffffffe0UL;
2169 
2170 		ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
2171 		    &cookie);
2172 		if (ptpbase == NULL)
2173 			goto error;
2174 
2175 		ptpindex = (gla >> 30) & 0x3;
2176 
2177 		pte = ptpbase[ptpindex];
2178 
2179 		if ((pte & PG_V) == 0) {
2180 			if (!check_only) {
2181 				pfcode = pf_error_code(usermode, prot, 0, pte);
2182 				vm_inject_pf(vcpu, pfcode, gla);
2183 			}
2184 			goto fault;
2185 		}
2186 
2187 		ptpphys = pte;
2188 
2189 		nlevels = 2;
2190 	} else if (paging->paging_mode == PAGING_MODE_64_LA57) {
2191 		nlevels = 5;
2192 	} else {
2193 		nlevels = 4;
2194 	}
2195 
2196 	while (--nlevels >= 0) {
2197 		/* Zero out the lower 12 bits and the upper 12 bits */
2198 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
2199 
2200 		ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
2201 		if (ptpbase == NULL)
2202 			goto error;
2203 
2204 		ptpshift = PAGE_SHIFT + nlevels * 9;
2205 		ptpindex = (gla >> ptpshift) & 0x1FF;
2206 		pgsize = 1UL << ptpshift;
2207 
2208 		pte = ptpbase[ptpindex];
2209 
2210 		if ((pte & PG_V) == 0 ||
2211 		    (usermode && (pte & PG_U) == 0) ||
2212 		    (writable && (pte & PG_RW) == 0)) {
2213 			if (!check_only) {
2214 				pfcode = pf_error_code(usermode, prot, 0, pte);
2215 				vm_inject_pf(vcpu, pfcode, gla);
2216 			}
2217 			goto fault;
2218 		}
2219 
2220 		/* Set the accessed bit in the page table entry */
2221 		if (!check_only && (pte & PG_A) == 0) {
2222 			if (atomic_cmpset_64(&ptpbase[ptpindex],
2223 			    pte, pte | PG_A) == 0) {
2224 				goto restart;
2225 			}
2226 		}
2227 
2228 		if (nlevels > 0 && (pte & PG_PS) != 0) {
2229 			if (pgsize > 1 * GB) {
2230 				if (!check_only) {
2231 					pfcode = pf_error_code(usermode, prot, 1,
2232 					    pte);
2233 					vm_inject_pf(vcpu, pfcode, gla);
2234 				}
2235 				goto fault;
2236 			}
2237 			break;
2238 		}
2239 
2240 		ptpphys = pte;
2241 	}
2242 
2243 	/* Set the dirty bit in the page table entry if necessary */
2244 	if (!check_only && writable && (pte & PG_M) == 0) {
2245 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2246 			goto restart;
2247 	}
2248 
2249 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2250 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2251 	*gpa = pte | (gla & (pgsize - 1));
2252 done:
2253 	ptp_release(&cookie);
2254 	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2255 	    __func__, retval));
2256 	return (retval);
2257 error:
2258 	retval = EFAULT;
2259 	goto done;
2260 fault:
2261 	*guest_fault = 1;
2262 	goto done;
2263 }
2264 
2265 int
2266 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2267     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2268 {
2269 
2270 	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2271 	    false));
2272 }
2273 
2274 int
2275 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
2276     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2277 {
2278 
2279 	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2280 	    true));
2281 }
2282 
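/*
 * Copy up to VIE_INST_SIZE instruction bytes starting at the guest %rip
 * into 'vie->inst' using the guest's paging state.  '*faultptr' is set if
 * the guest took a fault during the copy setup.
 */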
2283 int
2284 vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
2285     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2286 {
2287 	struct vm_copyinfo copyinfo[2];
2288 	int error, prot;
2289 
2290 	if (inst_length > VIE_INST_SIZE)
2291 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
2292 
2293 	prot = PROT_READ | PROT_EXEC;
2294 	error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
2295 	    copyinfo, nitems(copyinfo), faultptr);
2296 	if (error || *faultptr)
2297 		return (error);
2298 
2299 	vm_copyin(copyinfo, vie->inst, inst_length);
2300 	vm_copy_teardown(copyinfo, nitems(copyinfo));
2301 	vie->num_valid = inst_length;
2302 	return (0);
2303 }
2304 #endif	/* _KERNEL */
2305 
2306 static int
2307 vie_peek(struct vie *vie, uint8_t *x)
2308 {
2309 
2310 	if (vie->num_processed < vie->num_valid) {
2311 		*x = vie->inst[vie->num_processed];
2312 		return (0);
2313 	} else
2314 		return (-1);
2315 }
2316 
2317 static void
2318 vie_advance(struct vie *vie)
2319 {
2320 
2321 	vie->num_processed++;
2322 }
2323 
2324 static bool
2325 segment_override(uint8_t x, int *seg)
2326 {
2327 
2328 	switch (x) {
2329 	case 0x2E:
2330 		*seg = VM_REG_GUEST_CS;
2331 		break;
2332 	case 0x36:
2333 		*seg = VM_REG_GUEST_SS;
2334 		break;
2335 	case 0x3E:
2336 		*seg = VM_REG_GUEST_DS;
2337 		break;
2338 	case 0x26:
2339 		*seg = VM_REG_GUEST_ES;
2340 		break;
2341 	case 0x64:
2342 		*seg = VM_REG_GUEST_FS;
2343 		break;
2344 	case 0x65:
2345 		*seg = VM_REG_GUEST_GS;
2346 		break;
2347 	default:
2348 		return (false);
2349 	}
2350 	return (true);
2351 }
2352 
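/*
 * Consume legacy prefixes (operand/address size, REP/REPNE, segment
 * overrides), then an optional REX prefix in 64-bit mode, then an
 * optional 3-byte VEX prefix (0xC4), and finally derive the effective
 * address and operand sizes for the instruction.
 */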
2353 static int
2354 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2355 {
2356 	uint8_t x;
2357 
2358 	while (1) {
2359 		if (vie_peek(vie, &x))
2360 			return (-1);
2361 
2362 		if (x == 0x66)
2363 			vie->opsize_override = 1;
2364 		else if (x == 0x67)
2365 			vie->addrsize_override = 1;
2366 		else if (x == 0xF3)
2367 			vie->repz_present = 1;
2368 		else if (x == 0xF2)
2369 			vie->repnz_present = 1;
2370 		else if (segment_override(x, &vie->segment_register))
2371 			vie->segment_override = 1;
2372 		else
2373 			break;
2374 
2375 		vie_advance(vie);
2376 	}
2377 
2378 	/*
2379 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2380 	 * - Only one REX prefix is allowed per instruction.
2381 	 * - The REX prefix must immediately precede the opcode byte or the
2382 	 *   escape opcode byte.
2383 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2384 	 *   the mandatory prefix must come before the REX prefix.
2385 	 */
2386 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2387 		vie->rex_present = 1;
2388 		vie->rex_w = x & 0x8 ? 1 : 0;
2389 		vie->rex_r = x & 0x4 ? 1 : 0;
2390 		vie->rex_x = x & 0x2 ? 1 : 0;
2391 		vie->rex_b = x & 0x1 ? 1 : 0;
2392 		vie_advance(vie);
2393 	}
2394 
2395 	/*
2396 	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
2397 	 */
2398 	if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
2399 	    && x == 0xC4) {
2400 		const struct vie_op *optab;
2401 
2402 		/* 3-byte VEX prefix. */
2403 		vie->vex_present = 1;
2404 
2405 		vie_advance(vie);
2406 		if (vie_peek(vie, &x))
2407 			return (-1);
2408 
2409 		/*
2410 		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
2411 		 * relative to REX encoding.
2412 		 */
2413 		vie->rex_r = x & 0x80 ? 0 : 1;
2414 		vie->rex_x = x & 0x40 ? 0 : 1;
2415 		vie->rex_b = x & 0x20 ? 0 : 1;
2416 
2417 		switch (x & 0x1F) {
2418 		case 0x2:
2419 			/* 0F 38. */
2420 			optab = three_byte_opcodes_0f38;
2421 			break;
2422 		case 0x1:
2423 			/* 0F class - nothing handled here yet. */
2424 			/* FALLTHROUGH */
2425 		case 0x3:
2426 			/* 0F 3A class - nothing handled here yet. */
2427 			/* FALLTHROUGH */
2428 		default:
2429 			/* Reserved (#UD). */
2430 			return (-1);
2431 		}
2432 
2433 		vie_advance(vie);
2434 		if (vie_peek(vie, &x))
2435 			return (-1);
2436 
2437 		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
2438 		vie->rex_w = x & 0x80 ? 1 : 0;
2439 
2440 		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
2441 		vie->vex_l = !!(x & 0x4);
2442 		vie->vex_pp = (x & 0x3);
2443 
2444 		/* PP: 1=66 2=F3 3=F2 prefixes. */
2445 		switch (vie->vex_pp) {
2446 		case 0x1:
2447 			vie->opsize_override = 1;
2448 			break;
2449 		case 0x2:
2450 			vie->repz_present = 1;
2451 			break;
2452 		case 0x3:
2453 			vie->repnz_present = 1;
2454 			break;
2455 		}
2456 
2457 		vie_advance(vie);
2458 
2459 		/* Opcode, sans literal prefix. */
2460 		if (vie_peek(vie, &x))
2461 			return (-1);
2462 
2463 		vie->op = optab[x];
2464 		if (vie->op.op_type == VIE_OP_TYPE_NONE)
2465 			return (-1);
2466 
2467 		vie_advance(vie);
2468 	}
2469 
2470 	/*
2471 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2472 	 */
2473 	if (cpu_mode == CPU_MODE_64BIT) {
2474 		/*
2475 		 * Default address size is 64-bits and default operand size
2476 		 * is 32-bits.
2477 		 */
2478 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2479 		if (vie->rex_w)
2480 			vie->opsize = 8;
2481 		else if (vie->opsize_override)
2482 			vie->opsize = 2;
2483 		else
2484 			vie->opsize = 4;
2485 	} else if (cs_d) {
2486 		/* Default address and operand sizes are 32-bits */
2487 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2488 		vie->opsize = vie->opsize_override ? 2 : 4;
2489 	} else {
2490 		/* Default address and operand sizes are 16-bits */
2491 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2492 		vie->opsize = vie->opsize_override ? 4 : 2;
2493 	}
2494 	return (0);
2495 }
2496 
2497 static int
2498 decode_two_byte_opcode(struct vie *vie)
2499 {
2500 	uint8_t x;
2501 
2502 	if (vie_peek(vie, &x))
2503 		return (-1);
2504 
2505 	vie->op = two_byte_opcodes[x];
2506 
2507 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2508 		return (-1);
2509 
2510 	vie_advance(vie);
2511 	return (0);
2512 }
2513 
2514 static int
2515 decode_opcode(struct vie *vie)
2516 {
2517 	uint8_t x;
2518 
2519 	if (vie_peek(vie, &x))
2520 		return (-1);
2521 
2522 	/* Already did this via VEX prefix. */
2523 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
2524 		return (0);
2525 
2526 	vie->op = one_byte_opcodes[x];
2527 
2528 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2529 		return (-1);
2530 
2531 	vie_advance(vie);
2532 
2533 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2534 		return (decode_two_byte_opcode(vie));
2535 
2536 	return (0);
2537 }
2538 
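/*
 * Decode the ModRM byte: mod in bits 7:6, reg in bits 5:3, r/m in bits
 * 2:0.  For example, 0x45 decodes to mod=1, reg=0, rm=5, i.e. a disp8
 * indirect reference through the base register, with REX.R/REX.B
 * extending 'reg' and 'rm' to 4 bits where applicable.
 */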
2539 static int
2540 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2541 {
2542 	uint8_t x;
2543 
2544 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2545 		return (0);
2546 
2547 	if (cpu_mode == CPU_MODE_REAL)
2548 		return (-1);
2549 
2550 	if (vie_peek(vie, &x))
2551 		return (-1);
2552 
2553 	vie->mod = (x >> 6) & 0x3;
2554 	vie->rm =  (x >> 0) & 0x7;
2555 	vie->reg = (x >> 3) & 0x7;
2556 
2557 	/*
2558 	 * A direct addressing mode makes no sense in the context of an EPT
2559 	 * fault. There has to be a memory access involved to cause the
2560 	 * EPT fault.
2561 	 */
2562 	if (vie->mod == VIE_MOD_DIRECT)
2563 		return (-1);
2564 
2565 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2566 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2567 		/*
2568 		 * Table 2-5: Special Cases of REX Encodings
2569 		 *
2570 		 * mod=0, r/m=5 is used in the compatibility mode to
2571 		 * indicate a disp32 without a base register.
2572 		 *
2573 		 * mod!=3, r/m=4 is used in the compatibility mode to
2574 		 * indicate that the SIB byte is present.
2575 		 *
2576 		 * The 'b' bit in the REX prefix is a don't-care in
2577 		 * this case.
2578 		 */
2579 	} else {
2580 		vie->rm |= (vie->rex_b << 3);
2581 	}
2582 
2583 	vie->reg |= (vie->rex_r << 3);
2584 
2585 	/* SIB */
2586 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2587 		goto done;
2588 
2589 	vie->base_register = gpr_map[vie->rm];
2590 
2591 	switch (vie->mod) {
2592 	case VIE_MOD_INDIRECT_DISP8:
2593 		vie->disp_bytes = 1;
2594 		break;
2595 	case VIE_MOD_INDIRECT_DISP32:
2596 		vie->disp_bytes = 4;
2597 		break;
2598 	case VIE_MOD_INDIRECT:
2599 		if (vie->rm == VIE_RM_DISP32) {
2600 			vie->disp_bytes = 4;
2601 			/*
2602 			 * Table 2-7. RIP-Relative Addressing
2603 			 *
2604 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2605 			 * whereas in compatibility mode it just implies disp32.
2606 			 */
2607 
2608 			if (cpu_mode == CPU_MODE_64BIT)
2609 				vie->base_register = VM_REG_GUEST_RIP;
2610 			else
2611 				vie->base_register = VM_REG_LAST;
2612 		}
2613 		break;
2614 	}
2615 
2616 done:
2617 	vie_advance(vie);
2618 
2619 	return (0);
2620 }
2621 
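/*
 * Decode the SIB byte: scale in bits 7:6, index in bits 5:3, base in bits
 * 2:0.  For example, 0x4C decodes to ss=1 (scale 2), index=1, base=4.
 * An index of 4 (%rsp) means "no index register".
 */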
2622 static int
2623 decode_sib(struct vie *vie)
2624 {
2625 	uint8_t x;
2626 
2627 	/* Proceed only if SIB byte is present */
2628 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2629 		return (0);
2630 
2631 	if (vie_peek(vie, &x))
2632 		return (-1);
2633 
2634 	/* De-construct the SIB byte */
2635 	vie->ss = (x >> 6) & 0x3;
2636 	vie->index = (x >> 3) & 0x7;
2637 	vie->base = (x >> 0) & 0x7;
2638 
2639 	/* Apply the REX prefix modifiers */
2640 	vie->index |= vie->rex_x << 3;
2641 	vie->base |= vie->rex_b << 3;
2642 
2643 	switch (vie->mod) {
2644 	case VIE_MOD_INDIRECT_DISP8:
2645 		vie->disp_bytes = 1;
2646 		break;
2647 	case VIE_MOD_INDIRECT_DISP32:
2648 		vie->disp_bytes = 4;
2649 		break;
2650 	}
2651 
2652 	if (vie->mod == VIE_MOD_INDIRECT &&
2653 	    (vie->base == 5 || vie->base == 13)) {
2654 		/*
2655 		 * Special case: the base register is unused if mod = 0
2656 		 * and base = %rbp or %r13.
2657 		 *
2658 		 * Documented in:
2659 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2660 		 * Table 2-5: Special Cases of REX Encodings
2661 		 */
2662 		vie->disp_bytes = 4;
2663 	} else {
2664 		vie->base_register = gpr_map[vie->base];
2665 	}
2666 
2667 	/*
2668 	 * All encodings of 'index' are valid except for %rsp (4).
2669 	 *
2670 	 * Documented in:
2671 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2672 	 * Table 2-5: Special Cases of REX Encodings
2673 	 */
2674 	if (vie->index != 4)
2675 		vie->index_register = gpr_map[vie->index];
2676 
2677 	/* 'scale' makes sense only in the context of an index register */
2678 	if (vie->index_register < VM_REG_LAST)
2679 		vie->scale = 1 << vie->ss;
2680 
2681 	vie_advance(vie);
2682 
2683 	return (0);
2684 }
2685 
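/*
 * Fetch the 1- or 4-byte displacement selected by the ModRM/SIB decode
 * and sign-extend it; e.g. a disp8 of 0xf8 becomes -8.
 */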
2686 static int
2687 decode_displacement(struct vie *vie)
2688 {
2689 	int n, i;
2690 	uint8_t x;
2691 
2692 	union {
2693 		char	buf[4];
2694 		int8_t	signed8;
2695 		int32_t	signed32;
2696 	} u;
2697 
2698 	if ((n = vie->disp_bytes) == 0)
2699 		return (0);
2700 
2701 	if (n != 1 && n != 4)
2702 		panic("decode_displacement: invalid disp_bytes %d", n);
2703 
2704 	for (i = 0; i < n; i++) {
2705 		if (vie_peek(vie, &x))
2706 			return (-1);
2707 
2708 		u.buf[i] = x;
2709 		vie_advance(vie);
2710 	}
2711 
2712 	if (n == 1)
2713 		vie->displacement = u.signed8;		/* sign-extended */
2714 	else
2715 		vie->displacement = u.signed32;		/* sign-extended */
2716 
2717 	return (0);
2718 }
2719 
2720 static int
2721 decode_immediate(struct vie *vie)
2722 {
2723 	int i, n;
2724 	uint8_t x;
2725 	union {
2726 		char	buf[4];
2727 		int8_t	signed8;
2728 		int16_t	signed16;
2729 		int32_t	signed32;
2730 	} u;
2731 
2732 	/* Figure out immediate operand size (if any) */
2733 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2734 		/*
2735 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2736 		 * In 64-bit mode the typical size of immediate operands
2737 		 * remains 32-bits. When the operand size is 64-bits, the
2738 		 * processor sign-extends all immediates to 64-bits prior
2739 		 * to their use.
2740 		 */
2741 		if (vie->opsize == 4 || vie->opsize == 8)
2742 			vie->imm_bytes = 4;
2743 		else
2744 			vie->imm_bytes = 2;
2745 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2746 		vie->imm_bytes = 1;
2747 	}
2748 
2749 	if ((n = vie->imm_bytes) == 0)
2750 		return (0);
2751 
2752 	KASSERT(n == 1 || n == 2 || n == 4,
2753 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2754 
2755 	for (i = 0; i < n; i++) {
2756 		if (vie_peek(vie, &x))
2757 			return (-1);
2758 
2759 		u.buf[i] = x;
2760 		vie_advance(vie);
2761 	}
2762 
2763 	/* sign-extend the immediate value before use */
2764 	if (n == 1)
2765 		vie->immediate = u.signed8;
2766 	else if (n == 2)
2767 		vie->immediate = u.signed16;
2768 	else
2769 		vie->immediate = u.signed32;
2770 
2771 	return (0);
2772 }
2773 
2774 static int
2775 decode_moffset(struct vie *vie)
2776 {
2777 	int i, n;
2778 	uint8_t x;
2779 	union {
2780 		char	buf[8];
2781 		uint64_t u64;
2782 	} u;
2783 
2784 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2785 		return (0);
2786 
2787 	/*
2788 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2789 	 * The memory offset size follows the address-size of the instruction.
2790 	 */
2791 	n = vie->addrsize;
2792 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2793 
2794 	u.u64 = 0;
2795 	for (i = 0; i < n; i++) {
2796 		if (vie_peek(vie, &x))
2797 			return (-1);
2798 
2799 		u.buf[i] = x;
2800 		vie_advance(vie);
2801 	}
2802 	vie->displacement = u.u64;
2803 	return (0);
2804 }
2805 
2806 #ifdef _KERNEL
2807 /*
2808  * Verify that the 'guest linear address' provided as collateral of the nested
2809  * page table fault matches with our instruction decoding.
2810  */
2811 static int
2812 verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
2813     enum vm_cpu_mode cpu_mode)
2814 {
2815 	int error;
2816 	uint64_t base, segbase, idx, gla2;
2817 	enum vm_reg_name seg;
2818 	struct seg_desc desc;
2819 
2820 	/* Skip 'gla' verification */
2821 	if (gla == VIE_INVALID_GLA)
2822 		return (0);
2823 
2824 	base = 0;
2825 	if (vie->base_register != VM_REG_LAST) {
2826 		error = vm_get_register(vcpu, vie->base_register, &base);
2827 		if (error) {
2828 			printf("verify_gla: error %d getting base reg %d\n",
2829 				error, vie->base_register);
2830 			return (-1);
2831 		}
2832 
2833 		/*
2834 		 * RIP-relative addressing starts from the following
2835 		 * instruction
2836 		 */
2837 		if (vie->base_register == VM_REG_GUEST_RIP)
2838 			base += vie->num_processed;
2839 	}
2840 
2841 	idx = 0;
2842 	if (vie->index_register != VM_REG_LAST) {
2843 		error = vm_get_register(vcpu, vie->index_register, &idx);
2844 		if (error) {
2845 			printf("verify_gla: error %d getting index reg %d\n",
2846 				error, vie->index_register);
2847 			return (-1);
2848 		}
2849 	}
2850 
2851 	/*
2852 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2853 	 *
2854 	 * In 64-bit mode, segmentation is generally (but not
2855 	 * completely) disabled.  The exceptions are the FS and GS
2856 	 * segments.
2857 	 *
2858 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2859 	 * as the base, the SS segment is the default segment.  For
2860 	 * other data references, except when relative to stack or
2861 	 * string destination the DS segment is the default.  These
2862 	 * can be overridden to allow other segments to be accessed.
2863 	 */
2864 	if (vie->segment_override)
2865 		seg = vie->segment_register;
2866 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2867 	    vie->base_register == VM_REG_GUEST_RBP)
2868 		seg = VM_REG_GUEST_SS;
2869 	else
2870 		seg = VM_REG_GUEST_DS;
2871 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2872 	    seg != VM_REG_GUEST_GS) {
2873 		segbase = 0;
2874 	} else {
2875 		error = vm_get_seg_desc(vcpu, seg, &desc);
2876 		if (error) {
2877 			printf("verify_gla: error %d getting segment"
2878 			       " descriptor %d", error,
2879 			       vie->segment_register);
2880 			return (-1);
2881 		}
2882 		segbase = desc.base;
2883 	}
2884 
2885 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2886 	gla2 &= size2mask[vie->addrsize];
2887 	if (gla != gla2) {
2888 		printf("verify_gla mismatch: segbase(0x%0lx)"
2889 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2890 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2891 		       segbase, base, vie->scale, idx, vie->displacement,
2892 		       gla, gla2);
2893 		return (-1);
2894 	}
2895 
2896 	return (0);
2897 }
2898 #endif	/* _KERNEL */
2899 
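/*
 * Decode the fetched instruction bytes in a fixed order: prefixes,
 * opcode, ModRM, SIB, displacement, immediate and moffset.  In the kernel
 * the computed effective address is also cross-checked against the 'gla'
 * reported by the nested page fault, unless the opcode opts out via
 * VIE_OP_F_NO_GLA_VERIFICATION.
 */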
2900 int
2901 #ifdef _KERNEL
2902 vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
2903 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2904 #else
2905 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2906 #endif
2907 {
2908 
2909 	if (decode_prefixes(vie, cpu_mode, cs_d))
2910 		return (-1);
2911 
2912 	if (decode_opcode(vie))
2913 		return (-1);
2914 
2915 	if (decode_modrm(vie, cpu_mode))
2916 		return (-1);
2917 
2918 	if (decode_sib(vie))
2919 		return (-1);
2920 
2921 	if (decode_displacement(vie))
2922 		return (-1);
2923 
2924 	if (decode_immediate(vie))
2925 		return (-1);
2926 
2927 	if (decode_moffset(vie))
2928 		return (-1);
2929 
2930 #ifdef _KERNEL
2931 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2932 		if (verify_gla(vcpu, gla, vie, cpu_mode))
2933 			return (-1);
2934 	}
2935 #endif
2936 
2937 	vie->decoded = 1;	/* success */
2938 
2939 	return (0);
2940 }
2941