1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2012 Sandvine, Inc.
5 * Copyright (c) 2012 NetApp, Inc.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 #ifdef _KERNEL
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/proc.h>
36
37 #include <vm/vm.h>
38 #include <vm/pmap.h>
39
40 #include <machine/vmparam.h>
41 #include <machine/vmm.h>
42
43 #include <dev/vmm/vmm_mem.h>
44 #else /* !_KERNEL */
45 #include <sys/types.h>
46 #include <sys/errno.h>
47 #include <sys/_iovec.h>
48
49 #include <machine/vmm.h>
50
51 #include <err.h>
52 #include <assert.h>
53 #include <stdbool.h>
54 #include <stddef.h>
55 #include <stdio.h>
56 #include <string.h>
57 #include <strings.h>
58 #include <vmmapi.h>
59 #define __diagused
60 #define KASSERT(exp,msg) assert((exp))
61 #define panic(...) errx(4, __VA_ARGS__)
62 #endif /* _KERNEL */
63
64 #include <machine/vmm_instruction_emul.h>
65 #include <x86/psl.h>
66 #include <x86/specialreg.h>
67
68 /* struct vie_op.op_flags */
69 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
70 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
71 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
72 #define VIE_OP_F_NO_MODRM (1 << 3)
73 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
74
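/*
 * Opcode decode tables, indexed by opcode byte. Opcodes without an
 * entry here are not emulated.
 */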
75 static const struct vie_op three_byte_opcodes_0f38[256] = {
76 [0xF7] = {
77 .op_byte = 0xF7,
78 .op_type = VIE_OP_TYPE_BEXTR,
79 },
80 };
81
82 static const struct vie_op two_byte_opcodes[256] = {
83 [0xAE] = {
84 .op_byte = 0xAE,
85 .op_type = VIE_OP_TYPE_TWOB_GRP15,
86 },
87 [0xB6] = {
88 .op_byte = 0xB6,
89 .op_type = VIE_OP_TYPE_MOVZX,
90 },
91 [0xB7] = {
92 .op_byte = 0xB7,
93 .op_type = VIE_OP_TYPE_MOVZX,
94 },
95 [0xBA] = {
96 .op_byte = 0xBA,
97 .op_type = VIE_OP_TYPE_BITTEST,
98 .op_flags = VIE_OP_F_IMM8,
99 },
100 [0xBE] = {
101 .op_byte = 0xBE,
102 .op_type = VIE_OP_TYPE_MOVSX,
103 },
104 };
105
106 static const struct vie_op one_byte_opcodes[256] = {
107 [0x03] = {
108 .op_byte = 0x03,
109 .op_type = VIE_OP_TYPE_ADD,
110 },
111 [0x0F] = {
112 .op_byte = 0x0F,
113 .op_type = VIE_OP_TYPE_TWO_BYTE
114 },
115 [0x0B] = {
116 .op_byte = 0x0B,
117 .op_type = VIE_OP_TYPE_OR,
118 },
119 [0x2B] = {
120 .op_byte = 0x2B,
121 .op_type = VIE_OP_TYPE_SUB,
122 },
123 [0x39] = {
124 .op_byte = 0x39,
125 .op_type = VIE_OP_TYPE_CMP,
126 },
127 [0x3B] = {
128 .op_byte = 0x3B,
129 .op_type = VIE_OP_TYPE_CMP,
130 },
131 [0x6E] = {
132 .op_byte = 0x6E,
133 .op_type = VIE_OP_TYPE_OUTS,
134 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
135 },
136 [0x6F] = {
137 .op_byte = 0x6F,
138 .op_type = VIE_OP_TYPE_OUTS,
139 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION,
140 },
141 [0x88] = {
142 .op_byte = 0x88,
143 .op_type = VIE_OP_TYPE_MOV,
144 },
145 [0x89] = {
146 .op_byte = 0x89,
147 .op_type = VIE_OP_TYPE_MOV,
148 },
149 [0x8A] = {
150 .op_byte = 0x8A,
151 .op_type = VIE_OP_TYPE_MOV,
152 },
153 [0x8B] = {
154 .op_byte = 0x8B,
155 .op_type = VIE_OP_TYPE_MOV,
156 },
157 [0xA1] = {
158 .op_byte = 0xA1,
159 .op_type = VIE_OP_TYPE_MOV,
160 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
161 },
162 [0xA3] = {
163 .op_byte = 0xA3,
164 .op_type = VIE_OP_TYPE_MOV,
165 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
166 },
167 [0xA4] = {
168 .op_byte = 0xA4,
169 .op_type = VIE_OP_TYPE_MOVS,
170 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
171 },
172 [0xA5] = {
173 .op_byte = 0xA5,
174 .op_type = VIE_OP_TYPE_MOVS,
175 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
176 },
177 [0xAA] = {
178 .op_byte = 0xAA,
179 .op_type = VIE_OP_TYPE_STOS,
180 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
181 },
182 [0xAB] = {
183 .op_byte = 0xAB,
184 .op_type = VIE_OP_TYPE_STOS,
185 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
186 },
187 [0xC6] = {
188 /* XXX Group 11 extended opcode - not just MOV */
189 .op_byte = 0xC6,
190 .op_type = VIE_OP_TYPE_MOV,
191 .op_flags = VIE_OP_F_IMM8,
192 },
193 [0xC7] = {
194 .op_byte = 0xC7,
195 .op_type = VIE_OP_TYPE_MOV,
196 .op_flags = VIE_OP_F_IMM,
197 },
198 [0x23] = {
199 .op_byte = 0x23,
200 .op_type = VIE_OP_TYPE_AND,
201 },
202 [0x80] = {
203 /* Group 1 extended opcode */
204 .op_byte = 0x80,
205 .op_type = VIE_OP_TYPE_GROUP1,
206 .op_flags = VIE_OP_F_IMM8,
207 },
208 [0x81] = {
209 /* Group 1 extended opcode */
210 .op_byte = 0x81,
211 .op_type = VIE_OP_TYPE_GROUP1,
212 .op_flags = VIE_OP_F_IMM,
213 },
214 [0x83] = {
215 /* Group 1 extended opcode */
216 .op_byte = 0x83,
217 .op_type = VIE_OP_TYPE_GROUP1,
218 .op_flags = VIE_OP_F_IMM8,
219 },
220 [0x8F] = {
221 /* XXX Group 1A extended opcode - not just POP */
222 .op_byte = 0x8F,
223 .op_type = VIE_OP_TYPE_POP,
224 },
225 [0xF6] = {
226 /* XXX Group 3 extended opcode - not just TEST */
227 .op_byte = 0xF6,
228 .op_type = VIE_OP_TYPE_TEST,
229 .op_flags = VIE_OP_F_IMM8,
230 },
231 [0xF7] = {
232 /* XXX Group 3 extended opcode - not just TEST */
233 .op_byte = 0xF7,
234 .op_type = VIE_OP_TYPE_TEST,
235 .op_flags = VIE_OP_F_IMM,
236 },
237 [0xFF] = {
238 /* XXX Group 5 extended opcode - not just PUSH */
239 .op_byte = 0xFF,
240 .op_type = VIE_OP_TYPE_PUSH,
241 }
242 };
243
244 /* struct vie.mod */
245 #define VIE_MOD_INDIRECT 0
246 #define VIE_MOD_INDIRECT_DISP8 1
247 #define VIE_MOD_INDIRECT_DISP32 2
248 #define VIE_MOD_DIRECT 3
249
250 /* struct vie.rm */
251 #define VIE_RM_SIB 4
252 #define VIE_RM_DISP32 5
253
254 #define GB (1024 * 1024 * 1024)
255
256 static enum vm_reg_name gpr_map[16] = {
257 VM_REG_GUEST_RAX,
258 VM_REG_GUEST_RCX,
259 VM_REG_GUEST_RDX,
260 VM_REG_GUEST_RBX,
261 VM_REG_GUEST_RSP,
262 VM_REG_GUEST_RBP,
263 VM_REG_GUEST_RSI,
264 VM_REG_GUEST_RDI,
265 VM_REG_GUEST_R8,
266 VM_REG_GUEST_R9,
267 VM_REG_GUEST_R10,
268 VM_REG_GUEST_R11,
269 VM_REG_GUEST_R12,
270 VM_REG_GUEST_R13,
271 VM_REG_GUEST_R14,
272 VM_REG_GUEST_R15
273 };
274
275 static uint64_t size2mask[] = {
276 [1] = 0xff,
277 [2] = 0xffff,
278 [4] = 0xffffffff,
279 [8] = 0xffffffffffffffff,
280 };
281
282 static int
283 vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval)
284 {
285 int error;
286
287 error = vm_get_register(vcpu, reg, rval);
288
289 return (error);
290 }
291
292 static void
293 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
294 {
295 *lhbr = 0;
296 *reg = gpr_map[vie->reg];
297
298 /*
299 * 64-bit mode imposes limitations on accessing legacy high byte
300 * registers (lhbr).
301 *
302 * The legacy high-byte registers cannot be addressed if the REX
303 * prefix is present. In this case the values 4, 5, 6 and 7 of the
304 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
305 *
306 * If the REX prefix is not present then the values 4, 5, 6 and 7
307 * of the 'ModRM:reg' field address the legacy high-byte registers,
308 * %ah, %ch, %dh and %bh respectively.
309 */
310 if (!vie->rex_present) {
311 if (vie->reg & 0x4) {
312 *lhbr = 1;
313 *reg = gpr_map[vie->reg & 0x3];
314 }
315 }
316 }
317
318 static int
319 vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval)
320 {
321 uint64_t val;
322 int error, lhbr;
323 enum vm_reg_name reg;
324
325 vie_calc_bytereg(vie, &reg, &lhbr);
326 error = vm_get_register(vcpu, reg, &val);
327
328 /*
329 * To obtain the value of a legacy high byte register shift the
330 * base register right by 8 bits (%ah = %rax >> 8).
331 */
332 if (lhbr)
333 *rval = val >> 8;
334 else
335 *rval = val;
336 return (error);
337 }
338
339 static int
340 vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte)
341 {
342 uint64_t origval, val, mask;
343 int error, lhbr;
344 enum vm_reg_name reg;
345
346 vie_calc_bytereg(vie, &reg, &lhbr);
347 error = vm_get_register(vcpu, reg, &origval);
348 if (error == 0) {
349 val = byte;
350 mask = 0xff;
351 if (lhbr) {
352 /*
353 * Shift left by 8 to store 'byte' in a legacy high
354 * byte register.
355 */
356 val <<= 8;
357 mask <<= 8;
358 }
359 val |= origval & ~mask;
360 error = vm_set_register(vcpu, reg, val);
361 }
362 return (error);
363 }
364
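/*
 * Write 'val' to 'reg' honoring x86-64 partial-register semantics:
 * 1- and 2-byte writes preserve the upper bits of the destination,
 * 4-byte writes zero-extend into the upper 32 bits, and 8-byte writes
 * replace the register entirely.
 */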
365 int
366 vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
367 uint64_t val, int size)
368 {
369 int error;
370 uint64_t origval;
371
372 switch (size) {
373 case 1:
374 case 2:
375 error = vie_read_register(vcpu, reg, &origval);
376 if (error)
377 return (error);
378 val &= size2mask[size];
379 val |= origval & ~size2mask[size];
380 break;
381 case 4:
382 val &= 0xffffffffUL;
383 break;
384 case 8:
385 break;
386 default:
387 return (EINVAL);
388 }
389
390 error = vm_set_register(vcpu, reg, val);
391 return (error);
392 }
393
394 #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
395
396 /*
397 * Return the status flags that would result from doing (x - y).
398 */
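/*
 * The subtraction is performed natively and the resulting %rflags is
 * captured with pushfq/popq.
 */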
399 #define GETCC(sz) \
400 static u_long \
401 getcc##sz(uint##sz##_t x, uint##sz##_t y) \
402 { \
403 u_long rflags; \
404 \
405 __asm __volatile("sub %2,%1; pushfq; popq %0" : \
406 "=r" (rflags), "+r" (x) : "m" (y)); \
407 return (rflags); \
408 } struct __hack
409
410 GETCC(8);
411 GETCC(16);
412 GETCC(32);
413 GETCC(64);
414
415 static u_long
416 getcc(int opsize, uint64_t x, uint64_t y)
417 {
418 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
419 ("getcc: invalid operand size %d", opsize));
420
421 if (opsize == 1)
422 return (getcc8(x, y));
423 else if (opsize == 2)
424 return (getcc16(x, y));
425 else if (opsize == 4)
426 return (getcc32(x, y));
427 else
428 return (getcc64(x, y));
429 }
430
431 /*
432 * Return the status flags that would result from doing (x + y).
433 */
434 #define GETADDFLAGS(sz) \
435 static u_long \
436 getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \
437 { \
438 u_long rflags; \
439 \
440 __asm __volatile("add %2,%1; pushfq; popq %0" : \
441 "=r" (rflags), "+r" (x) : "m" (y)); \
442 return (rflags); \
443 } struct __hack
444
445 GETADDFLAGS(8);
446 GETADDFLAGS(16);
447 GETADDFLAGS(32);
448 GETADDFLAGS(64);
449
450 static u_long
451 getaddflags(int opsize, uint64_t x, uint64_t y)
452 {
453 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
454 ("getaddflags: invalid operand size %d", opsize));
455
456 if (opsize == 1)
457 return (getaddflags8(x, y));
458 else if (opsize == 2)
459 return (getaddflags16(x, y));
460 else if (opsize == 4)
461 return (getaddflags32(x, y));
462 else
463 return (getaddflags64(x, y));
464 }
465
466 /*
467 * Return the status flags that would result from doing (x & y).
468 */
469 #define GETANDFLAGS(sz) \
470 static u_long \
471 getandflags##sz(uint##sz##_t x, uint##sz##_t y) \
472 { \
473 u_long rflags; \
474 \
475 __asm __volatile("and %2,%1; pushfq; popq %0" : \
476 "=r" (rflags), "+r" (x) : "m" (y)); \
477 return (rflags); \
478 } struct __hack
479
480 GETANDFLAGS(8);
481 GETANDFLAGS(16);
482 GETANDFLAGS(32);
483 GETANDFLAGS(64);
484
485 static u_long
486 getandflags(int opsize, uint64_t x, uint64_t y)
487 {
488 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
489 ("getandflags: invalid operand size %d", opsize));
490
491 if (opsize == 1)
492 return (getandflags8(x, y));
493 else if (opsize == 2)
494 return (getandflags16(x, y));
495 else if (opsize == 4)
496 return (getandflags32(x, y));
497 else
498 return (getandflags64(x, y));
499 }
500
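/*
 * Emulate a MOV whose memory operand is the guest-physical address
 * 'gpa'; accesses to that operand go through the memread/memwrite
 * callbacks while register operands are decoded from 'vie'.
 */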
501 static int
502 emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
503 mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
504 {
505 int error, size;
506 enum vm_reg_name reg;
507 uint8_t byte;
508 uint64_t val;
509
510 size = vie->opsize;
511 error = EINVAL;
512
513 switch (vie->op.op_byte) {
514 case 0x88:
515 /*
516 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
517 * 88/r: mov r/m8, r8
518 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
519 */
520 size = 1; /* override for byte operation */
521 error = vie_read_bytereg(vcpu, vie, &byte);
522 if (error == 0)
523 error = memwrite(vcpu, gpa, byte, size, arg);
524 break;
525 case 0x89:
526 /*
527 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
528 * 89/r: mov r/m16, r16
529 * 89/r: mov r/m32, r32
530 * REX.W + 89/r mov r/m64, r64
531 */
532 reg = gpr_map[vie->reg];
533 error = vie_read_register(vcpu, reg, &val);
534 if (error == 0) {
535 val &= size2mask[size];
536 error = memwrite(vcpu, gpa, val, size, arg);
537 }
538 break;
539 case 0x8A:
540 /*
541 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
542 * 8A/r: mov r8, r/m8
543 * REX + 8A/r: mov r8, r/m8
544 */
545 size = 1; /* override for byte operation */
546 error = memread(vcpu, gpa, &val, size, arg);
547 if (error == 0)
548 error = vie_write_bytereg(vcpu, vie, val);
549 break;
550 case 0x8B:
551 /*
552 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
553 * 8B/r: mov r16, r/m16
554 * 8B/r: mov r32, r/m32
555 * REX.W 8B/r: mov r64, r/m64
556 */
557 error = memread(vcpu, gpa, &val, size, arg);
558 if (error == 0) {
559 reg = gpr_map[vie->reg];
560 error = vie_update_register(vcpu, reg, val, size);
561 }
562 break;
563 case 0xA1:
564 /*
565 * MOV from seg:moffset to AX/EAX/RAX
566 * A1: mov AX, moffs16
567 * A1: mov EAX, moffs32
568 * REX.W + A1: mov RAX, moffs64
569 */
570 error = memread(vcpu, gpa, &val, size, arg);
571 if (error == 0) {
572 reg = VM_REG_GUEST_RAX;
573 error = vie_update_register(vcpu, reg, val, size);
574 }
575 break;
576 case 0xA3:
577 /*
578 * MOV from AX/EAX/RAX to seg:moffset
579 * A3: mov moffs16, AX
580 * A3: mov moffs32, EAX
581 * REX.W + A3: mov moffs64, RAX
582 */
583 error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
584 if (error == 0) {
585 val &= size2mask[size];
586 error = memwrite(vcpu, gpa, val, size, arg);
587 }
588 break;
589 case 0xC6:
590 /*
591 * MOV from imm8 to mem (ModRM:r/m)
592 * C6/0 mov r/m8, imm8
593 * REX + C6/0 mov r/m8, imm8
594 */
595 size = 1; /* override for byte operation */
596 error = memwrite(vcpu, gpa, vie->immediate, size, arg);
597 break;
598 case 0xC7:
599 /*
600 * MOV from imm16/imm32 to mem (ModRM:r/m)
601 * C7/0 mov r/m16, imm16
602 * C7/0 mov r/m32, imm32
603 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
604 */
605 val = vie->immediate & size2mask[size];
606 error = memwrite(vcpu, gpa, val, size, arg);
607 break;
608 default:
609 break;
610 }
611
612 return (error);
613 }
614
615 static int
616 emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
617 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
618 {
619 int error, size;
620 enum vm_reg_name reg;
621 uint64_t val;
622
623 size = vie->opsize;
624 error = EINVAL;
625
626 switch (vie->op.op_byte) {
627 case 0xB6:
628 /*
629 * MOV and zero extend byte from mem (ModRM:r/m) to
630 * reg (ModRM:reg).
631 *
632 * 0F B6/r movzx r16, r/m8
633 * 0F B6/r movzx r32, r/m8
634 * REX.W + 0F B6/r movzx r64, r/m8
635 */
636
637 /* get the first operand */
638 error = memread(vcpu, gpa, &val, 1, arg);
639 if (error)
640 break;
641
642 /* get the second operand */
643 reg = gpr_map[vie->reg];
644
645 /* zero-extend byte */
646 val = (uint8_t)val;
647
648 /* write the result */
649 error = vie_update_register(vcpu, reg, val, size);
650 break;
651 case 0xB7:
652 /*
653 * MOV and zero extend word from mem (ModRM:r/m) to
654 * reg (ModRM:reg).
655 *
656 * 0F B7/r movzx r32, r/m16
657 * REX.W + 0F B7/r movzx r64, r/m16
658 */
659 error = memread(vcpu, gpa, &val, 2, arg);
660 if (error)
661 return (error);
662
663 reg = gpr_map[vie->reg];
664
665 /* zero-extend word */
666 val = (uint16_t)val;
667
668 error = vie_update_register(vcpu, reg, val, size);
669 break;
670 case 0xBE:
671 /*
672 * MOV and sign extend byte from mem (ModRM:r/m) to
673 * reg (ModRM:reg).
674 *
675 * 0F BE/r movsx r16, r/m8
676 * 0F BE/r movsx r32, r/m8
677 * REX.W + 0F BE/r movsx r64, r/m8
678 */
679
680 /* get the first operand */
681 error = memread(vcpu, gpa, &val, 1, arg);
682 if (error)
683 break;
684
685 /* get the second operand */
686 reg = gpr_map[vie->reg];
687
688 /* sign extend byte */
689 val = (int8_t)val;
690
691 /* write the result */
692 error = vie_update_register(vcpu, reg, val, size);
693 break;
694 default:
695 break;
696 }
697 return (error);
698 }
699
700 /*
701 * Helper function to calculate and validate a linear address.
702 */
703 static int
704 get_gla(struct vcpu *vcpu, struct vie *vie __unused,
705 struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
706 enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
707 {
708 struct seg_desc desc;
709 uint64_t cr0, val, rflags;
710 int error __diagused;
711
712 error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
713 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
714
715 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
716 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
717
718 error = vm_get_seg_desc(vcpu, seg, &desc);
719 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
720 __func__, error, seg));
721
722 error = vie_read_register(vcpu, gpr, &val);
723 KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
724 error, gpr));
725
726 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
727 addrsize, prot, gla)) {
728 if (seg == VM_REG_GUEST_SS)
729 vm_inject_ss(vcpu, 0);
730 else
731 vm_inject_gp(vcpu);
732 goto guest_fault;
733 }
734
735 if (vie_canonical_check(paging->cpu_mode, *gla)) {
736 if (seg == VM_REG_GUEST_SS)
737 vm_inject_ss(vcpu, 0);
738 else
739 vm_inject_gp(vcpu);
740 goto guest_fault;
741 }
742
743 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
744 vm_inject_ac(vcpu, 0);
745 goto guest_fault;
746 }
747
748 *fault = 0;
749 return (0);
750
751 guest_fault:
752 *fault = 1;
753 return (0);
754 }
755
756 static int
757 emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
758 struct vm_guest_paging *paging, mem_region_read_t memread,
759 mem_region_write_t memwrite, void *arg)
760 {
761 #ifdef _KERNEL
762 struct vm_copyinfo copyinfo[2];
763 #else
764 struct iovec copyinfo[2];
765 #endif
766 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
767 uint64_t rcx, rdi, rsi, rflags;
768 int error, fault, opsize, seg, repeat;
769
770 opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
771 val = 0;
772 error = 0;
773
774 /*
775 * XXX although the MOVS instruction is only supposed to be used with
776 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
777 *
778 * Empirically the "repnz" prefix has identical behavior to "rep"
779 * and the zero flag does not make a difference.
780 */
781 repeat = vie->repz_present | vie->repnz_present;
782
783 if (repeat) {
784 error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
785 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
786
787 /*
788 * The count register is %rcx, %ecx or %cx depending on the
789 * address size of the instruction.
790 */
791 if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
792 error = 0;
793 goto done;
794 }
795 }
796
797 /*
798 * Source Destination Comments
799 * --------------------------------------------
800 * (1) memory memory n/a
801 * (2) memory mmio emulated
802 * (3) mmio memory emulated
803 * (4) mmio mmio emulated
804 *
805 * At this point we don't have sufficient information to distinguish
806 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
807 * out because it will succeed only when operating on regular memory.
808 *
809 * XXX the emulation doesn't properly handle the case where 'gpa'
810 * is straddling the boundary between the normal memory and MMIO.
811 */
812
813 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
814 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
815 PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
816 if (error || fault)
817 goto done;
818
819 error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ,
820 copyinfo, nitems(copyinfo), &fault);
821 if (error == 0) {
822 if (fault)
823 goto done; /* Resume guest to handle fault */
824
825 /*
826 * case (2): read from system memory and write to mmio.
827 */
828 vm_copyin(copyinfo, &val, opsize);
829 vm_copy_teardown(copyinfo, nitems(copyinfo));
830 error = memwrite(vcpu, gpa, val, opsize, arg);
831 if (error)
832 goto done;
833 } else {
834 /*
835 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
836 * if 'srcaddr' is in the mmio space.
837 */
838
839 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
840 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
841 &fault);
842 if (error || fault)
843 goto done;
844
845 error = vm_copy_setup(vcpu, paging, dstaddr, opsize,
846 PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
847 if (error == 0) {
848 if (fault)
849 goto done; /* Resume guest to handle fault */
850
851 /*
852 * case (3): read from MMIO and write to system memory.
853 *
854 * An MMIO read can have side-effects so we
855 * commit to it only after vm_copy_setup() is
856 * successful. If a page-fault needs to be
857 * injected into the guest then it will happen
858 * before the MMIO read is attempted.
859 */
860 error = memread(vcpu, gpa, &val, opsize, arg);
861 if (error)
862 goto done;
863
864 vm_copyout(&val, copyinfo, opsize);
865 vm_copy_teardown(copyinfo, nitems(copyinfo));
866 } else {
867 /*
868 * Case (4): read from and write to mmio.
869 *
870 * Commit to the MMIO read/write (with potential
871 * side-effects) only after we are sure that the
872 * instruction is not going to be restarted due
873 * to address translation faults.
874 */
875 error = vm_gla2gpa(vcpu, paging, srcaddr,
876 PROT_READ, &srcgpa, &fault);
877 if (error || fault)
878 goto done;
879
880 error = vm_gla2gpa(vcpu, paging, dstaddr,
881 PROT_WRITE, &dstgpa, &fault);
882 if (error || fault)
883 goto done;
884
885 error = memread(vcpu, srcgpa, &val, opsize, arg);
886 if (error)
887 goto done;
888
889 error = memwrite(vcpu, dstgpa, val, opsize, arg);
890 if (error)
891 goto done;
892 }
893 }
894
895 error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi);
896 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
897
898 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
899 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
900
901 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
902 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
903
904 if (rflags & PSL_D) {
905 rsi -= opsize;
906 rdi -= opsize;
907 } else {
908 rsi += opsize;
909 rdi += opsize;
910 }
911
912 error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi,
913 vie->addrsize);
914 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
915
916 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
917 vie->addrsize);
918 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
919
920 if (repeat) {
921 rcx = rcx - 1;
922 error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
923 rcx, vie->addrsize);
924 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
925
926 /*
927 * Repeat the instruction if the count register is not zero.
928 */
929 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
930 vm_restart_instruction(vcpu);
931 }
932 done:
933 KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
934 __func__, error));
935 return (error);
936 }
937
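/*
 * Emulate STOS: store %al/%ax/%eax/%rax to the location addressed by
 * %es:%rdi (already translated to 'gpa'), then advance %rdi according
 * to the direction flag and handle any rep prefix via %rcx.
 */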
938 static int
939 emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
940 struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused,
941 mem_region_write_t memwrite, void *arg)
942 {
943 int error, opsize, repeat;
944 uint64_t val;
945 uint64_t rcx, rdi, rflags;
946
947 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
948 repeat = vie->repz_present | vie->repnz_present;
949
950 if (repeat) {
951 error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
952 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
953
954 /*
955 * The count register is %rcx, %ecx or %cx depending on the
956 * address size of the instruction.
957 */
958 if ((rcx & vie_size2mask(vie->addrsize)) == 0)
959 return (0);
960 }
961
962 error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
963 KASSERT(!error, ("%s: error %d getting rax", __func__, error));
964
965 error = memwrite(vcpu, gpa, val, opsize, arg);
966 if (error)
967 return (error);
968
969 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
970 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
971
972 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
973 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
974
975 if (rflags & PSL_D)
976 rdi -= opsize;
977 else
978 rdi += opsize;
979
980 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
981 vie->addrsize);
982 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
983
984 if (repeat) {
985 rcx = rcx - 1;
986 error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
987 rcx, vie->addrsize);
988 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
989
990 /*
991 * Repeat the instruction if the count register is not zero.
992 */
993 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
994 vm_restart_instruction(vcpu);
995 }
996
997 return (0);
998 }
999
1000 static int
1001 emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1002 mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1003 {
1004 int error, size;
1005 enum vm_reg_name reg;
1006 uint64_t result, rflags, rflags2, val1, val2;
1007
1008 size = vie->opsize;
1009 error = EINVAL;
1010
1011 switch (vie->op.op_byte) {
1012 case 0x23:
1013 /*
1014 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1015 * result in reg.
1016 *
1017 * 23/r and r16, r/m16
1018 * 23/r and r32, r/m32
1019 * REX.W + 23/r and r64, r/m64
1020 */
1021
1022 /* get the first operand */
1023 reg = gpr_map[vie->reg];
1024 error = vie_read_register(vcpu, reg, &val1);
1025 if (error)
1026 break;
1027
1028 /* get the second operand */
1029 error = memread(vcpu, gpa, &val2, size, arg);
1030 if (error)
1031 break;
1032
1033 /* perform the operation and write the result */
1034 result = val1 & val2;
1035 error = vie_update_register(vcpu, reg, result, size);
1036 break;
1037 case 0x81:
1038 case 0x83:
1039 /*
1040 * AND mem (ModRM:r/m) with immediate and store the
1041 * result in mem.
1042 *
1043 * 81 /4 and r/m16, imm16
1044 * 81 /4 and r/m32, imm32
1045 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
1046 *
1047 * 83 /4 and r/m16, imm8 sign-extended to 16
1048 * 83 /4 and r/m32, imm8 sign-extended to 32
1049 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64
1050 */
1051
1052 /* get the first operand */
1053 error = memread(vcpu, gpa, &val1, size, arg);
1054 if (error)
1055 break;
1056
1057 /*
1058 * perform the operation with the pre-fetched immediate
1059 * operand and write the result
1060 */
1061 result = val1 & vie->immediate;
1062 error = memwrite(vcpu, gpa, result, size, arg);
1063 break;
1064 default:
1065 break;
1066 }
1067 if (error)
1068 return (error);
1069
1070 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1071 if (error)
1072 return (error);
1073
1074 /*
1075 * OF and CF are cleared; the SF, ZF and PF flags are set according
1076 * to the result; AF is undefined.
1077 *
1078 * The updated status flags are obtained by subtracting 0 from 'result'.
1079 */
1080 rflags2 = getcc(size, result, 0);
1081 rflags &= ~RFLAGS_STATUS_BITS;
1082 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1083
1084 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1085 return (error);
1086 }
1087
1088 static int
1089 emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1090 mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1091 {
1092 int error, size;
1093 enum vm_reg_name reg;
1094 uint64_t result, rflags, rflags2, val1, val2;
1095
1096 size = vie->opsize;
1097 error = EINVAL;
1098
1099 switch (vie->op.op_byte) {
1100 case 0x0B:
1101 /*
1102 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1103 * result in reg.
1104 *
1105 * 0b/r or r16, r/m16
1106 * 0b/r or r32, r/m32
1107 * REX.W + 0b/r or r64, r/m64
1108 */
1109
1110 /* get the first operand */
1111 reg = gpr_map[vie->reg];
1112 error = vie_read_register(vcpu, reg, &val1);
1113 if (error)
1114 break;
1115
1116 /* get the second operand */
1117 error = memread(vcpu, gpa, &val2, size, arg);
1118 if (error)
1119 break;
1120
1121 /* perform the operation and write the result */
1122 result = val1 | val2;
1123 error = vie_update_register(vcpu, reg, result, size);
1124 break;
1125 case 0x81:
1126 case 0x83:
1127 /*
1128 * OR mem (ModRM:r/m) with immediate and store the
1129 * result in mem.
1130 *
1131 * 81 /1 or r/m16, imm16
1132 * 81 /1 or r/m32, imm32
1133 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64
1134 *
1135 * 83 /1 or r/m16, imm8 sign-extended to 16
1136 * 83 /1 or r/m32, imm8 sign-extended to 32
1137 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64
1138 */
1139
1140 /* get the first operand */
1141 error = memread(vcpu, gpa, &val1, size, arg);
1142 if (error)
1143 break;
1144
1145 /*
1146 * perform the operation with the pre-fetched immediate
1147 * operand and write the result
1148 */
1149 result = val1 | vie->immediate;
1150 error = memwrite(vcpu, gpa, result, size, arg);
1151 break;
1152 default:
1153 break;
1154 }
1155 if (error)
1156 return (error);
1157
1158 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1159 if (error)
1160 return (error);
1161
1162 /*
1163 * OF and CF are cleared; the SF, ZF and PF flags are set according
1164 * to the result; AF is undefined.
1165 *
1166 * The updated status flags are obtained by subtracting 0 from 'result'.
1167 */
1168 rflags2 = getcc(size, result, 0);
1169 rflags &= ~RFLAGS_STATUS_BITS;
1170 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1171
1172 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1173 return (error);
1174 }
1175
1176 static int
1177 emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1178 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1179 {
1180 int error, size;
1181 uint64_t regop, memop, op1, op2, rflags, rflags2;
1182 enum vm_reg_name reg;
1183
1184 size = vie->opsize;
1185 switch (vie->op.op_byte) {
1186 case 0x39:
1187 case 0x3B:
1188 /*
1189 * 39/r CMP r/m16, r16
1190 * 39/r CMP r/m32, r32
1191 * REX.W 39/r CMP r/m64, r64
1192 *
1193 * 3B/r CMP r16, r/m16
1194 * 3B/r CMP r32, r/m32
1195 * REX.W + 3B/r CMP r64, r/m64
1196 *
1197 * Compare the first operand with the second operand and
1198 * set status flags in EFLAGS register. The comparison is
1199 * performed by subtracting the second operand from the first
1200 * operand and then setting the status flags.
1201 */
1202
1203 /* Get the register operand */
1204 reg = gpr_map[vie->reg];
1205 error = vie_read_register(vcpu, reg, &regop);
1206 if (error)
1207 return (error);
1208
1209 /* Get the memory operand */
1210 error = memread(vcpu, gpa, &memop, size, arg);
1211 if (error)
1212 return (error);
1213
1214 if (vie->op.op_byte == 0x3B) {
1215 op1 = regop;
1216 op2 = memop;
1217 } else {
1218 op1 = memop;
1219 op2 = regop;
1220 }
1221 rflags2 = getcc(size, op1, op2);
1222 break;
1223 case 0x80:
1224 case 0x81:
1225 case 0x83:
1226 /*
1227 * 80 /7 cmp r/m8, imm8
1228 * REX + 80 /7 cmp r/m8, imm8
1229 *
1230 * 81 /7 cmp r/m16, imm16
1231 * 81 /7 cmp r/m32, imm32
1232 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64
1233 *
1234 * 83 /7 cmp r/m16, imm8 sign-extended to 16
1235 * 83 /7 cmp r/m32, imm8 sign-extended to 32
1236 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64
1237 *
1238 * Compare mem (ModRM:r/m) with immediate and set
1239 * status flags according to the results. The
1240 * comparison is performed by subtracting the
1241 * immediate from the first operand and then setting
1242 * the status flags.
1243 *
1244 */
1245 if (vie->op.op_byte == 0x80)
1246 size = 1;
1247
1248 /* get the first operand */
1249 error = memread(vcpu, gpa, &op1, size, arg);
1250 if (error)
1251 return (error);
1252
1253 rflags2 = getcc(size, op1, vie->immediate);
1254 break;
1255 default:
1256 return (EINVAL);
1257 }
1258 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1259 if (error)
1260 return (error);
1261 rflags &= ~RFLAGS_STATUS_BITS;
1262 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1263
1264 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1265 return (error);
1266 }
1267
1268 static int
1269 emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1270 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1271 {
1272 int error, size;
1273 uint64_t op1, rflags, rflags2;
1274
1275 size = vie->opsize;
1276 error = EINVAL;
1277
1278 switch (vie->op.op_byte) {
1279 case 0xF6:
1280 /*
1281 * F6 /0 test r/m8, imm8
1282 */
1283 size = 1; /* override for byte operation */
1284 /* FALLTHROUGH */
1285 case 0xF7:
1286 /*
1287 * F7 /0 test r/m16, imm16
1288 * F7 /0 test r/m32, imm32
1289 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64
1290 *
1291 * Test mem (ModRM:r/m) with immediate and set status
1292 * flags according to the results. The comparison is
1293 * performed by ANDing the immediate with the first
1294 * operand and then setting the status flags.
1295 */
1296 if ((vie->reg & 7) != 0)
1297 return (EINVAL);
1298
1299 error = memread(vcpu, gpa, &op1, size, arg);
1300 if (error)
1301 return (error);
1302
1303 rflags2 = getandflags(size, op1, vie->immediate);
1304 break;
1305 default:
1306 return (EINVAL);
1307 }
1308 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1309 if (error)
1310 return (error);
1311
1312 /*
1313 * OF and CF are cleared; the SF, ZF and PF flags are set according
1314 * to the result; AF is undefined.
1315 */
1316 rflags &= ~RFLAGS_STATUS_BITS;
1317 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1318
1319 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1320 return (error);
1321 }
1322
1323 static int
1324 emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1325 struct vm_guest_paging *paging, mem_region_read_t memread,
1326 mem_region_write_t memwrite __unused, void *arg)
1327 {
1328 uint64_t src1, src2, dst, rflags;
1329 unsigned start, len, size;
1330 int error;
1331
1332 size = vie->opsize;
1333 error = EINVAL;
1334
1335 /*
1336 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b
1337 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b
1338 *
1339 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and
1340 * Vex.vvvv.
1341 *
1342 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1343 */
1344 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1345 size = 4;
1346
1347 /*
1348 * Extracts contiguous bits from the first /source/ operand (second
1349 * operand) using an index and length specified in the second /source/
1350 * operand (third operand).
1351 */
1352 error = memread(vcpu, gpa, &src1, size, arg);
1353 if (error)
1354 return (error);
1355 error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2);
1356 if (error)
1357 return (error);
1358 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1359 if (error)
1360 return (error);
1361
1362 start = (src2 & 0xff);
1363 len = (src2 & 0xff00) >> 8;
1364
1365 /* If no bits are extracted, the destination register is cleared. */
1366 dst = 0;
1367
1368 /* If START exceeds the operand size, no bits are extracted. */
1369 if (start > size * 8)
1370 goto done;
1371 /* Length is bounded by both the destination size and start offset. */
1372 if (start + len > size * 8)
1373 len = (size * 8) - start;
1374 if (len == 0)
1375 goto done;
1376
1377 if (start > 0)
1378 src1 = (src1 >> start);
1379 if (len < 64)
1380 src1 = src1 & ((1ull << len) - 1);
1381 dst = src1;
1382
1383 done:
1384 error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size);
1385 if (error)
1386 return (error);
1387
1388 /*
1389 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1390 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1391 */
1392 rflags &= ~RFLAGS_STATUS_BITS;
1393 if (dst == 0)
1394 rflags |= PSL_Z;
1395 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags,
1396 8);
1397 return (error);
1398 }
1399
1400 static int
1401 emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1402 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1403 {
1404 int error, size;
1405 uint64_t nval, rflags, rflags2, val1, val2;
1406 enum vm_reg_name reg;
1407
1408 size = vie->opsize;
1409 error = EINVAL;
1410
1411 switch (vie->op.op_byte) {
1412 case 0x03:
1413 /*
1414 * ADD r/m to r and store the result in r
1415 *
1416 * 03/r ADD r16, r/m16
1417 * 03/r ADD r32, r/m32
1418 * REX.W + 03/r ADD r64, r/m64
1419 */
1420
1421 /* get the first operand */
1422 reg = gpr_map[vie->reg];
1423 error = vie_read_register(vcpu, reg, &val1);
1424 if (error)
1425 break;
1426
1427 /* get the second operand */
1428 error = memread(vcpu, gpa, &val2, size, arg);
1429 if (error)
1430 break;
1431
1432 /* perform the operation and write the result */
1433 nval = val1 + val2;
1434 error = vie_update_register(vcpu, reg, nval, size);
1435 break;
1436 default:
1437 break;
1438 }
1439
1440 if (!error) {
1441 rflags2 = getaddflags(size, val1, val2);
1442 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
1443 &rflags);
1444 if (error)
1445 return (error);
1446
1447 rflags &= ~RFLAGS_STATUS_BITS;
1448 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1449 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
1450 rflags, 8);
1451 }
1452
1453 return (error);
1454 }
1455
1456 static int
1457 emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1458 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1459 {
1460 int error, size;
1461 uint64_t nval, rflags, rflags2, val1, val2;
1462 enum vm_reg_name reg;
1463
1464 size = vie->opsize;
1465 error = EINVAL;
1466
1467 switch (vie->op.op_byte) {
1468 case 0x2B:
1469 /*
1470 * SUB r/m from r and store the result in r
1471 *
1472 * 2B/r SUB r16, r/m16
1473 * 2B/r SUB r32, r/m32
1474 * REX.W + 2B/r SUB r64, r/m64
1475 */
1476
1477 /* get the first operand */
1478 reg = gpr_map[vie->reg];
1479 error = vie_read_register(vcpu, reg, &val1);
1480 if (error)
1481 break;
1482
1483 /* get the second operand */
1484 error = memread(vcpu, gpa, &val2, size, arg);
1485 if (error)
1486 break;
1487
1488 /* perform the operation and write the result */
1489 nval = val1 - val2;
1490 error = vie_update_register(vcpu, reg, nval, size);
1491 break;
1492 default:
1493 break;
1494 }
1495
1496 if (!error) {
1497 rflags2 = getcc(size, val1, val2);
1498 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
1499 &rflags);
1500 if (error)
1501 return (error);
1502
1503 rflags &= ~RFLAGS_STATUS_BITS;
1504 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1505 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
1506 rflags, 8);
1507 }
1508
1509 return (error);
1510 }
1511
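/*
 * Common handler for PUSH/POP with a memory operand: the stack access
 * goes through regular guest memory while the ModRM operand is the
 * emulated location at 'mmio_gpa'.
 */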
1512 static int
1513 emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1514 struct vm_guest_paging *paging, mem_region_read_t memread,
1515 mem_region_write_t memwrite, void *arg)
1516 {
1517 #ifdef _KERNEL
1518 struct vm_copyinfo copyinfo[2];
1519 #else
1520 struct iovec copyinfo[2];
1521 #endif
1522 struct seg_desc ss_desc;
1523 uint64_t cr0, rflags, rsp, stack_gla, val;
1524 int error, fault, size, stackaddrsize, pushop;
1525
1526 val = 0;
1527 size = vie->opsize;
1528 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1529
1530 /*
1531 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1532 */
1533 if (paging->cpu_mode == CPU_MODE_REAL) {
1534 stackaddrsize = 2;
1535 } else if (paging->cpu_mode == CPU_MODE_64BIT) {
1536 /*
1537 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1538 * - Stack pointer size is always 64-bits.
1539 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1540 * - 16-bit PUSH/POP is supported by using the operand size
1541 * override prefix (66H).
1542 */
1543 stackaddrsize = 8;
1544 size = vie->opsize_override ? 2 : 8;
1545 } else {
1546 /*
1547 * In protected or compatibility mode the 'B' flag in the
1548 * stack-segment descriptor determines the size of the
1549 * stack pointer.
1550 */
1551 error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc);
1552 KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1553 __func__, error));
1554 if (SEG_DESC_DEF32(ss_desc.access))
1555 stackaddrsize = 4;
1556 else
1557 stackaddrsize = 2;
1558 }
1559
1560 error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
1561 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1562
1563 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1564 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1565
1566 error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp);
1567 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1568 if (pushop) {
1569 rsp -= size;
1570 }
1571
1572 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1573 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1574 &stack_gla)) {
1575 vm_inject_ss(vcpu, 0);
1576 return (0);
1577 }
1578
1579 if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1580 vm_inject_ss(vcpu, 0);
1581 return (0);
1582 }
1583
1584 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1585 vm_inject_ac(vcpu, 0);
1586 return (0);
1587 }
1588
1589 error = vm_copy_setup(vcpu, paging, stack_gla, size,
1590 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1591 &fault);
1592 if (error || fault)
1593 return (error);
1594
1595 if (pushop) {
1596 error = memread(vcpu, mmio_gpa, &val, size, arg);
1597 if (error == 0)
1598 vm_copyout(&val, copyinfo, size);
1599 } else {
1600 vm_copyin(copyinfo, &val, size);
1601 error = memwrite(vcpu, mmio_gpa, val, size, arg);
1602 rsp += size;
1603 }
1604 vm_copy_teardown(copyinfo, nitems(copyinfo));
1605
1606 if (error == 0) {
1607 error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp,
1608 stackaddrsize);
1609 KASSERT(error == 0, ("error %d updating rsp", error));
1610 }
1611 return (error);
1612 }
1613
1614 static int
1615 emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1616 struct vm_guest_paging *paging, mem_region_read_t memread,
1617 mem_region_write_t memwrite, void *arg)
1618 {
1619 int error;
1620
1621 /*
1622 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1623 *
1624 * PUSH is part of the group 5 extended opcodes and is identified
1625 * by ModRM:reg = b110.
1626 */
1627 if ((vie->reg & 7) != 6)
1628 return (EINVAL);
1629
1630 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
1631 memwrite, arg);
1632 return (error);
1633 }
1634
1635 static int
1636 emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1637 struct vm_guest_paging *paging, mem_region_read_t memread,
1638 mem_region_write_t memwrite, void *arg)
1639 {
1640 int error;
1641
1642 /*
1643 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1644 *
1645 * POP is part of the group 1A extended opcodes and is identified
1646 * by ModRM:reg = b000.
1647 */
1648 if ((vie->reg & 7) != 0)
1649 return (EINVAL);
1650
1651 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
1652 memwrite, arg);
1653 return (error);
1654 }
1655
1656 static int
1657 emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1658 struct vm_guest_paging *paging __unused, mem_region_read_t memread,
1659 mem_region_write_t memwrite, void *memarg)
1660 {
1661 int error;
1662
1663 switch (vie->reg & 7) {
1664 case 0x1: /* OR */
1665 error = emulate_or(vcpu, gpa, vie,
1666 memread, memwrite, memarg);
1667 break;
1668 case 0x4: /* AND */
1669 error = emulate_and(vcpu, gpa, vie,
1670 memread, memwrite, memarg);
1671 break;
1672 case 0x7: /* CMP */
1673 error = emulate_cmp(vcpu, gpa, vie,
1674 memread, memwrite, memarg);
1675 break;
1676 default:
1677 error = EINVAL;
1678 break;
1679 }
1680
1681 return (error);
1682 }
1683
1684 static int
1685 emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1686 mem_region_read_t memread, mem_region_write_t memwrite __unused,
1687 void *memarg)
1688 {
1689 uint64_t val, rflags;
1690 int error, bitmask, bitoff;
1691
1692 /*
1693 * 0F BA is a Group 8 extended opcode.
1694 *
1695 * Currently we only emulate the 'Bit Test' instruction which is
1696 * identified by a ModR/M:reg encoding of 100b.
1697 */
1698 if ((vie->reg & 7) != 4)
1699 return (EINVAL);
1700
1701 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1702 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1703
1704 error = memread(vcpu, gpa, &val, vie->opsize, memarg);
1705 if (error)
1706 return (error);
1707
1708 /*
1709 * Intel SDM, Vol 2, Table 3-2:
1710 * "Range of Bit Positions Specified by Bit Offset Operands"
1711 */
1712 bitmask = vie->opsize * 8 - 1;
1713 bitoff = vie->immediate & bitmask;
1714
1715 /* Copy the bit into the Carry flag in %rflags */
1716 if (val & (1UL << bitoff))
1717 rflags |= PSL_C;
1718 else
1719 rflags &= ~PSL_C;
1720
1721 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1722 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1723
1724 return (0);
1725 }
1726
1727 static int
1728 emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1729 mem_region_read_t memread, mem_region_write_t memwrite __unused,
1730 void *memarg)
1731 {
1732 int error;
1733 uint64_t buf;
1734
1735 switch (vie->reg & 7) {
1736 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */
1737 if (vie->mod == 0x3) {
1738 /*
1739 * SFENCE. Ignore it, VM exit provides enough
1740 * barriers on its own.
1741 */
1742 error = 0;
1743 } else {
1744 /*
1745 * CLFLUSH, CLFLUSHOPT. Only check for access
1746 * rights.
1747 */
1748 error = memread(vcpu, gpa, &buf, 1, memarg);
1749 }
1750 break;
1751 default:
1752 error = EINVAL;
1753 break;
1754 }
1755
1756 return (error);
1757 }
1758
1759 int
1760 vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1761 struct vm_guest_paging *paging, mem_region_read_t memread,
1762 mem_region_write_t memwrite, void *memarg)
1763 {
1764 int error;
1765
1766 if (!vie->decoded)
1767 return (EINVAL);
1768
1769 switch (vie->op.op_type) {
1770 case VIE_OP_TYPE_GROUP1:
1771 error = emulate_group1(vcpu, gpa, vie, paging, memread,
1772 memwrite, memarg);
1773 break;
1774 case VIE_OP_TYPE_POP:
1775 error = emulate_pop(vcpu, gpa, vie, paging, memread,
1776 memwrite, memarg);
1777 break;
1778 case VIE_OP_TYPE_PUSH:
1779 error = emulate_push(vcpu, gpa, vie, paging, memread,
1780 memwrite, memarg);
1781 break;
1782 case VIE_OP_TYPE_CMP:
1783 error = emulate_cmp(vcpu, gpa, vie,
1784 memread, memwrite, memarg);
1785 break;
1786 case VIE_OP_TYPE_MOV:
1787 error = emulate_mov(vcpu, gpa, vie,
1788 memread, memwrite, memarg);
1789 break;
1790 case VIE_OP_TYPE_MOVSX:
1791 case VIE_OP_TYPE_MOVZX:
1792 error = emulate_movx(vcpu, gpa, vie,
1793 memread, memwrite, memarg);
1794 break;
1795 case VIE_OP_TYPE_MOVS:
1796 error = emulate_movs(vcpu, gpa, vie, paging, memread,
1797 memwrite, memarg);
1798 break;
1799 case VIE_OP_TYPE_STOS:
1800 error = emulate_stos(vcpu, gpa, vie, paging, memread,
1801 memwrite, memarg);
1802 break;
1803 case VIE_OP_TYPE_AND:
1804 error = emulate_and(vcpu, gpa, vie,
1805 memread, memwrite, memarg);
1806 break;
1807 case VIE_OP_TYPE_OR:
1808 error = emulate_or(vcpu, gpa, vie,
1809 memread, memwrite, memarg);
1810 break;
1811 case VIE_OP_TYPE_SUB:
1812 error = emulate_sub(vcpu, gpa, vie,
1813 memread, memwrite, memarg);
1814 break;
1815 case VIE_OP_TYPE_BITTEST:
1816 error = emulate_bittest(vcpu, gpa, vie,
1817 memread, memwrite, memarg);
1818 break;
1819 case VIE_OP_TYPE_TWOB_GRP15:
1820 error = emulate_twob_group15(vcpu, gpa, vie,
1821 memread, memwrite, memarg);
1822 break;
1823 case VIE_OP_TYPE_ADD:
1824 error = emulate_add(vcpu, gpa, vie, memread,
1825 memwrite, memarg);
1826 break;
1827 case VIE_OP_TYPE_TEST:
1828 error = emulate_test(vcpu, gpa, vie,
1829 memread, memwrite, memarg);
1830 break;
1831 case VIE_OP_TYPE_BEXTR:
1832 error = emulate_bextr(vcpu, gpa, vie, paging,
1833 memread, memwrite, memarg);
1834 break;
1835 default:
1836 error = EINVAL;
1837 break;
1838 }
1839
1840 return (error);
1841 }
1842
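/*
 * Return 1 if an access of 'size' bytes at 'gla' should raise #AC:
 * alignment checks apply only at CPL 3 with both CR0.AM and RFLAGS.AC
 * set.
 */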
1843 int
1844 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1845 {
1846 KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1847 ("%s: invalid size %d", __func__, size));
1848 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1849
1850 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1851 return (0);
1852
1853 return ((gla & (size - 1)) ? 1 : 0);
1854 }
1855
1856 int
1857 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1858 {
1859 uint64_t mask;
1860
1861 if (cpu_mode != CPU_MODE_64BIT)
1862 return (0);
1863
1864 /*
1865 * The value of bit 47 in 'gla' must be replicated in the
1866 * most significant 16 bits.
1867 */
1868 mask = ~((1UL << 48) - 1);
1869 if (gla & (1UL << 47))
1870 return ((gla & mask) != mask);
1871 else
1872 return ((gla & mask) != 0);
1873 }
1874
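/* Return a mask covering the low 'size' bytes (size is 1, 2, 4 or 8). */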
1875 uint64_t
1876 vie_size2mask(int size)
1877 {
1878 KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1879 ("vie_size2mask: invalid size %d", size));
1880 return (size2mask[size]);
1881 }
1882
1883 int
1884 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1885 struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1886 int prot, uint64_t *gla)
1887 {
1888 uint64_t firstoff, low_limit, high_limit, segbase;
1889 int glasize, type;
1890
1891 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1892 ("%s: invalid segment %d", __func__, seg));
1893 KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1894 ("%s: invalid operand size %d", __func__, length));
1895 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1896 ("%s: invalid prot %#x", __func__, prot));
1897
1898 firstoff = offset;
1899 if (cpu_mode == CPU_MODE_64BIT) {
1900 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1901 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1902 glasize = 8;
1903 } else {
1904 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1905 "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1906 glasize = 4;
1907 /*
1908 * If the segment selector is loaded with a NULL selector
1909 * then the descriptor is unusable and attempting to use
1910 * it results in a #GP(0).
1911 */
1912 if (SEG_DESC_UNUSABLE(desc->access))
1913 return (-1);
1914
1915 /*
1916 * The processor generates a #NP exception when a segment
1917 * register is loaded with a selector that points to a
1918 * descriptor that is not present. If this was the case then
1919 * it would have been checked before the VM-exit.
1920 */
1921 KASSERT(SEG_DESC_PRESENT(desc->access),
1922 ("segment %d not present: %#x", seg, desc->access));
1923
1924 /*
1925 * The descriptor type must indicate a code/data segment.
1926 */
1927 type = SEG_DESC_TYPE(desc->access);
1928 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1929 "descriptor type %#x", seg, type));
1930
1931 if (prot & PROT_READ) {
1932 /* #GP on a read access to an exec-only code segment */
1933 if ((type & 0xA) == 0x8)
1934 return (-1);
1935 }
1936
1937 if (prot & PROT_WRITE) {
1938 /*
1939 * #GP on a write access to a code segment or a
1940 * read-only data segment.
1941 */
1942 if (type & 0x8) /* code segment */
1943 return (-1);
1944
1945 if ((type & 0xA) == 0) /* read-only data seg */
1946 return (-1);
1947 }
1948
1949 /*
1950 * 'desc->limit' is fully expanded taking granularity into
1951 * account.
1952 */
1953 if ((type & 0xC) == 0x4) {
1954 /* expand-down data segment */
1955 low_limit = desc->limit + 1;
1956 high_limit = SEG_DESC_DEF32(desc->access) ?
1957 0xffffffff : 0xffff;
1958 } else {
1959 /* code segment or expand-up data segment */
1960 low_limit = 0;
1961 high_limit = desc->limit;
1962 }
1963
1964 while (length > 0) {
1965 offset &= vie_size2mask(addrsize);
1966 if (offset < low_limit || offset > high_limit)
1967 return (-1);
1968 offset++;
1969 length--;
1970 }
1971 }
1972
1973 /*
1974 * In 64-bit mode all segments except %fs and %gs have a segment
1975 * base address of 0.
1976 */
1977 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1978 seg != VM_REG_GUEST_GS) {
1979 segbase = 0;
1980 } else {
1981 segbase = desc->base;
1982 }
1983
1984 /*
1985 * Truncate 'firstoff' to the effective address size before adding
1986 * it to the segment base.
1987 */
1988 firstoff &= vie_size2mask(addrsize);
1989 *gla = (segbase + firstoff) & vie_size2mask(glasize);
1990 return (0);
1991 }
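/*
 * Worked example (illustrative only): with a 16-bit address size, a
 * %ds base of 0x10000, an expand-up limit of 0xffff and an offset of
 * 0x2345, the legacy-mode path checks the offset against the limit
 * and computes *gla = (0x10000 + 0x2345) & 0xffffffff = 0x12345.
 * In 64-bit mode a non-%fs/%gs segment contributes a base of 0, so
 * the result is just the offset truncated to the address size.
 */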
1992
1993 /*
1994 * Prepare a partially decoded vie for a 2nd attempt.
1995 */
1996 void
1997 vie_restart(struct vie *vie)
1998 {
1999 _Static_assert(
2000 offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
2001 offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
2002 "restart should not erase instruction length or contents");
2003
2004 memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
2005 sizeof(*vie) - offsetof(struct vie, vie_startzero));
2006
2007 vie->base_register = VM_REG_LAST;
2008 vie->index_register = VM_REG_LAST;
2009 vie->segment_register = VM_REG_LAST;
2010 }
2011
2012 void
2013 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
2014 {
2015 KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
2016 ("%s: invalid instruction length (%d)", __func__, inst_length));
2017
2018 vie_restart(vie);
2019 memset(vie->inst, 0, sizeof(vie->inst));
2020 if (inst_length != 0)
2021 memcpy(vie->inst, inst_bytes, inst_length);
2022 vie->num_valid = inst_length;
2023 }
2024
2025 #ifdef _KERNEL
2026 static int
2027 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2028 {
2029 int error_code = 0;
2030
2031 if (pte & PG_V)
2032 error_code |= PGEX_P;
2033 if (prot & VM_PROT_WRITE)
2034 error_code |= PGEX_W;
2035 if (usermode)
2036 error_code |= PGEX_U;
2037 if (rsvd)
2038 error_code |= PGEX_RSV;
2039 if (prot & VM_PROT_EXECUTE)
2040 error_code |= PGEX_I;
2041
2042 return (error_code);
2043 }
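/*
 * Example (added commentary): a user-mode write that hits a
 * not-present pte produces PGEX_W | PGEX_U; PGEX_P stays clear
 * because PG_V was not set, which matches the error code pushed by a
 * real MMU for that #PF.
 */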
2044
2045 static void
2046 ptp_release(void **cookie)
2047 {
2048 if (*cookie != NULL) {
2049 vm_gpa_release(*cookie);
2050 *cookie = NULL;
2051 }
2052 }
2053
2054 static void *
2055 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
2056 {
2057 void *ptr;
2058
2059 ptp_release(cookie);
2060 ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
2061 return (ptr);
2062 }
2063
2064 static int
2065 _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2066 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
2067 {
2068 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
2069 u_int retries;
2070 uint64_t *ptpbase, ptpphys, pte, pgsize;
2071 uint32_t *ptpbase32, pte32;
2072 void *cookie;
2073
2074 *guest_fault = 0;
2075
2076 usermode = (paging->cpl == 3 ? 1 : 0);
2077 writable = prot & VM_PROT_WRITE;
2078 cookie = NULL;
2079 retval = 0;
2080 retries = 0;
2081 restart:
2082 ptpphys = paging->cr3; /* root of the page tables */
2083 ptp_release(&cookie);
2084 if (retries++ > 0)
2085 maybe_yield();
2086
2087 if (vie_canonical_check(paging->cpu_mode, gla)) {
2088 /*
2089 * XXX assuming a non-stack reference; otherwise a stack fault
2090 * should be generated.
2091 */
2092 if (!check_only)
2093 vm_inject_gp(vcpu);
2094 goto fault;
2095 }
2096
2097 if (paging->paging_mode == PAGING_MODE_FLAT) {
2098 *gpa = gla;
2099 goto done;
2100 }
2101
2102 if (paging->paging_mode == PAGING_MODE_32) {
2103 nlevels = 2;
2104 while (--nlevels >= 0) {
2105 /* Zero out the lower 12 bits. */
2106 ptpphys &= ~0xfff;
2107
2108 ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
2109 &cookie);
2110
2111 if (ptpbase32 == NULL)
2112 goto error;
2113
2114 ptpshift = PAGE_SHIFT + nlevels * 10;
2115 ptpindex = (gla >> ptpshift) & 0x3FF;
2116 pgsize = 1UL << ptpshift;
2117
2118 pte32 = ptpbase32[ptpindex];
2119
2120 if ((pte32 & PG_V) == 0 ||
2121 (usermode && (pte32 & PG_U) == 0) ||
2122 (writable && (pte32 & PG_RW) == 0)) {
2123 if (!check_only) {
2124 pfcode = pf_error_code(usermode, prot, 0,
2125 pte32);
2126 vm_inject_pf(vcpu, pfcode, gla);
2127 }
2128 goto fault;
2129 }
2130
2131 /*
2132 * Emulate the x86 MMU's management of the accessed
2133 * and dirty flags. While the accessed flag is set
2134 * at every level of the page table, the dirty flag
2135 * is only set at the last level providing the guest
2136 * physical address.
2137 */
2138 if (!check_only && (pte32 & PG_A) == 0) {
2139 if (atomic_cmpset_32(&ptpbase32[ptpindex],
2140 pte32, pte32 | PG_A) == 0) {
2141 goto restart;
2142 }
2143 }
2144
2145 /* XXX must be ignored if CR4.PSE=0 */
2146 if (nlevels > 0 && (pte32 & PG_PS) != 0)
2147 break;
2148
2149 ptpphys = pte32;
2150 }
2151
2152 /* Set the dirty bit in the page table entry if necessary */
2153 if (!check_only && writable && (pte32 & PG_M) == 0) {
2154 if (atomic_cmpset_32(&ptpbase32[ptpindex],
2155 pte32, pte32 | PG_M) == 0) {
2156 goto restart;
2157 }
2158 }
2159
2160 /* Zero out the lower 'ptpshift' bits */
2161 pte32 >>= ptpshift; pte32 <<= ptpshift;
2162 *gpa = pte32 | (gla & (pgsize - 1));
2163 goto done;
2164 }
2165
2166 if (paging->paging_mode == PAGING_MODE_PAE) {
2167 /* Zero out the lower 5 bits and the upper 32 bits */
2168 ptpphys &= 0xffffffe0UL;
2169
2170 ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
2171 &cookie);
2172 if (ptpbase == NULL)
2173 goto error;
2174
2175 ptpindex = (gla >> 30) & 0x3;
2176
2177 pte = ptpbase[ptpindex];
2178
2179 if ((pte & PG_V) == 0) {
2180 if (!check_only) {
2181 pfcode = pf_error_code(usermode, prot, 0, pte);
2182 vm_inject_pf(vcpu, pfcode, gla);
2183 }
2184 goto fault;
2185 }
2186
2187 ptpphys = pte;
2188
2189 nlevels = 2;
2190 } else if (paging->paging_mode == PAGING_MODE_64_LA57) {
2191 nlevels = 5;
2192 } else {
2193 nlevels = 4;
2194 }
2195
2196 while (--nlevels >= 0) {
2197 /* Zero out the lower 12 bits and the upper 12 bits */
2198 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
2199
2200 ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
2201 if (ptpbase == NULL)
2202 goto error;
2203
2204 ptpshift = PAGE_SHIFT + nlevels * 9;
2205 ptpindex = (gla >> ptpshift) & 0x1FF;
2206 pgsize = 1UL << ptpshift;
2207
2208 pte = ptpbase[ptpindex];
2209
2210 if ((pte & PG_V) == 0 ||
2211 (usermode && (pte & PG_U) == 0) ||
2212 (writable && (pte & PG_RW) == 0)) {
2213 if (!check_only) {
2214 pfcode = pf_error_code(usermode, prot, 0, pte);
2215 vm_inject_pf(vcpu, pfcode, gla);
2216 }
2217 goto fault;
2218 }
2219
2220 /* Set the accessed bit in the page table entry */
2221 if (!check_only && (pte & PG_A) == 0) {
2222 if (atomic_cmpset_64(&ptpbase[ptpindex],
2223 pte, pte | PG_A) == 0) {
2224 goto restart;
2225 }
2226 }
2227
2228 if (nlevels > 0 && (pte & PG_PS) != 0) {
2229 if (pgsize > 1 * GB) {
2230 if (!check_only) {
2231 pfcode = pf_error_code(usermode, prot, 1,
2232 pte);
2233 vm_inject_pf(vcpu, pfcode, gla);
2234 }
2235 goto fault;
2236 }
2237 break;
2238 }
2239
2240 ptpphys = pte;
2241 }
2242
2243 /* Set the dirty bit in the page table entry if necessary */
2244 if (!check_only && writable && (pte & PG_M) == 0) {
2245 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2246 goto restart;
2247 }
2248
2249 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2250 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2251 *gpa = pte | (gla & (pgsize - 1));
2252 done:
2253 ptp_release(&cookie);
2254 KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2255 __func__, retval));
2256 return (retval);
2257 error:
2258 retval = EFAULT;
2259 goto done;
2260 fault:
2261 *guest_fault = 1;
2262 goto done;
2263 }
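/*
 * Added note on the long-mode walk above: with 4-level paging the
 * loop uses ptpshift values 39, 30, 21 and 12, indexing each table
 * with (gla >> ptpshift) & 0x1FF.  For a 4KB leaf the result is the
 * pte with bits 63:52 and 11:0 cleared, OR'ed with (gla & 0xfff);
 * a PG_PS leaf at a higher level keeps correspondingly more low bits
 * of 'gla'.
 */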
2264
2265 int
2266 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2267 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2268 {
2269
2270 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2271 false));
2272 }
2273
2274 int
2275 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
2276 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2277 {
2278
2279 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2280 true));
2281 }
2282
2283 int
2284 vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
2285 uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2286 {
2287 struct vm_copyinfo copyinfo[2];
2288 int error, prot;
2289
2290 if (inst_length > VIE_INST_SIZE)
2291 panic("vmm_fetch_instruction: invalid length %d", inst_length);
2292
2293 prot = PROT_READ | PROT_EXEC;
2294 error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
2295 copyinfo, nitems(copyinfo), faultptr);
2296 if (error || *faultptr)
2297 return (error);
2298
2299 vm_copyin(copyinfo, vie->inst, inst_length);
2300 vm_copy_teardown(copyinfo, nitems(copyinfo));
2301 vie->num_valid = inst_length;
2302 return (0);
2303 }
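/*
 * Typical usage (sketch, added commentary): an exit handler calls
 * vie_init() with any bytes already supplied by the exit, falls back
 * to vmm_fetch_instruction() to read up to VIE_INST_SIZE bytes at the
 * guest %rip, and then calls vmm_decode_instruction() below.  A set
 * *faultptr means an exception was injected and the instruction
 * should be retried by the guest rather than emulated.
 */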
2304 #endif /* _KERNEL */
2305
2306 static int
2307 vie_peek(struct vie *vie, uint8_t *x)
2308 {
2309
2310 if (vie->num_processed < vie->num_valid) {
2311 *x = vie->inst[vie->num_processed];
2312 return (0);
2313 } else
2314 return (-1);
2315 }
2316
2317 static void
2318 vie_advance(struct vie *vie)
2319 {
2320
2321 vie->num_processed++;
2322 }
2323
2324 static bool
2325 segment_override(uint8_t x, int *seg)
2326 {
2327
2328 switch (x) {
2329 case 0x2E:
2330 *seg = VM_REG_GUEST_CS;
2331 break;
2332 case 0x36:
2333 *seg = VM_REG_GUEST_SS;
2334 break;
2335 case 0x3E:
2336 *seg = VM_REG_GUEST_DS;
2337 break;
2338 case 0x26:
2339 *seg = VM_REG_GUEST_ES;
2340 break;
2341 case 0x64:
2342 *seg = VM_REG_GUEST_FS;
2343 break;
2344 case 0x65:
2345 *seg = VM_REG_GUEST_GS;
2346 break;
2347 default:
2348 return (false);
2349 }
2350 return (true);
2351 }
2352
2353 static int
2354 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2355 {
2356 uint8_t x;
2357
2358 while (1) {
2359 if (vie_peek(vie, &x))
2360 return (-1);
2361
2362 if (x == 0x66)
2363 vie->opsize_override = 1;
2364 else if (x == 0x67)
2365 vie->addrsize_override = 1;
2366 else if (x == 0xF3)
2367 vie->repz_present = 1;
2368 else if (x == 0xF2)
2369 vie->repnz_present = 1;
2370 else if (segment_override(x, &vie->segment_register))
2371 vie->segment_override = 1;
2372 else
2373 break;
2374
2375 vie_advance(vie);
2376 }
2377
2378 /*
2379 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2380 * - Only one REX prefix is allowed per instruction.
2381 * - The REX prefix must immediately precede the opcode byte or the
2382 * escape opcode byte.
2383 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2384 * the mandatory prefix must come before the REX prefix.
2385 */
2386 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2387 vie->rex_present = 1;
2388 vie->rex_w = x & 0x8 ? 1 : 0;
2389 vie->rex_r = x & 0x4 ? 1 : 0;
2390 vie->rex_x = x & 0x2 ? 1 : 0;
2391 vie->rex_b = x & 0x1 ? 1 : 0;
2392 vie_advance(vie);
2393 }
2394
2395 /*
2396 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
2397 */
2398 if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
2399 && x == 0xC4) {
2400 const struct vie_op *optab;
2401
2402 /* 3-byte VEX prefix. */
2403 vie->vex_present = 1;
2404
2405 vie_advance(vie);
2406 if (vie_peek(vie, &x))
2407 return (-1);
2408
2409 /*
2410 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted
2411 * relative to REX encoding.
2412 */
2413 vie->rex_r = x & 0x80 ? 0 : 1;
2414 vie->rex_x = x & 0x40 ? 0 : 1;
2415 vie->rex_b = x & 0x20 ? 0 : 1;
2416
2417 switch (x & 0x1F) {
2418 case 0x2:
2419 /* 0F 38. */
2420 optab = three_byte_opcodes_0f38;
2421 break;
2422 case 0x1:
2423 /* 0F class - nothing handled here yet. */
2424 /* FALLTHROUGH */
2425 case 0x3:
2426 /* 0F 3A class - nothing handled here yet. */
2427 /* FALLTHROUGH */
2428 default:
2429 /* Reserved (#UD). */
2430 return (-1);
2431 }
2432
2433 vie_advance(vie);
2434 if (vie_peek(vie, &x))
2435 return (-1);
2436
2437 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
2438 vie->rex_w = x & 0x80 ? 1 : 0;
2439
2440 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
2441 vie->vex_l = !!(x & 0x4);
2442 vie->vex_pp = (x & 0x3);
2443
2444 /* PP: 1=66 2=F3 3=F2 prefixes. */
2445 switch (vie->vex_pp) {
2446 case 0x1:
2447 vie->opsize_override = 1;
2448 break;
2449 case 0x2:
2450 vie->repz_present = 1;
2451 break;
2452 case 0x3:
2453 vie->repnz_present = 1;
2454 break;
2455 }
2456
2457 vie_advance(vie);
2458
2459 /* Opcode byte, sans the literal escape prefix (implied by VEX). */
2460 if (vie_peek(vie, &x))
2461 return (-1);
2462
2463 vie->op = optab[x];
2464 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2465 return (-1);
2466
2467 vie_advance(vie);
2468 }
2469
2470 /*
2471 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2472 */
2473 if (cpu_mode == CPU_MODE_64BIT) {
2474 /*
2475 * Default address size is 64-bits and default operand size
2476 * is 32-bits.
2477 */
2478 vie->addrsize = vie->addrsize_override ? 4 : 8;
2479 if (vie->rex_w)
2480 vie->opsize = 8;
2481 else if (vie->opsize_override)
2482 vie->opsize = 2;
2483 else
2484 vie->opsize = 4;
2485 } else if (cs_d) {
2486 /* Default address and operand sizes are 32-bits */
2487 vie->addrsize = vie->addrsize_override ? 2 : 4;
2488 vie->opsize = vie->opsize_override ? 2 : 4;
2489 } else {
2490 /* Default address and operand sizes are 16-bits */
2491 vie->addrsize = vie->addrsize_override ? 4 : 2;
2492 vie->opsize = vie->opsize_override ? 4 : 2;
2493 }
2494 return (0);
2495 }
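/*
 * Examples of the size rules above (added commentary): in 64-bit mode
 * the defaults are addrsize 8 / opsize 4; a 0x66 prefix drops opsize
 * to 2, REX.W (e.g. a 0x48 prefix byte) raises it to 8, and 0x67
 * drops addrsize to 4.  With cs_d set the legacy defaults are 4/4,
 * otherwise 2/2, and each override prefix toggles to the other size.
 */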
2496
2497 static int
2498 decode_two_byte_opcode(struct vie *vie)
2499 {
2500 uint8_t x;
2501
2502 if (vie_peek(vie, &x))
2503 return (-1);
2504
2505 vie->op = two_byte_opcodes[x];
2506
2507 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2508 return (-1);
2509
2510 vie_advance(vie);
2511 return (0);
2512 }
2513
2514 static int
2515 decode_opcode(struct vie *vie)
2516 {
2517 uint8_t x;
2518
2519 if (vie_peek(vie, &x))
2520 return (-1);
2521
2522 /* Already did this via VEX prefix. */
2523 if (vie->op.op_type != VIE_OP_TYPE_NONE)
2524 return (0);
2525
2526 vie->op = one_byte_opcodes[x];
2527
2528 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2529 return (-1);
2530
2531 vie_advance(vie);
2532
2533 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2534 return (decode_two_byte_opcode(vie));
2535
2536 return (0);
2537 }
2538
2539 static int
2540 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2541 {
2542 uint8_t x;
2543
2544 if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2545 return (0);
2546
2547 if (cpu_mode == CPU_MODE_REAL)
2548 return (-1);
2549
2550 if (vie_peek(vie, &x))
2551 return (-1);
2552
2553 vie->mod = (x >> 6) & 0x3;
2554 vie->rm = (x >> 0) & 0x7;
2555 vie->reg = (x >> 3) & 0x7;
2556
2557 /*
2558 * A direct addressing mode makes no sense in the context of an EPT
2559 * fault. There has to be a memory access involved to cause the
2560 * EPT fault.
2561 */
2562 if (vie->mod == VIE_MOD_DIRECT)
2563 return (-1);
2564
2565 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2566 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2567 /*
2568 * Table 2-5: Special Cases of REX Encodings
2569 *
2570 * mod=0, r/m=5 is used in the compatibility mode to
2571 * indicate a disp32 without a base register.
2572 *
2573 * mod!=3, r/m=4 is used in the compatibility mode to
2574 * indicate that the SIB byte is present.
2575 *
2576 * The 'b' bit in the REX prefix is don't care in
2577 * this case.
2578 */
2579 } else {
2580 vie->rm |= (vie->rex_b << 3);
2581 }
2582
2583 vie->reg |= (vie->rex_r << 3);
2584
2585 /* SIB */
2586 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2587 goto done;
2588
2589 vie->base_register = gpr_map[vie->rm];
2590
2591 switch (vie->mod) {
2592 case VIE_MOD_INDIRECT_DISP8:
2593 vie->disp_bytes = 1;
2594 break;
2595 case VIE_MOD_INDIRECT_DISP32:
2596 vie->disp_bytes = 4;
2597 break;
2598 case VIE_MOD_INDIRECT:
2599 if (vie->rm == VIE_RM_DISP32) {
2600 vie->disp_bytes = 4;
2601 /*
2602 * Table 2-7. RIP-Relative Addressing
2603 *
2604 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2605 * whereas in compatibility mode it just implies disp32.
2606 */
2607
2608 if (cpu_mode == CPU_MODE_64BIT)
2609 vie->base_register = VM_REG_GUEST_RIP;
2610 else
2611 vie->base_register = VM_REG_LAST;
2612 }
2613 break;
2614 }
2615
2616 done:
2617 vie_advance(vie);
2618
2619 return (0);
2620 }
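/*
 * Worked example (added commentary, assuming the usual gpr_map order
 * rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8-r15): ModRM 0x45 decodes
 * to mod=1, reg=0, rm=5, so with REX.B clear the base register is
 * %rbp and one displacement byte follows.  ModRM 0x04 (mod=0, rm=4)
 * defers the base/index selection to the SIB byte handled below.
 */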
2621
2622 static int
2623 decode_sib(struct vie *vie)
2624 {
2625 uint8_t x;
2626
2627 /* Proceed only if SIB byte is present */
2628 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2629 return (0);
2630
2631 if (vie_peek(vie, &x))
2632 return (-1);
2633
2634 /* De-construct the SIB byte */
2635 vie->ss = (x >> 6) & 0x3;
2636 vie->index = (x >> 3) & 0x7;
2637 vie->base = (x >> 0) & 0x7;
2638
2639 /* Apply the REX prefix modifiers */
2640 vie->index |= vie->rex_x << 3;
2641 vie->base |= vie->rex_b << 3;
2642
2643 switch (vie->mod) {
2644 case VIE_MOD_INDIRECT_DISP8:
2645 vie->disp_bytes = 1;
2646 break;
2647 case VIE_MOD_INDIRECT_DISP32:
2648 vie->disp_bytes = 4;
2649 break;
2650 }
2651
2652 if (vie->mod == VIE_MOD_INDIRECT &&
2653 (vie->base == 5 || vie->base == 13)) {
2654 /*
2655 * Special case when base register is unused if mod = 0
2656 * and base = %rbp or %r13.
2657 *
2658 * Documented in:
2659 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2660 * Table 2-5: Special Cases of REX Encodings
2661 */
2662 vie->disp_bytes = 4;
2663 } else {
2664 vie->base_register = gpr_map[vie->base];
2665 }
2666
2667 /*
2668 * All encodings of 'index' are valid except for %rsp (4).
2669 *
2670 * Documented in:
2671 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2672 * Table 2-5: Special Cases of REX Encodings
2673 */
2674 if (vie->index != 4)
2675 vie->index_register = gpr_map[vie->index];
2676
2677 /* 'scale' makes sense only in the context of an index register */
2678 if (vie->index_register < VM_REG_LAST)
2679 vie->scale = 1 << vie->ss;
2680
2681 vie_advance(vie);
2682
2683 return (0);
2684 }
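/*
 * Example (added commentary, same gpr_map assumption): SIB byte 0x88
 * decodes to ss=2, index=1, base=0, giving base %rax plus %rcx scaled
 * by 4 when the REX bits are clear.  An index field of 4 (%rsp) is
 * the "no index" encoding, so index_register stays at VM_REG_LAST.
 */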
2685
2686 static int
2687 decode_displacement(struct vie *vie)
2688 {
2689 int n, i;
2690 uint8_t x;
2691
2692 union {
2693 char buf[4];
2694 int8_t signed8;
2695 int32_t signed32;
2696 } u;
2697
2698 if ((n = vie->disp_bytes) == 0)
2699 return (0);
2700
2701 if (n != 1 && n != 4)
2702 panic("decode_displacement: invalid disp_bytes %d", n);
2703
2704 for (i = 0; i < n; i++) {
2705 if (vie_peek(vie, &x))
2706 return (-1);
2707
2708 u.buf[i] = x;
2709 vie_advance(vie);
2710 }
2711
2712 if (n == 1)
2713 vie->displacement = u.signed8; /* sign-extended */
2714 else
2715 vie->displacement = u.signed32; /* sign-extended */
2716
2717 return (0);
2718 }
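/*
 * Added note: displacements are sign-extended, so a disp8 byte of
 * 0xf8 contributes -8 and a disp32 of 0xfffffff0 contributes -16 to
 * the effective address.
 */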
2719
2720 static int
2721 decode_immediate(struct vie *vie)
2722 {
2723 int i, n;
2724 uint8_t x;
2725 union {
2726 char buf[4];
2727 int8_t signed8;
2728 int16_t signed16;
2729 int32_t signed32;
2730 } u;
2731
2732 /* Figure out immediate operand size (if any) */
2733 if (vie->op.op_flags & VIE_OP_F_IMM) {
2734 /*
2735 * Section 2.2.1.5 "Immediates", Intel SDM:
2736 * In 64-bit mode the typical size of immediate operands
2737 * remains 32-bits. When the operand size is 64-bits, the
2738 * processor sign-extends all immediates to 64-bits prior
2739 * to their use.
2740 */
2741 if (vie->opsize == 4 || vie->opsize == 8)
2742 vie->imm_bytes = 4;
2743 else
2744 vie->imm_bytes = 2;
2745 } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2746 vie->imm_bytes = 1;
2747 }
2748
2749 if ((n = vie->imm_bytes) == 0)
2750 return (0);
2751
2752 KASSERT(n == 1 || n == 2 || n == 4,
2753 ("%s: invalid number of immediate bytes: %d", __func__, n));
2754
2755 for (i = 0; i < n; i++) {
2756 if (vie_peek(vie, &x))
2757 return (-1);
2758
2759 u.buf[i] = x;
2760 vie_advance(vie);
2761 }
2762
2763 /* sign-extend the immediate value before use */
2764 if (n == 1)
2765 vie->immediate = u.signed8;
2766 else if (n == 2)
2767 vie->immediate = u.signed16;
2768 else
2769 vie->immediate = u.signed32;
2770
2771 return (0);
2772 }
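/*
 * Example (added commentary): with VIE_OP_F_IMM and opsize 8 only
 * four immediate bytes are fetched, so bytes ff ff ff ff decode to an
 * immediate of -1 (0xffffffffffffffff when used at 64-bit width),
 * matching the hardware's sign extension of 32-bit immediates under a
 * 64-bit operand size.
 */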
2773
2774 static int
2775 decode_moffset(struct vie *vie)
2776 {
2777 int i, n;
2778 uint8_t x;
2779 union {
2780 char buf[8];
2781 uint64_t u64;
2782 } u;
2783
2784 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2785 return (0);
2786
2787 /*
2788 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2789 * The memory offset size follows the address-size of the instruction.
2790 */
2791 n = vie->addrsize;
2792 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2793
2794 u.u64 = 0;
2795 for (i = 0; i < n; i++) {
2796 if (vie_peek(vie, &x))
2797 return (-1);
2798
2799 u.buf[i] = x;
2800 vie_advance(vie);
2801 }
2802 vie->displacement = u.u64;
2803 return (0);
2804 }
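/*
 * Added note: moffset encodings (e.g. the 0xA1/0xA3 MOV forms) carry
 * no ModRM byte, so the absolute offset is stored in
 * vie->displacement; with a 4-byte address size the bytes
 * 44 33 22 11 decode to 0x11223344.
 */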
2805
2806 #ifdef _KERNEL
2807 /*
2808 * Verify that the 'guest linear address' provided as collateral of the nested
2809 * page table fault matches with our instruction decoding.
2810 */
2811 static int
2812 verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
2813 enum vm_cpu_mode cpu_mode)
2814 {
2815 int error;
2816 uint64_t base, segbase, idx, gla2;
2817 enum vm_reg_name seg;
2818 struct seg_desc desc;
2819
2820 /* Skip 'gla' verification */
2821 if (gla == VIE_INVALID_GLA)
2822 return (0);
2823
2824 base = 0;
2825 if (vie->base_register != VM_REG_LAST) {
2826 error = vm_get_register(vcpu, vie->base_register, &base);
2827 if (error) {
2828 printf("verify_gla: error %d getting base reg %d\n",
2829 error, vie->base_register);
2830 return (-1);
2831 }
2832
2833 /*
2834 * RIP-relative addressing starts from the following
2835 * instruction
2836 */
2837 if (vie->base_register == VM_REG_GUEST_RIP)
2838 base += vie->num_processed;
2839 }
2840
2841 idx = 0;
2842 if (vie->index_register != VM_REG_LAST) {
2843 error = vm_get_register(vcpu, vie->index_register, &idx);
2844 if (error) {
2845 printf("verify_gla: error %d getting index reg %d\n",
2846 error, vie->index_register);
2847 return (-1);
2848 }
2849 }
2850
2851 /*
2852 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2853 *
2854 * In 64-bit mode, segmentation is generally (but not
2855 * completely) disabled. The exceptions are the FS and GS
2856 * segments.
2857 *
2858 * In legacy IA-32 mode, when the ESP or EBP register is used
2859 * as the base, the SS segment is the default segment. For
2860 * other data references, except those relative to the stack or
2861 * to a string destination, the DS segment is the default. These
2862 * can be overridden to allow other segments to be accessed.
2863 */
2864 if (vie->segment_override)
2865 seg = vie->segment_register;
2866 else if (vie->base_register == VM_REG_GUEST_RSP ||
2867 vie->base_register == VM_REG_GUEST_RBP)
2868 seg = VM_REG_GUEST_SS;
2869 else
2870 seg = VM_REG_GUEST_DS;
2871 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2872 seg != VM_REG_GUEST_GS) {
2873 segbase = 0;
2874 } else {
2875 error = vm_get_seg_desc(vcpu, seg, &desc);
2876 if (error) {
2877 printf("verify_gla: error %d getting segment"
2878 " descriptor %d", error,
2879 vie->segment_register);
2880 return (-1);
2881 }
2882 segbase = desc.base;
2883 }
2884
2885 gla2 = segbase + base + vie->scale * idx + vie->displacement;
2886 gla2 &= size2mask[vie->addrsize];
2887 if (gla != gla2) {
2888 printf("verify_gla mismatch: segbase(0x%0lx)"
2889 "base(0x%0lx), scale(%d), index(0x%0lx), "
2890 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2891 segbase, base, vie->scale, idx, vie->displacement,
2892 gla, gla2);
2893 return (-1);
2894 }
2895
2896 return (0);
2897 }
2898 #endif /* _KERNEL */
2899
2900 int
2901 #ifdef _KERNEL
2902 vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
2903 enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2904 #else
2905 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2906 #endif
2907 {
2908
2909 if (decode_prefixes(vie, cpu_mode, cs_d))
2910 return (-1);
2911
2912 if (decode_opcode(vie))
2913 return (-1);
2914
2915 if (decode_modrm(vie, cpu_mode))
2916 return (-1);
2917
2918 if (decode_sib(vie))
2919 return (-1);
2920
2921 if (decode_displacement(vie))
2922 return (-1);
2923
2924 if (decode_immediate(vie))
2925 return (-1);
2926
2927 if (decode_moffset(vie))
2928 return (-1);
2929
2930 #ifdef _KERNEL
2931 if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2932 if (verify_gla(vcpu, gla, vie, cpu_mode))
2933 return (-1);
2934 }
2935 #endif
2936
2937 vie->decoded = 1; /* success */
2938
2939 return (0);
2940 }
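/*
 * Usage sketch (added commentary): once all stages succeed,
 * vie->decoded is set and the opcode and operand fields drive the
 * emulation entry point elsewhere in this file.  A -1 return from any
 * stage means the bytes could not be decoded with the tables above
 * and the exit must be handled some other way.
 */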
2941