/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <err.h>
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <vmmapi.h>
#define	__diagused
#define	KASSERT(exp, msg)	assert((exp))
#define	panic(...)		errx(4, __VA_ARGS__)
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)

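/*
 * The opcode tables below are indexed directly by opcode byte using C99
 * designated initializers.  Entries that are left out are zero-filled,
 * i.e. they decode as VIE_OP_TYPE_NONE and are treated as unhandled.
 */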
static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};

static const struct vie_op two_byte_opcodes[256] = {
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

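/* Operand size (in bytes) to value mask, e.g. size2mask[2] == 0xffff. */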
static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vcpu, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
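	 *
	 * For example, ModRM:reg = 5 with a REX prefix present addresses
	 * %bpl (gpr_map[5]), while the same encoding without a REX prefix
	 * addresses %ch (gpr_map[5 & 0x3], with lhbr set).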
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vcpu, reg, val);
	}
	return (error);
}

int
vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

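	/*
	 * x86-64 register write semantics: 1 and 2-byte writes leave the
	 * high-order bits of the destination intact, a 4-byte write
	 * zero-extends into bits 63:32, and an 8-byte write replaces the
	 * whole register.
	 */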
	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vcpu, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vcpu, reg, val);
	return (error);
}

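/* The arithmetic status flags: CF, PF, AF, ZF, SF and OF. */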
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

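/*
 * Each GETCC(sz) instantiation above defines a helper named getcc<sz>,
 * e.g. GETCC(8) defines getcc8().  The trailing 'struct __hack' in the
 * macro body exists only to consume the semicolon that terminates the
 * macro invocation.  getcc() below dispatches on the operand size.
 */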
static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

/*
 * Macro creation of functions getaddflags{8,16,32,64}
 */
#define	GETADDFLAGS(sz)							\
static u_long								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);

static u_long
getaddflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getaddflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getaddflags8(x, y));
	else if (opsize == 2)
		return (getaddflags16(x, y));
	else if (opsize == 4)
		return (getaddflags32(x, y));
	else
		return (getaddflags64(x, y));
}

/*
 * Return the status flags that would result from doing (x & y).
 */
#define	GETANDFLAGS(sz)							\
static u_long								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);

static u_long
getandflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getandflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getandflags8(x, y));
	else if (opsize == 2)
		return (getandflags16(x, y));
	else if (opsize == 4)
		return (getandflags32(x, y));
	else
		return (getandflags64(x, y));
}

static int
emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vcpu, vie, &byte);
		if (error == 0)
			error = memwrite(vcpu, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vcpu, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vcpu, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vcpu, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vcpu, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

/*
 * Helper function to calculate and validate a linear address.
 */
static int
get_gla(struct vcpu *vcpu, struct vie *vie __unused,
    struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
    enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error __diagused;

	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vcpu, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vie_read_register(vcpu, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vcpu, 0);
		goto guest_fault;
	}

	*fault = 0;
	return (0);

guest_fault:
	*fault = 1;
	return (0);
}

static int
emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)	memory		memory		n/a
	 * (2)	memory		mmio		emulated
	 * (3)	mmio		memory		emulated
	 * (4)	mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
	if (error || fault)
		goto done;

	error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(copyinfo, &val, opsize);
		vm_copy_teardown(copyinfo, nitems(copyinfo));
		error = memwrite(vcpu, gpa, val, opsize, arg);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
		    &fault);
		if (error || fault)
			goto done;

		error = vm_copy_setup(vcpu, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;	/* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * An MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = memread(vcpu, gpa, &val, opsize, arg);
			if (error)
				goto done;

			vm_copyout(&val, copyinfo, opsize);
			vm_copy_teardown(copyinfo, nitems(copyinfo));
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vcpu, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vcpu, paging, dstaddr,
			    PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = memread(vcpu, srcgpa, &val, opsize, arg);
			if (error)
				goto done;

			error = memwrite(vcpu, dstgpa, val, opsize, arg);
			if (error)
				goto done;
		}
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vcpu);
	}
done:
	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
	    __func__, error));
	return (error);
}

static int
emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused,
    mem_region_write_t memwrite, void *arg)
{
	int error, opsize, repeat;
	uint64_t val;
	uint64_t rcx, rdi, rflags;

	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
	KASSERT(!error, ("%s: error %d getting rax", __func__, error));

	error = memwrite(vcpu, gpa, val, opsize, arg);
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D)
		rdi -= opsize;
	else
		rdi += opsize;

	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vcpu);
	}

	return (0);
}

static int
emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vcpu, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = memwrite(vcpu, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x0B:
		/*
		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 0b/r		or r16, r/m16
		 * 0b/r		or r32, r/m32
		 * REX.W + 0b/r	or r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 | val2;
		error = vie_update_register(vcpu, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vcpu, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t regop, memop, op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x39:
	case 0x3B:
		/*
		 * 39/r		CMP r/m16, r16
		 * 39/r		CMP r/m32, r32
		 * REX.W 39/r	CMP r/m64, r64
		 *
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare the first operand with the second operand and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = memread(vcpu, gpa, &memop, size, arg);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results.  The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 *
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = memread(vcpu, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = memread(vcpu, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite __unused, void *arg)
{
	uint64_t src1, src2, dst, rflags;
	unsigned start, len, size;
	int error;

	size = vie->opsize;
	error = EINVAL;

	/*
	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
	 *
	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
	 * Vex.vvvv.
	 *
	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
	 */
	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
		size = 4;

	/*
	 * Extracts contiguous bits from the first /source/ operand (second
	 * operand) using an index and length specified in the second /source/
	 * operand (third operand).
	 */
	error = memread(vcpu, gpa, &src1, size, arg);
	if (error)
		return (error);
	error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2);
	if (error)
		return (error);
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	start = (src2 & 0xff);
	len = (src2 & 0xff00) >> 8;

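	/*
	 * For example, src2 == 0x0804 requests len = 8 bits starting at
	 * bit 4, so the extraction below yields (src1 >> 4) & 0xff.
	 */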
	/* If no bits are extracted, the destination register is cleared. */
	dst = 0;

	/* If START exceeds the operand size, no bits are extracted. */
	if (start > size * 8)
		goto done;
	/* Length is bounded by both the destination size and start offset. */
	if (start + len > size * 8)
		len = (size * 8) - start;
	if (len == 0)
		goto done;

	if (start > 0)
		src1 = (src1 >> start);
	if (len < 64)
		src1 = src1 & ((1ull << len) - 1);
	dst = src1;

done:
	error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size);
	if (error)
		return (error);

	/*
	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	if (dst == 0)
		rflags |= PSL_Z;
	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags,
	    8);
	return (error);
}

static int
emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x03:
		/*
		 * ADD r/m to r and store the result in r
		 *
		 * 03/r			ADD r16, r/m16
		 * 03/r			ADD r32, r/m32
		 * REX.W + 03/r		ADD r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 + val2;
		error = vie_update_register(vcpu, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getaddflags(size, val1, val2);
		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vcpu, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, fault, size, stackaddrsize, pushop;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vcpu, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vcpu, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vcpu, 0);
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
	    &fault);
	if (error || fault)
		return (error);

	if (pushop) {
		error = memread(vcpu, mmio_gpa, &val, size, arg);
		if (error == 0)
			vm_copyout(&val, copyinfo, size);
	} else {
		vm_copyin(copyinfo, &val, size);
		error = memwrite(vcpu, mmio_gpa, val, size, arg);
		rsp += size;
	}
	vm_copy_teardown(copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}

static int
emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging __unused, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

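	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2: for group 1
	 * the operation is selected by ModRM:reg.  Only OR (/1), AND (/4)
	 * and CMP (/7) are emulated here.
	 */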
	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = emulate_or(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x4:	/* AND */
		error = emulate_and(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x7:	/* CMP */
		error = emulate_cmp(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static int
emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused,
    void *memarg)
{
	uint64_t val, rflags;
	int error, bitmask, bitoff;

	/*
	 * 0F BA is a Group 8 extended opcode.
	 *
	 * Currently we only emulate the 'Bit Test' instruction which is
	 * identified by a ModR/M:reg encoding of 100b.
	 */
	if ((vie->reg & 7) != 4)
		return (EINVAL);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = memread(vcpu, gpa, &val, vie->opsize, memarg);
	if (error)
		return (error);

	/*
	 * Intel SDM, Vol 2, Table 3-2:
	 * "Range of Bit Positions Specified by Bit Offset Operands"
	 */
	bitmask = vie->opsize * 8 - 1;
	bitoff = vie->immediate & bitmask;

	/* Copy the bit into the Carry flag in %rflags */
	if (val & (1UL << bitoff))
		rflags |= PSL_C;
	else
		rflags &= ~PSL_C;

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));

	return (0);
}

static int
emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused,
    void *memarg)
{
	int error;
	uint64_t buf;

	switch (vie->reg & 7) {
	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
		if (vie->mod == 0x3) {
			/*
			 * SFENCE.  Ignore it, VM exit provides enough
			 * barriers on its own.
			 */
			error = 0;
		} else {
			/*
			 * CLFLUSH, CLFLUSHOPT.  Only check for access
			 * rights.
			 */
			error = memread(vcpu, gpa, &buf, 1, memarg);
		}
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_GROUP1:
		error = emulate_group1(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_POP:
		error = emulate_pop(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVS:
		error = emulate_movs(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_STOS:
		error = emulate_stos(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BITTEST:
		error = emulate_bittest(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_TWOB_GRP15:
		error = emulate_twob_group15(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_ADD:
		error = emulate_add(vcpu, gpa, vie, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_TEST:
		error = emulate_test(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BEXTR:
		error = emulate_bextr(vcpu, gpa, vie, paging,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

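	/* #AC is raised only at CPL 3 with both CR0.AM and RFLAGS.AC set. */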
	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of bit 47 in 'gla' must be replicated in the
	 * most significant 16 bits for the address to be canonical.
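	 *
	 * e.g., 0x0000800000000000 (bit 47 set, bits 63:48 clear) is
	 * non-canonical, while 0xffff800000000000 is canonical.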
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

1883 int
vie_calculate_gla(enum vm_cpu_mode cpu_mode,enum vm_reg_name seg,struct seg_desc * desc,uint64_t offset,int length,int addrsize,int prot,uint64_t * gla)1884 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1885 struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1886 int prot, uint64_t *gla)
1887 {
1888 uint64_t firstoff, low_limit, high_limit, segbase;
1889 int glasize, type;
1890
1891 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1892 ("%s: invalid segment %d", __func__, seg));
1893 KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1894 ("%s: invalid operand size %d", __func__, length));
1895 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1896 ("%s: invalid prot %#x", __func__, prot));
1897
1898 firstoff = offset;
1899 if (cpu_mode == CPU_MODE_64BIT) {
1900 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1901 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1902 glasize = 8;
1903 } else {
1904 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1905 "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1906 glasize = 4;
1907 /*
1908 * If the segment selector is loaded with a NULL selector
1909 * then the descriptor is unusable and attempting to use
1910 * it results in a #GP(0).
1911 */
1912 if (SEG_DESC_UNUSABLE(desc->access))
1913 return (-1);
1914
1915 /*
1916 * The processor generates a #NP exception when a segment
1917 * register is loaded with a selector that points to a
1918 * descriptor that is not present. If this was the case then
1919 * it would have been checked before the VM-exit.
1920 */
1921 KASSERT(SEG_DESC_PRESENT(desc->access),
1922 ("segment %d not present: %#x", seg, desc->access));
1923
1924 /*
1925 * The descriptor type must indicate a code/data segment.
1926 */
1927 type = SEG_DESC_TYPE(desc->access);
1928 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1929 "descriptor type %#x", seg, type));
1930
1931 if (prot & PROT_READ) {
1932 /* #GP on a read access to a exec-only code segment */
1933 if ((type & 0xA) == 0x8)
1934 return (-1);
1935 }
1936
1937 if (prot & PROT_WRITE) {
1938 /*
1939 * #GP on a write access to a code segment or a
1940 * read-only data segment.
1941 */
1942 if (type & 0x8) /* code segment */
1943 return (-1);
1944
1945 if ((type & 0xA) == 0) /* read-only data seg */
1946 return (-1);
1947 }
1948
1949 /*
1950 * 'desc->limit' is fully expanded taking granularity into
1951 * account.
1952 */
1953 if ((type & 0xC) == 0x4) {
1954 /* expand-down data segment */
1955 low_limit = desc->limit + 1;
1956 high_limit = SEG_DESC_DEF32(desc->access) ?
1957 0xffffffff : 0xffff;
1958 } else {
1959 /* code segment or expand-up data segment */
1960 low_limit = 0;
1961 high_limit = desc->limit;
1962 }
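/*
 * Illustrative example: an expand-down data segment with an
 * expanded limit of 0xfff and DEF32 set accepts offsets
 * 0x1000 through 0xffffffff, whereas an expand-up segment
 * with the same limit accepts offsets 0 through 0xfff.
 */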
1963
1964 while (length > 0) {
1965 offset &= vie_size2mask(addrsize);
1966 if (offset < low_limit || offset > high_limit)
1967 return (-1);
1968 offset++;
1969 length--;
1970 }
1971 }
1972
1973 /*
1974 * In 64-bit mode all segments except %fs and %gs have a segment
1975 * base address of 0.
1976 */
1977 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1978 seg != VM_REG_GUEST_GS) {
1979 segbase = 0;
1980 } else {
1981 segbase = desc->base;
1982 }
1983
1984 /*
1985 * Truncate 'firstoff' to the effective address size before adding
1986 * it to the segment base.
1987 */
1988 firstoff &= vie_size2mask(addrsize);
1989 *gla = (segbase + firstoff) & vie_size2mask(glasize);
1990 return (0);
1991 }
1992
1993 /*
1994 * Prepare a partially decoded vie for a 2nd attempt.
1995 */
1996 void
1997 vie_restart(struct vie *vie)
1998 {
1999 _Static_assert(
2000 offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
2001 offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
2002 "restart should not erase instruction length or contents");
2003
2004 memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
2005 sizeof(*vie) - offsetof(struct vie, vie_startzero));
2006
2007 vie->base_register = VM_REG_LAST;
2008 vie->index_register = VM_REG_LAST;
2009 vie->segment_register = VM_REG_LAST;
2010 }
2011
2012 void
2013 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
2014 {
2015 KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
2016 ("%s: invalid instruction length (%d)", __func__, inst_length));
2017
2018 vie_restart(vie);
2019 memset(vie->inst, 0, sizeof(vie->inst));
2020 if (inst_length != 0)
2021 memcpy(vie->inst, inst_bytes, inst_length);
2022 vie->num_valid = inst_length;
2023 }
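/*
 * Sketch of the typical userspace call sequence, assuming the caller
 * already has the instruction bytes and CPU mode:
 *
 *	struct vie vie;
 *
 *	vie_init(&vie, inst_bytes, inst_length);
 *	if (vmm_decode_instruction(cpu_mode, cs_d, &vie) != 0)
 *		return (EINVAL);
 */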
2024
2025 #ifdef _KERNEL
2026 static int
2027 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2028 {
2029 int error_code = 0;
2030
2031 if (pte & PG_V)
2032 error_code |= PGEX_P;
2033 if (prot & VM_PROT_WRITE)
2034 error_code |= PGEX_W;
2035 if (usermode)
2036 error_code |= PGEX_U;
2037 if (rsvd)
2038 error_code |= PGEX_RSV;
2039 if (prot & VM_PROT_EXECUTE)
2040 error_code |= PGEX_I;
2041
2042 return (error_code);
2043 }
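/*
 * Illustrative example: a user-mode write that hits a present but
 * read-only mapping yields PGEX_P | PGEX_W | PGEX_U, i.e. the familiar
 * error code 0x7 delivered with the injected #PF.
 */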
2044
2045 static void
2046 ptp_release(void **cookie)
2047 {
2048 if (*cookie != NULL) {
2049 vm_gpa_release(*cookie);
2050 *cookie = NULL;
2051 }
2052 }
2053
2054 static void *
2055 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
2056 {
2057 void *ptr;
2058
2059 ptp_release(cookie);
2060 ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
2061 return (ptr);
2062 }
2063
2064 static int
2065 _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2066 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
2067 {
2068 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
2069 u_int retries;
2070 uint64_t *ptpbase, ptpphys, pte, pgsize;
2071 uint32_t *ptpbase32, pte32;
2072 void *cookie;
2073
2074 *guest_fault = 0;
2075
2076 usermode = (paging->cpl == 3 ? 1 : 0);
2077 writable = prot & VM_PROT_WRITE;
2078 cookie = NULL;
2079 retval = 0;
2080 retries = 0;
2081 restart:
2082 ptpphys = paging->cr3; /* root of the page tables */
2083 ptp_release(&cookie);
2084 if (retries++ > 0)
2085 maybe_yield();
2086
2087 if (vie_canonical_check(paging->cpu_mode, gla)) {
2088 /*
2089 * XXX assuming a non-stack reference; otherwise a stack fault
2090 * should be generated.
2091 */
2092 if (!check_only)
2093 vm_inject_gp(vcpu);
2094 goto fault;
2095 }
2096
2097 if (paging->paging_mode == PAGING_MODE_FLAT) {
2098 *gpa = gla;
2099 goto done;
2100 }
2101
2102 if (paging->paging_mode == PAGING_MODE_32) {
2103 nlevels = 2;
2104 while (--nlevels >= 0) {
2105 /* Zero out the lower 12 bits. */
2106 ptpphys &= ~0xfff;
2107
2108 ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
2109 &cookie);
2110
2111 if (ptpbase32 == NULL)
2112 goto error;
2113
2114 ptpshift = PAGE_SHIFT + nlevels * 10;
2115 ptpindex = (gla >> ptpshift) & 0x3FF;
2116 pgsize = 1UL << ptpshift;
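/*
 * With two levels, nlevels is 1 for the page directory
 * (ptpshift 22, 4MB pages) and 0 for the page table
 * (ptpshift 12, 4KB pages).
 */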
2117
2118 pte32 = ptpbase32[ptpindex];
2119
2120 if ((pte32 & PG_V) == 0 ||
2121 (usermode && (pte32 & PG_U) == 0) ||
2122 (writable && (pte32 & PG_RW) == 0)) {
2123 if (!check_only) {
2124 pfcode = pf_error_code(usermode, prot, 0,
2125 pte32);
2126 vm_inject_pf(vcpu, pfcode, gla);
2127 }
2128 goto fault;
2129 }
2130
2131 /*
2132 * Emulate the x86 MMU's management of the accessed
2133 * and dirty flags. While the accessed flag is set
2134 * at every level of the page table, the dirty flag
2135 * is only set at the last level providing the guest
2136 * physical address.
2137 */
2138 if (!check_only && (pte32 & PG_A) == 0) {
2139 if (atomic_cmpset_32(&ptpbase32[ptpindex],
2140 pte32, pte32 | PG_A) == 0) {
2141 goto restart;
2142 }
2143 }
2144
2145 /* XXX must be ignored if CR4.PSE=0 */
2146 if (nlevels > 0 && (pte32 & PG_PS) != 0)
2147 break;
2148
2149 ptpphys = pte32;
2150 }
2151
2152 /* Set the dirty bit in the page table entry if necessary */
2153 if (!check_only && writable && (pte32 & PG_M) == 0) {
2154 if (atomic_cmpset_32(&ptpbase32[ptpindex],
2155 pte32, pte32 | PG_M) == 0) {
2156 goto restart;
2157 }
2158 }
2159
2160 /* Zero out the lower 'ptpshift' bits */
2161 pte32 >>= ptpshift; pte32 <<= ptpshift;
2162 *gpa = pte32 | (gla & (pgsize - 1));
2163 goto done;
2164 }
2165
2166 if (paging->paging_mode == PAGING_MODE_PAE) {
2167 /* Zero out the lower 5 bits and the upper 32 bits */
2168 ptpphys &= 0xffffffe0UL;
2169
2170 ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
2171 &cookie);
2172 if (ptpbase == NULL)
2173 goto error;
2174
2175 ptpindex = (gla >> 30) & 0x3;
2176
2177 pte = ptpbase[ptpindex];
2178
2179 if ((pte & PG_V) == 0) {
2180 if (!check_only) {
2181 pfcode = pf_error_code(usermode, prot, 0, pte);
2182 vm_inject_pf(vcpu, pfcode, gla);
2183 }
2184 goto fault;
2185 }
2186
2187 ptpphys = pte;
2188
2189 nlevels = 2;
2190 } else if (paging->paging_mode == PAGING_MODE_64_LA57) {
2191 nlevels = 5;
2192 } else {
2193 nlevels = 4;
2194 }
2195
2196 while (--nlevels >= 0) {
2197 /* Zero out the lower 12 bits and the upper 12 bits */
2198 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
2199
2200 ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
2201 if (ptpbase == NULL)
2202 goto error;
2203
2204 ptpshift = PAGE_SHIFT + nlevels * 9;
2205 ptpindex = (gla >> ptpshift) & 0x1FF;
2206 pgsize = 1UL << ptpshift;
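/*
 * Each level consumes 9 bits of the gla: nlevels 3, 2, 1, 0
 * give ptpshift 39, 30, 21 and 12 respectively (48 for the
 * extra LA57 level), i.e. 512GB, 1GB, 2MB and 4KB regions.
 */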
2207
2208 pte = ptpbase[ptpindex];
2209
2210 if ((pte & PG_V) == 0 ||
2211 (usermode && (pte & PG_U) == 0) ||
2212 (writable && (pte & PG_RW) == 0)) {
2213 if (!check_only) {
2214 pfcode = pf_error_code(usermode, prot, 0, pte);
2215 vm_inject_pf(vcpu, pfcode, gla);
2216 }
2217 goto fault;
2218 }
2219
2220 /* Set the accessed bit in the page table entry */
2221 if (!check_only && (pte & PG_A) == 0) {
2222 if (atomic_cmpset_64(&ptpbase[ptpindex],
2223 pte, pte | PG_A) == 0) {
2224 goto restart;
2225 }
2226 }
2227
2228 if (nlevels > 0 && (pte & PG_PS) != 0) {
2229 if (pgsize > 1 * GB) {
2230 if (!check_only) {
2231 pfcode = pf_error_code(usermode, prot, 1,
2232 pte);
2233 vm_inject_pf(vcpu, pfcode, gla);
2234 }
2235 goto fault;
2236 }
2237 break;
2238 }
2239
2240 ptpphys = pte;
2241 }
2242
2243 /* Set the dirty bit in the page table entry if necessary */
2244 if (!check_only && writable && (pte & PG_M) == 0) {
2245 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2246 goto restart;
2247 }
2248
2249 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2250 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2251 *gpa = pte | (gla & (pgsize - 1));
2252 done:
2253 ptp_release(&cookie);
2254 KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2255 __func__, retval));
2256 return (retval);
2257 error:
2258 retval = EFAULT;
2259 goto done;
2260 fault:
2261 *guest_fault = 1;
2262 goto done;
2263 }
2264
2265 int
2266 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2267 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2268 {
2269
2270 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2271 false));
2272 }
2273
2274 int
2275 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
2276 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2277 {
2278
2279 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2280 true));
2281 }
2282
2283 int
2284 vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
2285 uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2286 {
2287 struct vm_copyinfo copyinfo[2];
2288 int error, prot;
2289
2290 if (inst_length > VIE_INST_SIZE)
2291 panic("vmm_fetch_instruction: invalid length %d", inst_length);
2292
2293 prot = PROT_READ | PROT_EXEC;
2294 error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
2295 copyinfo, nitems(copyinfo), faultptr);
2296 if (error || *faultptr)
2297 return (error);
2298
2299 vm_copyin(copyinfo, vie->inst, inst_length);
2300 vm_copy_teardown(copyinfo, nitems(copyinfo));
2301 vie->num_valid = inst_length;
2302 return (0);
2303 }
2304 #endif /* _KERNEL */
2305
2306 static int
2307 vie_peek(struct vie *vie, uint8_t *x)
2308 {
2309
2310 if (vie->num_processed < vie->num_valid) {
2311 *x = vie->inst[vie->num_processed];
2312 return (0);
2313 } else
2314 return (-1);
2315 }
2316
2317 static void
2318 vie_advance(struct vie *vie)
2319 {
2320
2321 vie->num_processed++;
2322 }
2323
2324 static bool
2325 segment_override(uint8_t x, int *seg)
2326 {
2327
2328 switch (x) {
2329 case 0x2E:
2330 *seg = VM_REG_GUEST_CS;
2331 break;
2332 case 0x36:
2333 *seg = VM_REG_GUEST_SS;
2334 break;
2335 case 0x3E:
2336 *seg = VM_REG_GUEST_DS;
2337 break;
2338 case 0x26:
2339 *seg = VM_REG_GUEST_ES;
2340 break;
2341 case 0x64:
2342 *seg = VM_REG_GUEST_FS;
2343 break;
2344 case 0x65:
2345 *seg = VM_REG_GUEST_GS;
2346 break;
2347 default:
2348 return (false);
2349 }
2350 return (true);
2351 }
2352
2353 static int
2354 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2355 {
2356 uint8_t x;
2357
2358 while (1) {
2359 if (vie_peek(vie, &x))
2360 return (-1);
2361
2362 if (x == 0x66)
2363 vie->opsize_override = 1;
2364 else if (x == 0x67)
2365 vie->addrsize_override = 1;
2366 else if (x == 0xF3)
2367 vie->repz_present = 1;
2368 else if (x == 0xF2)
2369 vie->repnz_present = 1;
2370 else if (segment_override(x, &vie->segment_register))
2371 vie->segment_override = 1;
2372 else
2373 break;
2374
2375 vie_advance(vie);
2376 }
2377
2378 /*
2379 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2380 * - Only one REX prefix is allowed per instruction.
2381 * - The REX prefix must immediately precede the opcode byte or the
2382 * escape opcode byte.
2383 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2384 * the mandatory prefix must come before the REX prefix.
2385 */
2386 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2387 vie->rex_present = 1;
2388 vie->rex_w = x & 0x8 ? 1 : 0;
2389 vie->rex_r = x & 0x4 ? 1 : 0;
2390 vie->rex_x = x & 0x2 ? 1 : 0;
2391 vie->rex_b = x & 0x1 ? 1 : 0;
2392 vie_advance(vie);
2393 }
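/*
 * Illustrative example: REX prefix 0x48 (0100 1000b) sets only rex_w,
 * selecting a 64-bit operand size, while 0x44 sets only rex_r,
 * extending the ModRM 'reg' field to reach %r8-%r15.
 */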
2394
2395 /*
2396 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
2397 */
2398 if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
2399 && x == 0xC4) {
2400 const struct vie_op *optab;
2401
2402 /* 3-byte VEX prefix. */
2403 vie->vex_present = 1;
2404
2405 vie_advance(vie);
2406 if (vie_peek(vie, &x))
2407 return (-1);
2408
2409 /*
2410 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted
2411 * relative to REX encoding.
2412 */
2413 vie->rex_r = x & 0x80 ? 0 : 1;
2414 vie->rex_x = x & 0x40 ? 0 : 1;
2415 vie->rex_b = x & 0x20 ? 0 : 1;
2416
2417 switch (x & 0x1F) {
2418 case 0x2:
2419 /* 0F 38. */
2420 optab = three_byte_opcodes_0f38;
2421 break;
2422 case 0x1:
2423 /* 0F class - nothing handled here yet. */
2424 /* FALLTHROUGH */
2425 case 0x3:
2426 /* 0F 3A class - nothing handled here yet. */
2427 /* FALLTHROUGH */
2428 default:
2429 /* Reserved (#UD). */
2430 return (-1);
2431 }
2432
2433 vie_advance(vie);
2434 if (vie_peek(vie, &x))
2435 return (-1);
2436
2437 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
2438 vie->rex_w = x & 0x80 ? 1 : 0;
2439
2440 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
2441 vie->vex_l = !!(x & 0x4);
2442 vie->vex_pp = (x & 0x3);
2443
2444 /* PP: 1=66 2=F3 3=F2 prefixes. */
2445 switch (vie->vex_pp) {
2446 case 0x1:
2447 vie->opsize_override = 1;
2448 break;
2449 case 0x2:
2450 vie->repz_present = 1;
2451 break;
2452 case 0x3:
2453 vie->repnz_present = 1;
2454 break;
2455 }
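/*
 * Illustrative example: in the encoding C4 E2 70 F7 (BEXTR, the
 * only 0F 38 opcode handled here), byte 0xE2 has R'/X'/B' all set
 * (so rex_r/x/b decode to 0) and mmmmm=00010 selecting the 0F 38
 * map; byte 0x70 has W=0, vvvv=1110 (inverted: register 1, %rcx)
 * and pp=0.
 */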
2456
2457 vie_advance(vie);
2458
2459 /* Opcode, sans literal prefix. */
2460 if (vie_peek(vie, &x))
2461 return (-1);
2462
2463 vie->op = optab[x];
2464 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2465 return (-1);
2466
2467 vie_advance(vie);
2468 }
2469
2470 /*
2471 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2472 */
2473 if (cpu_mode == CPU_MODE_64BIT) {
2474 /*
2475 * Default address size is 64-bits and default operand size
2476 * is 32-bits.
2477 */
2478 vie->addrsize = vie->addrsize_override ? 4 : 8;
2479 if (vie->rex_w)
2480 vie->opsize = 8;
2481 else if (vie->opsize_override)
2482 vie->opsize = 2;
2483 else
2484 vie->opsize = 4;
2485 } else if (cs_d) {
2486 /* Default address and operand sizes are 32-bits */
2487 vie->addrsize = vie->addrsize_override ? 2 : 4;
2488 vie->opsize = vie->opsize_override ? 2 : 4;
2489 } else {
2490 /* Default address and operand sizes are 16-bits */
2491 vie->addrsize = vie->addrsize_override ? 4 : 2;
2492 vie->opsize = vie->opsize_override ? 4 : 2;
2493 }
2494 return (0);
2495 }
2496
2497 static int
2498 decode_two_byte_opcode(struct vie *vie)
2499 {
2500 uint8_t x;
2501
2502 if (vie_peek(vie, &x))
2503 return (-1);
2504
2505 vie->op = two_byte_opcodes[x];
2506
2507 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2508 return (-1);
2509
2510 vie_advance(vie);
2511 return (0);
2512 }
2513
2514 static int
2515 decode_opcode(struct vie *vie)
2516 {
2517 uint8_t x;
2518
2519 if (vie_peek(vie, &x))
2520 return (-1);
2521
2522 /* Already did this via VEX prefix. */
2523 if (vie->op.op_type != VIE_OP_TYPE_NONE)
2524 return (0);
2525
2526 vie->op = one_byte_opcodes[x];
2527
2528 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2529 return (-1);
2530
2531 vie_advance(vie);
2532
2533 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2534 return (decode_two_byte_opcode(vie));
2535
2536 return (0);
2537 }
2538
2539 static int
2540 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2541 {
2542 uint8_t x;
2543
2544 if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2545 return (0);
2546
2547 if (cpu_mode == CPU_MODE_REAL)
2548 return (-1);
2549
2550 if (vie_peek(vie, &x))
2551 return (-1);
2552
2553 vie->mod = (x >> 6) & 0x3;
2554 vie->rm = (x >> 0) & 0x7;
2555 vie->reg = (x >> 3) & 0x7;
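/*
 * Illustrative example: ModRM byte 0x51 (01 010 001b) decodes to
 * mod=1 (a disp8 follows), reg=2 and rm=1.
 */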
2556
2557 /*
2558 * A direct addressing mode makes no sense in the context of an EPT
2559 * fault. There has to be a memory access involved to cause the
2560 * EPT fault.
2561 */
2562 if (vie->mod == VIE_MOD_DIRECT)
2563 return (-1);
2564
2565 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2566 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2567 /*
2568 * Table 2-5: Special Cases of REX Encodings
2569 *
2570 * mod=0, r/m=5 is used in the compatibility mode to
2571 * indicate a disp32 without a base register.
2572 *
2573 * mod!=3, r/m=4 is used in the compatibility mode to
2574 * indicate that the SIB byte is present.
2575 *
2576 * The 'b' bit in the REX prefix is don't care in
2577 * this case.
2578 */
2579 } else {
2580 vie->rm |= (vie->rex_b << 3);
2581 }
2582
2583 vie->reg |= (vie->rex_r << 3);
2584
2585 /* SIB */
2586 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2587 goto done;
2588
2589 vie->base_register = gpr_map[vie->rm];
2590
2591 switch (vie->mod) {
2592 case VIE_MOD_INDIRECT_DISP8:
2593 vie->disp_bytes = 1;
2594 break;
2595 case VIE_MOD_INDIRECT_DISP32:
2596 vie->disp_bytes = 4;
2597 break;
2598 case VIE_MOD_INDIRECT:
2599 if (vie->rm == VIE_RM_DISP32) {
2600 vie->disp_bytes = 4;
2601 /*
2602 * Table 2-7. RIP-Relative Addressing
2603 *
2604 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2605 * whereas in compatibility mode it just implies disp32.
2606 */
2607
2608 if (cpu_mode == CPU_MODE_64BIT)
2609 vie->base_register = VM_REG_GUEST_RIP;
2610 else
2611 vie->base_register = VM_REG_LAST;
2612 }
2613 break;
2614 }
2615
2616 done:
2617 vie_advance(vie);
2618
2619 return (0);
2620 }
2621
2622 static int
2623 decode_sib(struct vie *vie)
2624 {
2625 uint8_t x;
2626
2627 /* Proceed only if SIB byte is present */
2628 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2629 return (0);
2630
2631 if (vie_peek(vie, &x))
2632 return (-1);
2633
2634 /* De-construct the SIB byte */
2635 vie->ss = (x >> 6) & 0x3;
2636 vie->index = (x >> 3) & 0x7;
2637 vie->base = (x >> 0) & 0x7;
2638
2639 /* Apply the REX prefix modifiers */
2640 vie->index |= vie->rex_x << 3;
2641 vie->base |= vie->rex_b << 3;
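/*
 * Illustrative example: SIB byte 0x4D (01 001 101b) decodes to ss=1
 * (scale 2), index=1 and base=5; with mod=0 that base encoding means
 * "no base register, disp32 follows" (see below).
 */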
2642
2643 switch (vie->mod) {
2644 case VIE_MOD_INDIRECT_DISP8:
2645 vie->disp_bytes = 1;
2646 break;
2647 case VIE_MOD_INDIRECT_DISP32:
2648 vie->disp_bytes = 4;
2649 break;
2650 }
2651
2652 if (vie->mod == VIE_MOD_INDIRECT &&
2653 (vie->base == 5 || vie->base == 13)) {
2654 /*
2655 * Special case when base register is unused if mod = 0
2656 * and base = %rbp or %r13.
2657 *
2658 * Documented in:
2659 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2660 * Table 2-5: Special Cases of REX Encodings
2661 */
2662 vie->disp_bytes = 4;
2663 } else {
2664 vie->base_register = gpr_map[vie->base];
2665 }
2666
2667 /*
2668 * All encodings of 'index' are valid except for %rsp (4).
2669 *
2670 * Documented in:
2671 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2672 * Table 2-5: Special Cases of REX Encodings
2673 */
2674 if (vie->index != 4)
2675 vie->index_register = gpr_map[vie->index];
2676
2677 /* 'scale' makes sense only in the context of an index register */
2678 if (vie->index_register < VM_REG_LAST)
2679 vie->scale = 1 << vie->ss;
2680
2681 vie_advance(vie);
2682
2683 return (0);
2684 }
2685
2686 static int
2687 decode_displacement(struct vie *vie)
2688 {
2689 int n, i;
2690 uint8_t x;
2691
2692 union {
2693 char buf[4];
2694 int8_t signed8;
2695 int32_t signed32;
2696 } u;
2697
2698 if ((n = vie->disp_bytes) == 0)
2699 return (0);
2700
2701 if (n != 1 && n != 4)
2702 panic("decode_displacement: invalid disp_bytes %d", n);
2703
2704 for (i = 0; i < n; i++) {
2705 if (vie_peek(vie, &x))
2706 return (-1);
2707
2708 u.buf[i] = x;
2709 vie_advance(vie);
2710 }
2711
2712 if (n == 1)
2713 vie->displacement = u.signed8; /* sign-extended */
2714 else
2715 vie->displacement = u.signed32; /* sign-extended */
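/*
 * Illustrative example: a disp8 byte of 0xf8 is sign-extended to a
 * displacement of -8.
 */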
2716
2717 return (0);
2718 }
2719
2720 static int
2721 decode_immediate(struct vie *vie)
2722 {
2723 int i, n;
2724 uint8_t x;
2725 union {
2726 char buf[4];
2727 int8_t signed8;
2728 int16_t signed16;
2729 int32_t signed32;
2730 } u;
2731
2732 /* Figure out immediate operand size (if any) */
2733 if (vie->op.op_flags & VIE_OP_F_IMM) {
2734 /*
2735 * Section 2.2.1.5 "Immediates", Intel SDM:
2736 * In 64-bit mode the typical size of immediate operands
2737 * remains 32-bits. When the operand size is 64-bits, the
2738 * processor sign-extends all immediates to 64-bits prior
2739 * to their use.
2740 */
2741 if (vie->opsize == 4 || vie->opsize == 8)
2742 vie->imm_bytes = 4;
2743 else
2744 vie->imm_bytes = 2;
2745 } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2746 vie->imm_bytes = 1;
2747 }
2748
2749 if ((n = vie->imm_bytes) == 0)
2750 return (0);
2751
2752 KASSERT(n == 1 || n == 2 || n == 4,
2753 ("%s: invalid number of immediate bytes: %d", __func__, n));
2754
2755 for (i = 0; i < n; i++) {
2756 if (vie_peek(vie, &x))
2757 return (-1);
2758
2759 u.buf[i] = x;
2760 vie_advance(vie);
2761 }
2762
2763 /* sign-extend the immediate value before use */
2764 if (n == 1)
2765 vie->immediate = u.signed8;
2766 else if (n == 2)
2767 vie->immediate = u.signed16;
2768 else
2769 vie->immediate = u.signed32;
2770
2771 return (0);
2772 }
2773
2774 static int
2775 decode_moffset(struct vie *vie)
2776 {
2777 int i, n;
2778 uint8_t x;
2779 union {
2780 char buf[8];
2781 uint64_t u64;
2782 } u;
2783
2784 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2785 return (0);
2786
2787 /*
2788 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2789 * The memory offset size follows the address-size of the instruction.
2790 */
2791 n = vie->addrsize;
2792 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2793
2794 u.u64 = 0;
2795 for (i = 0; i < n; i++) {
2796 if (vie_peek(vie, &x))
2797 return (-1);
2798
2799 u.buf[i] = x;
2800 vie_advance(vie);
2801 }
2802 vie->displacement = u.u64;
2803 return (0);
2804 }
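/*
 * Illustrative example: in 64-bit mode (without an address-size
 * override) a direct-offset MOV encodes an 8-byte offset, so 'n' is 8
 * here and the whole offset lands in vie->displacement.
 */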
2805
2806 #ifdef _KERNEL
2807 /*
2808 * Verify that the 'guest linear address' provided as collateral of the nested
2809 * page table fault matches with our instruction decoding.
2810 */
2811 static int
2812 verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
2813 enum vm_cpu_mode cpu_mode)
2814 {
2815 int error;
2816 uint64_t base, segbase, idx, gla2;
2817 enum vm_reg_name seg;
2818 struct seg_desc desc;
2819
2820 /* Skip 'gla' verification */
2821 if (gla == VIE_INVALID_GLA)
2822 return (0);
2823
2824 base = 0;
2825 if (vie->base_register != VM_REG_LAST) {
2826 error = vm_get_register(vcpu, vie->base_register, &base);
2827 if (error) {
2828 printf("verify_gla: error %d getting base reg %d\n",
2829 error, vie->base_register);
2830 return (-1);
2831 }
2832
2833 /*
2834 * RIP-relative addressing starts from the following
2835 * instruction
2836 */
2837 if (vie->base_register == VM_REG_GUEST_RIP)
2838 base += vie->num_processed;
2839 }
2840
2841 idx = 0;
2842 if (vie->index_register != VM_REG_LAST) {
2843 error = vm_get_register(vcpu, vie->index_register, &idx);
2844 if (error) {
2845 printf("verify_gla: error %d getting index reg %d\n",
2846 error, vie->index_register);
2847 return (-1);
2848 }
2849 }
2850
2851 /*
2852 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2853 *
2854 * In 64-bit mode, segmentation is generally (but not
2855 * completely) disabled. The exceptions are the FS and GS
2856 * segments.
2857 *
2858 * In legacy IA-32 mode, when the ESP or EBP register is used
2859 * as the base, the SS segment is the default segment. For
2860 * other data references, except those relative to the stack or
2861 * a string destination, the DS segment is the default. These
2862 * can be overridden to allow other segments to be accessed.
2863 */
2864 if (vie->segment_override)
2865 seg = vie->segment_register;
2866 else if (vie->base_register == VM_REG_GUEST_RSP ||
2867 vie->base_register == VM_REG_GUEST_RBP)
2868 seg = VM_REG_GUEST_SS;
2869 else
2870 seg = VM_REG_GUEST_DS;
2871 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2872 seg != VM_REG_GUEST_GS) {
2873 segbase = 0;
2874 } else {
2875 error = vm_get_seg_desc(vcpu, seg, &desc);
2876 if (error) {
2877 printf("verify_gla: error %d getting segment"
2878 " descriptor %d", error,
2879 vie->segment_register);
2880 return (-1);
2881 }
2882 segbase = desc.base;
2883 }
2884
2885 gla2 = segbase + base + vie->scale * idx + vie->displacement;
2886 gla2 &= size2mask[vie->addrsize];
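/*
 * Worked example: with segbase 0, base 0x1000, scale 4, idx 2 and
 * displacement 8, gla2 is 0x1010 (0x1000 + 4*2 + 8) before being
 * truncated to the effective address size.
 */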
2887 if (gla != gla2) {
2888 printf("verify_gla mismatch: segbase(0x%0lx)"
2889 "base(0x%0lx), scale(%d), index(0x%0lx), "
2890 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2891 segbase, base, vie->scale, idx, vie->displacement,
2892 gla, gla2);
2893 return (-1);
2894 }
2895
2896 return (0);
2897 }
2898 #endif /* _KERNEL */
2899
2900 int
2901 #ifdef _KERNEL
2902 vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
2903 enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2904 #else
2905 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2906 #endif
2907 {
2908
2909 if (decode_prefixes(vie, cpu_mode, cs_d))
2910 return (-1);
2911
2912 if (decode_opcode(vie))
2913 return (-1);
2914
2915 if (decode_modrm(vie, cpu_mode))
2916 return (-1);
2917
2918 if (decode_sib(vie))
2919 return (-1);
2920
2921 if (decode_displacement(vie))
2922 return (-1);
2923
2924 if (decode_immediate(vie))
2925 return (-1);
2926
2927 if (decode_moffset(vie))
2928 return (-1);
2929
2930 #ifdef _KERNEL
2931 if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2932 if (verify_gla(vcpu, gla, vie, cpu_mode))
2933 return (-1);
2934 }
2935 #endif
2936
2937 vie->decoded = 1; /* success */
2938
2939 return (0);
2940 }
2941