1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2012 Sandvine, Inc.
5 * Copyright (c) 2012 NetApp, Inc.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 #ifdef _KERNEL
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/proc.h>
36
37 #include <vm/vm.h>
38 #include <vm/pmap.h>
39
40 #include <machine/vmparam.h>
41 #include <machine/vmm.h>
42
43 #include <dev/vmm/vmm_mem.h>
44 #else /* !_KERNEL */
45 #include <sys/types.h>
46 #include <sys/errno.h>
47 #include <sys/_iovec.h>
48
49 #include <machine/vmm.h>
50
51 #include <err.h>
52 #include <assert.h>
53 #include <stdbool.h>
54 #include <stddef.h>
55 #include <stdio.h>
56 #include <string.h>
57 #include <strings.h>
58 #include <vmmapi.h>
59 #define __diagused
60 #define KASSERT(exp,msg) assert((exp))
61 #define panic(...) errx(4, __VA_ARGS__)
62 #endif /* _KERNEL */
63
64 #include <machine/vmm_instruction_emul.h>
65 #include <x86/psl.h>
66 #include <x86/specialreg.h>
67
68 /* struct vie_op.op_type */
69 enum {
70 VIE_OP_TYPE_NONE = 0,
71 VIE_OP_TYPE_MOV,
72 VIE_OP_TYPE_MOVSX,
73 VIE_OP_TYPE_MOVZX,
74 VIE_OP_TYPE_AND,
75 VIE_OP_TYPE_OR,
76 VIE_OP_TYPE_SUB,
77 VIE_OP_TYPE_TWO_BYTE,
78 VIE_OP_TYPE_PUSH,
79 VIE_OP_TYPE_CMP,
80 VIE_OP_TYPE_POP,
81 VIE_OP_TYPE_MOVS,
82 VIE_OP_TYPE_GROUP1,
83 VIE_OP_TYPE_STOS,
84 VIE_OP_TYPE_BITTEST,
85 VIE_OP_TYPE_TWOB_GRP15,
86 VIE_OP_TYPE_ADD,
87 VIE_OP_TYPE_TEST,
88 VIE_OP_TYPE_BEXTR,
89 VIE_OP_TYPE_LAST
90 };
91
92 /* struct vie_op.op_flags */
93 #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
94 #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
95 #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
96 #define VIE_OP_F_NO_MODRM (1 << 3) /* no ModR/M byte follows the opcode */
97 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) /* skip guest linear address check */
98
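/*
 * Opcode dispatch tables, indexed by opcode byte.  Entries that are not
 * explicitly initialized default to VIE_OP_TYPE_NONE and such instructions
 * are rejected with EINVAL by vmm_emulate_instruction().
 */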
99 static const struct vie_op three_byte_opcodes_0f38[256] = {
100 [0xF7] = {
101 .op_byte = 0xF7,
102 .op_type = VIE_OP_TYPE_BEXTR,
103 },
104 };
105
106 static const struct vie_op two_byte_opcodes[256] = {
107 [0xAE] = {
108 .op_byte = 0xAE,
109 .op_type = VIE_OP_TYPE_TWOB_GRP15,
110 },
111 [0xB6] = {
112 .op_byte = 0xB6,
113 .op_type = VIE_OP_TYPE_MOVZX,
114 },
115 [0xB7] = {
116 .op_byte = 0xB7,
117 .op_type = VIE_OP_TYPE_MOVZX,
118 },
119 [0xBA] = {
120 .op_byte = 0xBA,
121 .op_type = VIE_OP_TYPE_BITTEST,
122 .op_flags = VIE_OP_F_IMM8,
123 },
124 [0xBE] = {
125 .op_byte = 0xBE,
126 .op_type = VIE_OP_TYPE_MOVSX,
127 },
128 };
129
130 static const struct vie_op one_byte_opcodes[256] = {
131 [0x03] = {
132 .op_byte = 0x03,
133 .op_type = VIE_OP_TYPE_ADD,
134 },
135 [0x0F] = {
136 .op_byte = 0x0F,
137 .op_type = VIE_OP_TYPE_TWO_BYTE
138 },
139 [0x0B] = {
140 .op_byte = 0x0B,
141 .op_type = VIE_OP_TYPE_OR,
142 },
143 [0x2B] = {
144 .op_byte = 0x2B,
145 .op_type = VIE_OP_TYPE_SUB,
146 },
147 [0x39] = {
148 .op_byte = 0x39,
149 .op_type = VIE_OP_TYPE_CMP,
150 },
151 [0x3B] = {
152 .op_byte = 0x3B,
153 .op_type = VIE_OP_TYPE_CMP,
154 },
155 [0x88] = {
156 .op_byte = 0x88,
157 .op_type = VIE_OP_TYPE_MOV,
158 },
159 [0x89] = {
160 .op_byte = 0x89,
161 .op_type = VIE_OP_TYPE_MOV,
162 },
163 [0x8A] = {
164 .op_byte = 0x8A,
165 .op_type = VIE_OP_TYPE_MOV,
166 },
167 [0x8B] = {
168 .op_byte = 0x8B,
169 .op_type = VIE_OP_TYPE_MOV,
170 },
171 [0xA1] = {
172 .op_byte = 0xA1,
173 .op_type = VIE_OP_TYPE_MOV,
174 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
175 },
176 [0xA3] = {
177 .op_byte = 0xA3,
178 .op_type = VIE_OP_TYPE_MOV,
179 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
180 },
181 [0xA4] = {
182 .op_byte = 0xA4,
183 .op_type = VIE_OP_TYPE_MOVS,
184 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
185 },
186 [0xA5] = {
187 .op_byte = 0xA5,
188 .op_type = VIE_OP_TYPE_MOVS,
189 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
190 },
191 [0xAA] = {
192 .op_byte = 0xAA,
193 .op_type = VIE_OP_TYPE_STOS,
194 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
195 },
196 [0xAB] = {
197 .op_byte = 0xAB,
198 .op_type = VIE_OP_TYPE_STOS,
199 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
200 },
201 [0xC6] = {
202 /* XXX Group 11 extended opcode - not just MOV */
203 .op_byte = 0xC6,
204 .op_type = VIE_OP_TYPE_MOV,
205 .op_flags = VIE_OP_F_IMM8,
206 },
207 [0xC7] = {
208 .op_byte = 0xC7,
209 .op_type = VIE_OP_TYPE_MOV,
210 .op_flags = VIE_OP_F_IMM,
211 },
212 [0x23] = {
213 .op_byte = 0x23,
214 .op_type = VIE_OP_TYPE_AND,
215 },
216 [0x80] = {
217 /* Group 1 extended opcode */
218 .op_byte = 0x80,
219 .op_type = VIE_OP_TYPE_GROUP1,
220 .op_flags = VIE_OP_F_IMM8,
221 },
222 [0x81] = {
223 /* Group 1 extended opcode */
224 .op_byte = 0x81,
225 .op_type = VIE_OP_TYPE_GROUP1,
226 .op_flags = VIE_OP_F_IMM,
227 },
228 [0x83] = {
229 /* Group 1 extended opcode */
230 .op_byte = 0x83,
231 .op_type = VIE_OP_TYPE_GROUP1,
232 .op_flags = VIE_OP_F_IMM8,
233 },
234 [0x8F] = {
235 /* XXX Group 1A extended opcode - not just POP */
236 .op_byte = 0x8F,
237 .op_type = VIE_OP_TYPE_POP,
238 },
239 [0xF6] = {
240 /* XXX Group 3 extended opcode - not just TEST */
241 .op_byte = 0xF6,
242 .op_type = VIE_OP_TYPE_TEST,
243 .op_flags = VIE_OP_F_IMM8,
244 },
245 [0xF7] = {
246 /* XXX Group 3 extended opcode - not just TEST */
247 .op_byte = 0xF7,
248 .op_type = VIE_OP_TYPE_TEST,
249 .op_flags = VIE_OP_F_IMM,
250 },
251 [0xFF] = {
252 /* XXX Group 5 extended opcode - not just PUSH */
253 .op_byte = 0xFF,
254 .op_type = VIE_OP_TYPE_PUSH,
255 }
256 };
257
258 /* struct vie.mod */
259 #define VIE_MOD_INDIRECT 0
260 #define VIE_MOD_INDIRECT_DISP8 1
261 #define VIE_MOD_INDIRECT_DISP32 2
262 #define VIE_MOD_DIRECT 3
263
264 /* struct vie.rm */
265 #define VIE_RM_SIB 4
266 #define VIE_RM_DISP32 5
267
268 #define GB (1024 * 1024 * 1024)
269
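/*
 * Map the 4-bit register encoding (ModRM reg/rm extended by REX) to the
 * corresponding VMM register name.
 */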
270 static enum vm_reg_name gpr_map[16] = {
271 VM_REG_GUEST_RAX,
272 VM_REG_GUEST_RCX,
273 VM_REG_GUEST_RDX,
274 VM_REG_GUEST_RBX,
275 VM_REG_GUEST_RSP,
276 VM_REG_GUEST_RBP,
277 VM_REG_GUEST_RSI,
278 VM_REG_GUEST_RDI,
279 VM_REG_GUEST_R8,
280 VM_REG_GUEST_R9,
281 VM_REG_GUEST_R10,
282 VM_REG_GUEST_R11,
283 VM_REG_GUEST_R12,
284 VM_REG_GUEST_R13,
285 VM_REG_GUEST_R14,
286 VM_REG_GUEST_R15
287 };
288
289 static uint64_t size2mask[] = {
290 [1] = 0xff,
291 [2] = 0xffff,
292 [4] = 0xffffffff,
293 [8] = 0xffffffffffffffff,
294 };
295
296 static int
297 vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval)
298 {
299 int error;
300
301 error = vm_get_register(vcpu, reg, rval);
302
303 return (error);
304 }
305
306 static void
307 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
308 {
309 *lhbr = 0;
310 *reg = gpr_map[vie->reg];
311
312 /*
313 * 64-bit mode imposes limitations on accessing legacy high byte
314 * registers (lhbr).
315 *
316 * The legacy high-byte registers cannot be addressed if the REX
317 * prefix is present. In this case the values 4, 5, 6 and 7 of the
318 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
319 *
320 * If the REX prefix is not present then the values 4, 5, 6 and 7
321 * of the 'ModRM:reg' field address the legacy high-byte registers,
322 * %ah, %ch, %dh and %bh respectively.
323 */
324 if (!vie->rex_present) {
325 if (vie->reg & 0x4) {
326 *lhbr = 1;
327 *reg = gpr_map[vie->reg & 0x3];
328 }
329 }
330 }
331
332 static int
333 vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval)
334 {
335 uint64_t val;
336 int error, lhbr;
337 enum vm_reg_name reg;
338
339 vie_calc_bytereg(vie, &reg, &lhbr);
340 error = vm_get_register(vcpu, reg, &val);
341
342 /*
343 * To obtain the value of a legacy high byte register shift the
344 * base register right by 8 bits (%ah = %rax >> 8).
345 */
346 if (lhbr)
347 *rval = val >> 8;
348 else
349 *rval = val;
350 return (error);
351 }
352
353 static int
354 vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte)
355 {
356 uint64_t origval, val, mask;
357 int error, lhbr;
358 enum vm_reg_name reg;
359
360 vie_calc_bytereg(vie, &reg, &lhbr);
361 error = vm_get_register(vcpu, reg, &origval);
362 if (error == 0) {
363 val = byte;
364 mask = 0xff;
365 if (lhbr) {
366 /*
367 * Shift left by 8 to store 'byte' in a legacy high
368 * byte register.
369 */
370 val <<= 8;
371 mask <<= 8;
372 }
373 val |= origval & ~mask;
374 error = vm_set_register(vcpu, reg, val);
375 }
376 return (error);
377 }
378
379 int
380 vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
381 uint64_t val, int size)
382 {
383 int error;
384 uint64_t origval;
385
386 switch (size) {
387 case 1:
388 case 2:
389 error = vie_read_register(vcpu, reg, &origval);
390 if (error)
391 return (error);
392 val &= size2mask[size];
393 val |= origval & ~size2mask[size];
394 break;
395 case 4:
396 val &= 0xffffffffUL;
397 break;
398 case 8:
399 break;
400 default:
401 return (EINVAL);
402 }
403
404 error = vm_set_register(vcpu, reg, val);
405 return (error);
406 }
407
408 #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
409
410 /*
411 * Return the status flags that would result from doing (x - y).
412 */
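/*
 * The trailing 'struct __hack' consumes the semicolon that terminates each
 * GETCC() invocation below; GETADDFLAGS() and GETANDFLAGS() use the same
 * idiom.
 */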
413 #define GETCC(sz) \
414 static u_long \
415 getcc##sz(uint##sz##_t x, uint##sz##_t y) \
416 { \
417 u_long rflags; \
418 \
419 __asm __volatile("sub %2,%1; pushfq; popq %0" : \
420 "=r" (rflags), "+r" (x) : "m" (y)); \
421 return (rflags); \
422 } struct __hack
423
424 GETCC(8);
425 GETCC(16);
426 GETCC(32);
427 GETCC(64);
428
429 static u_long
430 getcc(int opsize, uint64_t x, uint64_t y)
431 {
432 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
433 ("getcc: invalid operand size %d", opsize));
434
435 if (opsize == 1)
436 return (getcc8(x, y));
437 else if (opsize == 2)
438 return (getcc16(x, y));
439 else if (opsize == 4)
440 return (getcc32(x, y));
441 else
442 return (getcc64(x, y));
443 }
444
445 /*
446 * Return the status flags that would result from doing (x + y).
447 */
448 #define GETADDFLAGS(sz) \
449 static u_long \
450 getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \
451 { \
452 u_long rflags; \
453 \
454 __asm __volatile("add %2,%1; pushfq; popq %0" : \
455 "=r" (rflags), "+r" (x) : "m" (y)); \
456 return (rflags); \
457 } struct __hack
458
459 GETADDFLAGS(8);
460 GETADDFLAGS(16);
461 GETADDFLAGS(32);
462 GETADDFLAGS(64);
463
464 static u_long
465 getaddflags(int opsize, uint64_t x, uint64_t y)
466 {
467 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
468 ("getaddflags: invalid operand size %d", opsize));
469
470 if (opsize == 1)
471 return (getaddflags8(x, y));
472 else if (opsize == 2)
473 return (getaddflags16(x, y));
474 else if (opsize == 4)
475 return (getaddflags32(x, y));
476 else
477 return (getaddflags64(x, y));
478 }
479
480 /*
481 * Return the status flags that would result from doing (x & y).
482 */
483 #define GETANDFLAGS(sz) \
484 static u_long \
485 getandflags##sz(uint##sz##_t x, uint##sz##_t y) \
486 { \
487 u_long rflags; \
488 \
489 __asm __volatile("and %2,%1; pushfq; popq %0" : \
490 "=r" (rflags), "+r" (x) : "m" (y)); \
491 return (rflags); \
492 } struct __hack
493
494 GETANDFLAGS(8);
495 GETANDFLAGS(16);
496 GETANDFLAGS(32);
497 GETANDFLAGS(64);
498
499 static u_long
500 getandflags(int opsize, uint64_t x, uint64_t y)
501 {
502 KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
503 ("getandflags: invalid operand size %d", opsize));
504
505 if (opsize == 1)
506 return (getandflags8(x, y));
507 else if (opsize == 2)
508 return (getandflags16(x, y));
509 else if (opsize == 4)
510 return (getandflags32(x, y));
511 else
512 return (getandflags64(x, y));
513 }
514
515 static int
516 emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
517 mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
518 {
519 int error, size;
520 enum vm_reg_name reg;
521 uint8_t byte;
522 uint64_t val;
523
524 size = vie->opsize;
525 error = EINVAL;
526
527 switch (vie->op.op_byte) {
528 case 0x88:
529 /*
530 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
531 * 88/r: mov r/m8, r8
532 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
533 */
534 size = 1; /* override for byte operation */
535 error = vie_read_bytereg(vcpu, vie, &byte);
536 if (error == 0)
537 error = memwrite(vcpu, gpa, byte, size, arg);
538 break;
539 case 0x89:
540 /*
541 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
542 * 89/r: mov r/m16, r16
543 * 89/r: mov r/m32, r32
544 * REX.W + 89/r mov r/m64, r64
545 */
546 reg = gpr_map[vie->reg];
547 error = vie_read_register(vcpu, reg, &val);
548 if (error == 0) {
549 val &= size2mask[size];
550 error = memwrite(vcpu, gpa, val, size, arg);
551 }
552 break;
553 case 0x8A:
554 /*
555 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
556 * 8A/r: mov r8, r/m8
557 * REX + 8A/r: mov r8, r/m8
558 */
559 size = 1; /* override for byte operation */
560 error = memread(vcpu, gpa, &val, size, arg);
561 if (error == 0)
562 error = vie_write_bytereg(vcpu, vie, val);
563 break;
564 case 0x8B:
565 /*
566 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
567 * 8B/r: mov r16, r/m16
568 * 8B/r: mov r32, r/m32
569 * REX.W 8B/r: mov r64, r/m64
570 */
571 error = memread(vcpu, gpa, &val, size, arg);
572 if (error == 0) {
573 reg = gpr_map[vie->reg];
574 error = vie_update_register(vcpu, reg, val, size);
575 }
576 break;
577 case 0xA1:
578 /*
579 * MOV from seg:moffset to AX/EAX/RAX
580 * A1: mov AX, moffs16
581 * A1: mov EAX, moffs32
582 * REX.W + A1: mov RAX, moffs64
583 */
584 error = memread(vcpu, gpa, &val, size, arg);
585 if (error == 0) {
586 reg = VM_REG_GUEST_RAX;
587 error = vie_update_register(vcpu, reg, val, size);
588 }
589 break;
590 case 0xA3:
591 /*
592 * MOV from AX/EAX/RAX to seg:moffset
593 * A3: mov moffs16, AX
594 * A3: mov moffs32, EAX
595 * REX.W + A3: mov moffs64, RAX
596 */
597 error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
598 if (error == 0) {
599 val &= size2mask[size];
600 error = memwrite(vcpu, gpa, val, size, arg);
601 }
602 break;
603 case 0xC6:
604 /*
605 * MOV from imm8 to mem (ModRM:r/m)
606 * C6/0 mov r/m8, imm8
607 * REX + C6/0 mov r/m8, imm8
608 */
609 size = 1; /* override for byte operation */
610 error = memwrite(vcpu, gpa, vie->immediate, size, arg);
611 break;
612 case 0xC7:
613 /*
614 * MOV from imm16/imm32 to mem (ModRM:r/m)
615 * C7/0 mov r/m16, imm16
616 * C7/0 mov r/m32, imm32
617 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
618 */
619 val = vie->immediate & size2mask[size];
620 error = memwrite(vcpu, gpa, val, size, arg);
621 break;
622 default:
623 break;
624 }
625
626 return (error);
627 }
628
629 static int
630 emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
631 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
632 {
633 int error, size;
634 enum vm_reg_name reg;
635 uint64_t val;
636
637 size = vie->opsize;
638 error = EINVAL;
639
640 switch (vie->op.op_byte) {
641 case 0xB6:
642 /*
643 * MOV and zero extend byte from mem (ModRM:r/m) to
644 * reg (ModRM:reg).
645 *
646 * 0F B6/r movzx r16, r/m8
647 * 0F B6/r movzx r32, r/m8
648 * REX.W + 0F B6/r movzx r64, r/m8
649 */
650
651 /* get the first operand */
652 error = memread(vcpu, gpa, &val, 1, arg);
653 if (error)
654 break;
655
656 /* get the second operand */
657 reg = gpr_map[vie->reg];
658
659 /* zero-extend byte */
660 val = (uint8_t)val;
661
662 /* write the result */
663 error = vie_update_register(vcpu, reg, val, size);
664 break;
665 case 0xB7:
666 /*
667 * MOV and zero extend word from mem (ModRM:r/m) to
668 * reg (ModRM:reg).
669 *
670 * 0F B7/r movzx r32, r/m16
671 * REX.W + 0F B7/r movzx r64, r/m16
672 */
673 error = memread(vcpu, gpa, &val, 2, arg);
674 if (error)
675 return (error);
676
677 reg = gpr_map[vie->reg];
678
679 /* zero-extend word */
680 val = (uint16_t)val;
681
682 error = vie_update_register(vcpu, reg, val, size);
683 break;
684 case 0xBE:
685 /*
686 * MOV and sign extend byte from mem (ModRM:r/m) to
687 * reg (ModRM:reg).
688 *
689 * 0F BE/r movsx r16, r/m8
690 * 0F BE/r movsx r32, r/m8
691 * REX.W + 0F BE/r movsx r64, r/m8
692 */
693
694 /* get the first operand */
695 error = memread(vcpu, gpa, &val, 1, arg);
696 if (error)
697 break;
698
699 /* get the second operand */
700 reg = gpr_map[vie->reg];
701
702 /* sign extend byte */
703 val = (int8_t)val;
704
705 /* write the result */
706 error = vie_update_register(vcpu, reg, val, size);
707 break;
708 default:
709 break;
710 }
711 return (error);
712 }
713
714 /*
715 * Helper function to calculate and validate a linear address.
716 */
717 static int
718 get_gla(struct vcpu *vcpu, struct vie *vie __unused,
719 struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
720 enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
721 {
722 struct seg_desc desc;
723 uint64_t cr0, val, rflags;
724 int error __diagused;
725
726 error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
727 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
728
729 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
730 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
731
732 error = vm_get_seg_desc(vcpu, seg, &desc);
733 KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
734 __func__, error, seg));
735
736 error = vie_read_register(vcpu, gpr, &val);
737 KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
738 error, gpr));
739
740 if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
741 addrsize, prot, gla)) {
742 if (seg == VM_REG_GUEST_SS)
743 vm_inject_ss(vcpu, 0);
744 else
745 vm_inject_gp(vcpu);
746 goto guest_fault;
747 }
748
749 if (vie_canonical_check(paging->cpu_mode, *gla)) {
750 if (seg == VM_REG_GUEST_SS)
751 vm_inject_ss(vcpu, 0);
752 else
753 vm_inject_gp(vcpu);
754 goto guest_fault;
755 }
756
757 if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
758 vm_inject_ac(vcpu, 0);
759 goto guest_fault;
760 }
761
762 *fault = 0;
763 return (0);
764
765 guest_fault:
766 *fault = 1;
767 return (0);
768 }
769
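/*
 * Emulate MOVS when the source and/or the destination operand resides in
 * MMIO space.  Operands in regular guest memory are accessed through
 * vm_copy_setup().
 */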
770 static int
771 emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
772 struct vm_guest_paging *paging, mem_region_read_t memread,
773 mem_region_write_t memwrite, void *arg)
774 {
775 #ifdef _KERNEL
776 struct vm_copyinfo copyinfo[2];
777 #else
778 struct iovec copyinfo[2];
779 #endif
780 uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
781 uint64_t rcx, rdi, rsi, rflags;
782 int error, fault, opsize, seg, repeat;
783
784 opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
785 val = 0;
786 error = 0;
787
788 /*
789 * XXX although the MOVS instruction is only supposed to be used with
790 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
791 *
792 * Empirically the "repnz" prefix has identical behavior to "rep"
793 * and the zero flag does not make a difference.
794 */
795 repeat = vie->repz_present | vie->repnz_present;
796
797 if (repeat) {
798 error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
799 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
800
801 /*
802 * The count register is %rcx, %ecx or %cx depending on the
803 * address size of the instruction.
804 */
805 if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
806 error = 0;
807 goto done;
808 }
809 }
810
811 /*
812 * Source Destination Comments
813 * --------------------------------------------
814 * (1) memory memory n/a
815 * (2) memory mmio emulated
816 * (3) mmio memory emulated
817 * (4) mmio mmio emulated
818 *
819 * At this point we don't have sufficient information to distinguish
820 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
821 * out because it will succeed only when operating on regular memory.
822 *
823 * XXX the emulation doesn't properly handle the case where 'gpa'
824 * is straddling the boundary between the normal memory and MMIO.
825 */
826
827 seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
828 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
829 PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
830 if (error || fault)
831 goto done;
832
833 error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ,
834 copyinfo, nitems(copyinfo), &fault);
835 if (error == 0) {
836 if (fault)
837 goto done; /* Resume guest to handle fault */
838
839 /*
840 * case (2): read from system memory and write to mmio.
841 */
842 vm_copyin(copyinfo, &val, opsize);
843 vm_copy_teardown(copyinfo, nitems(copyinfo));
844 error = memwrite(vcpu, gpa, val, opsize, arg);
845 if (error)
846 goto done;
847 } else {
848 /*
849 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
850 * if 'srcaddr' is in the mmio space.
851 */
852
853 error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
854 PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
855 &fault);
856 if (error || fault)
857 goto done;
858
859 error = vm_copy_setup(vcpu, paging, dstaddr, opsize,
860 PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
861 if (error == 0) {
862 if (fault)
863 goto done; /* Resume guest to handle fault */
864
865 /*
866 * case (3): read from MMIO and write to system memory.
867 *
868 * An MMIO read can have side-effects, so we
869 * commit to it only after vm_copy_setup() is
870 * successful. If a page-fault needs to be
871 * injected into the guest then it will happen
872 * before the MMIO read is attempted.
873 */
874 error = memread(vcpu, gpa, &val, opsize, arg);
875 if (error)
876 goto done;
877
878 vm_copyout(&val, copyinfo, opsize);
879 vm_copy_teardown(copyinfo, nitems(copyinfo));
880 } else {
881 /*
882 * Case (4): read from and write to mmio.
883 *
884 * Commit to the MMIO read/write (with potential
885 * side-effects) only after we are sure that the
886 * instruction is not going to be restarted due
887 * to address translation faults.
888 */
889 error = vm_gla2gpa(vcpu, paging, srcaddr,
890 PROT_READ, &srcgpa, &fault);
891 if (error || fault)
892 goto done;
893
894 error = vm_gla2gpa(vcpu, paging, dstaddr,
895 PROT_WRITE, &dstgpa, &fault);
896 if (error || fault)
897 goto done;
898
899 error = memread(vcpu, srcgpa, &val, opsize, arg);
900 if (error)
901 goto done;
902
903 error = memwrite(vcpu, dstgpa, val, opsize, arg);
904 if (error)
905 goto done;
906 }
907 }
908
909 error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi);
910 KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
911
912 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
913 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
914
915 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
916 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
917
918 if (rflags & PSL_D) {
919 rsi -= opsize;
920 rdi -= opsize;
921 } else {
922 rsi += opsize;
923 rdi += opsize;
924 }
925
926 error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi,
927 vie->addrsize);
928 KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
929
930 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
931 vie->addrsize);
932 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
933
934 if (repeat) {
935 rcx = rcx - 1;
936 error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
937 rcx, vie->addrsize);
938 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
939
940 /*
941 * Repeat the instruction if the count register is not zero.
942 */
943 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
944 vm_restart_instruction(vcpu);
945 }
946 done:
947 KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
948 __func__, error));
949 return (error);
950 }
951
952 static int
953 emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
954 struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused,
955 mem_region_write_t memwrite, void *arg)
956 {
957 int error, opsize, repeat;
958 uint64_t val;
959 uint64_t rcx, rdi, rflags;
960
961 opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
962 repeat = vie->repz_present | vie->repnz_present;
963
964 if (repeat) {
965 error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
966 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
967
968 /*
969 * The count register is %rcx, %ecx or %cx depending on the
970 * address size of the instruction.
971 */
972 if ((rcx & vie_size2mask(vie->addrsize)) == 0)
973 return (0);
974 }
975
976 error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
977 KASSERT(!error, ("%s: error %d getting rax", __func__, error));
978
979 error = memwrite(vcpu, gpa, val, opsize, arg);
980 if (error)
981 return (error);
982
983 error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
984 KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
985
986 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
987 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
988
989 if (rflags & PSL_D)
990 rdi -= opsize;
991 else
992 rdi += opsize;
993
994 error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
995 vie->addrsize);
996 KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
997
998 if (repeat) {
999 rcx = rcx - 1;
1000 error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
1001 rcx, vie->addrsize);
1002 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1003
1004 /*
1005 * Repeat the instruction if the count register is not zero.
1006 */
1007 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1008 vm_restart_instruction(vcpu);
1009 }
1010
1011 return (0);
1012 }
1013
1014 static int
1015 emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1016 mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1017 {
1018 int error, size;
1019 enum vm_reg_name reg;
1020 uint64_t result, rflags, rflags2, val1, val2;
1021
1022 size = vie->opsize;
1023 error = EINVAL;
1024
1025 switch (vie->op.op_byte) {
1026 case 0x23:
1027 /*
1028 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1029 * result in reg.
1030 *
1031 * 23/r and r16, r/m16
1032 * 23/r and r32, r/m32
1033 * REX.W + 23/r and r64, r/m64
1034 */
1035
1036 /* get the first operand */
1037 reg = gpr_map[vie->reg];
1038 error = vie_read_register(vcpu, reg, &val1);
1039 if (error)
1040 break;
1041
1042 /* get the second operand */
1043 error = memread(vcpu, gpa, &val2, size, arg);
1044 if (error)
1045 break;
1046
1047 /* perform the operation and write the result */
1048 result = val1 & val2;
1049 error = vie_update_register(vcpu, reg, result, size);
1050 break;
1051 case 0x81:
1052 case 0x83:
1053 /*
1054 * AND mem (ModRM:r/m) with immediate and store the
1055 * result in mem.
1056 *
1057 * 81 /4 and r/m16, imm16
1058 * 81 /4 and r/m32, imm32
1059 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
1060 *
1061 * 83 /4 and r/m16, imm8 sign-extended to 16
1062 * 83 /4 and r/m32, imm8 sign-extended to 32
1063 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64
1064 */
1065
1066 /* get the first operand */
1067 error = memread(vcpu, gpa, &val1, size, arg);
1068 if (error)
1069 break;
1070
1071 /*
1072 * perform the operation with the pre-fetched immediate
1073 * operand and write the result
1074 */
1075 result = val1 & vie->immediate;
1076 error = memwrite(vcpu, gpa, result, size, arg);
1077 break;
1078 default:
1079 break;
1080 }
1081 if (error)
1082 return (error);
1083
1084 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1085 if (error)
1086 return (error);
1087
1088 /*
1089 * OF and CF are cleared; the SF, ZF and PF flags are set according
1090 * to the result; AF is undefined.
1091 *
1092 * The updated status flags are obtained by subtracting 0 from 'result'.
1093 */
1094 rflags2 = getcc(size, result, 0);
1095 rflags &= ~RFLAGS_STATUS_BITS;
1096 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1097
1098 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1099 return (error);
1100 }
1101
1102 static int
1103 emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1104 mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1105 {
1106 int error, size;
1107 enum vm_reg_name reg;
1108 uint64_t result, rflags, rflags2, val1, val2;
1109
1110 size = vie->opsize;
1111 error = EINVAL;
1112
1113 switch (vie->op.op_byte) {
1114 case 0x0B:
1115 /*
1116 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1117 * result in reg.
1118 *
1119 * 0b/r or r16, r/m16
1120 * 0b/r or r32, r/m32
1121 * REX.W + 0b/r or r64, r/m64
1122 */
1123
1124 /* get the first operand */
1125 reg = gpr_map[vie->reg];
1126 error = vie_read_register(vcpu, reg, &val1);
1127 if (error)
1128 break;
1129
1130 /* get the second operand */
1131 error = memread(vcpu, gpa, &val2, size, arg);
1132 if (error)
1133 break;
1134
1135 /* perform the operation and write the result */
1136 result = val1 | val2;
1137 error = vie_update_register(vcpu, reg, result, size);
1138 break;
1139 case 0x81:
1140 case 0x83:
1141 /*
1142 * OR mem (ModRM:r/m) with immediate and store the
1143 * result in mem.
1144 *
1145 * 81 /1 or r/m16, imm16
1146 * 81 /1 or r/m32, imm32
1147 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64
1148 *
1149 * 83 /1 or r/m16, imm8 sign-extended to 16
1150 * 83 /1 or r/m32, imm8 sign-extended to 32
1151 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64
1152 */
1153
1154 /* get the first operand */
1155 error = memread(vcpu, gpa, &val1, size, arg);
1156 if (error)
1157 break;
1158
1159 /*
1160 * perform the operation with the pre-fetched immediate
1161 * operand and write the result
1162 */
1163 result = val1 | vie->immediate;
1164 error = memwrite(vcpu, gpa, result, size, arg);
1165 break;
1166 default:
1167 break;
1168 }
1169 if (error)
1170 return (error);
1171
1172 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1173 if (error)
1174 return (error);
1175
1176 /*
1177 * OF and CF are cleared; the SF, ZF and PF flags are set according
1178 * to the result; AF is undefined.
1179 *
1180 * The updated status flags are obtained by subtracting 0 from 'result'.
1181 */
1182 rflags2 = getcc(size, result, 0);
1183 rflags &= ~RFLAGS_STATUS_BITS;
1184 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1185
1186 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1187 return (error);
1188 }
1189
1190 static int
1191 emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1192 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1193 {
1194 int error, size;
1195 uint64_t regop, memop, op1, op2, rflags, rflags2;
1196 enum vm_reg_name reg;
1197
1198 size = vie->opsize;
1199 switch (vie->op.op_byte) {
1200 case 0x39:
1201 case 0x3B:
1202 /*
1203 * 39/r CMP r/m16, r16
1204 * 39/r CMP r/m32, r32
1205 * REX.W 39/r CMP r/m64, r64
1206 *
1207 * 3B/r CMP r16, r/m16
1208 * 3B/r CMP r32, r/m32
1209 * REX.W + 3B/r CMP r64, r/m64
1210 *
1211 * Compare the first operand with the second operand and
1212 * set status flags in EFLAGS register. The comparison is
1213 * performed by subtracting the second operand from the first
1214 * operand and then setting the status flags.
1215 */
1216
1217 /* Get the register operand */
1218 reg = gpr_map[vie->reg];
1219 error = vie_read_register(vcpu, reg, &regop);
1220 if (error)
1221 return (error);
1222
1223 /* Get the memory operand */
1224 error = memread(vcpu, gpa, &memop, size, arg);
1225 if (error)
1226 return (error);
1227
1228 if (vie->op.op_byte == 0x3B) {
1229 op1 = regop;
1230 op2 = memop;
1231 } else {
1232 op1 = memop;
1233 op2 = regop;
1234 }
1235 rflags2 = getcc(size, op1, op2);
1236 break;
1237 case 0x80:
1238 case 0x81:
1239 case 0x83:
1240 /*
1241 * 80 /7 cmp r/m8, imm8
1242 * REX + 80 /7 cmp r/m8, imm8
1243 *
1244 * 81 /7 cmp r/m16, imm16
1245 * 81 /7 cmp r/m32, imm32
1246 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64
1247 *
1248 * 83 /7 cmp r/m16, imm8 sign-extended to 16
1249 * 83 /7 cmp r/m32, imm8 sign-extended to 32
1250 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64
1251 *
1252 * Compare mem (ModRM:r/m) with immediate and set
1253 * status flags according to the results. The
1254 * comparison is performed by subtracting the
1255 * immediate from the first operand and then setting
1256 * the status flags.
1257 *
1258 */
1259 if (vie->op.op_byte == 0x80)
1260 size = 1;
1261
1262 /* get the first operand */
1263 error = memread(vcpu, gpa, &op1, size, arg);
1264 if (error)
1265 return (error);
1266
1267 rflags2 = getcc(size, op1, vie->immediate);
1268 break;
1269 default:
1270 return (EINVAL);
1271 }
1272 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1273 if (error)
1274 return (error);
1275 rflags &= ~RFLAGS_STATUS_BITS;
1276 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1277
1278 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1279 return (error);
1280 }
1281
1282 static int
1283 emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1284 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1285 {
1286 int error, size;
1287 uint64_t op1, rflags, rflags2;
1288
1289 size = vie->opsize;
1290 error = EINVAL;
1291
1292 switch (vie->op.op_byte) {
1293 case 0xF6:
1294 /*
1295 * F6 /0 test r/m8, imm8
1296 */
1297 size = 1; /* override for byte operation */
1298 /* FALLTHROUGH */
1299 case 0xF7:
1300 /*
1301 * F7 /0 test r/m16, imm16
1302 * F7 /0 test r/m32, imm32
1303 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64
1304 *
1305 * Test mem (ModRM:r/m) with immediate and set status
1306 * flags according to the results. The comparison is
1307 * performed by ANDing the immediate with the first
1308 * operand and then setting the status flags.
1309 */
1310 if ((vie->reg & 7) != 0)
1311 return (EINVAL);
1312
1313 error = memread(vcpu, gpa, &op1, size, arg);
1314 if (error)
1315 return (error);
1316
1317 rflags2 = getandflags(size, op1, vie->immediate);
1318 break;
1319 default:
1320 return (EINVAL);
1321 }
1322 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1323 if (error)
1324 return (error);
1325
1326 /*
1327 * OF and CF are cleared; the SF, ZF and PF flags are set according
1328 * to the result; AF is undefined.
1329 */
1330 rflags &= ~RFLAGS_STATUS_BITS;
1331 rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1332
1333 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1334 return (error);
1335 }
1336
1337 static int
1338 emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1339 struct vm_guest_paging *paging, mem_region_read_t memread,
1340 mem_region_write_t memwrite __unused, void *arg)
1341 {
1342 uint64_t src1, src2, dst, rflags;
1343 unsigned start, len, size;
1344 int error;
1345
1346 size = vie->opsize;
1347 error = EINVAL;
1348
1349 /*
1350 * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b
1351 * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b
1352 *
1353 * Destination operand is ModRM:reg. Source operands are ModRM:r/m and
1354 * Vex.vvvv.
1355 *
1356 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1357 */
1358 if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1359 size = 4;
1360
1361 /*
1362 * Extracts contiguous bits from the first /source/ operand (second
1363 * operand) using an index and length specified in the second /source/
1364 * operand (third operand).
1365 */
1366 error = memread(vcpu, gpa, &src1, size, arg);
1367 if (error)
1368 return (error);
1369 error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2);
1370 if (error)
1371 return (error);
1372 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1373 if (error)
1374 return (error);
1375
1376 start = (src2 & 0xff);
1377 len = (src2 & 0xff00) >> 8;
1378
1379 /* If no bits are extracted, the destination register is cleared. */
1380 dst = 0;
1381
1382 /* If START exceeds the operand size, no bits are extracted. */
1383 if (start > size * 8)
1384 goto done;
1385 /* Length is bounded by both the destination size and start offset. */
1386 if (start + len > size * 8)
1387 len = (size * 8) - start;
1388 if (len == 0)
1389 goto done;
1390
1391 if (start > 0)
1392 src1 = (src1 >> start);
1393 if (len < 64)
1394 src1 = src1 & ((1ull << len) - 1);
1395 dst = src1;
1396
1397 done:
1398 error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size);
1399 if (error)
1400 return (error);
1401
1402 /*
1403 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1404 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1405 */
1406 rflags &= ~RFLAGS_STATUS_BITS;
1407 if (dst == 0)
1408 rflags |= PSL_Z;
1409 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags,
1410 8);
1411 return (error);
1412 }
1413
1414 static int
1415 emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1416 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1417 {
1418 int error, size;
1419 uint64_t nval, rflags, rflags2, val1, val2;
1420 enum vm_reg_name reg;
1421
1422 size = vie->opsize;
1423 error = EINVAL;
1424
1425 switch (vie->op.op_byte) {
1426 case 0x03:
1427 /*
1428 * ADD r/m to r and store the result in r
1429 *
1430 * 03/r ADD r16, r/m16
1431 * 03/r ADD r32, r/m32
1432 * REX.W + 03/r ADD r64, r/m64
1433 */
1434
1435 /* get the first operand */
1436 reg = gpr_map[vie->reg];
1437 error = vie_read_register(vcpu, reg, &val1);
1438 if (error)
1439 break;
1440
1441 /* get the second operand */
1442 error = memread(vcpu, gpa, &val2, size, arg);
1443 if (error)
1444 break;
1445
1446 /* perform the operation and write the result */
1447 nval = val1 + val2;
1448 error = vie_update_register(vcpu, reg, nval, size);
1449 break;
1450 default:
1451 break;
1452 }
1453
1454 if (!error) {
1455 rflags2 = getaddflags(size, val1, val2);
1456 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
1457 &rflags);
1458 if (error)
1459 return (error);
1460
1461 rflags &= ~RFLAGS_STATUS_BITS;
1462 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1463 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
1464 rflags, 8);
1465 }
1466
1467 return (error);
1468 }
1469
1470 static int
1471 emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1472 mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
1473 {
1474 int error, size;
1475 uint64_t nval, rflags, rflags2, val1, val2;
1476 enum vm_reg_name reg;
1477
1478 size = vie->opsize;
1479 error = EINVAL;
1480
1481 switch (vie->op.op_byte) {
1482 case 0x2B:
1483 /*
1484 * SUB r/m from r and store the result in r
1485 *
1486 * 2B/r SUB r16, r/m16
1487 * 2B/r SUB r32, r/m32
1488 * REX.W + 2B/r SUB r64, r/m64
1489 */
1490
1491 /* get the first operand */
1492 reg = gpr_map[vie->reg];
1493 error = vie_read_register(vcpu, reg, &val1);
1494 if (error)
1495 break;
1496
1497 /* get the second operand */
1498 error = memread(vcpu, gpa, &val2, size, arg);
1499 if (error)
1500 break;
1501
1502 /* perform the operation and write the result */
1503 nval = val1 - val2;
1504 error = vie_update_register(vcpu, reg, nval, size);
1505 break;
1506 default:
1507 break;
1508 }
1509
1510 if (!error) {
1511 rflags2 = getcc(size, val1, val2);
1512 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
1513 &rflags);
1514 if (error)
1515 return (error);
1516
1517 rflags &= ~RFLAGS_STATUS_BITS;
1518 rflags |= rflags2 & RFLAGS_STATUS_BITS;
1519 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
1520 rflags, 8);
1521 }
1522
1523 return (error);
1524 }
1525
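/*
 * Common handler for PUSH and POP when the explicit memory operand is in
 * MMIO space.  The implicit stack access is performed on regular guest
 * memory via vm_copy_setup().
 */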
1526 static int
1527 emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1528 struct vm_guest_paging *paging, mem_region_read_t memread,
1529 mem_region_write_t memwrite, void *arg)
1530 {
1531 #ifdef _KERNEL
1532 struct vm_copyinfo copyinfo[2];
1533 #else
1534 struct iovec copyinfo[2];
1535 #endif
1536 struct seg_desc ss_desc;
1537 uint64_t cr0, rflags, rsp, stack_gla, val;
1538 int error, fault, size, stackaddrsize, pushop;
1539
1540 val = 0;
1541 size = vie->opsize;
1542 pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1543
1544 /*
1545 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1546 */
1547 if (paging->cpu_mode == CPU_MODE_REAL) {
1548 stackaddrsize = 2;
1549 } else if (paging->cpu_mode == CPU_MODE_64BIT) {
1550 /*
1551 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1552 * - Stack pointer size is always 64-bits.
1553 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1554 * - 16-bit PUSH/POP is supported by using the operand size
1555 * override prefix (66H).
1556 */
1557 stackaddrsize = 8;
1558 size = vie->opsize_override ? 2 : 8;
1559 } else {
1560 /*
1561 * In protected or compatibility mode the 'B' flag in the
1562 * stack-segment descriptor determines the size of the
1563 * stack pointer.
1564 */
1565 error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc);
1566 KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1567 __func__, error));
1568 if (SEG_DESC_DEF32(ss_desc.access))
1569 stackaddrsize = 4;
1570 else
1571 stackaddrsize = 2;
1572 }
1573
1574 error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
1575 KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1576
1577 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1578 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1579
1580 error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp);
1581 KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1582 if (pushop) {
1583 rsp -= size;
1584 }
1585
1586 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1587 rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1588 &stack_gla)) {
1589 vm_inject_ss(vcpu, 0);
1590 return (0);
1591 }
1592
1593 if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1594 vm_inject_ss(vcpu, 0);
1595 return (0);
1596 }
1597
1598 if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1599 vm_inject_ac(vcpu, 0);
1600 return (0);
1601 }
1602
1603 error = vm_copy_setup(vcpu, paging, stack_gla, size,
1604 pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1605 &fault);
1606 if (error || fault)
1607 return (error);
1608
1609 if (pushop) {
1610 error = memread(vcpu, mmio_gpa, &val, size, arg);
1611 if (error == 0)
1612 vm_copyout(&val, copyinfo, size);
1613 } else {
1614 vm_copyin(copyinfo, &val, size);
1615 error = memwrite(vcpu, mmio_gpa, val, size, arg);
1616 rsp += size;
1617 }
1618 vm_copy_teardown(copyinfo, nitems(copyinfo));
1619
1620 if (error == 0) {
1621 error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp,
1622 stackaddrsize);
1623 KASSERT(error == 0, ("error %d updating rsp", error));
1624 }
1625 return (error);
1626 }
1627
1628 static int
1629 emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1630 struct vm_guest_paging *paging, mem_region_read_t memread,
1631 mem_region_write_t memwrite, void *arg)
1632 {
1633 int error;
1634
1635 /*
1636 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1637 *
1638 * PUSH is part of the group 5 extended opcodes and is identified
1639 * by ModRM:reg = b110.
1640 */
1641 if ((vie->reg & 7) != 6)
1642 return (EINVAL);
1643
1644 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
1645 memwrite, arg);
1646 return (error);
1647 }
1648
1649 static int
1650 emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
1651 struct vm_guest_paging *paging, mem_region_read_t memread,
1652 mem_region_write_t memwrite, void *arg)
1653 {
1654 int error;
1655
1656 /*
1657 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1658 *
1659 * POP is part of the group 1A extended opcodes and is identified
1660 * by ModRM:reg = b000.
1661 */
1662 if ((vie->reg & 7) != 0)
1663 return (EINVAL);
1664
1665 error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
1666 memwrite, arg);
1667 return (error);
1668 }
1669
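/*
 * Group 1 extended opcodes (0x80, 0x81, 0x83) encode the operation in
 * ModRM:reg.  Only OR (/1), AND (/4) and CMP (/7) are handled here.
 */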
1670 static int
1671 emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1672 struct vm_guest_paging *paging __unused, mem_region_read_t memread,
1673 mem_region_write_t memwrite, void *memarg)
1674 {
1675 int error;
1676
1677 switch (vie->reg & 7) {
1678 case 0x1: /* OR */
1679 error = emulate_or(vcpu, gpa, vie,
1680 memread, memwrite, memarg);
1681 break;
1682 case 0x4: /* AND */
1683 error = emulate_and(vcpu, gpa, vie,
1684 memread, memwrite, memarg);
1685 break;
1686 case 0x7: /* CMP */
1687 error = emulate_cmp(vcpu, gpa, vie,
1688 memread, memwrite, memarg);
1689 break;
1690 default:
1691 error = EINVAL;
1692 break;
1693 }
1694
1695 return (error);
1696 }
1697
1698 static int
1699 emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1700 mem_region_read_t memread, mem_region_write_t memwrite __unused,
1701 void *memarg)
1702 {
1703 uint64_t val, rflags;
1704 int error, bitmask, bitoff;
1705
1706 /*
1707 * 0F BA is a Group 8 extended opcode.
1708 *
1709 * Currently we only emulate the 'Bit Test' instruction which is
1710 * identified by a ModR/M:reg encoding of 100b.
1711 */
1712 if ((vie->reg & 7) != 4)
1713 return (EINVAL);
1714
1715 error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
1716 KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1717
1718 error = memread(vcpu, gpa, &val, vie->opsize, memarg);
1719 if (error)
1720 return (error);
1721
1722 /*
1723 * Intel SDM, Vol 2, Table 3-2:
1724 * "Range of Bit Positions Specified by Bit Offset Operands"
1725 */
1726 bitmask = vie->opsize * 8 - 1;
1727 bitoff = vie->immediate & bitmask;
1728
1729 /* Copy the bit into the Carry flag in %rflags */
1730 if (val & (1UL << bitoff))
1731 rflags |= PSL_C;
1732 else
1733 rflags &= ~PSL_C;
1734
1735 error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
1736 KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1737
1738 return (0);
1739 }
1740
1741 static int
1742 emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1743 mem_region_read_t memread, mem_region_write_t memwrite __unused,
1744 void *memarg)
1745 {
1746 int error;
1747 uint64_t buf;
1748
1749 switch (vie->reg & 7) {
1750 case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */
1751 if (vie->mod == 0x3) {
1752 /*
1753 * SFENCE. Ignore it; the VM exit provides enough
1754 * barriers on its own.
1755 */
1756 error = 0;
1757 } else {
1758 /*
1759 * CLFLUSH, CLFLUSHOPT. Only check for access
1760 * rights.
1761 */
1762 error = memread(vcpu, gpa, &buf, 1, memarg);
1763 }
1764 break;
1765 default:
1766 error = EINVAL;
1767 break;
1768 }
1769
1770 return (error);
1771 }
1772
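/*
 * Emulate the previously decoded instruction 'vie' that accessed the memory
 * region at 'gpa', using the 'memread'/'memwrite' callbacks for the emulated
 * access.  Returns EINVAL if the instruction has not been decoded or is not
 * supported.
 */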
1773 int
1774 vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
1775 struct vm_guest_paging *paging, mem_region_read_t memread,
1776 mem_region_write_t memwrite, void *memarg)
1777 {
1778 int error;
1779
1780 if (!vie->decoded)
1781 return (EINVAL);
1782
1783 switch (vie->op.op_type) {
1784 case VIE_OP_TYPE_GROUP1:
1785 error = emulate_group1(vcpu, gpa, vie, paging, memread,
1786 memwrite, memarg);
1787 break;
1788 case VIE_OP_TYPE_POP:
1789 error = emulate_pop(vcpu, gpa, vie, paging, memread,
1790 memwrite, memarg);
1791 break;
1792 case VIE_OP_TYPE_PUSH:
1793 error = emulate_push(vcpu, gpa, vie, paging, memread,
1794 memwrite, memarg);
1795 break;
1796 case VIE_OP_TYPE_CMP:
1797 error = emulate_cmp(vcpu, gpa, vie,
1798 memread, memwrite, memarg);
1799 break;
1800 case VIE_OP_TYPE_MOV:
1801 error = emulate_mov(vcpu, gpa, vie,
1802 memread, memwrite, memarg);
1803 break;
1804 case VIE_OP_TYPE_MOVSX:
1805 case VIE_OP_TYPE_MOVZX:
1806 error = emulate_movx(vcpu, gpa, vie,
1807 memread, memwrite, memarg);
1808 break;
1809 case VIE_OP_TYPE_MOVS:
1810 error = emulate_movs(vcpu, gpa, vie, paging, memread,
1811 memwrite, memarg);
1812 break;
1813 case VIE_OP_TYPE_STOS:
1814 error = emulate_stos(vcpu, gpa, vie, paging, memread,
1815 memwrite, memarg);
1816 break;
1817 case VIE_OP_TYPE_AND:
1818 error = emulate_and(vcpu, gpa, vie,
1819 memread, memwrite, memarg);
1820 break;
1821 case VIE_OP_TYPE_OR:
1822 error = emulate_or(vcpu, gpa, vie,
1823 memread, memwrite, memarg);
1824 break;
1825 case VIE_OP_TYPE_SUB:
1826 error = emulate_sub(vcpu, gpa, vie,
1827 memread, memwrite, memarg);
1828 break;
1829 case VIE_OP_TYPE_BITTEST:
1830 error = emulate_bittest(vcpu, gpa, vie,
1831 memread, memwrite, memarg);
1832 break;
1833 case VIE_OP_TYPE_TWOB_GRP15:
1834 error = emulate_twob_group15(vcpu, gpa, vie,
1835 memread, memwrite, memarg);
1836 break;
1837 case VIE_OP_TYPE_ADD:
1838 error = emulate_add(vcpu, gpa, vie, memread,
1839 memwrite, memarg);
1840 break;
1841 case VIE_OP_TYPE_TEST:
1842 error = emulate_test(vcpu, gpa, vie,
1843 memread, memwrite, memarg);
1844 break;
1845 case VIE_OP_TYPE_BEXTR:
1846 error = emulate_bextr(vcpu, gpa, vie, paging,
1847 memread, memwrite, memarg);
1848 break;
1849 default:
1850 error = EINVAL;
1851 break;
1852 }
1853
1854 return (error);
1855 }
1856
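/*
 * Return 1 if an access of 'size' bytes at 'gla' should raise #AC, i.e.
 * alignment checking is in effect (CPL 3, CR0.AM and RFLAGS.AC set) and the
 * address is misaligned; return 0 otherwise.
 */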
1857 int
1858 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1859 {
1860 KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1861 ("%s: invalid size %d", __func__, size));
1862 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1863
1864 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1865 return (0);
1866
1867 return ((gla & (size - 1)) ? 1 : 0);
1868 }
1869
1870 int
1871 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1872 {
1873 uint64_t mask;
1874
1875 if (cpu_mode != CPU_MODE_64BIT)
1876 return (0);
1877
1878 /*
1879 * The value of bit 47 in the 'gla' should be replicated in the
1880 * most significant 16 bits.
1881 */
1882 mask = ~((1UL << 48) - 1);
1883 if (gla & (1UL << 47))
1884 return ((gla & mask) != mask);
1885 else
1886 return ((gla & mask) != 0);
1887 }
1888
1889 uint64_t
1890 vie_size2mask(int size)
1891 {
1892 KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1893 ("vie_size2mask: invalid size %d", size));
1894 return (size2mask[size]);
1895 }
1896
1897 int
1898 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1899 struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1900 int prot, uint64_t *gla)
1901 {
1902 uint64_t firstoff, low_limit, high_limit, segbase;
1903 int glasize, type;
1904
1905 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1906 ("%s: invalid segment %d", __func__, seg));
1907 KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1908 ("%s: invalid operand size %d", __func__, length));
1909 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1910 ("%s: invalid prot %#x", __func__, prot));
1911
1912 firstoff = offset;
1913 if (cpu_mode == CPU_MODE_64BIT) {
1914 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1915 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1916 glasize = 8;
1917 } else {
1918 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1919 "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1920 glasize = 4;
1921 /*
1922 * If the segment selector is loaded with a NULL selector
1923 * then the descriptor is unusable and attempting to use
1924 * it results in a #GP(0).
1925 */
1926 if (SEG_DESC_UNUSABLE(desc->access))
1927 return (-1);
1928
1929 /*
1930 * The processor generates a #NP exception when a segment
1931 * register is loaded with a selector that points to a
1932 * descriptor that is not present. If this was the case then
1933 * it would have been checked before the VM-exit.
1934 */
1935 KASSERT(SEG_DESC_PRESENT(desc->access),
1936 ("segment %d not present: %#x", seg, desc->access));
1937
1938 /*
1939 * The descriptor type must indicate a code/data segment.
1940 */
1941 type = SEG_DESC_TYPE(desc->access);
1942 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1943 "descriptor type %#x", seg, type));
1944
1945 if (prot & PROT_READ) {
1946 /* #GP on a read access to an exec-only code segment */
1947 if ((type & 0xA) == 0x8)
1948 return (-1);
1949 }
1950
1951 if (prot & PROT_WRITE) {
1952 /*
1953 * #GP on a write access to a code segment or a
1954 * read-only data segment.
1955 */
1956 if (type & 0x8) /* code segment */
1957 return (-1);
1958
1959 if ((type & 0xA) == 0) /* read-only data seg */
1960 return (-1);
1961 }
1962
1963 /*
1964 * 'desc->limit' is fully expanded taking granularity into
1965 * account.
1966 */
1967 if ((type & 0xC) == 0x4) {
1968 /* expand-down data segment */
1969 low_limit = desc->limit + 1;
1970 high_limit = SEG_DESC_DEF32(desc->access) ?
1971 0xffffffff : 0xffff;
1972 } else {
1973 /* code segment or expand-up data segment */
1974 low_limit = 0;
1975 high_limit = desc->limit;
1976 }
1977
1978 while (length > 0) {
1979 offset &= vie_size2mask(addrsize);
1980 if (offset < low_limit || offset > high_limit)
1981 return (-1);
1982 offset++;
1983 length--;
1984 }
1985 }
1986
1987 /*
1988 * In 64-bit mode all segments except %fs and %gs have a segment
1989 * base address of 0.
1990 */
1991 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1992 seg != VM_REG_GUEST_GS) {
1993 segbase = 0;
1994 } else {
1995 segbase = desc->base;
1996 }
1997
1998 /*
1999 * Truncate 'firstoff' to the effective address size before adding
2000 * it to the segment base.
2001 */
2002 firstoff &= vie_size2mask(addrsize);
2003 *gla = (segbase + firstoff) & vie_size2mask(glasize);
2004 return (0);
2005 }
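/*
 * Example (annotation, not from the original source): a 2-byte
 * address-size access of length 2 at offset 0x1234 through an
 * expand-up segment with base 0x20000 and limit 0xffff passes the
 * per-byte limit check above for offsets 0x1234 and 0x1235 and yields
 * *gla = (0x20000 + 0x1234) & 0xffffffff = 0x21234.
 */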
2006
2007 /*
2008 * Prepare a partially decoded vie for a 2nd attempt.
2009 */
2010 void
2011 vie_restart(struct vie *vie)
2012 {
2013 _Static_assert(
2014 offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
2015 offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
2016 "restart should not erase instruction length or contents");
2017
2018 memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
2019 sizeof(*vie) - offsetof(struct vie, vie_startzero));
2020
2021 vie->base_register = VM_REG_LAST;
2022 vie->index_register = VM_REG_LAST;
2023 vie->segment_register = VM_REG_LAST;
2024 }
2025
2026 void
2027 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
2028 {
2029 KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
2030 ("%s: invalid instruction length (%d)", __func__, inst_length));
2031
2032 vie_restart(vie);
2033 memset(vie->inst, 0, sizeof(vie->inst));
2034 if (inst_length != 0)
2035 memcpy(vie->inst, inst_bytes, inst_length);
2036 vie->num_valid = inst_length;
2037 }
2038
2039 #ifdef _KERNEL
2040 static int
2041 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2042 {
2043 int error_code = 0;
2044
2045 if (pte & PG_V)
2046 error_code |= PGEX_P;
2047 if (prot & VM_PROT_WRITE)
2048 error_code |= PGEX_W;
2049 if (usermode)
2050 error_code |= PGEX_U;
2051 if (rsvd)
2052 error_code |= PGEX_RSV;
2053 if (prot & VM_PROT_EXECUTE)
2054 error_code |= PGEX_I;
2055
2056 return (error_code);
2057 }
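/*
 * Example (annotation): a user-mode write that finds a present,
 * read-only pte produces PGEX_P | PGEX_W | PGEX_U, which is the error
 * code the hardware itself would push for the equivalent #PF.
 */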
2058
2059 static void
2060 ptp_release(void **cookie)
2061 {
2062 if (*cookie != NULL) {
2063 vm_gpa_release(*cookie);
2064 *cookie = NULL;
2065 }
2066 }
2067
2068 static void *
2069 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
2070 {
2071 void *ptr;
2072
2073 ptp_release(cookie);
2074 ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
2075 return (ptr);
2076 }
2077
2078 static int
2079 _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2080 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
2081 {
2082 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
2083 u_int retries;
2084 uint64_t *ptpbase, ptpphys, pte, pgsize;
2085 uint32_t *ptpbase32, pte32;
2086 void *cookie;
2087
2088 *guest_fault = 0;
2089
2090 usermode = (paging->cpl == 3 ? 1 : 0);
2091 writable = prot & VM_PROT_WRITE;
2092 cookie = NULL;
2093 retval = 0;
2094 retries = 0;
2095 restart:
2096 ptpphys = paging->cr3; /* root of the page tables */
2097 ptp_release(&cookie);
2098 if (retries++ > 0)
2099 maybe_yield();
2100
2101 if (vie_canonical_check(paging->cpu_mode, gla)) {
2102 /*
2103 * XXX assuming a non-stack reference; otherwise a stack fault
2104 * should be generated.
2105 */
2106 if (!check_only)
2107 vm_inject_gp(vcpu);
2108 goto fault;
2109 }
2110
2111 if (paging->paging_mode == PAGING_MODE_FLAT) {
2112 *gpa = gla;
2113 goto done;
2114 }
2115
2116 if (paging->paging_mode == PAGING_MODE_32) {
2117 nlevels = 2;
2118 while (--nlevels >= 0) {
2119 /* Zero out the lower 12 bits. */
2120 ptpphys &= ~0xfff;
2121
2122 ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
2123 &cookie);
2124
2125 if (ptpbase32 == NULL)
2126 goto error;
2127
2128 ptpshift = PAGE_SHIFT + nlevels * 10;
2129 ptpindex = (gla >> ptpshift) & 0x3FF;
2130 pgsize = 1UL << ptpshift;
2131
2132 pte32 = ptpbase32[ptpindex];
2133
2134 if ((pte32 & PG_V) == 0 ||
2135 (usermode && (pte32 & PG_U) == 0) ||
2136 (writable && (pte32 & PG_RW) == 0)) {
2137 if (!check_only) {
2138 pfcode = pf_error_code(usermode, prot, 0,
2139 pte32);
2140 vm_inject_pf(vcpu, pfcode, gla);
2141 }
2142 goto fault;
2143 }
2144
2145 /*
2146 * Emulate the x86 MMU's management of the accessed
2147 * and dirty flags. While the accessed flag is set
2148 * at every level of the page table, the dirty flag
2149 * is only set at the last level providing the guest
2150 * physical address.
2151 */
2152 if (!check_only && (pte32 & PG_A) == 0) {
2153 if (atomic_cmpset_32(&ptpbase32[ptpindex],
2154 pte32, pte32 | PG_A) == 0) {
2155 goto restart;
2156 }
2157 }
2158
2159 /* XXX must be ignored if CR4.PSE=0 */
2160 if (nlevels > 0 && (pte32 & PG_PS) != 0)
2161 break;
2162
2163 ptpphys = pte32;
2164 }
2165
2166 /* Set the dirty bit in the page table entry if necessary */
2167 if (!check_only && writable && (pte32 & PG_M) == 0) {
2168 if (atomic_cmpset_32(&ptpbase32[ptpindex],
2169 pte32, pte32 | PG_M) == 0) {
2170 goto restart;
2171 }
2172 }
2173
2174 /* Zero out the lower 'ptpshift' bits */
2175 pte32 >>= ptpshift; pte32 <<= ptpshift;
2176 *gpa = pte32 | (gla & (pgsize - 1));
2177 goto done;
2178 }
2179
2180 if (paging->paging_mode == PAGING_MODE_PAE) {
2181 /* Zero out the lower 5 bits and the upper 32 bits */
2182 ptpphys &= 0xffffffe0UL;
2183
2184 ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
2185 &cookie);
2186 if (ptpbase == NULL)
2187 goto error;
2188
2189 ptpindex = (gla >> 30) & 0x3;
2190
2191 pte = ptpbase[ptpindex];
2192
2193 if ((pte & PG_V) == 0) {
2194 if (!check_only) {
2195 pfcode = pf_error_code(usermode, prot, 0, pte);
2196 vm_inject_pf(vcpu, pfcode, gla);
2197 }
2198 goto fault;
2199 }
2200
2201 ptpphys = pte;
2202
2203 nlevels = 2;
2204 } else if (paging->paging_mode == PAGING_MODE_64_LA57) {
2205 nlevels = 5;
2206 } else {
2207 nlevels = 4;
2208 }
2209
2210 while (--nlevels >= 0) {
2211 /* Zero out the lower 12 bits and the upper 12 bits */
2212 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
2213
2214 ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
2215 if (ptpbase == NULL)
2216 goto error;
2217
2218 ptpshift = PAGE_SHIFT + nlevels * 9;
2219 ptpindex = (gla >> ptpshift) & 0x1FF;
2220 pgsize = 1UL << ptpshift;
2221
2222 pte = ptpbase[ptpindex];
2223
2224 if ((pte & PG_V) == 0 ||
2225 (usermode && (pte & PG_U) == 0) ||
2226 (writable && (pte & PG_RW) == 0)) {
2227 if (!check_only) {
2228 pfcode = pf_error_code(usermode, prot, 0, pte);
2229 vm_inject_pf(vcpu, pfcode, gla);
2230 }
2231 goto fault;
2232 }
2233
2234 /* Set the accessed bit in the page table entry */
2235 if (!check_only && (pte & PG_A) == 0) {
2236 if (atomic_cmpset_64(&ptpbase[ptpindex],
2237 pte, pte | PG_A) == 0) {
2238 goto restart;
2239 }
2240 }
2241
2242 if (nlevels > 0 && (pte & PG_PS) != 0) {
2243 if (pgsize > 1 * GB) {
2244 if (!check_only) {
2245 pfcode = pf_error_code(usermode, prot, 1,
2246 pte);
2247 vm_inject_pf(vcpu, pfcode, gla);
2248 }
2249 goto fault;
2250 }
2251 break;
2252 }
2253
2254 ptpphys = pte;
2255 }
2256
2257 /* Set the dirty bit in the page table entry if necessary */
2258 if (!check_only && writable && (pte & PG_M) == 0) {
2259 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
2260 goto restart;
2261 }
2262
2263 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
2264 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
2265 *gpa = pte | (gla & (pgsize - 1));
2266 done:
2267 ptp_release(&cookie);
2268 KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
2269 __func__, retval));
2270 return (retval);
2271 error:
2272 retval = EFAULT;
2273 goto done;
2274 fault:
2275 *guest_fault = 1;
2276 goto done;
2277 }
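/*
 * Annotation (not from the original source): in the 4-level walk above
 * the first iteration runs with nlevels == 3, so ptpshift == 12 + 3 * 9
 * == 39 and ptpindex selects gla bits 47:39 (the PML4 index); each
 * subsequent iteration drops ptpshift by 9 until the 4KB leaf, where
 * the low 12 bits of the gla become the page offset.  The 32-bit walk
 * works the same way with 10-bit indices.
 */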
2278
2279 int
2280 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
2281 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2282 {
2283
2284 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2285 false));
2286 }
2287
2288 int
2289 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
2290 uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
2291 {
2292
2293 return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
2294 true));
2295 }
2296
2297 int
2298 vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
2299 uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
2300 {
2301 struct vm_copyinfo copyinfo[2];
2302 int error, prot;
2303
2304 if (inst_length > VIE_INST_SIZE)
2305 panic("vmm_fetch_instruction: invalid length %d", inst_length);
2306
2307 prot = PROT_READ | PROT_EXEC;
2308 error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
2309 copyinfo, nitems(copyinfo), faultptr);
2310 if (error || *faultptr)
2311 return (error);
2312
2313 vm_copyin(copyinfo, vie->inst, inst_length);
2314 vm_copy_teardown(copyinfo, nitems(copyinfo));
2315 vie->num_valid = inst_length;
2316 return (0);
2317 }
2318 #endif /* _KERNEL */
2319
2320 static int
2321 vie_peek(struct vie *vie, uint8_t *x)
2322 {
2323
2324 if (vie->num_processed < vie->num_valid) {
2325 *x = vie->inst[vie->num_processed];
2326 return (0);
2327 } else
2328 return (-1);
2329 }
2330
2331 static void
2332 vie_advance(struct vie *vie)
2333 {
2334
2335 vie->num_processed++;
2336 }
2337
2338 static bool
2339 segment_override(uint8_t x, int *seg)
2340 {
2341
2342 switch (x) {
2343 case 0x2E:
2344 *seg = VM_REG_GUEST_CS;
2345 break;
2346 case 0x36:
2347 *seg = VM_REG_GUEST_SS;
2348 break;
2349 case 0x3E:
2350 *seg = VM_REG_GUEST_DS;
2351 break;
2352 case 0x26:
2353 *seg = VM_REG_GUEST_ES;
2354 break;
2355 case 0x64:
2356 *seg = VM_REG_GUEST_FS;
2357 break;
2358 case 0x65:
2359 *seg = VM_REG_GUEST_GS;
2360 break;
2361 default:
2362 return (false);
2363 }
2364 return (true);
2365 }
2366
2367 static int
2368 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2369 {
2370 uint8_t x;
2371
2372 while (1) {
2373 if (vie_peek(vie, &x))
2374 return (-1);
2375
2376 if (x == 0x66)
2377 vie->opsize_override = 1;
2378 else if (x == 0x67)
2379 vie->addrsize_override = 1;
2380 else if (x == 0xF3)
2381 vie->repz_present = 1;
2382 else if (x == 0xF2)
2383 vie->repnz_present = 1;
2384 else if (segment_override(x, &vie->segment_register))
2385 vie->segment_override = 1;
2386 else
2387 break;
2388
2389 vie_advance(vie);
2390 }
2391
2392 /*
2393 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2394 * - Only one REX prefix is allowed per instruction.
2395 * - The REX prefix must immediately precede the opcode byte or the
2396 * escape opcode byte.
2397 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2398 * the mandatory prefix must come before the REX prefix.
2399 */
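/*
 * Annotation: a REX byte has the form 0100WRXB (0x40-0x4F); W selects
 * a 64-bit operand size, and R, X and B extend the ModR/M reg, SIB
 * index, and ModR/M r/m (or SIB base) fields, which is how the bits
 * are unpacked below.
 */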
2400 if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2401 vie->rex_present = 1;
2402 vie->rex_w = x & 0x8 ? 1 : 0;
2403 vie->rex_r = x & 0x4 ? 1 : 0;
2404 vie->rex_x = x & 0x2 ? 1 : 0;
2405 vie->rex_b = x & 0x1 ? 1 : 0;
2406 vie_advance(vie);
2407 }
2408
2409 /*
2410 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
2411 */
2412 if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
2413 && x == 0xC4) {
2414 const struct vie_op *optab;
2415
2416 /* 3-byte VEX prefix. */
2417 vie->vex_present = 1;
2418
2419 vie_advance(vie);
2420 if (vie_peek(vie, &x))
2421 return (-1);
2422
2423 /*
2424 * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted
2425 * relative to REX encoding.
2426 */
2427 vie->rex_r = x & 0x80 ? 0 : 1;
2428 vie->rex_x = x & 0x40 ? 0 : 1;
2429 vie->rex_b = x & 0x20 ? 0 : 1;
2430
2431 switch (x & 0x1F) {
2432 case 0x2:
2433 /* 0F 38. */
2434 optab = three_byte_opcodes_0f38;
2435 break;
2436 case 0x1:
2437 /* 0F class - nothing handled here yet. */
2438 /* FALLTHROUGH */
2439 case 0x3:
2440 /* 0F 3A class - nothing handled here yet. */
2441 /* FALLTHROUGH */
2442 default:
2443 /* Reserved (#UD). */
2444 return (-1);
2445 }
2446
2447 vie_advance(vie);
2448 if (vie_peek(vie, &x))
2449 return (-1);
2450
2451 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
2452 vie->rex_w = x & 0x80 ? 1 : 0;
2453
2454 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
2455 vie->vex_l = !!(x & 0x4);
2456 vie->vex_pp = (x & 0x3);
2457
2458 /* PP: 1=66 2=F3 3=F2 prefixes. */
2459 switch (vie->vex_pp) {
2460 case 0x1:
2461 vie->opsize_override = 1;
2462 break;
2463 case 0x2:
2464 vie->repz_present = 1;
2465 break;
2466 case 0x3:
2467 vie->repnz_present = 1;
2468 break;
2469 }
2470
2471 vie_advance(vie);
2472
2473 /* Opcode, sans the literal escape-opcode prefix. */
2474 if (vie_peek(vie, &x))
2475 return (-1);
2476
2477 vie->op = optab[x];
2478 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2479 return (-1);
2480
2481 vie_advance(vie);
2482 }
2483
2484 /*
2485 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2486 */
2487 if (cpu_mode == CPU_MODE_64BIT) {
2488 /*
2489 * Default address size is 64-bits and default operand size
2490 * is 32-bits.
2491 */
2492 vie->addrsize = vie->addrsize_override ? 4 : 8;
2493 if (vie->rex_w)
2494 vie->opsize = 8;
2495 else if (vie->opsize_override)
2496 vie->opsize = 2;
2497 else
2498 vie->opsize = 4;
2499 } else if (cs_d) {
2500 /* Default address and operand sizes are 32-bits */
2501 vie->addrsize = vie->addrsize_override ? 2 : 4;
2502 vie->opsize = vie->opsize_override ? 2 : 4;
2503 } else {
2504 /* Default address and operand sizes are 16-bits */
2505 vie->addrsize = vie->addrsize_override ? 4 : 2;
2506 vie->opsize = vie->opsize_override ? 4 : 2;
2507 }
2508 return (0);
2509 }
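/*
 * Examples (annotation, not from the original source): in 64-bit mode a
 * bare opcode decodes with addrsize 8 and opsize 4; a 0x66 prefix drops
 * opsize to 2, while REX.W forces opsize 8 regardless of 0x66.  With
 * cs_d clear (a 16-bit code segment) the defaults flip to addrsize 2
 * and opsize 2, and the 0x67/0x66 prefixes select the 32-bit sizes.
 */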
2510
2511 static int
2512 decode_two_byte_opcode(struct vie *vie)
2513 {
2514 uint8_t x;
2515
2516 if (vie_peek(vie, &x))
2517 return (-1);
2518
2519 vie->op = two_byte_opcodes[x];
2520
2521 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2522 return (-1);
2523
2524 vie_advance(vie);
2525 return (0);
2526 }
2527
2528 static int
2529 decode_opcode(struct vie *vie)
2530 {
2531 uint8_t x;
2532
2533 if (vie_peek(vie, &x))
2534 return (-1);
2535
2536 /* Already did this via VEX prefix. */
2537 if (vie->op.op_type != VIE_OP_TYPE_NONE)
2538 return (0);
2539
2540 vie->op = one_byte_opcodes[x];
2541
2542 if (vie->op.op_type == VIE_OP_TYPE_NONE)
2543 return (-1);
2544
2545 vie_advance(vie);
2546
2547 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2548 return (decode_two_byte_opcode(vie));
2549
2550 return (0);
2551 }
2552
2553 static int
2554 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2555 {
2556 uint8_t x;
2557
2558 if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2559 return (0);
2560
2561 if (cpu_mode == CPU_MODE_REAL)
2562 return (-1);
2563
2564 if (vie_peek(vie, &x))
2565 return (-1);
2566
2567 vie->mod = (x >> 6) & 0x3;
2568 vie->rm = (x >> 0) & 0x7;
2569 vie->reg = (x >> 3) & 0x7;
2570
2571 /*
2572 * A direct addressing mode makes no sense in the context of an EPT
2573 * fault. There has to be a memory access involved to cause the
2574 * EPT fault.
2575 */
2576 if (vie->mod == VIE_MOD_DIRECT)
2577 return (-1);
2578
2579 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2580 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2581 /*
2582 * Table 2-5: Special Cases of REX Encodings
2583 *
2584 * mod=0, r/m=5 is used in the compatibility mode to
2585 * indicate a disp32 without a base register.
2586 *
2587 * mod!=3, r/m=4 is used in the compatibility mode to
2588 * indicate that the SIB byte is present.
2589 *
2590 * The 'b' bit in the REX prefix is a don't-care
2591 * in this case.
2592 */
2593 } else {
2594 vie->rm |= (vie->rex_b << 3);
2595 }
2596
2597 vie->reg |= (vie->rex_r << 3);
2598
2599 /* SIB */
2600 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2601 goto done;
2602
2603 vie->base_register = gpr_map[vie->rm];
2604
2605 switch (vie->mod) {
2606 case VIE_MOD_INDIRECT_DISP8:
2607 vie->disp_bytes = 1;
2608 break;
2609 case VIE_MOD_INDIRECT_DISP32:
2610 vie->disp_bytes = 4;
2611 break;
2612 case VIE_MOD_INDIRECT:
2613 if (vie->rm == VIE_RM_DISP32) {
2614 vie->disp_bytes = 4;
2615 /*
2616 * Table 2-7. RIP-Relative Addressing
2617 *
2618 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2619 * whereas in compatibility mode it just implies disp32.
2620 */
2621
2622 if (cpu_mode == CPU_MODE_64BIT)
2623 vie->base_register = VM_REG_GUEST_RIP;
2624 else
2625 vie->base_register = VM_REG_LAST;
2626 }
2627 break;
2628 }
2629
2630 done:
2631 vie_advance(vie);
2632
2633 return (0);
2634 }
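/*
 * Example (annotation): a ModR/M byte of 0x88 decodes as mod 2, reg 1,
 * r/m 0, i.e. VIE_MOD_INDIRECT_DISP32 with a 4-byte displacement and
 * %rax (extended by REX.B) as the base register; mod 3 is rejected
 * above because a register-direct operand cannot cause an EPT fault.
 */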
2635
2636 static int
2637 decode_sib(struct vie *vie)
2638 {
2639 uint8_t x;
2640
2641 /* Proceed only if SIB byte is present */
2642 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2643 return (0);
2644
2645 if (vie_peek(vie, &x))
2646 return (-1);
2647
2648 /* De-construct the SIB byte */
2649 vie->ss = (x >> 6) & 0x3;
2650 vie->index = (x >> 3) & 0x7;
2651 vie->base = (x >> 0) & 0x7;
2652
2653 /* Apply the REX prefix modifiers */
2654 vie->index |= vie->rex_x << 3;
2655 vie->base |= vie->rex_b << 3;
2656
2657 switch (vie->mod) {
2658 case VIE_MOD_INDIRECT_DISP8:
2659 vie->disp_bytes = 1;
2660 break;
2661 case VIE_MOD_INDIRECT_DISP32:
2662 vie->disp_bytes = 4;
2663 break;
2664 }
2665
2666 if (vie->mod == VIE_MOD_INDIRECT &&
2667 (vie->base == 5 || vie->base == 13)) {
2668 /*
2669 * Special case when base register is unused if mod = 0
2670 * and base = %rbp or %r13.
2671 *
2672 * Documented in:
2673 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2674 * Table 2-5: Special Cases of REX Encodings
2675 */
2676 vie->disp_bytes = 4;
2677 } else {
2678 vie->base_register = gpr_map[vie->base];
2679 }
2680
2681 /*
2682 * All encodings of 'index' are valid except for %rsp (4).
2683 *
2684 * Documented in:
2685 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2686 * Table 2-5: Special Cases of REX Encodings
2687 */
2688 if (vie->index != 4)
2689 vie->index_register = gpr_map[vie->index];
2690
2691 /* 'scale' makes sense only in the context of an index register */
2692 if (vie->index_register < VM_REG_LAST)
2693 vie->scale = 1 << vie->ss;
2694
2695 vie_advance(vie);
2696
2697 return (0);
2698 }
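/*
 * Example (annotation): a SIB byte of 0xd1 decodes as ss 3, index 2,
 * base 1, giving base %rcx, index %rdx and scale 8 (before any
 * REX.X/REX.B extension); an index field of 4 with REX.X clear means
 * "no index register", so the scale is left at its default.
 */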
2699
2700 static int
2701 decode_displacement(struct vie *vie)
2702 {
2703 int n, i;
2704 uint8_t x;
2705
2706 union {
2707 char buf[4];
2708 int8_t signed8;
2709 int32_t signed32;
2710 } u;
2711
2712 if ((n = vie->disp_bytes) == 0)
2713 return (0);
2714
2715 if (n != 1 && n != 4)
2716 panic("decode_displacement: invalid disp_bytes %d", n);
2717
2718 for (i = 0; i < n; i++) {
2719 if (vie_peek(vie, &x))
2720 return (-1);
2721
2722 u.buf[i] = x;
2723 vie_advance(vie);
2724 }
2725
2726 if (n == 1)
2727 vie->displacement = u.signed8; /* sign-extended */
2728 else
2729 vie->displacement = u.signed32; /* sign-extended */
2730
2731 return (0);
2732 }
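/*
 * Annotation: the displacement bytes are assembled in instruction-stream
 * (little-endian) order and kept sign-extended, so a disp8 of 0xf0
 * becomes -16, as does a disp32 of 0xfffffff0, before being added to
 * the base and scaled index later on.
 */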
2733
2734 static int
2735 decode_immediate(struct vie *vie)
2736 {
2737 int i, n;
2738 uint8_t x;
2739 union {
2740 char buf[4];
2741 int8_t signed8;
2742 int16_t signed16;
2743 int32_t signed32;
2744 } u;
2745
2746 /* Figure out immediate operand size (if any) */
2747 if (vie->op.op_flags & VIE_OP_F_IMM) {
2748 /*
2749 * Section 2.2.1.5 "Immediates", Intel SDM:
2750 * In 64-bit mode the typical size of immediate operands
2751 * remains 32-bits. When the operand size is 64-bits, the
2752 * processor sign-extends all immediates to 64-bits prior
2753 * to their use.
2754 */
2755 if (vie->opsize == 4 || vie->opsize == 8)
2756 vie->imm_bytes = 4;
2757 else
2758 vie->imm_bytes = 2;
2759 } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2760 vie->imm_bytes = 1;
2761 }
2762
2763 if ((n = vie->imm_bytes) == 0)
2764 return (0);
2765
2766 KASSERT(n == 1 || n == 2 || n == 4,
2767 ("%s: invalid number of immediate bytes: %d", __func__, n));
2768
2769 for (i = 0; i < n; i++) {
2770 if (vie_peek(vie, &x))
2771 return (-1);
2772
2773 u.buf[i] = x;
2774 vie_advance(vie);
2775 }
2776
2777 /* sign-extend the immediate value before use */
2778 if (n == 1)
2779 vie->immediate = u.signed8;
2780 else if (n == 2)
2781 vie->immediate = u.signed16;
2782 else
2783 vie->immediate = u.signed32;
2784
2785 return (0);
2786 }
2787
2788 static int
2789 decode_moffset(struct vie *vie)
2790 {
2791 int i, n;
2792 uint8_t x;
2793 union {
2794 char buf[8];
2795 uint64_t u64;
2796 } u;
2797
2798 if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2799 return (0);
2800
2801 /*
2802 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2803 * The memory offset size follows the address-size of the instruction.
2804 */
2805 n = vie->addrsize;
2806 KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2807
2808 u.u64 = 0;
2809 for (i = 0; i < n; i++) {
2810 if (vie_peek(vie, &x))
2811 return (-1);
2812
2813 u.buf[i] = x;
2814 vie_advance(vie);
2815 }
2816 vie->displacement = u.u64;
2817 return (0);
2818 }
2819
2820 #ifdef _KERNEL
2821 /*
2822 * Verify that the 'guest linear address' provided as collateral of the nested
2823 * page table fault matches our instruction decoding.
2824 */
2825 static int
2826 verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
2827 enum vm_cpu_mode cpu_mode)
2828 {
2829 int error;
2830 uint64_t base, segbase, idx, gla2;
2831 enum vm_reg_name seg;
2832 struct seg_desc desc;
2833
2834 /* Skip 'gla' verification */
2835 if (gla == VIE_INVALID_GLA)
2836 return (0);
2837
2838 base = 0;
2839 if (vie->base_register != VM_REG_LAST) {
2840 error = vm_get_register(vcpu, vie->base_register, &base);
2841 if (error) {
2842 printf("verify_gla: error %d getting base reg %d\n",
2843 error, vie->base_register);
2844 return (-1);
2845 }
2846
2847 /*
2848 * RIP-relative addressing starts from the following
2849 * instruction
2850 */
2851 if (vie->base_register == VM_REG_GUEST_RIP)
2852 base += vie->num_processed;
2853 }
2854
2855 idx = 0;
2856 if (vie->index_register != VM_REG_LAST) {
2857 error = vm_get_register(vcpu, vie->index_register, &idx);
2858 if (error) {
2859 printf("verify_gla: error %d getting index reg %d\n",
2860 error, vie->index_register);
2861 return (-1);
2862 }
2863 }
2864
2865 /*
2866 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2867 *
2868 * In 64-bit mode, segmentation is generally (but not
2869 * completely) disabled. The exceptions are the FS and GS
2870 * segments.
2871 *
2872 * In legacy IA-32 mode, when the ESP or EBP register is used
2873 * as the base, the SS segment is the default segment. For
2874 * other data references, except when relative to stack or
2875 * string destination, the DS segment is the default. These
2876 * can be overridden to allow other segments to be accessed.
2877 */
2878 if (vie->segment_override)
2879 seg = vie->segment_register;
2880 else if (vie->base_register == VM_REG_GUEST_RSP ||
2881 vie->base_register == VM_REG_GUEST_RBP)
2882 seg = VM_REG_GUEST_SS;
2883 else
2884 seg = VM_REG_GUEST_DS;
2885 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2886 seg != VM_REG_GUEST_GS) {
2887 segbase = 0;
2888 } else {
2889 error = vm_get_seg_desc(vcpu, seg, &desc);
2890 if (error) {
2891 printf("verify_gla: error %d getting segment"
2892 " descriptor %d", error,
2893 vie->segment_register);
2894 return (-1);
2895 }
2896 segbase = desc.base;
2897 }
2898
2899 gla2 = segbase + base + vie->scale * idx + vie->displacement;
2900 gla2 &= size2mask[vie->addrsize];
2901 if (gla != gla2) {
2902 printf("verify_gla mismatch: segbase(0x%0lx)"
2903 "base(0x%0lx), scale(%d), index(0x%0lx), "
2904 "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2905 segbase, base, vie->scale, idx, vie->displacement,
2906 gla, gla2);
2907 return (-1);
2908 }
2909
2910 return (0);
2911 }
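/*
 * Annotation: the recomputed address is
 *	gla2 = segbase + base + scale * index + displacement
 * truncated to the effective address size, so for a decoded
 * "mov %eax,0x10(%rbx,%rcx,4)" it must equal the gla reported with the
 * nested page table fault, otherwise the decode is rejected.
 */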
2912 #endif /* _KERNEL */
2913
2914 int
2915 #ifdef _KERNEL
2916 vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
2917 enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2918 #else
2919 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2920 #endif
2921 {
2922
2923 if (decode_prefixes(vie, cpu_mode, cs_d))
2924 return (-1);
2925
2926 if (decode_opcode(vie))
2927 return (-1);
2928
2929 if (decode_modrm(vie, cpu_mode))
2930 return (-1);
2931
2932 if (decode_sib(vie))
2933 return (-1);
2934
2935 if (decode_displacement(vie))
2936 return (-1);
2937
2938 if (decode_immediate(vie))
2939 return (-1);
2940
2941 if (decode_moffset(vie))
2942 return (-1);
2943
2944 #ifdef _KERNEL
2945 if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2946 if (verify_gla(vcpu, gla, vie, cpu_mode))
2947 return (-1);
2948 }
2949 #endif
2950
2951 vie->decoded = 1; /* success */
2952
2953 return (0);
2954 }
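/*
 * Usage sketch (annotation, not from the original source), using the
 * userspace prototype above; the byte string is only an example
 * (88 0a == "mov %cl,(%rdx)"):
 *
 *	struct vie vie;
 *
 *	vie_init(&vie, "\x88\x0a", 2);
 *	if (vmm_decode_instruction(CPU_MODE_64BIT, 0, &vie) == 0)
 *		// emulate the access described by 'vie'
 */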
2955