/*-
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <assert.h>
#include <vmmapi.h>
#define	KASSERT(exp,msg)	assert((exp))
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)	/* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)	/* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)

static const struct vie_op two_byte_opcodes[256] = {
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x81] = {
		/* XXX Group 1 extended opcode - not just AND */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_AND,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* XXX Group 1 extended opcode - not just OR */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_OR,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

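/*
 * Opcodes that do not have an entry in the tables above are left
 * zero-initialized, i.e. VIE_OP_TYPE_NONE, and are rejected by the
 * opcode decoder.
 */
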
/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vm, vcpuid, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

#define	RFLAGS_STATUS_BITS	(PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

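/*
 * The GETCC() macro above instantiates getcc8(), getcc16(), getcc32() and
 * getcc64(), which compute the status flags by having the host CPU perform
 * the same-width subtraction and capturing its RFLAGS. The trailing
 * 'struct __hack' in the macro exists only to absorb the semicolon that
 * follows each instantiation.
 */
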
static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
		if (error == 0)
			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vm, vcpuid, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vm, vcpuid, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite,
    void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vm, vcpuid, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
		/*
		 * AND/OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * AND: i = 4
		 * OR:  i = 1
		 * 81 /i		op r/m16, imm16
		 * 81 /i		op r/m32, imm32
		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
		 *
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		switch (vie->reg & 7) {
		case 0x4:
			/* modrm:reg == b100, AND */
			result = val1 & vie->immediate;
			break;
		case 0x1:
			/* modrm:reg == b001, OR */
			result = val1 | vie->immediate;
			break;
		default:
			error = EINVAL;
			break;
		}
		if (error)
			break;

		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t val1, result, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 83 /1		OR r/m16, imm8 sign-extended to 16
		 * 83 /1		OR r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
		 *
		 * Currently, only the OR operation of the 0x83 opcode
		 * is implemented (ModRM:reg = b001).
		 */
		if ((vie->reg & 7) != 1)
			break;

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x3B:
		/*
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare first operand (reg) with second operand (r/m) and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &op1);
		if (error)
			return (error);

		/* Get the second operand */
		error = memread(vm, vcpuid, gpa, &op2, size, arg);
		if (error)
			return (error);

		break;
	default:
		return (EINVAL);
	}
	rflags2 = getcc(size, op1, op2);
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, size, stackaddrsize;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	size = vie->opsize;
	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));

	rsp -= size;
	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
	    copyinfo, nitems(copyinfo));
	if (error == -1) {
		/*
		 * XXX cannot return a negative error value here because it
		 * ends up being the return value of the VM_RUN() ioctl and
		 * is interpreted as a pseudo-error (e.g. ERESTART).
		 */
		return (EFAULT);
	} else if (error == 1) {
		/* Resume guest execution to handle page fault */
		return (0);
	}

	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
	if (error == 0) {
		vm_copyout(vm, vcpuid, &val, copyinfo, size);
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
#ifdef _KERNEL
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
#endif
	return (error);
}

int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

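/*
 * An alignment check fault can only be raised for an access performed at
 * CPL 3 with both CR0.AM and RFLAGS.AC set. Returns non-zero if the access
 * is misaligned for its size (e.g. a 4-byte access at a linear address that
 * is not a multiple of 4).
 */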
int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

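/*
 * In 64-bit mode a linear address is canonical if bits 63:47 are all
 * identical, i.e. sign-extended from bit 47: 0x00007fffffffffff and
 * 0xffff800000000000 are canonical whereas 0x0000800000000000 is not.
 * Returns non-zero if 'gla' is not canonical.
 */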
int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

#ifdef _KERNEL
void
vie_init(struct vie *vie)
{

	bzero(vie, sizeof(struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
}

static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}

static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

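/*
 * Translate the guest linear address 'gla' into a guest physical address
 * by walking the guest's page tables in software. Returns 0 on success,
 * 1 if an exception (e.g. #PF) was injected into the guest and -1 if the
 * page table pages could not be accessed.
 */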
int
vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				pfcode = pf_error_code(usermode, prot, 0,
				    pte32);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if ((pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if ((pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				pfcode = pf_error_code(usermode, prot, 1, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	return (retval);
error:
	retval = -1;
	goto done;
fault:
	retval = 1;
	goto done;
}

int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo));
	if (error == 0) {
		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		vie->num_valid = inst_length;
	}
	return (error);
}

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

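/*
 * Decode the legacy 0x66/0x67 and REX prefixes. For example, in 64-bit
 * mode an instruction carrying both the 0x66 operand-size override and a
 * REX.W prefix (byte sequence 66 48 ...) ends up with an 8-byte operand
 * size because REX.W takes precedence over the override.
 */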
static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

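/*
 * Decode the ModRM byte. For example, for "mov %rcx,(%rax)" (48 89 08)
 * the ModRM byte is 0x08: mod=0 (indirect, no displacement), reg=1 (%rcx)
 * and r/m=0 (%rax).
 */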
static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

/*
 * Verify that all the bytes in the instruction buffer were consumed.
 */
static int
verify_inst_length(struct vie *vie)
{

	if (vie->num_processed == vie->num_valid)
		return (0);
	else
		return (-1);
}

/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches with our instruction decoding.
 */
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
	int error;
	uint64_t base, idx, gla2;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_valid;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/* XXX assuming that the base address of the segment is 0 */
	gla2 = base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    base, vie->scale, idx, vie->displacement, gla, gla2);
		return (-1);
	}

	return (0);
}

int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	if (verify_inst_length(vie))
		return (-1);

	if (verify_gla(vm, cpuid, gla, vie))
		return (-1);

	vie->decoded = 1;	/* success */

	return (0);
}
#endif	/* _KERNEL */

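/*
 * Typical usage (a sketch only; the actual callers live elsewhere in vmm
 * and perform full error handling): after a nested page table fault the
 * faulting instruction is fetched, decoded and then emulated:
 *
 *	vie_init(&vie);
 *	vmm_fetch_instruction(vm, vcpuid, &paging, rip, inst_length, &vie);
 *	vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, &vie);
 *	vmm_emulate_instruction(vm, vcpuid, gpa, &vie, &paging, memread,
 *	    memwrite, memarg);
 */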