/*-
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <assert.h>
#include <vmmapi.h>
#define	KASSERT(exp,msg)	assert((exp))
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)

static const struct vie_op two_byte_opcodes[256] = {
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x81] = {
		/* XXX Group 1 extended opcode - not just AND */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_AND,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* XXX Group 1 extended opcode - not just OR */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_OR,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vm, vcpuid, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
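	 * The other legacy high-byte registers follow the same pattern:
	 * %ch = %rcx >> 8, %dh = %rdx >> 8 and %bh = %rbx >> 8.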
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vm, vcpuid, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vm, vcpuid, reg, val);
	}
	return (error);
}

int
vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vm, vcpuid, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vm, vcpuid, reg, val);
	return (error);
}

#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
		if (error == 0)
			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vm, vcpuid, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vm, vcpuid, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vm, vcpuid, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vm, vcpuid, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vm, vcpuid, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite,
    void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vm, vcpuid, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vm, vcpuid, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

static int
emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vm, vcpuid, reg, result, size);
		break;
	case 0x81:
		/*
		 * AND/OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * AND: i = 4
		 * OR:  i = 1
		 * 81 /i		op r/m16, imm16
		 * 81 /i		op r/m32, imm32
		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
		 *
		 */

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		switch (vie->reg & 7) {
		case 0x4:
			/* modrm:reg == b100, AND */
			result = val1 & vie->immediate;
			break;
		case 0x1:
			/* modrm:reg == b001, OR */
			result = val1 | vie->immediate;
			break;
		default:
			error = EINVAL;
			break;
		}
		if (error)
			break;

		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t val1, result, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 83 /1		OR r/m16, imm8 sign-extended to 16
		 * 83 /1		OR r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
		 *
		 * Currently, only the OR operation of the 0x83 opcode
		 * is implemented (ModRM:reg = b001).
		 */
		if ((vie->reg & 7) != 1)
			break;

		/* get the first operand */
		error = memread(vm, vcpuid, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vm, vcpuid, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x3B:
		/*
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare first operand (reg) with second operand (r/m) and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
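		 * The result of the subtraction itself is discarded; only
		 * the status flags are retained.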
		 */

		/* Get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &op1);
		if (error)
			return (error);

		/* Get the second operand */
		error = memread(vm, vcpuid, gpa, &op2, size, arg);
		if (error)
			return (error);

		break;
	default:
		return (EINVAL);
	}
	rflags2 = getcc(size, op1, op2);
	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vm, vcpuid, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vm, vcpuid, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vm, vcpuid, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, size, stackaddrsize, pushop;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
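		 * (B = 1 selects a 32-bit %esp, B = 0 a 16-bit %sp.)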
		 */
		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vm, vcpuid, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vm, vcpuid, 0);
		return (0);
	}

	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
	if (error == -1) {
		/*
		 * XXX cannot return a negative error value here because it
		 * ends up being the return value of the VM_RUN() ioctl and
		 * is interpreted as a pseudo-error (for e.g. ERESTART).
		 */
		return (EFAULT);
	} else if (error == 1) {
		/* Resume guest execution to handle page fault */
		return (0);
	}

	if (pushop) {
		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
		if (error == 0)
			vm_copyout(vm, vcpuid, &val, copyinfo, size);
	} else {
		vm_copyin(vm, vcpuid, copyinfo, &val, size);
		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
		rsp += size;
	}
#ifdef _KERNEL
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
#endif

	if (error == 0) {
		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}

static int
emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_POP:
		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vm, vcpuid, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of the bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
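	 * For example, 0x00007fffffffffff and 0xffff800000000000 are
	 * canonical but 0x0000800000000000 is not.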
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
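	 * For %fs and %gs the base is taken from the segment descriptor
	 * supplied by the caller in 'desc'.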
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

#ifdef _KERNEL
void
vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
{
	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
	    ("%s: invalid instruction length (%d)", __func__, inst_length));

	bzero(vie, sizeof(struct vie));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;

	if (inst_length) {
		bcopy(inst_bytes, vie->inst, inst_length);
		vie->num_valid = inst_length;
	}
}

static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}

static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

int
vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				pfcode = pf_error_code(usermode, prot, 0,
				    pte32);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags.
			 * While the accessed flag is set at every level of
			 * the page table, the dirty flag is only set at the
			 * last level providing the guest physical address.
			 */
			if ((pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			pfcode = pf_error_code(usermode, prot, 0, pte);
			vm_inject_pf(vm, vcpuid, pfcode, gla);
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if ((pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				pfcode = pf_error_code(usermode, prot, 1, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	return (retval);
error:
	retval = -1;
	goto done;
fault:
	retval = 1;
	goto done;
}

int
vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo));
	if (error == 0) {
		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
		vie->num_valid = inst_length;
	}
	return (error);
}

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else
			break;

		vie_advance(vie);
	}

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault.
	 * There has to be a memory access involved to cause the EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char	buf[4];
		int8_t	signed8;
		int32_t	signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[4];
		int8_t	signed8;
		int16_t	signed16;
		int32_t	signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char	buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

/*
 * Verify that all the bytes in the instruction buffer were consumed.
 */
static int
verify_inst_length(struct vie *vie)
{

	if (vie->num_processed == vie->num_valid)
		return (0);
	else
		return (-1);
}

/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches with our instruction decoding.
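 * The address is recomputed from the decoded components as
 * base + scale * index + displacement (assuming a segment base of 0) and
 * compared against the 'gla' reported by the hardware.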
 */
static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
	int error;
	uint64_t base, idx, gla2;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_valid;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}

	/* XXX assuming that the base address of the segment is 0 */
	gla2 = base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    base, vie->scale, idx, vie->displacement, gla, gla2);
		return (-1);
	}

	return (0);
}

int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

	if (verify_inst_length(vie))
		return (-1);

	if (verify_gla(vm, cpuid, gla, vie))
		return (-1);

	vie->decoded = 1;	/* success */

	return (0);
}
#endif	/* _KERNEL */