/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012 Sandvine, Inc.
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>
#include <machine/vmm.h>
#else	/* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/_iovec.h>

#include <machine/vmm.h>

#include <err.h>
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <vmmapi.h>
#define	__diagused
#define	KASSERT(exp,msg)	assert((exp))
#define	panic(...)		errx(4, __VA_ARGS__)
#endif	/* _KERNEL */

#include <machine/vmm_instruction_emul.h>
#include <x86/psl.h>
#include <x86/specialreg.h>

/* struct vie_op.op_type */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_LAST
};

/* struct vie_op.op_flags */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)

static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};

static const struct vie_op two_byte_opcodes[256] = {
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};

static const struct vie_op one_byte_opcodes[256] = {
	[0x03] = {
		.op_byte = 0x03,
		.op_type = VIE_OP_TYPE_ADD,
	},
	[0x0F] = {
		.op_byte = 0x0F,
		.op_type = VIE_OP_TYPE_TWO_BYTE
	},
	[0x0B] = {
		.op_byte = 0x0B,
		.op_type = VIE_OP_TYPE_OR,
	},
	[0x2B] = {
		.op_byte = 0x2B,
		.op_type = VIE_OP_TYPE_SUB,
	},
	[0x39] = {
		.op_byte = 0x39,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x3B] = {
		.op_byte = 0x3B,
		.op_type = VIE_OP_TYPE_CMP,
	},
	[0x88] = {
		.op_byte = 0x88,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x89] = {
		.op_byte = 0x89,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8A] = {
		.op_byte = 0x8A,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0x8B] = {
		.op_byte = 0x8B,
		.op_type = VIE_OP_TYPE_MOV,
	},
	[0xA1] = {
		.op_byte = 0xA1,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA3] = {
		.op_byte = 0xA3,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
	},
	[0xA4] = {
		.op_byte = 0xA4,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xA5] = {
		.op_byte = 0xA5,
		.op_type = VIE_OP_TYPE_MOVS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAA] = {
		.op_byte = 0xAA,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAB] = {
		.op_byte = 0xAB,
		.op_type = VIE_OP_TYPE_STOS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xC6] = {
		/* XXX Group 11 extended opcode - not just MOV */
		.op_byte = 0xC6,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0xC7] = {
		.op_byte = 0xC7,
		.op_type = VIE_OP_TYPE_MOV,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x23] = {
		.op_byte = 0x23,
		.op_type = VIE_OP_TYPE_AND,
	},
	[0x80] = {
		/* Group 1 extended opcode */
		.op_byte = 0x80,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x81] = {
		/* Group 1 extended opcode */
		.op_byte = 0x81,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM,
	},
	[0x83] = {
		/* Group 1 extended opcode */
		.op_byte = 0x83,
		.op_type = VIE_OP_TYPE_GROUP1,
		.op_flags = VIE_OP_F_IMM8,
	},
	[0x8F] = {
		/* XXX Group 1A extended opcode - not just POP */
		.op_byte = 0x8F,
		.op_type = VIE_OP_TYPE_POP,
	},
	[0xF7] = {
		/* XXX Group 3 extended opcode - not just TEST */
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_TEST,
		.op_flags = VIE_OP_F_IMM,
	},
	[0xFF] = {
		/* XXX Group 5 extended opcode - not just PUSH */
		.op_byte = 0xFF,
		.op_type = VIE_OP_TYPE_PUSH,
	}
};

/* struct vie.mod */
#define	VIE_MOD_INDIRECT	0
#define	VIE_MOD_INDIRECT_DISP8	1
#define	VIE_MOD_INDIRECT_DISP32	2
#define	VIE_MOD_DIRECT		3

/* struct vie.rm */
#define	VIE_RM_SIB		4
#define	VIE_RM_DISP32		5

#define	GB			(1024 * 1024 * 1024)

static enum vm_reg_name gpr_map[16] = {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15
};

static uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};

static int
vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval)
{
	int error;

	error = vm_get_register(vcpu, reg, rval);

	return (error);
}

static void
vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
	*lhbr = 0;
	*reg = gpr_map[vie->reg];

	/*
	 * 64-bit mode imposes limitations on accessing legacy high byte
	 * registers (lhbr).
	 *
	 * The legacy high-byte registers cannot be addressed if the REX
	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
	 *
	 * If the REX prefix is not present then the values 4, 5, 6 and 7
	 * of the 'ModRM:reg' field address the legacy high-byte registers,
	 * %ah, %ch, %dh and %bh respectively.
	 */
	if (!vie->rex_present) {
		if (vie->reg & 0x4) {
			*lhbr = 1;
			*reg = gpr_map[vie->reg & 0x3];
		}
	}
}

static int
vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval)
{
	uint64_t val;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &val);

	/*
	 * To obtain the value of a legacy high byte register shift the
	 * base register right by 8 bits (%ah = %rax >> 8).
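	 *
	 * For example, if %rax contains 0x1234 then a read of %ah (lhbr set)
	 * returns 0x12 while a read of %al returns 0x34.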
	 */
	if (lhbr)
		*rval = val >> 8;
	else
		*rval = val;
	return (error);
}

static int
vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte)
{
	uint64_t origval, val, mask;
	int error, lhbr;
	enum vm_reg_name reg;

	vie_calc_bytereg(vie, &reg, &lhbr);
	error = vm_get_register(vcpu, reg, &origval);
	if (error == 0) {
		val = byte;
		mask = 0xff;
		if (lhbr) {
			/*
			 * Shift left by 8 to store 'byte' in a legacy high
			 * byte register.
			 */
			val <<= 8;
			mask <<= 8;
		}
		val |= origval & ~mask;
		error = vm_set_register(vcpu, reg, val);
	}
	return (error);
}

int
vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg,
    uint64_t val, int size)
{
	int error;
	uint64_t origval;

	switch (size) {
	case 1:
	case 2:
		error = vie_read_register(vcpu, reg, &origval);
		if (error)
			return (error);
		val &= size2mask[size];
		val |= origval & ~size2mask[size];
		break;
	case 4:
		val &= 0xffffffffUL;
		break;
	case 8:
		break;
	default:
		return (EINVAL);
	}

	error = vm_set_register(vcpu, reg, val);
	return (error);
}

#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);

static u_long
getcc(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getcc: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getcc8(x, y));
	else if (opsize == 2)
		return (getcc16(x, y));
	else if (opsize == 4)
		return (getcc32(x, y));
	else
		return (getcc64(x, y));
}

/*
 * Macro creation of functions getaddflags{8,16,32,64}
 */
#define	GETADDFLAGS(sz)							\
static u_long								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);

static u_long
getaddflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getaddflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getaddflags8(x, y));
	else if (opsize == 2)
		return (getaddflags16(x, y));
	else if (opsize == 4)
		return (getaddflags32(x, y));
	else
		return (getaddflags64(x, y));
}

/*
 * Return the status flags that would result from doing (x & y).
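 *
 * For example, getandflags(1, 0xf0, 0x0f) computes 0xf0 & 0x0f == 0 and so
 * returns a flags image with ZF set and CF/OF clear.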
 */
#define	GETANDFLAGS(sz)							\
static u_long								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);

static u_long
getandflags(int opsize, uint64_t x, uint64_t y)
{
	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
	    ("getandflags: invalid operand size %d", opsize));

	if (opsize == 1)
		return (getandflags8(x, y));
	else if (opsize == 2)
		return (getandflags16(x, y));
	else if (opsize == 4)
		return (getandflags32(x, y));
	else
		return (getandflags64(x, y));
}

static int
emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint8_t byte;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x88:
		/*
		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 88/r:	mov r/m8, r8
		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
		 */
		size = 1;	/* override for byte operation */
		error = vie_read_bytereg(vcpu, vie, &byte);
		if (error == 0)
			error = memwrite(vcpu, gpa, byte, size, arg);
		break;
	case 0x89:
		/*
		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
		 * 89/r:	mov r/m16, r16
		 * 89/r:	mov r/m32, r32
		 * REX.W + 89/r	mov r/m64, r64
		 */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0x8A:
		/*
		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8A/r:	mov r8, r/m8
		 * REX + 8A/r:	mov r8, r/m8
		 */
		size = 1;	/* override for byte operation */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0)
			error = vie_write_bytereg(vcpu, vie, val);
		break;
	case 0x8B:
		/*
		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
		 * 8B/r:	mov r16, r/m16
		 * 8B/r:	mov r32, r/m32
		 * REX.W 8B/r:	mov r64, r/m64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = gpr_map[vie->reg];
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA1:
		/*
		 * MOV from seg:moffset to AX/EAX/RAX
		 * A1:		mov AX, moffs16
		 * A1:		mov EAX, moffs32
		 * REX.W + A1:	mov RAX, moffs64
		 */
		error = memread(vcpu, gpa, &val, size, arg);
		if (error == 0) {
			reg = VM_REG_GUEST_RAX;
			error = vie_update_register(vcpu, reg, val, size);
		}
		break;
	case 0xA3:
		/*
		 * MOV from AX/EAX/RAX to seg:moffset
		 * A3:		mov moffs16, AX
		 * A3:		mov moffs32, EAX
		 * REX.W + A3:	mov moffs64, RAX
		 */
		error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
		if (error == 0) {
			val &= size2mask[size];
			error = memwrite(vcpu, gpa, val, size, arg);
		}
		break;
	case 0xC6:
		/*
		 * MOV from imm8 to mem (ModRM:r/m)
		 * C6/0		mov r/m8, imm8
		 * REX + C6/0	mov r/m8, imm8
		 */
		size = 1;	/* override for byte operation */
		error = memwrite(vcpu, gpa, vie->immediate, size, arg);
		break;
	case 0xC7:
		/*
		 * MOV from imm16/imm32 to mem (ModRM:r/m)
		 * C7/0		mov r/m16, imm16
		 * C7/0		mov r/m32, imm32
		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
		 */
		val = vie->immediate & size2mask[size];
		error = memwrite(vcpu, gpa, val, size, arg);
		break;
	default:
		break;
	}

	return (error);
}

static int
emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t val;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xB6:
		/*
		 * MOV and zero extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B6/r		movzx r16, r/m8
		 * 0F B6/r		movzx r32, r/m8
		 * REX.W + 0F B6/r	movzx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* zero-extend byte */
		val = (uint8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xB7:
		/*
		 * MOV and zero extend word from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F B7/r		movzx r32, r/m16
		 * REX.W + 0F B7/r	movzx r64, r/m16
		 */
		error = memread(vcpu, gpa, &val, 2, arg);
		if (error)
			return (error);

		reg = gpr_map[vie->reg];

		/* zero-extend word */
		val = (uint16_t)val;

		error = vie_update_register(vcpu, reg, val, size);
		break;
	case 0xBE:
		/*
		 * MOV and sign extend byte from mem (ModRM:r/m) to
		 * reg (ModRM:reg).
		 *
		 * 0F BE/r		movsx r16, r/m8
		 * 0F BE/r		movsx r32, r/m8
		 * REX.W + 0F BE/r	movsx r64, r/m8
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val, 1, arg);
		if (error)
			break;

		/* get the second operand */
		reg = gpr_map[vie->reg];

		/* sign extend byte */
		val = (int8_t)val;

		/* write the result */
		error = vie_update_register(vcpu, reg, val, size);
		break;
	default:
		break;
	}
	return (error);
}

/*
 * Helper function to calculate and validate a linear address.
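 *
 * The address is formed from the segment descriptor for 'seg' and the value
 * of the general purpose register 'gpr', and is then subjected to canonical
 * and alignment checks.  When one of those checks fails the appropriate
 * exception (#SS, #GP or #AC) is injected into the guest and '*fault' is set
 * rather than returning an error to the caller.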
 */
static int
get_gla(struct vcpu *vcpu, struct vie *vie __unused,
    struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
    enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
{
	struct seg_desc desc;
	uint64_t cr0, val, rflags;
	int error __diagused;

	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vm_get_seg_desc(vcpu, seg, &desc);
	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
	    __func__, error, seg));

	error = vie_read_register(vcpu, gpr, &val);
	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
	    error, gpr));

	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
	    addrsize, prot, gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_canonical_check(paging->cpu_mode, *gla)) {
		if (seg == VM_REG_GUEST_SS)
			vm_inject_ss(vcpu, 0);
		else
			vm_inject_gp(vcpu);
		goto guest_fault;
	}

	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
		vm_inject_ac(vcpu, 0);
		goto guest_fault;
	}

	*fault = 0;
	return (0);

guest_fault:
	*fault = 1;
	return (0);
}

static int
emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
	uint64_t rcx, rdi, rsi, rflags;
	int error, fault, opsize, seg, repeat;

	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
	val = 0;
	error = 0;

	/*
	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
	 *
	 * Empirically the "repnz" prefix has identical behavior to "rep"
	 * and the zero flag does not make a difference.
	 */
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
			error = 0;
			goto done;
		}
	}

	/*
	 *	Source		Destination	Comments
	 *	--------------------------------------------
	 * (1)  memory		memory		n/a
	 * (2)  memory		mmio		emulated
	 * (3)  mmio		memory		emulated
	 * (4)  mmio		mmio		emulated
	 *
	 * At this point we don't have sufficient information to distinguish
	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
	 * out because it will succeed only when operating on regular memory.
	 *
	 * XXX the emulation doesn't properly handle the case where 'gpa'
	 * is straddling the boundary between the normal memory and MMIO.
	 */

	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
	error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
	if (error || fault)
		goto done;

	error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error == 0) {
		if (fault)
			goto done;	/* Resume guest to handle fault */

		/*
		 * case (2): read from system memory and write to mmio.
		 */
		vm_copyin(copyinfo, &val, opsize);
		vm_copy_teardown(copyinfo, nitems(copyinfo));
		error = memwrite(vcpu, gpa, val, opsize, arg);
		if (error)
			goto done;
	} else {
		/*
		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
		 * if 'srcaddr' is in the mmio space.
		 */

		error = get_gla(vcpu, vie, paging, opsize, vie->addrsize,
		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
		    &fault);
		if (error || fault)
			goto done;

		error = vm_copy_setup(vcpu, paging, dstaddr, opsize,
		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
		if (error == 0) {
			if (fault)
				goto done;    /* Resume guest to handle fault */

			/*
			 * case (3): read from MMIO and write to system memory.
			 *
			 * A MMIO read can have side-effects so we
			 * commit to it only after vm_copy_setup() is
			 * successful. If a page-fault needs to be
			 * injected into the guest then it will happen
			 * before the MMIO read is attempted.
			 */
			error = memread(vcpu, gpa, &val, opsize, arg);
			if (error)
				goto done;

			vm_copyout(&val, copyinfo, opsize);
			vm_copy_teardown(copyinfo, nitems(copyinfo));
		} else {
			/*
			 * Case (4): read from and write to mmio.
			 *
			 * Commit to the MMIO read/write (with potential
			 * side-effects) only after we are sure that the
			 * instruction is not going to be restarted due
			 * to address translation faults.
			 */
			error = vm_gla2gpa(vcpu, paging, srcaddr,
			    PROT_READ, &srcgpa, &fault);
			if (error || fault)
				goto done;

			error = vm_gla2gpa(vcpu, paging, dstaddr,
			    PROT_WRITE, &dstgpa, &fault);
			if (error || fault)
				goto done;

			error = memread(vcpu, srcgpa, &val, opsize, arg);
			if (error)
				goto done;

			error = memwrite(vcpu, dstgpa, val, opsize, arg);
			if (error)
				goto done;
		}
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi);
	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D) {
		rsi -= opsize;
		rdi -= opsize;
	} else {
		rsi += opsize;
		rdi += opsize;
	}

	error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));

	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vcpu);
	}
done:
	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
	    __func__, error));
	return (error);
}

static int
emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused,
    mem_region_write_t memwrite, void *arg)
{
	int error, opsize, repeat;
	uint64_t val;
	uint64_t rcx, rdi, rflags;

	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
	repeat = vie->repz_present | vie->repnz_present;

	if (repeat) {
		error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx);
		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));

		/*
		 * The count register is %rcx, %ecx or %cx depending on the
		 * address size of the instruction.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
			return (0);
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val);
	KASSERT(!error, ("%s: error %d getting rax", __func__, error));

	error = memwrite(vcpu, gpa, val, opsize, arg);
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi);
	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	if (rflags & PSL_D)
		rdi -= opsize;
	else
		rdi += opsize;

	error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi,
	    vie->addrsize);
	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));

	if (repeat) {
		rcx = rcx - 1;
		error = vie_update_register(vcpu, VM_REG_GUEST_RCX,
		    rcx, vie->addrsize);
		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));

		/*
		 * Repeat the instruction if the count register is not zero.
		 */
		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
			vm_restart_instruction(vcpu);
	}

	return (0);
}

static int
emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x23:
		/*
		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 23/r		and r16, r/m16
		 * 23/r		and r32, r/m32
		 * REX.W + 23/r	and r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 & val2;
		error = vie_update_register(vcpu, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * AND mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /4		and r/m16, imm16
		 * 81 /4		and r/m32, imm32
		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
		 *
		 * 83 /4		and r/m16, imm8 sign-extended to 16
		 * 83 /4		and r/m32, imm8 sign-extended to 32
		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 & vie->immediate;
		error = memwrite(vcpu, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
{
	int error, size;
	enum vm_reg_name reg;
	uint64_t result, rflags, rflags2, val1, val2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x0B:
		/*
		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
		 * result in reg.
		 *
		 * 0b/r		or r16, r/m16
		 * 0b/r		or r32, r/m32
		 * REX.W + 0b/r	or r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		result = val1 | val2;
		error = vie_update_register(vcpu, reg, result, size);
		break;
	case 0x81:
	case 0x83:
		/*
		 * OR mem (ModRM:r/m) with immediate and store the
		 * result in mem.
		 *
		 * 81 /1		or r/m16, imm16
		 * 81 /1		or r/m32, imm32
		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
		 *
		 * 83 /1		or r/m16, imm8 sign-extended to 16
		 * 83 /1		or r/m32, imm8 sign-extended to 32
		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
		 */

		/* get the first operand */
		error = memread(vcpu, gpa, &val1, size, arg);
		if (error)
			break;

		/*
		 * perform the operation with the pre-fetched immediate
		 * operand and write the result
		 */
		result = val1 | vie->immediate;
		error = memwrite(vcpu, gpa, result, size, arg);
		break;
	default:
		break;
	}
	if (error)
		return (error);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 *
	 * The updated status flags are obtained by subtracting 0 from 'result'.
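	 * Subtracting 0 leaves CF and OF clear and derives ZF, SF and PF
	 * purely from 'result', which matches the architected behaviour of OR.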
	 */
	rflags2 = getcc(size, result, 0);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t regop, memop, op1, op2, rflags, rflags2;
	enum vm_reg_name reg;

	size = vie->opsize;
	switch (vie->op.op_byte) {
	case 0x39:
	case 0x3B:
		/*
		 * 39/r		CMP r/m16, r16
		 * 39/r		CMP r/m32, r32
		 * REX.W 39/r	CMP r/m64, r64
		 *
		 * 3B/r		CMP r16, r/m16
		 * 3B/r		CMP r32, r/m32
		 * REX.W + 3B/r	CMP r64, r/m64
		 *
		 * Compare the first operand with the second operand and
		 * set status flags in EFLAGS register. The comparison is
		 * performed by subtracting the second operand from the first
		 * operand and then setting the status flags.
		 */

		/* Get the register operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &regop);
		if (error)
			return (error);

		/* Get the memory operand */
		error = memread(vcpu, gpa, &memop, size, arg);
		if (error)
			return (error);

		if (vie->op.op_byte == 0x3B) {
			op1 = regop;
			op2 = memop;
		} else {
			op1 = memop;
			op2 = regop;
		}
		rflags2 = getcc(size, op1, op2);
		break;
	case 0x80:
	case 0x81:
	case 0x83:
		/*
		 * 80 /7		cmp r/m8, imm8
		 * REX + 80 /7		cmp r/m8, imm8
		 *
		 * 81 /7		cmp r/m16, imm16
		 * 81 /7		cmp r/m32, imm32
		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
		 *
		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
		 *
		 * Compare mem (ModRM:r/m) with immediate and set
		 * status flags according to the results.  The
		 * comparison is performed by subtracting the
		 * immediate from the first operand and then setting
		 * the status flags.
		 *
		 */
		if (vie->op.op_byte == 0x80)
			size = 1;

		/* get the first operand */
		error = memread(vcpu, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getcc(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & RFLAGS_STATUS_BITS;

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t op1, rflags, rflags2;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0xF7:
		/*
		 * F7 /0		test r/m16, imm16
		 * F7 /0		test r/m32, imm32
		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
		 *
		 * Test mem (ModRM:r/m) with immediate and set status
		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
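		 *
		 * Unlike AND, TEST does not write the result back to the
		 * memory operand; only %rflags is updated.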
		 */
		if ((vie->reg & 7) != 0)
			return (EINVAL);

		error = memread(vcpu, gpa, &op1, size, arg);
		if (error)
			return (error);

		rflags2 = getandflags(size, op1, vie->immediate);
		break;
	default:
		return (EINVAL);
	}
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	/*
	 * OF and CF are cleared; the SF, ZF and PF flags are set according
	 * to the result; AF is undefined.
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	return (error);
}

static int
emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite __unused, void *arg)
{
	uint64_t src1, src2, dst, rflags;
	unsigned start, len, size;
	int error;

	size = vie->opsize;
	error = EINVAL;

	/*
	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
	 *
	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
	 * Vex.vvvv.
	 *
	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
	 */
	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
		size = 4;

	/*
	 * Extracts contiguous bits from the first /source/ operand (second
	 * operand) using an index and length specified in the second /source/
	 * operand (third operand).
	 */
	error = memread(vcpu, gpa, &src1, size, arg);
	if (error)
		return (error);
	error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2);
	if (error)
		return (error);
	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	if (error)
		return (error);

	start = (src2 & 0xff);
	len = (src2 & 0xff00) >> 8;

	/* If no bits are extracted, the destination register is cleared. */
	dst = 0;

	/* If START exceeds the operand size, no bits are extracted. */
	if (start > size * 8)
		goto done;
	/* Length is bounded by both the destination size and start offset. */
	if (start + len > size * 8)
		len = (size * 8) - start;
	if (len == 0)
		goto done;

	if (start > 0)
		src1 = (src1 >> start);
	if (len < 64)
		src1 = src1 & ((1ull << len) - 1);
	dst = src1;

done:
	error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size);
	if (error)
		return (error);

	/*
	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
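	 *
	 * As an illustration, src2 = 0x0804 (start 4, length 8) applied to a
	 * memory operand of 0xabcd extracts bits 11:4 and leaves dst = 0xbc,
	 * so ZF ends up clear.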
	 */
	rflags &= ~RFLAGS_STATUS_BITS;
	if (dst == 0)
		rflags |= PSL_Z;
	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags,
	    8);
	return (error);
}

static int
emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x03:
		/*
		 * ADD r/m to r and store the result in r
		 *
		 * 03/r			ADD r16, r/m16
		 * 03/r			ADD r32, r/m32
		 * REX.W + 03/r		ADD r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 + val2;
		error = vie_update_register(vcpu, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getaddflags(size, val1, val2);
		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
{
	int error, size;
	uint64_t nval, rflags, rflags2, val1, val2;
	enum vm_reg_name reg;

	size = vie->opsize;
	error = EINVAL;

	switch (vie->op.op_byte) {
	case 0x2B:
		/*
		 * SUB r/m from r and store the result in r
		 *
		 * 2B/r		SUB r16, r/m16
		 * 2B/r		SUB r32, r/m32
		 * REX.W + 2B/r	SUB r64, r/m64
		 */

		/* get the first operand */
		reg = gpr_map[vie->reg];
		error = vie_read_register(vcpu, reg, &val1);
		if (error)
			break;

		/* get the second operand */
		error = memread(vcpu, gpa, &val2, size, arg);
		if (error)
			break;

		/* perform the operation and write the result */
		nval = val1 - val2;
		error = vie_update_register(vcpu, reg, nval, size);
		break;
	default:
		break;
	}

	if (!error) {
		rflags2 = getcc(size, val1, val2);
		error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS,
		    &rflags);
		if (error)
			return (error);

		rflags &= ~RFLAGS_STATUS_BITS;
		rflags |= rflags2 & RFLAGS_STATUS_BITS;
		error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS,
		    rflags, 8);
	}

	return (error);
}

static int
emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
#ifdef _KERNEL
	struct vm_copyinfo copyinfo[2];
#else
	struct iovec copyinfo[2];
#endif
	struct seg_desc ss_desc;
	uint64_t cr0, rflags, rsp, stack_gla, val;
	int error, fault, size, stackaddrsize, pushop;

	val = 0;
	size = vie->opsize;
	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;

	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
	 */
	if (paging->cpu_mode == CPU_MODE_REAL) {
		stackaddrsize = 2;
	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
		/*
		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
		 * - Stack pointer size is always 64-bits.
		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
		 * - 16-bit PUSH/POP is supported by using the operand size
		 *   override prefix (66H).
		 */
		stackaddrsize = 8;
		size = vie->opsize_override ? 2 : 8;
	} else {
		/*
		 * In protected or compatibility mode the 'B' flag in the
		 * stack-segment descriptor determines the size of the
		 * stack pointer.
		 */
		error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc);
		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
		    __func__, error));
		if (SEG_DESC_DEF32(ss_desc.access))
			stackaddrsize = 4;
		else
			stackaddrsize = 2;
	}

	error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0);
	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp);
	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
	if (pushop) {
		rsp -= size;
	}

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
	    &stack_gla)) {
		vm_inject_ss(vcpu, 0);
		return (0);
	}

	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
		vm_inject_ss(vcpu, 0);
		return (0);
	}

	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
		vm_inject_ac(vcpu, 0);
		return (0);
	}

	error = vm_copy_setup(vcpu, paging, stack_gla, size,
	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
	    &fault);
	if (error || fault)
		return (error);

	if (pushop) {
		error = memread(vcpu, mmio_gpa, &val, size, arg);
		if (error == 0)
			vm_copyout(&val, copyinfo, size);
	} else {
		vm_copyin(copyinfo, &val, size);
		error = memwrite(vcpu, mmio_gpa, val, size, arg);
		rsp += size;
	}
	vm_copy_teardown(copyinfo, nitems(copyinfo));

	if (error == 0) {
		error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp,
		    stackaddrsize);
		KASSERT(error == 0, ("error %d updating rsp", error));
	}
	return (error);
}

static int
emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * PUSH is part of the group 5 extended opcodes and is identified
	 * by ModRM:reg = b110.
	 */
	if ((vie->reg & 7) != 6)
		return (EINVAL);

	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *arg)
{
	int error;

	/*
	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
	 *
	 * POP is part of the group 1A extended opcodes and is identified
	 * by ModRM:reg = b000.
	 */
	if ((vie->reg & 7) != 0)
		return (EINVAL);

	error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread,
	    memwrite, arg);
	return (error);
}

static int
emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging __unused, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	switch (vie->reg & 7) {
	case 0x1:	/* OR */
		error = emulate_or(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x4:	/* AND */
		error = emulate_and(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case 0x7:	/* CMP */
		error = emulate_cmp(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

static int
emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused,
    void *memarg)
{
	uint64_t val, rflags;
	int error, bitmask, bitoff;

	/*
	 * 0F BA is a Group 8 extended opcode.
	 *
	 * Currently we only emulate the 'Bit Test' instruction which is
	 * identified by a ModR/M:reg encoding of 100b.
	 */
	if ((vie->reg & 7) != 4)
		return (EINVAL);

	error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags);
	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));

	error = memread(vcpu, gpa, &val, vie->opsize, memarg);
	if (error)
		return (error);

	/*
	 * Intel SDM, Vol 2, Table 3-2:
	 * "Range of Bit Positions Specified by Bit Offset Operands"
	 */
	bitmask = vie->opsize * 8 - 1;
	bitoff = vie->immediate & bitmask;

	/* Copy the bit into the Carry flag in %rflags */
	if (val & (1UL << bitoff))
		rflags |= PSL_C;
	else
		rflags &= ~PSL_C;

	error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8);
	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));

	return (0);
}

static int
emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    mem_region_read_t memread, mem_region_write_t memwrite __unused,
    void *memarg)
{
	int error;
	uint64_t buf;

	switch (vie->reg & 7) {
	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
		if (vie->mod == 0x3) {
			/*
			 * SFENCE.  Ignore it, VM exit provides enough
			 * barriers on its own.
			 */
			error = 0;
		} else {
			/*
			 * CLFLUSH, CLFLUSHOPT.  Only check for access
			 * rights.
			 */
			error = memread(vcpu, gpa, &buf, 1, memarg);
		}
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie,
    struct vm_guest_paging *paging, mem_region_read_t memread,
    mem_region_write_t memwrite, void *memarg)
{
	int error;

	if (!vie->decoded)
		return (EINVAL);

	switch (vie->op.op_type) {
	case VIE_OP_TYPE_GROUP1:
		error = emulate_group1(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_POP:
		error = emulate_pop(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_PUSH:
		error = emulate_push(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_CMP:
		error = emulate_cmp(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOV:
		error = emulate_mov(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVSX:
	case VIE_OP_TYPE_MOVZX:
		error = emulate_movx(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_MOVS:
		error = emulate_movs(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_STOS:
		error = emulate_stos(vcpu, gpa, vie, paging, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_AND:
		error = emulate_and(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_OR:
		error = emulate_or(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_SUB:
		error = emulate_sub(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BITTEST:
		error = emulate_bittest(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_TWOB_GRP15:
		error = emulate_twob_group15(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_ADD:
		error = emulate_add(vcpu, gpa, vie, memread,
		    memwrite, memarg);
		break;
	case VIE_OP_TYPE_TEST:
		error = emulate_test(vcpu, gpa, vie,
		    memread, memwrite, memarg);
		break;
	case VIE_OP_TYPE_BEXTR:
		error = emulate_bextr(vcpu, gpa, vie, paging,
		    memread, memwrite, memarg);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("%s: invalid size %d", __func__, size));
	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));

	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
		return (0);

	return ((gla & (size - 1)) ? 1 : 0);
}

int
vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
{
	uint64_t mask;

	if (cpu_mode != CPU_MODE_64BIT)
		return (0);

	/*
	 * The value of the bit 47 in the 'gla' should be replicated in the
	 * most significant 16 bits.
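	 *
	 * For example, 0x00007fffffffffff and 0xffff800000000000 are
	 * canonical, while 0x0000800000000000 is not.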
	 */
	mask = ~((1UL << 48) - 1);
	if (gla & (1UL << 47))
		return ((gla & mask) != mask);
	else
		return ((gla & mask) != 0);
}

uint64_t
vie_size2mask(int size)
{
	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
	    ("vie_size2mask: invalid size %d", size));
	return (size2mask[size]);
}

int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to an exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}

/*
 * Prepare a partially decoded vie for a 2nd attempt.
 */
void
vie_restart(struct vie *vie)
{
	_Static_assert(
	    offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
	    offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
	    "restart should not erase instruction length or contents");

	memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
	    sizeof(*vie) - offsetof(struct vie, vie_startzero));

	vie->base_register = VM_REG_LAST;
	vie->index_register = VM_REG_LAST;
	vie->segment_register = VM_REG_LAST;
}

void
vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
{
	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
	    ("%s: invalid instruction length (%d)", __func__, inst_length));

	vie_restart(vie);
	memset(vie->inst, 0, sizeof(vie->inst));
	if (inst_length != 0)
		memcpy(vie->inst, inst_bytes, inst_length);
	vie->num_valid = inst_length;
}

#ifdef _KERNEL
static int
pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
{
	int error_code = 0;

	if (pte & PG_V)
		error_code |= PGEX_P;
	if (prot & VM_PROT_WRITE)
		error_code |= PGEX_W;
	if (usermode)
		error_code |= PGEX_U;
	if (rsvd)
		error_code |= PGEX_RSV;
	if (prot & VM_PROT_EXECUTE)
		error_code |= PGEX_I;

	return (error_code);
}

static void
ptp_release(void **cookie)
{
	if (*cookie != NULL) {
		vm_gpa_release(*cookie);
		*cookie = NULL;
	}
}

static void *
ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
{
	void *ptr;

	ptp_release(cookie);
	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
	return (ptr);
}

static int
_vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vcpu);
		goto fault;
	}

	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */

	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 0,
					    pte32);
					vm_inject_pf(vcpu, pfcode, gla);
				}
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vcpu, pfcode, gla);
			}
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else if (paging->paging_mode == PAGING_MODE_64_LA57) {
		nlevels = 5;
	} else {
		nlevels = 4;
	}

	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vcpu, pfcode, gla);
			}
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 1,
					    pte);
					vm_inject_pf(vcpu, pfcode, gla);
				}
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}
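
	/*
	 * 'pte' now holds the leaf entry of the long-mode walk above: each
	 * level consumes 9 bits of the linear address (ptpshift steps through
	 * 48/39/30/21/12 with LA57, 39/30/21/12 otherwise), and a PG_PS leaf
	 * maps a 1GB or 2MB page while the final level maps a 4KB page.
	 */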

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
error:
	retval = EFAULT;
	goto done;
fault:
	*guest_fault = 1;
	goto done;
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
	    false));
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{

	return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
	    true));
}

int
vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
{
	struct vm_copyinfo copyinfo[2];
	int error, prot;

	if (inst_length > VIE_INST_SIZE)
		panic("vmm_fetch_instruction: invalid length %d", inst_length);

	prot = PROT_READ | PROT_EXEC;
	error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
	    copyinfo, nitems(copyinfo), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyin(copyinfo, vie->inst, inst_length);
	vm_copy_teardown(copyinfo, nitems(copyinfo));
	vie->num_valid = inst_length;
	return (0);
}
#endif /* _KERNEL */

static int
vie_peek(struct vie *vie, uint8_t *x)
{

	if (vie->num_processed < vie->num_valid) {
		*x = vie->inst[vie->num_processed];
		return (0);
	} else
		return (-1);
}

static void
vie_advance(struct vie *vie)
{

	vie->num_processed++;
}

static bool
segment_override(uint8_t x, int *seg)
{

	switch (x) {
	case 0x2E:
		*seg = VM_REG_GUEST_CS;
		break;
	case 0x36:
		*seg = VM_REG_GUEST_SS;
		break;
	case 0x3E:
		*seg = VM_REG_GUEST_DS;
		break;
	case 0x26:
		*seg = VM_REG_GUEST_ES;
		break;
	case 0x64:
		*seg = VM_REG_GUEST_FS;
		break;
	case 0x65:
		*seg = VM_REG_GUEST_GS;
		break;
	default:
		return (false);
	}
	return (true);
}

static int
decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
	uint8_t x;

	while (1) {
		if (vie_peek(vie, &x))
			return (-1);

		if (x == 0x66)
			vie->opsize_override = 1;
		else if (x == 0x67)
			vie->addrsize_override = 1;
		else if (x == 0xF3)
			vie->repz_present = 1;
		else if (x == 0xF2)
			vie->repnz_present = 1;
		else if (segment_override(x, &vie->segment_register))
			vie->segment_override = 1;
		else
			break;

		vie_advance(vie);
	}
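
	/*
	 * Legacy prefixes (0x66, 0x67, 0xF2, 0xF3 and the segment overrides)
	 * have been consumed above.  A REX prefix, if present, is a single
	 * byte in the 0x40-0x4F range laid out as 0100WRXB; e.g. 0x48 sets
	 * only REX.W and selects a 64-bit operand size.
	 */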

	/*
	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
	 * - Only one REX prefix is allowed per instruction.
	 * - The REX prefix must immediately precede the opcode byte or the
	 *   escape opcode byte.
	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
	 *   the mandatory prefix must come before the REX prefix.
	 */
	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
		vie->rex_present = 1;
		vie->rex_w = x & 0x8 ? 1 : 0;
		vie->rex_r = x & 0x4 ? 1 : 0;
		vie->rex_x = x & 0x2 ? 1 : 0;
		vie->rex_b = x & 0x1 ? 1 : 0;
		vie_advance(vie);
	}

	/*
	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
	 */
	if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
	    && x == 0xC4) {
		const struct vie_op *optab;

		/* 3-byte VEX prefix. */
		vie->vex_present = 1;

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/*
		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
		 * relative to REX encoding.
		 */
		vie->rex_r = x & 0x80 ? 0 : 1;
		vie->rex_x = x & 0x40 ? 0 : 1;
		vie->rex_b = x & 0x20 ? 0 : 1;

		switch (x & 0x1F) {
		case 0x2:
			/* 0F 38. */
			optab = three_byte_opcodes_0f38;
			break;
		case 0x1:
			/* 0F class - nothing handled here yet. */
			/* FALLTHROUGH */
		case 0x3:
			/* 0F 3A class - nothing handled here yet. */
			/* FALLTHROUGH */
		default:
			/* Reserved (#UD). */
			return (-1);
		}

		vie_advance(vie);
		if (vie_peek(vie, &x))
			return (-1);

		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
		vie->rex_w = x & 0x80 ? 1 : 0;

		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
		vie->vex_l = !!(x & 0x4);
		vie->vex_pp = (x & 0x3);

		/* PP: 1=66 2=F3 3=F2 prefixes. */
		switch (vie->vex_pp) {
		case 0x1:
			vie->opsize_override = 1;
			break;
		case 0x2:
			vie->repz_present = 1;
			break;
		case 0x3:
			vie->repnz_present = 1;
			break;
		}

		vie_advance(vie);

		/* Opcode, sans literal prefix. */
		if (vie_peek(vie, &x))
			return (-1);

		vie->op = optab[x];
		if (vie->op.op_type == VIE_OP_TYPE_NONE)
			return (-1);

		vie_advance(vie);
	}

	/*
	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
	 */
	if (cpu_mode == CPU_MODE_64BIT) {
		/*
		 * Default address size is 64-bits and default operand size
		 * is 32-bits.
		 */
		vie->addrsize = vie->addrsize_override ? 4 : 8;
		if (vie->rex_w)
			vie->opsize = 8;
		else if (vie->opsize_override)
			vie->opsize = 2;
		else
			vie->opsize = 4;
	} else if (cs_d) {
		/* Default address and operand sizes are 32-bits */
		vie->addrsize = vie->addrsize_override ? 2 : 4;
		vie->opsize = vie->opsize_override ? 2 : 4;
	} else {
		/* Default address and operand sizes are 16-bits */
		vie->addrsize = vie->addrsize_override ? 4 : 2;
		vie->opsize = vie->opsize_override ? 4 : 2;
	}
	return (0);
}

static int
decode_two_byte_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	vie->op = two_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);
	return (0);
}
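
/*
 * Opcode dispatch: decode_opcode() below looks the byte up in
 * one_byte_opcodes[]; the 0x0F escape (VIE_OP_TYPE_TWO_BYTE) forwards to the
 * two-byte table.  For example, the sequence 0x0F 0xB6 decodes as MOVZX.
 */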

static int
decode_opcode(struct vie *vie)
{
	uint8_t x;

	if (vie_peek(vie, &x))
		return (-1);

	/* Already did this via VEX prefix. */
	if (vie->op.op_type != VIE_OP_TYPE_NONE)
		return (0);

	vie->op = one_byte_opcodes[x];

	if (vie->op.op_type == VIE_OP_TYPE_NONE)
		return (-1);

	vie_advance(vie);

	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
		return (decode_two_byte_opcode(vie));

	return (0);
}

static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm = (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

done:
	vie_advance(vie);

	return (0);
}
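
/*
 * The ModRM byte is laid out as mod[7:6] reg[5:3] r/m[2:0].  For example,
 * 0x45 (mod=1, reg=0, rm=5) selects a disp8 access relative to %rbp/%ebp
 * (absent REX.B), so decode_modrm() above sets disp_bytes to 1.
 */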

static int
decode_sib(struct vie *vie)
{
	uint8_t x;

	/* Proceed only if SIB byte is present */
	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
		return (0);

	if (vie_peek(vie, &x))
		return (-1);

	/* De-construct the SIB byte */
	vie->ss = (x >> 6) & 0x3;
	vie->index = (x >> 3) & 0x7;
	vie->base = (x >> 0) & 0x7;

	/* Apply the REX prefix modifiers */
	vie->index |= vie->rex_x << 3;
	vie->base |= vie->rex_b << 3;

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	}

	if (vie->mod == VIE_MOD_INDIRECT &&
	    (vie->base == 5 || vie->base == 13)) {
		/*
		 * Special case when base register is unused if mod = 0
		 * and base = %rbp or %r13.
		 *
		 * Documented in:
		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
		 * Table 2-5: Special Cases of REX Encodings
		 */
		vie->disp_bytes = 4;
	} else {
		vie->base_register = gpr_map[vie->base];
	}

	/*
	 * All encodings of 'index' are valid except for %rsp (4).
	 *
	 * Documented in:
	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
	 * Table 2-5: Special Cases of REX Encodings
	 */
	if (vie->index != 4)
		vie->index_register = gpr_map[vie->index];

	/* 'scale' makes sense only in the context of an index register */
	if (vie->index_register < VM_REG_LAST)
		vie->scale = 1 << vie->ss;

	vie_advance(vie);

	return (0);
}

static int
decode_displacement(struct vie *vie)
{
	int n, i;
	uint8_t x;

	union {
		char buf[4];
		int8_t signed8;
		int32_t signed32;
	} u;

	if ((n = vie->disp_bytes) == 0)
		return (0);

	if (n != 1 && n != 4)
		panic("decode_displacement: invalid disp_bytes %d", n);

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	if (n == 1)
		vie->displacement = u.signed8;		/* sign-extended */
	else
		vie->displacement = u.signed32;		/* sign-extended */

	return (0);
}

static int
decode_immediate(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char buf[4];
		int8_t signed8;
		int16_t signed16;
		int32_t signed32;
	} u;

	/* Figure out immediate operand size (if any) */
	if (vie->op.op_flags & VIE_OP_F_IMM) {
		/*
		 * Section 2.2.1.5 "Immediates", Intel SDM:
		 * In 64-bit mode the typical size of immediate operands
		 * remains 32-bits. When the operand size is 64-bits, the
		 * processor sign-extends all immediates to 64-bits prior
		 * to their use.
		 */
		if (vie->opsize == 4 || vie->opsize == 8)
			vie->imm_bytes = 4;
		else
			vie->imm_bytes = 2;
	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
		vie->imm_bytes = 1;
	}

	if ((n = vie->imm_bytes) == 0)
		return (0);

	KASSERT(n == 1 || n == 2 || n == 4,
	    ("%s: invalid number of immediate bytes: %d", __func__, n));

	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}

	/* sign-extend the immediate value before use */
	if (n == 1)
		vie->immediate = u.signed8;
	else if (n == 2)
		vie->immediate = u.signed16;
	else
		vie->immediate = u.signed32;

	return (0);
}
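
/*
 * Displacements and immediates are stored sign-extended; e.g. an imm8 of
 * 0xFF decoded by decode_immediate() above becomes -1, so later arithmetic
 * sees the same value regardless of operand size.
 */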

static int
decode_moffset(struct vie *vie)
{
	int i, n;
	uint8_t x;
	union {
		char buf[8];
		uint64_t u64;
	} u;

	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
		return (0);

	/*
	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
	 * The memory offset size follows the address-size of the instruction.
	 */
	n = vie->addrsize;
	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));

	u.u64 = 0;
	for (i = 0; i < n; i++) {
		if (vie_peek(vie, &x))
			return (-1);

		u.buf[i] = x;
		vie_advance(vie);
	}
	vie->displacement = u.u64;
	return (0);
}

#ifdef _KERNEL
/*
 * Verify that the 'guest linear address' provided as collateral of the nested
 * page table fault matches our instruction decoding.
 */
static int
verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
    enum vm_cpu_mode cpu_mode)
{
	int error;
	uint64_t base, segbase, idx, gla2;
	enum vm_reg_name seg;
	struct seg_desc desc;

	/* Skip 'gla' verification */
	if (gla == VIE_INVALID_GLA)
		return (0);

	base = 0;
	if (vie->base_register != VM_REG_LAST) {
		error = vm_get_register(vcpu, vie->base_register, &base);
		if (error) {
			printf("verify_gla: error %d getting base reg %d\n",
			    error, vie->base_register);
			return (-1);
		}

		/*
		 * RIP-relative addressing starts from the following
		 * instruction
		 */
		if (vie->base_register == VM_REG_GUEST_RIP)
			base += vie->num_processed;
	}

	idx = 0;
	if (vie->index_register != VM_REG_LAST) {
		error = vm_get_register(vcpu, vie->index_register, &idx);
		if (error) {
			printf("verify_gla: error %d getting index reg %d\n",
			    error, vie->index_register);
			return (-1);
		}
	}
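
	/*
	 * The effective address is recomputed below as
	 * segbase + base + scale * index + displacement, truncated to the
	 * address size, and must equal the hardware-provided GLA; e.g. with
	 * a zero segment base, base 0x1000, scale 4, index 2 and
	 * displacement 8 give 0x1010.
	 */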

	/*
	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
	 *
	 * In 64-bit mode, segmentation is generally (but not
	 * completely) disabled.  The exceptions are the FS and GS
	 * segments.
	 *
	 * In legacy IA-32 mode, when the ESP or EBP register is used
	 * as the base, the SS segment is the default segment.  For
	 * other data references, except when relative to stack or
	 * string destination the DS segment is the default.  These
	 * can be overridden to allow other segments to be accessed.
	 */
	if (vie->segment_override)
		seg = vie->segment_register;
	else if (vie->base_register == VM_REG_GUEST_RSP ||
	    vie->base_register == VM_REG_GUEST_RBP)
		seg = VM_REG_GUEST_SS;
	else
		seg = VM_REG_GUEST_DS;
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		error = vm_get_seg_desc(vcpu, seg, &desc);
		if (error) {
			printf("verify_gla: error %d getting segment"
			    " descriptor %d", error,
			    vie->segment_register);
			return (-1);
		}
		segbase = desc.base;
	}

	gla2 = segbase + base + vie->scale * idx + vie->displacement;
	gla2 &= size2mask[vie->addrsize];
	if (gla != gla2) {
		printf("verify_gla mismatch: segbase(0x%0lx), "
		    "base(0x%0lx), scale(%d), index(0x%0lx), "
		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
		    segbase, base, vie->scale, idx, vie->displacement,
		    gla, gla2);
		return (-1);
	}

	return (0);
}
#endif /* _KERNEL */

int
#ifdef _KERNEL
vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
#else
vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
#endif
{

	if (decode_prefixes(vie, cpu_mode, cs_d))
		return (-1);

	if (decode_opcode(vie))
		return (-1);

	if (decode_modrm(vie, cpu_mode))
		return (-1);

	if (decode_sib(vie))
		return (-1);

	if (decode_displacement(vie))
		return (-1);

	if (decode_immediate(vie))
		return (-1);

	if (decode_moffset(vie))
		return (-1);

#ifdef _KERNEL
	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
		if (verify_gla(vcpu, gla, vie, cpu_mode))
			return (-1);
	}
#endif

	vie->decoded = 1;	/* success */

	return (0);
}
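
/*
 * Illustrative decode sketch (userland build, not part of the original
 * sources): raw instruction bytes are loaded with vie_init() and then run
 * through vmm_decode_instruction(), roughly as follows.
 *
 *	struct vie vie;
 *
 *	vie_init(&vie, "\x89\x06", 2);			// mov %eax,(%rsi)
 *	if (vmm_decode_instruction(CPU_MODE_64BIT, 0, &vie) == 0) {
 *		// vie.op.op_type == VIE_OP_TYPE_MOV and
 *		// vie.base_register == VM_REG_GUEST_RSI
 *	}
 */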