1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/objtool.h> 4 #include <linux/percpu.h> 5 6 #include <asm/debugreg.h> 7 #include <asm/mmu_context.h> 8 9 #include "cpuid.h" 10 #include "hyperv.h" 11 #include "mmu.h" 12 #include "nested.h" 13 #include "pmu.h" 14 #include "sgx.h" 15 #include "trace.h" 16 #include "vmx.h" 17 #include "x86.h" 18 19 static bool __read_mostly enable_shadow_vmcs = 1; 20 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 21 22 static bool __read_mostly nested_early_check = 0; 23 module_param(nested_early_check, bool, S_IRUGO); 24 25 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 26 27 /* 28 * Hyper-V requires all of these, so mark them as supported even though 29 * they are just treated the same as all-context. 30 */ 31 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 32 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 33 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 34 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 35 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 36 37 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 38 39 enum { 40 VMX_VMREAD_BITMAP, 41 VMX_VMWRITE_BITMAP, 42 VMX_BITMAP_NR 43 }; 44 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 45 46 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 47 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 48 49 struct shadow_vmcs_field { 50 u16 encoding; 51 u16 offset; 52 }; 53 static struct shadow_vmcs_field shadow_read_only_fields[] = { 54 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 55 #include "vmcs_shadow_fields.h" 56 }; 57 static int max_shadow_read_only_fields = 58 ARRAY_SIZE(shadow_read_only_fields); 59 60 static struct shadow_vmcs_field shadow_read_write_fields[] = { 61 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 62 #include "vmcs_shadow_fields.h" 63 }; 64 static int max_shadow_read_write_fields = 65 ARRAY_SIZE(shadow_read_write_fields); 66 67 static void init_vmcs_shadow_fields(void) 68 { 69 int i, j; 70 71 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 72 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 73 74 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 75 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 76 u16 field = entry.encoding; 77 78 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 79 (i + 1 == max_shadow_read_only_fields || 80 shadow_read_only_fields[i + 1].encoding != field + 1)) 81 pr_err("Missing field from shadow_read_only_field %x\n", 82 field + 1); 83 84 clear_bit(field, vmx_vmread_bitmap); 85 if (field & 1) 86 #ifdef CONFIG_X86_64 87 continue; 88 #else 89 entry.offset += sizeof(u32); 90 #endif 91 shadow_read_only_fields[j++] = entry; 92 } 93 max_shadow_read_only_fields = j; 94 95 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 96 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 97 u16 field = entry.encoding; 98 99 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 100 (i + 1 == max_shadow_read_write_fields || 101 shadow_read_write_fields[i + 1].encoding != field + 1)) 102 pr_err("Missing field from shadow_read_write_field %x\n", 103 field + 1); 104 105 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 106 field <= GUEST_TR_AR_BYTES, 107 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 108 109 /* 110 * PML and the preemption timer can be emulated, but the 111 * processor cannot vmwrite to fields that don't exist 112 * on bare metal. 
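		 *
		 * Such fields are skipped here, i.e. their bits stay set in the
		 * VMREAD/VMWRITE bitmaps and they are dropped from the shadow
		 * field arrays, so an L1 VMREAD/VMWRITE of them takes a VM-exit
		 * and is emulated against the cached vmcs12 instead of being
		 * serviced by the shadow VMCS.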
113 */ 114 switch (field) { 115 case GUEST_PML_INDEX: 116 if (!cpu_has_vmx_pml()) 117 continue; 118 break; 119 case VMX_PREEMPTION_TIMER_VALUE: 120 if (!cpu_has_vmx_preemption_timer()) 121 continue; 122 break; 123 case GUEST_INTR_STATUS: 124 if (!cpu_has_vmx_apicv()) 125 continue; 126 break; 127 default: 128 break; 129 } 130 131 clear_bit(field, vmx_vmwrite_bitmap); 132 clear_bit(field, vmx_vmread_bitmap); 133 if (field & 1) 134 #ifdef CONFIG_X86_64 135 continue; 136 #else 137 entry.offset += sizeof(u32); 138 #endif 139 shadow_read_write_fields[j++] = entry; 140 } 141 max_shadow_read_write_fields = j; 142 } 143 144 /* 145 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 146 * set the success or error code of an emulated VMX instruction (as specified 147 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 148 * instruction. 149 */ 150 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 151 { 152 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 153 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 154 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 155 return kvm_skip_emulated_instruction(vcpu); 156 } 157 158 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 159 { 160 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 161 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 162 X86_EFLAGS_SF | X86_EFLAGS_OF)) 163 | X86_EFLAGS_CF); 164 return kvm_skip_emulated_instruction(vcpu); 165 } 166 167 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 168 u32 vm_instruction_error) 169 { 170 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 171 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 172 X86_EFLAGS_SF | X86_EFLAGS_OF)) 173 | X86_EFLAGS_ZF); 174 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 175 /* 176 * We don't need to force a shadow sync because 177 * VM_INSTRUCTION_ERROR is not shadowed 178 */ 179 return kvm_skip_emulated_instruction(vcpu); 180 } 181 182 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 183 { 184 struct vcpu_vmx *vmx = to_vmx(vcpu); 185 186 /* 187 * failValid writes the error number to the current VMCS, which 188 * can't be done if there isn't a current VMCS. 189 */ 190 if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) 191 return nested_vmx_failInvalid(vcpu); 192 193 return nested_vmx_failValid(vcpu, vm_instruction_error); 194 } 195 196 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 197 { 198 /* TODO: not to reset guest simply here. 
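	 *
	 * A VMX abort on real hardware writes the abort indicator into the
	 * current VMCS and puts the logical processor into a shutdown state;
	 * requesting a triple fault below is KVM's blunt approximation of
	 * that behavior.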
*/ 199 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 200 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); 201 } 202 203 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 204 { 205 return fixed_bits_valid(control, low, high); 206 } 207 208 static inline u64 vmx_control_msr(u32 low, u32 high) 209 { 210 return low | ((u64)high << 32); 211 } 212 213 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 214 { 215 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 216 vmcs_write64(VMCS_LINK_POINTER, -1ull); 217 vmx->nested.need_vmcs12_to_shadow_sync = false; 218 } 219 220 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 221 { 222 struct vcpu_vmx *vmx = to_vmx(vcpu); 223 224 if (!vmx->nested.hv_evmcs) 225 return; 226 227 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 228 vmx->nested.hv_evmcs_vmptr = 0; 229 vmx->nested.hv_evmcs = NULL; 230 } 231 232 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 233 struct loaded_vmcs *prev) 234 { 235 struct vmcs_host_state *dest, *src; 236 237 if (unlikely(!vmx->guest_state_loaded)) 238 return; 239 240 src = &prev->host_state; 241 dest = &vmx->loaded_vmcs->host_state; 242 243 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 244 dest->ldt_sel = src->ldt_sel; 245 #ifdef CONFIG_X86_64 246 dest->ds_sel = src->ds_sel; 247 dest->es_sel = src->es_sel; 248 #endif 249 } 250 251 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 252 { 253 struct vcpu_vmx *vmx = to_vmx(vcpu); 254 struct loaded_vmcs *prev; 255 int cpu; 256 257 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 258 return; 259 260 cpu = get_cpu(); 261 prev = vmx->loaded_vmcs; 262 vmx->loaded_vmcs = vmcs; 263 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 264 vmx_sync_vmcs_host_state(vmx, prev); 265 put_cpu(); 266 267 vmx_register_cache_reset(vcpu); 268 } 269 270 /* 271 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 272 * just stops using VMX. 273 */ 274 static void free_nested(struct kvm_vcpu *vcpu) 275 { 276 struct vcpu_vmx *vmx = to_vmx(vcpu); 277 278 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 279 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 280 281 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 282 return; 283 284 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 285 286 vmx->nested.vmxon = false; 287 vmx->nested.smm.vmxon = false; 288 free_vpid(vmx->nested.vpid02); 289 vmx->nested.posted_intr_nv = -1; 290 vmx->nested.current_vmptr = -1ull; 291 if (enable_shadow_vmcs) { 292 vmx_disable_shadow_vmcs(vmx); 293 vmcs_clear(vmx->vmcs01.shadow_vmcs); 294 free_vmcs(vmx->vmcs01.shadow_vmcs); 295 vmx->vmcs01.shadow_vmcs = NULL; 296 } 297 kfree(vmx->nested.cached_vmcs12); 298 vmx->nested.cached_vmcs12 = NULL; 299 kfree(vmx->nested.cached_shadow_vmcs12); 300 vmx->nested.cached_shadow_vmcs12 = NULL; 301 /* Unpin physical memory we referred to in the vmcs02 */ 302 if (vmx->nested.apic_access_page) { 303 kvm_release_page_clean(vmx->nested.apic_access_page); 304 vmx->nested.apic_access_page = NULL; 305 } 306 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 307 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 308 vmx->nested.pi_desc = NULL; 309 310 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 311 312 nested_release_evmcs(vcpu); 313 314 free_loaded_vmcs(&vmx->nested.vmcs02); 315 } 316 317 /* 318 * Ensure that the current vmcs of the logical processor is the 319 * vmcs01 of the vcpu before calling free_nested(). 
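 *
 * nested_vmx_free_vcpu() does exactly that: vcpu_load() makes this vCPU
 * current on this CPU, and vmx_leave_nested() switches back to vmcs01
 * (synthesizing a VM-exit to L1 if the vCPU is still in guest mode)
 * before the nested state is torn down.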
320 */ 321 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 322 { 323 vcpu_load(vcpu); 324 vmx_leave_nested(vcpu); 325 vcpu_put(vcpu); 326 } 327 328 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 329 struct x86_exception *fault) 330 { 331 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 332 struct vcpu_vmx *vmx = to_vmx(vcpu); 333 u32 vm_exit_reason; 334 unsigned long exit_qualification = vcpu->arch.exit_qualification; 335 336 if (vmx->nested.pml_full) { 337 vm_exit_reason = EXIT_REASON_PML_FULL; 338 vmx->nested.pml_full = false; 339 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 340 } else if (fault->error_code & PFERR_RSVD_MASK) 341 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 342 else 343 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 344 345 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 346 vmcs12->guest_physical_address = fault->address; 347 } 348 349 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 350 { 351 WARN_ON(mmu_is_nested(vcpu)); 352 353 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 354 kvm_init_shadow_ept_mmu(vcpu, 355 to_vmx(vcpu)->nested.msrs.ept_caps & 356 VMX_EPT_EXECUTE_ONLY_BIT, 357 nested_ept_ad_enabled(vcpu), 358 nested_ept_get_eptp(vcpu)); 359 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 360 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 361 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 362 363 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 364 } 365 366 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 367 { 368 vcpu->arch.mmu = &vcpu->arch.root_mmu; 369 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 370 } 371 372 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 373 u16 error_code) 374 { 375 bool inequality, bit; 376 377 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 378 inequality = 379 (error_code & vmcs12->page_fault_error_code_mask) != 380 vmcs12->page_fault_error_code_match; 381 return inequality ^ bit; 382 } 383 384 385 /* 386 * KVM wants to inject page-faults which it got to the guest. This function 387 * checks whether in a nested guest, we need to inject them to L1 or L2. 388 */ 389 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) 390 { 391 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 392 unsigned int nr = vcpu->arch.exception.nr; 393 bool has_payload = vcpu->arch.exception.has_payload; 394 unsigned long payload = vcpu->arch.exception.payload; 395 396 if (nr == PF_VECTOR) { 397 if (vcpu->arch.exception.nested_apf) { 398 *exit_qual = vcpu->arch.apf.nested_apf_token; 399 return 1; 400 } 401 if (nested_vmx_is_page_fault_vmexit(vmcs12, 402 vcpu->arch.exception.error_code)) { 403 *exit_qual = has_payload ? 
payload : vcpu->arch.cr2; 404 return 1; 405 } 406 } else if (vmcs12->exception_bitmap & (1u << nr)) { 407 if (nr == DB_VECTOR) { 408 if (!has_payload) { 409 payload = vcpu->arch.dr6; 410 payload &= ~DR6_BT; 411 payload ^= DR6_ACTIVE_LOW; 412 } 413 *exit_qual = payload; 414 } else 415 *exit_qual = 0; 416 return 1; 417 } 418 419 return 0; 420 } 421 422 423 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 424 struct x86_exception *fault) 425 { 426 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 427 428 WARN_ON(!is_guest_mode(vcpu)); 429 430 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && 431 !to_vmx(vcpu)->nested.nested_run_pending) { 432 vmcs12->vm_exit_intr_error_code = fault->error_code; 433 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 434 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | 435 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, 436 fault->address); 437 } else { 438 kvm_inject_page_fault(vcpu, fault); 439 } 440 } 441 442 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 443 struct vmcs12 *vmcs12) 444 { 445 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 446 return 0; 447 448 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 449 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 450 return -EINVAL; 451 452 return 0; 453 } 454 455 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 456 struct vmcs12 *vmcs12) 457 { 458 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 459 return 0; 460 461 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 462 return -EINVAL; 463 464 return 0; 465 } 466 467 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 468 struct vmcs12 *vmcs12) 469 { 470 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 471 return 0; 472 473 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 474 return -EINVAL; 475 476 return 0; 477 } 478 479 /* 480 * Check if MSR is intercepted for L01 MSR bitmap. 481 */ 482 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) 483 { 484 unsigned long *msr_bitmap; 485 int f = sizeof(unsigned long); 486 487 if (!cpu_has_vmx_msr_bitmap()) 488 return true; 489 490 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; 491 492 if (msr <= 0x1fff) { 493 return !!test_bit(msr, msr_bitmap + 0x800 / f); 494 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 495 msr &= 0x1fff; 496 return !!test_bit(msr, msr_bitmap + 0xc00 / f); 497 } 498 499 return true; 500 } 501 502 /* 503 * If a msr is allowed by L0, we should check whether it is allowed by L1. 504 * The corresponding bit will be cleared unless both of L0 and L1 allow it. 505 */ 506 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, 507 unsigned long *msr_bitmap_nested, 508 u32 msr, int type) 509 { 510 int f = sizeof(unsigned long); 511 512 /* 513 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 514 * have the write-low and read-high bitmap offsets the wrong way round. 515 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
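	 *
	 * The 4-KByte bitmap is split into four 1-KByte quarters:
	 *   0x000 read-low,  0x400 read-high,
	 *   0x800 write-low, 0xc00 write-high.
	 * For example, MSR_EFER (0xc0000080) masks down to 0x80 and is thus
	 * governed by bit 0x80 of the read-high and write-high quarters.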
516 */ 517 if (msr <= 0x1fff) { 518 if (type & MSR_TYPE_R && 519 !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) 520 /* read-low */ 521 __clear_bit(msr, msr_bitmap_nested + 0x000 / f); 522 523 if (type & MSR_TYPE_W && 524 !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) 525 /* write-low */ 526 __clear_bit(msr, msr_bitmap_nested + 0x800 / f); 527 528 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 529 msr &= 0x1fff; 530 if (type & MSR_TYPE_R && 531 !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) 532 /* read-high */ 533 __clear_bit(msr, msr_bitmap_nested + 0x400 / f); 534 535 if (type & MSR_TYPE_W && 536 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) 537 /* write-high */ 538 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); 539 540 } 541 } 542 543 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 544 { 545 int msr; 546 547 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 548 unsigned word = msr / BITS_PER_LONG; 549 550 msr_bitmap[word] = ~0; 551 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 552 } 553 } 554 555 /* 556 * Merge L0's and L1's MSR bitmap, return false to indicate that 557 * we do not use the hardware. 558 */ 559 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 560 struct vmcs12 *vmcs12) 561 { 562 int msr; 563 unsigned long *msr_bitmap_l1; 564 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; 565 struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; 566 567 /* Nothing to do if the MSR bitmap is not in use. */ 568 if (!cpu_has_vmx_msr_bitmap() || 569 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 570 return false; 571 572 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 573 return false; 574 575 msr_bitmap_l1 = (unsigned long *)map->hva; 576 577 /* 578 * To keep the control flow simple, pay eight 8-byte writes (sixteen 579 * 4-byte writes on 32-bit systems) up front to enable intercepts for 580 * the x2APIC MSR range and selectively disable them below. 581 */ 582 enable_x2apic_msr_intercepts(msr_bitmap_l0); 583 584 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 585 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 586 /* 587 * L0 need not intercept reads for MSRs between 0x800 588 * and 0x8ff, it just lets the processor take the value 589 * from the virtual-APIC page; take those 256 bits 590 * directly from the L1 bitmap. 591 */ 592 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 593 unsigned word = msr / BITS_PER_LONG; 594 595 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 596 } 597 } 598 599 nested_vmx_disable_intercept_for_msr( 600 msr_bitmap_l1, msr_bitmap_l0, 601 X2APIC_MSR(APIC_TASKPRI), 602 MSR_TYPE_R | MSR_TYPE_W); 603 604 if (nested_cpu_has_vid(vmcs12)) { 605 nested_vmx_disable_intercept_for_msr( 606 msr_bitmap_l1, msr_bitmap_l0, 607 X2APIC_MSR(APIC_EOI), 608 MSR_TYPE_W); 609 nested_vmx_disable_intercept_for_msr( 610 msr_bitmap_l1, msr_bitmap_l0, 611 X2APIC_MSR(APIC_SELF_IPI), 612 MSR_TYPE_W); 613 } 614 } 615 616 /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ 617 #ifdef CONFIG_X86_64 618 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, 619 MSR_FS_BASE, MSR_TYPE_RW); 620 621 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, 622 MSR_GS_BASE, MSR_TYPE_RW); 623 624 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, 625 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 626 #endif 627 628 /* 629 * Checking the L0->L1 bitmap is trying to verify two things: 630 * 631 * 1. L0 gave a permission to L1 to actually passthrough the MSR. 
This 632 * ensures that we do not accidentally generate an L02 MSR bitmap 633 * from the L12 MSR bitmap that is too permissive. 634 * 2. That L1 or L2s have actually used the MSR. This avoids 635 * unnecessarily merging of the bitmap if the MSR is unused. This 636 * works properly because we only update the L01 MSR bitmap lazily. 637 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only 638 * updated to reflect this when L1 (or its L2s) actually write to 639 * the MSR. 640 */ 641 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) 642 nested_vmx_disable_intercept_for_msr( 643 msr_bitmap_l1, msr_bitmap_l0, 644 MSR_IA32_SPEC_CTRL, 645 MSR_TYPE_R | MSR_TYPE_W); 646 647 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) 648 nested_vmx_disable_intercept_for_msr( 649 msr_bitmap_l1, msr_bitmap_l0, 650 MSR_IA32_PRED_CMD, 651 MSR_TYPE_W); 652 653 kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); 654 655 return true; 656 } 657 658 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 659 struct vmcs12 *vmcs12) 660 { 661 struct kvm_host_map map; 662 struct vmcs12 *shadow; 663 664 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 665 vmcs12->vmcs_link_pointer == -1ull) 666 return; 667 668 shadow = get_shadow_vmcs12(vcpu); 669 670 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) 671 return; 672 673 memcpy(shadow, map.hva, VMCS12_SIZE); 674 kvm_vcpu_unmap(vcpu, &map, false); 675 } 676 677 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 678 struct vmcs12 *vmcs12) 679 { 680 struct vcpu_vmx *vmx = to_vmx(vcpu); 681 682 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 683 vmcs12->vmcs_link_pointer == -1ull) 684 return; 685 686 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, 687 get_shadow_vmcs12(vcpu), VMCS12_SIZE); 688 } 689 690 /* 691 * In nested virtualization, check if L1 has set 692 * VM_EXIT_ACK_INTR_ON_EXIT 693 */ 694 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 695 { 696 return get_vmcs12(vcpu)->vm_exit_controls & 697 VM_EXIT_ACK_INTR_ON_EXIT; 698 } 699 700 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 701 struct vmcs12 *vmcs12) 702 { 703 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 704 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 705 return -EINVAL; 706 else 707 return 0; 708 } 709 710 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 711 struct vmcs12 *vmcs12) 712 { 713 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 714 !nested_cpu_has_apic_reg_virt(vmcs12) && 715 !nested_cpu_has_vid(vmcs12) && 716 !nested_cpu_has_posted_intr(vmcs12)) 717 return 0; 718 719 /* 720 * If virtualize x2apic mode is enabled, 721 * virtualize apic access must be disabled. 722 */ 723 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 724 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 725 return -EINVAL; 726 727 /* 728 * If virtual interrupt delivery is enabled, 729 * we must exit on external interrupts. 730 */ 731 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 732 return -EINVAL; 733 734 /* 735 * bits 15:8 should be zero in posted_intr_nv, 736 * the descriptor address has been already checked 737 * in nested_get_vmcs12_pages. 738 * 739 * bits 5:0 of posted_intr_desc_addr should be zero. 
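	 *
	 * The 64-byte alignment check below is what enforces the latter:
	 * a posted-interrupt descriptor is 64 bytes, so bits 5:0 of its
	 * address must be clear.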
740 */ 741 if (nested_cpu_has_posted_intr(vmcs12) && 742 (CC(!nested_cpu_has_vid(vmcs12)) || 743 CC(!nested_exit_intr_ack_set(vcpu)) || 744 CC((vmcs12->posted_intr_nv & 0xff00)) || 745 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 746 return -EINVAL; 747 748 /* tpr shadow is needed by all apicv features. */ 749 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 750 return -EINVAL; 751 752 return 0; 753 } 754 755 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 756 u32 count, u64 addr) 757 { 758 if (count == 0) 759 return 0; 760 761 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 762 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 763 return -EINVAL; 764 765 return 0; 766 } 767 768 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 769 struct vmcs12 *vmcs12) 770 { 771 if (CC(nested_vmx_check_msr_switch(vcpu, 772 vmcs12->vm_exit_msr_load_count, 773 vmcs12->vm_exit_msr_load_addr)) || 774 CC(nested_vmx_check_msr_switch(vcpu, 775 vmcs12->vm_exit_msr_store_count, 776 vmcs12->vm_exit_msr_store_addr))) 777 return -EINVAL; 778 779 return 0; 780 } 781 782 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 783 struct vmcs12 *vmcs12) 784 { 785 if (CC(nested_vmx_check_msr_switch(vcpu, 786 vmcs12->vm_entry_msr_load_count, 787 vmcs12->vm_entry_msr_load_addr))) 788 return -EINVAL; 789 790 return 0; 791 } 792 793 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 794 struct vmcs12 *vmcs12) 795 { 796 if (!nested_cpu_has_pml(vmcs12)) 797 return 0; 798 799 if (CC(!nested_cpu_has_ept(vmcs12)) || 800 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 801 return -EINVAL; 802 803 return 0; 804 } 805 806 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 807 struct vmcs12 *vmcs12) 808 { 809 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 810 !nested_cpu_has_ept(vmcs12))) 811 return -EINVAL; 812 return 0; 813 } 814 815 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 816 struct vmcs12 *vmcs12) 817 { 818 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 819 !nested_cpu_has_ept(vmcs12))) 820 return -EINVAL; 821 return 0; 822 } 823 824 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 825 struct vmcs12 *vmcs12) 826 { 827 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 828 return 0; 829 830 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 831 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 832 return -EINVAL; 833 834 return 0; 835 } 836 837 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 838 struct vmx_msr_entry *e) 839 { 840 /* x2APIC MSR accesses are not allowed */ 841 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 842 return -EINVAL; 843 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 844 CC(e->index == MSR_IA32_UCODE_REV)) 845 return -EINVAL; 846 if (CC(e->reserved != 0)) 847 return -EINVAL; 848 return 0; 849 } 850 851 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 852 struct vmx_msr_entry *e) 853 { 854 if (CC(e->index == MSR_FS_BASE) || 855 CC(e->index == MSR_GS_BASE) || 856 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 857 nested_vmx_msr_check_common(vcpu, e)) 858 return -EINVAL; 859 return 0; 860 } 861 862 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 863 struct vmx_msr_entry *e) 864 { 865 if (CC(e->index == MSR_IA32_SMBASE) 
	    || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load the guest's/host's MSRs at nested entry/exit.
 * Returns 0 on success, or the (1-based) index of the failing entry on
 * failure.
 *
 * One of the failure modes for MSR load/store is a list that exceeds the
 * virtual hardware's capacity.  To stay as close to hardware behavior as
 * possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
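	 *
	 * That value comes from the vmcs02 VM-exit MSR-store area (see
	 * prepare_vmx_msr_autostore_list()): it is captured by the CPU at
	 * the instant of the hardware VM-exit and converted to L1's
	 * timescale via kvm_read_l1_tsc(), so it excludes L0's emulation
	 * overhead.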
935 */ 936 if (msr_index == MSR_IA32_TSC) { 937 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 938 MSR_IA32_TSC); 939 940 if (i >= 0) { 941 u64 val = vmx->msr_autostore.guest.val[i].value; 942 943 *data = kvm_read_l1_tsc(vcpu, val); 944 return true; 945 } 946 } 947 948 if (kvm_get_msr(vcpu, msr_index, data)) { 949 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 950 msr_index); 951 return false; 952 } 953 return true; 954 } 955 956 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 957 struct vmx_msr_entry *e) 958 { 959 if (kvm_vcpu_read_guest(vcpu, 960 gpa + i * sizeof(*e), 961 e, 2 * sizeof(u32))) { 962 pr_debug_ratelimited( 963 "%s cannot read MSR entry (%u, 0x%08llx)\n", 964 __func__, i, gpa + i * sizeof(*e)); 965 return false; 966 } 967 if (nested_vmx_store_msr_check(vcpu, e)) { 968 pr_debug_ratelimited( 969 "%s check failed (%u, 0x%x, 0x%x)\n", 970 __func__, i, e->index, e->reserved); 971 return false; 972 } 973 return true; 974 } 975 976 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 977 { 978 u64 data; 979 u32 i; 980 struct vmx_msr_entry e; 981 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 982 983 for (i = 0; i < count; i++) { 984 if (unlikely(i >= max_msr_list_size)) 985 return -EINVAL; 986 987 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 988 return -EINVAL; 989 990 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 991 return -EINVAL; 992 993 if (kvm_vcpu_write_guest(vcpu, 994 gpa + i * sizeof(e) + 995 offsetof(struct vmx_msr_entry, value), 996 &data, sizeof(data))) { 997 pr_debug_ratelimited( 998 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 999 __func__, i, e.index, data); 1000 return -EINVAL; 1001 } 1002 } 1003 return 0; 1004 } 1005 1006 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1007 { 1008 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1009 u32 count = vmcs12->vm_exit_msr_store_count; 1010 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1011 struct vmx_msr_entry e; 1012 u32 i; 1013 1014 for (i = 0; i < count; i++) { 1015 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1016 return false; 1017 1018 if (e.index == msr_index) 1019 return true; 1020 } 1021 return false; 1022 } 1023 1024 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1025 u32 msr_index) 1026 { 1027 struct vcpu_vmx *vmx = to_vmx(vcpu); 1028 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1029 bool in_vmcs12_store_list; 1030 int msr_autostore_slot; 1031 bool in_autostore_list; 1032 int last; 1033 1034 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1035 in_autostore_list = msr_autostore_slot >= 0; 1036 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1037 1038 if (in_vmcs12_store_list && !in_autostore_list) { 1039 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1040 /* 1041 * Emulated VMEntry does not fail here. Instead a less 1042 * accurate value will be returned by 1043 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1044 * instead of reading the value from the vmcs02 VMExit 1045 * MSR-store area. 1046 */ 1047 pr_warn_ratelimited( 1048 "Not enough msr entries in msr_autostore. 
Can't add msr %x\n", 1049 msr_index); 1050 return; 1051 } 1052 last = autostore->nr++; 1053 autostore->val[last].index = msr_index; 1054 } else if (!in_vmcs12_store_list && in_autostore_list) { 1055 last = --autostore->nr; 1056 autostore->val[msr_autostore_slot] = autostore->val[last]; 1057 } 1058 } 1059 1060 /* 1061 * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. 1062 * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't 1063 * enable VPID for L2 (implying it expects a TLB flush on VMX transitions). 1064 * Here's why. 1065 * 1066 * If EPT is enabled by L0 a sync is never needed: 1067 * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there 1068 * cannot be unsync'd SPTEs for either L1 or L2. 1069 * 1070 * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter 1071 * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings 1072 * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush 1073 * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't 1074 * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit. 1075 * 1076 * If EPT is disabled by L0: 1077 * - if VPID is enabled by L1 (for L2), the situation is similar to when L1 1078 * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't 1079 * required to invalidate linear mappings (EPT is disabled so there are 1080 * no combined or guest-physical mappings), i.e. L1 can't rely on the 1081 * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1). 1082 * 1083 * - however if VPID is disabled by L1, then a sync is needed as L1 expects all 1084 * linear mappings (EPT is disabled so there are no combined or guest-physical 1085 * mappings) to be invalidated on both VM-Enter and VM-Exit. 1086 * 1087 * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which 1088 * additionally checks that L2 has been assigned a VPID (when EPT is disabled). 1089 * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect 1090 * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2 1091 * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has 1092 * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1 1093 * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't 1094 * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush 1095 * stale TLB entries, at which point L0 will sync L2's MMU. 1096 */ 1097 static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) 1098 { 1099 return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu)); 1100 } 1101 1102 /* 1103 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1104 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1105 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1106 * @entry_failure_code. 1107 */ 1108 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, 1109 enum vm_entry_failure_code *entry_failure_code) 1110 { 1111 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { 1112 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1113 return -EINVAL; 1114 } 1115 1116 /* 1117 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1118 * must not be dereferenced. 
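	 *
	 * With EPT enabled, CR3 holds a guest-physical address that is only
	 * translated through the EPT tables, so KVM must not read PDPTEs
	 * through it.  Only in the !nested_ept case below does KVM load and
	 * validate the four PAE PDPTEs itself.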
1119 */ 1120 if (!nested_ept && is_pae_paging(vcpu) && 1121 (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { 1122 if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { 1123 *entry_failure_code = ENTRY_FAIL_PDPTE; 1124 return -EINVAL; 1125 } 1126 } 1127 1128 /* 1129 * Unconditionally skip the TLB flush on fast CR3 switch, all TLB 1130 * flushes are handled by nested_vmx_transition_tlb_flush(). See 1131 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync. 1132 */ 1133 if (!nested_ept) 1134 kvm_mmu_new_pgd(vcpu, cr3, true, 1135 !nested_vmx_transition_mmu_sync(vcpu)); 1136 1137 vcpu->arch.cr3 = cr3; 1138 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 1139 1140 kvm_init_mmu(vcpu, false); 1141 1142 return 0; 1143 } 1144 1145 /* 1146 * Returns if KVM is able to config CPU to tag TLB entries 1147 * populated by L2 differently than TLB entries populated 1148 * by L1. 1149 * 1150 * If L0 uses EPT, L1 and L2 run with different EPTP because 1151 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1152 * are tagged with different EPTP. 1153 * 1154 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1155 * with different VPID (L1 entries are tagged with vmx->vpid 1156 * while L2 entries are tagged with vmx->nested.vpid02). 1157 */ 1158 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1159 { 1160 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1161 1162 return enable_ept || 1163 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1164 } 1165 1166 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1167 struct vmcs12 *vmcs12, 1168 bool is_vmenter) 1169 { 1170 struct vcpu_vmx *vmx = to_vmx(vcpu); 1171 1172 /* 1173 * If VPID is disabled, linear and combined mappings are flushed on 1174 * VM-Enter/VM-Exit, and guest-physical mappings are valid only for 1175 * their associated EPTP. 1176 */ 1177 if (!enable_vpid) 1178 return; 1179 1180 /* 1181 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings 1182 * for *all* contexts to be flushed on VM-Enter/VM-Exit. 1183 * 1184 * If VPID is enabled and used by vmc12, but L2 does not have a unique 1185 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate 1186 * a VPID for L2, flush the current context as the effective ASID is 1187 * common to both L1 and L2. 1188 * 1189 * Defer the flush so that it runs after vmcs02.EPTP has been set by 1190 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid 1191 * redundant flushes further down the nested pipeline. 1192 * 1193 * If a TLB flush isn't required due to any of the above, and vpid12 is 1194 * changing then the new "virtual" VPID (vpid12) will reuse the same 1195 * "real" VPID (vpid02), and so needs to be sync'd. There is no direct 1196 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for 1197 * all nested vCPUs. 
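	 *
	 * In short: no vpid12 -> flush everything (KVM_REQ_TLB_FLUSH);
	 * vpid12 set but no unique TLB tag for L2 -> flush the current
	 * context (KVM_REQ_TLB_FLUSH_CURRENT); vpid12 changed on VM-Enter
	 * -> sync the TLB for vpid02 via vpid_sync_context().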
1198 */ 1199 if (!nested_cpu_has_vpid(vmcs12)) { 1200 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1201 } else if (!nested_has_guest_tlb_tag(vcpu)) { 1202 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1203 } else if (is_vmenter && 1204 vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1205 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1206 vpid_sync_context(nested_get_vpid02(vcpu)); 1207 } 1208 } 1209 1210 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1211 { 1212 superset &= mask; 1213 subset &= mask; 1214 1215 return (superset | subset) == superset; 1216 } 1217 1218 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1219 { 1220 const u64 feature_and_reserved = 1221 /* feature (except bit 48; see below) */ 1222 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1223 /* reserved */ 1224 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1225 u64 vmx_basic = vmx->nested.msrs.basic; 1226 1227 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1228 return -EINVAL; 1229 1230 /* 1231 * KVM does not emulate a version of VMX that constrains physical 1232 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1233 */ 1234 if (data & BIT_ULL(48)) 1235 return -EINVAL; 1236 1237 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1238 vmx_basic_vmcs_revision_id(data)) 1239 return -EINVAL; 1240 1241 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1242 return -EINVAL; 1243 1244 vmx->nested.msrs.basic = data; 1245 return 0; 1246 } 1247 1248 static int 1249 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1250 { 1251 u64 supported; 1252 u32 *lowp, *highp; 1253 1254 switch (msr_index) { 1255 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1256 lowp = &vmx->nested.msrs.pinbased_ctls_low; 1257 highp = &vmx->nested.msrs.pinbased_ctls_high; 1258 break; 1259 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1260 lowp = &vmx->nested.msrs.procbased_ctls_low; 1261 highp = &vmx->nested.msrs.procbased_ctls_high; 1262 break; 1263 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1264 lowp = &vmx->nested.msrs.exit_ctls_low; 1265 highp = &vmx->nested.msrs.exit_ctls_high; 1266 break; 1267 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1268 lowp = &vmx->nested.msrs.entry_ctls_low; 1269 highp = &vmx->nested.msrs.entry_ctls_high; 1270 break; 1271 case MSR_IA32_VMX_PROCBASED_CTLS2: 1272 lowp = &vmx->nested.msrs.secondary_ctls_low; 1273 highp = &vmx->nested.msrs.secondary_ctls_high; 1274 break; 1275 default: 1276 BUG(); 1277 } 1278 1279 supported = vmx_control_msr(*lowp, *highp); 1280 1281 /* Check must-be-1 bits are still 1. */ 1282 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1283 return -EINVAL; 1284 1285 /* Check must-be-0 bits are still 0. 
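	 * The upper 32 bits of a VMX capability MSR report the allowed-1
	 * settings, so userspace may clear bits there but must not set any
	 * bit that the original value did not allow.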
*/ 1286 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1287 return -EINVAL; 1288 1289 *lowp = data; 1290 *highp = data >> 32; 1291 return 0; 1292 } 1293 1294 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1295 { 1296 const u64 feature_and_reserved_bits = 1297 /* feature */ 1298 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1299 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1300 /* reserved */ 1301 GENMASK_ULL(13, 9) | BIT_ULL(31); 1302 u64 vmx_misc; 1303 1304 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 1305 vmx->nested.msrs.misc_high); 1306 1307 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1308 return -EINVAL; 1309 1310 if ((vmx->nested.msrs.pinbased_ctls_high & 1311 PIN_BASED_VMX_PREEMPTION_TIMER) && 1312 vmx_misc_preemption_timer_rate(data) != 1313 vmx_misc_preemption_timer_rate(vmx_misc)) 1314 return -EINVAL; 1315 1316 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1317 return -EINVAL; 1318 1319 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1320 return -EINVAL; 1321 1322 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1323 return -EINVAL; 1324 1325 vmx->nested.msrs.misc_low = data; 1326 vmx->nested.msrs.misc_high = data >> 32; 1327 1328 return 0; 1329 } 1330 1331 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1332 { 1333 u64 vmx_ept_vpid_cap; 1334 1335 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, 1336 vmx->nested.msrs.vpid_caps); 1337 1338 /* Every bit is either reserved or a feature bit. */ 1339 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1340 return -EINVAL; 1341 1342 vmx->nested.msrs.ept_caps = data; 1343 vmx->nested.msrs.vpid_caps = data >> 32; 1344 return 0; 1345 } 1346 1347 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1348 { 1349 u64 *msr; 1350 1351 switch (msr_index) { 1352 case MSR_IA32_VMX_CR0_FIXED0: 1353 msr = &vmx->nested.msrs.cr0_fixed0; 1354 break; 1355 case MSR_IA32_VMX_CR4_FIXED0: 1356 msr = &vmx->nested.msrs.cr4_fixed0; 1357 break; 1358 default: 1359 BUG(); 1360 } 1361 1362 /* 1363 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1364 * must be 1 in the restored value. 1365 */ 1366 if (!is_bitwise_subset(data, *msr, -1ULL)) 1367 return -EINVAL; 1368 1369 *msr = data; 1370 return 0; 1371 } 1372 1373 /* 1374 * Called when userspace is restoring VMX MSRs. 1375 * 1376 * Returns 0 on success, non-0 otherwise. 1377 */ 1378 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1379 { 1380 struct vcpu_vmx *vmx = to_vmx(vcpu); 1381 1382 /* 1383 * Don't allow changes to the VMX capability MSRs while the vCPU 1384 * is in VMX operation. 1385 */ 1386 if (vmx->nested.vmxon) 1387 return -EBUSY; 1388 1389 switch (msr_index) { 1390 case MSR_IA32_VMX_BASIC: 1391 return vmx_restore_vmx_basic(vmx, data); 1392 case MSR_IA32_VMX_PINBASED_CTLS: 1393 case MSR_IA32_VMX_PROCBASED_CTLS: 1394 case MSR_IA32_VMX_EXIT_CTLS: 1395 case MSR_IA32_VMX_ENTRY_CTLS: 1396 /* 1397 * The "non-true" VMX capability MSRs are generated from the 1398 * "true" MSRs, so we do not support restoring them directly. 1399 * 1400 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1401 * should restore the "true" MSRs with the must-be-1 bits 1402 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1403 * DEFAULT SETTINGS". 
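		 *
		 * (VMX_BASIC[55] is the bit that advertises the
		 * IA32_VMX_TRUE_*_CTLS MSRs; when it is clear, the default1
		 * controls cannot be cleared and only the non-true MSRs are
		 * defined.)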
1404 */ 1405 return -EINVAL; 1406 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1407 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1408 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1409 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1410 case MSR_IA32_VMX_PROCBASED_CTLS2: 1411 return vmx_restore_control_msr(vmx, msr_index, data); 1412 case MSR_IA32_VMX_MISC: 1413 return vmx_restore_vmx_misc(vmx, data); 1414 case MSR_IA32_VMX_CR0_FIXED0: 1415 case MSR_IA32_VMX_CR4_FIXED0: 1416 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1417 case MSR_IA32_VMX_CR0_FIXED1: 1418 case MSR_IA32_VMX_CR4_FIXED1: 1419 /* 1420 * These MSRs are generated based on the vCPU's CPUID, so we 1421 * do not support restoring them directly. 1422 */ 1423 return -EINVAL; 1424 case MSR_IA32_VMX_EPT_VPID_CAP: 1425 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1426 case MSR_IA32_VMX_VMCS_ENUM: 1427 vmx->nested.msrs.vmcs_enum = data; 1428 return 0; 1429 case MSR_IA32_VMX_VMFUNC: 1430 if (data & ~vmx->nested.msrs.vmfunc_controls) 1431 return -EINVAL; 1432 vmx->nested.msrs.vmfunc_controls = data; 1433 return 0; 1434 default: 1435 /* 1436 * The rest of the VMX capability MSRs do not support restore. 1437 */ 1438 return -EINVAL; 1439 } 1440 } 1441 1442 /* Returns 0 on success, non-0 otherwise. */ 1443 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1444 { 1445 switch (msr_index) { 1446 case MSR_IA32_VMX_BASIC: 1447 *pdata = msrs->basic; 1448 break; 1449 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1450 case MSR_IA32_VMX_PINBASED_CTLS: 1451 *pdata = vmx_control_msr( 1452 msrs->pinbased_ctls_low, 1453 msrs->pinbased_ctls_high); 1454 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1455 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1456 break; 1457 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1458 case MSR_IA32_VMX_PROCBASED_CTLS: 1459 *pdata = vmx_control_msr( 1460 msrs->procbased_ctls_low, 1461 msrs->procbased_ctls_high); 1462 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1463 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1464 break; 1465 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1466 case MSR_IA32_VMX_EXIT_CTLS: 1467 *pdata = vmx_control_msr( 1468 msrs->exit_ctls_low, 1469 msrs->exit_ctls_high); 1470 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1471 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1472 break; 1473 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1474 case MSR_IA32_VMX_ENTRY_CTLS: 1475 *pdata = vmx_control_msr( 1476 msrs->entry_ctls_low, 1477 msrs->entry_ctls_high); 1478 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1479 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1480 break; 1481 case MSR_IA32_VMX_MISC: 1482 *pdata = vmx_control_msr( 1483 msrs->misc_low, 1484 msrs->misc_high); 1485 break; 1486 case MSR_IA32_VMX_CR0_FIXED0: 1487 *pdata = msrs->cr0_fixed0; 1488 break; 1489 case MSR_IA32_VMX_CR0_FIXED1: 1490 *pdata = msrs->cr0_fixed1; 1491 break; 1492 case MSR_IA32_VMX_CR4_FIXED0: 1493 *pdata = msrs->cr4_fixed0; 1494 break; 1495 case MSR_IA32_VMX_CR4_FIXED1: 1496 *pdata = msrs->cr4_fixed1; 1497 break; 1498 case MSR_IA32_VMX_VMCS_ENUM: 1499 *pdata = msrs->vmcs_enum; 1500 break; 1501 case MSR_IA32_VMX_PROCBASED_CTLS2: 1502 *pdata = vmx_control_msr( 1503 msrs->secondary_ctls_low, 1504 msrs->secondary_ctls_high); 1505 break; 1506 case MSR_IA32_VMX_EPT_VPID_CAP: 1507 *pdata = msrs->ept_caps | 1508 ((u64)msrs->vpid_caps << 32); 1509 break; 1510 case MSR_IA32_VMX_VMFUNC: 1511 *pdata = msrs->vmfunc_controls; 1512 break; 1513 default: 1514 return 1; 1515 } 1516 1517 return 0; 1518 } 1519 1520 /* 1521 * Copy the writable VMCS shadow fields back to the VMCS12, in case they 
have 1522 * been modified by the L1 guest. Note, "writable" in this context means 1523 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1524 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1525 * VM-exit information fields (which are actually writable if the vCPU is 1526 * configured to support "VMWRITE to any supported field in the VMCS"). 1527 */ 1528 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1529 { 1530 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1531 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1532 struct shadow_vmcs_field field; 1533 unsigned long val; 1534 int i; 1535 1536 if (WARN_ON(!shadow_vmcs)) 1537 return; 1538 1539 preempt_disable(); 1540 1541 vmcs_load(shadow_vmcs); 1542 1543 for (i = 0; i < max_shadow_read_write_fields; i++) { 1544 field = shadow_read_write_fields[i]; 1545 val = __vmcs_readl(field.encoding); 1546 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1547 } 1548 1549 vmcs_clear(shadow_vmcs); 1550 vmcs_load(vmx->loaded_vmcs->vmcs); 1551 1552 preempt_enable(); 1553 } 1554 1555 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1556 { 1557 const struct shadow_vmcs_field *fields[] = { 1558 shadow_read_write_fields, 1559 shadow_read_only_fields 1560 }; 1561 const int max_fields[] = { 1562 max_shadow_read_write_fields, 1563 max_shadow_read_only_fields 1564 }; 1565 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1566 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1567 struct shadow_vmcs_field field; 1568 unsigned long val; 1569 int i, q; 1570 1571 if (WARN_ON(!shadow_vmcs)) 1572 return; 1573 1574 vmcs_load(shadow_vmcs); 1575 1576 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1577 for (i = 0; i < max_fields[q]; i++) { 1578 field = fields[q][i]; 1579 val = vmcs12_read_any(vmcs12, field.encoding, 1580 field.offset); 1581 __vmcs_writel(field.encoding, val); 1582 } 1583 } 1584 1585 vmcs_clear(shadow_vmcs); 1586 vmcs_load(vmx->loaded_vmcs->vmcs); 1587 } 1588 1589 static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) 1590 { 1591 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1592 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1593 1594 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1595 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1596 vmcs12->guest_rip = evmcs->guest_rip; 1597 1598 if (unlikely(!(evmcs->hv_clean_fields & 1599 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1600 vmcs12->guest_rsp = evmcs->guest_rsp; 1601 vmcs12->guest_rflags = evmcs->guest_rflags; 1602 vmcs12->guest_interruptibility_info = 1603 evmcs->guest_interruptibility_info; 1604 } 1605 1606 if (unlikely(!(evmcs->hv_clean_fields & 1607 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1608 vmcs12->cpu_based_vm_exec_control = 1609 evmcs->cpu_based_vm_exec_control; 1610 } 1611 1612 if (unlikely(!(evmcs->hv_clean_fields & 1613 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1614 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1615 } 1616 1617 if (unlikely(!(evmcs->hv_clean_fields & 1618 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1619 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1620 } 1621 1622 if (unlikely(!(evmcs->hv_clean_fields & 1623 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1624 vmcs12->vm_entry_intr_info_field = 1625 evmcs->vm_entry_intr_info_field; 1626 vmcs12->vm_entry_exception_error_code = 1627 evmcs->vm_entry_exception_error_code; 1628 vmcs12->vm_entry_instruction_len = 1629 evmcs->vm_entry_instruction_len; 1630 } 1631 1632 if (unlikely(!(evmcs->hv_clean_fields & 
1633 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1634 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1635 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1636 vmcs12->host_cr0 = evmcs->host_cr0; 1637 vmcs12->host_cr3 = evmcs->host_cr3; 1638 vmcs12->host_cr4 = evmcs->host_cr4; 1639 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1640 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1641 vmcs12->host_rip = evmcs->host_rip; 1642 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1643 vmcs12->host_es_selector = evmcs->host_es_selector; 1644 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1645 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1646 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1647 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1648 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1649 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1650 } 1651 1652 if (unlikely(!(evmcs->hv_clean_fields & 1653 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1654 vmcs12->pin_based_vm_exec_control = 1655 evmcs->pin_based_vm_exec_control; 1656 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1657 vmcs12->secondary_vm_exec_control = 1658 evmcs->secondary_vm_exec_control; 1659 } 1660 1661 if (unlikely(!(evmcs->hv_clean_fields & 1662 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1663 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1664 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1665 } 1666 1667 if (unlikely(!(evmcs->hv_clean_fields & 1668 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1669 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1670 } 1671 1672 if (unlikely(!(evmcs->hv_clean_fields & 1673 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1674 vmcs12->guest_es_base = evmcs->guest_es_base; 1675 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1676 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1677 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1678 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1679 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1680 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1681 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1682 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1683 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1684 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1685 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1686 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1687 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1688 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1689 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1690 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1691 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1692 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1693 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1694 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1695 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1696 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1697 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1698 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1699 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1700 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1701 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1702 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1703 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1704 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1705 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1706 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 
1707 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1708 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1709 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1710 } 1711 1712 if (unlikely(!(evmcs->hv_clean_fields & 1713 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1714 vmcs12->tsc_offset = evmcs->tsc_offset; 1715 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1716 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1717 } 1718 1719 if (unlikely(!(evmcs->hv_clean_fields & 1720 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1721 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1722 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1723 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1724 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1725 vmcs12->guest_cr0 = evmcs->guest_cr0; 1726 vmcs12->guest_cr3 = evmcs->guest_cr3; 1727 vmcs12->guest_cr4 = evmcs->guest_cr4; 1728 vmcs12->guest_dr7 = evmcs->guest_dr7; 1729 } 1730 1731 if (unlikely(!(evmcs->hv_clean_fields & 1732 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1733 vmcs12->host_fs_base = evmcs->host_fs_base; 1734 vmcs12->host_gs_base = evmcs->host_gs_base; 1735 vmcs12->host_tr_base = evmcs->host_tr_base; 1736 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1737 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1738 vmcs12->host_rsp = evmcs->host_rsp; 1739 } 1740 1741 if (unlikely(!(evmcs->hv_clean_fields & 1742 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1743 vmcs12->ept_pointer = evmcs->ept_pointer; 1744 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1745 } 1746 1747 if (unlikely(!(evmcs->hv_clean_fields & 1748 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1749 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1750 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1751 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1752 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1753 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1754 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1755 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1756 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1757 vmcs12->guest_pending_dbg_exceptions = 1758 evmcs->guest_pending_dbg_exceptions; 1759 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1760 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1761 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1762 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1763 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1764 } 1765 1766 /* 1767 * Not used? 
1768 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1769 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1770 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1771 * vmcs12->page_fault_error_code_mask = 1772 * evmcs->page_fault_error_code_mask; 1773 * vmcs12->page_fault_error_code_match = 1774 * evmcs->page_fault_error_code_match; 1775 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1776 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1777 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1778 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1779 */ 1780 1781 /* 1782 * Read only fields: 1783 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1784 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1785 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1786 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1787 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1788 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1789 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1790 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1791 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1792 * vmcs12->exit_qualification = evmcs->exit_qualification; 1793 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1794 * 1795 * Not present in struct vmcs12: 1796 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1797 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1798 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1799 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1800 */ 1801 1802 return 0; 1803 } 1804 1805 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1806 { 1807 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1808 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1809 1810 /* 1811 * Should not be changed by KVM: 1812 * 1813 * evmcs->host_es_selector = vmcs12->host_es_selector; 1814 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1815 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1816 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1817 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1818 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1819 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1820 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1821 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1822 * evmcs->host_cr0 = vmcs12->host_cr0; 1823 * evmcs->host_cr3 = vmcs12->host_cr3; 1824 * evmcs->host_cr4 = vmcs12->host_cr4; 1825 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1826 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1827 * evmcs->host_rip = vmcs12->host_rip; 1828 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1829 * evmcs->host_fs_base = vmcs12->host_fs_base; 1830 * evmcs->host_gs_base = vmcs12->host_gs_base; 1831 * evmcs->host_tr_base = vmcs12->host_tr_base; 1832 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1833 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1834 * evmcs->host_rsp = vmcs12->host_rsp; 1835 * sync_vmcs02_to_vmcs12() doesn't read these: 1836 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1837 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1838 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1839 * evmcs->ept_pointer = vmcs12->ept_pointer; 1840 * 
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1841 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1842 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1843 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1844 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1845 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1846 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1847 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1848 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1849 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1850 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1851 * evmcs->page_fault_error_code_mask = 1852 * vmcs12->page_fault_error_code_mask; 1853 * evmcs->page_fault_error_code_match = 1854 * vmcs12->page_fault_error_code_match; 1855 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1856 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1857 * evmcs->tsc_offset = vmcs12->tsc_offset; 1858 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1859 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1860 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1861 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1862 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1863 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1864 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1865 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1866 * 1867 * Not present in struct vmcs12: 1868 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1869 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1870 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1871 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1872 */ 1873 1874 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1875 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1876 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1877 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1878 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1879 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1880 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1881 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1882 1883 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1884 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1885 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1886 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1887 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1888 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1889 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1890 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1891 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1892 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1893 1894 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1895 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1896 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1897 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1898 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1899 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1900 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1901 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1902 1903 evmcs->guest_es_base = vmcs12->guest_es_base; 1904 evmcs->guest_cs_base = vmcs12->guest_cs_base; 
1905 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1906 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1907 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1908 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1909 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1910 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1911 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1912 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1913 1914 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1915 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1916 1917 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1918 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1919 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1920 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1921 1922 evmcs->guest_pending_dbg_exceptions = 1923 vmcs12->guest_pending_dbg_exceptions; 1924 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1925 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1926 1927 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1928 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1929 1930 evmcs->guest_cr0 = vmcs12->guest_cr0; 1931 evmcs->guest_cr3 = vmcs12->guest_cr3; 1932 evmcs->guest_cr4 = vmcs12->guest_cr4; 1933 evmcs->guest_dr7 = vmcs12->guest_dr7; 1934 1935 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1936 1937 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1938 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1939 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1940 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1941 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1942 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1943 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1944 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1945 1946 evmcs->exit_qualification = vmcs12->exit_qualification; 1947 1948 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1949 evmcs->guest_rsp = vmcs12->guest_rsp; 1950 evmcs->guest_rflags = vmcs12->guest_rflags; 1951 1952 evmcs->guest_interruptibility_info = 1953 vmcs12->guest_interruptibility_info; 1954 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1955 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1956 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1957 evmcs->vm_entry_exception_error_code = 1958 vmcs12->vm_entry_exception_error_code; 1959 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1960 1961 evmcs->guest_rip = vmcs12->guest_rip; 1962 1963 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1964 1965 return 0; 1966 } 1967 1968 /* 1969 * This is an equivalent of the nested hypervisor executing the vmptrld 1970 * instruction. 
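 *
 * On success the eVMCS page is mapped via kvm_vcpu_map() and cached in
 * vmx->nested.hv_evmcs; a change of the eVMCS GPA, like a VMLAUNCH,
 * also invalidates the clean-fields optimization at the bottom of this
 * function.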
1971  */
1972 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
1973 	struct kvm_vcpu *vcpu, bool from_launch)
1974 {
1975 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1976 	bool evmcs_gpa_changed = false;
1977 	u64 evmcs_gpa;
1978 
1979 	if (likely(!vmx->nested.enlightened_vmcs_enabled))
1980 		return EVMPTRLD_DISABLED;
1981 
1982 	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1983 		return EVMPTRLD_DISABLED;
1984 
1985 	if (unlikely(!vmx->nested.hv_evmcs ||
1986 		     evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1987 		if (!vmx->nested.hv_evmcs)
1988 			vmx->nested.current_vmptr = -1ull;
1989 
1990 		nested_release_evmcs(vcpu);
1991 
1992 		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1993 				 &vmx->nested.hv_evmcs_map))
1994 			return EVMPTRLD_ERROR;
1995 
1996 		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1997 
1998 		/*
1999 		 * Currently, KVM only supports eVMCS version 1
2000 		 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
2001 		 * this value in the first u32 field of the eVMCS, which
2002 		 * specifies the eVMCS VersionNumber.
2003 		 *
2004 		 * The guest learns the eVMCS versions supported by the host
2005 		 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace
2006 		 * VMM is expected to set this CPUID leaf according to the
2007 		 * value returned in vmcs_version from nested_enable_evmcs().
2008 		 *
2009 		 * However, it turns out that Microsoft Hyper-V fails to comply
2010 		 * with its own invented interface: when Hyper-V uses eVMCS, it
2011 		 * just sets the first u32 field of the eVMCS to the revision_id
2012 		 * specified in MSR_IA32_VMX_BASIC instead of the eVMCS version
2013 		 * number, which is one of the supported versions specified in
2014 		 * CPUID.0x4000000A.EAX[0:15].
2015 		 *
2016 		 * To work around this Hyper-V bug, accept either a supported
2017 		 * eVMCS version or the VMCS12 revision_id as valid values for
2018 		 * the first u32 field of the eVMCS.
2019 		 */
2020 		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2021 		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2022 			nested_release_evmcs(vcpu);
2023 			return EVMPTRLD_VMFAIL;
2024 		}
2025 
2026 		vmx->nested.dirty_vmcs12 = true;
2027 		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2028 
2029 		evmcs_gpa_changed = true;
2030 		/*
2031 		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
2032 		 * reloaded from guest memory (read-only fields, fields not
2033 		 * present in struct hv_enlightened_vmcs, ...). Make sure there
2034 		 * are no leftovers.
2035 		 */
2036 		if (from_launch) {
2037 			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2038 			memset(vmcs12, 0, sizeof(*vmcs12));
2039 			vmcs12->hdr.revision_id = VMCS12_REVISION;
2040 		}
2041 
2042 	}
2043 
2044 	/*
2045 	 * Clean fields data can't be used on VMLAUNCH or when we switch
2046 	 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
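	 * Clearing HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL forces the next
	 * copy_enlightened_to_vmcs12() to treat every field group as dirty
	 * and re-read it from the eVMCS.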
2047 */ 2048 if (from_launch || evmcs_gpa_changed) 2049 vmx->nested.hv_evmcs->hv_clean_fields &= 2050 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2051 2052 return EVMPTRLD_SUCCEEDED; 2053 } 2054 2055 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2056 { 2057 struct vcpu_vmx *vmx = to_vmx(vcpu); 2058 2059 if (vmx->nested.hv_evmcs) { 2060 copy_vmcs12_to_enlightened(vmx); 2061 /* All fields are clean */ 2062 vmx->nested.hv_evmcs->hv_clean_fields |= 2063 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2064 } else { 2065 copy_vmcs12_to_shadow(vmx); 2066 } 2067 2068 vmx->nested.need_vmcs12_to_shadow_sync = false; 2069 } 2070 2071 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2072 { 2073 struct vcpu_vmx *vmx = 2074 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2075 2076 vmx->nested.preemption_timer_expired = true; 2077 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2078 kvm_vcpu_kick(&vmx->vcpu); 2079 2080 return HRTIMER_NORESTART; 2081 } 2082 2083 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2084 { 2085 struct vcpu_vmx *vmx = to_vmx(vcpu); 2086 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2087 2088 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2089 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2090 2091 if (!vmx->nested.has_preemption_timer_deadline) { 2092 vmx->nested.preemption_timer_deadline = 2093 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2094 vmx->nested.has_preemption_timer_deadline = true; 2095 } 2096 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2097 } 2098 2099 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2100 u64 preemption_timeout) 2101 { 2102 struct vcpu_vmx *vmx = to_vmx(vcpu); 2103 2104 /* 2105 * A timer value of zero is architecturally guaranteed to cause 2106 * a VMExit prior to executing any instructions in the guest. 2107 */ 2108 if (preemption_timeout == 0) { 2109 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2110 return; 2111 } 2112 2113 if (vcpu->arch.virtual_tsc_khz == 0) 2114 return; 2115 2116 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2117 preemption_timeout *= 1000000; 2118 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2119 hrtimer_start(&vmx->nested.preemption_timer, 2120 ktime_add_ns(ktime_get(), preemption_timeout), 2121 HRTIMER_MODE_ABS_PINNED); 2122 } 2123 2124 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2125 { 2126 if (vmx->nested.nested_run_pending && 2127 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2128 return vmcs12->guest_ia32_efer; 2129 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2130 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2131 else 2132 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2133 } 2134 2135 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2136 { 2137 /* 2138 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2139 * according to L0's settings (vmcs12 is irrelevant here). Host 2140 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2141 * will be set as needed prior to VMLAUNCH/VMRESUME. 2142 */ 2143 if (vmx->nested.vmcs02_initialized) 2144 return; 2145 vmx->nested.vmcs02_initialized = true; 2146 2147 /* 2148 * We don't care what the EPTP value is we just need to guarantee 2149 * it's valid so we don't get a false positive when doing early 2150 * consistency checks. 
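 * construct_eptp() with a zero root HPA and a 4-level walk yields a
 * well-formed (if meaningless) EPTP, which is all the early checks
 * care about; the real EPTP is written on the actual VM-Enter path.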
2151 */ 2152 if (enable_ept && nested_early_check) 2153 vmcs_write64(EPT_POINTER, 2154 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2155 2156 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2157 if (cpu_has_vmx_vmfunc()) 2158 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2159 2160 if (cpu_has_vmx_posted_intr()) 2161 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2162 2163 if (cpu_has_vmx_msr_bitmap()) 2164 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2165 2166 /* 2167 * PML is emulated for L2, but never enabled in hardware as the MMU 2168 * handles A/D emulation. Disabling PML for L2 also avoids having to 2169 * deal with filtering out L2 GPAs from the buffer. 2170 */ 2171 if (enable_pml) { 2172 vmcs_write64(PML_ADDRESS, 0); 2173 vmcs_write16(GUEST_PML_INDEX, -1); 2174 } 2175 2176 if (cpu_has_vmx_encls_vmexit()) 2177 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2178 2179 /* 2180 * Set the MSR load/store lists to match L0's settings. Only the 2181 * addresses are constant (for vmcs02), the counts can change based 2182 * on L2's behavior, e.g. switching to/from long mode. 2183 */ 2184 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2185 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2186 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2187 2188 vmx_set_constant_host_state(vmx); 2189 } 2190 2191 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2192 struct vmcs12 *vmcs12) 2193 { 2194 prepare_vmcs02_constant_state(vmx); 2195 2196 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2197 2198 if (enable_vpid) { 2199 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2200 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2201 else 2202 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2203 } 2204 } 2205 2206 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2207 { 2208 u32 exec_control; 2209 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2210 2211 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2212 prepare_vmcs02_early_rare(vmx, vmcs12); 2213 2214 /* 2215 * PIN CONTROLS 2216 */ 2217 exec_control = vmx_pin_based_exec_ctrl(vmx); 2218 exec_control |= (vmcs12->pin_based_vm_exec_control & 2219 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2220 2221 /* Posted interrupts setting is only taken from vmcs12. */ 2222 if (nested_cpu_has_posted_intr(vmcs12)) { 2223 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2224 vmx->nested.pi_pending = false; 2225 } else { 2226 exec_control &= ~PIN_BASED_POSTED_INTR; 2227 } 2228 pin_controls_set(vmx, exec_control); 2229 2230 /* 2231 * EXEC CONTROLS 2232 */ 2233 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2234 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2235 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2236 exec_control &= ~CPU_BASED_TPR_SHADOW; 2237 exec_control |= vmcs12->cpu_based_vm_exec_control; 2238 2239 vmx->nested.l1_tpr_threshold = -1; 2240 if (exec_control & CPU_BASED_TPR_SHADOW) 2241 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2242 #ifdef CONFIG_X86_64 2243 else 2244 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2245 CPU_BASED_CR8_STORE_EXITING; 2246 #endif 2247 2248 /* 2249 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2250 * for I/O port accesses. 
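 * Unconditional exiting supersedes L1's I/O bitmaps; when L1 does use
 * the bitmaps, KVM consults them at exit time to decide whether the
 * exit is reflected to L1 or handled by L0.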
2251 */ 2252 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2253 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2254 2255 /* 2256 * This bit will be computed in nested_get_vmcs12_pages, because 2257 * we do not have access to L1's MSR bitmap yet. For now, keep 2258 * the same bit as before, hoping to avoid multiple VMWRITEs that 2259 * only set/clear this bit. 2260 */ 2261 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2262 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2263 2264 exec_controls_set(vmx, exec_control); 2265 2266 /* 2267 * SECONDARY EXEC CONTROLS 2268 */ 2269 if (cpu_has_secondary_exec_ctrls()) { 2270 exec_control = vmx->secondary_exec_control; 2271 2272 /* Take the following fields only from vmcs12 */ 2273 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2274 SECONDARY_EXEC_ENABLE_INVPCID | 2275 SECONDARY_EXEC_ENABLE_RDTSCP | 2276 SECONDARY_EXEC_XSAVES | 2277 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2278 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2279 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2280 SECONDARY_EXEC_ENABLE_VMFUNC); 2281 if (nested_cpu_has(vmcs12, 2282 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2283 exec_control |= vmcs12->secondary_vm_exec_control; 2284 2285 /* PML is emulated and never enabled in hardware for L2. */ 2286 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2287 2288 /* VMCS shadowing for L2 is emulated for now */ 2289 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2290 2291 /* 2292 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2293 * will not have to rewrite the controls just for this bit. 2294 */ 2295 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2296 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2297 exec_control |= SECONDARY_EXEC_DESC; 2298 2299 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2300 vmcs_write16(GUEST_INTR_STATUS, 2301 vmcs12->guest_intr_status); 2302 2303 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2304 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2305 2306 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2307 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2308 2309 secondary_exec_controls_set(vmx, exec_control); 2310 } 2311 2312 /* 2313 * ENTRY CONTROLS 2314 * 2315 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2316 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2317 * on the related bits (if supported by the CPU) in the hope that 2318 * we can avoid VMWrites during vmx_set_efer(). 2319 */ 2320 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2321 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2322 if (cpu_has_load_ia32_efer()) { 2323 if (guest_efer & EFER_LMA) 2324 exec_control |= VM_ENTRY_IA32E_MODE; 2325 if (guest_efer != host_efer) 2326 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2327 } 2328 vm_entry_controls_set(vmx, exec_control); 2329 2330 /* 2331 * EXIT CONTROLS 2332 * 2333 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2334 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2335 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
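 * vmcs12's exit controls are instead applied in software when the
 * nested VM-Exit is synthesized, e.g. in load_vmcs12_host_state().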
2336 */ 2337 exec_control = vmx_vmexit_ctrl(); 2338 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2339 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2340 vm_exit_controls_set(vmx, exec_control); 2341 2342 /* 2343 * Interrupt/Exception Fields 2344 */ 2345 if (vmx->nested.nested_run_pending) { 2346 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2347 vmcs12->vm_entry_intr_info_field); 2348 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2349 vmcs12->vm_entry_exception_error_code); 2350 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2351 vmcs12->vm_entry_instruction_len); 2352 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2353 vmcs12->guest_interruptibility_info); 2354 vmx->loaded_vmcs->nmi_known_unmasked = 2355 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2356 } else { 2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2358 } 2359 } 2360 2361 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2362 { 2363 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2364 2365 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2366 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2367 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2368 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2369 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2370 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2371 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2372 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2373 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2374 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2375 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2376 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2377 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2378 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2379 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2380 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2381 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2382 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2383 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2384 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2385 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2386 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2387 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2388 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2389 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2390 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2391 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2392 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2393 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2394 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2395 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2396 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2397 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2398 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2399 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2400 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2401 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2402 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2403 2404 vmx->segment_cache.bitmask = 0; 2405 } 2406 2407 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2408 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2409 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2410 
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2411 vmcs12->guest_pending_dbg_exceptions); 2412 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2413 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2414 2415 /* 2416 * L1 may access the L2's PDPTR, so save them to construct 2417 * vmcs12 2418 */ 2419 if (enable_ept) { 2420 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2421 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2422 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2423 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2424 } 2425 2426 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2427 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2428 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2429 } 2430 2431 if (nested_cpu_has_xsaves(vmcs12)) 2432 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2433 2434 /* 2435 * Whether page-faults are trapped is determined by a combination of 2436 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2437 * doesn't care about page faults then we should set all of these to 2438 * L1's desires. However, if L0 does care about (some) page faults, it 2439 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2440 * simply ask to exit on each and every L2 page fault. This is done by 2441 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2442 * Note that below we don't need special code to set EB.PF beyond the 2443 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2444 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2445 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2446 */ 2447 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2448 /* 2449 * TODO: if both L0 and L1 need the same MASK and MATCH, 2450 * go ahead and use it? 2451 */ 2452 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2453 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2454 } else { 2455 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2456 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2457 } 2458 2459 if (cpu_has_vmx_apicv()) { 2460 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2461 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2462 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2463 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2464 } 2465 2466 /* 2467 * Make sure the msr_autostore list is up to date before we set the 2468 * count in the vmcs02. 2469 */ 2470 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2471 2472 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2473 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2474 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2475 2476 set_cr4_guest_host_mask(vmx); 2477 } 2478 2479 /* 2480 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2481 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2482 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2483 * guest in a way that will both be appropriate to L1's requests, and our 2484 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2485 * function also has additional necessary side-effects, like setting various 2486 * vcpu->arch fields. 2487 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2488 * is assigned to entry_failure_code on failure. 
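 * prepare_vmcs02_early() has already programmed the VMX controls by the
 * time this runs; prepare_vmcs02_rare() is invoked from here only when
 * vmcs12 is dirty (or an eVMCS is in use) to refresh the rarely-changing
 * guest fields.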
2489  */
2490 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2491 			  enum vm_entry_failure_code *entry_failure_code)
2492 {
2493 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2494 	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2495 	bool load_guest_pdptrs_vmcs12 = false;
2496 
2497 	if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2498 		prepare_vmcs02_rare(vmx, vmcs12);
2499 		vmx->nested.dirty_vmcs12 = false;
2500 
2501 		load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2502 			!(hv_evmcs->hv_clean_fields &
2503 			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2504 	}
2505 
2506 	if (vmx->nested.nested_run_pending &&
2507 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2508 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2509 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2510 	} else {
2511 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2512 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2513 	}
2514 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2515 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2516 		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2517 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2518 
2519 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2520 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
2521 	 * trap. Note that CR0.TS also needs updating - we do this later.
2522 	 */
2523 	vmx_update_exception_bitmap(vcpu);
2524 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2525 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2526 
2527 	if (vmx->nested.nested_run_pending &&
2528 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2529 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2530 		vcpu->arch.pat = vmcs12->guest_ia32_pat;
2531 	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2532 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2533 	}
2534 
2535 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2536 
2537 	if (kvm_has_tsc_control)
2538 		decache_tsc_multiplier(vmx);
2539 
2540 	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2541 
2542 	if (nested_cpu_has_ept(vmcs12))
2543 		nested_ept_init_mmu_context(vcpu);
2544 
2545 	/*
2546 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2547 	 * bits which we consider mandatory enabled.
2548 	 * The CR0_READ_SHADOW is what L2 should have expected to read given
2549 	 * the specifications by L1; it's not enough to take
2550 	 * vmcs12->cr0_read_shadow, because our cr0_guest_host_mask may have
2551 	 * more bits set than L1 expected.
2552 	 */
2553 	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2554 	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2555 
2556 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2557 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2558 
2559 	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2560 	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2561 	vmx_set_efer(vcpu, vcpu->arch.efer);
2562 
2563 	/*
2564 	 * If guest state is invalid and unrestricted guest is disabled, L1
2565 	 * attempted a VMEntry to L2 with invalid state.
2566 	 * Fail the VMEntry.
2567 	 */
2568 	if (CC(!vmx_guest_state_valid(vcpu))) {
2569 		*entry_failure_code = ENTRY_FAIL_DEFAULT;
2570 		return -EINVAL;
2571 	}
2572 
2573 	/* Load the L2 CR3, backed by either EPT or shadow page tables. */
2574 	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2575 				entry_failure_code))
2576 		return -EINVAL;
2577 
2578 	/*
2579 	 * Immediately write vmcs02.GUEST_CR3.
It will be propagated to vmcs12 2580 * on nested VM-Exit, which can occur without actually running L2 and 2581 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2582 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2583 * transition to HLT instead of running L2. 2584 */ 2585 if (enable_ept) 2586 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2587 2588 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2589 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2590 is_pae_paging(vcpu)) { 2591 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2592 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2593 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2594 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2595 } 2596 2597 if (!enable_ept) 2598 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2599 2600 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2601 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2602 vmcs12->guest_ia32_perf_global_ctrl))) 2603 return -EINVAL; 2604 2605 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2606 kvm_rip_write(vcpu, vmcs12->guest_rip); 2607 return 0; 2608 } 2609 2610 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2611 { 2612 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2613 nested_cpu_has_virtual_nmis(vmcs12))) 2614 return -EINVAL; 2615 2616 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2617 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2618 return -EINVAL; 2619 2620 return 0; 2621 } 2622 2623 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2624 { 2625 struct vcpu_vmx *vmx = to_vmx(vcpu); 2626 2627 /* Check for memory type validity */ 2628 switch (new_eptp & VMX_EPTP_MT_MASK) { 2629 case VMX_EPTP_MT_UC: 2630 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2631 return false; 2632 break; 2633 case VMX_EPTP_MT_WB: 2634 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2635 return false; 2636 break; 2637 default: 2638 return false; 2639 } 2640 2641 /* Page-walk levels validity. 
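 * Bits 5:3 of the EPTP hold the page-walk length minus 1, so
 * VMX_EPTP_PWL_5/VMX_EPTP_PWL_4 correspond to 5- and 4-level walks;
 * bits 11:7 are treated as reserved by the check further below.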
*/ 2642 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2643 case VMX_EPTP_PWL_5: 2644 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2645 return false; 2646 break; 2647 case VMX_EPTP_PWL_4: 2648 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2649 return false; 2650 break; 2651 default: 2652 return false; 2653 } 2654 2655 /* Reserved bits should not be set */ 2656 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2657 return false; 2658 2659 /* AD, if set, should be supported */ 2660 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2661 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2662 return false; 2663 } 2664 2665 return true; 2666 } 2667 2668 /* 2669 * Checks related to VM-Execution Control Fields 2670 */ 2671 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2672 struct vmcs12 *vmcs12) 2673 { 2674 struct vcpu_vmx *vmx = to_vmx(vcpu); 2675 2676 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2677 vmx->nested.msrs.pinbased_ctls_low, 2678 vmx->nested.msrs.pinbased_ctls_high)) || 2679 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2680 vmx->nested.msrs.procbased_ctls_low, 2681 vmx->nested.msrs.procbased_ctls_high))) 2682 return -EINVAL; 2683 2684 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2685 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2686 vmx->nested.msrs.secondary_ctls_low, 2687 vmx->nested.msrs.secondary_ctls_high))) 2688 return -EINVAL; 2689 2690 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2691 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2692 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2693 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2694 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2695 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2696 nested_vmx_check_nmi_controls(vmcs12) || 2697 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2698 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2699 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2700 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2701 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2702 return -EINVAL; 2703 2704 if (!nested_cpu_has_preemption_timer(vmcs12) && 2705 nested_cpu_has_save_preemption_timer(vmcs12)) 2706 return -EINVAL; 2707 2708 if (nested_cpu_has_ept(vmcs12) && 2709 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2710 return -EINVAL; 2711 2712 if (nested_cpu_has_vmfunc(vmcs12)) { 2713 if (CC(vmcs12->vm_function_control & 2714 ~vmx->nested.msrs.vmfunc_controls)) 2715 return -EINVAL; 2716 2717 if (nested_cpu_has_eptp_switching(vmcs12)) { 2718 if (CC(!nested_cpu_has_ept(vmcs12)) || 2719 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2720 return -EINVAL; 2721 } 2722 } 2723 2724 return 0; 2725 } 2726 2727 /* 2728 * Checks related to VM-Exit Control Fields 2729 */ 2730 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2731 struct vmcs12 *vmcs12) 2732 { 2733 struct vcpu_vmx *vmx = to_vmx(vcpu); 2734 2735 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2736 vmx->nested.msrs.exit_ctls_low, 2737 vmx->nested.msrs.exit_ctls_high)) || 2738 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2739 return -EINVAL; 2740 2741 return 0; 2742 } 2743 2744 /* 2745 * Checks related to VM-Entry Control Fields 2746 */ 2747 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2748 struct vmcs12 *vmcs12) 2749 { 2750 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2751 2752 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2753 vmx->nested.msrs.entry_ctls_low, 2754 vmx->nested.msrs.entry_ctls_high))) 2755 return -EINVAL; 2756 2757 /* 2758 * From the Intel SDM, volume 3: 2759 * Fields relevant to VM-entry event injection must be set properly. 2760 * These fields are the VM-entry interruption-information field, the 2761 * VM-entry exception error code, and the VM-entry instruction length. 2762 */ 2763 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2764 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2765 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2766 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2767 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2768 bool should_have_error_code; 2769 bool urg = nested_cpu_has2(vmcs12, 2770 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2771 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2772 2773 /* VM-entry interruption-info field: interruption type */ 2774 if (CC(intr_type == INTR_TYPE_RESERVED) || 2775 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2776 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2777 return -EINVAL; 2778 2779 /* VM-entry interruption-info field: vector */ 2780 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2781 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2782 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2783 return -EINVAL; 2784 2785 /* VM-entry interruption-info field: deliver error code */ 2786 should_have_error_code = 2787 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2788 x86_exception_has_error_code(vector); 2789 if (CC(has_error_code != should_have_error_code)) 2790 return -EINVAL; 2791 2792 /* VM-entry exception error code */ 2793 if (CC(has_error_code && 2794 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2795 return -EINVAL; 2796 2797 /* VM-entry interruption-info field: reserved bits */ 2798 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2799 return -EINVAL; 2800 2801 /* VM-entry instruction length */ 2802 switch (intr_type) { 2803 case INTR_TYPE_SOFT_EXCEPTION: 2804 case INTR_TYPE_SOFT_INTR: 2805 case INTR_TYPE_PRIV_SW_EXCEPTION: 2806 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2807 CC(vmcs12->vm_entry_instruction_len == 0 && 2808 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2809 return -EINVAL; 2810 } 2811 } 2812 2813 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2814 return -EINVAL; 2815 2816 return 0; 2817 } 2818 2819 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2820 struct vmcs12 *vmcs12) 2821 { 2822 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2823 nested_check_vm_exit_controls(vcpu, vmcs12) || 2824 nested_check_vm_entry_controls(vcpu, vmcs12)) 2825 return -EINVAL; 2826 2827 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2828 return nested_evmcs_check_controls(vmcs12); 2829 2830 return 0; 2831 } 2832 2833 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2834 struct vmcs12 *vmcs12) 2835 { 2836 bool ia32e; 2837 2838 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2839 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2840 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2841 return -EINVAL; 2842 2843 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2844 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2845 return -EINVAL; 2846 2847 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2848 
CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2849 return -EINVAL; 2850 2851 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2852 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2853 vmcs12->host_ia32_perf_global_ctrl))) 2854 return -EINVAL; 2855 2856 #ifdef CONFIG_X86_64 2857 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2858 #else 2859 ia32e = false; 2860 #endif 2861 2862 if (ia32e) { 2863 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2864 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2865 return -EINVAL; 2866 } else { 2867 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2868 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2869 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2870 CC((vmcs12->host_rip) >> 32)) 2871 return -EINVAL; 2872 } 2873 2874 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2875 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2876 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2877 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2878 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2879 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2880 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2881 CC(vmcs12->host_cs_selector == 0) || 2882 CC(vmcs12->host_tr_selector == 0) || 2883 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2884 return -EINVAL; 2885 2886 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2887 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2888 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2889 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2890 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2891 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2892 return -EINVAL; 2893 2894 /* 2895 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2896 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2897 * the values of the LMA and LME bits in the field must each be that of 2898 * the host address-space size VM-exit control. 
2899 */ 2900 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2901 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2902 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2903 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2904 return -EINVAL; 2905 } 2906 2907 return 0; 2908 } 2909 2910 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2911 struct vmcs12 *vmcs12) 2912 { 2913 int r = 0; 2914 struct vmcs12 *shadow; 2915 struct kvm_host_map map; 2916 2917 if (vmcs12->vmcs_link_pointer == -1ull) 2918 return 0; 2919 2920 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2921 return -EINVAL; 2922 2923 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2924 return -EINVAL; 2925 2926 shadow = map.hva; 2927 2928 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2929 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2930 r = -EINVAL; 2931 2932 kvm_vcpu_unmap(vcpu, &map, false); 2933 return r; 2934 } 2935 2936 /* 2937 * Checks related to Guest Non-register State 2938 */ 2939 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2940 { 2941 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2942 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2943 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2944 return -EINVAL; 2945 2946 return 0; 2947 } 2948 2949 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2950 struct vmcs12 *vmcs12, 2951 enum vm_entry_failure_code *entry_failure_code) 2952 { 2953 bool ia32e; 2954 2955 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2956 2957 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2958 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2959 return -EINVAL; 2960 2961 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2962 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2963 return -EINVAL; 2964 2965 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2966 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2967 return -EINVAL; 2968 2969 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2970 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 2971 return -EINVAL; 2972 } 2973 2974 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2975 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2976 vmcs12->guest_ia32_perf_global_ctrl))) 2977 return -EINVAL; 2978 2979 /* 2980 * If the load IA32_EFER VM-entry control is 1, the following checks 2981 * are performed on the field for the IA32_EFER MSR: 2982 * - Bits reserved in the IA32_EFER MSR must be 0. 2983 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2984 * the IA-32e mode guest VM-exit control. It must also be identical 2985 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2986 * CR0.PG) is 1. 
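 * The check below is gated on nested_run_pending, i.e. it applies to an
 * actual VMLAUNCH/VMRESUME; once L2 has run, the EFER-related fields
 * may no longer reflect what was checked at entry.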
2987 */ 2988 if (to_vmx(vcpu)->nested.nested_run_pending && 2989 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2990 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2991 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 2992 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 2993 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 2994 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 2995 return -EINVAL; 2996 } 2997 2998 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2999 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3000 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3001 return -EINVAL; 3002 3003 if (nested_check_guest_non_reg_state(vmcs12)) 3004 return -EINVAL; 3005 3006 return 0; 3007 } 3008 3009 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3010 { 3011 struct vcpu_vmx *vmx = to_vmx(vcpu); 3012 unsigned long cr3, cr4; 3013 bool vm_fail; 3014 3015 if (!nested_early_check) 3016 return 0; 3017 3018 if (vmx->msr_autoload.host.nr) 3019 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3020 if (vmx->msr_autoload.guest.nr) 3021 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3022 3023 preempt_disable(); 3024 3025 vmx_prepare_switch_to_guest(vcpu); 3026 3027 /* 3028 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3029 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3030 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3031 * there is no need to preserve other bits or save/restore the field. 3032 */ 3033 vmcs_writel(GUEST_RFLAGS, 0); 3034 3035 cr3 = __get_current_cr3_fast(); 3036 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3037 vmcs_writel(HOST_CR3, cr3); 3038 vmx->loaded_vmcs->host_state.cr3 = cr3; 3039 } 3040 3041 cr4 = cr4_read_shadow(); 3042 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3043 vmcs_writel(HOST_CR4, cr4); 3044 vmx->loaded_vmcs->host_state.cr4 = cr4; 3045 } 3046 3047 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3048 vmx->loaded_vmcs->launched); 3049 3050 if (vmx->msr_autoload.host.nr) 3051 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3052 if (vmx->msr_autoload.guest.nr) 3053 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3054 3055 if (vm_fail) { 3056 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3057 3058 preempt_enable(); 3059 3060 trace_kvm_nested_vmenter_failed( 3061 "early hardware check VM-instruction error: ", error); 3062 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3063 return 1; 3064 } 3065 3066 /* 3067 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3068 */ 3069 if (hw_breakpoint_active()) 3070 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3071 local_irq_enable(); 3072 preempt_enable(); 3073 3074 /* 3075 * A non-failing VMEntry means we somehow entered guest mode with 3076 * an illegal RIP, and that's just the tip of the iceberg. There 3077 * is no telling what memory has been modified or what state has 3078 * been exposed to unknown code. Hitting this all but guarantees 3079 * a (very critical) hardware issue. 3080 */ 3081 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3082 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3083 3084 return 0; 3085 } 3086 3087 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3088 { 3089 struct vcpu_vmx *vmx = to_vmx(vcpu); 3090 3091 /* 3092 * hv_evmcs may end up being not mapped after migration (when 3093 * L2 was running), map it here to make sure vmcs12 changes are 3094 * properly reflected. 
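 * This path is reached via the KVM_REQ_GET_NESTED_STATE_PAGES request,
 * i.e. on the first KVM_RUN after nested (or SMM) state has been
 * restored, rather than on a normal nested VM-Enter.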
3095 */ 3096 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { 3097 enum nested_evmptrld_status evmptrld_status = 3098 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3099 3100 if (evmptrld_status == EVMPTRLD_VMFAIL || 3101 evmptrld_status == EVMPTRLD_ERROR) 3102 return false; 3103 } 3104 3105 return true; 3106 } 3107 3108 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3109 { 3110 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3111 struct vcpu_vmx *vmx = to_vmx(vcpu); 3112 struct kvm_host_map *map; 3113 struct page *page; 3114 u64 hpa; 3115 3116 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3117 /* 3118 * Translate L1 physical address to host physical 3119 * address for vmcs02. Keep the page pinned, so this 3120 * physical address remains valid. We keep a reference 3121 * to it so we can release it later. 3122 */ 3123 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3124 kvm_release_page_clean(vmx->nested.apic_access_page); 3125 vmx->nested.apic_access_page = NULL; 3126 } 3127 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3128 if (!is_error_page(page)) { 3129 vmx->nested.apic_access_page = page; 3130 hpa = page_to_phys(vmx->nested.apic_access_page); 3131 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3132 } else { 3133 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3134 __func__); 3135 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3136 vcpu->run->internal.suberror = 3137 KVM_INTERNAL_ERROR_EMULATION; 3138 vcpu->run->internal.ndata = 0; 3139 return false; 3140 } 3141 } 3142 3143 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3144 map = &vmx->nested.virtual_apic_map; 3145 3146 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3147 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3148 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3149 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3150 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3151 /* 3152 * The processor will never use the TPR shadow, simply 3153 * clear the bit from the execution control. Such a 3154 * configuration is useless, but it happens in tests. 3155 * For any other configuration, failing the vm entry is 3156 * _not_ what the processor does but it's basically the 3157 * only possibility we have. 3158 */ 3159 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3160 } else { 3161 /* 3162 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3163 * force VM-Entry to fail. 
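 * -1ull is neither 4KiB-aligned nor within the supported physical
 * address width, so the VM-Entry consistency checks will fail rather
 * than silently reusing a stale virtual-APIC mapping.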
3164 */ 3165 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); 3166 } 3167 } 3168 3169 if (nested_cpu_has_posted_intr(vmcs12)) { 3170 map = &vmx->nested.pi_desc_map; 3171 3172 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3173 vmx->nested.pi_desc = 3174 (struct pi_desc *)(((void *)map->hva) + 3175 offset_in_page(vmcs12->posted_intr_desc_addr)); 3176 vmcs_write64(POSTED_INTR_DESC_ADDR, 3177 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3178 } 3179 } 3180 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3181 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3182 else 3183 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3184 3185 return true; 3186 } 3187 3188 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3189 { 3190 if (!nested_get_evmcs_page(vcpu)) { 3191 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3192 __func__); 3193 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3194 vcpu->run->internal.suberror = 3195 KVM_INTERNAL_ERROR_EMULATION; 3196 vcpu->run->internal.ndata = 0; 3197 3198 return false; 3199 } 3200 3201 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3202 return false; 3203 3204 return true; 3205 } 3206 3207 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3208 { 3209 struct vmcs12 *vmcs12; 3210 struct vcpu_vmx *vmx = to_vmx(vcpu); 3211 gpa_t dst; 3212 3213 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3214 return 0; 3215 3216 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3217 return 1; 3218 3219 /* 3220 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3221 * set is already checked as part of A/D emulation. 3222 */ 3223 vmcs12 = get_vmcs12(vcpu); 3224 if (!nested_cpu_has_pml(vmcs12)) 3225 return 0; 3226 3227 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3228 vmx->nested.pml_full = true; 3229 return 1; 3230 } 3231 3232 gpa &= ~0xFFFull; 3233 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3234 3235 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3236 offset_in_page(dst), sizeof(gpa))) 3237 return 0; 3238 3239 vmcs12->guest_pml_index--; 3240 3241 return 0; 3242 } 3243 3244 /* 3245 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3246 * for running VMX instructions (except VMXON, whose prerequisites are 3247 * slightly different). It also specifies what exception to inject otherwise. 3248 * Note that many of these exceptions have priority over VM exits, so they 3249 * don't have to be checked again here. 3250 */ 3251 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3252 { 3253 if (!to_vmx(vcpu)->nested.vmxon) { 3254 kvm_queue_exception(vcpu, UD_VECTOR); 3255 return 0; 3256 } 3257 3258 if (vmx_get_cpl(vcpu)) { 3259 kvm_inject_gp(vcpu, 0); 3260 return 0; 3261 } 3262 3263 return 1; 3264 } 3265 3266 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3267 { 3268 u8 rvi = vmx_get_rvi(); 3269 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3270 3271 return ((rvi & 0xf0) > (vppr & 0xf0)); 3272 } 3273 3274 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3275 struct vmcs12 *vmcs12); 3276 3277 /* 3278 * If from_vmentry is false, this is being called from state restore (either RSM 3279 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
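 *
 * On a real VM-Enter the rough sequence is: switch to vmcs02,
 * prepare_vmcs02_early(), map the vmcs12 pages, run the optional early
 * hardware checks, check guest state, enter guest mode, prepare_vmcs02()
 * and finally process the VM-entry MSR-load list.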
3280 * 3281 * Returns: 3282 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3283 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3284 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3285 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3286 */ 3287 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3288 bool from_vmentry) 3289 { 3290 struct vcpu_vmx *vmx = to_vmx(vcpu); 3291 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3292 enum vm_entry_failure_code entry_failure_code; 3293 bool evaluate_pending_interrupts; 3294 union vmx_exit_reason exit_reason = { 3295 .basic = EXIT_REASON_INVALID_STATE, 3296 .failed_vmentry = 1, 3297 }; 3298 u32 failed_index; 3299 3300 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 3301 kvm_vcpu_flush_tlb_current(vcpu); 3302 3303 evaluate_pending_interrupts = exec_controls_get(vmx) & 3304 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3305 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3306 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3307 3308 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3309 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3310 if (kvm_mpx_supported() && 3311 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3312 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3313 3314 /* 3315 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3316 * nested early checks are disabled. In the event of a "late" VM-Fail, 3317 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3318 * software model to the pre-VMEntry host state. When EPT is disabled, 3319 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3320 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3321 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3322 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3323 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3324 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3325 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3326 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3327 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3328 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3329 */ 3330 if (!enable_ept && !nested_early_check) 3331 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3332 3333 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3334 3335 prepare_vmcs02_early(vmx, vmcs12); 3336 3337 if (from_vmentry) { 3338 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3339 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3340 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3341 } 3342 3343 if (nested_vmx_check_vmentry_hw(vcpu)) { 3344 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3345 return NVMX_VMENTRY_VMFAIL; 3346 } 3347 3348 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3349 &entry_failure_code)) { 3350 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3351 vmcs12->exit_qualification = entry_failure_code; 3352 goto vmentry_fail_vmexit; 3353 } 3354 } 3355 3356 enter_guest_mode(vcpu); 3357 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3358 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3359 3360 if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { 3361 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3362 vmcs12->exit_qualification = entry_failure_code; 3363 goto vmentry_fail_vmexit_guest_mode; 3364 } 3365 3366 if (from_vmentry) { 3367 failed_index = nested_vmx_load_msr(vcpu, 3368 vmcs12->vm_entry_msr_load_addr, 3369 vmcs12->vm_entry_msr_load_count); 3370 if (failed_index) { 3371 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3372 vmcs12->exit_qualification = failed_index; 3373 goto vmentry_fail_vmexit_guest_mode; 3374 } 3375 } else { 3376 /* 3377 * The MMU is not initialized to point at the right entities yet and 3378 * "get pages" would need to read data from the guest (i.e. we will 3379 * need to perform gpa to hpa translation). Request a call 3380 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3381 * have already been set at vmentry time and should not be reset. 3382 */ 3383 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3384 } 3385 3386 /* 3387 * If L1 had a pending IRQ/NMI until it executed 3388 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3389 * disallowed (e.g. interrupts disabled), L0 needs to 3390 * evaluate if this pending event should cause an exit from L2 3391 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3392 * intercept EXTERNAL_INTERRUPT). 3393 * 3394 * Usually this would be handled by the processor noticing an 3395 * IRQ/NMI window request, or checking RVI during evaluation of 3396 * pending virtual interrupts. However, this setting was done 3397 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3398 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3399 */ 3400 if (unlikely(evaluate_pending_interrupts)) 3401 kvm_make_request(KVM_REQ_EVENT, vcpu); 3402 3403 /* 3404 * Do not start the preemption timer hrtimer until after we know 3405 * we are successful, so that only nested_vmx_vmexit needs to cancel 3406 * the timer. 3407 */ 3408 vmx->nested.preemption_timer_expired = false; 3409 if (nested_cpu_has_preemption_timer(vmcs12)) { 3410 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3411 vmx_start_preemption_timer(vcpu, timer_value); 3412 } 3413 3414 /* 3415 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3416 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3417 * returned as far as L1 is concerned. It will only return (and set 3418 * the success flag) when L2 exits (see nested_vmx_vmexit()). 
3419 */ 3420 return NVMX_VMENTRY_SUCCESS; 3421 3422 /* 3423 * A failed consistency check that leads to a VMExit during L1's 3424 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3425 * 26.7 "VM-entry failures during or after loading guest state". 3426 */ 3427 vmentry_fail_vmexit_guest_mode: 3428 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3429 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3430 leave_guest_mode(vcpu); 3431 3432 vmentry_fail_vmexit: 3433 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3434 3435 if (!from_vmentry) 3436 return NVMX_VMENTRY_VMEXIT; 3437 3438 load_vmcs12_host_state(vcpu, vmcs12); 3439 vmcs12->vm_exit_reason = exit_reason.full; 3440 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3441 vmx->nested.need_vmcs12_to_shadow_sync = true; 3442 return NVMX_VMENTRY_VMEXIT; 3443 } 3444 3445 /* 3446 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3447 * for running an L2 nested guest. 3448 */ 3449 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3450 { 3451 struct vmcs12 *vmcs12; 3452 enum nvmx_vmentry_status status; 3453 struct vcpu_vmx *vmx = to_vmx(vcpu); 3454 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3455 enum nested_evmptrld_status evmptrld_status; 3456 3457 ++vcpu->stat.nested_run; 3458 3459 if (!nested_vmx_check_permission(vcpu)) 3460 return 1; 3461 3462 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3463 if (evmptrld_status == EVMPTRLD_ERROR) { 3464 kvm_queue_exception(vcpu, UD_VECTOR); 3465 return 1; 3466 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { 3467 return nested_vmx_failInvalid(vcpu); 3468 } 3469 3470 if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) 3471 return nested_vmx_failInvalid(vcpu); 3472 3473 vmcs12 = get_vmcs12(vcpu); 3474 3475 /* 3476 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3477 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3478 * rather than RFLAGS.ZF, and no error number is stored to the 3479 * VM-instruction error field. 3480 */ 3481 if (CC(vmcs12->hdr.shadow_vmcs)) 3482 return nested_vmx_failInvalid(vcpu); 3483 3484 if (vmx->nested.hv_evmcs) { 3485 copy_enlightened_to_vmcs12(vmx); 3486 /* Enlightened VMCS doesn't have launch state */ 3487 vmcs12->launch_state = !launch; 3488 } else if (enable_shadow_vmcs) { 3489 copy_shadow_to_vmcs12(vmx); 3490 } 3491 3492 /* 3493 * The nested entry process starts with enforcing various prerequisites 3494 * on vmcs12 as required by the Intel SDM, and act appropriately when 3495 * they fail: As the SDM explains, some conditions should cause the 3496 * instruction to fail, while others will cause the instruction to seem 3497 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3498 * To speed up the normal (success) code path, we should avoid checking 3499 * for misconfigurations which will anyway be caught by the processor 3500 * when using the merged vmcs02. 3501 */ 3502 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3503 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3504 3505 if (CC(vmcs12->launch_state == launch)) 3506 return nested_vmx_fail(vcpu, 3507 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3508 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3509 3510 if (nested_vmx_check_controls(vcpu, vmcs12)) 3511 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3512 3513 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3514 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3515 3516 /* 3517 * We're finally done with prerequisite checking, and can start with 3518 * the nested entry. 3519 */ 3520 vmx->nested.nested_run_pending = 1; 3521 vmx->nested.has_preemption_timer_deadline = false; 3522 status = nested_vmx_enter_non_root_mode(vcpu, true); 3523 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3524 goto vmentry_failed; 3525 3526 /* Emulate processing of posted interrupts on VM-Enter. */ 3527 if (nested_cpu_has_posted_intr(vmcs12) && 3528 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3529 vmx->nested.pi_pending = true; 3530 kvm_make_request(KVM_REQ_EVENT, vcpu); 3531 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3532 } 3533 3534 /* Hide L1D cache contents from the nested guest. */ 3535 vmx->vcpu.arch.l1tf_flush_l1d = true; 3536 3537 /* 3538 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3539 * also be used as part of restoring nVMX state for 3540 * snapshot restore (migration). 3541 * 3542 * In this flow, it is assumed that vmcs12 cache was 3543 * transferred as part of captured nVMX state and should 3544 * therefore not be read from guest memory (which may not 3545 * exist on destination host yet). 3546 */ 3547 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3548 3549 switch (vmcs12->guest_activity_state) { 3550 case GUEST_ACTIVITY_HLT: 3551 /* 3552 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3553 * awakened by event injection or by an NMI-window VM-exit or 3554 * by an interrupt-window VM-exit, halt the vcpu. 3555 */ 3556 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3557 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3558 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3559 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3560 vmx->nested.nested_run_pending = 0; 3561 return kvm_vcpu_halt(vcpu); 3562 } 3563 break; 3564 case GUEST_ACTIVITY_WAIT_SIPI: 3565 vmx->nested.nested_run_pending = 0; 3566 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3567 break; 3568 default: 3569 break; 3570 } 3571 3572 return 1; 3573 3574 vmentry_failed: 3575 vmx->nested.nested_run_pending = 0; 3576 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3577 return 0; 3578 if (status == NVMX_VMENTRY_VMEXIT) 3579 return 1; 3580 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3581 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3582 } 3583 3584 /* 3585 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3586 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3587 * This function returns the new value we should put in vmcs12.guest_cr0. 3588 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3589 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3590 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3591 * didn't trap the bit, because if L1 did, so would L0). 3592 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3593 * been modified by L2, and L1 knows it. So just leave the old value of 3594 * the bit from vmcs12.guest_cr0. 
Note that the bit from vmcs02 GUEST_CR0 3595 * isn't relevant, because if L0 traps this bit it can set it to anything. 3596 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3597 * changed these bits, and therefore they need to be updated, but L0 3598 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3599 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3600 */ 3601 static inline unsigned long 3602 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3603 { 3604 return 3605 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3606 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3607 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3608 vcpu->arch.cr0_guest_owned_bits)); 3609 } 3610 3611 static inline unsigned long 3612 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3613 { 3614 return 3615 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3616 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3617 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3618 vcpu->arch.cr4_guest_owned_bits)); 3619 } 3620 3621 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3622 struct vmcs12 *vmcs12) 3623 { 3624 u32 idt_vectoring; 3625 unsigned int nr; 3626 3627 if (vcpu->arch.exception.injected) { 3628 nr = vcpu->arch.exception.nr; 3629 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3630 3631 if (kvm_exception_is_soft(nr)) { 3632 vmcs12->vm_exit_instruction_len = 3633 vcpu->arch.event_exit_inst_len; 3634 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3635 } else 3636 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3637 3638 if (vcpu->arch.exception.has_error_code) { 3639 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3640 vmcs12->idt_vectoring_error_code = 3641 vcpu->arch.exception.error_code; 3642 } 3643 3644 vmcs12->idt_vectoring_info_field = idt_vectoring; 3645 } else if (vcpu->arch.nmi_injected) { 3646 vmcs12->idt_vectoring_info_field = 3647 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3648 } else if (vcpu->arch.interrupt.injected) { 3649 nr = vcpu->arch.interrupt.nr; 3650 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3651 3652 if (vcpu->arch.interrupt.soft) { 3653 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3654 vmcs12->vm_entry_instruction_len = 3655 vcpu->arch.event_exit_inst_len; 3656 } else 3657 idt_vectoring |= INTR_TYPE_EXT_INTR; 3658 3659 vmcs12->idt_vectoring_info_field = idt_vectoring; 3660 } 3661 } 3662 3663 3664 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3665 { 3666 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3667 gfn_t gfn; 3668 3669 /* 3670 * Don't need to mark the APIC access page dirty; it is never 3671 * written to by the CPU during APIC virtualization. 
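 * The virtual-APIC page and the posted-interrupt descriptor, on the
 * other hand, can be written by the CPU while L2 runs, so mark them
 * dirty below when they are in use.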
3672 */ 3673 3674 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3675 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3676 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3677 } 3678 3679 if (nested_cpu_has_posted_intr(vmcs12)) { 3680 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3681 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3682 } 3683 } 3684 3685 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3686 { 3687 struct vcpu_vmx *vmx = to_vmx(vcpu); 3688 int max_irr; 3689 void *vapic_page; 3690 u16 status; 3691 3692 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3693 return; 3694 3695 vmx->nested.pi_pending = false; 3696 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3697 return; 3698 3699 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3700 if (max_irr != 256) { 3701 vapic_page = vmx->nested.virtual_apic_map.hva; 3702 if (!vapic_page) 3703 return; 3704 3705 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3706 vapic_page, &max_irr); 3707 status = vmcs_read16(GUEST_INTR_STATUS); 3708 if ((u8)max_irr > ((u8)status & 0xff)) { 3709 status &= ~0xff; 3710 status |= (u8)max_irr; 3711 vmcs_write16(GUEST_INTR_STATUS, status); 3712 } 3713 } 3714 3715 nested_mark_vmcs12_pages_dirty(vcpu); 3716 } 3717 3718 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3719 unsigned long exit_qual) 3720 { 3721 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3722 unsigned int nr = vcpu->arch.exception.nr; 3723 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3724 3725 if (vcpu->arch.exception.has_error_code) { 3726 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3727 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3728 } 3729 3730 if (kvm_exception_is_soft(nr)) 3731 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3732 else 3733 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3734 3735 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3736 vmx_get_nmi_mask(vcpu)) 3737 intr_info |= INTR_INFO_UNBLOCK_NMI; 3738 3739 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3740 } 3741 3742 /* 3743 * Returns true if a debug trap is pending delivery. 3744 * 3745 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3746 * exception may be inferred from the presence of an exception payload. 3747 */ 3748 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3749 { 3750 return vcpu->arch.exception.pending && 3751 vcpu->arch.exception.nr == DB_VECTOR && 3752 vcpu->arch.exception.payload; 3753 } 3754 3755 /* 3756 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3757 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3758 * represents these debug traps with a payload that is said to be compatible 3759 * with the 'pending debug exceptions' field, write the payload to the VMCS 3760 * field if a VM-exit is delivered before the debug trap. 
3761 */ 3762 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3763 { 3764 if (vmx_pending_dbg_trap(vcpu)) 3765 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3766 vcpu->arch.exception.payload); 3767 } 3768 3769 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3770 { 3771 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3772 to_vmx(vcpu)->nested.preemption_timer_expired; 3773 } 3774 3775 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3776 { 3777 struct vcpu_vmx *vmx = to_vmx(vcpu); 3778 unsigned long exit_qual; 3779 bool block_nested_events = 3780 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3781 bool mtf_pending = vmx->nested.mtf_pending; 3782 struct kvm_lapic *apic = vcpu->arch.apic; 3783 3784 /* 3785 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3786 * this state is discarded. 3787 */ 3788 if (!block_nested_events) 3789 vmx->nested.mtf_pending = false; 3790 3791 if (lapic_in_kernel(vcpu) && 3792 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3793 if (block_nested_events) 3794 return -EBUSY; 3795 nested_vmx_update_pending_dbg(vcpu); 3796 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3797 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3798 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3799 return 0; 3800 } 3801 3802 if (lapic_in_kernel(vcpu) && 3803 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3804 if (block_nested_events) 3805 return -EBUSY; 3806 3807 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3808 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3809 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3810 apic->sipi_vector & 0xFFUL); 3811 return 0; 3812 } 3813 3814 /* 3815 * Process any exceptions that are not debug traps before MTF. 3816 * 3817 * Note that only a pending nested run can block a pending exception. 3818 * Otherwise an injected NMI/interrupt should either be 3819 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3820 * while delivering the pending exception. 
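 * The checks below run in this order: non-debug-trap exceptions first,
 * then Monitor Trap Flag, then the remaining (debug-trap) exceptions,
 * the VMX preemption timer, SMI, NMI and finally external interrupts.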
3821 */ 3822 3823 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3824 if (vmx->nested.nested_run_pending) 3825 return -EBUSY; 3826 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3827 goto no_vmexit; 3828 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3829 return 0; 3830 } 3831 3832 if (mtf_pending) { 3833 if (block_nested_events) 3834 return -EBUSY; 3835 nested_vmx_update_pending_dbg(vcpu); 3836 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3837 return 0; 3838 } 3839 3840 if (vcpu->arch.exception.pending) { 3841 if (vmx->nested.nested_run_pending) 3842 return -EBUSY; 3843 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3844 goto no_vmexit; 3845 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3846 return 0; 3847 } 3848 3849 if (nested_vmx_preemption_timer_pending(vcpu)) { 3850 if (block_nested_events) 3851 return -EBUSY; 3852 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3853 return 0; 3854 } 3855 3856 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3857 if (block_nested_events) 3858 return -EBUSY; 3859 goto no_vmexit; 3860 } 3861 3862 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3863 if (block_nested_events) 3864 return -EBUSY; 3865 if (!nested_exit_on_nmi(vcpu)) 3866 goto no_vmexit; 3867 3868 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3869 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3870 INTR_INFO_VALID_MASK, 0); 3871 /* 3872 * The NMI-triggered VM exit counts as injection: 3873 * clear this one and block further NMIs. 3874 */ 3875 vcpu->arch.nmi_pending = 0; 3876 vmx_set_nmi_mask(vcpu, true); 3877 return 0; 3878 } 3879 3880 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3881 if (block_nested_events) 3882 return -EBUSY; 3883 if (!nested_exit_on_intr(vcpu)) 3884 goto no_vmexit; 3885 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3886 return 0; 3887 } 3888 3889 no_vmexit: 3890 vmx_complete_nested_posted_interrupt(vcpu); 3891 return 0; 3892 } 3893 3894 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3895 { 3896 ktime_t remaining = 3897 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3898 u64 value; 3899 3900 if (ktime_to_ns(remaining) <= 0) 3901 return 0; 3902 3903 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3904 do_div(value, 1000000); 3905 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3906 } 3907 3908 static bool is_vmcs12_ext_field(unsigned long field) 3909 { 3910 switch (field) { 3911 case GUEST_ES_SELECTOR: 3912 case GUEST_CS_SELECTOR: 3913 case GUEST_SS_SELECTOR: 3914 case GUEST_DS_SELECTOR: 3915 case GUEST_FS_SELECTOR: 3916 case GUEST_GS_SELECTOR: 3917 case GUEST_LDTR_SELECTOR: 3918 case GUEST_TR_SELECTOR: 3919 case GUEST_ES_LIMIT: 3920 case GUEST_CS_LIMIT: 3921 case GUEST_SS_LIMIT: 3922 case GUEST_DS_LIMIT: 3923 case GUEST_FS_LIMIT: 3924 case GUEST_GS_LIMIT: 3925 case GUEST_LDTR_LIMIT: 3926 case GUEST_TR_LIMIT: 3927 case GUEST_GDTR_LIMIT: 3928 case GUEST_IDTR_LIMIT: 3929 case GUEST_ES_AR_BYTES: 3930 case GUEST_DS_AR_BYTES: 3931 case GUEST_FS_AR_BYTES: 3932 case GUEST_GS_AR_BYTES: 3933 case GUEST_LDTR_AR_BYTES: 3934 case GUEST_TR_AR_BYTES: 3935 case GUEST_ES_BASE: 3936 case GUEST_CS_BASE: 3937 case GUEST_SS_BASE: 3938 case GUEST_DS_BASE: 3939 case GUEST_FS_BASE: 3940 case GUEST_GS_BASE: 3941 case GUEST_LDTR_BASE: 3942 case GUEST_TR_BASE: 3943 case GUEST_GDTR_BASE: 3944 case GUEST_IDTR_BASE: 3945 case GUEST_PENDING_DBG_EXCEPTIONS: 3946 case GUEST_BNDCFGS: 3947 return true; 3948 default: 3949 break; 3950 } 3951 3952 
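	/*
	 * All other vmcs12 fields are either never modified by hardware or
	 * are synced back on every nested VM-Exit, so they never go stale
	 * and need no lazy sync.
	 */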
return false; 3953 } 3954 3955 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3956 struct vmcs12 *vmcs12) 3957 { 3958 struct vcpu_vmx *vmx = to_vmx(vcpu); 3959 3960 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3961 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3962 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3963 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3964 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3965 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3966 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3967 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3968 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3969 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3970 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3971 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3972 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3973 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3974 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3975 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3976 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3977 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3978 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3979 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3980 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3981 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3982 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3983 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3984 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3985 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3986 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3987 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3988 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3989 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3990 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3991 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3992 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3993 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3994 vmcs12->guest_pending_dbg_exceptions = 3995 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3996 if (kvm_mpx_supported()) 3997 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3998 3999 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4000 } 4001 4002 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4003 struct vmcs12 *vmcs12) 4004 { 4005 struct vcpu_vmx *vmx = to_vmx(vcpu); 4006 int cpu; 4007 4008 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4009 return; 4010 4011 4012 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4013 4014 cpu = get_cpu(); 4015 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4016 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4017 4018 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4019 4020 vmx->loaded_vmcs = &vmx->vmcs01; 4021 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4022 put_cpu(); 4023 } 4024 4025 /* 4026 * Update the guest state fields of vmcs12 to reflect changes that 4027 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4028 * VM-entry controls is also updated, since this is really a guest 4029 * state bit.) 
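 * Fields that L1 rarely accesses are handled by sync_vmcs02_to_vmcs12_rare()
 * and are normally synced lazily; they are copied right away below only when
 * an enlightened VMCS is in use.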
4030 */ 4031 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4032 { 4033 struct vcpu_vmx *vmx = to_vmx(vcpu); 4034 4035 if (vmx->nested.hv_evmcs) 4036 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4037 4038 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 4039 4040 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4041 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4042 4043 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4044 vmcs12->guest_rip = kvm_rip_read(vcpu); 4045 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4046 4047 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4048 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4049 4050 vmcs12->guest_interruptibility_info = 4051 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4052 4053 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4054 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4055 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4056 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4057 else 4058 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4059 4060 if (nested_cpu_has_preemption_timer(vmcs12) && 4061 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4062 !vmx->nested.nested_run_pending) 4063 vmcs12->vmx_preemption_timer_value = 4064 vmx_get_preemption_timer_value(vcpu); 4065 4066 /* 4067 * In some cases (usually, nested EPT), L2 is allowed to change its 4068 * own CR3 without exiting. If it has changed it, we must keep it. 4069 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4070 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4071 * 4072 * Additionally, restore L2's PDPTR to vmcs12. 4073 */ 4074 if (enable_ept) { 4075 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4076 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4077 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4078 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4079 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4080 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4081 } 4082 } 4083 4084 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4085 4086 if (nested_cpu_has_vid(vmcs12)) 4087 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4088 4089 vmcs12->vm_entry_controls = 4090 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4091 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4092 4093 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4094 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4095 4096 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4097 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4098 } 4099 4100 /* 4101 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4102 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4103 * and this function updates it to reflect the changes to the guest state while 4104 * L2 was running (and perhaps made some exits which were handled directly by L0 4105 * without going back to L1), and to reflect the exit reason. 4106 * Note that we do not have to copy here all VMCS fields, just those that 4107 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4108 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4109 * which already writes to vmcs12 directly. 
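 * On a VM-entry failure the exit-information fields are still updated, but
 * the launch state is not set and no pending event or guest MSRs are saved.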
4110 */ 4111 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4112 u32 vm_exit_reason, u32 exit_intr_info, 4113 unsigned long exit_qualification) 4114 { 4115 /* update exit information fields: */ 4116 vmcs12->vm_exit_reason = vm_exit_reason; 4117 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4118 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4119 vmcs12->exit_qualification = exit_qualification; 4120 vmcs12->vm_exit_intr_info = exit_intr_info; 4121 4122 vmcs12->idt_vectoring_info_field = 0; 4123 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4124 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4125 4126 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4127 vmcs12->launch_state = 1; 4128 4129 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4130 * instead of reading the real value. */ 4131 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4132 4133 /* 4134 * Transfer the event that L0 or L1 may have wanted to inject into 4135 * L2 to IDT_VECTORING_INFO_FIELD. 4136 */ 4137 vmcs12_save_pending_event(vcpu, vmcs12); 4138 4139 /* 4140 * According to spec, there's no need to store the guest's 4141 * MSRs if the exit is due to a VM-entry failure that occurs 4142 * during or after loading the guest state. Since this exit 4143 * does not fall in that category, we need to save the MSRs. 4144 */ 4145 if (nested_vmx_store_msr(vcpu, 4146 vmcs12->vm_exit_msr_store_addr, 4147 vmcs12->vm_exit_msr_store_count)) 4148 nested_vmx_abort(vcpu, 4149 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4150 } 4151 4152 /* 4153 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 4154 * preserved above and would only end up incorrectly in L1. 4155 */ 4156 vcpu->arch.nmi_injected = false; 4157 kvm_clear_exception_queue(vcpu); 4158 kvm_clear_interrupt_queue(vcpu); 4159 } 4160 4161 /* 4162 * Part of what we need to do when the nested L2 guest exits and we want to 4163 * run its L1 parent is to reset L1's guest state to the host state specified 4164 * in vmcs12. 4165 * This function is to be called not only on normal nested exit, but also on 4166 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4167 * Failures During or After Loading Guest State"). 4168 * This function should be called when the active VMCS is L1's (vmcs01). 4169 */ 4170 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4171 struct vmcs12 *vmcs12) 4172 { 4173 enum vm_entry_failure_code ignored; 4174 struct kvm_segment seg; 4175 4176 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4177 vcpu->arch.efer = vmcs12->host_ia32_efer; 4178 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4179 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4180 else 4181 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4182 vmx_set_efer(vcpu, vcpu->arch.efer); 4183 4184 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4185 kvm_rip_write(vcpu, vmcs12->host_rip); 4186 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4187 vmx_set_interrupt_shadow(vcpu, 0); 4188 4189 /* 4190 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4191 * actually changed, because vmx_set_cr0 refers to efer set above. 4192 * 4193 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4194 * (KVM doesn't change it). 4195 */ 4196 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4197 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4198 4199 /* Same as above - no reason to call set_cr4_guest_host_mask().
*/ 4200 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4201 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4202 4203 nested_ept_uninit_mmu_context(vcpu); 4204 4205 /* 4206 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4207 * couldn't have changed. 4208 */ 4209 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) 4210 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4211 4212 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4213 4214 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4215 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4216 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4217 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4218 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4219 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4220 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4221 4222 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4223 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4224 vmcs_write64(GUEST_BNDCFGS, 0); 4225 4226 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4227 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4228 vcpu->arch.pat = vmcs12->host_ia32_pat; 4229 } 4230 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4231 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4232 vmcs12->host_ia32_perf_global_ctrl)); 4233 4234 /* Set L1 segment info according to Intel SDM 4235 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4236 seg = (struct kvm_segment) { 4237 .base = 0, 4238 .limit = 0xFFFFFFFF, 4239 .selector = vmcs12->host_cs_selector, 4240 .type = 11, 4241 .present = 1, 4242 .s = 1, 4243 .g = 1 4244 }; 4245 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4246 seg.l = 1; 4247 else 4248 seg.db = 1; 4249 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4250 seg = (struct kvm_segment) { 4251 .base = 0, 4252 .limit = 0xFFFFFFFF, 4253 .type = 3, 4254 .present = 1, 4255 .s = 1, 4256 .db = 1, 4257 .g = 1 4258 }; 4259 seg.selector = vmcs12->host_ds_selector; 4260 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4261 seg.selector = vmcs12->host_es_selector; 4262 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4263 seg.selector = vmcs12->host_ss_selector; 4264 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4265 seg.selector = vmcs12->host_fs_selector; 4266 seg.base = vmcs12->host_fs_base; 4267 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4268 seg.selector = vmcs12->host_gs_selector; 4269 seg.base = vmcs12->host_gs_base; 4270 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4271 seg = (struct kvm_segment) { 4272 .base = vmcs12->host_tr_base, 4273 .limit = 0x67, 4274 .selector = vmcs12->host_tr_selector, 4275 .type = 11, 4276 .present = 1 4277 }; 4278 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4279 4280 kvm_set_dr(vcpu, 7, 0x400); 4281 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4282 4283 if (cpu_has_vmx_msr_bitmap()) 4284 vmx_update_msr_bitmap(vcpu); 4285 4286 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4287 vmcs12->vm_exit_msr_load_count)) 4288 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4289 } 4290 4291 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4292 { 4293 struct vmx_uret_msr *efer_msr; 4294 unsigned int i; 4295 4296 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4297 return vmcs_read64(GUEST_IA32_EFER); 4298 4299 if (cpu_has_load_ia32_efer()) 4300 return host_efer; 4301 4302 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4303 if 
(vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4304 return vmx->msr_autoload.guest.val[i].value; 4305 } 4306 4307 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4308 if (efer_msr) 4309 return efer_msr->data; 4310 4311 return host_efer; 4312 } 4313 4314 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4315 { 4316 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4317 struct vcpu_vmx *vmx = to_vmx(vcpu); 4318 struct vmx_msr_entry g, h; 4319 gpa_t gpa; 4320 u32 i, j; 4321 4322 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4323 4324 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4325 /* 4326 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4327 * as vmcs01.GUEST_DR7 contains a userspace defined value 4328 * and vcpu->arch.dr7 is not squirreled away before the 4329 * nested VMENTER (not worth adding a variable in nested_vmx). 4330 */ 4331 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4332 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4333 else 4334 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4335 } 4336 4337 /* 4338 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4339 * handle a variety of side effects to KVM's software model. 4340 */ 4341 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4342 4343 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4344 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4345 4346 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4347 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4348 4349 nested_ept_uninit_mmu_context(vcpu); 4350 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4351 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4352 4353 /* 4354 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4355 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4356 * VMFail, like everything else we just need to ensure our 4357 * software model is up-to-date. 4358 */ 4359 if (enable_ept && is_pae_paging(vcpu)) 4360 ept_save_pdptrs(vcpu); 4361 4362 kvm_mmu_reset_context(vcpu); 4363 4364 if (cpu_has_vmx_msr_bitmap()) 4365 vmx_update_msr_bitmap(vcpu); 4366 4367 /* 4368 * This nasty bit of open coding is a compromise between blindly 4369 * loading L1's MSRs using the exit load lists (incorrect emulation 4370 * of VMFail), leaving the nested VM's MSRs in the software model 4371 * (incorrect behavior) and snapshotting the modified MSRs (too 4372 * expensive since the lists are unbound by hardware). For each 4373 * MSR that was (prematurely) loaded from the nested VMEntry load 4374 * list, reload it from the exit load list if it exists and differs 4375 * from the guest value. The intent is to stuff host state as 4376 * silently as possible, not to fully process the exit load list. 
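 * Note that only an MSR that appears in both lists, and whose value
 * differs between the two, is actually written below.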
4377 */ 4378 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4379 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4380 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4381 pr_debug_ratelimited( 4382 "%s read MSR index failed (%u, 0x%08llx)\n", 4383 __func__, i, gpa); 4384 goto vmabort; 4385 } 4386 4387 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4388 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4389 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4390 pr_debug_ratelimited( 4391 "%s read MSR failed (%u, 0x%08llx)\n", 4392 __func__, j, gpa); 4393 goto vmabort; 4394 } 4395 if (h.index != g.index) 4396 continue; 4397 if (h.value == g.value) 4398 break; 4399 4400 if (nested_vmx_load_msr_check(vcpu, &h)) { 4401 pr_debug_ratelimited( 4402 "%s check failed (%u, 0x%x, 0x%x)\n", 4403 __func__, j, h.index, h.reserved); 4404 goto vmabort; 4405 } 4406 4407 if (kvm_set_msr(vcpu, h.index, h.value)) { 4408 pr_debug_ratelimited( 4409 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4410 __func__, j, h.index, h.value); 4411 goto vmabort; 4412 } 4413 } 4414 } 4415 4416 return; 4417 4418 vmabort: 4419 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4420 } 4421 4422 /* 4423 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4424 * and modify vmcs12 to make it see what it would expect to see there if 4425 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4426 */ 4427 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4428 u32 exit_intr_info, unsigned long exit_qualification) 4429 { 4430 struct vcpu_vmx *vmx = to_vmx(vcpu); 4431 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4432 4433 /* trying to cancel vmlaunch/vmresume is a bug */ 4434 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4435 4436 /* Similarly, triple faults in L2 should never escape. */ 4437 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); 4438 4439 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4440 /* 4441 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4442 * Enlightened VMCS after migration and we still need to 4443 * do that when something is forcing L2->L1 exit prior to 4444 * the first L2 run. 4445 */ 4446 (void)nested_get_evmcs_page(vcpu); 4447 } 4448 4449 /* Service the TLB flush request for L2 before switching to L1. */ 4450 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 4451 kvm_vcpu_flush_tlb_current(vcpu); 4452 4453 /* 4454 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4455 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4456 * up-to-date before switching to L1. 4457 */ 4458 if (enable_ept && is_pae_paging(vcpu)) 4459 vmx_ept_load_pdptrs(vcpu); 4460 4461 leave_guest_mode(vcpu); 4462 4463 if (nested_cpu_has_preemption_timer(vmcs12)) 4464 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4465 4466 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 4467 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4468 4469 if (likely(!vmx->fail)) { 4470 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4471 4472 if (vm_exit_reason != -1) 4473 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4474 exit_intr_info, exit_qualification); 4475 4476 /* 4477 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4478 * also be used to capture vmcs12 cache as part of 4479 * capturing nVMX state for snapshot (migration). 4480 * 4481 * Otherwise, this flush will dirty guest memory at a 4482 * point it is already assumed by user-space to be 4483 * immutable. 
4484 */ 4485 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4486 } else { 4487 /* 4488 * The only expected VM-instruction error is "VM entry with 4489 * invalid control field(s)." Anything else indicates a 4490 * problem with L0. And we should never get here with a 4491 * VMFail of any type if early consistency checks are enabled. 4492 */ 4493 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4494 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4495 WARN_ON_ONCE(nested_early_check); 4496 } 4497 4498 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4499 4500 /* Update any VMCS fields that might have changed while L2 ran */ 4501 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4502 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4503 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4504 if (vmx->nested.l1_tpr_threshold != -1) 4505 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4506 4507 if (kvm_has_tsc_control) 4508 decache_tsc_multiplier(vmx); 4509 4510 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4511 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4512 vmx_set_virtual_apic_mode(vcpu); 4513 } 4514 4515 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4516 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4517 vmx_update_cpu_dirty_logging(vcpu); 4518 } 4519 4520 /* Unpin physical memory we referred to in vmcs02 */ 4521 if (vmx->nested.apic_access_page) { 4522 kvm_release_page_clean(vmx->nested.apic_access_page); 4523 vmx->nested.apic_access_page = NULL; 4524 } 4525 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4526 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4527 vmx->nested.pi_desc = NULL; 4528 4529 if (vmx->nested.reload_vmcs01_apic_access_page) { 4530 vmx->nested.reload_vmcs01_apic_access_page = false; 4531 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4532 } 4533 4534 if ((vm_exit_reason != -1) && 4535 (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4536 vmx->nested.need_vmcs12_to_shadow_sync = true; 4537 4538 /* in case we halted in L2 */ 4539 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4540 4541 if (likely(!vmx->fail)) { 4542 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4543 nested_exit_intr_ack_set(vcpu)) { 4544 int irq = kvm_cpu_get_interrupt(vcpu); 4545 WARN_ON(irq < 0); 4546 vmcs12->vm_exit_intr_info = irq | 4547 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4548 } 4549 4550 if (vm_exit_reason != -1) 4551 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4552 vmcs12->exit_qualification, 4553 vmcs12->idt_vectoring_info_field, 4554 vmcs12->vm_exit_intr_info, 4555 vmcs12->vm_exit_intr_error_code, 4556 KVM_ISA_VMX); 4557 4558 load_vmcs12_host_state(vcpu, vmcs12); 4559 4560 return; 4561 } 4562 4563 /* 4564 * After an early L2 VM-entry failure, we're now back 4565 * in L1 which thinks it just finished a VMLAUNCH or 4566 * VMRESUME instruction, so we need to set the failure 4567 * flag and the VM-instruction error field of the VMCS 4568 * accordingly, and skip the emulated instruction. 4569 */ 4570 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4571 4572 /* 4573 * Restore L1's host state to KVM's software model. We're here 4574 * because a consistency check was caught by hardware, which 4575 * means some amount of guest state has been propagated to KVM's 4576 * model and needs to be unwound to the host's state. 
4577 */ 4578 nested_vmx_restore_host_state(vcpu); 4579 4580 vmx->fail = 0; 4581 } 4582 4583 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4584 { 4585 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4586 } 4587 4588 /* 4589 * Decode the memory-address operand of a vmx instruction, as recorded on an 4590 * exit caused by such an instruction (run by a guest hypervisor). 4591 * On success, returns 0. When the operand is invalid, returns 1 and throws 4592 * #UD, #GP, or #SS. 4593 */ 4594 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4595 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4596 { 4597 gva_t off; 4598 bool exn; 4599 struct kvm_segment s; 4600 4601 /* 4602 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4603 * Execution", on an exit, vmx_instruction_info holds most of the 4604 * addressing components of the operand. Only the displacement part 4605 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4606 * For how an actual address is calculated from all these components, 4607 * refer to Vol. 1, "Operand Addressing". 4608 */ 4609 int scaling = vmx_instruction_info & 3; 4610 int addr_size = (vmx_instruction_info >> 7) & 7; 4611 bool is_reg = vmx_instruction_info & (1u << 10); 4612 int seg_reg = (vmx_instruction_info >> 15) & 7; 4613 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4614 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4615 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4616 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4617 4618 if (is_reg) { 4619 kvm_queue_exception(vcpu, UD_VECTOR); 4620 return 1; 4621 } 4622 4623 /* Addr = segment_base + offset */ 4624 /* offset = base + [index * scale] + displacement */ 4625 off = exit_qualification; /* holds the displacement */ 4626 if (addr_size == 1) 4627 off = (gva_t)sign_extend64(off, 31); 4628 else if (addr_size == 0) 4629 off = (gva_t)sign_extend64(off, 15); 4630 if (base_is_valid) 4631 off += kvm_register_read(vcpu, base_reg); 4632 if (index_is_valid) 4633 off += kvm_register_read(vcpu, index_reg) << scaling; 4634 vmx_get_segment(vcpu, &s, seg_reg); 4635 4636 /* 4637 * The effective address, i.e. @off, of a memory operand is truncated 4638 * based on the address size of the instruction. Note that this is 4639 * the *effective address*, i.e. the address prior to accounting for 4640 * the segment's base. 4641 */ 4642 if (addr_size == 1) /* 32 bit */ 4643 off &= 0xffffffff; 4644 else if (addr_size == 0) /* 16 bit */ 4645 off &= 0xffff; 4646 4647 /* Checks for #GP/#SS exceptions. */ 4648 exn = false; 4649 if (is_long_mode(vcpu)) { 4650 /* 4651 * The virtual/linear address is never truncated in 64-bit 4652 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4653 * address when using FS/GS with a non-zero base. 4654 */ 4655 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4656 *ret = s.base + off; 4657 else 4658 *ret = off; 4659 4660 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4661 * non-canonical form. This is the only check on the memory 4662 * destination for long mode! 4663 */ 4664 exn = is_noncanonical_address(*ret, vcpu); 4665 } else { 4666 /* 4667 * When not in long mode, the virtual/linear address is 4668 * unconditionally truncated to 32 bits regardless of the 4669 * address size. 
4670 */ 4671 *ret = (s.base + off) & 0xffffffff; 4672 4673 /* Protected mode: apply checks for segment validity in the 4674 * following order: 4675 * - segment type check (#GP(0) may be thrown) 4676 * - usability check (#GP(0)/#SS(0)) 4677 * - limit check (#GP(0)/#SS(0)) 4678 */ 4679 if (wr) 4680 /* #GP(0) if the destination operand is located in a 4681 * read-only data segment or any code segment. 4682 */ 4683 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4684 else 4685 /* #GP(0) if the source operand is located in an 4686 * execute-only code segment 4687 */ 4688 exn = ((s.type & 0xa) == 8); 4689 if (exn) { 4690 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4691 return 1; 4692 } 4693 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4694 */ 4695 exn = (s.unusable != 0); 4696 4697 /* 4698 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4699 * outside the segment limit. All CPUs that support VMX ignore 4700 * limit checks for flat segments, i.e. segments with base==0, 4701 * limit==0xffffffff and of type expand-up data or code. 4702 */ 4703 if (!(s.base == 0 && s.limit == 0xffffffff && 4704 ((s.type & 8) || !(s.type & 4)))) 4705 exn = exn || ((u64)off + len - 1 > s.limit); 4706 } 4707 if (exn) { 4708 kvm_queue_exception_e(vcpu, 4709 seg_reg == VCPU_SREG_SS ? 4710 SS_VECTOR : GP_VECTOR, 4711 0); 4712 return 1; 4713 } 4714 4715 return 0; 4716 } 4717 4718 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 4719 { 4720 struct vcpu_vmx *vmx; 4721 4722 if (!nested_vmx_allowed(vcpu)) 4723 return; 4724 4725 vmx = to_vmx(vcpu); 4726 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { 4727 vmx->nested.msrs.entry_ctls_high |= 4728 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4729 vmx->nested.msrs.exit_ctls_high |= 4730 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4731 } else { 4732 vmx->nested.msrs.entry_ctls_high &= 4733 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4734 vmx->nested.msrs.exit_ctls_high &= 4735 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4736 } 4737 } 4738 4739 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4740 int *ret) 4741 { 4742 gva_t gva; 4743 struct x86_exception e; 4744 int r; 4745 4746 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4747 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4748 sizeof(*vmpointer), &gva)) { 4749 *ret = 1; 4750 return -EINVAL; 4751 } 4752 4753 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4754 if (r != X86EMUL_CONTINUE) { 4755 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4756 return -EINVAL; 4757 } 4758 4759 return 0; 4760 } 4761 4762 /* 4763 * Allocate a shadow VMCS and associate it with the currently loaded 4764 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4765 * VMCS is also VMCLEARed, so that it is ready for use. 4766 */ 4767 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4768 { 4769 struct vcpu_vmx *vmx = to_vmx(vcpu); 4770 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4771 4772 /* 4773 * We should allocate a shadow vmcs for vmcs01 only when L1 4774 * executes VMXON and free it when L1 executes VMXOFF. 4775 * As it is invalid to execute VMXON twice, we shouldn't reach 4776 * here when vmcs01 already have an allocated shadow vmcs. 
4777 */ 4778 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4779 4780 if (!loaded_vmcs->shadow_vmcs) { 4781 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4782 if (loaded_vmcs->shadow_vmcs) 4783 vmcs_clear(loaded_vmcs->shadow_vmcs); 4784 } 4785 return loaded_vmcs->shadow_vmcs; 4786 } 4787 4788 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4789 { 4790 struct vcpu_vmx *vmx = to_vmx(vcpu); 4791 int r; 4792 4793 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4794 if (r < 0) 4795 goto out_vmcs02; 4796 4797 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4798 if (!vmx->nested.cached_vmcs12) 4799 goto out_cached_vmcs12; 4800 4801 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4802 if (!vmx->nested.cached_shadow_vmcs12) 4803 goto out_cached_shadow_vmcs12; 4804 4805 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4806 goto out_shadow_vmcs; 4807 4808 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4809 HRTIMER_MODE_ABS_PINNED); 4810 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4811 4812 vmx->nested.vpid02 = allocate_vpid(); 4813 4814 vmx->nested.vmcs02_initialized = false; 4815 vmx->nested.vmxon = true; 4816 4817 if (vmx_pt_mode_is_host_guest()) { 4818 vmx->pt_desc.guest.ctl = 0; 4819 pt_update_intercept_for_msr(vcpu); 4820 } 4821 4822 return 0; 4823 4824 out_shadow_vmcs: 4825 kfree(vmx->nested.cached_shadow_vmcs12); 4826 4827 out_cached_shadow_vmcs12: 4828 kfree(vmx->nested.cached_vmcs12); 4829 4830 out_cached_vmcs12: 4831 free_loaded_vmcs(&vmx->nested.vmcs02); 4832 4833 out_vmcs02: 4834 return -ENOMEM; 4835 } 4836 4837 /* 4838 * Emulate the VMXON instruction. 4839 * Currently, we just remember that VMX is active, and do not save or even 4840 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4841 * do not currently need to store anything in that guest-allocated memory 4842 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their 4843 * argument is different from the VMXON pointer (which the spec says they do). 4844 */ 4845 static int handle_vmon(struct kvm_vcpu *vcpu) 4846 { 4847 int ret; 4848 gpa_t vmptr; 4849 uint32_t revision; 4850 struct vcpu_vmx *vmx = to_vmx(vcpu); 4851 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4852 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4853 4854 /* 4855 * The Intel VMX Instruction Reference lists a bunch of bits that are 4856 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4857 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). 4858 * Otherwise, we should fail with #UD. But most faulting conditions 4859 * have already been checked by hardware, prior to the VM-exit for 4860 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4861 * that bit set to 1 in non-root mode. 4862 */ 4863 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4864 kvm_queue_exception(vcpu, UD_VECTOR); 4865 return 1; 4866 } 4867 4868 /* CPL=0 must be checked manually.
*/ 4869 if (vmx_get_cpl(vcpu)) { 4870 kvm_inject_gp(vcpu, 0); 4871 return 1; 4872 } 4873 4874 if (vmx->nested.vmxon) 4875 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4876 4877 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4878 != VMXON_NEEDED_FEATURES) { 4879 kvm_inject_gp(vcpu, 0); 4880 return 1; 4881 } 4882 4883 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4884 return ret; 4885 4886 /* 4887 * SDM 3: 24.11.5 4888 * The first 4 bytes of the VMXON region contain the supported 4889 * VMCS revision identifier. 4890 * 4891 * Note - IA32_VMX_BASIC[48], which would limit the physical address 4892 * width to 32 bits, will never be 1 for the nested case. 4893 */ 4894 if (!page_address_valid(vcpu, vmptr)) 4895 return nested_vmx_failInvalid(vcpu); 4896 4897 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4898 revision != VMCS12_REVISION) 4899 return nested_vmx_failInvalid(vcpu); 4900 4901 vmx->nested.vmxon_ptr = vmptr; 4902 ret = enter_vmx_operation(vcpu); 4903 if (ret) 4904 return ret; 4905 4906 return nested_vmx_succeed(vcpu); 4907 } 4908 4909 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4910 { 4911 struct vcpu_vmx *vmx = to_vmx(vcpu); 4912 4913 if (vmx->nested.current_vmptr == -1ull) 4914 return; 4915 4916 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4917 4918 if (enable_shadow_vmcs) { 4919 /* copy to memory all shadowed fields in case 4920 they were modified */ 4921 copy_shadow_to_vmcs12(vmx); 4922 vmx_disable_shadow_vmcs(vmx); 4923 } 4924 vmx->nested.posted_intr_nv = -1; 4925 4926 /* Flush VMCS12 to guest memory */ 4927 kvm_vcpu_write_guest_page(vcpu, 4928 vmx->nested.current_vmptr >> PAGE_SHIFT, 4929 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4930 4931 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4932 4933 vmx->nested.current_vmptr = -1ull; 4934 } 4935 4936 /* Emulate the VMXOFF instruction */ 4937 static int handle_vmoff(struct kvm_vcpu *vcpu) 4938 { 4939 if (!nested_vmx_check_permission(vcpu)) 4940 return 1; 4941 4942 free_nested(vcpu); 4943 4944 /* Process a latched INIT during the time the CPU was in VMX operation */ 4945 kvm_make_request(KVM_REQ_EVENT, vcpu); 4946 4947 return nested_vmx_succeed(vcpu); 4948 } 4949 4950 /* Emulate the VMCLEAR instruction */ 4951 static int handle_vmclear(struct kvm_vcpu *vcpu) 4952 { 4953 struct vcpu_vmx *vmx = to_vmx(vcpu); 4954 u32 zero = 0; 4955 gpa_t vmptr; 4956 u64 evmcs_gpa; 4957 int r; 4958 4959 if (!nested_vmx_check_permission(vcpu)) 4960 return 1; 4961 4962 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 4963 return r; 4964 4965 if (!page_address_valid(vcpu, vmptr)) 4966 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 4967 4968 if (vmptr == vmx->nested.vmxon_ptr) 4969 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 4970 4971 /* 4972 * When Enlightened VMEntry is enabled on the calling CPU we treat the 4973 * memory area pointed to by vmptr as an Enlightened VMCS (as there's 4974 * no good way to distinguish it from a VMCS12) and we must not corrupt 4975 * it by writing to the non-existent 'launch_state' field. The area 4976 * doesn't have to be the currently active EVMCS on the calling CPU and 4977 * there's nothing KVM has to do to transition it from 'active' to 4978 * 'non-active' state. It is possible that the area will stay mapped as 4979 * vmx->nested.hv_evmcs but this shouldn't be a problem.
4980 */ 4981 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4982 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4983 if (vmptr == vmx->nested.current_vmptr) 4984 nested_release_vmcs12(vcpu); 4985 4986 kvm_vcpu_write_guest(vcpu, 4987 vmptr + offsetof(struct vmcs12, 4988 launch_state), 4989 &zero, sizeof(zero)); 4990 } 4991 4992 return nested_vmx_succeed(vcpu); 4993 } 4994 4995 /* Emulate the VMLAUNCH instruction */ 4996 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4997 { 4998 return nested_vmx_run(vcpu, true); 4999 } 5000 5001 /* Emulate the VMRESUME instruction */ 5002 static int handle_vmresume(struct kvm_vcpu *vcpu) 5003 { 5004 5005 return nested_vmx_run(vcpu, false); 5006 } 5007 5008 static int handle_vmread(struct kvm_vcpu *vcpu) 5009 { 5010 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5011 : get_vmcs12(vcpu); 5012 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5013 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5014 struct vcpu_vmx *vmx = to_vmx(vcpu); 5015 struct x86_exception e; 5016 unsigned long field; 5017 u64 value; 5018 gva_t gva = 0; 5019 short offset; 5020 int len, r; 5021 5022 if (!nested_vmx_check_permission(vcpu)) 5023 return 1; 5024 5025 /* 5026 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 5027 * any VMREAD sets the ALU flags for VMfailInvalid. 5028 */ 5029 if (vmx->nested.current_vmptr == -1ull || 5030 (is_guest_mode(vcpu) && 5031 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 5032 return nested_vmx_failInvalid(vcpu); 5033 5034 /* Decode instruction info and find the field to read */ 5035 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5036 5037 offset = vmcs_field_to_offset(field); 5038 if (offset < 0) 5039 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5040 5041 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5042 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5043 5044 /* Read the field, zero-extended to a u64 value */ 5045 value = vmcs12_read_any(vmcs12, field, offset); 5046 5047 /* 5048 * Now copy part of this value to register or memory, as requested. 5049 * Note that the number of bits actually copied is 32 or 64 depending 5050 * on the guest's mode (32 or 64 bit), not on the given field's length. 5051 */ 5052 if (instr_info & BIT(10)) { 5053 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5054 } else { 5055 len = is_64_bit_mode(vcpu) ? 8 : 4; 5056 if (get_vmx_mem_address(vcpu, exit_qualification, 5057 instr_info, true, len, &gva)) 5058 return 1; 5059 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5060 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5061 if (r != X86EMUL_CONTINUE) 5062 return kvm_handle_memory_failure(vcpu, r, &e); 5063 } 5064 5065 return nested_vmx_succeed(vcpu); 5066 } 5067 5068 static bool is_shadow_field_rw(unsigned long field) 5069 { 5070 switch (field) { 5071 #define SHADOW_FIELD_RW(x, y) case x: 5072 #include "vmcs_shadow_fields.h" 5073 return true; 5074 default: 5075 break; 5076 } 5077 return false; 5078 } 5079 5080 static bool is_shadow_field_ro(unsigned long field) 5081 { 5082 switch (field) { 5083 #define SHADOW_FIELD_RO(x, y) case x: 5084 #include "vmcs_shadow_fields.h" 5085 return true; 5086 default: 5087 break; 5088 } 5089 return false; 5090 } 5091 5092 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5093 { 5094 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? 
get_shadow_vmcs12(vcpu) 5095 : get_vmcs12(vcpu); 5096 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5097 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5098 struct vcpu_vmx *vmx = to_vmx(vcpu); 5099 struct x86_exception e; 5100 unsigned long field; 5101 short offset; 5102 gva_t gva; 5103 int len, r; 5104 5105 /* 5106 * The value to write might be 32 or 64 bits, depending on L1's long 5107 * mode, and eventually we need to write that into a field of several 5108 * possible lengths. The code below first zero-extends the value to 64 5109 * bit (value), and then copies only the appropriate number of 5110 * bits into the vmcs12 field. 5111 */ 5112 u64 value = 0; 5113 5114 if (!nested_vmx_check_permission(vcpu)) 5115 return 1; 5116 5117 /* 5118 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 5119 * any VMWRITE sets the ALU flags for VMfailInvalid. 5120 */ 5121 if (vmx->nested.current_vmptr == -1ull || 5122 (is_guest_mode(vcpu) && 5123 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 5124 return nested_vmx_failInvalid(vcpu); 5125 5126 if (instr_info & BIT(10)) 5127 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5128 else { 5129 len = is_64_bit_mode(vcpu) ? 8 : 4; 5130 if (get_vmx_mem_address(vcpu, exit_qualification, 5131 instr_info, false, len, &gva)) 5132 return 1; 5133 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5134 if (r != X86EMUL_CONTINUE) 5135 return kvm_handle_memory_failure(vcpu, r, &e); 5136 } 5137 5138 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5139 5140 offset = vmcs_field_to_offset(field); 5141 if (offset < 0) 5142 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5143 5144 /* 5145 * If the vCPU supports "VMWRITE to any supported field in the 5146 * VMCS," then the "read-only" fields are actually read/write. 5147 */ 5148 if (vmcs_field_readonly(field) && 5149 !nested_cpu_has_vmwrite_any_field(vcpu)) 5150 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5151 5152 /* 5153 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5154 * vmcs12, else we may crush a field or consume a stale value. 5155 */ 5156 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5157 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5158 5159 /* 5160 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5161 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5162 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5163 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5164 * from L1 will return a different value than VMREAD from L2 (L1 sees 5165 * the stripped down value, L2 sees the full value as stored by KVM). 5166 */ 5167 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5168 value &= 0x1f0ff; 5169 5170 vmcs12_write_any(vmcs12, field, offset, value); 5171 5172 /* 5173 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5174 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5175 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5176 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5177 */ 5178 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5179 /* 5180 * L1 can read these fields without exiting, ensure the 5181 * shadow VMCS is up-to-date. 
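 * This is done below by temporarily loading vmcs01's shadow VMCS and
 * vmwriting the new value into it, so that a subsequent non-intercepted
 * VMREAD by L1 observes the value just written.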
5182 */ 5183 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5184 preempt_disable(); 5185 vmcs_load(vmx->vmcs01.shadow_vmcs); 5186 5187 __vmcs_writel(field, value); 5188 5189 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5190 vmcs_load(vmx->loaded_vmcs->vmcs); 5191 preempt_enable(); 5192 } 5193 vmx->nested.dirty_vmcs12 = true; 5194 } 5195 5196 return nested_vmx_succeed(vcpu); 5197 } 5198 5199 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5200 { 5201 vmx->nested.current_vmptr = vmptr; 5202 if (enable_shadow_vmcs) { 5203 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5204 vmcs_write64(VMCS_LINK_POINTER, 5205 __pa(vmx->vmcs01.shadow_vmcs)); 5206 vmx->nested.need_vmcs12_to_shadow_sync = true; 5207 } 5208 vmx->nested.dirty_vmcs12 = true; 5209 } 5210 5211 /* Emulate the VMPTRLD instruction */ 5212 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5213 { 5214 struct vcpu_vmx *vmx = to_vmx(vcpu); 5215 gpa_t vmptr; 5216 int r; 5217 5218 if (!nested_vmx_check_permission(vcpu)) 5219 return 1; 5220 5221 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5222 return r; 5223 5224 if (!page_address_valid(vcpu, vmptr)) 5225 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5226 5227 if (vmptr == vmx->nested.vmxon_ptr) 5228 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5229 5230 /* Forbid normal VMPTRLD if Enlightened version was used */ 5231 if (vmx->nested.hv_evmcs) 5232 return 1; 5233 5234 if (vmx->nested.current_vmptr != vmptr) { 5235 struct kvm_host_map map; 5236 struct vmcs12 *new_vmcs12; 5237 5238 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 5239 /* 5240 * Reads from an unbacked page return all 1s, 5241 * which means that the 32 bits located at the 5242 * given physical address won't match the required 5243 * VMCS12_REVISION identifier. 5244 */ 5245 return nested_vmx_fail(vcpu, 5246 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5247 } 5248 5249 new_vmcs12 = map.hva; 5250 5251 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 5252 (new_vmcs12->hdr.shadow_vmcs && 5253 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5254 kvm_vcpu_unmap(vcpu, &map, false); 5255 return nested_vmx_fail(vcpu, 5256 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5257 } 5258 5259 nested_release_vmcs12(vcpu); 5260 5261 /* 5262 * Load VMCS12 from guest memory since it is not already 5263 * cached. 
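 * Subsequent VMREAD/VMWRITE emulation operates on this cached copy; it
 * is flushed back to guest memory when the VMCS is released, see
 * nested_release_vmcs12().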
5264 */ 5265 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 5266 kvm_vcpu_unmap(vcpu, &map, false); 5267 5268 set_current_vmptr(vmx, vmptr); 5269 } 5270 5271 return nested_vmx_succeed(vcpu); 5272 } 5273 5274 /* Emulate the VMPTRST instruction */ 5275 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5276 { 5277 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5278 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5279 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5280 struct x86_exception e; 5281 gva_t gva; 5282 int r; 5283 5284 if (!nested_vmx_check_permission(vcpu)) 5285 return 1; 5286 5287 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 5288 return 1; 5289 5290 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5291 true, sizeof(gpa_t), &gva)) 5292 return 1; 5293 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5294 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5295 sizeof(gpa_t), &e); 5296 if (r != X86EMUL_CONTINUE) 5297 return kvm_handle_memory_failure(vcpu, r, &e); 5298 5299 return nested_vmx_succeed(vcpu); 5300 } 5301 5302 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 5303 5304 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 5305 { 5306 return VALID_PAGE(root_hpa) && 5307 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 5308 } 5309 5310 /* Emulate the INVEPT instruction */ 5311 static int handle_invept(struct kvm_vcpu *vcpu) 5312 { 5313 struct vcpu_vmx *vmx = to_vmx(vcpu); 5314 u32 vmx_instruction_info, types; 5315 unsigned long type, roots_to_free; 5316 struct kvm_mmu *mmu; 5317 gva_t gva; 5318 struct x86_exception e; 5319 struct { 5320 u64 eptp, gpa; 5321 } operand; 5322 int i, r; 5323 5324 if (!(vmx->nested.msrs.secondary_ctls_high & 5325 SECONDARY_EXEC_ENABLE_EPT) || 5326 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5327 kvm_queue_exception(vcpu, UD_VECTOR); 5328 return 1; 5329 } 5330 5331 if (!nested_vmx_check_permission(vcpu)) 5332 return 1; 5333 5334 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5335 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); 5336 5337 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5338 5339 if (type >= 32 || !(types & (1 << type))) 5340 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5341 5342 /* According to the Intel VMX instruction reference, the memory 5343 * operand is read even if it isn't needed (e.g., for type==global) 5344 */ 5345 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5346 vmx_instruction_info, false, sizeof(operand), &gva)) 5347 return 1; 5348 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5349 if (r != X86EMUL_CONTINUE) 5350 return kvm_handle_memory_failure(vcpu, r, &e); 5351 5352 /* 5353 * Nested EPT roots are always held through guest_mmu, 5354 * not root_mmu. 
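 * (root_mmu handles L1, and also L2 when L1 does not enable EPT, so an
 * emulated INVEPT only needs to free roots cached in guest_mmu.)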
5355 */ 5356 mmu = &vcpu->arch.guest_mmu; 5357 5358 switch (type) { 5359 case VMX_EPT_EXTENT_CONTEXT: 5360 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5361 return nested_vmx_fail(vcpu, 5362 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5363 5364 roots_to_free = 0; 5365 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, 5366 operand.eptp)) 5367 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5368 5369 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5370 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5371 mmu->prev_roots[i].pgd, 5372 operand.eptp)) 5373 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5374 } 5375 break; 5376 case VMX_EPT_EXTENT_GLOBAL: 5377 roots_to_free = KVM_MMU_ROOTS_ALL; 5378 break; 5379 default: 5380 BUG(); 5381 break; 5382 } 5383 5384 if (roots_to_free) 5385 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 5386 5387 return nested_vmx_succeed(vcpu); 5388 } 5389 5390 static int handle_invvpid(struct kvm_vcpu *vcpu) 5391 { 5392 struct vcpu_vmx *vmx = to_vmx(vcpu); 5393 u32 vmx_instruction_info; 5394 unsigned long type, types; 5395 gva_t gva; 5396 struct x86_exception e; 5397 struct { 5398 u64 vpid; 5399 u64 gla; 5400 } operand; 5401 u16 vpid02; 5402 int r; 5403 5404 if (!(vmx->nested.msrs.secondary_ctls_high & 5405 SECONDARY_EXEC_ENABLE_VPID) || 5406 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5407 kvm_queue_exception(vcpu, UD_VECTOR); 5408 return 1; 5409 } 5410 5411 if (!nested_vmx_check_permission(vcpu)) 5412 return 1; 5413 5414 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5415 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); 5416 5417 types = (vmx->nested.msrs.vpid_caps & 5418 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5419 5420 if (type >= 32 || !(types & (1 << type))) 5421 return nested_vmx_fail(vcpu, 5422 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5423 5424 /* According to the Intel VMX instruction reference, the memory 5425 * operand is read even if it isn't needed (e.g., for type==global) 5426 */ 5427 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5428 vmx_instruction_info, false, sizeof(operand), &gva)) 5429 return 1; 5430 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5431 if (r != X86EMUL_CONTINUE) 5432 return kvm_handle_memory_failure(vcpu, r, &e); 5433 5434 if (operand.vpid >> 16) 5435 return nested_vmx_fail(vcpu, 5436 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5437 5438 vpid02 = nested_get_vpid02(vcpu); 5439 switch (type) { 5440 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5441 if (!operand.vpid || 5442 is_noncanonical_address(operand.gla, vcpu)) 5443 return nested_vmx_fail(vcpu, 5444 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5445 vpid_sync_vcpu_addr(vpid02, operand.gla); 5446 break; 5447 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5448 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5449 if (!operand.vpid) 5450 return nested_vmx_fail(vcpu, 5451 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5452 vpid_sync_context(vpid02); 5453 break; 5454 case VMX_VPID_EXTENT_ALL_CONTEXT: 5455 vpid_sync_context(vpid02); 5456 break; 5457 default: 5458 WARN_ON_ONCE(1); 5459 return kvm_skip_emulated_instruction(vcpu); 5460 } 5461 5462 /* 5463 * Sync the shadow page tables if EPT is disabled, as L1 is invalidating 5464 * linear mappings for L2 (tagged with L2's VPID). Free all roots as 5465 * VPIDs are not tracked in the MMU role. 5466 * 5467 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5468 * an MMU when EPT is disabled. 5469 * 5470 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5471 */ 5472 if (!enable_ept) 5473 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, 5474 KVM_MMU_ROOTS_ALL); 5475 5476 return nested_vmx_succeed(vcpu); 5477 } 5478 5479 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5480 struct vmcs12 *vmcs12) 5481 { 5482 u32 index = kvm_rcx_read(vcpu); 5483 u64 new_eptp; 5484 bool accessed_dirty; 5485 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 5486 5487 if (!nested_cpu_has_eptp_switching(vmcs12) || 5488 !nested_cpu_has_ept(vmcs12)) 5489 return 1; 5490 5491 if (index >= VMFUNC_EPTP_ENTRIES) 5492 return 1; 5493 5494 5495 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5496 &new_eptp, index * 8, 8)) 5497 return 1; 5498 5499 accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); 5500 5501 /* 5502 * If the (L2) guest does a vmfunc to the currently 5503 * active ept pointer, we don't have to do anything else 5504 */ 5505 if (vmcs12->ept_pointer != new_eptp) { 5506 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5507 return 1; 5508 5509 mmu->ept_ad = accessed_dirty; 5510 mmu->mmu_role.base.ad_disabled = !accessed_dirty; 5511 vmcs12->ept_pointer = new_eptp; 5512 5513 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); 5514 } 5515 5516 return 0; 5517 } 5518 5519 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5520 { 5521 struct vcpu_vmx *vmx = to_vmx(vcpu); 5522 struct vmcs12 *vmcs12; 5523 u32 function = kvm_rax_read(vcpu); 5524 5525 /* 5526 * VMFUNC is only supported for nested guests, but we always enable the 5527 * secondary control for simplicity; for non-nested mode, fake that we 5528 * didn't by injecting #UD. 5529 */ 5530 if (!is_guest_mode(vcpu)) { 5531 kvm_queue_exception(vcpu, UD_VECTOR); 5532 return 1; 5533 } 5534 5535 vmcs12 = get_vmcs12(vcpu); 5536 if ((vmcs12->vm_function_control & (1 << function)) == 0) 5537 goto fail; 5538 5539 switch (function) { 5540 case 0: 5541 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5542 goto fail; 5543 break; 5544 default: 5545 goto fail; 5546 } 5547 return kvm_skip_emulated_instruction(vcpu); 5548 5549 fail: 5550 /* 5551 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5552 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5553 * EXIT_REASON_VMFUNC as the exit reason. 5554 */ 5555 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5556 vmx_get_intr_info(vcpu), 5557 vmx_get_exit_qual(vcpu)); 5558 return 1; 5559 } 5560 5561 /* 5562 * Return true if an IO instruction with the specified port and size should cause 5563 * a VM-exit into L1. 
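 *
 * vmcs12's I/O bitmap A covers ports 0x0000-0x7fff and bitmap B covers
 * ports 0x8000-0xffff, one bit per port. The loop below checks each port
 * touched by the access, and an access that runs past port 0xffff
 * unconditionally exits.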
5564 */ 5565 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5566 int size) 5567 { 5568 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5569 gpa_t bitmap, last_bitmap; 5570 u8 b; 5571 5572 last_bitmap = (gpa_t)-1; 5573 b = -1; 5574 5575 while (size > 0) { 5576 if (port < 0x8000) 5577 bitmap = vmcs12->io_bitmap_a; 5578 else if (port < 0x10000) 5579 bitmap = vmcs12->io_bitmap_b; 5580 else 5581 return true; 5582 bitmap += (port & 0x7fff) / 8; 5583 5584 if (last_bitmap != bitmap) 5585 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5586 return true; 5587 if (b & (1 << (port & 7))) 5588 return true; 5589 5590 port++; 5591 size--; 5592 last_bitmap = bitmap; 5593 } 5594 5595 return false; 5596 } 5597 5598 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5599 struct vmcs12 *vmcs12) 5600 { 5601 unsigned long exit_qualification; 5602 unsigned short port; 5603 int size; 5604 5605 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5606 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5607 5608 exit_qualification = vmx_get_exit_qual(vcpu); 5609 5610 port = exit_qualification >> 16; 5611 size = (exit_qualification & 7) + 1; 5612 5613 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5614 } 5615 5616 /* 5617 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5618 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5619 * disinterest in the current event (read or write a specific MSR) by using an 5620 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5621 */ 5622 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5623 struct vmcs12 *vmcs12, 5624 union vmx_exit_reason exit_reason) 5625 { 5626 u32 msr_index = kvm_rcx_read(vcpu); 5627 gpa_t bitmap; 5628 5629 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5630 return true; 5631 5632 /* 5633 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5634 * for the four combinations of read/write and low/high MSR numbers. 5635 * First we need to figure out which of the four to use: 5636 */ 5637 bitmap = vmcs12->msr_bitmap; 5638 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5639 bitmap += 2048; 5640 if (msr_index >= 0xc0000000) { 5641 msr_index -= 0xc0000000; 5642 bitmap += 1024; 5643 } 5644 5645 /* Then read the msr_index'th bit from this bitmap: */ 5646 if (msr_index < 1024*8) { 5647 unsigned char b; 5648 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5649 return true; 5650 return 1 & (b >> (msr_index & 7)); 5651 } else 5652 return true; /* let L1 handle the wrong parameter */ 5653 } 5654 5655 /* 5656 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5657 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5658 * intercept (via guest_host_mask etc.) the current event. 
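 *
 * A bit set in cr0/cr4_guest_host_mask means the bit is owned by L1; a
 * MOV/CLTS/LMSW that would make such a bit differ from its read shadow
 * must be reflected, as must CR3/CR8 accesses that L1 intercepts via the
 * CPU-based execution controls.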
5659 */ 5660 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5661 struct vmcs12 *vmcs12) 5662 { 5663 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5664 int cr = exit_qualification & 15; 5665 int reg; 5666 unsigned long val; 5667 5668 switch ((exit_qualification >> 4) & 3) { 5669 case 0: /* mov to cr */ 5670 reg = (exit_qualification >> 8) & 15; 5671 val = kvm_register_read(vcpu, reg); 5672 switch (cr) { 5673 case 0: 5674 if (vmcs12->cr0_guest_host_mask & 5675 (val ^ vmcs12->cr0_read_shadow)) 5676 return true; 5677 break; 5678 case 3: 5679 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5680 return true; 5681 break; 5682 case 4: 5683 if (vmcs12->cr4_guest_host_mask & 5684 (vmcs12->cr4_read_shadow ^ val)) 5685 return true; 5686 break; 5687 case 8: 5688 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5689 return true; 5690 break; 5691 } 5692 break; 5693 case 2: /* clts */ 5694 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5695 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5696 return true; 5697 break; 5698 case 1: /* mov from cr */ 5699 switch (cr) { 5700 case 3: 5701 if (vmcs12->cpu_based_vm_exec_control & 5702 CPU_BASED_CR3_STORE_EXITING) 5703 return true; 5704 break; 5705 case 8: 5706 if (vmcs12->cpu_based_vm_exec_control & 5707 CPU_BASED_CR8_STORE_EXITING) 5708 return true; 5709 break; 5710 } 5711 break; 5712 case 3: /* lmsw */ 5713 /* 5714 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5715 * cr0. Other attempted changes are ignored, with no exit. 5716 */ 5717 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5718 if (vmcs12->cr0_guest_host_mask & 0xe & 5719 (val ^ vmcs12->cr0_read_shadow)) 5720 return true; 5721 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5722 !(vmcs12->cr0_read_shadow & 0x1) && 5723 (val & 0x1)) 5724 return true; 5725 break; 5726 } 5727 return false; 5728 } 5729 5730 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5731 struct vmcs12 *vmcs12) 5732 { 5733 u32 encls_leaf; 5734 5735 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5736 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5737 return false; 5738 5739 encls_leaf = kvm_rax_read(vcpu); 5740 if (encls_leaf > 62) 5741 encls_leaf = 63; 5742 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5743 } 5744 5745 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5746 struct vmcs12 *vmcs12, gpa_t bitmap) 5747 { 5748 u32 vmx_instruction_info; 5749 unsigned long field; 5750 u8 b; 5751 5752 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5753 return true; 5754 5755 /* Decode instruction info and find the field to access */ 5756 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5757 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5758 5759 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5760 if (field >> 15) 5761 return true; 5762 5763 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5764 return true; 5765 5766 return 1 & (b >> (field & 7)); 5767 } 5768 5769 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5770 { 5771 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5772 5773 if (nested_cpu_has_mtf(vmcs12)) 5774 return true; 5775 5776 /* 5777 * An MTF VM-exit may be injected into the guest by setting the 5778 * interruption-type to 7 (other event) and the vector field to 0. Such 5779 * is the case regardless of the 'monitor trap flag' VM-execution 5780 * control. 
5781 */ 5782 return entry_intr_info == (INTR_INFO_VALID_MASK 5783 | INTR_TYPE_OTHER_EVENT); 5784 } 5785 5786 /* 5787 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5788 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5789 */ 5790 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5791 union vmx_exit_reason exit_reason) 5792 { 5793 u32 intr_info; 5794 5795 switch ((u16)exit_reason.basic) { 5796 case EXIT_REASON_EXCEPTION_NMI: 5797 intr_info = vmx_get_intr_info(vcpu); 5798 if (is_nmi(intr_info)) 5799 return true; 5800 else if (is_page_fault(intr_info)) 5801 return vcpu->arch.apf.host_apf_flags || !enable_ept; 5802 else if (is_debug(intr_info) && 5803 vcpu->guest_debug & 5804 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5805 return true; 5806 else if (is_breakpoint(intr_info) && 5807 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5808 return true; 5809 return false; 5810 case EXIT_REASON_EXTERNAL_INTERRUPT: 5811 return true; 5812 case EXIT_REASON_MCE_DURING_VMENTRY: 5813 return true; 5814 case EXIT_REASON_EPT_VIOLATION: 5815 /* 5816 * L0 always deals with the EPT violation. If nested EPT is 5817 * used, and the nested mmu code discovers that the address is 5818 * missing in the guest EPT table (EPT12), the EPT violation 5819 * will be injected with nested_ept_inject_page_fault() 5820 */ 5821 return true; 5822 case EXIT_REASON_EPT_MISCONFIG: 5823 /* 5824 * L2 never uses directly L1's EPT, but rather L0's own EPT 5825 * table (shadow on EPT) or a merged EPT table that L0 built 5826 * (EPT on EPT). So any problems with the structure of the 5827 * table is L0's fault. 5828 */ 5829 return true; 5830 case EXIT_REASON_PREEMPTION_TIMER: 5831 return true; 5832 case EXIT_REASON_PML_FULL: 5833 /* 5834 * PML is emulated for an L1 VMM and should never be enabled in 5835 * vmcs02, always "handle" PML_FULL by exiting to userspace. 5836 */ 5837 return true; 5838 case EXIT_REASON_VMFUNC: 5839 /* VM functions are emulated through L2->L0 vmexits. */ 5840 return true; 5841 default: 5842 break; 5843 } 5844 return false; 5845 } 5846 5847 /* 5848 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 5849 * is_guest_mode (L2). 
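 * This is consulted only after nested_vmx_l0_wants_exit() has declined
 * the exit; returning false here means L0 handles the exit on L1's
 * behalf without reflecting it.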
5850 */ 5851 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 5852 union vmx_exit_reason exit_reason) 5853 { 5854 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5855 u32 intr_info; 5856 5857 switch ((u16)exit_reason.basic) { 5858 case EXIT_REASON_EXCEPTION_NMI: 5859 intr_info = vmx_get_intr_info(vcpu); 5860 if (is_nmi(intr_info)) 5861 return true; 5862 else if (is_page_fault(intr_info)) 5863 return true; 5864 return vmcs12->exception_bitmap & 5865 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5866 case EXIT_REASON_EXTERNAL_INTERRUPT: 5867 return nested_exit_on_intr(vcpu); 5868 case EXIT_REASON_TRIPLE_FAULT: 5869 return true; 5870 case EXIT_REASON_INTERRUPT_WINDOW: 5871 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5872 case EXIT_REASON_NMI_WINDOW: 5873 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5874 case EXIT_REASON_TASK_SWITCH: 5875 return true; 5876 case EXIT_REASON_CPUID: 5877 return true; 5878 case EXIT_REASON_HLT: 5879 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5880 case EXIT_REASON_INVD: 5881 return true; 5882 case EXIT_REASON_INVLPG: 5883 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5884 case EXIT_REASON_RDPMC: 5885 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5886 case EXIT_REASON_RDRAND: 5887 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5888 case EXIT_REASON_RDSEED: 5889 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5890 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5891 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5892 case EXIT_REASON_VMREAD: 5893 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5894 vmcs12->vmread_bitmap); 5895 case EXIT_REASON_VMWRITE: 5896 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5897 vmcs12->vmwrite_bitmap); 5898 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5899 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5900 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5901 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5902 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5903 /* 5904 * VMX instructions trap unconditionally. This allows L1 to 5905 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
5906 */ 5907 return true; 5908 case EXIT_REASON_CR_ACCESS: 5909 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5910 case EXIT_REASON_DR_ACCESS: 5911 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5912 case EXIT_REASON_IO_INSTRUCTION: 5913 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5914 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5915 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5916 case EXIT_REASON_MSR_READ: 5917 case EXIT_REASON_MSR_WRITE: 5918 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5919 case EXIT_REASON_INVALID_STATE: 5920 return true; 5921 case EXIT_REASON_MWAIT_INSTRUCTION: 5922 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5923 case EXIT_REASON_MONITOR_TRAP_FLAG: 5924 return nested_vmx_exit_handled_mtf(vmcs12); 5925 case EXIT_REASON_MONITOR_INSTRUCTION: 5926 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5927 case EXIT_REASON_PAUSE_INSTRUCTION: 5928 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5929 nested_cpu_has2(vmcs12, 5930 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5931 case EXIT_REASON_MCE_DURING_VMENTRY: 5932 return true; 5933 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5934 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5935 case EXIT_REASON_APIC_ACCESS: 5936 case EXIT_REASON_APIC_WRITE: 5937 case EXIT_REASON_EOI_INDUCED: 5938 /* 5939 * The controls for "virtualize APIC accesses," "APIC- 5940 * register virtualization," and "virtual-interrupt 5941 * delivery" only come from vmcs12. 5942 */ 5943 return true; 5944 case EXIT_REASON_INVPCID: 5945 return 5946 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5947 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5948 case EXIT_REASON_WBINVD: 5949 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5950 case EXIT_REASON_XSETBV: 5951 return true; 5952 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5953 /* 5954 * This should never happen, since it is not possible to 5955 * set XSS to a non-zero value---neither in L1 nor in L2. 5956 * If it were, XSS would have to be checked against 5957 * the XSS exit bitmap in vmcs12. 5958 */ 5959 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5960 case EXIT_REASON_UMWAIT: 5961 case EXIT_REASON_TPAUSE: 5962 return nested_cpu_has2(vmcs12, 5963 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 5964 case EXIT_REASON_ENCLS: 5965 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 5966 default: 5967 return true; 5968 } 5969 } 5970 5971 /* 5972 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 5973 * reflected into L1. 5974 */ 5975 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 5976 { 5977 struct vcpu_vmx *vmx = to_vmx(vcpu); 5978 union vmx_exit_reason exit_reason = vmx->exit_reason; 5979 unsigned long exit_qual; 5980 u32 exit_intr_info; 5981 5982 WARN_ON_ONCE(vmx->nested.nested_run_pending); 5983 5984 /* 5985 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 5986 * has already loaded L2's state. 5987 */ 5988 if (unlikely(vmx->fail)) { 5989 trace_kvm_nested_vmenter_failed( 5990 "hardware VM-instruction error: ", 5991 vmcs_read32(VM_INSTRUCTION_ERROR)); 5992 exit_intr_info = 0; 5993 exit_qual = 0; 5994 goto reflect_vmexit; 5995 } 5996 5997 trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); 5998 5999 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6000 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6001 return false; 6002 6003 /* If L1 doesn't want the exit, handle it in L0.
*/ 6004 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6005 return false; 6006 6007 /* 6008 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6009 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6010 * need to be synthesized by querying the in-kernel LAPIC, but external 6011 * interrupts are never reflected to L1 so it's a non-issue. 6012 */ 6013 exit_intr_info = vmx_get_intr_info(vcpu); 6014 if (is_exception_with_error_code(exit_intr_info)) { 6015 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6016 6017 vmcs12->vm_exit_intr_error_code = 6018 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6019 } 6020 exit_qual = vmx_get_exit_qual(vcpu); 6021 6022 reflect_vmexit: 6023 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6024 return true; 6025 } 6026 6027 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6028 struct kvm_nested_state __user *user_kvm_nested_state, 6029 u32 user_data_size) 6030 { 6031 struct vcpu_vmx *vmx; 6032 struct vmcs12 *vmcs12; 6033 struct kvm_nested_state kvm_state = { 6034 .flags = 0, 6035 .format = KVM_STATE_NESTED_FORMAT_VMX, 6036 .size = sizeof(kvm_state), 6037 .hdr.vmx.flags = 0, 6038 .hdr.vmx.vmxon_pa = -1ull, 6039 .hdr.vmx.vmcs12_pa = -1ull, 6040 .hdr.vmx.preemption_timer_deadline = 0, 6041 }; 6042 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6043 &user_kvm_nested_state->data.vmx[0]; 6044 6045 if (!vcpu) 6046 return kvm_state.size + sizeof(*user_vmx_nested_state); 6047 6048 vmx = to_vmx(vcpu); 6049 vmcs12 = get_vmcs12(vcpu); 6050 6051 if (nested_vmx_allowed(vcpu) && 6052 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6053 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6054 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6055 6056 if (vmx_has_valid_vmcs12(vcpu)) { 6057 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6058 6059 if (vmx->nested.hv_evmcs) 6060 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6061 6062 if (is_guest_mode(vcpu) && 6063 nested_cpu_has_shadow_vmcs(vmcs12) && 6064 vmcs12->vmcs_link_pointer != -1ull) 6065 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6066 } 6067 6068 if (vmx->nested.smm.vmxon) 6069 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6070 6071 if (vmx->nested.smm.guest_mode) 6072 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6073 6074 if (is_guest_mode(vcpu)) { 6075 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6076 6077 if (vmx->nested.nested_run_pending) 6078 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6079 6080 if (vmx->nested.mtf_pending) 6081 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6082 6083 if (nested_cpu_has_preemption_timer(vmcs12) && 6084 vmx->nested.has_preemption_timer_deadline) { 6085 kvm_state.hdr.vmx.flags |= 6086 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6087 kvm_state.hdr.vmx.preemption_timer_deadline = 6088 vmx->nested.preemption_timer_deadline; 6089 } 6090 } 6091 } 6092 6093 if (user_data_size < kvm_state.size) 6094 goto out; 6095 6096 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6097 return -EFAULT; 6098 6099 if (!vmx_has_valid_vmcs12(vcpu)) 6100 goto out; 6101 6102 /* 6103 * When running L2, the authoritative vmcs12 state is in the 6104 * vmcs02. When running L1, the authoritative vmcs12 state is 6105 * in the shadow or enlightened vmcs linked to vmcs01, unless 6106 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6107 * vmcs12 state is in the vmcs12 already. 
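 * The branches below sync from whichever location is authoritative into
 * the cached vmcs12 before it is copied out to userspace.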
6108 */ 6109 if (is_guest_mode(vcpu)) { 6110 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6111 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6112 } else { 6113 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6114 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6115 if (vmx->nested.hv_evmcs) 6116 copy_enlightened_to_vmcs12(vmx); 6117 else if (enable_shadow_vmcs) 6118 copy_shadow_to_vmcs12(vmx); 6119 } 6120 } 6121 6122 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6123 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6124 6125 /* 6126 * Copy over the full allocated size of vmcs12 rather than just the size 6127 * of the struct. 6128 */ 6129 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6130 return -EFAULT; 6131 6132 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6133 vmcs12->vmcs_link_pointer != -1ull) { 6134 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6135 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6136 return -EFAULT; 6137 } 6138 out: 6139 return kvm_state.size; 6140 } 6141 6142 /* 6143 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 6144 */ 6145 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6146 { 6147 if (is_guest_mode(vcpu)) { 6148 to_vmx(vcpu)->nested.nested_run_pending = 0; 6149 nested_vmx_vmexit(vcpu, -1, 0, 0); 6150 } 6151 free_nested(vcpu); 6152 } 6153 6154 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6155 struct kvm_nested_state __user *user_kvm_nested_state, 6156 struct kvm_nested_state *kvm_state) 6157 { 6158 struct vcpu_vmx *vmx = to_vmx(vcpu); 6159 struct vmcs12 *vmcs12; 6160 enum vm_entry_failure_code ignored; 6161 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6162 &user_kvm_nested_state->data.vmx[0]; 6163 int ret; 6164 6165 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6166 return -EINVAL; 6167 6168 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { 6169 if (kvm_state->hdr.vmx.smm.flags) 6170 return -EINVAL; 6171 6172 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) 6173 return -EINVAL; 6174 6175 /* 6176 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6177 * enable eVMCS capability on vCPU. However, since then 6178 * code was changed such that the flag signals that vmcs12 should 6179 * be copied into the eVMCS in guest memory. 6180 * 6181 * To preserve backwards compatibility, allow userspace 6182 * to set this flag even when there is no VMXON region. 6183 */ 6184 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6185 return -EINVAL; 6186 } else { 6187 if (!nested_vmx_allowed(vcpu)) 6188 return -EINVAL; 6189 6190 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6191 return -EINVAL; 6192 } 6193 6194 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6195 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6196 return -EINVAL; 6197 6198 if (kvm_state->hdr.vmx.smm.flags & 6199 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6200 return -EINVAL; 6201 6202 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6203 return -EINVAL; 6204 6205 /* 6206 * SMM temporarily disables VMX, so we cannot be in guest mode, 6207 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6208 * must be zero. 6209 */ 6210 if (is_smm(vcpu) ?
6211 (kvm_state->flags & 6212 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6213 : kvm_state->hdr.vmx.smm.flags) 6214 return -EINVAL; 6215 6216 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6217 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6218 return -EINVAL; 6219 6220 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6221 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6222 return -EINVAL; 6223 6224 vmx_leave_nested(vcpu); 6225 6226 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) 6227 return 0; 6228 6229 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6230 ret = enter_vmx_operation(vcpu); 6231 if (ret) 6232 return ret; 6233 6234 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6235 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6236 /* See vmx_has_valid_vmcs12. */ 6237 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6238 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6239 (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) 6240 return -EINVAL; 6241 else 6242 return 0; 6243 } 6244 6245 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { 6246 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6247 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6248 return -EINVAL; 6249 6250 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6251 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6252 /* 6253 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6254 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6255 * restored yet. EVMCS will be mapped from 6256 * nested_get_vmcs12_pages(). 6257 */ 6258 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6259 } else { 6260 return -EINVAL; 6261 } 6262 6263 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6264 vmx->nested.smm.vmxon = true; 6265 vmx->nested.vmxon = false; 6266 6267 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6268 vmx->nested.smm.guest_mode = true; 6269 } 6270 6271 vmcs12 = get_vmcs12(vcpu); 6272 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6273 return -EFAULT; 6274 6275 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6276 return -EINVAL; 6277 6278 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6279 return 0; 6280 6281 vmx->nested.nested_run_pending = 6282 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6283 6284 vmx->nested.mtf_pending = 6285 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6286 6287 ret = -EINVAL; 6288 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6289 vmcs12->vmcs_link_pointer != -1ull) { 6290 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6291 6292 if (kvm_state->size < 6293 sizeof(*kvm_state) + 6294 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6295 goto error_guest_mode; 6296 6297 if (copy_from_user(shadow_vmcs12, 6298 user_vmx_nested_state->shadow_vmcs12, 6299 sizeof(*shadow_vmcs12))) { 6300 ret = -EFAULT; 6301 goto error_guest_mode; 6302 } 6303 6304 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6305 !shadow_vmcs12->hdr.shadow_vmcs) 6306 goto error_guest_mode; 6307 } 6308 6309 vmx->nested.has_preemption_timer_deadline = false; 6310 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6311 vmx->nested.has_preemption_timer_deadline = true; 6312 vmx->nested.preemption_timer_deadline = 6313 kvm_state->hdr.vmx.preemption_timer_deadline; 6314 } 6315 6316 if (nested_vmx_check_controls(vcpu, vmcs12) || 6317 
nested_vmx_check_host_state(vcpu, vmcs12) || 6318 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6319 goto error_guest_mode; 6320 6321 vmx->nested.dirty_vmcs12 = true; 6322 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6323 if (ret) 6324 goto error_guest_mode; 6325 6326 return 0; 6327 6328 error_guest_mode: 6329 vmx->nested.nested_run_pending = 0; 6330 return ret; 6331 } 6332 6333 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6334 { 6335 if (enable_shadow_vmcs) { 6336 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6337 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6338 } 6339 } 6340 6341 /* 6342 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6343 * returned for the various VMX controls MSRs when nested VMX is enabled. 6344 * The same values should also be used to verify that vmcs12 control fields are 6345 * valid during nested entry from L1 to L2. 6346 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6347 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6348 * bit in the high half is on if the corresponding bit in the control field 6349 * may be on. See also vmx_control_verify(). 6350 */ 6351 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6352 { 6353 /* 6354 * Note that as a general rule, the high half of the MSRs (bits in 6355 * the control fields which may be 1) should be initialized by the 6356 * intersection of the underlying hardware's MSR (i.e., features which 6357 * can be supported) and the list of features we want to expose - 6358 * because they are known to be properly supported in our code. 6359 * Also, usually, the low half of the MSRs (bits which must be 1) can 6360 * be set to 0, meaning that L1 may turn off any of these bits. The 6361 * reason is that if one of these bits is necessary, it will appear 6362 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6363 * fields of vmcs01 and vmcs02, will turn these bits off - and 6364 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6365 * These rules have exceptions below. 6366 */ 6367 6368 /* pin-based controls */ 6369 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6370 msrs->pinbased_ctls_low, 6371 msrs->pinbased_ctls_high); 6372 msrs->pinbased_ctls_low |= 6373 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6374 msrs->pinbased_ctls_high &= 6375 PIN_BASED_EXT_INTR_MASK | 6376 PIN_BASED_NMI_EXITING | 6377 PIN_BASED_VIRTUAL_NMIS | 6378 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6379 msrs->pinbased_ctls_high |= 6380 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6381 PIN_BASED_VMX_PREEMPTION_TIMER; 6382 6383 /* exit controls */ 6384 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6385 msrs->exit_ctls_low, 6386 msrs->exit_ctls_high); 6387 msrs->exit_ctls_low = 6388 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6389 6390 msrs->exit_ctls_high &= 6391 #ifdef CONFIG_X86_64 6392 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6393 #endif 6394 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6395 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6396 msrs->exit_ctls_high |= 6397 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6398 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6399 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6400 6401 /* We support free control of debug control saving. 
*/ 6402 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6403 6404 /* entry controls */ 6405 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6406 msrs->entry_ctls_low, 6407 msrs->entry_ctls_high); 6408 msrs->entry_ctls_low = 6409 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6410 msrs->entry_ctls_high &= 6411 #ifdef CONFIG_X86_64 6412 VM_ENTRY_IA32E_MODE | 6413 #endif 6414 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 6415 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 6416 msrs->entry_ctls_high |= 6417 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6418 6419 /* We support free control of debug control loading. */ 6420 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6421 6422 /* cpu-based controls */ 6423 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6424 msrs->procbased_ctls_low, 6425 msrs->procbased_ctls_high); 6426 msrs->procbased_ctls_low = 6427 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6428 msrs->procbased_ctls_high &= 6429 CPU_BASED_INTR_WINDOW_EXITING | 6430 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6431 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6432 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6433 CPU_BASED_CR3_STORE_EXITING | 6434 #ifdef CONFIG_X86_64 6435 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6436 #endif 6437 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6438 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6439 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6440 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6441 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6442 /* 6443 * We can allow some features even when not supported by the 6444 * hardware. For example, L1 can specify an MSR bitmap - and we 6445 * can use it to avoid exits to L1 - even when L0 runs L2 6446 * without MSR bitmaps. 6447 */ 6448 msrs->procbased_ctls_high |= 6449 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6450 CPU_BASED_USE_MSR_BITMAPS; 6451 6452 /* We support free control of CR3 access interception. */ 6453 msrs->procbased_ctls_low &= 6454 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6455 6456 /* 6457 * secondary cpu-based controls. Do not include those that 6458 * depend on CPUID bits, they are added later by 6459 * vmx_vcpu_after_set_cpuid. 6460 */ 6461 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6462 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6463 msrs->secondary_ctls_low, 6464 msrs->secondary_ctls_high); 6465 6466 msrs->secondary_ctls_low = 0; 6467 msrs->secondary_ctls_high &= 6468 SECONDARY_EXEC_DESC | 6469 SECONDARY_EXEC_ENABLE_RDTSCP | 6470 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6471 SECONDARY_EXEC_WBINVD_EXITING | 6472 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6473 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6474 SECONDARY_EXEC_RDRAND_EXITING | 6475 SECONDARY_EXEC_ENABLE_INVPCID | 6476 SECONDARY_EXEC_RDSEED_EXITING | 6477 SECONDARY_EXEC_XSAVES; 6478 6479 /* 6480 * We can emulate "VMCS shadowing," even if the hardware 6481 * doesn't support it. 
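 * VMREAD/VMWRITE executed by L2 are always intercepted by L0, and
 * nested_vmx_exit_handled_vmcs_access() consults vmcs12's vmread/vmwrite
 * bitmaps to decide whether to emulate the access against the shadow
 * vmcs12 or to reflect the exit to L1.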
6482 */ 6483 msrs->secondary_ctls_high |= 6484 SECONDARY_EXEC_SHADOW_VMCS; 6485 6486 if (enable_ept) { 6487 /* nested EPT: emulate EPT also to L1 */ 6488 msrs->secondary_ctls_high |= 6489 SECONDARY_EXEC_ENABLE_EPT; 6490 msrs->ept_caps = 6491 VMX_EPT_PAGE_WALK_4_BIT | 6492 VMX_EPT_PAGE_WALK_5_BIT | 6493 VMX_EPTP_WB_BIT | 6494 VMX_EPT_INVEPT_BIT | 6495 VMX_EPT_EXECUTE_ONLY_BIT; 6496 6497 msrs->ept_caps &= ept_caps; 6498 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6499 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6500 VMX_EPT_1GB_PAGE_BIT; 6501 if (enable_ept_ad_bits) { 6502 msrs->secondary_ctls_high |= 6503 SECONDARY_EXEC_ENABLE_PML; 6504 msrs->ept_caps |= VMX_EPT_AD_BIT; 6505 } 6506 } 6507 6508 if (cpu_has_vmx_vmfunc()) { 6509 msrs->secondary_ctls_high |= 6510 SECONDARY_EXEC_ENABLE_VMFUNC; 6511 /* 6512 * Advertise EPTP switching unconditionally 6513 * since we emulate it 6514 */ 6515 if (enable_ept) 6516 msrs->vmfunc_controls = 6517 VMX_VMFUNC_EPTP_SWITCHING; 6518 } 6519 6520 /* 6521 * Old versions of KVM use the single-context version without 6522 * checking for support, so declare that it is supported even 6523 * though it is treated as global context. The alternative is 6524 * not failing the single-context invvpid, and it is worse. 6525 */ 6526 if (enable_vpid) { 6527 msrs->secondary_ctls_high |= 6528 SECONDARY_EXEC_ENABLE_VPID; 6529 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6530 VMX_VPID_EXTENT_SUPPORTED_MASK; 6531 } 6532 6533 if (enable_unrestricted_guest) 6534 msrs->secondary_ctls_high |= 6535 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6536 6537 if (flexpriority_enabled) 6538 msrs->secondary_ctls_high |= 6539 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6540 6541 if (enable_sgx) 6542 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6543 6544 /* miscellaneous data */ 6545 rdmsr(MSR_IA32_VMX_MISC, 6546 msrs->misc_low, 6547 msrs->misc_high); 6548 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6549 msrs->misc_low |= 6550 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6551 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6552 VMX_MISC_ACTIVITY_HLT | 6553 VMX_MISC_ACTIVITY_WAIT_SIPI; 6554 msrs->misc_high = 0; 6555 6556 /* 6557 * This MSR reports some information about VMX support. We 6558 * should return information about the VMX we emulate for the 6559 * guest, and the VMCS structure we give it - not about the 6560 * VMX support of the underlying hardware. 6561 */ 6562 msrs->basic = 6563 VMCS12_REVISION | 6564 VMX_BASIC_TRUE_CTLS | 6565 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6566 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6567 6568 if (cpu_has_vmx_basic_inout()) 6569 msrs->basic |= VMX_BASIC_INOUT; 6570 6571 /* 6572 * These MSRs specify bits which the guest must keep fixed on 6573 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6574 * We picked the standard core2 setting. 6575 */ 6576 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6577 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6578 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6579 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6580 6581 /* These MSRs specify bits which the guest must keep fixed off. 
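 * (a bit that is 0 in CR0_FIXED1/CR4_FIXED1 must also be 0 in CR0/CR4
 * while in VMX operation; bits that are 1 there are not constrained by
 * these MSRs).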
*/ 6582 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6583 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6584 6585 /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 6586 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; 6587 } 6588 6589 void nested_vmx_hardware_unsetup(void) 6590 { 6591 int i; 6592 6593 if (enable_shadow_vmcs) { 6594 for (i = 0; i < VMX_BITMAP_NR; i++) 6595 free_page((unsigned long)vmx_bitmap[i]); 6596 } 6597 } 6598 6599 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6600 { 6601 int i; 6602 6603 if (!cpu_has_vmx_shadow_vmcs()) 6604 enable_shadow_vmcs = 0; 6605 if (enable_shadow_vmcs) { 6606 for (i = 0; i < VMX_BITMAP_NR; i++) { 6607 /* 6608 * The vmx_bitmap is not tied to a VM and so should 6609 * not be charged to a memcg. 6610 */ 6611 vmx_bitmap[i] = (unsigned long *) 6612 __get_free_page(GFP_KERNEL); 6613 if (!vmx_bitmap[i]) { 6614 nested_vmx_hardware_unsetup(); 6615 return -ENOMEM; 6616 } 6617 } 6618 6619 init_vmcs_shadow_fields(); 6620 } 6621 6622 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6623 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6624 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6625 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6626 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6627 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6628 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6629 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6630 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6631 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6632 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6633 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6634 6635 return 0; 6636 } 6637 6638 struct kvm_x86_nested_ops vmx_nested_ops = { 6639 .check_events = vmx_check_nested_events, 6640 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6641 .triple_fault = nested_vmx_triple_fault, 6642 .get_state = vmx_get_nested_state, 6643 .set_state = vmx_set_nested_state, 6644 .get_nested_state_pages = vmx_get_nested_state_pages, 6645 .write_log_dirty = nested_vmx_write_pml_buffer, 6646 .enable_evmcs = nested_enable_evmcs, 6647 .get_evmcs_version = nested_get_evmcs_version, 6648 }; 6649