// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>
#include <asm/msr.h>

#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
	vmx->nested.hv_evmcs = NULL;
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's
	 * no good way to distinguish it from VMCS12) and we must not corrupt
	 * it by writing to the non-existent 'launch_state' field. The area
	 * doesn't have to be the currently active EVMCS on the calling CPU
	 * and there's nothing KVM has to do to transition it from 'active'
	 * to 'non-active' state. It is possible that the area will stay
	 * mapped as vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (!guest_cpu_cap_has_evmcs(vcpu) ||
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
		return false;

	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
		nested_release_evmcs(vcpu);

	return true;
#else
	return false;
#endif
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->vt.guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
	vmx->nested.pi_desc = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;

	nested_put_vmcs12_pages(vcpu);

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	u32 vm_exit_reason;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;

		/*
		 * It should be impossible to trigger a nested PML Full VM-Exit
		 * for anything other than an EPT Violation from L2. KVM *can*
		 * trigger nEPT page fault injection in response to an EPT
		 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
		 * tables also changed, but KVM should not treat EPT Misconfig
		 * VM-Exits as writes.
		 */
		WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);

		/*
		 * PML Full and EPT Violation VM-Exits both use bit 12 to report
		 * "NMI unblocking due to IRET", i.e. the bit can be propagated
		 * as-is from the original EXIT_QUALIFICATION.
		 */
		exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK) {
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
			exit_qualification = 0;
		} else {
			exit_qualification = fault->exit_qualification;
			exit_qualification |= vmx_get_exit_qual(vcpu) &
					      (EPT_VIOLATION_GVA_IS_VALID |
					       EPT_VIOLATION_GVA_TRANSLATED);
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
		}

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check. All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code. Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
 * itself utilizing x2APIC. All MSRs were previously set to be intercepted,
 * so only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmaps; return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
			return true;
	}

	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map.hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_APERF, MSR_TYPE_R);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_MPERF, MSR_TYPE_R);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_U_CET, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_S_CET, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL0_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL1_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL2_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL3_SSP, MSR_TYPE_RW);

	kvm_vcpu_unmap(vcpu, &map);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * Bits 15:8 should be zero in posted_intr_nv; the descriptor address
	 * has already been checked in nested_get_vmcs12_pages.
	 *
	 * Bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	/*
	 * Exceeding the limit results in architecturally _undefined_ behavior,
	 * i.e. KVM is allowed to do literally anything in response to a bad
	 * limit. Immediately generate a consistency check so that code that
	 * consumes the count doesn't need to worry about extreme edge cases.
	 */
	if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
		return -EINVAL;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity.
 * To maintain compatibility with hardware as much as possible, process all
 * valid entries before failing rather than precheck for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_emulate_msr_write(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_emulate_msr_read(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() by reading KVM's
			 * internal MSR state instead of reading the value from
			 * the vmcs02 VMExit MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host, and so architecturally, linear and combined
	 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
	 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
	 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
	 * is required if VPID is disabled in KVM, as a TLB flush (there are no
	 * VPIDs) still occurs from L1's perspective, and KVM may need to
	 * synchronize the MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled. As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context. I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
				 VMX_BASIC_INOUT |
				 VMX_BASIC_TRUE_CTLS |
				 VMX_BASIC_NO_HW_ERROR_CODE_CC;

	const u64 reserved_bits = GENMASK_ULL(63, 57) |
				  GENMASK_ULL(47, 45) |
				  BIT_ULL(31);

	u64 vmx_basic = vmcs_config.nested.basic;

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
	 * inverted polarity), the incoming value must not set feature bits or
	 * reserved bits that aren't allowed/supported by KVM. Fields, i.e.
	 * multi-bit values, are explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
				 VMX_MISC_ACTIVITY_HLT |
				 VMX_MISC_ACTIVITY_SHUTDOWN |
				 VMX_MISC_ACTIVITY_WAIT_SIPI |
				 VMX_MISC_INTEL_PT |
				 VMX_MISC_RDMSR_IN_SMM |
				 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
				 VMX_MISC_VMXOFF_BLOCK_SMI |
				 VMX_MISC_ZERO_LEN_INS;

	const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);

	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * The incoming value must not set feature bits or reserved bits that
	 * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
	 * explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicate bits that "must be 1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest. Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
evmcs->guest_es_ar_bytes; 1815 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1816 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1817 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1818 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1819 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1820 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1821 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1822 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1823 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1824 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1825 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1826 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1827 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1828 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1829 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1830 } 1831 1832 if (unlikely(!(hv_clean_fields & 1833 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1834 vmcs12->tsc_offset = evmcs->tsc_offset; 1835 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1836 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1837 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1838 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1839 } 1840 1841 if (unlikely(!(hv_clean_fields & 1842 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1843 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1844 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1845 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1846 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1847 vmcs12->guest_cr0 = evmcs->guest_cr0; 1848 vmcs12->guest_cr3 = evmcs->guest_cr3; 1849 vmcs12->guest_cr4 = evmcs->guest_cr4; 1850 vmcs12->guest_dr7 = evmcs->guest_dr7; 1851 } 1852 1853 if (unlikely(!(hv_clean_fields & 1854 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1855 vmcs12->host_fs_base = evmcs->host_fs_base; 1856 vmcs12->host_gs_base = evmcs->host_gs_base; 1857 vmcs12->host_tr_base = evmcs->host_tr_base; 1858 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1859 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1860 vmcs12->host_rsp = evmcs->host_rsp; 1861 } 1862 1863 if (unlikely(!(hv_clean_fields & 1864 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1865 vmcs12->ept_pointer = evmcs->ept_pointer; 1866 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1867 } 1868 1869 if (unlikely(!(hv_clean_fields & 1870 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1871 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1872 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1873 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1874 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1875 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1876 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1877 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1878 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1879 vmcs12->guest_pending_dbg_exceptions = 1880 evmcs->guest_pending_dbg_exceptions; 1881 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1882 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1883 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1884 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1885 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1886 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1887 /* 1888 * Not present in struct vmcs12: 1889 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1890 * 
vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1891 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1892 */ 1893 } 1894 1895 /* 1896 * Not used? 1897 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1898 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1899 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1900 * vmcs12->page_fault_error_code_mask = 1901 * evmcs->page_fault_error_code_mask; 1902 * vmcs12->page_fault_error_code_match = 1903 * evmcs->page_fault_error_code_match; 1904 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1905 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1906 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1907 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1908 */ 1909 1910 /* 1911 * Read only fields: 1912 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1913 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1914 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1915 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1916 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1917 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1918 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1919 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1920 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1921 * vmcs12->exit_qualification = evmcs->exit_qualification; 1922 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1923 * 1924 * Not present in struct vmcs12: 1925 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1926 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1927 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1928 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1929 */ 1930 1931 return; 1932 #else /* CONFIG_KVM_HYPERV */ 1933 KVM_BUG_ON(1, vmx->vcpu.kvm); 1934 #endif /* CONFIG_KVM_HYPERV */ 1935 } 1936 1937 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1938 { 1939 #ifdef CONFIG_KVM_HYPERV 1940 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1941 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1942 1943 /* 1944 * Should not be changed by KVM: 1945 * 1946 * evmcs->host_es_selector = vmcs12->host_es_selector; 1947 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1948 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1949 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1950 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1951 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1952 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1953 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1954 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1955 * evmcs->host_cr0 = vmcs12->host_cr0; 1956 * evmcs->host_cr3 = vmcs12->host_cr3; 1957 * evmcs->host_cr4 = vmcs12->host_cr4; 1958 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1959 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1960 * evmcs->host_rip = vmcs12->host_rip; 1961 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1962 * evmcs->host_fs_base = vmcs12->host_fs_base; 1963 * evmcs->host_gs_base = vmcs12->host_gs_base; 1964 * evmcs->host_tr_base = vmcs12->host_tr_base; 1965 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1966 * evmcs->host_idtr_base = 
vmcs12->host_idtr_base; 1967 * evmcs->host_rsp = vmcs12->host_rsp; 1968 * sync_vmcs02_to_vmcs12() doesn't read these: 1969 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1970 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1971 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1972 * evmcs->ept_pointer = vmcs12->ept_pointer; 1973 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1974 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1975 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1976 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1977 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1978 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1979 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1980 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1981 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1982 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1983 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1984 * evmcs->page_fault_error_code_mask = 1985 * vmcs12->page_fault_error_code_mask; 1986 * evmcs->page_fault_error_code_match = 1987 * vmcs12->page_fault_error_code_match; 1988 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1989 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1990 * evmcs->tsc_offset = vmcs12->tsc_offset; 1991 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1992 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1993 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1994 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1995 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1996 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1997 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1998 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1999 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 2000 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 2001 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 2002 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 2003 * 2004 * Not present in struct vmcs12: 2005 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 2006 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 2007 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 2008 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 2009 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 2010 * evmcs->host_ssp = vmcs12->host_ssp; 2011 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 2012 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 2013 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 2014 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 2015 * evmcs->guest_ssp = vmcs12->guest_ssp; 2016 */ 2017 2018 evmcs->guest_es_selector = vmcs12->guest_es_selector; 2019 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 2020 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 2021 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 2022 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 2023 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 2024 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 2025 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 2026 2027 evmcs->guest_es_limit = vmcs12->guest_es_limit; 2028 evmcs->guest_cs_limit = 
vmcs12->guest_cs_limit; 2029 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 2030 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 2031 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 2032 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 2033 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 2034 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 2035 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 2036 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 2037 2038 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 2039 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 2040 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 2041 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 2042 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 2043 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 2044 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 2045 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 2046 2047 evmcs->guest_es_base = vmcs12->guest_es_base; 2048 evmcs->guest_cs_base = vmcs12->guest_cs_base; 2049 evmcs->guest_ss_base = vmcs12->guest_ss_base; 2050 evmcs->guest_ds_base = vmcs12->guest_ds_base; 2051 evmcs->guest_fs_base = vmcs12->guest_fs_base; 2052 evmcs->guest_gs_base = vmcs12->guest_gs_base; 2053 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 2054 evmcs->guest_tr_base = vmcs12->guest_tr_base; 2055 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 2056 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 2057 2058 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 2059 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 2060 2061 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 2062 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2063 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2064 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2065 2066 evmcs->guest_pending_dbg_exceptions = 2067 vmcs12->guest_pending_dbg_exceptions; 2068 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2069 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2070 2071 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2072 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2073 2074 evmcs->guest_cr0 = vmcs12->guest_cr0; 2075 evmcs->guest_cr3 = vmcs12->guest_cr3; 2076 evmcs->guest_cr4 = vmcs12->guest_cr4; 2077 evmcs->guest_dr7 = vmcs12->guest_dr7; 2078 2079 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2080 2081 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2082 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2083 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2084 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2085 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2086 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2087 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2088 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2089 2090 evmcs->exit_qualification = vmcs12->exit_qualification; 2091 2092 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2093 evmcs->guest_rsp = vmcs12->guest_rsp; 2094 evmcs->guest_rflags = vmcs12->guest_rflags; 2095 2096 evmcs->guest_interruptibility_info = 2097 vmcs12->guest_interruptibility_info; 2098 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2099 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2100 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2101 evmcs->vm_entry_exception_error_code = 2102 vmcs12->vm_entry_exception_error_code; 2103 
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is the equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION), so the guest is expected to set the
		 * first u32 field of the eVMCS, which specifies the eVMCS
		 * VersionNumber, to that value.
		 *
		 * The guest should learn the eVMCS versions supported by the
		 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * with its own invented interface: when Hyper-V uses eVMCS, it
		 * sets the first u32 field of the eVMCS to the revision_id
		 * specified in MSR_IA32_VMX_BASIC instead of to the eVMCS
		 * version number in use, i.e. one of the supported versions
		 * enumerated in CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around this Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as valid values for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
		 * reloaded from guest memory (read-only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH or when we switch
	 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
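	 * E.g. if L1 launches one L2 and later runs a different L2 through the
	 * same cached VMCS12, clean bits left over from the first L2 must not
	 * suppress reloading fields that the second L2 actually changed.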
2195 */ 2196 if (from_launch || evmcs_gpa_changed) { 2197 vmx->nested.hv_evmcs->hv_clean_fields &= 2198 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2199 2200 vmx->nested.force_msr_bitmap_recalc = true; 2201 } 2202 2203 return EVMPTRLD_SUCCEEDED; 2204 #else 2205 return EVMPTRLD_DISABLED; 2206 #endif 2207 } 2208 2209 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2210 { 2211 struct vcpu_vmx *vmx = to_vmx(vcpu); 2212 2213 if (nested_vmx_is_evmptr12_valid(vmx)) 2214 copy_vmcs12_to_enlightened(vmx); 2215 else 2216 copy_vmcs12_to_shadow(vmx); 2217 2218 vmx->nested.need_vmcs12_to_shadow_sync = false; 2219 } 2220 2221 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2222 { 2223 struct vcpu_vmx *vmx = 2224 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2225 2226 vmx->nested.preemption_timer_expired = true; 2227 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2228 kvm_vcpu_kick(&vmx->vcpu); 2229 2230 return HRTIMER_NORESTART; 2231 } 2232 2233 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2234 { 2235 struct vcpu_vmx *vmx = to_vmx(vcpu); 2236 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2237 2238 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2239 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2240 2241 if (!vmx->nested.has_preemption_timer_deadline) { 2242 vmx->nested.preemption_timer_deadline = 2243 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2244 vmx->nested.has_preemption_timer_deadline = true; 2245 } 2246 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2247 } 2248 2249 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2250 u64 preemption_timeout) 2251 { 2252 struct vcpu_vmx *vmx = to_vmx(vcpu); 2253 2254 /* 2255 * A timer value of zero is architecturally guaranteed to cause 2256 * a VMExit prior to executing any instructions in the guest. 2257 */ 2258 if (preemption_timeout == 0) { 2259 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2260 return; 2261 } 2262 2263 if (vcpu->arch.virtual_tsc_khz == 0) 2264 return; 2265 2266 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2267 preemption_timeout *= 1000000; 2268 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2269 hrtimer_start(&vmx->nested.preemption_timer, 2270 ktime_add_ns(ktime_get(), preemption_timeout), 2271 HRTIMER_MODE_ABS_PINNED); 2272 } 2273 2274 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2275 { 2276 if (vmx->nested.nested_run_pending && 2277 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2278 return vmcs12->guest_ia32_efer; 2279 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2280 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2281 else 2282 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2283 } 2284 2285 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2286 { 2287 struct kvm *kvm = vmx->vcpu.kvm; 2288 2289 /* 2290 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2291 * according to L0's settings (vmcs12 is irrelevant here). Host 2292 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2293 * will be set as needed prior to VMLAUNCH/VMRESUME. 2294 */ 2295 if (vmx->nested.vmcs02_initialized) 2296 return; 2297 vmx->nested.vmcs02_initialized = true; 2298 2299 /* 2300 * We don't care what the EPTP value is we just need to guarantee 2301 * it's valid so we don't get a false positive when doing early 2302 * consistency checks. 
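	 * Presumably the dummy EPTP written below (construct_eptp() with a
	 * root HPA of 0 and a 4-level walk) is never used to actually run L2;
	 * the early check is expected to fail VM-Entry before any guest
	 * instruction executes.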
2303 */ 2304 if (enable_ept && nested_early_check) 2305 vmcs_write64(EPT_POINTER, 2306 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2307 2308 if (vmx->ve_info) 2309 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2310 2311 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2312 if (cpu_has_vmx_vmfunc()) 2313 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2314 2315 if (cpu_has_vmx_posted_intr()) 2316 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2317 2318 if (cpu_has_vmx_msr_bitmap()) 2319 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2320 2321 /* 2322 * PML is emulated for L2, but never enabled in hardware as the MMU 2323 * handles A/D emulation. Disabling PML for L2 also avoids having to 2324 * deal with filtering out L2 GPAs from the buffer. 2325 */ 2326 if (enable_pml) { 2327 vmcs_write64(PML_ADDRESS, 0); 2328 vmcs_write16(GUEST_PML_INDEX, -1); 2329 } 2330 2331 if (cpu_has_vmx_encls_vmexit()) 2332 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2333 2334 if (kvm_notify_vmexit_enabled(kvm)) 2335 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2336 2337 /* 2338 * Set the MSR load/store lists to match L0's settings. Only the 2339 * addresses are constant (for vmcs02), the counts can change based 2340 * on L2's behavior, e.g. switching to/from long mode. 2341 */ 2342 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2343 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2344 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2345 2346 vmx_set_constant_host_state(vmx); 2347 } 2348 2349 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2350 struct vmcs12 *vmcs12) 2351 { 2352 prepare_vmcs02_constant_state(vmx); 2353 2354 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2355 2356 /* 2357 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 2358 * same VPID as the host. Emulate this behavior by using vpid01 for L2 2359 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter 2360 * and VM-Exit are architecturally required to flush VPID=0, but *only* 2361 * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the 2362 * required flushes), but doing so would cause KVM to over-flush. E.g. 2363 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, 2364 * and then runs L2 X again, then KVM can and should retain TLB entries 2365 * for VPID12=1. 2366 */ 2367 if (enable_vpid) { 2368 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2369 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2370 else 2371 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2372 } 2373 } 2374 2375 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2376 struct vmcs12 *vmcs12) 2377 { 2378 u32 exec_control; 2379 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2380 2381 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2382 prepare_vmcs02_early_rare(vmx, vmcs12); 2383 2384 /* 2385 * PIN CONTROLS 2386 */ 2387 exec_control = __pin_controls_get(vmcs01); 2388 exec_control |= (vmcs12->pin_based_vm_exec_control & 2389 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2390 2391 /* Posted interrupts setting is only taken from vmcs12. 
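	 * If L1 enabled posted interrupts, record L1's notification vector in
	 * posted_intr_nv so KVM can emulate delivery to L1; otherwise clear
	 * PIN_BASED_POSTED_INTR so vmcs02 never posts interrupts on L1's
	 * behalf.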
*/ 2392 vmx->nested.pi_pending = false; 2393 if (nested_cpu_has_posted_intr(vmcs12)) { 2394 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2395 } else { 2396 vmx->nested.posted_intr_nv = -1; 2397 exec_control &= ~PIN_BASED_POSTED_INTR; 2398 } 2399 pin_controls_set(vmx, exec_control); 2400 2401 /* 2402 * EXEC CONTROLS 2403 */ 2404 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2405 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2406 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2407 exec_control &= ~CPU_BASED_TPR_SHADOW; 2408 exec_control |= vmcs12->cpu_based_vm_exec_control; 2409 2410 vmx->nested.l1_tpr_threshold = -1; 2411 if (exec_control & CPU_BASED_TPR_SHADOW) 2412 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2413 #ifdef CONFIG_X86_64 2414 else 2415 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2416 CPU_BASED_CR8_STORE_EXITING; 2417 #endif 2418 2419 /* 2420 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2421 * for I/O port accesses. 2422 */ 2423 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2424 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2425 2426 /* 2427 * This bit will be computed in nested_get_vmcs12_pages, because 2428 * we do not have access to L1's MSR bitmap yet. For now, keep 2429 * the same bit as before, hoping to avoid multiple VMWRITEs that 2430 * only set/clear this bit. 2431 */ 2432 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2433 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2434 2435 exec_controls_set(vmx, exec_control); 2436 2437 /* 2438 * SECONDARY EXEC CONTROLS 2439 */ 2440 if (cpu_has_secondary_exec_ctrls()) { 2441 exec_control = __secondary_exec_controls_get(vmcs01); 2442 2443 /* Take the following fields only from vmcs12 */ 2444 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2445 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2446 SECONDARY_EXEC_ENABLE_INVPCID | 2447 SECONDARY_EXEC_ENABLE_RDTSCP | 2448 SECONDARY_EXEC_ENABLE_XSAVES | 2449 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2450 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2451 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2452 SECONDARY_EXEC_ENABLE_VMFUNC | 2453 SECONDARY_EXEC_DESC); 2454 2455 if (nested_cpu_has(vmcs12, 2456 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2457 exec_control |= vmcs12->secondary_vm_exec_control; 2458 2459 /* PML is emulated and never enabled in hardware for L2. */ 2460 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2461 2462 /* VMCS shadowing for L2 is emulated for now */ 2463 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2464 2465 /* 2466 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2467 * will not have to rewrite the controls just for this bit. 2468 */ 2469 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2470 exec_control |= SECONDARY_EXEC_DESC; 2471 2472 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2473 vmcs_write16(GUEST_INTR_STATUS, 2474 vmcs12->guest_intr_status); 2475 2476 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2477 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2478 2479 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2480 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2481 2482 secondary_exec_controls_set(vmx, exec_control); 2483 } 2484 2485 /* 2486 * ENTRY CONTROLS 2487 * 2488 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2489 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2490 * on the related bits (if supported by the CPU) in the hope that 2491 * we can avoid VMWrites during vmx_set_efer(). 
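	 * For example, assuming the CPU supports the "load IA32_EFER" entry
	 * control, if the computed guest_efer has LMA set and differs from the
	 * host's EFER, both VM_ENTRY_IA32E_MODE and VM_ENTRY_LOAD_IA32_EFER
	 * are set here, matching what vmx_set_efer() would program anyway.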
2492 * 2493 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2494 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2495 * do the same for L2. 2496 */ 2497 exec_control = __vm_entry_controls_get(vmcs01); 2498 exec_control |= (vmcs12->vm_entry_controls & 2499 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2500 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2501 if (cpu_has_load_ia32_efer()) { 2502 if (guest_efer & EFER_LMA) 2503 exec_control |= VM_ENTRY_IA32E_MODE; 2504 if (guest_efer != kvm_host.efer) 2505 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2506 } 2507 vm_entry_controls_set(vmx, exec_control); 2508 2509 /* 2510 * EXIT CONTROLS 2511 * 2512 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2513 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2514 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2515 */ 2516 exec_control = __vm_exit_controls_get(vmcs01); 2517 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2518 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2519 else 2520 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2521 vm_exit_controls_set(vmx, exec_control); 2522 2523 /* 2524 * Interrupt/Exception Fields 2525 */ 2526 if (vmx->nested.nested_run_pending) { 2527 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2528 vmcs12->vm_entry_intr_info_field); 2529 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2530 vmcs12->vm_entry_exception_error_code); 2531 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2532 vmcs12->vm_entry_instruction_len); 2533 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2534 vmcs12->guest_interruptibility_info); 2535 vmx->loaded_vmcs->nmi_known_unmasked = 2536 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2537 } else { 2538 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2539 } 2540 } 2541 2542 static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, 2543 u64 *ssp, u64 *ssp_tbl) 2544 { 2545 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2546 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2547 *s_cet = vmcs_readl(GUEST_S_CET); 2548 2549 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2550 *ssp = vmcs_readl(GUEST_SSP); 2551 *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); 2552 } 2553 } 2554 2555 static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, 2556 u64 ssp, u64 ssp_tbl) 2557 { 2558 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2559 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2560 vmcs_writel(GUEST_S_CET, s_cet); 2561 2562 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2563 vmcs_writel(GUEST_SSP, ssp); 2564 vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); 2565 } 2566 } 2567 2568 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2569 { 2570 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2571 2572 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2573 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2574 2575 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2576 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2577 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2578 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2579 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2580 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2581 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2582 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2583 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2584 vmcs_write32(GUEST_CS_LIMIT, 
vmcs12->guest_cs_limit); 2585 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2586 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2587 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2588 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2589 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2590 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2591 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2592 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2593 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2594 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2595 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2596 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2597 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2598 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2599 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2600 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2601 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2602 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2603 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2604 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2605 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2606 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2607 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2608 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2609 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2610 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2611 2612 vmx_segment_cache_clear(vmx); 2613 } 2614 2615 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2616 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2617 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2618 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2619 vmcs12->guest_pending_dbg_exceptions); 2620 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2621 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2622 2623 /* 2624 * L1 may access the L2's PDPTR, so save them to construct 2625 * vmcs12 2626 */ 2627 if (enable_ept) { 2628 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2629 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2630 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2631 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2632 } 2633 2634 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2635 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2636 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2637 } 2638 2639 if (nested_cpu_has_xsaves(vmcs12)) 2640 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2641 2642 /* 2643 * Whether page-faults are trapped is determined by a combination of 2644 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2645 * doesn't care about page faults then we should set all of these to 2646 * L1's desires. However, if L0 does care about (some) page faults, it 2647 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2648 * simply ask to exit on each and every L2 page fault. This is done by 2649 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2650 * Note that below we don't need special code to set EB.PF beyond the 2651 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2652 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2653 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 
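	 *
	 * Recall the architectural rule: a page fault causes a VM-Exit iff
	 * EB.PF equals the result of ((PFEC & PFEC_MASK) == PFEC_MATCH), so
	 * MASK=MATCH=0 together with EB.PF=1 makes every L2 #PF exit.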
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
		vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
				     vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its own guest (a.k.a. vmcs01), so that the L2
 * guest can be run in a way that satisfies both L1's requests and L0's needs.
 * In addition to modifying the active vmcs (which is vmcs02), this function
 * also has necessary side effects, such as setting various vcpu->arch fields.
 *
 * Returns 0 on success and -EINVAL on failure; on failure, the reason for the
 * failed VM-Entry is assigned to *entry_failure_code.
2701 */ 2702 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2703 bool from_vmentry, 2704 enum vm_entry_failure_code *entry_failure_code) 2705 { 2706 struct vcpu_vmx *vmx = to_vmx(vcpu); 2707 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 2708 bool load_guest_pdptrs_vmcs12 = false; 2709 2710 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { 2711 prepare_vmcs02_rare(vmx, vmcs12); 2712 vmx->nested.dirty_vmcs12 = false; 2713 2714 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || 2715 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2716 } 2717 2718 if (vmx->nested.nested_run_pending && 2719 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2720 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2721 vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & 2722 vmx_get_supported_debugctl(vcpu, false)); 2723 } else { 2724 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2725 vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); 2726 } 2727 2728 if (!vmx->nested.nested_run_pending || 2729 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 2730 vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, 2731 vmx->nested.pre_vmenter_ssp, 2732 vmx->nested.pre_vmenter_ssp_tbl); 2733 2734 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2735 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2736 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2737 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2738 2739 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2740 * bitwise-or of what L1 wants to trap for L2, and what we want to 2741 * trap. Note that CR0.TS also needs updating - we do this later. 2742 */ 2743 vmx_update_exception_bitmap(vcpu); 2744 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2745 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2746 2747 if (vmx->nested.nested_run_pending && 2748 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2749 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2750 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2751 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2752 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2753 } 2754 2755 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2756 vcpu->arch.l1_tsc_offset, 2757 vmx_get_l2_tsc_offset(vcpu), 2758 vmx_get_l2_tsc_multiplier(vcpu)); 2759 2760 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2761 vcpu->arch.l1_tsc_scaling_ratio, 2762 vmx_get_l2_tsc_multiplier(vcpu)); 2763 2764 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2765 if (kvm_caps.has_tsc_control) 2766 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2767 2768 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2769 2770 if (nested_cpu_has_ept(vmcs12)) 2771 nested_ept_init_mmu_context(vcpu); 2772 2773 /* 2774 * Override the CR0/CR4 read shadows after setting the effective guest 2775 * CR0/CR4. The common helpers also set the shadows, but they don't 2776 * account for vmcs12's cr0/4_guest_host_mask. 
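	 * I.e. for CR0/CR4 bits that L1 intercepts (guest/host mask bit set),
	 * the value L2 observes when reading the register must come from
	 * vmcs12's read shadow, not from the CR value that was just loaded.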
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * If guest state is invalid and unrestricted guest is disabled, then
	 * L1 attempted VM-Entry to L2 with invalid state; fail the VM-Entry.
	 *
	 * However, when force loading the guest state (on SMM exit, or when
	 * loading nested state after migration), it is possible to have
	 * invalid guest state at this point; it is fixed up later when L2
	 * register state is restored.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Load the guest's CR3, for either nested EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						 vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
	 * bits when it changes a field in the eVMCS. Mark all fields as clean
	 * here.
2843 */ 2844 if (nested_vmx_is_evmptr12_valid(vmx)) 2845 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2846 2847 return 0; 2848 } 2849 2850 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2851 { 2852 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2853 nested_cpu_has_virtual_nmis(vmcs12))) 2854 return -EINVAL; 2855 2856 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2857 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2858 return -EINVAL; 2859 2860 return 0; 2861 } 2862 2863 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2864 { 2865 struct vcpu_vmx *vmx = to_vmx(vcpu); 2866 2867 /* Check for memory type validity */ 2868 switch (new_eptp & VMX_EPTP_MT_MASK) { 2869 case VMX_EPTP_MT_UC: 2870 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2871 return false; 2872 break; 2873 case VMX_EPTP_MT_WB: 2874 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2875 return false; 2876 break; 2877 default: 2878 return false; 2879 } 2880 2881 /* Page-walk levels validity. */ 2882 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2883 case VMX_EPTP_PWL_5: 2884 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2885 return false; 2886 break; 2887 case VMX_EPTP_PWL_4: 2888 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2889 return false; 2890 break; 2891 default: 2892 return false; 2893 } 2894 2895 /* Reserved bits should not be set */ 2896 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2897 return false; 2898 2899 /* AD, if set, should be supported */ 2900 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2901 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2902 return false; 2903 } 2904 2905 return true; 2906 } 2907 2908 /* 2909 * Checks related to VM-Execution Control Fields 2910 */ 2911 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2912 struct vmcs12 *vmcs12) 2913 { 2914 struct vcpu_vmx *vmx = to_vmx(vcpu); 2915 2916 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2917 vmx->nested.msrs.pinbased_ctls_low, 2918 vmx->nested.msrs.pinbased_ctls_high)) || 2919 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2920 vmx->nested.msrs.procbased_ctls_low, 2921 vmx->nested.msrs.procbased_ctls_high))) 2922 return -EINVAL; 2923 2924 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2925 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2926 vmx->nested.msrs.secondary_ctls_low, 2927 vmx->nested.msrs.secondary_ctls_high))) 2928 return -EINVAL; 2929 2930 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2931 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2932 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2933 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2934 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2935 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2936 nested_vmx_check_nmi_controls(vmcs12) || 2937 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2938 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2939 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2940 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2941 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2942 return -EINVAL; 2943 2944 if (!nested_cpu_has_preemption_timer(vmcs12) && 2945 nested_cpu_has_save_preemption_timer(vmcs12)) 2946 return -EINVAL; 2947 2948 if (nested_cpu_has_ept(vmcs12) && 2949 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2950 return 
-EINVAL; 2951 2952 if (nested_cpu_has_vmfunc(vmcs12)) { 2953 if (CC(vmcs12->vm_function_control & 2954 ~vmx->nested.msrs.vmfunc_controls)) 2955 return -EINVAL; 2956 2957 if (nested_cpu_has_eptp_switching(vmcs12)) { 2958 if (CC(!nested_cpu_has_ept(vmcs12)) || 2959 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2960 return -EINVAL; 2961 } 2962 } 2963 2964 return 0; 2965 } 2966 2967 /* 2968 * Checks related to VM-Exit Control Fields 2969 */ 2970 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2971 struct vmcs12 *vmcs12) 2972 { 2973 struct vcpu_vmx *vmx = to_vmx(vcpu); 2974 2975 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2976 vmx->nested.msrs.exit_ctls_low, 2977 vmx->nested.msrs.exit_ctls_high)) || 2978 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2979 return -EINVAL; 2980 2981 return 0; 2982 } 2983 2984 /* 2985 * Checks related to VM-Entry Control Fields 2986 */ 2987 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2988 struct vmcs12 *vmcs12) 2989 { 2990 struct vcpu_vmx *vmx = to_vmx(vcpu); 2991 2992 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2993 vmx->nested.msrs.entry_ctls_low, 2994 vmx->nested.msrs.entry_ctls_high))) 2995 return -EINVAL; 2996 2997 /* 2998 * From the Intel SDM, volume 3: 2999 * Fields relevant to VM-entry event injection must be set properly. 3000 * These fields are the VM-entry interruption-information field, the 3001 * VM-entry exception error code, and the VM-entry instruction length. 3002 */ 3003 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 3004 u32 intr_info = vmcs12->vm_entry_intr_info_field; 3005 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 3006 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 3007 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 3008 bool urg = nested_cpu_has2(vmcs12, 3009 SECONDARY_EXEC_UNRESTRICTED_GUEST); 3010 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 3011 3012 /* VM-entry interruption-info field: interruption type */ 3013 if (CC(intr_type == INTR_TYPE_RESERVED) || 3014 CC(intr_type == INTR_TYPE_OTHER_EVENT && 3015 !nested_cpu_supports_monitor_trap_flag(vcpu))) 3016 return -EINVAL; 3017 3018 /* VM-entry interruption-info field: vector */ 3019 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 3020 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 3021 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 3022 return -EINVAL; 3023 3024 /* 3025 * Cannot deliver error code in real mode or if the interrupt 3026 * type is not hardware exception. For other cases, do the 3027 * consistency check only if the vCPU doesn't enumerate 3028 * VMX_BASIC_NO_HW_ERROR_CODE_CC. 
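		 * For example, vector 14 (#PF) architecturally delivers an
		 * error code while vector 3 (#BP) does not;
		 * x86_exception_has_error_code() encodes that distinction.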
3029 */ 3030 if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { 3031 if (CC(has_error_code)) 3032 return -EINVAL; 3033 } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { 3034 if (CC(has_error_code != x86_exception_has_error_code(vector))) 3035 return -EINVAL; 3036 } 3037 3038 /* VM-entry exception error code */ 3039 if (CC(has_error_code && 3040 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 3041 return -EINVAL; 3042 3043 /* VM-entry interruption-info field: reserved bits */ 3044 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 3045 return -EINVAL; 3046 3047 /* VM-entry instruction length */ 3048 switch (intr_type) { 3049 case INTR_TYPE_SOFT_EXCEPTION: 3050 case INTR_TYPE_SOFT_INTR: 3051 case INTR_TYPE_PRIV_SW_EXCEPTION: 3052 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || 3053 CC(vmcs12->vm_entry_instruction_len == 0 && 3054 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 3055 return -EINVAL; 3056 } 3057 } 3058 3059 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 3060 return -EINVAL; 3061 3062 return 0; 3063 } 3064 3065 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 3066 struct vmcs12 *vmcs12) 3067 { 3068 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 3069 nested_check_vm_exit_controls(vcpu, vmcs12) || 3070 nested_check_vm_entry_controls(vcpu, vmcs12)) 3071 return -EINVAL; 3072 3073 #ifdef CONFIG_KVM_HYPERV 3074 if (guest_cpu_cap_has_evmcs(vcpu)) 3075 return nested_evmcs_check_controls(vmcs12); 3076 #endif 3077 3078 return 0; 3079 } 3080 3081 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 3082 struct vmcs12 *vmcs12) 3083 { 3084 #ifdef CONFIG_X86_64 3085 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 3086 !!(vcpu->arch.efer & EFER_LMA))) 3087 return -EINVAL; 3088 #endif 3089 return 0; 3090 } 3091 3092 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) 3093 { 3094 /* 3095 * Check that the given linear address is canonical after a VM exit 3096 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. 3097 */ 3098 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 
57 : 48; 3099 3100 return !__is_canonical_address(la, l1_address_bits_on_exit); 3101 } 3102 3103 static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, 3104 u64 ssp, u64 ssp_tbl) 3105 { 3106 if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || 3107 CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) 3108 return -EINVAL; 3109 3110 return 0; 3111 } 3112 3113 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3114 struct vmcs12 *vmcs12) 3115 { 3116 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3117 3118 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3119 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3120 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3121 return -EINVAL; 3122 3123 if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) 3124 return -EINVAL; 3125 3126 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3127 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3128 return -EINVAL; 3129 3130 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3131 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3132 return -EINVAL; 3133 3134 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3135 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3136 vmcs12->host_ia32_perf_global_ctrl))) 3137 return -EINVAL; 3138 3139 if (ia32e) { 3140 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3141 return -EINVAL; 3142 } else { 3143 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3144 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3145 CC((vmcs12->host_rip) >> 32)) 3146 return -EINVAL; 3147 } 3148 3149 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3150 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3151 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3152 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3153 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3154 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3155 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3156 CC(vmcs12->host_cs_selector == 0) || 3157 CC(vmcs12->host_tr_selector == 0) || 3158 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3159 return -EINVAL; 3160 3161 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3162 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3163 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3164 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3165 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3166 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3167 return -EINVAL; 3168 3169 /* 3170 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3171 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3172 * the values of the LMA and LME bits in the field must each be that of 3173 * the host address-space size VM-exit control. 
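	 * E.g. if VM_EXIT_HOST_ADDR_SPACE_SIZE is 1, host_ia32_efer must have
	 * both LMA and LME set; if it is 0, both must be clear.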
3174 */ 3175 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3176 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3177 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3178 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3179 return -EINVAL; 3180 } 3181 3182 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { 3183 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, 3184 vmcs12->host_ssp, 3185 vmcs12->host_ssp_tbl)) 3186 return -EINVAL; 3187 3188 /* 3189 * IA32_S_CET and SSP must be canonical if the host will 3190 * enter 64-bit mode after VM-exit; otherwise, higher 3191 * 32-bits must be all 0s. 3192 */ 3193 if (ia32e) { 3194 if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || 3195 CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) 3196 return -EINVAL; 3197 } else { 3198 if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) 3199 return -EINVAL; 3200 } 3201 } 3202 3203 return 0; 3204 } 3205 3206 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3207 struct vmcs12 *vmcs12) 3208 { 3209 struct vcpu_vmx *vmx = to_vmx(vcpu); 3210 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3211 struct vmcs_hdr hdr; 3212 3213 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3214 return 0; 3215 3216 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3217 return -EINVAL; 3218 3219 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3220 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3221 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3222 return -EINVAL; 3223 3224 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3225 offsetof(struct vmcs12, hdr), 3226 sizeof(hdr)))) 3227 return -EINVAL; 3228 3229 if (CC(hdr.revision_id != VMCS12_REVISION) || 3230 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3231 return -EINVAL; 3232 3233 return 0; 3234 } 3235 3236 /* 3237 * Checks related to Guest Non-register State 3238 */ 3239 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3240 { 3241 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3242 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3243 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3244 return -EINVAL; 3245 3246 return 0; 3247 } 3248 3249 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3250 struct vmcs12 *vmcs12, 3251 enum vm_entry_failure_code *entry_failure_code) 3252 { 3253 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3254 3255 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3256 3257 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3258 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3259 return -EINVAL; 3260 3261 if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) 3262 return -EINVAL; 3263 3264 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3265 (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || 3266 CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) 3267 return -EINVAL; 3268 3269 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3270 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3271 return -EINVAL; 3272 3273 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3274 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3275 return -EINVAL; 3276 } 3277 3278 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3279 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3280 vmcs12->guest_ia32_perf_global_ctrl))) 3281 return -EINVAL; 3282 3283 if (CC((vmcs12->guest_cr0 & 
(X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3284 return -EINVAL; 3285 3286 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3287 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3288 return -EINVAL; 3289 3290 /* 3291 * If the load IA32_EFER VM-entry control is 1, the following checks 3292 * are performed on the field for the IA32_EFER MSR: 3293 * - Bits reserved in the IA32_EFER MSR must be 0. 3294 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3295 * the IA-32e mode guest VM-exit control. It must also be identical 3296 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3297 * CR0.PG) is 1. 3298 */ 3299 if (to_vmx(vcpu)->nested.nested_run_pending && 3300 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3301 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3302 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3303 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3304 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3305 return -EINVAL; 3306 } 3307 3308 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3309 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3310 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3311 return -EINVAL; 3312 3313 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { 3314 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, 3315 vmcs12->guest_ssp, 3316 vmcs12->guest_ssp_tbl)) 3317 return -EINVAL; 3318 3319 /* 3320 * Guest SSP must have 63:N bits identical, rather than 3321 * be canonical (i.e., 63:N-1 bits identical), where N is 3322 * the CPU's maximum linear-address width. Similar to 3323 * is_noncanonical_msr_address(), use the host's 3324 * linear-address width. 3325 */ 3326 if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) 3327 return -EINVAL; 3328 } 3329 3330 if (nested_check_guest_non_reg_state(vmcs12)) 3331 return -EINVAL; 3332 3333 return 0; 3334 } 3335 3336 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3337 { 3338 struct vcpu_vmx *vmx = to_vmx(vcpu); 3339 unsigned long cr3, cr4; 3340 bool vm_fail; 3341 3342 if (!nested_early_check) 3343 return 0; 3344 3345 if (vmx->msr_autoload.host.nr) 3346 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3347 if (vmx->msr_autoload.guest.nr) 3348 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3349 3350 preempt_disable(); 3351 3352 vmx_prepare_switch_to_guest(vcpu); 3353 3354 /* 3355 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3356 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3357 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3358 * there is no need to preserve other bits or save/restore the field. 
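 * Even if all of L1's control and host-state fields are legal, the zeroed RFLAGS guarantees the early VM-Enter fails with an invalid-guest-state VM-Exit rather than actually running L2, which is what the WARN_ON() on VM_EXIT_REASON below relies on.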
3359 */ 3360 vmcs_writel(GUEST_RFLAGS, 0); 3361 3362 cr3 = __get_current_cr3_fast(); 3363 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3364 vmcs_writel(HOST_CR3, cr3); 3365 vmx->loaded_vmcs->host_state.cr3 = cr3; 3366 } 3367 3368 cr4 = cr4_read_shadow(); 3369 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3370 vmcs_writel(HOST_CR4, cr4); 3371 vmx->loaded_vmcs->host_state.cr4 = cr4; 3372 } 3373 3374 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3375 __vmx_vcpu_run_flags(vmx)); 3376 3377 if (vmx->msr_autoload.host.nr) 3378 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3379 if (vmx->msr_autoload.guest.nr) 3380 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3381 3382 if (vm_fail) { 3383 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3384 3385 preempt_enable(); 3386 3387 trace_kvm_nested_vmenter_failed( 3388 "early hardware check VM-instruction error: ", error); 3389 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3390 return 1; 3391 } 3392 3393 /* 3394 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3395 */ 3396 if (hw_breakpoint_active()) 3397 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3398 local_irq_enable(); 3399 preempt_enable(); 3400 3401 /* 3402 * A non-failing VMEntry means we somehow entered guest mode with 3403 * an illegal RIP, and that's just the tip of the iceberg. There 3404 * is no telling what memory has been modified or what state has 3405 * been exposed to unknown code. Hitting this all but guarantees 3406 * a (very critical) hardware issue. 3407 */ 3408 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3409 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3410 3411 return 0; 3412 } 3413 3414 #ifdef CONFIG_KVM_HYPERV 3415 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3416 { 3417 struct vcpu_vmx *vmx = to_vmx(vcpu); 3418 3419 /* 3420 * hv_evmcs may end up being not mapped after migration (when 3421 * L2 was running), map it here to make sure vmcs12 changes are 3422 * properly reflected. 3423 */ 3424 if (guest_cpu_cap_has_evmcs(vcpu) && 3425 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3426 enum nested_evmptrld_status evmptrld_status = 3427 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3428 3429 if (evmptrld_status == EVMPTRLD_VMFAIL || 3430 evmptrld_status == EVMPTRLD_ERROR) 3431 return false; 3432 3433 /* 3434 * Post migration VMCS12 always provides the most actual 3435 * information, copy it to eVMCS upon entry. 3436 */ 3437 vmx->nested.need_vmcs12_to_shadow_sync = true; 3438 } 3439 3440 return true; 3441 } 3442 #endif 3443 3444 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3445 { 3446 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3447 struct vcpu_vmx *vmx = to_vmx(vcpu); 3448 struct kvm_host_map *map; 3449 3450 if (!vcpu->arch.pdptrs_from_userspace && 3451 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3452 /* 3453 * Reload the guest's PDPTRs since after a migration 3454 * the guest CR3 might be restored prior to setting the nested 3455 * state which can lead to a load of wrong PDPTRs. 
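 * One possible ordering (the exact ioctl sequence is up to userspace): KVM_SET_SREGS restores CR3 while vmcs01 is still active and only a later KVM_SET_NESTED_STATE puts the vCPU back in guest mode, so the PDPTEs cached at that point may not match vmcs12's view and must be re-read from CR3 here.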
3456 */ 3457 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3458 return false; 3459 } 3460 3461 3462 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3463 map = &vmx->nested.apic_access_page_map; 3464 3465 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3466 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3467 } else { 3468 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3469 __func__); 3470 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3471 vcpu->run->internal.suberror = 3472 KVM_INTERNAL_ERROR_EMULATION; 3473 vcpu->run->internal.ndata = 0; 3474 return false; 3475 } 3476 } 3477 3478 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3479 map = &vmx->nested.virtual_apic_map; 3480 3481 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3482 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3483 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3484 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3485 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3486 /* 3487 * The processor will never use the TPR shadow, simply 3488 * clear the bit from the execution control. Such a 3489 * configuration is useless, but it happens in tests. 3490 * For any other configuration, failing the vm entry is 3491 * _not_ what the processor does but it's basically the 3492 * only possibility we have. 3493 */ 3494 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3495 } else { 3496 /* 3497 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3498 * force VM-Entry to fail. 3499 */ 3500 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3501 } 3502 } 3503 3504 if (nested_cpu_has_posted_intr(vmcs12)) { 3505 map = &vmx->nested.pi_desc_map; 3506 3507 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3508 vmx->nested.pi_desc = 3509 (struct pi_desc *)(((void *)map->hva) + 3510 offset_in_page(vmcs12->posted_intr_desc_addr)); 3511 vmcs_write64(POSTED_INTR_DESC_ADDR, 3512 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3513 } else { 3514 /* 3515 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3516 * access the contents of the VMCS12 posted interrupt 3517 * descriptor. (Note that KVM may do this when it 3518 * should not, per the architectural specification.) 3519 */ 3520 vmx->nested.pi_desc = NULL; 3521 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3522 } 3523 } 3524 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3525 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3526 else 3527 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3528 3529 return true; 3530 } 3531 3532 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3533 { 3534 #ifdef CONFIG_KVM_HYPERV 3535 /* 3536 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3537 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3538 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3539 * migration. 
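 * A failure to (re)map the eVMCS below is reported to userspace as KVM_EXIT_INTERNAL_ERROR rather than being retried.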
3540 */ 3541 if (!nested_get_evmcs_page(vcpu)) { 3542 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3543 __func__); 3544 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3545 vcpu->run->internal.suberror = 3546 KVM_INTERNAL_ERROR_EMULATION; 3547 vcpu->run->internal.ndata = 0; 3548 3549 return false; 3550 } 3551 #endif 3552 3553 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3554 return false; 3555 3556 return true; 3557 } 3558 3559 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3560 { 3561 struct vmcs12 *vmcs12; 3562 struct vcpu_vmx *vmx = to_vmx(vcpu); 3563 gpa_t dst; 3564 3565 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3566 return 0; 3567 3568 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3569 return 1; 3570 3571 /* 3572 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3573 * set is already checked as part of A/D emulation. 3574 */ 3575 vmcs12 = get_vmcs12(vcpu); 3576 if (!nested_cpu_has_pml(vmcs12)) 3577 return 0; 3578 3579 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3580 vmx->nested.pml_full = true; 3581 return 1; 3582 } 3583 3584 gpa &= ~0xFFFull; 3585 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3586 3587 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3588 offset_in_page(dst), sizeof(gpa))) 3589 return 0; 3590 3591 vmcs12->guest_pml_index--; 3592 3593 return 0; 3594 } 3595 3596 /* 3597 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3598 * for running VMX instructions (except VMXON, whose prerequisites are 3599 * slightly different). It also specifies what exception to inject otherwise. 3600 * Note that many of these exceptions have priority over VM exits, so they 3601 * don't have to be checked again here. 3602 */ 3603 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3604 { 3605 if (!to_vmx(vcpu)->nested.vmxon) { 3606 kvm_queue_exception(vcpu, UD_VECTOR); 3607 return 0; 3608 } 3609 3610 if (vmx_get_cpl(vcpu)) { 3611 kvm_inject_gp(vcpu, 0); 3612 return 0; 3613 } 3614 3615 return 1; 3616 } 3617 3618 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3619 struct vmcs12 *vmcs12); 3620 3621 /* 3622 * If from_vmentry is false, this is being called from state restore (either RSM 3623 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
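 * When from_vmentry is false, the early hardware check, the vmcs12 guest-state checks and the VM-entry MSR loads below are skipped, and acquiring the vmcs12 pages is deferred via KVM_REQ_GET_NESTED_STATE_PAGES.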
3624 * 3625 * Returns: 3626 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3627 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3628 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3629 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3630 */ 3631 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3632 bool from_vmentry) 3633 { 3634 struct vcpu_vmx *vmx = to_vmx(vcpu); 3635 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3636 enum vm_entry_failure_code entry_failure_code; 3637 union vmx_exit_reason exit_reason = { 3638 .basic = EXIT_REASON_INVALID_STATE, 3639 .failed_vmentry = 1, 3640 }; 3641 u32 failed_index; 3642 3643 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3644 vmx->nested.current_vmptr, 3645 vmcs12->guest_rip, 3646 vmcs12->guest_intr_status, 3647 vmcs12->vm_entry_intr_info_field, 3648 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3649 vmcs12->ept_pointer, 3650 vmcs12->guest_cr3, 3651 KVM_ISA_VMX); 3652 3653 kvm_service_local_tlb_flush_requests(vcpu); 3654 3655 if (!vmx->nested.nested_run_pending || 3656 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3657 vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); 3658 if (kvm_mpx_supported() && 3659 (!vmx->nested.nested_run_pending || 3660 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3661 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3662 3663 if (!vmx->nested.nested_run_pending || 3664 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 3665 vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, 3666 &vmx->nested.pre_vmenter_ssp, 3667 &vmx->nested.pre_vmenter_ssp_tbl); 3668 3669 /* 3670 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3671 * nested early checks are disabled. In the event of a "late" VM-Fail, 3672 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3673 * software model to the pre-VMEntry host state. When EPT is disabled, 3674 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3675 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3676 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3677 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3678 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3679 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3680 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3681 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3682 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3683 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3684 */ 3685 if (!enable_ept && !nested_early_check) 3686 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3687 3688 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3689 3690 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3691 3692 if (from_vmentry) { 3693 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3694 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3695 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3696 } 3697 3698 if (nested_vmx_check_vmentry_hw(vcpu)) { 3699 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3700 return NVMX_VMENTRY_VMFAIL; 3701 } 3702 3703 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3704 &entry_failure_code)) { 3705 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3706 vmcs12->exit_qualification = entry_failure_code; 3707 goto vmentry_fail_vmexit; 3708 } 3709 } 3710 3711 enter_guest_mode(vcpu); 3712 3713 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3714 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3715 vmcs12->exit_qualification = entry_failure_code; 3716 goto vmentry_fail_vmexit_guest_mode; 3717 } 3718 3719 if (from_vmentry) { 3720 failed_index = nested_vmx_load_msr(vcpu, 3721 vmcs12->vm_entry_msr_load_addr, 3722 vmcs12->vm_entry_msr_load_count); 3723 if (failed_index) { 3724 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3725 vmcs12->exit_qualification = failed_index; 3726 goto vmentry_fail_vmexit_guest_mode; 3727 } 3728 } else { 3729 /* 3730 * The MMU is not initialized to point at the right entities yet and 3731 * "get pages" would need to read data from the guest (i.e. we will 3732 * need to perform gpa to hpa translation). Request a call 3733 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3734 * have already been set at vmentry time and should not be reset. 3735 */ 3736 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3737 } 3738 3739 /* 3740 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3741 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3742 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3743 * unconditionally. Take care to pull data from vmcs01 as appropriate, 3744 * e.g. when checking for interrupt windows, as vmcs02 is now loaded. 3745 */ 3746 if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | 3747 CPU_BASED_NMI_WINDOW_EXITING)) || 3748 kvm_apic_has_pending_init_or_sipi(vcpu) || 3749 kvm_apic_has_interrupt(vcpu)) 3750 kvm_make_request(KVM_REQ_EVENT, vcpu); 3751 3752 /* 3753 * Do not start the preemption timer hrtimer until after we know 3754 * we are successful, so that only nested_vmx_vmexit needs to cancel 3755 * the timer. 3756 */ 3757 vmx->nested.preemption_timer_expired = false; 3758 if (nested_cpu_has_preemption_timer(vmcs12)) { 3759 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3760 vmx_start_preemption_timer(vcpu, timer_value); 3761 } 3762 3763 /* 3764 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3765 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3766 * returned as far as L1 is concerned. It will only return (and set 3767 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3768 */ 3769 return NVMX_VMENTRY_SUCCESS; 3770 3771 /* 3772 * A failed consistency check that leads to a VMExit during L1's 3773 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3774 * 26.7 "VM-entry failures during or after loading guest state". 
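 * Of the two unwind labels below, vmentry_fail_vmexit_guest_mode additionally undoes the vmcs12 TSC offset and leaves guest mode for failures detected after enter_guest_mode(); both paths then switch back to vmcs01 and, when called for a real VMLAUNCH/VMRESUME, load vmcs12's host state and report the failed-entry exit reason to L1.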
3775 */ 3776 vmentry_fail_vmexit_guest_mode: 3777 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3778 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3779 leave_guest_mode(vcpu); 3780 3781 vmentry_fail_vmexit: 3782 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3783 3784 if (!from_vmentry) 3785 return NVMX_VMENTRY_VMEXIT; 3786 3787 load_vmcs12_host_state(vcpu, vmcs12); 3788 vmcs12->vm_exit_reason = exit_reason.full; 3789 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3790 vmx->nested.need_vmcs12_to_shadow_sync = true; 3791 return NVMX_VMENTRY_VMEXIT; 3792 } 3793 3794 /* 3795 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3796 * for running an L2 nested guest. 3797 */ 3798 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3799 { 3800 struct vmcs12 *vmcs12; 3801 enum nvmx_vmentry_status status; 3802 struct vcpu_vmx *vmx = to_vmx(vcpu); 3803 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3804 enum nested_evmptrld_status evmptrld_status; 3805 3806 if (!nested_vmx_check_permission(vcpu)) 3807 return 1; 3808 3809 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3810 if (evmptrld_status == EVMPTRLD_ERROR) { 3811 kvm_queue_exception(vcpu, UD_VECTOR); 3812 return 1; 3813 } 3814 3815 kvm_pmu_branch_retired(vcpu); 3816 3817 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3818 return nested_vmx_failInvalid(vcpu); 3819 3820 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3821 vmx->nested.current_vmptr == INVALID_GPA)) 3822 return nested_vmx_failInvalid(vcpu); 3823 3824 vmcs12 = get_vmcs12(vcpu); 3825 3826 /* 3827 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3828 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3829 * rather than RFLAGS.ZF, and no error number is stored to the 3830 * VM-instruction error field. 3831 */ 3832 if (CC(vmcs12->hdr.shadow_vmcs)) 3833 return nested_vmx_failInvalid(vcpu); 3834 3835 if (nested_vmx_is_evmptr12_valid(vmx)) { 3836 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3837 3838 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3839 /* Enlightened VMCS doesn't have launch state */ 3840 vmcs12->launch_state = !launch; 3841 } else if (enable_shadow_vmcs) { 3842 copy_shadow_to_vmcs12(vmx); 3843 } 3844 3845 /* 3846 * The nested entry process starts with enforcing various prerequisites 3847 * on vmcs12 as required by the Intel SDM, and act appropriately when 3848 * they fail: As the SDM explains, some conditions should cause the 3849 * instruction to fail, while others will cause the instruction to seem 3850 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3851 * To speed up the normal (success) code path, we should avoid checking 3852 * for misconfigurations which will anyway be caught by the processor 3853 * when using the merged vmcs02. 3854 */ 3855 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3856 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3857 3858 if (CC(vmcs12->launch_state == launch)) 3859 return nested_vmx_fail(vcpu, 3860 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3861 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3862 3863 if (nested_vmx_check_controls(vcpu, vmcs12)) 3864 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3865 3866 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3867 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3868 3869 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3870 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3871 3872 /* 3873 * We're finally done with prerequisite checking, and can start with 3874 * the nested entry. 3875 */ 3876 vmx->nested.nested_run_pending = 1; 3877 vmx->nested.has_preemption_timer_deadline = false; 3878 status = nested_vmx_enter_non_root_mode(vcpu, true); 3879 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3880 goto vmentry_failed; 3881 3882 /* Hide L1D cache contents from the nested guest. */ 3883 vmx->vcpu.arch.l1tf_flush_l1d = true; 3884 3885 /* 3886 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3887 * also be used as part of restoring nVMX state for 3888 * snapshot restore (migration). 3889 * 3890 * In this flow, it is assumed that vmcs12 cache was 3891 * transferred as part of captured nVMX state and should 3892 * therefore not be read from guest memory (which may not 3893 * exist on destination host yet). 3894 */ 3895 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3896 3897 switch (vmcs12->guest_activity_state) { 3898 case GUEST_ACTIVITY_HLT: 3899 /* 3900 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3901 * awakened by event injection or by an NMI-window VM-exit or 3902 * by an interrupt-window VM-exit, halt the vcpu. 3903 */ 3904 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3905 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3906 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3907 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3908 vmx->nested.nested_run_pending = 0; 3909 return kvm_emulate_halt_noskip(vcpu); 3910 } 3911 break; 3912 case GUEST_ACTIVITY_WAIT_SIPI: 3913 vmx->nested.nested_run_pending = 0; 3914 kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 3915 break; 3916 default: 3917 break; 3918 } 3919 3920 return 1; 3921 3922 vmentry_failed: 3923 vmx->nested.nested_run_pending = 0; 3924 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3925 return 0; 3926 if (status == NVMX_VMENTRY_VMEXIT) 3927 return 1; 3928 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3929 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3930 } 3931 3932 /* 3933 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3934 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3935 * This function returns the new value we should put in vmcs12.guest_cr0. 3936 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3937 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3938 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3939 * didn't trap the bit, because if L1 did, so would L0). 3940 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3941 * been modified by L2, and L1 knows it. So just leave the old value of 3942 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3943 * isn't relevant, because if L0 traps this bit it can set it to anything. 3944 * 3. Bits that L1 didn't trap, but L0 did. 
L1 believes the guest could have 3945 * changed these bits, and therefore they need to be updated, but L0 3946 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3947 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3948 */ 3949 static inline unsigned long 3950 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3951 { 3952 return 3953 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3954 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3955 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3956 vcpu->arch.cr0_guest_owned_bits)); 3957 } 3958 3959 static inline unsigned long 3960 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3961 { 3962 return 3963 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3964 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3965 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3966 vcpu->arch.cr4_guest_owned_bits)); 3967 } 3968 3969 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3970 struct vmcs12 *vmcs12, 3971 u32 vm_exit_reason, u32 exit_intr_info) 3972 { 3973 u32 idt_vectoring; 3974 unsigned int nr; 3975 3976 /* 3977 * Per the SDM, VM-Exits due to double and triple faults are never 3978 * considered to occur during event delivery, even if the double/triple 3979 * fault is the result of an escalating vectoring issue. 3980 * 3981 * Note, the SDM qualifies the double fault behavior with "The original 3982 * event results in a double-fault exception". It's unclear why the 3983 * qualification exists since exits due to double fault can occur only 3984 * while vectoring a different exception (injected events are never 3985 * subject to interception), i.e. there's _always_ an original event. 3986 * 3987 * The SDM also uses NMI as a confusing example for the "original event 3988 * causes the VM exit directly" clause. NMI isn't special in any way, 3989 * the same rule applies to all events that cause an exit directly. 3990 * NMI is an odd choice for the example because NMIs can only occur on 3991 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
3992 */ 3993 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3994 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3995 is_double_fault(exit_intr_info))) { 3996 vmcs12->idt_vectoring_info_field = 0; 3997 } else if (vcpu->arch.exception.injected) { 3998 nr = vcpu->arch.exception.vector; 3999 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 4000 4001 if (kvm_exception_is_soft(nr)) { 4002 vmcs12->vm_exit_instruction_len = 4003 vcpu->arch.event_exit_inst_len; 4004 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 4005 } else 4006 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 4007 4008 if (vcpu->arch.exception.has_error_code) { 4009 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 4010 vmcs12->idt_vectoring_error_code = 4011 vcpu->arch.exception.error_code; 4012 } 4013 4014 vmcs12->idt_vectoring_info_field = idt_vectoring; 4015 } else if (vcpu->arch.nmi_injected) { 4016 vmcs12->idt_vectoring_info_field = 4017 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 4018 } else if (vcpu->arch.interrupt.injected) { 4019 nr = vcpu->arch.interrupt.nr; 4020 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 4021 4022 if (vcpu->arch.interrupt.soft) { 4023 idt_vectoring |= INTR_TYPE_SOFT_INTR; 4024 vmcs12->vm_entry_instruction_len = 4025 vcpu->arch.event_exit_inst_len; 4026 } else 4027 idt_vectoring |= INTR_TYPE_EXT_INTR; 4028 4029 vmcs12->idt_vectoring_info_field = idt_vectoring; 4030 } else { 4031 vmcs12->idt_vectoring_info_field = 0; 4032 } 4033 } 4034 4035 4036 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 4037 { 4038 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4039 gfn_t gfn; 4040 4041 /* 4042 * Don't need to mark the APIC access page dirty; it is never 4043 * written to by the CPU during APIC virtualization. 4044 */ 4045 4046 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 4047 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 4048 kvm_vcpu_mark_page_dirty(vcpu, gfn); 4049 } 4050 4051 if (nested_cpu_has_posted_intr(vmcs12)) { 4052 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 4053 kvm_vcpu_mark_page_dirty(vcpu, gfn); 4054 } 4055 } 4056 4057 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4058 { 4059 struct vcpu_vmx *vmx = to_vmx(vcpu); 4060 int max_irr; 4061 void *vapic_page; 4062 u16 status; 4063 4064 if (!vmx->nested.pi_pending) 4065 return 0; 4066 4067 if (!vmx->nested.pi_desc) 4068 goto mmio_needed; 4069 4070 vmx->nested.pi_pending = false; 4071 4072 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4073 return 0; 4074 4075 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4076 if (max_irr > 0) { 4077 vapic_page = vmx->nested.virtual_apic_map.hva; 4078 if (!vapic_page) 4079 goto mmio_needed; 4080 4081 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 4082 vapic_page, &max_irr); 4083 status = vmcs_read16(GUEST_INTR_STATUS); 4084 if ((u8)max_irr > ((u8)status & 0xff)) { 4085 status &= ~0xff; 4086 status |= (u8)max_irr; 4087 vmcs_write16(GUEST_INTR_STATUS, status); 4088 } 4089 } 4090 4091 nested_mark_vmcs12_pages_dirty(vcpu); 4092 return 0; 4093 4094 mmio_needed: 4095 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 4096 return -ENXIO; 4097 } 4098 4099 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 4100 { 4101 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 4102 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 4103 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4104 unsigned long exit_qual; 4105 4106 if (ex->has_payload) { 4107 exit_qual = ex->payload; 4108 } else if (ex->vector == PF_VECTOR) { 4109 
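/* With no payload, the #PF exit qualification is the faulting linear address, i.e. CR2. */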
exit_qual = vcpu->arch.cr2; 4110 } else if (ex->vector == DB_VECTOR) { 4111 exit_qual = vcpu->arch.dr6; 4112 exit_qual &= ~DR6_BT; 4113 exit_qual ^= DR6_ACTIVE_LOW; 4114 } else { 4115 exit_qual = 0; 4116 } 4117 4118 /* 4119 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 4120 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 4121 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 4122 */ 4123 if (ex->has_error_code && is_protmode(vcpu)) { 4124 /* 4125 * Intel CPUs do not generate error codes with bits 31:16 set, 4126 * and more importantly VMX disallows setting bits 31:16 in the 4127 * injected error code for VM-Entry. Drop the bits to mimic 4128 * hardware and avoid inducing failure on nested VM-Entry if L1 4129 * chooses to inject the exception back to L2. AMD CPUs _do_ 4130 * generate "full" 32-bit error codes, so KVM allows userspace 4131 * to inject exception error codes with bits 31:16 set. 4132 */ 4133 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 4134 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 4135 } 4136 4137 if (kvm_exception_is_soft(ex->vector)) 4138 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 4139 else 4140 intr_info |= INTR_TYPE_HARD_EXCEPTION; 4141 4142 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 4143 vmx_get_nmi_mask(vcpu)) 4144 intr_info |= INTR_INFO_UNBLOCK_NMI; 4145 4146 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 4147 } 4148 4149 /* 4150 * Returns true if a debug trap is (likely) pending delivery. Infer the class 4151 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 4152 * Using the payload is flawed because code breakpoints (fault-like) and data 4153 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 4154 * this will return false positives if a to-be-injected code breakpoint #DB is 4155 * pending (from KVM's perspective, but not "pending" across an instruction 4156 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 4157 * too is trap-like. 4158 * 4159 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4160 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4161 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4162 * from the emulator (because such #DBs are fault-like and thus don't trigger 4163 * actions that fire on instruction retire). 4164 */ 4165 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4166 { 4167 if (!ex->pending || ex->vector != DB_VECTOR) 4168 return 0; 4169 4170 /* General Detect #DBs are always fault-like. */ 4171 return ex->payload & ~DR6_BD; 4172 } 4173 4174 /* 4175 * Returns true if there's a pending #DB exception that is lower priority than 4176 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4177 * KVM, but could theoretically be injected by userspace. Note, this code is 4178 * imperfect, see above. 4179 */ 4180 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4181 { 4182 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4183 } 4184 4185 /* 4186 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4187 * recognized #DB (data or single-step) that has yet to be delivered. 
Since KVM 4188 * represents these debug traps with a payload that is said to be compatible 4189 * with the 'pending debug exceptions' field, write the payload to the VMCS 4190 * field if a VM-exit is delivered before the debug trap. 4191 */ 4192 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4193 { 4194 unsigned long pending_dbg; 4195 4196 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4197 if (pending_dbg) 4198 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4199 } 4200 4201 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4202 { 4203 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4204 to_vmx(vcpu)->nested.preemption_timer_expired; 4205 } 4206 4207 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4208 { 4209 struct vcpu_vmx *vmx = to_vmx(vcpu); 4210 void *vapic = vmx->nested.virtual_apic_map.hva; 4211 int max_irr, vppr; 4212 4213 if (nested_vmx_preemption_timer_pending(vcpu) || 4214 vmx->nested.mtf_pending) 4215 return true; 4216 4217 /* 4218 * Virtual Interrupt Delivery doesn't require manual injection. Either 4219 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4220 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4221 * the interrupt from the PIR to RVI prior to entering the guest. 4222 */ 4223 if (for_injection) 4224 return false; 4225 4226 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4227 __vmx_interrupt_blocked(vcpu)) 4228 return false; 4229 4230 if (!vapic) 4231 return false; 4232 4233 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4234 4235 max_irr = vmx_get_rvi(); 4236 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4237 return true; 4238 4239 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4240 pi_test_on(vmx->nested.pi_desc)) { 4241 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4242 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4243 return true; 4244 } 4245 4246 return false; 4247 } 4248 4249 /* 4250 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4251 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4252 * and less minor edits to splice in the priority of VMX Non-Root specific 4253 * events, e.g. MTF and NMI/INTR-window exiting. 
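 * The fractional entries below (3.5, 4.3, 4.6, 5.5) are the spliced-in VMX non-root events; the bracketed notes at the end give the SDM language that fixes their relative priority.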
4254 * 4255 * 1 Hardware Reset and Machine Checks 4256 * - RESET 4257 * - Machine Check 4258 * 4259 * 2 Trap on Task Switch 4260 * - T flag in TSS is set (on task switch) 4261 * 4262 * 3 External Hardware Interventions 4263 * - FLUSH 4264 * - STOPCLK 4265 * - SMI 4266 * - INIT 4267 * 4268 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4269 * 4270 * 4 Traps on Previous Instruction 4271 * - Breakpoints 4272 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4273 * breakpoint, or #DB due to a split-lock access) 4274 * 4275 * 4.3 VMX-preemption timer expired VM-exit 4276 * 4277 * 4.6 NMI-window exiting VM-exit[2] 4278 * 4279 * 5 Nonmaskable Interrupts (NMI) 4280 * 4281 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4282 * 4283 * 6 Maskable Hardware Interrupts 4284 * 4285 * 7 Code Breakpoint Fault 4286 * 4287 * 8 Faults from Fetching Next Instruction 4288 * - Code-Segment Limit Violation 4289 * - Code Page Fault 4290 * - Control protection exception (missing ENDBRANCH at target of indirect 4291 * call or jump) 4292 * 4293 * 9 Faults from Decoding Next Instruction 4294 * - Instruction length > 15 bytes 4295 * - Invalid Opcode 4296 * - Coprocessor Not Available 4297 * 4298 *10 Faults on Executing Instruction 4299 * - Overflow 4300 * - Bound error 4301 * - Invalid TSS 4302 * - Segment Not Present 4303 * - Stack fault 4304 * - General Protection 4305 * - Data Page Fault 4306 * - Alignment Check 4307 * - x86 FPU Floating-point exception 4308 * - SIMD floating-point exception 4309 * - Virtualization exception 4310 * - Control protection exception 4311 * 4312 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4313 * INIT signals, and higher priority events take priority over MTF VM exits. 4314 * MTF VM exits take priority over debug-trap exceptions and lower priority 4315 * events. 4316 * 4317 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4318 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4319 * timer take priority over VM exits caused by the "NMI-window exiting" 4320 * VM-execution control and lower priority events. 4321 * 4322 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4323 * caused by "NMI-window exiting". VM exits caused by this control take 4324 * priority over non-maskable interrupts (NMIs) and lower priority events. 4325 * 4326 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4327 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4328 * non-maskable interrupts (NMIs) and higher priority events take priority over 4329 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4330 * priority over external interrupts and lower priority events. 4331 */ 4332 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4333 { 4334 struct kvm_lapic *apic = vcpu->arch.apic; 4335 struct vcpu_vmx *vmx = to_vmx(vcpu); 4336 /* 4337 * Only a pending nested run blocks a pending exception. If there is a 4338 * previously injected event, the pending exception occurred while said 4339 * event was being delivered and thus needs to be handled. 4340 */ 4341 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4342 /* 4343 * Events that don't require injection, i.e. that are virtualized by 4344 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need 4345 * to regain control in order to deliver the event, and hardware will 4346 * handle event ordering, e.g. 
with respect to injected exceptions. 4347 * 4348 * But, new events (not exceptions) are only recognized at instruction 4349 * boundaries. If an event needs reinjection, then KVM is handling a 4350 * VM-Exit that occurred _during_ instruction execution; new events, 4351 * irrespective of whether or not they're injected, are blocked until 4352 * the instruction completes. 4353 */ 4354 bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); 4355 /* 4356 * Injected events are blocked by nested VM-Enter, as KVM is responsible 4357 * for managing priority between concurrent events, i.e. KVM needs to 4358 * wait until after VM-Enter completes to deliver injected events. 4359 */ 4360 bool block_nested_events = block_nested_exceptions || 4361 block_non_injected_events; 4362 4363 if (lapic_in_kernel(vcpu) && 4364 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4365 if (block_nested_events) 4366 return -EBUSY; 4367 nested_vmx_update_pending_dbg(vcpu); 4368 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4369 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4370 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4371 4372 /* MTF is discarded if the vCPU is in WFS. */ 4373 vmx->nested.mtf_pending = false; 4374 return 0; 4375 } 4376 4377 if (lapic_in_kernel(vcpu) && 4378 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4379 if (block_nested_events) 4380 return -EBUSY; 4381 4382 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4383 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4384 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4385 apic->sipi_vector & 0xFFUL); 4386 return 0; 4387 } 4388 /* Fallthrough, the SIPI is completely ignored. */ 4389 } 4390 4391 /* 4392 * Process exceptions that are higher priority than Monitor Trap Flag: 4393 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4394 * could theoretically come in from userspace), and ICEBP (INT1). 4395 * 4396 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4397 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4398 * across SMI/RSM as it should; that needs to be addressed in order to 4399 * prioritize SMI over MTF and trap-like #DBs.
4400 */ 4401 if (vcpu->arch.exception_vmexit.pending && 4402 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4403 if (block_nested_exceptions) 4404 return -EBUSY; 4405 4406 nested_vmx_inject_exception_vmexit(vcpu); 4407 return 0; 4408 } 4409 4410 if (vcpu->arch.exception.pending && 4411 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4412 if (block_nested_exceptions) 4413 return -EBUSY; 4414 goto no_vmexit; 4415 } 4416 4417 if (vmx->nested.mtf_pending) { 4418 if (block_nested_events) 4419 return -EBUSY; 4420 nested_vmx_update_pending_dbg(vcpu); 4421 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4422 return 0; 4423 } 4424 4425 if (vcpu->arch.exception_vmexit.pending) { 4426 if (block_nested_exceptions) 4427 return -EBUSY; 4428 4429 nested_vmx_inject_exception_vmexit(vcpu); 4430 return 0; 4431 } 4432 4433 if (vcpu->arch.exception.pending) { 4434 if (block_nested_exceptions) 4435 return -EBUSY; 4436 goto no_vmexit; 4437 } 4438 4439 if (nested_vmx_preemption_timer_pending(vcpu)) { 4440 if (block_nested_events) 4441 return -EBUSY; 4442 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4443 return 0; 4444 } 4445 4446 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4447 if (block_nested_events) 4448 return -EBUSY; 4449 goto no_vmexit; 4450 } 4451 4452 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4453 if (block_nested_events) 4454 return -EBUSY; 4455 if (!nested_exit_on_nmi(vcpu)) 4456 goto no_vmexit; 4457 4458 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4459 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4460 INTR_INFO_VALID_MASK, 0); 4461 /* 4462 * The NMI-triggered VM exit counts as injection: 4463 * clear this one and block further NMIs. 4464 */ 4465 vcpu->arch.nmi_pending = 0; 4466 vmx_set_nmi_mask(vcpu, true); 4467 return 0; 4468 } 4469 4470 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4471 int irq; 4472 4473 if (!nested_exit_on_intr(vcpu)) { 4474 if (block_nested_events) 4475 return -EBUSY; 4476 4477 goto no_vmexit; 4478 } 4479 4480 if (!nested_exit_intr_ack_set(vcpu)) { 4481 if (block_nested_events) 4482 return -EBUSY; 4483 4484 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4485 return 0; 4486 } 4487 4488 irq = kvm_cpu_get_extint(vcpu); 4489 if (irq != -1) { 4490 if (block_nested_events) 4491 return -EBUSY; 4492 4493 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4494 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4495 return 0; 4496 } 4497 4498 irq = kvm_apic_has_interrupt(vcpu); 4499 if (WARN_ON_ONCE(irq < 0)) 4500 goto no_vmexit; 4501 4502 /* 4503 * If the IRQ is L2's PI notification vector, process posted 4504 * interrupts for L2 instead of injecting VM-Exit, as the 4505 * detection/morphing architecturally occurs when the IRQ is 4506 * delivered to the CPU. Note, only interrupts that are routed 4507 * through the local APIC trigger posted interrupt processing, 4508 * and enabling posted interrupts requires ACK-on-exit. 4509 */ 4510 if (irq == vmx->nested.posted_intr_nv) { 4511 /* 4512 * Nested posted interrupts are delivered via RVI, i.e. 4513 * aren't injected by KVM, and so can be queued even if 4514 * manual event injection is disallowed. 
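 * The vector is pulled out of L1's IRR below; delivery to L2 is then completed by vmx_complete_nested_posted_interrupt() on the no_vmexit path.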
4515 */ 4516 if (block_non_injected_events) 4517 return -EBUSY; 4518 4519 vmx->nested.pi_pending = true; 4520 kvm_apic_clear_irr(vcpu, irq); 4521 goto no_vmexit; 4522 } 4523 4524 if (block_nested_events) 4525 return -EBUSY; 4526 4527 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4528 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4529 4530 /* 4531 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4532 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4533 * if APICv is active. 4534 */ 4535 kvm_apic_ack_interrupt(vcpu, irq); 4536 return 0; 4537 } 4538 4539 no_vmexit: 4540 return vmx_complete_nested_posted_interrupt(vcpu); 4541 } 4542 4543 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4544 { 4545 ktime_t remaining = 4546 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4547 u64 value; 4548 4549 if (ktime_to_ns(remaining) <= 0) 4550 return 0; 4551 4552 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4553 do_div(value, 1000000); 4554 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4555 } 4556 4557 static bool is_vmcs12_ext_field(unsigned long field) 4558 { 4559 switch (field) { 4560 case GUEST_ES_SELECTOR: 4561 case GUEST_CS_SELECTOR: 4562 case GUEST_SS_SELECTOR: 4563 case GUEST_DS_SELECTOR: 4564 case GUEST_FS_SELECTOR: 4565 case GUEST_GS_SELECTOR: 4566 case GUEST_LDTR_SELECTOR: 4567 case GUEST_TR_SELECTOR: 4568 case GUEST_ES_LIMIT: 4569 case GUEST_CS_LIMIT: 4570 case GUEST_SS_LIMIT: 4571 case GUEST_DS_LIMIT: 4572 case GUEST_FS_LIMIT: 4573 case GUEST_GS_LIMIT: 4574 case GUEST_LDTR_LIMIT: 4575 case GUEST_TR_LIMIT: 4576 case GUEST_GDTR_LIMIT: 4577 case GUEST_IDTR_LIMIT: 4578 case GUEST_ES_AR_BYTES: 4579 case GUEST_DS_AR_BYTES: 4580 case GUEST_FS_AR_BYTES: 4581 case GUEST_GS_AR_BYTES: 4582 case GUEST_LDTR_AR_BYTES: 4583 case GUEST_TR_AR_BYTES: 4584 case GUEST_ES_BASE: 4585 case GUEST_CS_BASE: 4586 case GUEST_SS_BASE: 4587 case GUEST_DS_BASE: 4588 case GUEST_FS_BASE: 4589 case GUEST_GS_BASE: 4590 case GUEST_LDTR_BASE: 4591 case GUEST_TR_BASE: 4592 case GUEST_GDTR_BASE: 4593 case GUEST_IDTR_BASE: 4594 case GUEST_PENDING_DBG_EXCEPTIONS: 4595 case GUEST_BNDCFGS: 4596 return true; 4597 default: 4598 break; 4599 } 4600 4601 return false; 4602 } 4603 4604 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4605 struct vmcs12 *vmcs12) 4606 { 4607 struct vcpu_vmx *vmx = to_vmx(vcpu); 4608 4609 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4610 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4611 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4612 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4613 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4614 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4615 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4616 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4617 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4618 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4619 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4620 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4621 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4622 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4623 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4624 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4625 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4626 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4627 
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4628 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4629 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4630 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4631 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4632 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4633 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4634 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4635 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4636 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4637 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4638 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4639 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4640 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4641 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4642 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4643 vmcs12->guest_pending_dbg_exceptions = 4644 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4645 4646 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4647 } 4648 4649 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4650 struct vmcs12 *vmcs12) 4651 { 4652 struct vcpu_vmx *vmx = to_vmx(vcpu); 4653 int cpu; 4654 4655 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4656 return; 4657 4658 4659 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4660 4661 cpu = get_cpu(); 4662 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4663 vmx_vcpu_load_vmcs(vcpu, cpu); 4664 4665 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4666 4667 vmx->loaded_vmcs = &vmx->vmcs01; 4668 vmx_vcpu_load_vmcs(vcpu, cpu); 4669 put_cpu(); 4670 } 4671 4672 /* 4673 * Update the guest state fields of vmcs12 to reflect changes that 4674 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4675 * VM-entry controls is also updated, since this is really a guest 4676 * state bit.) 4677 */ 4678 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4679 { 4680 struct vcpu_vmx *vmx = to_vmx(vcpu); 4681 4682 if (nested_vmx_is_evmptr12_valid(vmx)) 4683 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4684 4685 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4686 !nested_vmx_is_evmptr12_valid(vmx); 4687 4688 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4689 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4690 4691 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4692 vmcs12->guest_rip = kvm_rip_read(vcpu); 4693 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4694 4695 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4696 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4697 4698 vmcs12->guest_interruptibility_info = 4699 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4700 4701 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4702 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4703 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4704 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4705 else 4706 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4707 4708 if (nested_cpu_has_preemption_timer(vmcs12) && 4709 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4710 !vmx->nested.nested_run_pending) 4711 vmcs12->vmx_preemption_timer_value = 4712 vmx_get_preemption_timer_value(vcpu); 4713 4714 /* 4715 * In some cases (usually, nested EPT), L2 is allowed to change its 4716 * own CR3 without exiting. If it has changed it, we must keep it. 
4717 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4718 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4719 * 4720 * Additionally, restore L2's PDPTR to vmcs12. 4721 */ 4722 if (enable_ept) { 4723 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4724 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4725 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4726 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4727 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4728 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4729 } 4730 } 4731 4732 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4733 4734 if (nested_cpu_has_vid(vmcs12)) 4735 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4736 4737 vmcs12->vm_entry_controls = 4738 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4739 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4740 4741 /* 4742 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4743 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4744 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4745 * vmcs02 doesn't strictly track vmcs12. 4746 */ 4747 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4748 vmcs12->guest_dr7 = vcpu->arch.dr7; 4749 4750 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4751 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4752 4753 vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, 4754 &vmcs12->guest_ssp, 4755 &vmcs12->guest_ssp_tbl); 4756 } 4757 4758 /* 4759 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4760 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4761 * and this function updates it to reflect the changes to the guest state while 4762 * L2 was running (and perhaps made some exits which were handled directly by L0 4763 * without going back to L1), and to reflect the exit reason. 4764 * Note that we do not have to copy all VMCS fields here, just those that 4765 * could have been changed by the L2 guest or the exit - i.e., the guest-state and 4766 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4767 * which already writes to vmcs12 directly. 4768 */ 4769 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4770 u32 vm_exit_reason, u32 exit_intr_info, 4771 unsigned long exit_qualification, u32 exit_insn_len) 4772 { 4773 /* update exit information fields: */ 4774 vmcs12->vm_exit_reason = vm_exit_reason; 4775 if (vmx_get_exit_reason(vcpu).enclave_mode) 4776 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4777 vmcs12->exit_qualification = exit_qualification; 4778 4779 /* 4780 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4781 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4782 * exit info fields are unmodified. 4783 */ 4784 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4785 vmcs12->launch_state = 1; 4786 4787 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4788 * instead of reading the real value. */ 4789 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4790 4791 /* 4792 * Transfer the event that L0 or L1 may have wanted to inject into 4793 * L2 to IDT_VECTORING_INFO_FIELD.
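 * (e.g. an exception that was mid-delivery when the exit occurred is surfaced to L1 as IDT-vectoring info rather than being re-injected directly by KVM).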
4794 */ 4795 vmcs12_save_pending_event(vcpu, vmcs12, 4796 vm_exit_reason, exit_intr_info); 4797 4798 vmcs12->vm_exit_intr_info = exit_intr_info; 4799 vmcs12->vm_exit_instruction_len = exit_insn_len; 4800 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4801 4802 /* 4803 * According to spec, there's no need to store the guest's 4804 * MSRs if the exit is due to a VM-entry failure that occurs 4805 * during or after loading the guest state. Since this exit 4806 * does not fall in that category, we need to save the MSRs. 4807 */ 4808 if (nested_vmx_store_msr(vcpu, 4809 vmcs12->vm_exit_msr_store_addr, 4810 vmcs12->vm_exit_msr_store_count)) 4811 nested_vmx_abort(vcpu, 4812 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4813 } 4814 } 4815 4816 /* 4817 * A part of what we need to do when the nested L2 guest exits and we want to 4818 * run its L1 parent, is to reset L1's guest state to the host state specified 4819 * in vmcs12. 4820 * This function is to be called not only on normal nested exit, but also on 4821 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4822 * Failures During or After Loading Guest State"). 4823 * This function should be called when the active VMCS is L1's (vmcs01). 4824 */ 4825 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4826 struct vmcs12 *vmcs12) 4827 { 4828 enum vm_entry_failure_code ignored; 4829 struct kvm_segment seg; 4830 4831 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4832 vcpu->arch.efer = vmcs12->host_ia32_efer; 4833 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4834 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4835 else 4836 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4837 vmx_set_efer(vcpu, vcpu->arch.efer); 4838 4839 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4840 kvm_rip_write(vcpu, vmcs12->host_rip); 4841 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4842 vmx_set_interrupt_shadow(vcpu, 0); 4843 4844 /* 4845 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4846 * actually changed, because vmx_set_cr0 refers to efer set above. 4847 * 4848 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4849 * (KVM doesn't change it); 4850 */ 4851 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4852 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4853 4854 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4855 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4856 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4857 4858 nested_ept_uninit_mmu_context(vcpu); 4859 4860 /* 4861 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4862 * couldn't have changed. 4863 */ 4864 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4865 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4866 4867 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4868 4869 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4870 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4871 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4872 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4873 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4874 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4875 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4876 4877 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4878 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4879 vmcs_write64(GUEST_BNDCFGS, 0); 4880 4881 /* 4882 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set,
4883 * otherwise CET state should be retained across VM-exit, i.e., 4884 * guest values should be propagated from vmcs12 to vmcs01. 4885 */ 4886 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) 4887 vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, 4888 vmcs12->host_ssp_tbl); 4889 else 4890 vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, 4891 vmcs12->guest_ssp_tbl); 4892 4893 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4894 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4895 vcpu->arch.pat = vmcs12->host_ia32_pat; 4896 } 4897 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4898 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4899 WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4900 vmcs12->host_ia32_perf_global_ctrl)); 4901 4902 /* Set L1 segment info according to Intel SDM 4903 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4904 seg = (struct kvm_segment) { 4905 .base = 0, 4906 .limit = 0xFFFFFFFF, 4907 .selector = vmcs12->host_cs_selector, 4908 .type = 11, 4909 .present = 1, 4910 .s = 1, 4911 .g = 1 4912 }; 4913 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4914 seg.l = 1; 4915 else 4916 seg.db = 1; 4917 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4918 seg = (struct kvm_segment) { 4919 .base = 0, 4920 .limit = 0xFFFFFFFF, 4921 .type = 3, 4922 .present = 1, 4923 .s = 1, 4924 .db = 1, 4925 .g = 1 4926 }; 4927 seg.selector = vmcs12->host_ds_selector; 4928 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4929 seg.selector = vmcs12->host_es_selector; 4930 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4931 seg.selector = vmcs12->host_ss_selector; 4932 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4933 seg.selector = vmcs12->host_fs_selector; 4934 seg.base = vmcs12->host_fs_base; 4935 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4936 seg.selector = vmcs12->host_gs_selector; 4937 seg.base = vmcs12->host_gs_base; 4938 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4939 seg = (struct kvm_segment) { 4940 .base = vmcs12->host_tr_base, 4941 .limit = 0x67, 4942 .selector = vmcs12->host_tr_selector, 4943 .type = 11, 4944 .present = 1 4945 }; 4946 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4947 4948 memset(&seg, 0, sizeof(seg)); 4949 seg.unusable = 1; 4950 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4951 4952 kvm_set_dr(vcpu, 7, 0x400); 4953 vmx_guest_debugctl_write(vcpu, 0); 4954 4955 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4956 vmcs12->vm_exit_msr_load_count)) 4957 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4958 4959 to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4960 } 4961 4962 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4963 { 4964 struct vmx_uret_msr *efer_msr; 4965 unsigned int i; 4966 4967 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4968 return vmcs_read64(GUEST_IA32_EFER); 4969 4970 if (cpu_has_load_ia32_efer()) 4971 return kvm_host.efer; 4972 4973 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4974 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4975 return vmx->msr_autoload.guest.val[i].value; 4976 } 4977 4978 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4979 if (efer_msr) 4980 return efer_msr->data; 4981 4982 return kvm_host.efer; 4983 } 4984 4985 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4986 { 4987 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4988 struct vcpu_vmx *vmx = to_vmx(vcpu); 4989 struct vmx_msr_entry g, h; 4990 gpa_t gpa; 4991 u32 
i, j; 4992 4993 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4994 4995 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4996 /* 4997 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4998 * as vmcs01.GUEST_DR7 contains a userspace defined value 4999 * and vcpu->arch.dr7 is not squirreled away before the 5000 * nested VMENTER (not worth adding a variable in nested_vmx). 5001 */ 5002 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 5003 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 5004 else 5005 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 5006 } 5007 5008 /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ 5009 vmx_reload_guest_debugctl(vcpu); 5010 5011 /* 5012 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 5013 * handle a variety of side effects to KVM's software model. 5014 */ 5015 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 5016 5017 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 5018 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 5019 5020 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 5021 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 5022 5023 nested_ept_uninit_mmu_context(vcpu); 5024 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 5025 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 5026 5027 /* 5028 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 5029 * from vmcs01 (if necessary). The PDPTRs are not loaded on 5030 * VMFail, like everything else we just need to ensure our 5031 * software model is up-to-date. 5032 */ 5033 if (enable_ept && is_pae_paging(vcpu)) 5034 ept_save_pdptrs(vcpu); 5035 5036 kvm_mmu_reset_context(vcpu); 5037 5038 /* 5039 * This nasty bit of open coding is a compromise between blindly 5040 * loading L1's MSRs using the exit load lists (incorrect emulation 5041 * of VMFail), leaving the nested VM's MSRs in the software model 5042 * (incorrect behavior) and snapshotting the modified MSRs (too 5043 * expensive since the lists are unbound by hardware). For each 5044 * MSR that was (prematurely) loaded from the nested VMEntry load 5045 * list, reload it from the exit load list if it exists and differs 5046 * from the guest value. The intent is to stuff host state as 5047 * silently as possible, not to fully process the exit load list. 
5048 */ 5049 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 5050 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 5051 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 5052 pr_debug_ratelimited( 5053 "%s read MSR index failed (%u, 0x%08llx)\n", 5054 __func__, i, gpa); 5055 goto vmabort; 5056 } 5057 5058 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 5059 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 5060 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 5061 pr_debug_ratelimited( 5062 "%s read MSR failed (%u, 0x%08llx)\n", 5063 __func__, j, gpa); 5064 goto vmabort; 5065 } 5066 if (h.index != g.index) 5067 continue; 5068 if (h.value == g.value) 5069 break; 5070 5071 if (nested_vmx_load_msr_check(vcpu, &h)) { 5072 pr_debug_ratelimited( 5073 "%s check failed (%u, 0x%x, 0x%x)\n", 5074 __func__, j, h.index, h.reserved); 5075 goto vmabort; 5076 } 5077 5078 if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { 5079 pr_debug_ratelimited( 5080 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 5081 __func__, j, h.index, h.value); 5082 goto vmabort; 5083 } 5084 } 5085 } 5086 5087 return; 5088 5089 vmabort: 5090 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 5091 } 5092 5093 /* 5094 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 5095 * and modify vmcs12 to make it see what it would expect to see there if 5096 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 5097 */ 5098 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 5099 u32 exit_intr_info, unsigned long exit_qualification, 5100 u32 exit_insn_len) 5101 { 5102 struct vcpu_vmx *vmx = to_vmx(vcpu); 5103 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5104 5105 /* Pending MTF traps are discarded on VM-Exit. */ 5106 vmx->nested.mtf_pending = false; 5107 5108 /* trying to cancel vmlaunch/vmresume is a bug */ 5109 WARN_ON_ONCE(vmx->nested.nested_run_pending); 5110 5111 #ifdef CONFIG_KVM_HYPERV 5112 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 5113 /* 5114 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 5115 * Enlightened VMCS after migration and we still need to 5116 * do that when something is forcing L2->L1 exit prior to 5117 * the first L2 run. 5118 */ 5119 (void)nested_get_evmcs_page(vcpu); 5120 } 5121 #endif 5122 5123 /* Service pending TLB flush requests for L2 before switching to L1. */ 5124 kvm_service_local_tlb_flush_requests(vcpu); 5125 5126 /* 5127 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 5128 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 5129 * up-to-date before switching to L1. 5130 */ 5131 if (enable_ept && is_pae_paging(vcpu)) 5132 vmx_ept_load_pdptrs(vcpu); 5133 5134 leave_guest_mode(vcpu); 5135 5136 if (nested_cpu_has_preemption_timer(vmcs12)) 5137 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 5138 5139 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 5140 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 5141 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 5142 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 5143 } 5144 5145 if (likely(!vmx->fail)) { 5146 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5147 5148 if (vm_exit_reason != -1) 5149 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 5150 exit_intr_info, exit_qualification, 5151 exit_insn_len); 5152 5153 /* 5154 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 5155 * also be used to capture vmcs12 cache as part of 5156 * capturing nVMX state for snapshot (migration). 
5157 * 5158 * Otherwise, this flush will dirty guest memory at a 5159 * point it is already assumed by user-space to be 5160 * immutable. 5161 */ 5162 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 5163 } else { 5164 /* 5165 * The only expected VM-instruction error is "VM entry with 5166 * invalid control field(s)." Anything else indicates a 5167 * problem with L0. And we should never get here with a 5168 * VMFail of any type if early consistency checks are enabled. 5169 */ 5170 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5171 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5172 WARN_ON_ONCE(nested_early_check); 5173 } 5174 5175 /* 5176 * Drop events/exceptions that were queued for re-injection to L2 5177 * (picked up via vmx_complete_interrupts()), as well as exceptions 5178 * that were pending for L2. Note, this must NOT be hoisted above 5179 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5180 * be captured in vmcs12 (see vmcs12_save_pending_event()). 5181 */ 5182 vcpu->arch.nmi_injected = false; 5183 kvm_clear_exception_queue(vcpu); 5184 kvm_clear_interrupt_queue(vcpu); 5185 5186 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5187 5188 kvm_nested_vmexit_handle_ibrs(vcpu); 5189 5190 /* Update any VMCS fields that might have changed while L2 ran */ 5191 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5192 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5193 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5194 if (kvm_caps.has_tsc_control) 5195 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5196 5197 if (vmx->nested.l1_tpr_threshold != -1) 5198 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 5199 5200 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 5201 vmx->nested.change_vmcs01_virtual_apic_mode = false; 5202 vmx_set_virtual_apic_mode(vcpu); 5203 } 5204 5205 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 5206 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 5207 vmx_update_cpu_dirty_logging(vcpu); 5208 } 5209 5210 nested_put_vmcs12_pages(vcpu); 5211 5212 if (vmx->nested.reload_vmcs01_apic_access_page) { 5213 vmx->nested.reload_vmcs01_apic_access_page = false; 5214 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5215 } 5216 5217 if (vmx->nested.update_vmcs01_apicv_status) { 5218 vmx->nested.update_vmcs01_apicv_status = false; 5219 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 5220 } 5221 5222 if (vmx->nested.update_vmcs01_hwapic_isr) { 5223 vmx->nested.update_vmcs01_hwapic_isr = false; 5224 kvm_apic_update_hwapic_isr(vcpu); 5225 } 5226 5227 if ((vm_exit_reason != -1) && 5228 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5229 vmx->nested.need_vmcs12_to_shadow_sync = true; 5230 5231 /* in case we halted in L2 */ 5232 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5233 5234 if (likely(!vmx->fail)) { 5235 if (vm_exit_reason != -1) 5236 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5237 vmcs12->exit_qualification, 5238 vmcs12->idt_vectoring_info_field, 5239 vmcs12->vm_exit_intr_info, 5240 vmcs12->vm_exit_intr_error_code, 5241 KVM_ISA_VMX); 5242 5243 load_vmcs12_host_state(vcpu, vmcs12); 5244 5245 /* 5246 * Process events if an injectable IRQ or NMI is pending, even 5247 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5248 * If an event became pending while L2 was active, KVM needs to 5249 * either inject the event or request an IRQ/NMI window. SMIs 5250 * don't need to be processed as SMM is mutually exclusive with 5251 * non-root mode. 
INIT/SIPI don't need to be checked as INIT 5252 * is blocked post-VMXON, and SIPIs are ignored. 5253 */ 5254 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5255 kvm_make_request(KVM_REQ_EVENT, vcpu); 5256 return; 5257 } 5258 5259 /* 5260 * After an early L2 VM-entry failure, we're now back 5261 * in L1 which thinks it just finished a VMLAUNCH or 5262 * VMRESUME instruction, so we need to set the failure 5263 * flag and the VM-instruction error field of the VMCS 5264 * accordingly, and skip the emulated instruction. 5265 */ 5266 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5267 5268 /* 5269 * Restore L1's host state to KVM's software model. We're here 5270 * because a consistency check was caught by hardware, which 5271 * means some amount of guest state has been propagated to KVM's 5272 * model and needs to be unwound to the host's state. 5273 */ 5274 nested_vmx_restore_host_state(vcpu); 5275 5276 vmx->fail = 0; 5277 } 5278 5279 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5280 { 5281 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5282 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5283 } 5284 5285 /* 5286 * Decode the memory-address operand of a vmx instruction, as recorded on an 5287 * exit caused by such an instruction (run by a guest hypervisor). 5288 * On success, returns 0. When the operand is invalid, returns 1 and throws 5289 * #UD, #GP, or #SS. 5290 */ 5291 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5292 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5293 { 5294 gva_t off; 5295 bool exn; 5296 struct kvm_segment s; 5297 5298 /* 5299 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5300 * Execution", on an exit, vmx_instruction_info holds most of the 5301 * addressing components of the operand. Only the displacement part 5302 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5303 * For how an actual address is calculated from all these components, 5304 * refer to Vol. 1, "Operand Addressing". 5305 */ 5306 int scaling = vmx_instruction_info & 3; 5307 int addr_size = (vmx_instruction_info >> 7) & 7; 5308 bool is_reg = vmx_instruction_info & (1u << 10); 5309 int seg_reg = (vmx_instruction_info >> 15) & 7; 5310 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5311 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5312 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5313 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5314 5315 if (is_reg) { 5316 kvm_queue_exception(vcpu, UD_VECTOR); 5317 return 1; 5318 } 5319 5320 /* Addr = segment_base + offset */ 5321 /* offset = base + [index * scale] + displacement */ 5322 off = exit_qualification; /* holds the displacement */ 5323 if (addr_size == 1) 5324 off = (gva_t)sign_extend64(off, 31); 5325 else if (addr_size == 0) 5326 off = (gva_t)sign_extend64(off, 15); 5327 if (base_is_valid) 5328 off += kvm_register_read(vcpu, base_reg); 5329 if (index_is_valid) 5330 off += kvm_register_read(vcpu, index_reg) << scaling; 5331 vmx_get_segment(vcpu, &s, seg_reg); 5332 5333 /* 5334 * The effective address, i.e. @off, of a memory operand is truncated 5335 * based on the address size of the instruction. Note that this is 5336 * the *effective address*, i.e. the address prior to accounting for 5337 * the segment's base. 
5338 */ 5339 if (addr_size == 1) /* 32 bit */ 5340 off &= 0xffffffff; 5341 else if (addr_size == 0) /* 16 bit */ 5342 off &= 0xffff; 5343 5344 /* Checks for #GP/#SS exceptions. */ 5345 exn = false; 5346 if (is_long_mode(vcpu)) { 5347 /* 5348 * The virtual/linear address is never truncated in 64-bit 5349 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5350 * address when using FS/GS with a non-zero base. 5351 */ 5352 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5353 *ret = s.base + off; 5354 else 5355 *ret = off; 5356 5357 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5358 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5359 * non-canonical form. This is the only check on the memory 5360 * destination for long mode! 5361 */ 5362 exn = is_noncanonical_address(*ret, vcpu, 0); 5363 } else { 5364 /* 5365 * When not in long mode, the virtual/linear address is 5366 * unconditionally truncated to 32 bits regardless of the 5367 * address size. 5368 */ 5369 *ret = (s.base + off) & 0xffffffff; 5370 5371 /* Protected mode: apply checks for segment validity in the 5372 * following order: 5373 * - segment type check (#GP(0) may be thrown) 5374 * - usability check (#GP(0)/#SS(0)) 5375 * - limit check (#GP(0)/#SS(0)) 5376 */ 5377 if (wr) 5378 /* #GP(0) if the destination operand is located in a 5379 * read-only data segment or any code segment. 5380 */ 5381 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5382 else 5383 /* #GP(0) if the source operand is located in an 5384 * execute-only code segment 5385 */ 5386 exn = ((s.type & 0xa) == 8); 5387 if (exn) { 5388 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5389 return 1; 5390 } 5391 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5392 */ 5393 exn = (s.unusable != 0); 5394 5395 /* 5396 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5397 * outside the segment limit. All CPUs that support VMX ignore 5398 * limit checks for flat segments, i.e. segments with base==0, 5399 * limit==0xffffffff and of type expand-up data or code. 5400 */ 5401 if (!(s.base == 0 && s.limit == 0xffffffff && 5402 ((s.type & 8) || !(s.type & 4)))) 5403 exn = exn || ((u64)off + len - 1 > s.limit); 5404 } 5405 if (exn) { 5406 kvm_queue_exception_e(vcpu, 5407 seg_reg == VCPU_SREG_SS ? 5408 SS_VECTOR : GP_VECTOR, 5409 0); 5410 return 1; 5411 } 5412 5413 return 0; 5414 } 5415 5416 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5417 int *ret) 5418 { 5419 gva_t gva; 5420 struct x86_exception e; 5421 int r; 5422 5423 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5424 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5425 sizeof(*vmpointer), &gva)) { 5426 *ret = 1; 5427 return -EINVAL; 5428 } 5429 5430 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5431 if (r != X86EMUL_CONTINUE) { 5432 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5433 return -EINVAL; 5434 } 5435 5436 return 0; 5437 } 5438 5439 /* 5440 * Allocate a shadow VMCS and associate it with the currently loaded 5441 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5442 * VMCS is also VMCLEARed, so that it is ready for use. 5443 */ 5444 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5445 { 5446 struct vcpu_vmx *vmx = to_vmx(vcpu); 5447 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5448 5449 /* 5450 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5451 * when L1 executes VMXOFF or the vCPU is forced out of nested 5452 * operation. 
VMXON faults if the CPU is already post-VMXON, so it 5453 * should be impossible to already have an allocated shadow VMCS. KVM 5454 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5455 * always be the loaded VMCS. 5456 */ 5457 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5458 return loaded_vmcs->shadow_vmcs; 5459 5460 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5461 if (loaded_vmcs->shadow_vmcs) 5462 vmcs_clear(loaded_vmcs->shadow_vmcs); 5463 5464 return loaded_vmcs->shadow_vmcs; 5465 } 5466 5467 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5468 { 5469 struct vcpu_vmx *vmx = to_vmx(vcpu); 5470 int r; 5471 5472 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5473 if (r < 0) 5474 goto out_vmcs02; 5475 5476 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5477 if (!vmx->nested.cached_vmcs12) 5478 goto out_cached_vmcs12; 5479 5480 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5481 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5482 if (!vmx->nested.cached_shadow_vmcs12) 5483 goto out_cached_shadow_vmcs12; 5484 5485 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5486 goto out_shadow_vmcs; 5487 5488 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5489 HRTIMER_MODE_ABS_PINNED); 5490 5491 vmx->nested.vpid02 = allocate_vpid(); 5492 5493 vmx->nested.vmcs02_initialized = false; 5494 vmx->nested.vmxon = true; 5495 5496 if (vmx_pt_mode_is_host_guest()) { 5497 vmx->pt_desc.guest.ctl = 0; 5498 pt_update_intercept_for_msr(vcpu); 5499 } 5500 5501 return 0; 5502 5503 out_shadow_vmcs: 5504 kfree(vmx->nested.cached_shadow_vmcs12); 5505 5506 out_cached_shadow_vmcs12: 5507 kfree(vmx->nested.cached_vmcs12); 5508 5509 out_cached_vmcs12: 5510 free_loaded_vmcs(&vmx->nested.vmcs02); 5511 5512 out_vmcs02: 5513 return -ENOMEM; 5514 } 5515 5516 /* Emulate the VMXON instruction. */ 5517 static int handle_vmxon(struct kvm_vcpu *vcpu) 5518 { 5519 int ret; 5520 gpa_t vmptr; 5521 uint32_t revision; 5522 struct vcpu_vmx *vmx = to_vmx(vcpu); 5523 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5524 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5525 5526 /* 5527 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5528 * the guest and so cannot rely on hardware to perform the check, 5529 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5530 * for VMXON). 5531 * 5532 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5533 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5534 * force any of the relevant guest state. For a restricted guest, KVM 5535 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5536 * Real Mode, and so there's no need to check CR0.PE manually. 5537 */ 5538 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5539 kvm_queue_exception(vcpu, UD_VECTOR); 5540 return 1; 5541 } 5542 5543 /* 5544 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5545 * and has higher priority than the VM-Fail due to being post-VMXON, 5546 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5547 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5548 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5549 * VMX non-root. 5550 * 5551 * Forwarding the VM-Exit unconditionally, i.e. 
without performing the 5552 * #UD checks (see above), is functionally ok because KVM doesn't allow 5553 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5554 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5555 * missed by hardware due to shadowing CR0 and/or CR4. 5556 */ 5557 if (vmx_get_cpl(vcpu)) { 5558 kvm_inject_gp(vcpu, 0); 5559 return 1; 5560 } 5561 5562 if (vmx->nested.vmxon) 5563 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5564 5565 /* 5566 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5567 * only if the vCPU isn't already in VMX operation, i.e. effectively 5568 * have lower priority than the VM-Fail above. 5569 */ 5570 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5571 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5572 kvm_inject_gp(vcpu, 0); 5573 return 1; 5574 } 5575 5576 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5577 != VMXON_NEEDED_FEATURES) { 5578 kvm_inject_gp(vcpu, 0); 5579 return 1; 5580 } 5581 5582 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5583 return ret; 5584 5585 /* 5586 * SDM 3: 24.11.5 5587 * The first 4 bytes of VMXON region contain the supported 5588 * VMCS revision identifier 5589 * 5590 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5591 * which replaces physical address width with 32 5592 */ 5593 if (!page_address_valid(vcpu, vmptr)) 5594 return nested_vmx_failInvalid(vcpu); 5595 5596 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5597 revision != VMCS12_REVISION) 5598 return nested_vmx_failInvalid(vcpu); 5599 5600 vmx->nested.vmxon_ptr = vmptr; 5601 ret = enter_vmx_operation(vcpu); 5602 if (ret) 5603 return ret; 5604 5605 return nested_vmx_succeed(vcpu); 5606 } 5607 5608 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5609 { 5610 struct vcpu_vmx *vmx = to_vmx(vcpu); 5611 5612 if (vmx->nested.current_vmptr == INVALID_GPA) 5613 return; 5614 5615 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5616 5617 if (enable_shadow_vmcs) { 5618 /* copy to memory all shadowed fields in case 5619 they were modified */ 5620 copy_shadow_to_vmcs12(vmx); 5621 vmx_disable_shadow_vmcs(vmx); 5622 } 5623 vmx->nested.posted_intr_nv = -1; 5624 5625 /* Flush VMCS12 to guest memory */ 5626 kvm_vcpu_write_guest_page(vcpu, 5627 vmx->nested.current_vmptr >> PAGE_SHIFT, 5628 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5629 5630 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5631 5632 vmx->nested.current_vmptr = INVALID_GPA; 5633 } 5634 5635 /* Emulate the VMXOFF instruction */ 5636 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5637 { 5638 if (!nested_vmx_check_permission(vcpu)) 5639 return 1; 5640 5641 free_nested(vcpu); 5642 5643 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5644 kvm_make_request(KVM_REQ_EVENT, vcpu); 5645 5646 return nested_vmx_succeed(vcpu); 5647 } 5648 5649 /* Emulate the VMCLEAR instruction */ 5650 static int handle_vmclear(struct kvm_vcpu *vcpu) 5651 { 5652 struct vcpu_vmx *vmx = to_vmx(vcpu); 5653 u32 zero = 0; 5654 gpa_t vmptr; 5655 int r; 5656 5657 if (!nested_vmx_check_permission(vcpu)) 5658 return 1; 5659 5660 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5661 return r; 5662 5663 if (!page_address_valid(vcpu, vmptr)) 5664 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5665 5666 if (vmptr == vmx->nested.vmxon_ptr) 5667 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5668 5669 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 
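/* vmptr refers to an ordinary (non-enlightened) VMCS: if it is the current VMCS, flush the cached vmcs12 back to guest memory via nested_release_vmcs12(), then emulate VMCLEAR by zeroing the VMCS's launch_state in guest memory. */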
5670 if (vmptr == vmx->nested.current_vmptr) 5671 nested_release_vmcs12(vcpu); 5672 5673 /* 5674 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5675 * for VMCLEAR includes a "ensure that data for VMCS referenced 5676 * by the operand is in memory" clause that guards writes to 5677 * memory, i.e. doing nothing for I/O is architecturally valid. 5678 * 5679 * FIXME: Suppress failures if and only if no memslot is found, 5680 * i.e. exit to userspace if __copy_to_user() fails. 5681 */ 5682 (void)kvm_vcpu_write_guest(vcpu, 5683 vmptr + offsetof(struct vmcs12, 5684 launch_state), 5685 &zero, sizeof(zero)); 5686 } 5687 5688 return nested_vmx_succeed(vcpu); 5689 } 5690 5691 /* Emulate the VMLAUNCH instruction */ 5692 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5693 { 5694 return nested_vmx_run(vcpu, true); 5695 } 5696 5697 /* Emulate the VMRESUME instruction */ 5698 static int handle_vmresume(struct kvm_vcpu *vcpu) 5699 { 5700 5701 return nested_vmx_run(vcpu, false); 5702 } 5703 5704 static int handle_vmread(struct kvm_vcpu *vcpu) 5705 { 5706 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5707 : get_vmcs12(vcpu); 5708 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5709 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5710 struct vcpu_vmx *vmx = to_vmx(vcpu); 5711 struct x86_exception e; 5712 unsigned long field; 5713 u64 value; 5714 gva_t gva = 0; 5715 short offset; 5716 int len, r; 5717 5718 if (!nested_vmx_check_permission(vcpu)) 5719 return 1; 5720 5721 /* Decode instruction info and find the field to read */ 5722 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5723 5724 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5725 /* 5726 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5727 * any VMREAD sets the ALU flags for VMfailInvalid. 5728 */ 5729 if (vmx->nested.current_vmptr == INVALID_GPA || 5730 (is_guest_mode(vcpu) && 5731 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5732 return nested_vmx_failInvalid(vcpu); 5733 5734 offset = get_vmcs12_field_offset(field); 5735 if (offset < 0) 5736 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5737 5738 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5739 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5740 5741 /* Read the field, zero-extended to a u64 value */ 5742 value = vmcs12_read_any(vmcs12, field, offset); 5743 } else { 5744 /* 5745 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5746 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5747 * unsupported. Unfortunately, certain versions of Windows 11 5748 * don't comply with this requirement which is not enforced in 5749 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5750 * workaround, as misbehaving guests will panic on VM-Fail. 5751 * Note, enlightened VMCS is incompatible with shadow VMCS so 5752 * all VMREADs from L2 should go to L1. 5753 */ 5754 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5755 return nested_vmx_failInvalid(vcpu); 5756 5757 offset = evmcs_field_offset(field, NULL); 5758 if (offset < 0) 5759 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5760 5761 /* Read the field, zero-extended to a u64 value */ 5762 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5763 } 5764 5765 /* 5766 * Now copy part of this value to register or memory, as requested. 5767 * Note that the number of bits actually copied is 32 or 64 depending 5768 * on the guest's mode (32 or 64 bit), not on the given field's length. 
5769 */ 5770 if (instr_info & BIT(10)) { 5771 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5772 } else { 5773 len = is_64_bit_mode(vcpu) ? 8 : 4; 5774 if (get_vmx_mem_address(vcpu, exit_qualification, 5775 instr_info, true, len, &gva)) 5776 return 1; 5777 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5778 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5779 if (r != X86EMUL_CONTINUE) 5780 return kvm_handle_memory_failure(vcpu, r, &e); 5781 } 5782 5783 return nested_vmx_succeed(vcpu); 5784 } 5785 5786 static bool is_shadow_field_rw(unsigned long field) 5787 { 5788 switch (field) { 5789 #define SHADOW_FIELD_RW(x, y) case x: 5790 #include "vmcs_shadow_fields.h" 5791 return true; 5792 default: 5793 break; 5794 } 5795 return false; 5796 } 5797 5798 static bool is_shadow_field_ro(unsigned long field) 5799 { 5800 switch (field) { 5801 #define SHADOW_FIELD_RO(x, y) case x: 5802 #include "vmcs_shadow_fields.h" 5803 return true; 5804 default: 5805 break; 5806 } 5807 return false; 5808 } 5809 5810 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5811 { 5812 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5813 : get_vmcs12(vcpu); 5814 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5815 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5816 struct vcpu_vmx *vmx = to_vmx(vcpu); 5817 struct x86_exception e; 5818 unsigned long field; 5819 short offset; 5820 gva_t gva; 5821 int len, r; 5822 5823 /* 5824 * The value to write might be 32 or 64 bits, depending on L1's long 5825 * mode, and eventually we need to write that into a field of several 5826 * possible lengths. The code below first zero-extends the value to 64 5827 * bit (value), and then copies only the appropriate number of 5828 * bits into the vmcs12 field. 5829 */ 5830 u64 value = 0; 5831 5832 if (!nested_vmx_check_permission(vcpu)) 5833 return 1; 5834 5835 /* 5836 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5837 * any VMWRITE sets the ALU flags for VMfailInvalid. 5838 */ 5839 if (vmx->nested.current_vmptr == INVALID_GPA || 5840 (is_guest_mode(vcpu) && 5841 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5842 return nested_vmx_failInvalid(vcpu); 5843 5844 if (instr_info & BIT(10)) 5845 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5846 else { 5847 len = is_64_bit_mode(vcpu) ? 8 : 4; 5848 if (get_vmx_mem_address(vcpu, exit_qualification, 5849 instr_info, false, len, &gva)) 5850 return 1; 5851 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5852 if (r != X86EMUL_CONTINUE) 5853 return kvm_handle_memory_failure(vcpu, r, &e); 5854 } 5855 5856 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5857 5858 offset = get_vmcs12_field_offset(field); 5859 if (offset < 0) 5860 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5861 5862 /* 5863 * If the vCPU supports "VMWRITE to any supported field in the 5864 * VMCS," then the "read-only" fields are actually read/write. 5865 */ 5866 if (vmcs_field_readonly(field) && 5867 !nested_cpu_has_vmwrite_any_field(vcpu)) 5868 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5869 5870 /* 5871 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5872 * vmcs12, else we may crush a field or consume a stale value. 
5873 */ 5874 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5875 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5876 5877 /* 5878 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5879 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5880 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5881 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5882 * from L1 will return a different value than VMREAD from L2 (L1 sees 5883 * the stripped down value, L2 sees the full value as stored by KVM). 5884 */ 5885 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5886 value &= 0x1f0ff; 5887 5888 vmcs12_write_any(vmcs12, field, offset, value); 5889 5890 /* 5891 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5892 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5893 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5894 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5895 */ 5896 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5897 /* 5898 * L1 can read these fields without exiting, ensure the 5899 * shadow VMCS is up-to-date. 5900 */ 5901 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5902 preempt_disable(); 5903 vmcs_load(vmx->vmcs01.shadow_vmcs); 5904 5905 __vmcs_writel(field, value); 5906 5907 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5908 vmcs_load(vmx->loaded_vmcs->vmcs); 5909 preempt_enable(); 5910 } 5911 vmx->nested.dirty_vmcs12 = true; 5912 } 5913 5914 return nested_vmx_succeed(vcpu); 5915 } 5916 5917 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5918 { 5919 vmx->nested.current_vmptr = vmptr; 5920 if (enable_shadow_vmcs) { 5921 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5922 vmcs_write64(VMCS_LINK_POINTER, 5923 __pa(vmx->vmcs01.shadow_vmcs)); 5924 vmx->nested.need_vmcs12_to_shadow_sync = true; 5925 } 5926 vmx->nested.dirty_vmcs12 = true; 5927 vmx->nested.force_msr_bitmap_recalc = true; 5928 } 5929 5930 /* Emulate the VMPTRLD instruction */ 5931 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5932 { 5933 struct vcpu_vmx *vmx = to_vmx(vcpu); 5934 gpa_t vmptr; 5935 int r; 5936 5937 if (!nested_vmx_check_permission(vcpu)) 5938 return 1; 5939 5940 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5941 return r; 5942 5943 if (!page_address_valid(vcpu, vmptr)) 5944 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5945 5946 if (vmptr == vmx->nested.vmxon_ptr) 5947 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5948 5949 /* Forbid normal VMPTRLD if Enlightened version was used */ 5950 if (nested_vmx_is_evmptr12_valid(vmx)) 5951 return 1; 5952 5953 if (vmx->nested.current_vmptr != vmptr) { 5954 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5955 struct vmcs_hdr hdr; 5956 5957 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5958 /* 5959 * Reads from an unbacked page return all 1s, 5960 * which means that the 32 bits located at the 5961 * given physical address won't match the required 5962 * VMCS12_REVISION identifier. 
5963 */ 5964 return nested_vmx_fail(vcpu, 5965 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5966 } 5967 5968 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5969 offsetof(struct vmcs12, hdr), 5970 sizeof(hdr))) { 5971 return nested_vmx_fail(vcpu, 5972 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5973 } 5974 5975 if (hdr.revision_id != VMCS12_REVISION || 5976 (hdr.shadow_vmcs && 5977 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5978 return nested_vmx_fail(vcpu, 5979 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5980 } 5981 5982 nested_release_vmcs12(vcpu); 5983 5984 /* 5985 * Load VMCS12 from guest memory since it is not already 5986 * cached. 5987 */ 5988 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5989 VMCS12_SIZE)) { 5990 return nested_vmx_fail(vcpu, 5991 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5992 } 5993 5994 set_current_vmptr(vmx, vmptr); 5995 } 5996 5997 return nested_vmx_succeed(vcpu); 5998 } 5999 6000 /* Emulate the VMPTRST instruction */ 6001 static int handle_vmptrst(struct kvm_vcpu *vcpu) 6002 { 6003 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6004 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6005 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 6006 struct x86_exception e; 6007 gva_t gva; 6008 int r; 6009 6010 if (!nested_vmx_check_permission(vcpu)) 6011 return 1; 6012 6013 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 6014 return 1; 6015 6016 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 6017 true, sizeof(gpa_t), &gva)) 6018 return 1; 6019 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 6020 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 6021 sizeof(gpa_t), &e); 6022 if (r != X86EMUL_CONTINUE) 6023 return kvm_handle_memory_failure(vcpu, r, &e); 6024 6025 return nested_vmx_succeed(vcpu); 6026 } 6027 6028 /* Emulate the INVEPT instruction */ 6029 static int handle_invept(struct kvm_vcpu *vcpu) 6030 { 6031 struct vcpu_vmx *vmx = to_vmx(vcpu); 6032 u32 vmx_instruction_info, types; 6033 unsigned long type, roots_to_free; 6034 struct kvm_mmu *mmu; 6035 gva_t gva; 6036 struct x86_exception e; 6037 struct { 6038 u64 eptp, gpa; 6039 } operand; 6040 int i, r, gpr_index; 6041 6042 if (!(vmx->nested.msrs.secondary_ctls_high & 6043 SECONDARY_EXEC_ENABLE_EPT) || 6044 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 6045 kvm_queue_exception(vcpu, UD_VECTOR); 6046 return 1; 6047 } 6048 6049 if (!nested_vmx_check_permission(vcpu)) 6050 return 1; 6051 6052 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6053 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6054 type = kvm_register_read(vcpu, gpr_index); 6055 6056 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 6057 6058 if (type >= 32 || !(types & (1 << type))) 6059 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6060 6061 /* According to the Intel VMX instruction reference, the memory 6062 * operand is read even if it isn't needed (e.g., for type==global) 6063 */ 6064 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6065 vmx_instruction_info, false, sizeof(operand), &gva)) 6066 return 1; 6067 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6068 if (r != X86EMUL_CONTINUE) 6069 return kvm_handle_memory_failure(vcpu, r, &e); 6070 6071 /* 6072 * Nested EPT roots are always held through guest_mmu, 6073 * not root_mmu. 
6074 */ 6075 mmu = &vcpu->arch.guest_mmu; 6076 6077 switch (type) { 6078 case VMX_EPT_EXTENT_CONTEXT: 6079 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 6080 return nested_vmx_fail(vcpu, 6081 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6082 6083 roots_to_free = 0; 6084 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 6085 operand.eptp)) 6086 roots_to_free |= KVM_MMU_ROOT_CURRENT; 6087 6088 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 6089 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 6090 mmu->prev_roots[i].pgd, 6091 operand.eptp)) 6092 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 6093 } 6094 break; 6095 case VMX_EPT_EXTENT_GLOBAL: 6096 roots_to_free = KVM_MMU_ROOTS_ALL; 6097 break; 6098 default: 6099 BUG(); 6100 break; 6101 } 6102 6103 if (roots_to_free) 6104 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 6105 6106 return nested_vmx_succeed(vcpu); 6107 } 6108 6109 static int handle_invvpid(struct kvm_vcpu *vcpu) 6110 { 6111 struct vcpu_vmx *vmx = to_vmx(vcpu); 6112 u32 vmx_instruction_info; 6113 unsigned long type, types; 6114 gva_t gva; 6115 struct x86_exception e; 6116 struct { 6117 u64 vpid; 6118 u64 gla; 6119 } operand; 6120 u16 vpid02; 6121 int r, gpr_index; 6122 6123 if (!(vmx->nested.msrs.secondary_ctls_high & 6124 SECONDARY_EXEC_ENABLE_VPID) || 6125 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 6126 kvm_queue_exception(vcpu, UD_VECTOR); 6127 return 1; 6128 } 6129 6130 if (!nested_vmx_check_permission(vcpu)) 6131 return 1; 6132 6133 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6134 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6135 type = kvm_register_read(vcpu, gpr_index); 6136 6137 types = (vmx->nested.msrs.vpid_caps & 6138 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 6139 6140 if (type >= 32 || !(types & (1 << type))) 6141 return nested_vmx_fail(vcpu, 6142 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6143 6144 /* according to the intel vmx instruction reference, the memory 6145 * operand is read even if it isn't needed (e.g., for type==global) 6146 */ 6147 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6148 vmx_instruction_info, false, sizeof(operand), &gva)) 6149 return 1; 6150 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6151 if (r != X86EMUL_CONTINUE) 6152 return kvm_handle_memory_failure(vcpu, r, &e); 6153 6154 if (operand.vpid >> 16) 6155 return nested_vmx_fail(vcpu, 6156 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6157 6158 /* 6159 * Always flush the effective vpid02, i.e. never flush the current VPID 6160 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6161 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6162 * irrelevant (and there may not be a loaded vmcs12). 6163 */ 6164 vpid02 = nested_get_vpid02(vcpu); 6165 switch (type) { 6166 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6167 /* 6168 * LAM doesn't apply to addresses that are inputs to TLB 6169 * invalidation. 
6170 */ 6171 if (!operand.vpid || 6172 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6173 return nested_vmx_fail(vcpu, 6174 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6175 vpid_sync_vcpu_addr(vpid02, operand.gla); 6176 break; 6177 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6178 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6179 if (!operand.vpid) 6180 return nested_vmx_fail(vcpu, 6181 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6182 vpid_sync_context(vpid02); 6183 break; 6184 case VMX_VPID_EXTENT_ALL_CONTEXT: 6185 vpid_sync_context(vpid02); 6186 break; 6187 default: 6188 WARN_ON_ONCE(1); 6189 return kvm_skip_emulated_instruction(vcpu); 6190 } 6191 6192 /* 6193 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6194 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6195 * roots as VPIDs are not tracked in the MMU role. 6196 * 6197 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6198 * an MMU when EPT is disabled. 6199 * 6200 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 6201 */ 6202 if (!enable_ept) 6203 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6204 6205 return nested_vmx_succeed(vcpu); 6206 } 6207 6208 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6209 struct vmcs12 *vmcs12) 6210 { 6211 u32 index = kvm_rcx_read(vcpu); 6212 u64 new_eptp; 6213 6214 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6215 return 1; 6216 if (index >= VMFUNC_EPTP_ENTRIES) 6217 return 1; 6218 6219 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6220 &new_eptp, index * 8, 8)) 6221 return 1; 6222 6223 /* 6224 * If the (L2) guest does a vmfunc to the currently 6225 * active ept pointer, we don't have to do anything else 6226 */ 6227 if (vmcs12->ept_pointer != new_eptp) { 6228 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6229 return 1; 6230 6231 vmcs12->ept_pointer = new_eptp; 6232 nested_ept_new_eptp(vcpu); 6233 6234 if (!nested_cpu_has_vpid(vmcs12)) 6235 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6236 } 6237 6238 return 0; 6239 } 6240 6241 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6242 { 6243 struct vcpu_vmx *vmx = to_vmx(vcpu); 6244 struct vmcs12 *vmcs12; 6245 u32 function = kvm_rax_read(vcpu); 6246 6247 /* 6248 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6249 * VMFUNC for nested VMs, but not for L1. 6250 */ 6251 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6252 kvm_queue_exception(vcpu, UD_VECTOR); 6253 return 1; 6254 } 6255 6256 vmcs12 = get_vmcs12(vcpu); 6257 6258 /* 6259 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6260 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 6261 */ 6262 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6263 kvm_queue_exception(vcpu, UD_VECTOR); 6264 return 1; 6265 } 6266 6267 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6268 goto fail; 6269 6270 switch (function) { 6271 case 0: 6272 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6273 goto fail; 6274 break; 6275 default: 6276 goto fail; 6277 } 6278 return kvm_skip_emulated_instruction(vcpu); 6279 6280 fail: 6281 /* 6282 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6283 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6284 * EXIT_REASON_VMFUNC as the exit reason. 
6285 */ 6286 nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, 6287 vmx_get_intr_info(vcpu), 6288 vmx_get_exit_qual(vcpu)); 6289 return 1; 6290 } 6291 6292 /* 6293 * Return true if an IO instruction with the specified port and size should cause 6294 * a VM-exit into L1. 6295 */ 6296 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6297 int size) 6298 { 6299 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6300 gpa_t bitmap, last_bitmap; 6301 u8 b; 6302 6303 last_bitmap = INVALID_GPA; 6304 b = -1; 6305 6306 while (size > 0) { 6307 if (port < 0x8000) 6308 bitmap = vmcs12->io_bitmap_a; 6309 else if (port < 0x10000) 6310 bitmap = vmcs12->io_bitmap_b; 6311 else 6312 return true; 6313 bitmap += (port & 0x7fff) / 8; 6314 6315 if (last_bitmap != bitmap) 6316 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6317 return true; 6318 if (b & (1 << (port & 7))) 6319 return true; 6320 6321 port++; 6322 size--; 6323 last_bitmap = bitmap; 6324 } 6325 6326 return false; 6327 } 6328 6329 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6330 struct vmcs12 *vmcs12) 6331 { 6332 unsigned long exit_qualification; 6333 unsigned short port; 6334 int size; 6335 6336 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6337 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6338 6339 exit_qualification = vmx_get_exit_qual(vcpu); 6340 6341 port = exit_qualification >> 16; 6342 size = (exit_qualification & 7) + 1; 6343 6344 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6345 } 6346 6347 /* 6348 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6349 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6350 * disinterest in the current event (read or write a specific MSR) by using an 6351 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6352 */ 6353 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6354 struct vmcs12 *vmcs12, 6355 union vmx_exit_reason exit_reason) 6356 { 6357 u32 msr_index; 6358 gpa_t bitmap; 6359 6360 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6361 return true; 6362 6363 if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || 6364 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6365 msr_index = vmx_get_exit_qual(vcpu); 6366 else 6367 msr_index = kvm_rcx_read(vcpu); 6368 6369 /* 6370 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6371 * for the four combinations of read/write and low/high MSR numbers. 6372 * First we need to figure out which of the four to use: 6373 */ 6374 bitmap = vmcs12->msr_bitmap; 6375 if (exit_reason.basic == EXIT_REASON_MSR_WRITE || 6376 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6377 bitmap += 2048; 6378 if (msr_index >= 0xc0000000) { 6379 msr_index -= 0xc0000000; 6380 bitmap += 1024; 6381 } 6382 6383 /* Then read the msr_index'th bit from this bitmap: */ 6384 if (msr_index < 1024*8) { 6385 unsigned char b; 6386 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6387 return true; 6388 return 1 & (b >> (msr_index & 7)); 6389 } else 6390 return true; /* let L1 handle the wrong parameter */ 6391 } 6392 6393 /* 6394 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6395 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6396 * intercept (via guest_host_mask etc.) the current event. 
6397 */ 6398 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6399 struct vmcs12 *vmcs12) 6400 { 6401 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6402 int cr = exit_qualification & 15; 6403 int reg; 6404 unsigned long val; 6405 6406 switch ((exit_qualification >> 4) & 3) { 6407 case 0: /* mov to cr */ 6408 reg = (exit_qualification >> 8) & 15; 6409 val = kvm_register_read(vcpu, reg); 6410 switch (cr) { 6411 case 0: 6412 if (vmcs12->cr0_guest_host_mask & 6413 (val ^ vmcs12->cr0_read_shadow)) 6414 return true; 6415 break; 6416 case 3: 6417 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6418 return true; 6419 break; 6420 case 4: 6421 if (vmcs12->cr4_guest_host_mask & 6422 (vmcs12->cr4_read_shadow ^ val)) 6423 return true; 6424 break; 6425 case 8: 6426 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6427 return true; 6428 break; 6429 } 6430 break; 6431 case 2: /* clts */ 6432 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6433 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6434 return true; 6435 break; 6436 case 1: /* mov from cr */ 6437 switch (cr) { 6438 case 3: 6439 if (vmcs12->cpu_based_vm_exec_control & 6440 CPU_BASED_CR3_STORE_EXITING) 6441 return true; 6442 break; 6443 case 8: 6444 if (vmcs12->cpu_based_vm_exec_control & 6445 CPU_BASED_CR8_STORE_EXITING) 6446 return true; 6447 break; 6448 } 6449 break; 6450 case 3: /* lmsw */ 6451 /* 6452 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6453 * cr0. Other attempted changes are ignored, with no exit. 6454 */ 6455 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6456 if (vmcs12->cr0_guest_host_mask & 0xe & 6457 (val ^ vmcs12->cr0_read_shadow)) 6458 return true; 6459 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6460 !(vmcs12->cr0_read_shadow & 0x1) && 6461 (val & 0x1)) 6462 return true; 6463 break; 6464 } 6465 return false; 6466 } 6467 6468 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6469 struct vmcs12 *vmcs12) 6470 { 6471 u32 encls_leaf; 6472 6473 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6474 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6475 return false; 6476 6477 encls_leaf = kvm_rax_read(vcpu); 6478 if (encls_leaf > 62) 6479 encls_leaf = 63; 6480 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6481 } 6482 6483 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6484 struct vmcs12 *vmcs12, gpa_t bitmap) 6485 { 6486 u32 vmx_instruction_info; 6487 unsigned long field; 6488 u8 b; 6489 6490 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6491 return true; 6492 6493 /* Decode instruction info and find the field to access */ 6494 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6495 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6496 6497 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6498 if (field >> 15) 6499 return true; 6500 6501 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6502 return true; 6503 6504 return 1 & (b >> (field & 7)); 6505 } 6506 6507 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6508 { 6509 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6510 6511 if (nested_cpu_has_mtf(vmcs12)) 6512 return true; 6513 6514 /* 6515 * An MTF VM-exit may be injected into the guest by setting the 6516 * interruption-type to 7 (other event) and the vector field to 0. Such 6517 * is the case regardless of the 'monitor trap flag' VM-execution 6518 * control. 
6519 */ 6520 return entry_intr_info == (INTR_INFO_VALID_MASK 6521 | INTR_TYPE_OTHER_EVENT); 6522 } 6523 6524 /* 6525 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6526 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6527 */ 6528 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6529 union vmx_exit_reason exit_reason) 6530 { 6531 u32 intr_info; 6532 6533 switch ((u16)exit_reason.basic) { 6534 case EXIT_REASON_EXCEPTION_NMI: 6535 intr_info = vmx_get_intr_info(vcpu); 6536 if (is_nmi(intr_info)) 6537 return true; 6538 else if (is_page_fault(intr_info)) 6539 return vcpu->arch.apf.host_apf_flags || 6540 vmx_need_pf_intercept(vcpu); 6541 else if (is_debug(intr_info) && 6542 vcpu->guest_debug & 6543 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6544 return true; 6545 else if (is_breakpoint(intr_info) && 6546 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6547 return true; 6548 else if (is_alignment_check(intr_info) && 6549 !vmx_guest_inject_ac(vcpu)) 6550 return true; 6551 else if (is_ve_fault(intr_info)) 6552 return true; 6553 return false; 6554 case EXIT_REASON_EXTERNAL_INTERRUPT: 6555 return true; 6556 case EXIT_REASON_MCE_DURING_VMENTRY: 6557 return true; 6558 case EXIT_REASON_EPT_VIOLATION: 6559 /* 6560 * L0 always deals with the EPT violation. If nested EPT is 6561 * used, and the nested mmu code discovers that the address is 6562 * missing in the guest EPT table (EPT12), the EPT violation 6563 * will be injected with nested_ept_inject_page_fault() 6564 */ 6565 return true; 6566 case EXIT_REASON_EPT_MISCONFIG: 6567 /* 6568 * L2 never uses directly L1's EPT, but rather L0's own EPT 6569 * table (shadow on EPT) or a merged EPT table that L0 built 6570 * (EPT on EPT). So any problems with the structure of the 6571 * table is L0's fault. 6572 */ 6573 return true; 6574 case EXIT_REASON_PREEMPTION_TIMER: 6575 return true; 6576 case EXIT_REASON_PML_FULL: 6577 /* 6578 * PML is emulated for an L1 VMM and should never be enabled in 6579 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6580 */ 6581 return true; 6582 case EXIT_REASON_VMFUNC: 6583 /* VM functions are emulated through L2->L0 vmexits. */ 6584 return true; 6585 case EXIT_REASON_BUS_LOCK: 6586 /* 6587 * At present, bus lock VM exit is never exposed to L1. 6588 * Handle L2's bus locks in L0 directly. 6589 */ 6590 return true; 6591 #ifdef CONFIG_KVM_HYPERV 6592 case EXIT_REASON_VMCALL: 6593 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6594 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6595 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6596 kvm_hv_is_tlb_flush_hcall(vcpu); 6597 #endif 6598 default: 6599 break; 6600 } 6601 return false; 6602 } 6603 6604 /* 6605 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6606 * is_guest_mode (L2). 
6607 */ 6608 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6609 union vmx_exit_reason exit_reason) 6610 { 6611 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6612 u32 intr_info; 6613 6614 switch ((u16)exit_reason.basic) { 6615 case EXIT_REASON_EXCEPTION_NMI: 6616 intr_info = vmx_get_intr_info(vcpu); 6617 if (is_nmi(intr_info)) 6618 return true; 6619 else if (is_page_fault(intr_info)) 6620 return true; 6621 return vmcs12->exception_bitmap & 6622 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6623 case EXIT_REASON_EXTERNAL_INTERRUPT: 6624 return nested_exit_on_intr(vcpu); 6625 case EXIT_REASON_TRIPLE_FAULT: 6626 return true; 6627 case EXIT_REASON_INTERRUPT_WINDOW: 6628 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6629 case EXIT_REASON_NMI_WINDOW: 6630 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6631 case EXIT_REASON_TASK_SWITCH: 6632 return true; 6633 case EXIT_REASON_CPUID: 6634 return true; 6635 case EXIT_REASON_HLT: 6636 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6637 case EXIT_REASON_INVD: 6638 return true; 6639 case EXIT_REASON_INVLPG: 6640 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6641 case EXIT_REASON_RDPMC: 6642 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6643 case EXIT_REASON_RDRAND: 6644 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6645 case EXIT_REASON_RDSEED: 6646 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6647 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6648 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6649 case EXIT_REASON_VMREAD: 6650 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6651 vmcs12->vmread_bitmap); 6652 case EXIT_REASON_VMWRITE: 6653 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6654 vmcs12->vmwrite_bitmap); 6655 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6656 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6657 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6658 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6659 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6660 /* 6661 * VMX instructions trap unconditionally. This allows L1 to 6662 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ_IMM:
	case EXIT_REASON_MSR_WRITE_IMM:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
		       nested_cpu_has2(vmcs12,
				       SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES:
	case EXIT_REASON_XRSTORS:
		/*
		 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize
		 * the XSS-bitmap, and always loads vmcs02 with vmcs12's
		 * XSS-bitmap verbatim, i.e. any exit is due to L1's bitmap.
		 * WARN if XSAVES isn't enabled in vmcs12, as the CPU is
		 * supposed to inject #UD in that case, before consulting the
		 * XSS-bitmap.
		 */
		WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES));
		return true;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
				       SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	case EXIT_REASON_NOTIFY:
		/* Notify VM exit is not exposed to L1 */
		return false;
	default:
		return true;
	}
}

/*
 * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
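	 * Interruption info and exit qualification are zeroed below; they are
	 * meaningless for a failed VM-Entry.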
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}

static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (nested_vmx_is_evmptr12_set(vmx))
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02.  When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (nested_vmx_is_evmptr12_valid(vmx))
				/*
				 * The L1 hypervisor is not obliged to keep the
				 * eVMCS clean-fields data up-to-date while not
				 * in guest mode; 'hv_clean_fields' is only
				 * guaranteed to be accurate at VM-Enter, so
				 * ignore it here and do a full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU.  However, the code
		 * has since been changed such that the flag signals that
		 * vmcs12 should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow userspace
		 * to set this flag even when there is no VMXON region.
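		 * All other flags are rejected below when no VMXON region is
		 * provided.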
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
	    (kvm_state->flags &
	     (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
	    : kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet.  The eVMCS will be mapped from
		 * nested_get_vmcs12_pages().
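		 * Mark the mapping as pending and request it via
		 * KVM_REQ_GET_NESTED_STATE_PAGES so that it is performed
		 * before the vCPU next enters the guest.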
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so-called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated.
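		 * A zero offset means the field has no vmcs12 backing; skip it.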
		 */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
		VM_ENTRY_LOAD_CET_STATE;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE;

	/* We support free control of debug control loading.
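	 * i.e. VM_ENTRY_LOAD_DEBUG_CONTROLS is not forced to 1, so L1 may
	 * clear it.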
	 */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware.  For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also for L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support;
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version of INVVPID
	 * without checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative, failing
	 * the single-context INVVPID, would be worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support.  We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
	if (cpu_has_vmx_basic_no_hw_errcode_cc())
		msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on.  See also vmx_control_verify().
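 *
 * As an illustrative sketch (not the exact helper), a value 'val' that L1
 * writes to a control field passes the low/high check only if
 *
 *	(val & low) == low && (val & ~high) == 0
 *
 * i.e. all must-be-1 bits are set and no bit outside the may-be-1 mask is set.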
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits.  The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01, and prepare_vmcs02, which bitwise-ORs the control
	 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
	 * nested_vmx_l1_wants_exit() will not pass the related exits to L1.
	 * These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
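			 * Hence the plain GFP_KERNEL allocation below rather
			 * than GFP_KERNEL_ACCOUNT.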
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};
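
/*
 * Illustrative userspace sketch (not part of KVM itself): how a VMM might use
 * the vmx_get_nested_state()/vmx_set_nested_state() paths above via the
 * KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE vCPU ioctls, e.g. around live
 * migration.  'vcpu_fd' and the 16 KiB buffer size are placeholders, and error
 * handling is simplified; if the buffer is too small, KVM_GET_NESTED_STATE
 * fails with E2BIG and updates 'size' to the required value.
 *
 *	struct kvm_nested_state *state = calloc(1, 16384);
 *
 *	state->size = 16384;
 *	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state))
 *		err(1, "KVM_GET_NESTED_STATE");
 *	// ...transfer 'state' to the destination...
 *
 *	// On the destination, after the vCPU's CPUID has been configured and
 *	// before the vCPU is run:
 *	if (ioctl(vcpu_fd, KVM_SET_NESTED_STATE, state))
 *		err(1, "KVM_SET_NESTED_STATE");
 */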