1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 #include <asm/msr.h> 10 11 #include "x86.h" 12 #include "cpuid.h" 13 #include "hyperv.h" 14 #include "mmu.h" 15 #include "nested.h" 16 #include "pmu.h" 17 #include "posted_intr.h" 18 #include "sgx.h" 19 #include "trace.h" 20 #include "vmx.h" 21 #include "smm.h" 22 #include "x86_ops.h" 23 24 static bool __read_mostly enable_shadow_vmcs = 1; 25 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 26 27 static bool __ro_after_init warn_on_missed_cc; 28 module_param(warn_on_missed_cc, bool, 0444); 29 30 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 31 32 /* 33 * Hyper-V requires all of these, so mark them as supported even though 34 * they are just treated the same as all-context. 35 */ 36 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 37 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 38 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 39 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 40 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 41 42 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 43 44 enum { 45 VMX_VMREAD_BITMAP, 46 VMX_VMWRITE_BITMAP, 47 VMX_BITMAP_NR 48 }; 49 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 50 51 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 52 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 53 54 struct shadow_vmcs_field { 55 u16 encoding; 56 u16 offset; 57 }; 58 static struct shadow_vmcs_field shadow_read_only_fields[] = { 59 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 60 #include "vmcs_shadow_fields.h" 61 }; 62 static int max_shadow_read_only_fields = 63 ARRAY_SIZE(shadow_read_only_fields); 64 65 static struct shadow_vmcs_field shadow_read_write_fields[] = { 66 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 67 #include "vmcs_shadow_fields.h" 68 }; 69 static int 
max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

/*
 * One-time setup of VMCS shadowing: mark every field as intercepted in the
 * VMREAD/VMWRITE bitmaps, then clear the bits for fields KVM actually
 * shadows, compacting the shadow field tables in place so they only contain
 * fields with a valid vmcs12 offset (and hardware support where required).
 */
static void init_vmcs_shadow_fields(void)
{
	int i, j;

	/* All-ones == intercept every VMREAD/VMWRITE by default. */
	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		/*
		 * A 64-bit field's HIGH counterpart (encoding + 1) must
		 * immediately follow it in the table; complain if it doesn't.
		 */
		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		/* Skip fields that don't exist in this vmcs12 layout. */
		if (get_vmcs12_field_offset(field) < 0)
			continue;

		clear_bit(field, vmx_vmread_bitmap);
		/*
		 * Odd encodings are the HIGH halves of 64-bit fields: on
		 * 64-bit kernels they're handled via the even encoding, on
		 * 32-bit kernels they live 4 bytes into the vmcs12 field.
		 */
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		if (get_vmcs12_field_offset(field) < 0)
			continue;

		/*
		 * KVM emulates PML and the VMX preemption timer irrespective
		 * of hardware support, but shadowing their related VMCS fields
		 * requires hardware support as the CPU will reject VMWRITEs to
		 * fields that don't exist.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		/* Same HIGH-half handling as for the read-only table above. */
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */

/* VMsucceed: all arithmetic RFLAGS cleared. */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

/* VMfailInvalid: CF=1, everything else cleared. */
static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

/* VMfailValid: ZF=1 and the error code stored in VM_INSTRUCTION_ERROR. */
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

/* Emit VMfailValid when a current VMCS exists, else VMfailInvalid. */
static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

/* VMX abort: take down L1 with a triple fault and log the indicator. */
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

/* Check a control value against its allowed-0/allowed-1 settings. */
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

/* Pack low/high halves into the 64-bit VMX capability MSR format. */
static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

/* Stop shadowing vmcs12 and sever the link pointer in the current VMCS. */
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

/* Unmap and forget the enlightened VMCS, and reset Hyper-V nested state. */
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
	vmx->nested.hv_evmcs = NULL;
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef \
CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU we treat
	 * memory area pointer by vmptr as Enlightened VMCS (as there's no good
	 * way to distinguish it from VMCS12) and we must not corrupt it by
	 * writing to the non-existent 'launch_state' field. The area doesn't
	 * have to be the currently active EVMCS on the calling CPU and there's
	 * nothing KVM has to do to transition it from 'active' to 'non-active'
	 * state. It is possible that the area will stay mapped as
	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (!guest_cpu_cap_has_evmcs(vcpu) ||
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
		return false;

	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
		nested_release_evmcs(vcpu);

	return true;
#else
	return false;
#endif
}

/*
 * Copy the cached host segment state from the previously loaded VMCS into
 * the newly loaded one, but only if host state was actually loaded.
 */
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->vt.guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

/*
 * Make @vmcs the vCPU's current working VMCS (e.g. vmcs01 <-> vmcs02),
 * migrating cached host state and invalidating lazily-loaded registers.
 */
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	/* Disable preemption while the per-CPU VMCS pointers are updated. */
	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and
	 * vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

/* Drop the mappings of L1 pages (APIC access, vAPIC, PI descriptor). */
static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
	vmx->nested.pi_desc = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* The caller is supposed to have switched back to vmcs01 already. */
	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;

	nested_put_vmcs12_pages(vcpu);

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

/* Physical-address bits of an EPTP (the EP4TA), bits 51:12. */
#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

/* True if a cached root is valid and shares the EP4TA of @eptp. */
static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

/*
 * Sync @addr in every cached previous root whose EP4TA matches @eptp; the
 * TLB tags translations by EP4TA, not the full EPTP (see caller's comment).
 */
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}

/*
 * Reflect an nEPT "page fault" into L1 as either a PML Full, EPT Violation,
 * or EPT Misconfig VM-Exit, with an appropriate exit qualification.
 */
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	u32 vm_exit_reason;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;

		/*
		 * It should be impossible to trigger a nested PML Full VM-Exit
		 * for anything other than an EPT Violation from L2.  KVM *can*
		 * trigger nEPT page fault injection in response to an EPT
		 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
		 * tables also changed, but KVM should not treat EPT Misconfig
		 * VM-Exits as writes.
		 */
		WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);

		/*
		 * PML Full and EPT Violation VM-Exits both use bit 12 to report
		 * "NMI unblocking due to IRET", i.e. the bit can be propagated
		 * as-is from the original EXIT_QUALIFICATION.
		 */
		exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK) {
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
			exit_qualification = 0;
		} else {
			exit_qualification = fault->exit_qualification;
			exit_qualification |= vmx_get_exit_qual(vcpu) &
					      (EPT_VIOLATION_GVA_IS_VALID |
					       EPT_VIOLATION_GVA_TRANSLATED);
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
		}

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

/* (Re)initialize the shadow EPT MMU for the EPTP currently set by L1. */
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

/* Switch the vCPU to the nested-EPT MMU configuration for running L2. */
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

/* Restore the root MMU after leaving nested EPT mode. */
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu =
&vcpu->arch.root_mmu;
}

/*
 * Per the #PF error-code mask/match mechanism: a #PF causes a VM-Exit iff
 * ((error_code & mask) == match) XOR the #PF bit in the exception bitmap.
 */
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

/* Does @vector (with @error_code) cause a VM-Exit to L1 per vmcs12? */
static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code.  Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

/* VM-Enter check: I/O bitmap addresses must be legal page-aligned GPAs. */
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

/* VM-Enter check: the MSR bitmap address must be a legal page-aligned GPA. */
static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

/* VM-Enter checks for the TPR shadow (virtual-APIC page, TPR threshold). */
static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	/* TPR threshold bits 31:4 must be zero unless virtual-interrupt delivery is on. */
	if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

/*
 * Set every read and write intercept bit for the x2APIC MSR range
 * (0x800-0x8ff); the second store targets the write half of the bitmap,
 * which lives 0x800 bytes after the read half.
 */
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Generate nested_vmx_set_msr_{read,write}_intercept(): intercept an MSR for
 * L2 iff either vmcs01 (KVM's own policy/filters) or L1's bitmap wants it.
 */
#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

/* Apply the read and/or write merge helpers for a single MSR. */
static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx,
msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/* Convenience wrappers; rely on local msr_bitmap_l1/msr_bitmap_l0 and vmx. */
#define nested_vmx_merge_msr_bitmaps(msr, type)				\
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,		\
					 msr_bitmap_l0, msr, type)

#define nested_vmx_merge_msr_bitmaps_read(msr)				\
	nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R)

#define nested_vmx_merge_msr_bitmaps_write(msr)				\
	nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W)

#define nested_vmx_merge_msr_bitmaps_rw(msr)				\
	nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW)

/* Merge the intercepts for all PMU MSRs that may be passed through. */
static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu,
					     unsigned long *msr_bitmap_l1,
					     unsigned long *msr_bitmap_l0)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int i;

	/*
	 * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if
	 * none of the MSRs can possibly be passed through to L1.
	 */
	if (!kvm_vcpu_has_mediated_pmu(vcpu))
		return;

	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
		nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i);
		nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i);
	}

	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
		nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i);

	nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL);
	nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS);
	nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map map;

	/* Nothing to do if the MSR bitmap is not in use.
 */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
			return true;
	}

	/* Map L1's bitmap; fall back to intercepting everything on failure. */
	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map.hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE);
	nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE);
	nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE);
#endif
	nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL);
	nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD);
	nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_APERF, MSR_TYPE_R);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_MPERF, MSR_TYPE_R);

	/* CET state MSRs. */
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_U_CET, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_S_CET, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL0_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL1_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL2_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1,
msr_bitmap_l0,
					 MSR_IA32_PL3_SSP, MSR_TYPE_RW);

	nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0);

	kvm_vcpu_unmap(vcpu, &map);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

/* Read L1's linked shadow vmcs12 into KVM's cached copy, if one is in use. */
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	/* Silently bail on cache (re)init failure; the copy is best effort. */
	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

/* Write KVM's cached shadow vmcs12 back to L1 memory, if one is in use. */
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

/* VM-Enter check: APIC-access page address must be a legal page-aligned GPA. */
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int
nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	/* Nothing to check if no APICv feature is enabled by L1. */
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

/*
 * Maximum number of entries in an atomic MSR load/store list, derived from
 * the VMX_MISC capability exposed to L1 (field encodes N units of
 * VMX_MISC_MSR_LIST_MULTIPLIER entries, minus one).
 */
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Validate one MSR-switch (count, addr) pair.  Callers wrap the result in
 * CC(), so a non-zero return here surfaces as a consistency-check failure.
 */
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	/*
	 * Exceeding the limit results in architecturally _undefined_ behavior,
	 * i.e. KVM is allowed to do literally anything in response to a bad
	 * limit.
Immediately generate a consistency check so that code that
	 * consumes the count doesn't need to worry about extreme edge cases.
	 */
	if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
		return -EINVAL;

	/* List must be 16-byte aligned and lie entirely within legal GPA space. */
	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

/* VM-Enter checks for the VM-Exit MSR load and store lists. */
static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

/* VM-Enter check for the VM-Entry MSR load list. */
static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

/* VM-Enter check: PML requires EPT and a valid page-aligned PML address. */
static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

/* VM-Enter check: unrestricted guest requires EPT. */
static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

/* VM-Enter check: mode-based execute control requires EPT. */
static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu
*vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	/* VMREAD/VMWRITE bitmap addresses must be legal page-aligned GPAs. */
	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

/* Validity checks shared by atomic MSR load and store list entries. */
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

/* Additional restrictions on MSR *load* list entries. */
static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/* Additional restrictions on MSR *store* list entries. */
static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity.  To maintain compatibility with hardware inasmuch
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		/* The count was validated at VM-Enter; exceeding it is a KVM bug. */
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_emulate_msr_write(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

/*
 * Retrieve the value to store for @msr_index when emulating the VM-Exit
 * MSR-store list.  Returns false (and logs) if the MSR can't be read.
 */
static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) {
		int slot = vmx->nested.tsc_autostore_slot;
		u64 host_tsc = vmx->msr_autostore.val[slot].value;

		*data = kvm_read_l1_tsc(vcpu, host_tsc);
		return true;
	}

	if (kvm_emulate_msr_read(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

/*
 * Read the index+reserved portion of MSR-store entry @i (the first two u32s;
 * 'value' is deliberately not read) from guest memory and validate it.
 */
static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

/*
 * Emulate the VM-Exit MSR-store list: write the current value of each listed
 * MSR into the guest's list at @gpa.  Returns 0 on success, -EINVAL on any
 * read/check/write failure.
 */
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		/* Only the 'value' field of the entry is written back. */
		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					 offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

/*
 * Returns true if @msr_index appears in the vmcs12 VM-Exit MSR-store list.
 * Returns false if any entry before a match fails to read or validate.
 */
static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count =
vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	/* Commit the new CR3 only after all consistency checks have passed. */
	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns if KVM is able to config CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

/*
 * Emulate the architectural TLB flush that occurs on a nested transition,
 * either VM-Enter (@is_vmenter == true) or VM-Exit.
 */
static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host, and so architecturally, linear and combined
	 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit.  KVM
	 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
	 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01.  This
	 * is required if VPID is disabled in KVM, as a TLB flush (there are no
	 * VPIDs) still occurs from L1's perspective, and KVM may need to
	 * synchronize the MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.
 * If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

/*
 * Returns true iff every bit set in @subset (within @mask) is also set in
 * @superset, i.e. @subset is a bitwise subset of @superset under @mask.
 */
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

/*
 * Userspace restore of IA32_VMX_BASIC: reject values that advertise feature
 * or reserved bits, a VMCS revision ID, or a VMCS size beyond what KVM
 * reports as supported.
 */
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
				 VMX_BASIC_INOUT |
				 VMX_BASIC_TRUE_CTLS |
				 VMX_BASIC_NO_HW_ERROR_CODE_CC;

	const u64 reserved_bits = GENMASK_ULL(63, 57) |
				  GENMASK_ULL(47, 45) |
				  BIT_ULL(31);

	u64 vmx_basic = vmcs_config.nested.basic;

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
	 * inverted polarity), the incoming value must not set feature bits or
	 * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
	 * multi-bit values, are explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

/*
 * Map a "true"/secondary control MSR index to pointers to its low (allowed-0)
 * and high (allowed-1) halves within @msrs.  BUG()s on an unhandled index.
 */
static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

/*
 * Userspace restore of a VMX control capability MSR.  The restored value may
 * only tighten, never loosen, the allowed-0/allowed-1 settings that KVM
 * itself supports.
 */
static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

/* Userspace restore of IA32_VMX_MISC; feature bits and fields are vetted. */
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
				 VMX_MISC_ACTIVITY_HLT |
				 VMX_MISC_ACTIVITY_SHUTDOWN |
				 VMX_MISC_ACTIVITY_WAIT_SIPI |
				 VMX_MISC_INTEL_PT |
				 VMX_MISC_RDMSR_IN_SMM |
				 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
				 VMX_MISC_VMXOFF_BLOCK_SMI |
				 VMX_MISC_ZERO_LEN_INS;

	const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);

	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * The incoming value must not set feature bits or reserved bits that
	 * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
	 * explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * The preemption-timer rate field may not change if the preemption
	 * timer is exposed to the guest.
	 */
	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

/* Userspace restore of IA32_VMX_EPT_VPID_CAP (EPT caps low, VPID caps high). */
static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

/* Map a CR0/CR4 FIXED0 MSR index to its storage in @msrs; BUG()s otherwise. */
static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

/* Userspace restore of a CR0/CR4 FIXED0 (must-be-1) capability MSR. */
static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		/* Only VM functions that KVM supports may be enabled. */
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		/* The "non-true" MSR reports the default1 controls as set. */
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	/*
	 * Disable preemption so the shadow VMCS can be temporarily loaded,
	 * read, cleared and the active VMCS restored without migrating pCPUs.
	 */
	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

/*
 * Propagate the shadowed (RW and RO) fields from the cached vmcs12 into the
 * shadow VMCS so that L1's VMREADs observe up-to-date values.
 */
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

/*
 * Sync fields from the Hyper-V enlightened VMCS into the cached vmcs12.
 * @hv_clean_fields is the eVMCS clean-fields bitmap: groups whose clean bit
 * is set are skipped; tpr_threshold and guest_rip are always copied.
 */
static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
		vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
		vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
		vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
		 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
		 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
		 */
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->page_fault_error_code_mask =
	 *	evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *	evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 *
vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1926 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1927 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1928 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1929 */ 1930 1931 return; 1932 #else /* CONFIG_KVM_HYPERV */ 1933 KVM_BUG_ON(1, vmx->vcpu.kvm); 1934 #endif /* CONFIG_KVM_HYPERV */ 1935 } 1936 1937 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1938 { 1939 #ifdef CONFIG_KVM_HYPERV 1940 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1941 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1942 1943 /* 1944 * Should not be changed by KVM: 1945 * 1946 * evmcs->host_es_selector = vmcs12->host_es_selector; 1947 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1948 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1949 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1950 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1951 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1952 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1953 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1954 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1955 * evmcs->host_cr0 = vmcs12->host_cr0; 1956 * evmcs->host_cr3 = vmcs12->host_cr3; 1957 * evmcs->host_cr4 = vmcs12->host_cr4; 1958 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1959 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1960 * evmcs->host_rip = vmcs12->host_rip; 1961 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1962 * evmcs->host_fs_base = vmcs12->host_fs_base; 1963 * evmcs->host_gs_base = vmcs12->host_gs_base; 1964 * evmcs->host_tr_base = vmcs12->host_tr_base; 1965 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1966 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1967 * evmcs->host_rsp = vmcs12->host_rsp; 1968 * sync_vmcs02_to_vmcs12() doesn't read these: 1969 * evmcs->io_bitmap_a = 
vmcs12->io_bitmap_a; 1970 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1971 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1972 * evmcs->ept_pointer = vmcs12->ept_pointer; 1973 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1974 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1975 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1976 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1977 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1978 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1979 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1980 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1981 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1982 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1983 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1984 * evmcs->page_fault_error_code_mask = 1985 * vmcs12->page_fault_error_code_mask; 1986 * evmcs->page_fault_error_code_match = 1987 * vmcs12->page_fault_error_code_match; 1988 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1989 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1990 * evmcs->tsc_offset = vmcs12->tsc_offset; 1991 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1992 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1993 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1994 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1995 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1996 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1997 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1998 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1999 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 2000 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 2001 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 2002 * 
evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 2003 * 2004 * Not present in struct vmcs12: 2005 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 2006 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 2007 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 2008 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 2009 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 2010 * evmcs->host_ssp = vmcs12->host_ssp; 2011 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 2012 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 2013 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 2014 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 2015 * evmcs->guest_ssp = vmcs12->guest_ssp; 2016 */ 2017 2018 evmcs->guest_es_selector = vmcs12->guest_es_selector; 2019 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 2020 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 2021 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 2022 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 2023 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 2024 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 2025 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 2026 2027 evmcs->guest_es_limit = vmcs12->guest_es_limit; 2028 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 2029 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 2030 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 2031 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 2032 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 2033 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 2034 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 2035 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 2036 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 2037 2038 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 2039 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 2040 
evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 2041 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 2042 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 2043 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 2044 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 2045 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 2046 2047 evmcs->guest_es_base = vmcs12->guest_es_base; 2048 evmcs->guest_cs_base = vmcs12->guest_cs_base; 2049 evmcs->guest_ss_base = vmcs12->guest_ss_base; 2050 evmcs->guest_ds_base = vmcs12->guest_ds_base; 2051 evmcs->guest_fs_base = vmcs12->guest_fs_base; 2052 evmcs->guest_gs_base = vmcs12->guest_gs_base; 2053 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 2054 evmcs->guest_tr_base = vmcs12->guest_tr_base; 2055 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 2056 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 2057 2058 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 2059 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 2060 2061 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 2062 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2063 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2064 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2065 2066 evmcs->guest_pending_dbg_exceptions = 2067 vmcs12->guest_pending_dbg_exceptions; 2068 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2069 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2070 2071 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2072 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2073 2074 evmcs->guest_cr0 = vmcs12->guest_cr0; 2075 evmcs->guest_cr3 = vmcs12->guest_cr3; 2076 evmcs->guest_cr4 = vmcs12->guest_cr4; 2077 evmcs->guest_dr7 = vmcs12->guest_dr7; 2078 2079 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2080 2081 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2082 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2083 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2084 
evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2085 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2086 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2087 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2088 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2089 2090 evmcs->exit_qualification = vmcs12->exit_qualification; 2091 2092 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2093 evmcs->guest_rsp = vmcs12->guest_rsp; 2094 evmcs->guest_rflags = vmcs12->guest_rflags; 2095 2096 evmcs->guest_interruptibility_info = 2097 vmcs12->guest_interruptibility_info; 2098 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2099 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2100 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2101 evmcs->vm_entry_exception_error_code = 2102 vmcs12->vm_entry_exception_error_code; 2103 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 2104 2105 evmcs->guest_rip = vmcs12->guest_rip; 2106 2107 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 2108 2109 return; 2110 #else /* CONFIG_KVM_HYPERV */ 2111 KVM_BUG_ON(1, vmx->vcpu.kvm); 2112 #endif /* CONFIG_KVM_HYPERV */ 2113 } 2114 2115 /* 2116 * This is an equivalent of the nested hypervisor executing the vmptrld 2117 * instruction. 
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	/* eVMCS is only reachable if the guest is allowed to use it. */
	if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		/* Drop the previously mapped eVMCS before mapping the new one. */
		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
		 * value to first u32 field of eVMCS which should specify eVMCS
		 * VersionNumber.
		 *
		 * Guest should be aware of supported eVMCS versions by host by
		 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
		 * expected to set this CPUID leaf according to the value
		 * returned in vmcs_version from nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * to their own invented interface: When Hyper-V use eVMCS, it
		 * just sets first u32 field of eVMCS to revision_id specified
		 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
		 * which is one of the supported versions specified in
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To overcome Hyper-V bug, we accept here either a supported
		 * eVMCS version or VMCS12 revision_id as valid values for first
		 * u32 field of eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
		 * reloaded from guest's memory (read only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed) {
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		vmx->nested.force_msr_bitmap_recalc = true;
	}

	return EVMPTRLD_SUCCEEDED;
#else
	return EVMPTRLD_DISABLED;
#endif
}

/*
 * Propagate the cached vmcs12 to the structure L1 actually reads: the
 * enlightened VMCS when one is mapped, the shadow VMCS otherwise.
 */
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx))
		copy_vmcs12_to_enlightened(vmx);
	else
		copy_vmcs12_to_shadow(vmx);

	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

/*
 * hrtimer callback emulating expiry of the VMX preemption timer for L2:
 * record the expiry and kick the vCPU so the pending event is processed.
 */
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}

/*
 * Compute the remaining preemption timer value in (scaled) L1 TSC units,
 * latching the deadline on first use so it survives nested exits/entries.
 */
static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 =
		get_vmcs12(vcpu);

	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
			VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;

	if (!vmx->nested.has_preemption_timer_deadline) {
		vmx->nested.preemption_timer_deadline =
			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
		vmx->nested.has_preemption_timer_deadline = true;
	}
	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
}

/*
 * Arm the emulated VMX preemption timer for L2, converting the (scaled)
 * timer ticks into nanoseconds based on the vCPU's virtual TSC frequency.
 */
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
				       u64 preemption_timeout)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * A timer value of zero is architecturally guaranteed to cause
	 * a VMExit prior to executing any instructions in the guest.
	 */
	if (preemption_timeout == 0) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ktime_add_ns(ktime_get(), preemption_timeout),
		      HRTIMER_MODE_ABS_PINNED);
}

/*
 * Compute L2's effective EFER: taken from vmcs12 when VM-Entry loads EFER,
 * otherwise derived from the current EFER with LMA/LME forced to match
 * vmcs12's "IA-32e mode guest" entry control.
 */
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
		return vmcs12->guest_ia32_efer;
	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
	else
		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
}

/* One-time initialization of vmcs02 fields that never change for its lifetime. */
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;

	/*
	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
	 * according to L0's settings (vmcs12 is irrelevant here).  Host
	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
	 * will be set as needed prior to VMLAUNCH/VMRESUME.
	 */
	if (vmx->nested.vmcs02_initialized)
		return;
	vmx->nested.vmcs02_initialized = true;

	if (vmx->ve_info)
		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));

	/* All VMFUNCs are currently emulated through L0 vmexits.  */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * PML is emulated for L2, but never enabled in hardware as the MMU
	 * handles A/D emulation.  Disabling PML for L2 also avoids having to
	 * deal with filtering out L2 GPAs from the buffer.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	/*
	 * Set the MSR load/store lists to match L0's settings.  Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}

/* Rarely-changing vmcs02 setup, done only when vmcs12 is dirty (or eVMCS). */
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host.  Emulate this behavior by using vpid01 for L2
	 * if VPID is disabled in vmcs12.  Note, if VPID is disabled, VM-Enter
	 * and VM-Exit are architecturally required to flush VPID=0, but *only*
	 * VPID=0.  I.e. using vpid02 would be ok (so long as KVM emulates the
	 * required flushes), but doing so would cause KVM to over-flush.  E.g.
	 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
	 * and then runs L2 X again, then KVM can and should retain TLB entries
	 * for VPID12=1.
	 */
	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}

/*
 * Merge vmcs12's execution controls with L0's requirements into vmcs02.
 * Called on every nested VM-Entry, before guest state is loaded.
 */
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
				 struct vmcs12 *vmcs12)
{
	u32 exec_control;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 */
	exec_control = __pin_controls_get(vmcs01);
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12.
	 */
	vmx->nested.pi_pending = false;
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
	} else {
		vmx->nested.posted_intr_nv = -1;
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 */
	exec_control = __exec_controls_get(vmcs01); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet.  For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = __secondary_exec_controls_get(vmcs01);

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_ENABLE_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC |
				  SECONDARY_EXEC_DESC);

		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		/* PML is emulated and never enabled in hardware for L2. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);

		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 *
	 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
	 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
	 * do the same for L2.
	 */
	exec_control = __vm_entry_controls_get(vmcs01);
	exec_control |= (vmcs12->vm_entry_controls &
			 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
	exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != kvm_host.efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
	 */
	exec_control = __vm_exit_controls_get(vmcs01);
	if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	else
		exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 */
	if (vmx->nested.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}

/*
 * Read the guest CET state from the current VMCS, gated on the vCPU's
 * IBT/SHSTK capabilities (SSP and the SSP table exist only with SHSTK).
 */
static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet,
				u64 *ssp, u64 *ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		*s_cet = vmcs_readl(GUEST_S_CET);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		*ssp = vmcs_readl(GUEST_SSP);
		*ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
	}
}

/* Counterpart of vmcs_read_cet_state(): load guest CET state into the VMCS. */
static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
				 u64 ssp, u64 ssp_tbl)
{
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
	    guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
		vmcs_writel(GUEST_S_CET, s_cet);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		vmcs_writel(GUEST_SSP, ssp);
		vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl);
	}
}

/*
 * Load the rarely-changing parts of vmcs12's guest state into vmcs02,
 * skipping field groups the eVMCS clean-fields bitmap marks as unchanged.
 */
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))
	{
		/* Segment registers, descriptor-table limits and bases (GRP2). */
		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		/* Segment registers changed underneath the cache. */
		vmx_segment_cache_clear(vmx);
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access the L2's PDPTR, so save them to construct
		 * vmcs12
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0 and L1's desires, we
	 * simply ask to exit on each and every L2 page fault. This is done by
	 * setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * If vmcs12 is configured to save TSC on exit via the auto-store list,
	 * append the MSR to vmcs02's auto-store list so that KVM effectively
	 * reads TSC at the time of VM-Exit from L2.  The saved value will be
	 * propagated to vmcs12's list on nested VM-Exit.
	 *
	 * Don't increment the number of MSRs in the vCPU structure, as saving
	 * TSC is specific to this particular incarnation of vmcb02, i.e. must
	 * not bleed into vmcs01.
	 */
	if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) &&
	    !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) {
		vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr;
		vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC;

		vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1);
	} else {
		vmx->nested.tsc_autostore_slot = -1;
		vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr);
	}
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
		vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
				     vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
 * is assigned to entry_failure_code on failure.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		/* Mask off DEBUGCTL bits KVM doesn't support for the guest. */
		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
					       vmx_get_supported_debugctl(vcpu, false));
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
	}

	if (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
				     vmx->nested.pre_vmenter_ssp,
				     vmx->nested.pre_vmenter_ssp_tbl);

	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
	}

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * Override the CR0/CR4 read shadows after setting the effective guest
	 * CR0/CR4.  The common helpers also set the shadows, but they don't
	 * account for vmcs12's cr0/4_guest_host_mask.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However when force loading the guest state (SMM exit or
	 * loading nested state after migration, it is possible to
	 * have invalid guest state now, which will be later fixed by
	 * restoring L2 register state
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Shadow page tables on either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						 vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself, it only sets the corresponding dirty
	 * bits when it changes a field in eVMCS. Mark all fields as clean
	 * here.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

/*
 * Consistency checks for vmcs12's NMI controls: "virtual NMIs" requires
 * "NMI exiting", and "NMI-window exiting" requires "virtual NMIs".
 */
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}

/* Validate an EPTP value against the EPT capabilities exposed to L1. */
static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
} 2883 2884 /* Page-walk levels validity. */ 2885 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2886 case VMX_EPTP_PWL_5: 2887 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2888 return false; 2889 break; 2890 case VMX_EPTP_PWL_4: 2891 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2892 return false; 2893 break; 2894 default: 2895 return false; 2896 } 2897 2898 /* Reserved bits should not be set */ 2899 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2900 return false; 2901 2902 /* AD, if set, should be supported */ 2903 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2904 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2905 return false; 2906 } 2907 2908 return true; 2909 } 2910 2911 /* 2912 * Checks related to VM-Execution Control Fields 2913 */ 2914 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2915 struct vmcs12 *vmcs12) 2916 { 2917 struct vcpu_vmx *vmx = to_vmx(vcpu); 2918 2919 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2920 vmx->nested.msrs.pinbased_ctls_low, 2921 vmx->nested.msrs.pinbased_ctls_high)) || 2922 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2923 vmx->nested.msrs.procbased_ctls_low, 2924 vmx->nested.msrs.procbased_ctls_high))) 2925 return -EINVAL; 2926 2927 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2928 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2929 vmx->nested.msrs.secondary_ctls_low, 2930 vmx->nested.msrs.secondary_ctls_high))) 2931 return -EINVAL; 2932 2933 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2934 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2935 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2936 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2937 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2938 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2939 nested_vmx_check_nmi_controls(vmcs12) || 2940 
nested_vmx_check_pml_controls(vcpu, vmcs12) || 2941 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2942 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2943 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2944 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2945 return -EINVAL; 2946 2947 if (!nested_cpu_has_preemption_timer(vmcs12) && 2948 nested_cpu_has_save_preemption_timer(vmcs12)) 2949 return -EINVAL; 2950 2951 if (nested_cpu_has_ept(vmcs12) && 2952 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2953 return -EINVAL; 2954 2955 if (nested_cpu_has_vmfunc(vmcs12)) { 2956 if (CC(vmcs12->vm_function_control & 2957 ~vmx->nested.msrs.vmfunc_controls)) 2958 return -EINVAL; 2959 2960 if (nested_cpu_has_eptp_switching(vmcs12)) { 2961 if (CC(!nested_cpu_has_ept(vmcs12)) || 2962 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2963 return -EINVAL; 2964 } 2965 } 2966 2967 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && 2968 CC(!vmcs12->tsc_multiplier)) 2969 return -EINVAL; 2970 2971 return 0; 2972 } 2973 2974 /* 2975 * Checks related to VM-Exit Control Fields 2976 */ 2977 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2978 struct vmcs12 *vmcs12) 2979 { 2980 struct vcpu_vmx *vmx = to_vmx(vcpu); 2981 2982 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2983 vmx->nested.msrs.exit_ctls_low, 2984 vmx->nested.msrs.exit_ctls_high)) || 2985 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2986 return -EINVAL; 2987 2988 return 0; 2989 } 2990 2991 /* 2992 * Checks related to VM-Entry Control Fields 2993 */ 2994 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2995 struct vmcs12 *vmcs12) 2996 { 2997 struct vcpu_vmx *vmx = to_vmx(vcpu); 2998 2999 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 3000 vmx->nested.msrs.entry_ctls_low, 3001 vmx->nested.msrs.entry_ctls_high))) 3002 return -EINVAL; 3003 3004 /* 3005 * From the Intel SDM, volume 3: 3006 * 
 Fields relevant to VM-entry event injection must be set properly.
	 * These fields are the VM-entry interruption-information field, the
	 * VM-entry exception error code, and the VM-entry instruction length.
	 */
	if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
		u32 intr_info = vmcs12->vm_entry_intr_info_field;
		u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
		u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
		bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
		bool urg = nested_cpu_has2(vmcs12,
					   SECONDARY_EXEC_UNRESTRICTED_GUEST);
		bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;

		/* VM-entry interruption-info field: interruption type */
		if (CC(intr_type == INTR_TYPE_RESERVED) ||
		    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
		       !nested_cpu_supports_monitor_trap_flag(vcpu)))
			return -EINVAL;

		/* VM-entry interruption-info field: vector */
		if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
		    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
		    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
			return -EINVAL;

		/*
		 * Cannot deliver error code in real mode or if the interrupt
		 * type is not hardware exception. For other cases, do the
		 * consistency check only if the vCPU doesn't enumerate
		 * VMX_BASIC_NO_HW_ERROR_CODE_CC.
		 */
		if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) {
			if (CC(has_error_code))
				return -EINVAL;
		} else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) {
			if (CC(has_error_code != x86_exception_has_error_code(vector)))
				return -EINVAL;
		}

		/* VM-entry exception error code */
		if (CC(has_error_code &&
		       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
			return -EINVAL;

		/* VM-entry interruption-info field: reserved bits */
		if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
			return -EINVAL;

		/* VM-entry instruction length */
		switch (intr_type) {
		case INTR_TYPE_SOFT_EXCEPTION:
		case INTR_TYPE_SOFT_INTR:
		case INTR_TYPE_PRIV_SW_EXCEPTION:
			if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
			    CC(vmcs12->vm_entry_instruction_len == 0 &&
			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
				return -EINVAL;
		}
	}

	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
		return -EINVAL;

	return 0;
}

/* Run all control-field checks; eVMCS imposes additional restrictions. */
static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
	    nested_check_vm_entry_controls(vcpu, vmcs12))
		return -EINVAL;

#ifdef CONFIG_KVM_HYPERV
	if (guest_cpu_cap_has_evmcs(vcpu))
		return nested_evmcs_check_controls(vmcs12);
#endif

	return 0;
}

/*
 * Consistency checks that can only be performed "late", i.e. after the
 * virtual-APIC page has been mapped.
 */
static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
	u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;

	/*
	 * Don't bother with the consistency checks if KVM isn't configured to
	 * WARN on missed consistency checks, as KVM needs to rely on hardware
	 * to fully detect an illegal vTPR vs. TPR Threshold combination due to
	 * the vTPR being writable by L1 at all times (it's an in-memory value,
	 * not a VMCS field). I.e. even if the check passes now, it might fail
	 * at the actual VM-Enter.
	 *
	 * Keying off the module param also allows treating an invalid vAPIC
	 * mapping as a consistency check failure without increasing the risk
	 * of breaking a "real" VM.
	 */
	if (!warn_on_missed_cc)
		return 0;

	if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
	    nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    (CC(!vapic) ||
	     CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
		return -EINVAL;

	return 0;
}

/* The "host address-space size" VM-exit control must match L1's EFER.LMA. */
static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
#ifdef CONFIG_X86_64
	if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
	       !!(vcpu->arch.efer & EFER_LMA)))
		return -EINVAL;
#endif
	return 0;
}

static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
{
	/*
	 * Check that the given linear address is canonical after a VM exit
	 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
	 */
	u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ?
57 : 48;

	return !__is_canonical_address(la, l1_address_bits_on_exit);
}

/*
 * Common checks on CET state that apply to both guest and host fields:
 * S_CET must be a valid U/S CET value, SSP must be 4-byte aligned, and the
 * SSP table address must be canonical.
 */
static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet,
					     u64 ssp, u64 ssp_tbl)
{
	if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) ||
	    CC(is_noncanonical_msr_address(ssp_tbl, vcpu)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);

	if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
	    CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
	    CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
		return -EINVAL;

	/* CR4.CET requires CR0.WP. */
	if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP)))
		return -EINVAL;

	if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
	    CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
		return -EINVAL;

	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
		return -EINVAL;

	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->host_ia32_perf_global_ctrl)))
		return -EINVAL;

	if (ia32e) {
		if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
			return -EINVAL;
	} else {
		if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
		    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
		    CC((vmcs12->host_rip) >> 32))
			return -EINVAL;
	}

	if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_cs_selector == 0) ||
	    CC(vmcs12->host_tr_selector == 0) ||
	    CC(vmcs12->host_ss_selector == 0 && !ia32e))
		return -EINVAL;

	if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
	    CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
	    CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
	 * the values of the LMA and LME bits in the field must each be that of
	 * the host address-space size VM-exit control.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
		if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
			return -EINVAL;
	}

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) {
		if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet,
						      vmcs12->host_ssp,
						      vmcs12->host_ssp_tbl))
			return -EINVAL;

		/*
		 * IA32_S_CET and SSP must be canonical if the host will
		 * enter 64-bit mode after VM-exit; otherwise, higher
		 * 32-bits must be all 0s.
		 */
		if (ia32e) {
			if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) ||
			    CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu)))
				return -EINVAL;
		} else {
			if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32))
				return -EINVAL;
		}
	}

	return 0;
}

static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
	struct vmcs_hdr hdr;

	/* INVALID_GPA means there is no link pointer, nothing to validate. */
	if (vmcs12->vmcs_link_pointer == INVALID_GPA)
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
		return -EINVAL;

	/* (Re)initialize the gfn=>hva cache only if the pointer changed. */
	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
					 vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
		return -EINVAL;

	if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
					    offsetof(struct vmcs12, hdr),
					    sizeof(hdr))))
		return -EINVAL;

	if (CC(hdr.revision_id != VMCS12_REVISION) ||
	    CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
		return -EINVAL;

	return 0;
}

/*
 * Checks related to Guest Non-register State
 */
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{
	if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					enum vm_entry_failure_code *entry_failure_code)
{
	bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);

	*entry_failure_code = ENTRY_FAIL_DEFAULT;

	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
		return -EINVAL;

	/* CR4.CET requires CR0.WP. */
	if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
		return -EINVAL;

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
		u64 debugctl = vmcs12->guest_ia32_debugctl;

		/*
		 * FREEZE_IN_SMM is not virtualized, but allow L1 to set it in
		 * vmcs12's DEBUGCTL under a quirk for backwards compatibility.
		 * Note that the quirk only relaxes the consistency check. The
		 * vmcs02 bit is still under the control of the host. In
		 * particular, if a host administrator decides to clear the bit,
		 * then L1 has no say in the matter.
		 */
		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM))
			debugctl &= ~DEBUGCTLMSR_FREEZE_IN_SMM;

		if (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
		    CC(!vmx_is_valid_debugctl(vcpu, debugctl, false)))
			return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
		return -EINVAL;

	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
		*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
		return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->guest_ia32_perf_global_ctrl)))
		return -EINVAL;

	/* CR0.PG without CR0.PE is an illegal combination. */
	if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
		return -EINVAL;

	if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
	    CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-entry control is 1, the following checks
	 * are performed on the field for the IA32_EFER MSR:
	 * - Bits reserved in the IA32_EFER MSR must be 0.
	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
	 *   the IA-32e mode guest VM-exit control. It must also be identical
	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
	 *   CR0.PG) is 1.
	 */
	if (to_vmx(vcpu)->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
			ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
			return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
	    (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
		return -EINVAL;

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) {
		if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet,
						      vmcs12->guest_ssp,
						      vmcs12->guest_ssp_tbl))
			return -EINVAL;

		/*
		 * Guest SSP must have 63:N bits identical, rather than
		 * be canonical (i.e., 63:N-1 bits identical), where N is
		 * the CPU's maximum linear-address width. Similar to
		 * is_noncanonical_msr_address(), use the host's
		 * linear-address width.
		 */
		if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1)))
			return -EINVAL;
	}

	if (nested_check_guest_non_reg_state(vmcs12))
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up being not mapped after migration (when
	 * L2 was running), map it here to make sure vmcs12 changes are
	 * properly reflected.
	 */
	if (guest_cpu_cap_has_evmcs(vcpu) &&
	    vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
		enum nested_evmptrld_status evmptrld_status =
			nested_vmx_handle_enlightened_vmptrld(vcpu, false);

		if (evmptrld_status == EVMPTRLD_VMFAIL ||
		    evmptrld_status == EVMPTRLD_ERROR)
			return false;

		/*
		 * Post migration VMCS12 always provides the most actual
		 * information, copy it to eVMCS upon entry.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}

	return true;
}
#endif

static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_host_map *map;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
		/*
		 * Reload the guest's PDPTRs since after a migration
		 * the guest CR3 might be restored prior to setting the nested
		 * state which can lead to a load of wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
			return false;
	}


	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		map = &vmx->nested.apic_access_page_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
			vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
		} else {
			/* No usable backing page => KVM internal error exit. */
			pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
					     __func__);
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror =
				KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
			return false;
		}
	}

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		map = &vmx->nested.virtual_apic_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
			/*
			 * The processor will never use the TPR shadow, simply
			 * clear the bit from the execution control.  Such a
			 * configuration is useless, but it happens in tests.
			 * For any other configuration, failing the vm entry is
			 * _not_ what the processor does but it's basically the
			 * only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
			 */
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		map = &vmx->nested.pi_desc_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
			vmx->nested.pi_desc =
				(struct pi_desc *)(((void *)map->hva) +
				offset_in_page(vmcs12->posted_intr_desc_addr));
			vmcs_write64(POSTED_INTR_DESC_ADDR,
				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
		} else {
			/*
			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
			 * access the contents of the VMCS12 posted interrupt
			 * descriptor. (Note that KVM may do this when it
			 * should not, per the architectural specification.)
			 */
			vmx->nested.pi_desc = NULL;
			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
		}
	}
	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	else
		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);

	return true;
}

static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	/*
	 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
	 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
	 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
	 * migration.
	 */
	if (!nested_get_evmcs_page(vcpu)) {
		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
				     __func__);
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;

		return false;
	}
#endif

	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
		return false;

	return true;
}

static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct vmcs12 *vmcs12;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t dst;

	if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
		return 0;

	if (WARN_ON_ONCE(vmx->nested.pml_full))
		return 1;

	/*
	 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
	 * set is already checked as part of A/D emulation.
	 */
	vmcs12 = get_vmcs12(vcpu);
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	/* Returning 1 signals a PML-full condition to the caller. */
	if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
		vmx->nested.pml_full = true;
		return 1;
	}

	/* Log the 4KiB-aligned GPA; the PML index counts down per write. */
	gpa &= ~0xFFFull;
	dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;

	if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
				 offset_in_page(dst), sizeof(gpa)))
		return 0;

	vmcs12->guest_pml_index--;

	return 0;
}

/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
	/* #UD if the vCPU is not post-VMXON. */
	if (!to_vmx(vcpu)->nested.vmxon) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 0;
	}

	/* #GP(0) if executed at CPL > 0. */
	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 0;
	}

	return 1;
}

static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12);

/*
 * If from_vmentry is false, this is being called from state restore (either RSM
 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
 *
 * Returns:
 *	NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
 *	NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
 *	NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
 *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
 */
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
							bool from_vmentry)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	enum vm_entry_failure_code entry_failure_code;
	union vmx_exit_reason exit_reason = {
		.basic = EXIT_REASON_INVALID_STATE,
		.failed_vmentry = 1,
	};
	u32 failed_index;

	trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
				 vmx->nested.current_vmptr,
				 vmcs12->guest_rip,
				 vmcs12->guest_intr_status,
				 vmcs12->vm_entry_intr_info_field,
				 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
				 vmcs12->ept_pointer,
				 vmcs12->guest_cr3,
				 KVM_ISA_VMX);

	kvm_service_local_tlb_flush_requests(vcpu);

	/*
	 * Snapshot DEBUGCTL/BNDCFGS/CET state that will NOT be loaded from
	 * vmcs12 on this VM-Enter, so it can be restored on nested VM-Exit.
	 */
	if (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
	if (kvm_mpx_supported() &&
	    (!vmx->nested.nested_run_pending ||
	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	if (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
				    &vmx->nested.pre_vmenter_ssp,
				    &vmx->nested.pre_vmenter_ssp_tbl);

	/*
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
	 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
	 * not KVM, KVM must unwind its software model to the pre-VM-Entry host
	 * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
	 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
	 * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
	 * unwind naturally setting arch.cr3 to the correct value. Smashing
	 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
	 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
	 * overwritten with a shadow CR3 prior to re-entering L1.
	 */
	if (!enable_ept)
		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);

	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);

	prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);

	if (from_vmentry) {
		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
		}

		if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_VMFAIL;
		}

		if (nested_vmx_check_guest_state(vcpu, vmcs12,
						 &entry_failure_code)) {
			exit_reason.basic = EXIT_REASON_INVALID_STATE;
			vmcs12->exit_qualification = entry_failure_code;
			goto vmentry_fail_vmexit;
		}
	}

	enter_guest_mode(vcpu);

	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
		exit_reason.basic = EXIT_REASON_INVALID_STATE;
		vmcs12->exit_qualification = entry_failure_code;
		goto vmentry_fail_vmexit_guest_mode;
	}

	if (from_vmentry) {
		failed_index = nested_vmx_load_msr(vcpu,
						   vmcs12->vm_entry_msr_load_addr,
						   vmcs12->vm_entry_msr_load_count);
		if (failed_index) {
			exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
			vmcs12->exit_qualification = failed_index;
			goto vmentry_fail_vmexit_guest_mode;
		}
	} else {
		/*
		 * The MMU is not initialized to point at the right entities yet and
		 * "get pages" would need to read data from the guest (i.e. we will
		 * need to perform gpa to hpa translation). Request a call
		 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
		 * have already been set at vmentry time and should not be reset.
		 */
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	}

	/*
	 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
	 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
	 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
	 * unconditionally. Take care to pull data from vmcs01 as appropriate,
	 * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
	 */
	if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
						  CPU_BASED_NMI_WINDOW_EXITING)) ||
	    kvm_apic_has_pending_init_or_sipi(vcpu) ||
	    kvm_apic_has_interrupt(vcpu))
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * Do not start the preemption timer hrtimer until after we know
	 * we are successful, so that only nested_vmx_vmexit needs to cancel
	 * the timer.
	 */
	vmx->nested.preemption_timer_expired = false;
	if (nested_cpu_has_preemption_timer(vmcs12)) {
		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
		vmx_start_preemption_timer(vcpu, timer_value);
	}

	/*
	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
	 * returned as far as L1 is concerned. It will only return (and set
	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
	 */
	return NVMX_VMENTRY_SUCCESS;

	/*
	 * A failed consistency check that leads to a VMExit during L1's
	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
	 * 26.7 "VM-entry failures during or after loading guest state".
	 */
vmentry_fail_vmexit_guest_mode:
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
	leave_guest_mode(vcpu);

vmentry_fail_vmexit:
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!from_vmentry)
		return NVMX_VMENTRY_VMEXIT;

	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = exit_reason.full;
	if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	return NVMX_VMENTRY_VMEXIT;
}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/*
	 * NOTE(review): presumably counts the emulated VMLAUNCH/VMRESUME as a
	 * retired branch for the vPMU — confirm against kvm_pmu_branch_retired().
	 */
	kvm_pmu_branch_retired(vcpu);

	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
		return nested_vmx_failInvalid(vcpu);

	/* A valid (e)VMCS pointer is required to VMLAUNCH/VMRESUME. */
	if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
	       vmx->nested.current_vmptr == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx)) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and act appropriately when
	 * they fail: As the SDM explains, some conditions should cause the
	 * instruction to fail, while others will cause the instruction to seem
	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will anyway be caught by the processor
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	/* VMLAUNCH requires a clear VMCS, VMRESUME a launched one. */
	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_address_space_size(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vmx->nested.nested_run_pending = 1;
	vmx->nested.has_preemption_timer_deadline = false;
	status = nested_vmx_enter_non_root_mode(vcpu, true);
	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
		goto vmentry_failed;

	/* Hide L1D cache contents from the nested guest. */
	kvm_request_l1tf_flush_l1d();

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that vmcs12 cache was
	 * transferred as part of captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	switch (vmcs12->guest_activity_state) {
	case GUEST_ACTIVITY_HLT:
		/*
		 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
		 * awakened by event injection or by an NMI-window VM-exit or
		 * by an interrupt-window VM-exit, halt the vcpu.
		 */
		if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
		    !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
		    !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
		      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
			vmx->nested.nested_run_pending = 0;
			return kvm_emulate_halt_noskip(vcpu);
		}
		break;
	case GUEST_ACTIVITY_WAIT_SIPI:
		vmx->nested.nested_run_pending = 0;
		kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
		break;
	default:
		break;
	}

	return 1;

vmentry_failed:
	/* Return 0 == exit to userspace (internal error), 1 == resume guest. */
	vmx->nested.nested_run_pending = 0;
	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
		return 0;
	if (status == NVMX_VMENTRY_VMEXIT)
		return 1;
	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
	return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *    available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *    didn't trap the bit, because if L1 did, so would L0).
 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *    been modified by L2, and L1 knows it. So just leave the old value of
 *    the bit from vmcs12.guest_cr0.
 *    Note that the bit from vmcs02 GUEST_CR0
 *    isn't relevant, because if L0 traps this bit it can set it to anything.
 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *    changed these bits, and therefore they need to be updated, but L0
 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
 */
static inline unsigned long
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
			vcpu->arch.cr0_guest_owned_bits));
}

/* Same three-way merge as vmcs12_guest_cr0(), but for CR4. */
static inline unsigned long
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
			vcpu->arch.cr4_guest_owned_bits));
}

/*
 * Record the event that was being injected into L2 when the VM-Exit occurred
 * (exception, NMI, or interrupt) into vmcs12's IDT-vectoring info fields so
 * that L1 can re-inject it; clear the field if no event was in flight.
 */
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
				      struct vmcs12 *vmcs12,
				      u32 vm_exit_reason, u32 exit_intr_info)
{
	u32 idt_vectoring;
	unsigned int nr;

	/*
	 * Per the SDM, VM-Exits due to double and triple faults are never
	 * considered to occur during event delivery, even if the double/triple
	 * fault is the result of an escalating vectoring issue.
	 *
	 * Note, the SDM qualifies the double fault behavior with "The original
	 * event results in a double-fault exception". It's unclear why the
	 * qualification exists since exits due to double fault can occur only
	 * while vectoring a different exception (injected events are never
	 * subject to interception), i.e. there's _always_ an original event.
	 *
	 * The SDM also uses NMI as a confusing example for the "original event
	 * causes the VM exit directly" clause. NMI isn't special in any way,
	 * the same rule applies to all events that cause an exit directly.
	 * NMI is an odd choice for the example because NMIs can only occur on
	 * instruction boundaries, i.e. they _can't_ occur during vectoring.
	 */
	if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
	    ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
	     is_double_fault(exit_intr_info))) {
		vmcs12->idt_vectoring_info_field = 0;
	} else if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.vector;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (kvm_exception_is_soft(nr)) {
			vmcs12->vm_exit_instruction_len =
				vcpu->arch.event_exit_inst_len;
			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
		} else
			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;

		if (vcpu->arch.exception.has_error_code) {
			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
			vmcs12->idt_vectoring_error_code =
				vcpu->arch.exception.error_code;
		}

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else if (vcpu->arch.nmi_injected) {
		vmcs12->idt_vectoring_info_field =
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (vcpu->arch.interrupt.soft) {
			idt_vectoring |= INTR_TYPE_SOFT_INTR;
			/*
			 * NOTE(review): soft exceptions above record
			 * vm_exit_instruction_len, soft interrupts record
			 * vm_entry_instruction_len — confirm the asymmetry
			 * is intentional.
			 */
			vmcs12->vm_entry_instruction_len =
				vcpu->arch.event_exit_inst_len;
		} else
			idt_vectoring |= INTR_TYPE_EXT_INTR;

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else {
		vmcs12->idt_vectoring_info_field = 0;
	}
}

/*
 * Process a pending nested posted interrupt: fold the PIR into L2's virtual
 * APIC page and raise RVI in GUEST_INTR_STATUS if the new vector is higher.
 */
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;
	void
*vapic_page;
	u16 status;

	if (!vmx->nested.pi_pending)
		return 0;

	if (!vmx->nested.pi_desc)
		goto mmio_needed;

	vmx->nested.pi_pending = false;

	/* Nothing to do unless the descriptor's Outstanding Notification is set. */
	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
		return 0;

	max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
	if (max_irr > 0) {
		vapic_page = vmx->nested.virtual_apic_map.hva;
		if (!vapic_page)
			goto mmio_needed;

		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
			vapic_page, &max_irr);
		/* Bump RVI (low byte of GUEST_INTR_STATUS) if a higher vector arrived. */
		status = vmcs_read16(GUEST_INTR_STATUS);
		if ((u8)max_irr > ((u8)status & 0xff)) {
			status &= ~0xff;
			status |= (u8)max_irr;
			vmcs_write16(GUEST_INTR_STATUS, status);
		}
	}

	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map);
	return 0;

mmio_needed:
	/* vAPIC page or PI descriptor isn't mapped; punt to userspace. */
	kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
	return -ENXIO;
}

/*
 * Synthesize an EXIT_REASON_EXCEPTION_NMI VM-Exit to L1 for a pending
 * exception, building the interruption-info and exit qualification (the
 * exception payload, i.e. to-be-DR6/CR2 for #DB/#PF) fields.
 */
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned long exit_qual;

	if (ex->has_payload) {
		exit_qual = ex->payload;
	} else if (ex->vector == PF_VECTOR) {
		exit_qual = vcpu->arch.cr2;
	} else if (ex->vector == DB_VECTOR) {
		/* Reconstruct the architectural DR6 image from KVM's dr6. */
		exit_qual = vcpu->arch.dr6;
		exit_qual &= ~DR6_BT;
		exit_qual ^= DR6_ACTIVE_LOW;
	} else {
		exit_qual = 0;
	}

	/*
	 * Unlike AMD's Paged Real Mode, which reports an error code on #PF
	 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
	 * "has error code" flags on VM-Exit if the CPU is in Real Mode.
	 */
	if (ex->has_error_code && is_protmode(vcpu)) {
		/*
		 * Intel CPUs do not generate error codes with bits 31:16 set,
		 * and more importantly VMX disallows setting bits 31:16 in the
		 * injected error code for VM-Entry. Drop the bits to mimic
		 * hardware and avoid inducing failure on nested VM-Entry if L1
		 * chooses to inject the exception back to L2. AMD CPUs _do_
		 * generate "full" 32-bit error codes, so KVM allows userspace
		 * to inject exception error codes with bits 31:16 set.
		 */
		vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (kvm_exception_is_soft(ex->vector))
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
	    vmx_get_nmi_mask(vcpu))
		intr_info |= INTR_INFO_UNBLOCK_NMI;

	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}

/*
 * Returns true if a debug trap is (likely) pending delivery. Infer the class
 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
 * Using the payload is flawed because code breakpoints (fault-like) and data
 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
 * this will return false positives if a to-be-injected code breakpoint #DB is
 * pending (from KVM's perspective, but not "pending" across an instruction
 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
 * too is trap-like.
 *
 * KVM "works" despite these flaws as ICEBP isn't currently supported by the
 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
 * #DB has already happened), and MTF isn't marked pending on code breakpoints
 * from the emulator (because such #DBs are fault-like and thus don't trigger
 * actions that fire on instruction retire).
 */
static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
{
	/* Despite the comment above, the return is the payload; nonzero == pending. */
	if (!ex->pending || ex->vector != DB_VECTOR)
		return 0;

	/* General Detect #DBs are always fault-like. */
	return ex->payload & ~DR6_BD;
}

/*
 * Returns true if there's a pending #DB exception that is lower priority than
 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
 * KVM, but could theoretically be injected by userspace. Note, this code is
 * imperfect, see above.
 */
static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
{
	return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
}

/*
 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
 * represents these debug traps with a payload that is said to be compatible
 * with the 'pending debug exceptions' field, write the payload to the VMCS
 * field if a VM-exit is delivered before the debug trap.
 */
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{
	unsigned long pending_dbg;

	pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
	if (pending_dbg)
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
}

/* True if L1 enabled the VMX preemption timer and it has already fired. */
static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
	       to_vmx(vcpu)->nested.preemption_timer_expired;
}

/* Returns true if a nested-VMX-specific event is pending for L2. */
static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	void *vapic = vmx->nested.virtual_apic_map.hva;
	int max_irr, vppr;

	if (nested_vmx_preemption_timer_pending(vcpu) ||
	    vmx->nested.mtf_pending)
		return true;

	/*
	 * Virtual Interrupt Delivery doesn't require manual injection. Either
	 * the interrupt is already in GUEST_RVI and will be recognized by CPU
	 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
	 * the interrupt from the PIR to RVI prior to entering the guest.
	 */
	if (for_injection)
		return false;

	if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
	    __vmx_interrupt_blocked(vcpu))
		return false;

	if (!vapic)
		return false;

	/* Read the virtual PPR straight off the virtual APIC page. */
	vppr = *((u32 *)(vapic + APIC_PROCPRI));

	/* An interrupt is deliverable if its priority class exceeds VPPR. */
	max_irr = vmx_get_rvi();
	if ((max_irr & 0xf0) > (vppr & 0xf0))
		return true;

	if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
	    pi_test_on(vmx->nested.pi_desc)) {
		max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
		if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
			return true;
	}

	return false;
}

/*
 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
 * edits to fill in missing examples, e.g.
 * #DB due to split-lock accesses,
 * and less minor edits to splice in the priority of VMX Non-Root specific
 * events, e.g. MTF and NMI/INTR-window exiting.
 *
 * 1 Hardware Reset and Machine Checks
 *	- RESET
 *	- Machine Check
 *
 * 2 Trap on Task Switch
 *	- T flag in TSS is set (on task switch)
 *
 * 3 External Hardware Interventions
 *	- FLUSH
 *	- STOPCLK
 *	- SMI
 *	- INIT
 *
 * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
 *
 * 4 Traps on Previous Instruction
 *	- Breakpoints
 *	- Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
 *	  breakpoint, or #DB due to a split-lock access)
 *
 * 4.3 VMX-preemption timer expired VM-exit[2]
 *
 * 4.6 NMI-window exiting VM-exit[3]
 *
 * 5 Nonmaskable Interrupts (NMI)
 *
 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
 *
 * 6 Maskable Hardware Interrupts
 *
 * 7 Code Breakpoint Fault
 *
 * 8 Faults from Fetching Next Instruction
 *	- Code-Segment Limit Violation
 *	- Code Page Fault
 *	- Control protection exception (missing ENDBRANCH at target of indirect
 *	  call or jump)
 *
 * 9 Faults from Decoding Next Instruction
 *	- Instruction length > 15 bytes
 *	- Invalid Opcode
 *	- Coprocessor Not Available
 *
 *10 Faults on Executing Instruction
 *	- Overflow
 *	- Bound error
 *	- Invalid TSS
 *	- Segment Not Present
 *	- Stack fault
 *	- General Protection
 *	- Data Page Fault
 *	- Alignment Check
 *	- x86 FPU Floating-point exception
 *	- SIMD floating-point exception
 *	- Virtualization exception
 *	- Control protection exception
 *
 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
 *     INIT signals, and higher priority events take priority over MTF VM exits.
 *     MTF VM exits take priority over debug-trap exceptions and lower priority
 *     events.
 *
 * [2] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
 *     timer take priority over VM exits caused by the "NMI-window exiting"
 *     VM-execution control and lower priority events.
 *
 * [3] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by "NMI-window exiting". VM exits caused by this control take
 *     priority over non-maskable interrupts (NMIs) and lower priority events.
 *
 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
 *     the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
 *     non-maskable interrupts (NMIs) and higher priority events take priority over
 *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
 *     priority over external interrupts and lower priority events.
 */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * Only a pending nested run blocks a pending exception. If there is a
	 * previously injected event, the pending exception occurred while said
	 * event was being delivered and thus needs to be handled.
	 */
	bool block_nested_exceptions = vmx->nested.nested_run_pending;
	/*
	 * Events that don't require injection, i.e. that are virtualized by
	 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
	 * to regain control in order to deliver the event, and hardware will
	 * handle event ordering, e.g. with respect to injected exceptions.
	 *
	 * But, new events (not exceptions) are only recognized at instruction
	 * boundaries. If an event needs reinjection, then KVM is handling a
	 * VM-Exit that occurred _during_ instruction execution; new events,
	 * irrespective of whether or not they're injected, are blocked until
	 * the instruction completes.
	 */
	bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
	/*
	 * Inject events are blocked by nested VM-Enter, as KVM is responsible
	 * for managing priority between concurrent events, i.e. KVM needs to
	 * wait until after VM-Enter completes to deliver injected events.
	 */
	bool block_nested_events = block_nested_exceptions ||
				   block_non_injected_events;

	/* The checks below are ordered per the priority table above. */
	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		clear_bit(KVM_APIC_INIT, &apic->pending_events);
		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
			nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);

		/* MTF is discarded if the vCPU is in WFS. */
		vmx->nested.mtf_pending = false;
		return 0;
	}

	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;

		clear_bit(KVM_APIC_SIPI, &apic->pending_events);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
						apic->sipi_vector & 0xFFUL);
			return 0;
		}
		/* Fallthrough, the SIPI is completely ignored. */
	}

	/*
	 * Process exceptions that are higher priority than Monitor Trap Flag:
	 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
	 * could theoretically come in from userspace), and ICEBP (INT1).
	 *
	 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
	 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
	 * across SMI/RSM as it should; that needs to be addressed in order to
	 * prioritize SMI over MTF and trap-like #DBs.
	 */
	if (vcpu->arch.exception_vmexit.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending &&
	    !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vmx->nested.mtf_pending) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_update_pending_dbg(vcpu);
		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
		return 0;
	}

	/* Low-priority (trap-like) #DBs, now that MTF has been serviced. */
	if (vcpu->arch.exception_vmexit.pending) {
		if (block_nested_exceptions)
			return -EBUSY;

		nested_vmx_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		if (block_nested_exceptions)
			return -EBUSY;
		goto no_vmexit;
	}

	if (nested_vmx_preemption_timer_pending(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
		return 0;
	}

	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		goto no_vmexit;
	}

	if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(vcpu))
			goto no_vmexit;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
				  INTR_INFO_VALID_MASK, 0);
		/*
		 * The NMI-triggered VM exit counts as injection:
		 * clear this one and block further NMIs.
		 */
		vcpu->arch.nmi_pending = 0;
		vmx_set_nmi_mask(vcpu, true);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
		int irq;

		if (!nested_exit_on_intr(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			goto no_vmexit;
		}

		if (!nested_exit_intr_ack_set(vcpu)) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
			return 0;
		}

		irq = kvm_cpu_get_extint(vcpu);
		if (irq != -1) {
			if (block_nested_events)
				return -EBUSY;

			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
					  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
			return 0;
		}

		irq = kvm_apic_has_interrupt(vcpu);
		if (WARN_ON_ONCE(irq < 0))
			goto no_vmexit;

		/*
		 * If the IRQ is L2's PI notification vector, process posted
		 * interrupts for L2 instead of injecting VM-Exit, as the
		 * detection/morphing architecturally occurs when the IRQ is
		 * delivered to the CPU. Note, only interrupts that are routed
		 * through the local APIC trigger posted interrupt processing,
		 * and enabling posted interrupts requires ACK-on-exit.
		 */
		if (irq == vmx->nested.posted_intr_nv) {
			/*
			 * Nested posted interrupts are delivered via RVI, i.e.
			 * aren't injected by KVM, and so can be queued even if
			 * manual event injection is disallowed.
			 */
			if (block_non_injected_events)
				return -EBUSY;

			vmx->nested.pi_pending = true;
			kvm_apic_clear_irr(vcpu, irq);
			goto no_vmexit;
		}

		if (block_nested_events)
			return -EBUSY;

		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
				  INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);

		/*
		 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
		 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
		 * if APICv is active.
		 */
		kvm_apic_ack_interrupt(vcpu, irq);
		return 0;
	}

no_vmexit:
	return vmx_complete_nested_posted_interrupt(vcpu);
}

/*
 * Convert the remaining host hrtimer time into guest preemption-timer units
 * (guest TSC ticks shifted by the emulated timer rate).
 */
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	ktime_t remaining =
		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
	u64 value;

	if (ktime_to_ns(remaining) <= 0)
		return 0;

	/* ns * kHz / 1e6 == TSC ticks; then scale by the emulated timer rate. */
	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
	do_div(value, 1000000);
	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}

/*
 * Fields synced to vmcs12 only by the "rare" path, i.e. those handled by
 * sync_vmcs02_to_vmcs12_rare() rather than on every exit.
 */
static bool is_vmcs12_ext_field(unsigned long field)
{
	switch (field) {
	case GUEST_ES_SELECTOR:
	case GUEST_CS_SELECTOR:
	case GUEST_SS_SELECTOR:
	case GUEST_DS_SELECTOR:
	case GUEST_FS_SELECTOR:
	case GUEST_GS_SELECTOR:
	case GUEST_LDTR_SELECTOR:
	case GUEST_TR_SELECTOR:
	case GUEST_ES_LIMIT:
	case GUEST_CS_LIMIT:
	case GUEST_SS_LIMIT:
	case GUEST_DS_LIMIT:
	case GUEST_FS_LIMIT:
	case GUEST_GS_LIMIT:
	case GUEST_LDTR_LIMIT:
	case GUEST_TR_LIMIT:
	case GUEST_GDTR_LIMIT:
	case GUEST_IDTR_LIMIT:
	case GUEST_ES_AR_BYTES:
	case GUEST_DS_AR_BYTES:
	case GUEST_FS_AR_BYTES:
	case GUEST_GS_AR_BYTES:
	case GUEST_LDTR_AR_BYTES:
	case GUEST_TR_AR_BYTES:
	case GUEST_ES_BASE:
	case GUEST_CS_BASE:
	case GUEST_SS_BASE:
	case GUEST_DS_BASE:
	case GUEST_FS_BASE:
	case
GUEST_GS_BASE:
	case GUEST_LDTR_BASE:
	case GUEST_TR_BASE:
	case GUEST_GDTR_BASE:
	case GUEST_IDTR_BASE:
	case GUEST_PENDING_DBG_EXCEPTIONS:
	case GUEST_BNDCFGS:
		return true;
	default:
		break;
	}

	return false;
}

/*
 * Read the rarely-accessed guest fields (segment registers, descriptor
 * tables, pending debug exceptions) out of the currently loaded VMCS into
 * vmcs12. Caller is responsible for having vmcs02 loaded (see
 * copy_vmcs02_to_vmcs12_rare()).
 *
 * NOTE(review): GUEST_BNDCFGS is listed in is_vmcs12_ext_field() but is not
 * read here — confirm it's synced elsewhere or intentionally dropped.
 */
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
	vmcs12->guest_pending_dbg_exceptions =
		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);

	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
}

/*
 * If a rare-field sync is owed, temporarily load vmcs02 so the fields can be
 * read, then restore vmcs01. Expects to be called with vmcs01 loaded.
 */
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
		return;

	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);

	/* Pin the CPU while swapping VMCSes; a migration would corrupt state. */
	cpu = get_cpu();
	vmx->loaded_vmcs = &vmx->nested.vmcs02;
	vmx_vcpu_load_vmcs(vcpu, cpu);

	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	put_cpu();
}

/*
 * Update the guest state fields of vmcs12 to reflect changes that
 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
 * VM-entry controls is also updated, since this is really a guest
 * state bit.)
 */
static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * With a valid Enlightened VMCS, sync the rare fields immediately;
	 * otherwise defer them (flag below) until they are actually needed.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
		!nested_vmx_is_evmptr12_valid(vmx);

	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);

	vmcs12->guest_rsp = kvm_rsp_read(vcpu);
	vmcs12->guest_rip = kvm_rip_read(vcpu);
	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);

	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);

	vmcs12->guest_interruptibility_info =
		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);

	/* Translate KVM's mp_state into the architectural activity state. */
	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
	else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
		vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
	else
		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;

	/*
	 * Save the remaining preemption timer value only if L1 asked for it
	 * and this is a real VM-Exit (not a pending nested VM-Enter).
	 */
	if (nested_cpu_has_preemption_timer(vmcs12) &&
	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
	    !vmx->nested.nested_run_pending)
		vmcs12->vmx_preemption_timer_value =
			vmx_get_preemption_timer_value(vcpu);

	/*
	 * In some cases (usually, nested EPT), L2 is allowed to change its
	 * own CR3 without exiting. If it has changed it, we must keep it.
	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
	 *
	 * Additionally, restore L2's PDPTR to vmcs12.
	 */
	if (enable_ept) {
		vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
		if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
			vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
			vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
			vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
			vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
		}
	}

	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);

	if (nested_cpu_has_vid(vmcs12))
		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);

	/* Propagate only the IA32e-mode bit; it is effectively guest state. */
	vmcs12->vm_entry_controls =
		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);

	/*
	 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
	 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
	 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
	 * vmcs02 doesn't strictly track vmcs12.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
		vmcs12->guest_dr7 = vcpu->arch.dr7;

	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
		vmcs12->guest_ia32_efer = vcpu->arch.efer;

	vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet,
			    &vmcs12->guest_ssp,
			    &vmcs12->guest_ssp_tbl);
}

/*
 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
 * and this function updates it to reflect the changes to the guest state while
 * L2 was running (and perhaps made some exits which were handled directly by L0
 * without going back to L1), and to reflect the exit reason.
 * Note that we do not have to copy here all VMCS fields, just those that
 * could have changed by the L2 guest or the exit - i.e., the guest-state and
 * exit-information fields only.
Other fields are modified by L1 with VMWRITE,
 * which already writes to vmcs12 directly.
 */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 vm_exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification, u32 exit_insn_len)
{
	/* update exit information fields: */
	vmcs12->vm_exit_reason = vm_exit_reason;
	if (vmx_get_exit_reason(vcpu).enclave_mode)
		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
	vmcs12->exit_qualification = exit_qualification;

	/*
	 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
	 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
	 * exit info fields are unmodified.
	 */
	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		vmcs12->launch_state = 1;

		/* vm_entry_intr_info_field is cleared on exit. Emulate this
		 * instead of reading the real value. */
		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;

		/*
		 * Transfer the event that L0 or L1 may have wanted to inject
		 * into L2 to IDT_VECTORING_INFO_FIELD.
		 */
		vmcs12_save_pending_event(vcpu, vmcs12,
					  vm_exit_reason, exit_intr_info);

		vmcs12->vm_exit_intr_info = exit_intr_info;
		vmcs12->vm_exit_instruction_len = exit_insn_len;
		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

		/*
		 * According to spec, there's no need to store the guest's
		 * MSRs if the exit is due to a VM-entry failure that occurs
		 * during or after loading the guest state. Since this exit
		 * does not fall in that category, we need to save the MSRs.
		 */
		if (nested_vmx_store_msr(vcpu,
					 vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu,
					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	}
}

/*
 * A part of what we need to do when the nested L2 guest exits and we want to
 * run its L1 parent, is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 */
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{
	enum vm_entry_failure_code ignored;
	struct kvm_segment seg;

	/*
	 * EFER: honor VM_EXIT_LOAD_IA32_EFER if set, otherwise derive
	 * LMA/LME from the "host address-space size" exit control.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_rsp_write(vcpu, vmcs12->host_rsp);
	kvm_rip_write(vcpu, vmcs12->host_rip);
	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
	vmx_set_interrupt_shadow(vcpu, 0);

	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because vmx_set_cr0 refers to efer set above.
	 *
	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
	 * (KVM doesn't change it);
	 */
	vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
	vmx_set_cr0(vcpu, vmcs12->host_cr0);

	/* Same as above - no reason to call set_cr4_guest_host_mask().
 */
	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs12->host_cr4);

	nested_ept_uninit_mmu_context(vcpu);

	/*
	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
	 * couldn't have changed.
	 */
	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);

	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);

	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
		vmcs_write64(GUEST_BNDCFGS, 0);

	/*
	 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set,
	 * otherwise CET state should be retained across VM-exit, i.e.,
	 * guest values should be propagated from vmcs12 to vmcs01.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE)
		vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp,
				     vmcs12->host_ssp_tbl);
	else
		vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp,
				     vmcs12->guest_ssp_tbl);

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
		vcpu->arch.pat = vmcs12->host_ia32_pat;
	}
	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
		WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						     vmcs12->host_ia32_perf_global_ctrl));

	/* Set L1 segment info according to Intel SDM
	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.selector = vmcs12->host_cs_selector,
		.type = 11,
		.present = 1,
		.s = 1,
		.g = 1
	};
	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		seg.l = 1;
	else
		seg.db = 1;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.type = 3,
		.present = 1,
		.s = 1,
		.db = 1,
		.g = 1
	};
	seg.selector = vmcs12->host_ds_selector;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
	seg.selector = vmcs12->host_es_selector;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
	seg.selector = vmcs12->host_ss_selector;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
	seg.selector = vmcs12->host_fs_selector;
	seg.base = vmcs12->host_fs_base;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
	seg.selector = vmcs12->host_gs_selector;
	seg.base = vmcs12->host_gs_base;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
	seg = (struct kvm_segment) {
		.base = vmcs12->host_tr_base,
		.limit = 0x67,
		.selector = vmcs12->host_tr_selector,
		.type = 11,
		.present = 1
	};
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);

	/* LDTR is marked unusable on VM-Exit. */
	memset(&seg, 0, sizeof(seg));
	seg.unusable = 1;
	__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);

	kvm_set_dr(vcpu, 7, 0x400);
	vmx_guest_debugctl_write(vcpu, 0);

	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
				vmcs12->vm_exit_msr_load_count))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);

	to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}

/*
 * Determine the value of IA32_EFER that vmcs01 would load into the guest.
 * Checks, in priority order: the VM-Entry "load EFER" control, hardware
 * support for loading EFER (host value), the MSR autoload list, and the
 * user-return MSR list, finally falling back to the host's EFER.
 */
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
{
	struct vmx_uret_msr *efer_msr;
	unsigned int i;

	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
		return vmcs_read64(GUEST_IA32_EFER);

	if (cpu_has_load_ia32_efer())
		return kvm_host.efer;

	for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
		if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
			return vmx->msr_autoload.guest.val[i].value;
	}

	efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
	if (efer_msr)
		return efer_msr->data;

	return kvm_host.efer;
}

/*
 * Unwind KVM's software model back to L1's host state after a hardware
 * detected VM-Entry failure (VM-Fail), i.e. without emulating a full VM-Exit.
 * See the caller in __nested_vmx_vmexit() for context.
 */
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msr_entry g, h;
	gpa_t gpa;
	u32 i, j;

	vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
		/*
		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
		 * as vmcs01.GUEST_DR7 contains a userspace defined value
		 * and vcpu->arch.dr7 is not squirreled away before the
		 * nested VMENTER (not worth adding a variable in nested_vmx).
		 */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
			kvm_set_dr(vcpu, 7, DR7_FIXED_1);
		else
			WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
	}

	/* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
	vmx_reload_guest_debugctl(vcpu);

	/*
	 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
	 * handle a variety of side effects to KVM's software model.
	 */
	vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));

	vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
	vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));

	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));

	nested_ept_uninit_mmu_context(vcpu);
	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	/*
	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
	 * from vmcs01 (if necessary). The PDPTRs are not loaded on
	 * VMFail, like everything else we just need to ensure our
	 * software model is up-to-date.
	 */
	if (enable_ept && is_pae_paging(vcpu))
		ept_save_pdptrs(vcpu);

	kvm_mmu_reset_context(vcpu);

	/*
	 * This nasty bit of open coding is a compromise between blindly
	 * loading L1's MSRs using the exit load lists (incorrect emulation
	 * of VMFail), leaving the nested VM's MSRs in the software model
	 * (incorrect behavior) and snapshotting the modified MSRs (too
	 * expensive since the lists are unbound by hardware).  For each
	 * MSR that was (prematurely) loaded from the nested VMEntry load
	 * list, reload it from the exit load list if it exists and differs
	 * from the guest value.  The intent is to stuff host state as
	 * silently as possible, not to fully process the exit load list.
	 */
	for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
		gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
		if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
			pr_debug_ratelimited(
				"%s read MSR index failed (%u, 0x%08llx)\n",
				__func__, i, gpa);
			goto vmabort;
		}

		for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
			gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
			if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
				pr_debug_ratelimited(
					"%s read MSR failed (%u, 0x%08llx)\n",
					__func__, j, gpa);
				goto vmabort;
			}
			if (h.index != g.index)
				continue;
			/* Matching entry with an identical value, nothing to do. */
			if (h.value == g.value)
				break;

			if (nested_vmx_load_msr_check(vcpu, &h)) {
				pr_debug_ratelimited(
					"%s check failed (%u, 0x%x, 0x%x)\n",
					__func__, j, h.index, h.reserved);
				goto vmabort;
			}

			if (kvm_emulate_msr_write(vcpu, h.index, h.value)) {
				pr_debug_ratelimited(
					"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
					__func__, j, h.index, h.value);
				goto vmabort;
			}
		}
	}

	return;

vmabort:
	nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}

/*
 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
 * and modify vmcs12 to make it see what it would expect to see there if
 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
 */
void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
			 u32 exit_intr_info, unsigned long exit_qualification,
			 u32 exit_insn_len)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/* Pending MTF traps are discarded on VM-Exit.
 */
	vmx->nested.mtf_pending = false;

	/* trying to cancel vmlaunch/vmresume is a bug */
	WARN_ON_ONCE(vmx->nested.nested_run_pending);

#ifdef CONFIG_KVM_HYPERV
	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
		/*
		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
		 * Enlightened VMCS after migration and we still need to
		 * do that when something is forcing L2->L1 exit prior to
		 * the first L2 run.
		 */
		(void)nested_get_evmcs_page(vcpu);
	}
#endif

	/* Service pending TLB flush requests for L2 before switching to L1. */
	kvm_service_local_tlb_flush_requests(vcpu);

	/*
	 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
	 * now and the new vmentry.  Ensure that the VMCS02 PDPTR fields are
	 * up-to-date before switching to L1.
	 */
	if (enable_ept && is_pae_paging(vcpu))
		vmx_ept_load_pdptrs(vcpu);

	leave_guest_mode(vcpu);

	if (nested_cpu_has_preemption_timer(vmcs12))
		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);

	/* Restore L1's TSC offset and scaling ratio if L2 overrode them. */
	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
		vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
		if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
			vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
	}

	if (likely(!vmx->fail)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);

		/* vm_exit_reason == -1 means "no architectural VM-Exit". */
		if (vm_exit_reason != -1)
			prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
				       exit_intr_info, exit_qualification,
				       exit_insn_len);

		/*
		 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
		 * also be used to capture vmcs12 cache as part of
		 * capturing nVMX state for snapshot (migration).
		 *
		 * Otherwise, this flush will dirty guest memory at a
		 * point it is already assumed by user-space to be
		 * immutable.
		 */
		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
	} else {
		/*
		 * The only expected VM-instruction error is "VM entry with
		 * invalid control field(s)."  Anything else indicates a
		 * problem with L0.
		 */
		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);

		/* VM-Fail at VM-Entry means KVM missed a consistency check. */
		WARN_ON_ONCE(warn_on_missed_cc);
	}

	/*
	 * Drop events/exceptions that were queued for re-injection to L2
	 * (picked up via vmx_complete_interrupts()), as well as exceptions
	 * that were pending for L2.  Note, this must NOT be hoisted above
	 * prepare_vmcs12(), events/exceptions queued for re-injection need to
	 * be captured in vmcs12 (see vmcs12_save_pending_event()).
	 */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	kvm_nested_vmexit_handle_ibrs(vcpu);

	/*
	 * Update any VMCS fields that might have changed while vmcs02 was the
	 * active VMCS.  The tracking is per-vCPU, not per-VMCS.
	 */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_put_vmcs12_pages(vcpu);

	if ((vm_exit_reason != -1) &&
	    (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
		vmx->nested.need_vmcs12_to_shadow_sync = true;

	/* in case we halted in L2 */
	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);

	if (likely(!vmx->fail)) {
		if (vm_exit_reason != -1)
			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
						       vmcs12->exit_qualification,
						       vmcs12->idt_vectoring_info_field,
						       vmcs12->vm_exit_intr_info,
						       vmcs12->vm_exit_intr_error_code,
						       KVM_ISA_VMX);

		load_vmcs12_host_state(vcpu, vmcs12);

		/*
		 * Process events if an injectable IRQ or NMI is pending, even
		 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
		 * If an event became pending while L2 was active, KVM needs to
		 * either inject the event or request an IRQ/NMI window.  SMIs
		 * don't need to be processed as SMM is mutually exclusive with
		 * non-root mode.  INIT/SIPI don't need to be checked as INIT
		 * is blocked post-VMXON, and SIPIs are ignored.
		 */
		if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
			kvm_make_request(KVM_REQ_EVENT, vcpu);
		return;
	}

	/*
	 * After an early L2 VM-entry failure, we're now back
	 * in L1 which thinks it just finished a VMLAUNCH or
	 * VMRESUME instruction, so we need to set the failure
	 * flag and the VM-instruction error field of the VMCS
	 * accordingly, and skip the emulated instruction.
	 */
	(void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	/*
	 * Restore L1's host state to KVM's software model.  We're here
	 * because a consistency check was caught by hardware, which
	 * means some amount of guest state has been propagated to KVM's
	 * model and needs to be unwound to the host's state.
	 */
	nested_vmx_restore_host_state(vcpu);

	vmx->fail = 0;
}

/*
 * Forward a pending triple fault (shutdown) from L2 to L1 as a
 * TRIPLE_FAULT VM-Exit; the triple-fault request is consumed here.
 */
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
{
	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
}

/*
 * Decode the memory-address operand of a vmx instruction, as recorded on an
 * exit caused by such an instruction (run by a guest hypervisor).
 * On success, returns 0. When the operand is invalid, returns 1 and throws
 * #UD, #GP, or #SS.
 */
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
			u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
{
	gva_t off;
	bool exn;
	struct kvm_segment s;

	/*
	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
	 * Execution", on an exit, vmx_instruction_info holds most of the
	 * addressing components of the operand. Only the displacement part
	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
	 * For how an actual address is calculated from all these components,
	 * refer to Vol. 1, "Operand Addressing".
	 */
	int scaling = vmx_instruction_info & 3;
	int addr_size = (vmx_instruction_info >> 7) & 7;
	bool is_reg = vmx_instruction_info & (1u << 10);
	int seg_reg = (vmx_instruction_info >> 15) & 7;
	int index_reg = (vmx_instruction_info >> 18) & 0xf;
	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
	int base_reg = (vmx_instruction_info >> 23) & 0xf;
	bool base_is_valid = !(vmx_instruction_info & (1u << 27));

	/* A register operand is invalid for a memory-operand decode. */
	if (is_reg) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/* Addr = segment_base + offset */
	/* offset = base + [index * scale] + displacement */
	off = exit_qualification; /* holds the displacement */
	if (addr_size == 1)
		off = (gva_t)sign_extend64(off, 31);
	else if (addr_size == 0)
		off = (gva_t)sign_extend64(off, 15);
	if (base_is_valid)
		off += kvm_register_read(vcpu, base_reg);
	if (index_is_valid)
		off += kvm_register_read(vcpu, index_reg) << scaling;
	vmx_get_segment(vcpu, &s, seg_reg);

	/*
	 * The effective address, i.e. @off, of a memory operand is truncated
	 * based on the address size of the instruction.  Note that this is
	 * the *effective address*, i.e. the address prior to accounting for
	 * the segment's base.
	 */
	if (addr_size == 1) /* 32 bit */
		off &= 0xffffffff;
	else if (addr_size == 0) /* 16 bit */
		off &= 0xffff;

	/* Checks for #GP/#SS exceptions. */
	exn = false;
	if (is_long_mode(vcpu)) {
		/*
		 * The virtual/linear address is never truncated in 64-bit
		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
		 * address when using FS/GS with a non-zero base.
		 */
		if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
			*ret = s.base + off;
		else
			*ret = off;

		*ret = vmx_get_untagged_addr(vcpu, *ret, 0);
		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
		 * non-canonical form.  This is the only check on the memory
		 * destination for long mode!
		 */
		exn = is_noncanonical_address(*ret, vcpu, 0);
	} else {
		/*
		 * When not in long mode, the virtual/linear address is
		 * unconditionally truncated to 32 bits regardless of the
		 * address size.
		 */
		*ret = (s.base + off) & 0xffffffff;

		/* Protected mode: apply checks for segment validity in the
		 * following order:
		 *  - segment type check (#GP(0) may be thrown)
		 *  - usability check (#GP(0)/#SS(0))
		 *  - limit check (#GP(0)/#SS(0))
		 */
		if (wr)
			/* #GP(0) if the destination operand is located in a
			 * read-only data segment or any code segment.
			 */
			exn = ((s.type & 0xa) == 0 || (s.type & 8));
		else
			/* #GP(0) if the source operand is located in an
			 * execute-only code segment
			 */
			exn = ((s.type & 0xa) == 8);
		if (exn) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
			return 1;
		}
		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
		 */
		exn = (s.unusable != 0);

		/*
		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
		 * outside the segment limit.  All CPUs that support VMX ignore
		 * limit checks for flat segments, i.e. segments with base==0,
		 * limit==0xffffffff and of type expand-up data or code.
		 */
		if (!(s.base == 0 && s.limit == 0xffffffff &&
		      ((s.type & 8) || !(s.type & 4))))
			exn = exn || ((u64)off + len - 1 > s.limit);
	}
	if (exn) {
		/* SS-relative accesses raise #SS(0); all others #GP(0). */
		kvm_queue_exception_e(vcpu,
				      seg_reg == VCPU_SREG_SS ?
						SS_VECTOR : GP_VECTOR,
				      0);
		return 1;
	}

	return 0;
}

/*
 * Fetch the guest-memory operand of a VMX instruction (the "VMCS pointer"
 * operand of VMXON/VMPTRLD/VMCLEAR/...).  Returns 0 on success.  On failure,
 * returns -EINVAL and stores in *ret the value the exit handler should
 * return; an exception may also have been queued for the guest.
 */
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
				int *ret)
{
	gva_t gva;
	struct x86_exception e;
	int r;

	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
				vmcs_read32(VMX_INSTRUCTION_INFO), false,
				sizeof(*vmpointer), &gva)) {
		*ret = 1;
		return -EINVAL;
	}

	r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
	if (r != X86EMUL_CONTINUE) {
		*ret = kvm_handle_memory_failure(vcpu, r, &e);
		return -EINVAL;
	}

	return 0;
}

/*
 * Allocate a shadow VMCS and associate it with the currently loaded
 * VMCS, unless such a shadow VMCS already exists. The newly allocated
 * VMCS is also VMCLEARed, so that it is ready for use.
 */
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;

	/*
	 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
	 * when L1 executes VMXOFF or the vCPU is forced out of nested
	 * operation.  VMXON faults if the CPU is already post-VMXON, so it
	 * should be impossible to already have an allocated shadow VMCS.  KVM
	 * doesn't support virtualization of VMCS shadowing, so vmcs01 should
	 * always be the loaded VMCS.
	 */
	if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
		return loaded_vmcs->shadow_vmcs;

	loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
	if (loaded_vmcs->shadow_vmcs)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	/* NULL on allocation failure; the caller must check the result. */
	return loaded_vmcs->shadow_vmcs;
}

/*
 * Allocate and initialize the state needed to emulate VMX operation for L1:
 * vmcs02, the vmcs12 caches, an optional shadow VMCS, the preemption timer
 * and a VPID for L2.  Returns 0 on success, -ENOMEM on allocation failure.
 */
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
	if (r < 0)
		goto out_vmcs02;

	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_vmcs12)
		goto out_cached_vmcs12;

	vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_shadow_vmcs12)
		goto out_cached_shadow_vmcs12;

	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
		goto out_shadow_vmcs;

	hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
		      HRTIMER_MODE_ABS_PINNED);

	vmx->nested.vpid02 = allocate_vpid();

	vmx->nested.vmcs02_initialized = false;
	vmx->nested.vmxon = true;

	if (vmx_pt_mode_is_host_guest()) {
		vmx->pt_desc.guest.ctl = 0;
		pt_update_intercept_for_msr(vcpu);
	}

	return 0;

	/* Unwind in reverse order of allocation. */
out_shadow_vmcs:
	kfree(vmx->nested.cached_shadow_vmcs12);

out_cached_shadow_vmcs12:
	kfree(vmx->nested.cached_vmcs12);

out_cached_vmcs12:
	free_loaded_vmcs(&vmx->nested.vmcs02);

out_vmcs02:
	return -ENOMEM;
}

/* Emulate the VMXON instruction. */
static int handle_vmxon(struct kvm_vcpu *vcpu)
{
	int ret;
	gpa_t vmptr;
	uint32_t revision;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
		| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;

	/*
	 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter
	 * the guest and so cannot rely on hardware to perform the check,
	 * which has higher priority than VM-Exit (see Intel SDM's pseudocode
	 * for VMXON).
	 *
	 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
	 * and !COMPATIBILITY modes.  For an unrestricted guest, KVM doesn't
	 * force any of the relevant guest state.  For a restricted guest, KVM
	 * does force CR0.PE=1, but only to also force VM86 in order to emulate
	 * Real Mode, and so there's no need to check CR0.PE manually.
	 */
	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/*
	 * The CPL is checked for "not in VMX operation" and for "in VMX root",
	 * and has higher priority than the VM-Fail due to being post-VMXON,
	 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0.  In VMX non-root,
	 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
	 * from L2 to L1, i.e. there's no need to check for the vCPU being in
	 * VMX non-root.
	 *
	 * Forwarding the VM-Exit unconditionally, i.e. without performing the
	 * #UD checks (see above), is functionally ok because KVM doesn't allow
	 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's
	 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
	 * missed by hardware due to shadowing CR0 and/or CR4.
	 */
	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (vmx->nested.vmxon)
		return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);

	/*
	 * Invalid CR0/CR4 generates #GP.  These checks are performed if and
	 * only if the vCPU isn't already in VMX operation, i.e. effectively
	 * have lower priority than the VM-Fail above.
	 */
	if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
	    !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	/* IA32_FEATURE_CONTROL must be locked with VMX enabled outside SMX. */
	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
			!= VMXON_NEEDED_FEATURES) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
		return ret;

	/*
	 * SDM 3: 24.11.5
	 * The first 4 bytes of VMXON region contain the supported
	 * VMCS revision identifier
	 *
	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
	 * which replaces physical address width with 32
	 */
	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_failInvalid(vcpu);

	if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
	    revision != VMCS12_REVISION)
		return nested_vmx_failInvalid(vcpu);

	vmx->nested.vmxon_ptr = vmptr;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	return nested_vmx_succeed(vcpu);
}

/*
 * Drop the vCPU's reference to the current vmcs12: sync any deferred/shadowed
 * state back into the cache, flush the cache to guest memory, and invalidate
 * the current-VMCS pointer.
 */
static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->nested.current_vmptr == INVALID_GPA)
		return;

	copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));

	if (enable_shadow_vmcs) {
		/* copy to memory all shadowed fields in case
		   they were modified */
		copy_shadow_to_vmcs12(vmx);
		vmx_disable_shadow_vmcs(vmx);
	}
	vmx->nested.posted_intr_nv = -1;

	/* Flush VMCS12 to guest memory */
5553 kvm_vcpu_write_guest_page(vcpu, 5554 vmx->nested.current_vmptr >> PAGE_SHIFT, 5555 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5556 5557 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5558 5559 vmx->nested.current_vmptr = INVALID_GPA; 5560 } 5561 5562 /* Emulate the VMXOFF instruction */ 5563 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5564 { 5565 if (!nested_vmx_check_permission(vcpu)) 5566 return 1; 5567 5568 free_nested(vcpu); 5569 5570 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5571 kvm_make_request(KVM_REQ_EVENT, vcpu); 5572 5573 return nested_vmx_succeed(vcpu); 5574 } 5575 5576 /* Emulate the VMCLEAR instruction */ 5577 static int handle_vmclear(struct kvm_vcpu *vcpu) 5578 { 5579 struct vcpu_vmx *vmx = to_vmx(vcpu); 5580 u32 zero = 0; 5581 gpa_t vmptr; 5582 int r; 5583 5584 if (!nested_vmx_check_permission(vcpu)) 5585 return 1; 5586 5587 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5588 return r; 5589 5590 if (!page_address_valid(vcpu, vmptr)) 5591 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5592 5593 if (vmptr == vmx->nested.vmxon_ptr) 5594 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5595 5596 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5597 if (vmptr == vmx->nested.current_vmptr) 5598 nested_release_vmcs12(vcpu); 5599 5600 /* 5601 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5602 * for VMCLEAR includes a "ensure that data for VMCS referenced 5603 * by the operand is in memory" clause that guards writes to 5604 * memory, i.e. doing nothing for I/O is architecturally valid. 5605 * 5606 * FIXME: Suppress failures if and only if no memslot is found, 5607 * i.e. exit to userspace if __copy_to_user() fails. 
		 */
		(void)kvm_vcpu_write_guest(vcpu,
					   vmptr + offsetof(struct vmcs12,
							    launch_state),
					   &zero, sizeof(zero));
	}

	return nested_vmx_succeed(vcpu);
}

/* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
	return nested_vmx_run(vcpu, true);
}

/* Emulate the VMRESUME instruction */
static int handle_vmresume(struct kvm_vcpu *vcpu)
{

	return nested_vmx_run(vcpu, false);
}

/* Emulate the VMREAD instruction */
static int handle_vmread(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
						    : get_vmcs12(vcpu);
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct x86_exception e;
	unsigned long field;
	u64 value;
	gva_t gva = 0;
	short offset;
	int len, r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* Decode instruction info and find the field to read */
	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));

	if (!nested_vmx_is_evmptr12_valid(vmx)) {
		/*
		 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
		 * any VMREAD sets the ALU flags for VMfailInvalid.
		 */
		if (vmx->nested.current_vmptr == INVALID_GPA ||
		    (is_guest_mode(vcpu) &&
		     get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
			return nested_vmx_failInvalid(vcpu);

		offset = get_vmcs12_field_offset(field);
		if (offset < 0)
			return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);

		if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
			copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

		/* Read the field, zero-extended to a u64 value */
		value = vmcs12_read_any(vmcs12, field, offset);
	} else {
		/*
		 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
		 * enlightened VMCS is active VMREAD/VMWRITE instructions are
		 * unsupported. Unfortunately, certain versions of Windows 11
		 * don't comply with this requirement which is not enforced in
		 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
		 * workaround, as misbehaving guests will panic on VM-Fail.
		 * Note, enlightened VMCS is incompatible with shadow VMCS so
		 * all VMREADs from L2 should go to L1.
		 */
		if (WARN_ON_ONCE(is_guest_mode(vcpu)))
			return nested_vmx_failInvalid(vcpu);

		offset = evmcs_field_offset(field, NULL);
		if (offset < 0)
			return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);

		/* Read the field, zero-extended to a u64 value */
		value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
	}

	/*
	 * Now copy part of this value to register or memory, as requested.
	 * Note that the number of bits actually copied is 32 or 64 depending
	 * on the guest's mode (32 or 64 bit), not on the given field's length.
	 */
	if (instr_info & BIT(10)) {
		kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
	} else {
		len = is_64_bit_mode(vcpu) ? 8 : 4;
		if (get_vmx_mem_address(vcpu, exit_qualification,
					instr_info, true, len, &gva))
			return 1;
		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
		r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
		if (r != X86EMUL_CONTINUE)
			return kvm_handle_memory_failure(vcpu, r, &e);
	}

	return nested_vmx_succeed(vcpu);
}

/* True if @field is a shadowed read/write VMCS field (per vmcs_shadow_fields.h). */
static bool is_shadow_field_rw(unsigned long field)
{
	switch (field) {
#define SHADOW_FIELD_RW(x, y) case x:
#include "vmcs_shadow_fields.h"
		return true;
	default:
		break;
	}
	return false;
}

/* True if @field is a shadowed read-only VMCS field (per vmcs_shadow_fields.h). */
static bool is_shadow_field_ro(unsigned long field)
{
	switch (field) {
#define SHADOW_FIELD_RO(x, y) case x:
#include "vmcs_shadow_fields.h"
		return true;
	default:
		break;
	}
	return false;
}

/* Emulate the VMWRITE instruction */
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
						    : get_vmcs12(vcpu);
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct x86_exception e;
	unsigned long field;
	short offset;
	gva_t gva;
	int len, r;

	/*
	 * The value to write might be 32 or 64 bits, depending on L1's long
	 * mode, and eventually we need to write that into a field of several
	 * possible lengths. The code below first zero-extends the value to 64
	 * bit (value), and then copies only the appropriate number of
	 * bits into the vmcs12 field.
	 */
	u64 value = 0;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/*
	 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
	 * any VMWRITE sets the ALU flags for VMfailInvalid.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA ||
	    (is_guest_mode(vcpu) &&
	     get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	if (instr_info & BIT(10))
		value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
	else {
		len = is_64_bit_mode(vcpu) ? 8 : 4;
		if (get_vmx_mem_address(vcpu, exit_qualification,
					instr_info, false, len, &gva))
			return 1;
		r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
		if (r != X86EMUL_CONTINUE)
			return kvm_handle_memory_failure(vcpu, r, &e);
	}

	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));

	offset = get_vmcs12_field_offset(field);
	if (offset < 0)
		return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);

	/*
	 * If the vCPU supports "VMWRITE to any supported field in the
	 * VMCS," then the "read-only" fields are actually read/write.
	 */
	if (vmcs_field_readonly(field) &&
	    !nested_cpu_has_vmwrite_any_field(vcpu))
		return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);

	/*
	 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
	 * vmcs12, else we may crush a field or consume a stale value.
	 */
	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
		copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	/*
	 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
	 * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
	 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
	 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
	 * from L1 will return a different value than VMREAD from L2 (L1 sees
	 * the stripped down value, L2 sees the full value as stored by KVM).
	 */
	if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
		value &= 0x1f0ff;

	vmcs12_write_any(vmcs12, field, offset, value);

	/*
	 * Do not track vmcs12 dirty-state if in guest-mode as we actually
	 * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
	 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
	 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
	 */
	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
		/*
		 * L1 can read these fields without exiting, ensure the
		 * shadow VMCS is up-to-date.
		 */
		if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
			preempt_disable();
			vmcs_load(vmx->vmcs01.shadow_vmcs);

			__vmcs_writel(field, value);

			vmcs_clear(vmx->vmcs01.shadow_vmcs);
			vmcs_load(vmx->loaded_vmcs->vmcs);
			preempt_enable();
		}
		vmx->nested.dirty_vmcs12 = true;
	}

	return nested_vmx_succeed(vcpu);
}

/*
 * Make @vmptr the current vmcs12 and flag all derived state that must be
 * refreshed (shadow VMCS sync, dirty vmcs12 fields, MSR bitmap).
 */
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{
	vmx->nested.current_vmptr = vmptr;
	if (enable_shadow_vmcs) {
		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
		vmcs_write64(VMCS_LINK_POINTER,
			     __pa(vmx->vmcs01.shadow_vmcs));
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}
	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
}

/* Emulate the VMPTRLD instruction */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t vmptr;
	int r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
		return r;

	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);

	if (vmptr == vmx->nested.vmxon_ptr)
		return nested_vmx_fail(vcpu,
				       VMXERR_VMPTRLD_VMXON_POINTER);

	/* Forbid normal VMPTRLD if Enlightened version was used */
	if (nested_vmx_is_evmptr12_valid(vmx))
		return 1;

	if (vmx->nested.current_vmptr != vmptr) {
		struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
		struct vmcs_hdr hdr;

		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
			/*
			 * Reads from an unbacked page return all 1s,
			 * which means that the 32 bits located at the
			 * given physical address won't match the required
			 * VMCS12_REVISION identifier.
			 */
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
						 offsetof(struct vmcs12, hdr),
						 sizeof(hdr))) {
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		if (hdr.revision_id != VMCS12_REVISION ||
		    (hdr.shadow_vmcs &&
		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		nested_release_vmcs12(vcpu);

		/*
		 * Load VMCS12 from guest memory since it is not already
		 * cached.
		 */
		if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
					  VMCS12_SIZE)) {
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		set_current_vmptr(vmx, vmptr);
	}

	return nested_vmx_succeed(vcpu);
}

/* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
	struct x86_exception e;
	gva_t gva;
	int r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu))))
		return 1;

	if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
				true, sizeof(gpa_t), &gva))
		return 1;
	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
	r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
					sizeof(gpa_t), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	return nested_vmx_succeed(vcpu);
}

/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info, types;
	unsigned long type, roots_to_free;
	struct kvm_mmu *mmu;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 eptp, gpa;
	} operand;
	int i, r, gpr_index;

	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_EPT) ||
	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
	type =
	       kvm_register_read(vcpu, gpr_index);

	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	/*
	 * Nested EPT roots are always held through guest_mmu,
	 * not root_mmu.
	 */
	mmu = &vcpu->arch.guest_mmu;

	switch (type) {
	case VMX_EPT_EXTENT_CONTEXT:
		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

		roots_to_free = 0;
		if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
					    operand.eptp))
			roots_to_free |= KVM_MMU_ROOT_CURRENT;

		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
			if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
						    mmu->prev_roots[i].pgd,
						    operand.eptp))
				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
		}
		break;
	case VMX_EPT_EXTENT_GLOBAL:
		roots_to_free = KVM_MMU_ROOTS_ALL;
		break;
	default:
		BUG();
		break;
	}

	if (roots_to_free)
		kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);

	return nested_vmx_succeed(vcpu);
}

/* Emulate the INVVPID instruction */
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info;
	unsigned long type, types;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 vpid;
		u64 gla;
	} operand;
	u16 vpid02;
	int r, gpr_index;

	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_VPID) ||
	    !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
	type = kvm_register_read(vcpu, gpr_index);

	types = (vmx->nested.msrs.vpid_caps &
			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* according to the intel vmx instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	if (operand.vpid >> 16)
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/*
	 * Always flush the effective vpid02, i.e. never flush the current VPID
	 * and never explicitly flush vpid01.  INVVPID targets a VPID, not a
	 * VMCS, and so whether or not the current vmcs12 has VPID enabled is
	 * irrelevant (and there may not be a loaded vmcs12).
	 */
	vpid02 = nested_get_vpid02(vcpu);
	switch (type) {
	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
		/*
		 * LAM doesn't apply to addresses that are inputs to TLB
		 * invalidation.
		 */
		if (!operand.vpid ||
		    is_noncanonical_invlpg_address(operand.gla, vcpu))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_vcpu_addr(vpid02, operand.gla);
		break;
	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
		if (!operand.vpid)
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_context(vpid02);
		break;
	case VMX_VPID_EXTENT_ALL_CONTEXT:
		vpid_sync_context(vpid02);
		break;
	default:
		WARN_ON_ONCE(1);
		return kvm_skip_emulated_instruction(vcpu);
	}

	/*
	 * Sync the shadow page tables if EPT is disabled, L1 is invalidating
	 * linear mappings for L2 (tagged with L2's VPID).  Free all guest
	 * roots as VPIDs are not tracked in the MMU role.
	 *
	 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
	 * an MMU when EPT is disabled.
	 *
	 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
	 */
	if (!enable_ept)
		kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);

	return nested_vmx_succeed(vcpu);
}

/*
 * Handle VMFUNC leaf 0 (EPTP switching): read the new EPTP from the vmcs12
 * EPTP-list entry indexed by ECX, validate it, and make it active.  Returns
 * 0 on success, 1 on failure (the caller then reflects a VM-Exit to L1).
 */
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	u32 index = kvm_rcx_read(vcpu);
	u64 new_eptp;

	if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
		return 1;
	if (index >= VMFUNC_EPTP_ENTRIES)
		return 1;

	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
				     &new_eptp, index * 8, 8))
		return 1;

	/*
	 * If the (L2) guest does a vmfunc to the currently
	 * active ept pointer, we don't have to do anything else
	 */
	if (vmcs12->ept_pointer != new_eptp) {
		if (!nested_vmx_check_eptp(vcpu, new_eptp))
			return 1;

		vmcs12->ept_pointer = new_eptp;
		nested_ept_new_eptp(vcpu);

		if (!nested_cpu_has_vpid(vmcs12))
			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
	}

	return 0;
}

/* Emulate the VMFUNC instruction (executed by L2; only leaf 0 is supported). */
static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 function = kvm_rax_read(vcpu);

	/*
	 * VMFUNC should never execute cleanly while L1 is active; KVM supports
	 * VMFUNC for nested VMs, but not for L1.
	 */
	if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
	 * is enabled in vmcs02 if and only if it's enabled in vmcs12.
	 */
	if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!(vmcs12->vm_function_control & BIT_ULL(function)))
		goto fail;

	switch (function) {
	case 0:
		if (nested_vmx_eptp_switching(vcpu, vmcs12))
			goto fail;
		break;
	default:
		goto fail;
	}
	return kvm_skip_emulated_instruction(vcpu);

fail:
	/*
	 * This is effectively a reflected VM-Exit, as opposed to a synthesized
	 * nested VM-Exit.  Pass the original exit reason, i.e. don't hardcode
	 * EXIT_REASON_VMFUNC as the exit reason.
	 */
	nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
			  vmx_get_intr_info(vcpu),
			  vmx_get_exit_qual(vcpu));
	return 1;
}

/*
 * Return true if an IO instruction with the specified port and size should cause
 * a VM-exit into L1.
 */
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
				 int size)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	gpa_t bitmap, last_bitmap;
	u8 b;

	last_bitmap = INVALID_GPA;
	b = -1;

	while (size > 0) {
		if (port < 0x8000)
			bitmap = vmcs12->io_bitmap_a;
		else if (port < 0x10000)
			bitmap = vmcs12->io_bitmap_b;
		else
			return true;
		bitmap += (port & 0x7fff) / 8;

		if (last_bitmap != bitmap)
			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
				return true;
		if (b & (1 << (port & 7)))
			return true;

		port++;
		size--;
		last_bitmap = bitmap;
	}

	return false;
}

/*
 * True if L1 intercepts the I/O access that caused this exit, either
 * unconditionally or via its I/O bitmaps (port/size from exit qualification).
 */
static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification;
	unsigned short port;
	int size;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

	exit_qualification = vmx_get_exit_qual(vcpu);

	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;

	return nested_vmx_check_io_bitmaps(vcpu, port, size);
}

/*
 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 * disinterest in the current event (read or write a specific MSR) by using an
 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 */
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					union vmx_exit_reason exit_reason)
{
	u32 msr_index;
	gpa_t bitmap;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return true;

	if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM ||
	    exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
		msr_index = vmx_get_exit_qual(vcpu);
	else
		msr_index = kvm_rcx_read(vcpu);

	/*
	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
	 * for the four combinations of read/write and low/high MSR numbers.
	 * First we need to figure out which of the four to use:
	 */
	bitmap = vmcs12->msr_bitmap;
	if (exit_reason.basic == EXIT_REASON_MSR_WRITE ||
	    exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
		bitmap += 2048;
	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;
		bitmap += 1024;
	}

	/* Then read the msr_index'th bit from this bitmap: */
	if (msr_index < 1024*8) {
		unsigned char b;
		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
			return true;
		return 1 & (b >> (msr_index & 7));
	} else
		return true; /* let L1 handle the wrong parameter */
}

/*
 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.)
 * the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	int cr = exit_qualification & 15;
	int reg;
	unsigned long val;

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		reg = (exit_qualification >> 8) & 15;
		val = kvm_register_read(vcpu, reg);
		switch (cr) {
		case 0:
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
				return true;
			break;
		case 3:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
				return true;
			break;
		case 4:
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
				return true;
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
				return true;
			break;
		}
		break;
	case 2: /* clts */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
			return true;
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
				return true;
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
				return true;
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
			return true;
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
			return true;
		break;
	}
	return false;
}

/* True if the ENCLS leaf in RAX is intercepted by L1's ENCLS-exiting bitmap. */
static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	u32 encls_leaf;

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
		return false;

	encls_leaf = kvm_rax_read(vcpu);
	if (encls_leaf > 62)
		encls_leaf = 63;
	return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
}

/*
 * Consult L1's VMREAD/VMWRITE bitmap (@bitmap) for the accessed field;
 * true if the VMREAD/VMWRITE from L2 should exit to L1.
 */
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12, gpa_t bitmap)
{
	u32 vmx_instruction_info;
	unsigned long field;
	u8 b;

	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return true;

	/* Decode instruction info and find the field to access */
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));

	/* Out-of-range fields always cause a VM exit from L2 to L1 */
	if (field >> 15)
		return true;

	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
		return true;

	return 1 & (b >> (field & 7));
}

/* True if a Monitor Trap Flag VM-exit should be forwarded to L1. */
static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
{
	u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;

	if (nested_cpu_has_mtf(vmcs12))
		return true;

	/*
	 * An MTF VM-exit may be injected into the guest by setting the
	 * interruption-type to 7 (other event) and the vector field to 0. Such
	 * is the case regardless of the 'monitor trap flag' VM-execution
	 * control.
	 */
	return entry_intr_info == (INTR_INFO_VALID_MASK
				   | INTR_TYPE_OTHER_EVENT);
}

/*
 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
 * L1 wants the exit.  Only call this when in is_guest_mode (L2).
 */
static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return vcpu->arch.apf.host_apf_flags ||
			       vmx_need_pf_intercept(vcpu);
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return true;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return true;
		else if (is_alignment_check(intr_info) &&
			 !vmx_guest_inject_ac(vcpu))
			return true;
		else if (is_ve_fault(intr_info))
			return true;
		return false;
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return true;
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault()
		 */
		return true;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses directly L1's EPT, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table is L0's fault.
		 */
		return true;
	case EXIT_REASON_PREEMPTION_TIMER:
		return true;
	case EXIT_REASON_PML_FULL:
		/*
		 * PML is emulated for an L1 VMM and should never be enabled in
		 * vmcs02, always "handle" PML_FULL by exiting to userspace.
		 */
		return true;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return true;
	case EXIT_REASON_BUS_LOCK:
		/*
		 * At present, bus lock VM exit is never exposed to L1.
		 * Handle L2's bus locks in L0 directly.
		 */
		return true;
#ifdef CONFIG_KVM_HYPERV
	case EXIT_REASON_VMCALL:
		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
		return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
			nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
			kvm_hv_is_tlb_flush_hcall(vcpu);
#endif
	default:
		break;
	}
	return false;
}

/*
 * Return 1 if L1 wants to intercept an exit from L2.  Only call this when in
 * is_guest_mode (L2).
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return true;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return nested_exit_on_intr(vcpu);
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_MSR_READ_IMM:
	case EXIT_REASON_MSR_WRITE_IMM:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES:
	case EXIT_REASON_XRSTORS:
		/*
		 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize
		 * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap
		 * verbatim, i.e. any exit is due to L1's bitmap.  WARN if
		 * XSAVES isn't enabled, as the CPU is supposed to inject #UD
		 * in that case, before consulting the XSS-bitmap.
		 */
		WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES));
		return true;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	case EXIT_REASON_NOTIFY:
		/* Notify VM exit is not exposed to L1 */
		return false;
	case EXIT_REASON_SEAMCALL:
	case EXIT_REASON_TDCALL:
		/*
		 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
		 * virtualized by KVM for L1 hypervisors, i.e. L1 should
		 * never want or expect such an exit.
		 */
		return false;
	default:
		return true;
	}
}

/*
 * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
6687 */ 6688 if (unlikely(vmx->fail)) { 6689 trace_kvm_nested_vmenter_failed( 6690 "hardware VM-instruction error: ", 6691 vmcs_read32(VM_INSTRUCTION_ERROR)); 6692 exit_intr_info = 0; 6693 exit_qual = 0; 6694 goto reflect_vmexit; 6695 } 6696 6697 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6698 6699 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6700 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6701 return false; 6702 6703 /* If L1 doesn't want the exit, handle it in L0. */ 6704 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6705 return false; 6706 6707 /* 6708 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6709 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6710 * need to be synthesized by querying the in-kernel LAPIC, but external 6711 * interrupts are never reflected to L1 so it's a non-issue. 6712 */ 6713 exit_intr_info = vmx_get_intr_info(vcpu); 6714 if (is_exception_with_error_code(exit_intr_info)) { 6715 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6716 6717 vmcs12->vm_exit_intr_error_code = 6718 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6719 } 6720 exit_qual = vmx_get_exit_qual(vcpu); 6721 6722 reflect_vmexit: 6723 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6724 return true; 6725 } 6726 6727 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6728 struct kvm_nested_state __user *user_kvm_nested_state, 6729 u32 user_data_size) 6730 { 6731 struct vcpu_vmx *vmx; 6732 struct vmcs12 *vmcs12; 6733 struct kvm_nested_state kvm_state = { 6734 .flags = 0, 6735 .format = KVM_STATE_NESTED_FORMAT_VMX, 6736 .size = sizeof(kvm_state), 6737 .hdr.vmx.flags = 0, 6738 .hdr.vmx.vmxon_pa = INVALID_GPA, 6739 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6740 .hdr.vmx.preemption_timer_deadline = 0, 6741 }; 6742 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6743 &user_kvm_nested_state->data.vmx[0]; 6744 6745 if (!vcpu) 6746 return kvm_state.size + sizeof(*user_vmx_nested_state); 6747 6748 vmx 
= to_vmx(vcpu); 6749 vmcs12 = get_vmcs12(vcpu); 6750 6751 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && 6752 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6753 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6754 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6755 6756 if (vmx_has_valid_vmcs12(vcpu)) { 6757 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6758 6759 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6760 if (nested_vmx_is_evmptr12_set(vmx)) 6761 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6762 6763 if (is_guest_mode(vcpu) && 6764 nested_cpu_has_shadow_vmcs(vmcs12) && 6765 vmcs12->vmcs_link_pointer != INVALID_GPA) 6766 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6767 } 6768 6769 if (vmx->nested.smm.vmxon) 6770 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6771 6772 if (vmx->nested.smm.guest_mode) 6773 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6774 6775 if (is_guest_mode(vcpu)) { 6776 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6777 6778 if (vmx->nested.nested_run_pending) 6779 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6780 6781 if (vmx->nested.mtf_pending) 6782 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6783 6784 if (nested_cpu_has_preemption_timer(vmcs12) && 6785 vmx->nested.has_preemption_timer_deadline) { 6786 kvm_state.hdr.vmx.flags |= 6787 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6788 kvm_state.hdr.vmx.preemption_timer_deadline = 6789 vmx->nested.preemption_timer_deadline; 6790 } 6791 } 6792 } 6793 6794 if (user_data_size < kvm_state.size) 6795 goto out; 6796 6797 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6798 return -EFAULT; 6799 6800 if (!vmx_has_valid_vmcs12(vcpu)) 6801 goto out; 6802 6803 /* 6804 * When running L2, the authoritative vmcs12 state is in the 6805 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6806 * in the shadow or enlightened vmcs linked to vmcs01, unless 6807 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6808 * vmcs12 state is in the vmcs12 already. 6809 */ 6810 if (is_guest_mode(vcpu)) { 6811 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6812 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6813 } else { 6814 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6815 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6816 if (nested_vmx_is_evmptr12_valid(vmx)) 6817 /* 6818 * L1 hypervisor is not obliged to keep eVMCS 6819 * clean fields data always up-to-date while 6820 * not in guest mode, 'hv_clean_fields' is only 6821 * supposed to be actual upon vmentry so we need 6822 * to ignore it here and do full copy. 6823 */ 6824 copy_enlightened_to_vmcs12(vmx, 0); 6825 else if (enable_shadow_vmcs) 6826 copy_shadow_to_vmcs12(vmx); 6827 } 6828 } 6829 6830 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6831 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6832 6833 /* 6834 * Copy over the full allocated size of vmcs12 rather than just the size 6835 * of the struct. 
6836 */ 6837 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6838 return -EFAULT; 6839 6840 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6841 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6842 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6843 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6844 return -EFAULT; 6845 } 6846 out: 6847 return kvm_state.size; 6848 } 6849 6850 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6851 { 6852 if (is_guest_mode(vcpu)) { 6853 to_vmx(vcpu)->nested.nested_run_pending = 0; 6854 nested_vmx_vmexit(vcpu, -1, 0, 0); 6855 } 6856 free_nested(vcpu); 6857 } 6858 6859 int nested_vmx_check_restored_vmcs12(struct kvm_vcpu *vcpu) 6860 { 6861 enum vm_entry_failure_code ignored; 6862 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6863 6864 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6865 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6866 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6867 6868 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6869 !shadow_vmcs12->hdr.shadow_vmcs) 6870 return -EINVAL; 6871 } 6872 6873 if (nested_vmx_check_controls(vcpu, vmcs12) || 6874 nested_vmx_check_host_state(vcpu, vmcs12) || 6875 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6876 return -EINVAL; 6877 6878 return 0; 6879 } 6880 6881 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6882 struct kvm_nested_state __user *user_kvm_nested_state, 6883 struct kvm_nested_state *kvm_state) 6884 { 6885 struct vcpu_vmx *vmx = to_vmx(vcpu); 6886 struct vmcs12 *vmcs12; 6887 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6888 &user_kvm_nested_state->data.vmx[0]; 6889 int ret; 6890 6891 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6892 return -EINVAL; 6893 6894 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6895 if (kvm_state->hdr.vmx.smm.flags) 6896 return -EINVAL; 6897 6898 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6899 return -EINVAL; 6900 6901 /* 6902 * KVM_STATE_NESTED_EVMCS used to signal that KVM 
should 6903 * enable eVMCS capability on vCPU. However, since then 6904 * code was changed such that flag signals vmcs12 should 6905 * be copied into eVMCS in guest memory. 6906 * 6907 * To preserve backwards compatibility, allow user 6908 * to set this flag even when there is no VMXON region. 6909 */ 6910 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6911 return -EINVAL; 6912 } else { 6913 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 6914 return -EINVAL; 6915 6916 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6917 return -EINVAL; 6918 } 6919 6920 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6921 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6922 return -EINVAL; 6923 6924 if (kvm_state->hdr.vmx.smm.flags & 6925 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6926 return -EINVAL; 6927 6928 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6929 return -EINVAL; 6930 6931 /* 6932 * SMM temporarily disables VMX, so we cannot be in guest mode, 6933 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6934 * must be zero. 6935 */ 6936 if (is_smm(vcpu) ? 
6937 (kvm_state->flags & 6938 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6939 : kvm_state->hdr.vmx.smm.flags) 6940 return -EINVAL; 6941 6942 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6943 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6944 return -EINVAL; 6945 6946 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6947 (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || 6948 !vmx->nested.enlightened_vmcs_enabled)) 6949 return -EINVAL; 6950 6951 vmx_leave_nested(vcpu); 6952 6953 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6954 return 0; 6955 6956 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6957 ret = enter_vmx_operation(vcpu); 6958 if (ret) 6959 return ret; 6960 6961 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6962 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6963 /* See vmx_has_valid_vmcs12. */ 6964 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6965 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6966 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6967 return -EINVAL; 6968 else 6969 return 0; 6970 } 6971 6972 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6973 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6974 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6975 return -EINVAL; 6976 6977 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6978 #ifdef CONFIG_KVM_HYPERV 6979 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6980 /* 6981 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6982 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6983 * restored yet. EVMCS will be mapped from 6984 * nested_get_vmcs12_pages(). 
6985 */ 6986 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6987 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6988 #endif 6989 } else { 6990 return -EINVAL; 6991 } 6992 6993 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6994 vmx->nested.smm.vmxon = true; 6995 vmx->nested.vmxon = false; 6996 6997 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6998 vmx->nested.smm.guest_mode = true; 6999 } 7000 7001 vmcs12 = get_vmcs12(vcpu); 7002 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 7003 return -EFAULT; 7004 7005 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 7006 return -EINVAL; 7007 7008 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 7009 return 0; 7010 7011 vmx->nested.nested_run_pending = 7012 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 7013 7014 vmx->nested.mtf_pending = 7015 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 7016 7017 if (nested_cpu_has_shadow_vmcs(vmcs12) && 7018 vmcs12->vmcs_link_pointer != INVALID_GPA) { 7019 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 7020 7021 ret = -EINVAL; 7022 if (kvm_state->size < 7023 sizeof(*kvm_state) + 7024 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 7025 goto error_guest_mode; 7026 7027 ret = -EFAULT; 7028 if (copy_from_user(shadow_vmcs12, 7029 user_vmx_nested_state->shadow_vmcs12, 7030 sizeof(*shadow_vmcs12))) 7031 goto error_guest_mode; 7032 } 7033 7034 vmx->nested.has_preemption_timer_deadline = false; 7035 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 7036 vmx->nested.has_preemption_timer_deadline = true; 7037 vmx->nested.preemption_timer_deadline = 7038 kvm_state->hdr.vmx.preemption_timer_deadline; 7039 } 7040 7041 ret = nested_vmx_check_restored_vmcs12(vcpu); 7042 if (ret < 0) 7043 goto error_guest_mode; 7044 7045 vmx->nested.dirty_vmcs12 = true; 7046 vmx->nested.force_msr_bitmap_recalc = true; 7047 ret = nested_vmx_enter_non_root_mode(vcpu, false); 7048 
if (ret) 7049 goto error_guest_mode; 7050 7051 if (vmx->nested.mtf_pending) 7052 kvm_make_request(KVM_REQ_EVENT, vcpu); 7053 7054 return 0; 7055 7056 error_guest_mode: 7057 vmx->nested.nested_run_pending = 0; 7058 return ret; 7059 } 7060 7061 void nested_vmx_set_vmcs_shadowing_bitmap(void) 7062 { 7063 if (enable_shadow_vmcs) { 7064 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 7065 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 7066 } 7067 } 7068 7069 static u64 nested_vmx_calc_vmcs_enum_msr(void) 7070 { 7071 /* 7072 * Note these are the so called "index" of the VMCS field encoding, not 7073 * the index into vmcs12. 7074 */ 7075 unsigned int max_idx, idx; 7076 int i; 7077 7078 /* 7079 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 7080 * vmcs12, regardless of whether or not the associated feature is 7081 * exposed to L1. Simply find the field with the highest index. 7082 */ 7083 max_idx = 0; 7084 for (i = 0; i < nr_vmcs12_fields; i++) { 7085 /* The vmcs12 table is very, very sparsely populated. */ 7086 if (!vmcs12_field_offsets[i]) 7087 continue; 7088 7089 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 7090 if (idx > max_idx) 7091 max_idx = idx; 7092 } 7093 7094 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 7095 } 7096 7097 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, 7098 struct nested_vmx_msrs *msrs) 7099 { 7100 msrs->pinbased_ctls_low = 7101 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7102 7103 msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; 7104 msrs->pinbased_ctls_high &= 7105 PIN_BASED_EXT_INTR_MASK | 7106 PIN_BASED_NMI_EXITING | 7107 PIN_BASED_VIRTUAL_NMIS | 7108 (enable_apicv ? 
PIN_BASED_POSTED_INTR : 0); 7109 msrs->pinbased_ctls_high |= 7110 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7111 PIN_BASED_VMX_PREEMPTION_TIMER; 7112 } 7113 7114 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, 7115 struct nested_vmx_msrs *msrs) 7116 { 7117 msrs->exit_ctls_low = 7118 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 7119 7120 msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; 7121 msrs->exit_ctls_high &= 7122 #ifdef CONFIG_X86_64 7123 VM_EXIT_HOST_ADDR_SPACE_SIZE | 7124 #endif 7125 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 7126 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; 7127 msrs->exit_ctls_high |= 7128 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 7129 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 7130 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 7131 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 7132 7133 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7134 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7135 msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; 7136 7137 /* We support free control of debug control saving. */ 7138 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 7139 } 7140 7141 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, 7142 struct nested_vmx_msrs *msrs) 7143 { 7144 msrs->entry_ctls_low = 7145 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 7146 7147 msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; 7148 msrs->entry_ctls_high &= 7149 #ifdef CONFIG_X86_64 7150 VM_ENTRY_IA32E_MODE | 7151 #endif 7152 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 7153 VM_ENTRY_LOAD_CET_STATE; 7154 msrs->entry_ctls_high |= 7155 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 7156 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 7157 7158 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7159 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7160 msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; 7161 7162 /* We support free control of debug control loading. 
*/ 7163 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 7164 } 7165 7166 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, 7167 struct nested_vmx_msrs *msrs) 7168 { 7169 msrs->procbased_ctls_low = 7170 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7171 7172 msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; 7173 msrs->procbased_ctls_high &= 7174 CPU_BASED_INTR_WINDOW_EXITING | 7175 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 7176 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 7177 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 7178 CPU_BASED_CR3_STORE_EXITING | 7179 #ifdef CONFIG_X86_64 7180 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 7181 #endif 7182 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 7183 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 7184 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 7185 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 7186 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 7187 /* 7188 * We can allow some features even when not supported by the 7189 * hardware. For example, L1 can specify an MSR bitmap - and we 7190 * can use it to avoid exits to L1 - even when L0 runs L2 7191 * without MSR bitmaps. 7192 */ 7193 msrs->procbased_ctls_high |= 7194 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7195 CPU_BASED_USE_MSR_BITMAPS; 7196 7197 /* We support free control of CR3 access interception. 
*/ 7198 msrs->procbased_ctls_low &= 7199 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 7200 } 7201 7202 static void nested_vmx_setup_secondary_ctls(u32 ept_caps, 7203 struct vmcs_config *vmcs_conf, 7204 struct nested_vmx_msrs *msrs) 7205 { 7206 msrs->secondary_ctls_low = 0; 7207 7208 msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; 7209 msrs->secondary_ctls_high &= 7210 SECONDARY_EXEC_DESC | 7211 SECONDARY_EXEC_ENABLE_RDTSCP | 7212 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7213 SECONDARY_EXEC_WBINVD_EXITING | 7214 SECONDARY_EXEC_APIC_REGISTER_VIRT | 7215 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 7216 SECONDARY_EXEC_RDRAND_EXITING | 7217 SECONDARY_EXEC_ENABLE_INVPCID | 7218 SECONDARY_EXEC_ENABLE_VMFUNC | 7219 SECONDARY_EXEC_RDSEED_EXITING | 7220 SECONDARY_EXEC_ENABLE_XSAVES | 7221 SECONDARY_EXEC_TSC_SCALING | 7222 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 7223 7224 /* 7225 * We can emulate "VMCS shadowing," even if the hardware 7226 * doesn't support it. 7227 */ 7228 msrs->secondary_ctls_high |= 7229 SECONDARY_EXEC_SHADOW_VMCS; 7230 7231 if (enable_ept) { 7232 /* nested EPT: emulate EPT also to L1 */ 7233 msrs->secondary_ctls_high |= 7234 SECONDARY_EXEC_ENABLE_EPT; 7235 msrs->ept_caps = 7236 VMX_EPT_PAGE_WALK_4_BIT | 7237 VMX_EPT_PAGE_WALK_5_BIT | 7238 VMX_EPTP_WB_BIT | 7239 VMX_EPT_INVEPT_BIT | 7240 VMX_EPT_EXECUTE_ONLY_BIT; 7241 7242 msrs->ept_caps &= ept_caps; 7243 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 7244 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 7245 VMX_EPT_1GB_PAGE_BIT; 7246 if (enable_ept_ad_bits) { 7247 msrs->secondary_ctls_high |= 7248 SECONDARY_EXEC_ENABLE_PML; 7249 msrs->ept_caps |= VMX_EPT_AD_BIT; 7250 } 7251 7252 /* 7253 * Advertise EPTP switching irrespective of hardware support, 7254 * KVM emulates it in software so long as VMFUNC is supported. 
7255 */ 7256 if (cpu_has_vmx_vmfunc()) 7257 msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; 7258 } 7259 7260 /* 7261 * Old versions of KVM use the single-context version without 7262 * checking for support, so declare that it is supported even 7263 * though it is treated as global context. The alternative is 7264 * not failing the single-context invvpid, and it is worse. 7265 */ 7266 if (enable_vpid) { 7267 msrs->secondary_ctls_high |= 7268 SECONDARY_EXEC_ENABLE_VPID; 7269 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 7270 VMX_VPID_EXTENT_SUPPORTED_MASK; 7271 } 7272 7273 if (enable_unrestricted_guest) 7274 msrs->secondary_ctls_high |= 7275 SECONDARY_EXEC_UNRESTRICTED_GUEST; 7276 7277 if (flexpriority_enabled) 7278 msrs->secondary_ctls_high |= 7279 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7280 7281 if (enable_sgx) 7282 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 7283 } 7284 7285 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, 7286 struct nested_vmx_msrs *msrs) 7287 { 7288 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7289 msrs->misc_low |= 7290 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7291 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7292 VMX_MISC_ACTIVITY_HLT | 7293 VMX_MISC_ACTIVITY_WAIT_SIPI; 7294 msrs->misc_high = 0; 7295 } 7296 7297 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) 7298 { 7299 /* 7300 * This MSR reports some information about VMX support. We 7301 * should return information about the VMX we emulate for the 7302 * guest, and the VMCS structure we give it - not about the 7303 * VMX support of the underlying hardware. 
7304 */ 7305 msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, 7306 X86_MEMTYPE_WB); 7307 7308 msrs->basic |= VMX_BASIC_TRUE_CTLS; 7309 if (cpu_has_vmx_basic_inout()) 7310 msrs->basic |= VMX_BASIC_INOUT; 7311 if (cpu_has_vmx_basic_no_hw_errcode_cc()) 7312 msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; 7313 } 7314 7315 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) 7316 { 7317 /* 7318 * These MSRs specify bits which the guest must keep fixed on 7319 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 7320 * We picked the standard core2 setting. 7321 */ 7322 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 7323 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 7324 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 7325 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 7326 7327 /* These MSRs specify bits which the guest must keep fixed off. */ 7328 rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 7329 rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 7330 7331 if (vmx_umip_emulated()) 7332 msrs->cr4_fixed1 |= X86_CR4_UMIP; 7333 } 7334 7335 /* 7336 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 7337 * returned for the various VMX controls MSRs when nested VMX is enabled. 7338 * The same values should also be used to verify that vmcs12 control fields are 7339 * valid during nested entry from L1 to L2. 7340 * Each of these control msrs has a low and high 32-bit half: A low bit is on 7341 * if the corresponding bit in the (32-bit) control field *must* be on, and a 7342 * bit in the high half is on if the corresponding bit in the control field 7343 * may be on. See also vmx_control_verify(). 
7344 */ 7345 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) 7346 { 7347 struct nested_vmx_msrs *msrs = &vmcs_conf->nested; 7348 7349 /* 7350 * Note that as a general rule, the high half of the MSRs (bits in 7351 * the control fields which may be 1) should be initialized by the 7352 * intersection of the underlying hardware's MSR (i.e., features which 7353 * can be supported) and the list of features we want to expose - 7354 * because they are known to be properly supported in our code. 7355 * Also, usually, the low half of the MSRs (bits which must be 1) can 7356 * be set to 0, meaning that L1 may turn off any of these bits. The 7357 * reason is that if one of these bits is necessary, it will appear 7358 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 7359 * fields of vmcs01 and vmcs02, will turn these bits off - and 7360 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 7361 * These rules have exceptions below. 7362 */ 7363 nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); 7364 7365 nested_vmx_setup_exit_ctls(vmcs_conf, msrs); 7366 7367 nested_vmx_setup_entry_ctls(vmcs_conf, msrs); 7368 7369 nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); 7370 7371 nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); 7372 7373 nested_vmx_setup_misc_data(vmcs_conf, msrs); 7374 7375 nested_vmx_setup_basic(msrs); 7376 7377 nested_vmx_setup_cr_fixed(msrs); 7378 7379 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 7380 } 7381 7382 void nested_vmx_hardware_unsetup(void) 7383 { 7384 int i; 7385 7386 if (enable_shadow_vmcs) { 7387 for (i = 0; i < VMX_BITMAP_NR; i++) 7388 free_page((unsigned long)vmx_bitmap[i]); 7389 } 7390 } 7391 7392 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7393 { 7394 int i; 7395 7396 /* 7397 * Note! The set of supported vmcs12 fields is consumed by both VMX 7398 * MSR and shadow VMCS setup. 
7399 */ 7400 nested_vmx_setup_vmcs12_fields(); 7401 7402 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 7403 7404 if (!cpu_has_vmx_shadow_vmcs()) 7405 enable_shadow_vmcs = 0; 7406 if (enable_shadow_vmcs) { 7407 for (i = 0; i < VMX_BITMAP_NR; i++) { 7408 /* 7409 * The vmx_bitmap is not tied to a VM and so should 7410 * not be charged to a memcg. 7411 */ 7412 vmx_bitmap[i] = (unsigned long *) 7413 __get_free_page(GFP_KERNEL); 7414 if (!vmx_bitmap[i]) { 7415 nested_vmx_hardware_unsetup(); 7416 return -ENOMEM; 7417 } 7418 } 7419 7420 init_vmcs_shadow_fields(); 7421 } 7422 7423 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 7424 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 7425 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 7426 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 7427 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 7428 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 7429 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 7430 exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; 7431 exit_handlers[EXIT_REASON_VMON] = handle_vmxon; 7432 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 7433 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 7434 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 7435 7436 return 0; 7437 } 7438 7439 struct kvm_x86_nested_ops vmx_nested_ops = { 7440 .leave_nested = vmx_leave_nested, 7441 .is_exception_vmexit = nested_vmx_is_exception_vmexit, 7442 .check_events = vmx_check_nested_events, 7443 .has_events = vmx_has_nested_events, 7444 .triple_fault = nested_vmx_triple_fault, 7445 .get_state = vmx_get_nested_state, 7446 .set_state = vmx_set_nested_state, 7447 .get_nested_state_pages = vmx_get_nested_state_pages, 7448 .write_log_dirty = nested_vmx_write_pml_buffer, 7449 #ifdef CONFIG_KVM_HYPERV 7450 .enable_evmcs = nested_enable_evmcs, 7451 .get_evmcs_version = nested_get_evmcs_version, 7452 .hv_inject_synthetic_vmexit_post_tlb_flush = 
vmx_hv_inject_synthetic_vmexit_post_tlb_flush, 7453 #endif 7454 }; 7455