// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>
#include <asm/msr.h>

#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "smm.h"
#include "x86_ops.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __ro_after_init warn_on_missed_cc;
module_param(warn_on_missed_cc, bool, 0444);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		if (get_vmcs12_field_offset(field) < 0)
			continue;

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		if (get_vmcs12_field_offset(field) < 0)
			continue;

		/*
		 * KVM emulates PML and the VMX preemption timer irrespective
		 * of hardware support, but shadowing their related VMCS fields
		 * requires hardware support as the CPU will reject VMWRITEs to
		 * fields that don't exist.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
	vmx->nested.hv_evmcs = NULL;
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's
	 * no good way to distinguish it from VMCS12) and we must not corrupt
	 * it by writing to the non-existent 'launch_state' field. The area
	 * doesn't have to be the currently active EVMCS on the calling CPU
	 * and there's nothing KVM has to do to transition it from 'active' to
	 * 'non-active' state. It is possible that the area will stay mapped
	 * as vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (!guest_cpu_cap_has_evmcs(vcpu) ||
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
		return false;

	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
		nested_release_evmcs(vcpu);

	return true;
#else
	return false;
#endif
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->vt.guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

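/*
 * Release the guest memory mappings (APIC-access page, virtual-APIC page and
 * posted-interrupt descriptor) that were established on behalf of vmcs12.
 */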
static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
	vmx->nested.pi_desc = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;

	nested_put_vmcs12_pages(vcpu);

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qualification;
	u32 vm_exit_reason;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;

		/*
		 * It should be impossible to trigger a nested PML Full VM-Exit
		 * for anything other than an EPT Violation from L2.  KVM *can*
		 * trigger nEPT page fault injection in response to an EPT
		 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
		 * tables also changed, but KVM should not treat EPT Misconfig
		 * VM-Exits as writes.
		 */
		WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);

		/*
		 * PML Full and EPT Violation VM-Exits both use bit 12 to report
		 * "NMI unblocking due to IRET", i.e. the bit can be propagated
		 * as-is from the original EXIT_QUALIFICATION.
		 */
		exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK) {
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
			exit_qualification = 0;
		} else {
			exit_qualification = fault->exit_qualification;
			exit_qualification |= vmx_get_exit_qual(vcpu) &
					      (EPT_VIOLATION_GVA_IS_VALID |
					       EPT_VIOLATION_GVA_TRANSLATED);
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
		}

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code.  Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

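/*
 * Generate helpers that update the vmcs02 MSR bitmap for a single MSR: the
 * MSR is intercepted if either vmcs01 (L0) or the L1 bitmap wants to
 * intercept it, and is passed through to L2 only if both L0 and L1 allow it.
 */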
#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
static inline \
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
					 unsigned long *msr_bitmap_l1, \
					 unsigned long *msr_bitmap_l0, u32 msr) \
{ \
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
	else \
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

#define nested_vmx_merge_msr_bitmaps(msr, type) \
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \
					 msr_bitmap_l0, msr, type)

#define nested_vmx_merge_msr_bitmaps_read(msr) \
	nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R)

#define nested_vmx_merge_msr_bitmaps_write(msr) \
	nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W)

#define nested_vmx_merge_msr_bitmaps_rw(msr) \
	nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW)

static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu,
					     unsigned long *msr_bitmap_l1,
					     unsigned long *msr_bitmap_l0)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int i;

	/*
	 * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if
	 * none of the MSRs can possibly be passed through to L1.
	 */
	if (!kvm_vcpu_has_mediated_pmu(vcpu))
		return;

	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
		nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i);
		nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i);
	}

	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
		nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i);

	nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL);
	nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS);
	nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
			return true;
	}

	if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map.hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE);
	nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE);
	nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE);
#endif
	nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL);
	nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD);
	nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_APERF, MSR_TYPE_R);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_MPERF, MSR_TYPE_R);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_U_CET, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_S_CET, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL0_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL1_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL2_SSP, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PL3_SSP, MSR_TYPE_RW);

	nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0);

	kvm_vcpu_unmap(vcpu, &map);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

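/*
 * Copy the shadow vmcs12 from guest memory into KVM's cache, (re)initializing
 * the gfn_to_hva cache if the VMCS link pointer has changed.
 */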
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT.
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv; the descriptor address
	 * has already been checked in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

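/*
 * Return the maximum number of MSRs that can appear in a single atomic switch
 * (VM-Entry/VM-Exit MSR load/store) list, as derived from the virtual
 * IA32_VMX_MISC MSR exposed to L1.
 */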
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	/*
	 * Exceeding the limit results in architecturally _undefined_ behavior,
	 * i.e. KVM is allowed to do literally anything in response to a bad
	 * limit.  Immediately generate a consistency check so that code that
	 * consumes the count doesn't need to worry about extreme edge cases.
	 */
	if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
		return -EINVAL;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity.  To maintain compatibility with hardware as much
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_emulate_msr_write(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) {
		int slot = vmx->nested.tsc_autostore_slot;
		u64 host_tsc = vmx->msr_autostore.val[slot].value;

		*data = kvm_read_l1_tsc(vcpu, host_tsc);
		return true;
	}

	if (kvm_emulate_msr_read(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (WARN_ON_ONCE(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host, and so architecturally, linear and combined
	 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit.  KVM
	 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
	 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01.  This
	 * is required if VPID is disabled in KVM, as a TLB flush (there are no
	 * VPIDs) still occurs from L1's perspective, and KVM may need to
	 * synchronize the MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

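/*
 * Return true if, considering only the bits selected by @mask, @subset does
 * not set any bits that are missing from @superset.
 */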
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
				 VMX_BASIC_INOUT |
				 VMX_BASIC_TRUE_CTLS |
				 VMX_BASIC_NO_HW_ERROR_CODE_CC;

	const u64 reserved_bits = GENMASK_ULL(63, 57) |
				  GENMASK_ULL(47, 45) |
				  BIT_ULL(31);

	u64 vmx_basic = vmcs_config.nested.basic;

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
	 * inverted polarity), the incoming value must not set feature bits or
	 * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
	 * multi-bit values, are explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
				 VMX_MISC_ACTIVITY_HLT |
				 VMX_MISC_ACTIVITY_SHUTDOWN |
				 VMX_MISC_ACTIVITY_WAIT_SIPI |
				 VMX_MISC_INTEL_PT |
				 VMX_MISC_RDMSR_IN_SMM |
				 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
				 VMX_MISC_VMXOFF_BLOCK_SMI |
				 VMX_MISC_ZERO_LEN_INS;

	const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);

	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * The incoming value must not set feature bits or reserved bits that
	 * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
	 * explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicate bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

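/*
 * Copy all shadowed fields, both read/write and read-only, from the cached
 * vmcs12 into the shadow VMCS so that hardware can satisfy L1's VMREADs (and
 * VMWRITEs to the read/write fields) without a VM-Exit.
 */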
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
evmcs->guest_es_ar_bytes; 1815 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1816 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1817 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1818 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1819 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1820 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1821 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1822 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1823 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1824 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1825 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1826 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1827 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1828 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1829 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1830 } 1831 1832 if (unlikely(!(hv_clean_fields & 1833 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1834 vmcs12->tsc_offset = evmcs->tsc_offset; 1835 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1836 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1837 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1838 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1839 } 1840 1841 if (unlikely(!(hv_clean_fields & 1842 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1843 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1844 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1845 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1846 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1847 vmcs12->guest_cr0 = evmcs->guest_cr0; 1848 vmcs12->guest_cr3 = evmcs->guest_cr3; 1849 vmcs12->guest_cr4 = evmcs->guest_cr4; 1850 vmcs12->guest_dr7 = evmcs->guest_dr7; 1851 } 1852 1853 if (unlikely(!(hv_clean_fields & 1854 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1855 vmcs12->host_fs_base = evmcs->host_fs_base; 1856 vmcs12->host_gs_base = evmcs->host_gs_base; 1857 vmcs12->host_tr_base = evmcs->host_tr_base; 1858 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1859 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1860 vmcs12->host_rsp = evmcs->host_rsp; 1861 } 1862 1863 if (unlikely(!(hv_clean_fields & 1864 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1865 vmcs12->ept_pointer = evmcs->ept_pointer; 1866 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1867 } 1868 1869 if (unlikely(!(hv_clean_fields & 1870 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1871 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1872 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1873 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1874 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1875 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1876 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1877 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1878 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1879 vmcs12->guest_pending_dbg_exceptions = 1880 evmcs->guest_pending_dbg_exceptions; 1881 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1882 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1883 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1884 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1885 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1886 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1887 /* 1888 * Not present in struct vmcs12: 1889 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1890 * 
vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1891 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1892 */ 1893 } 1894 1895 /* 1896 * Not used? 1897 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1898 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1899 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1900 * vmcs12->page_fault_error_code_mask = 1901 * evmcs->page_fault_error_code_mask; 1902 * vmcs12->page_fault_error_code_match = 1903 * evmcs->page_fault_error_code_match; 1904 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1905 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1906 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1907 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1908 */ 1909 1910 /* 1911 * Read only fields: 1912 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1913 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1914 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1915 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1916 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1917 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1918 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1919 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1920 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1921 * vmcs12->exit_qualification = evmcs->exit_qualification; 1922 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1923 * 1924 * Not present in struct vmcs12: 1925 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1926 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1927 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1928 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1929 */ 1930 1931 return; 1932 #else /* CONFIG_KVM_HYPERV */ 1933 KVM_BUG_ON(1, vmx->vcpu.kvm); 1934 #endif /* CONFIG_KVM_HYPERV */ 1935 } 1936 1937 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1938 { 1939 #ifdef CONFIG_KVM_HYPERV 1940 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1941 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1942 1943 /* 1944 * Should not be changed by KVM: 1945 * 1946 * evmcs->host_es_selector = vmcs12->host_es_selector; 1947 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1948 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1949 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1950 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1951 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1952 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1953 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1954 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1955 * evmcs->host_cr0 = vmcs12->host_cr0; 1956 * evmcs->host_cr3 = vmcs12->host_cr3; 1957 * evmcs->host_cr4 = vmcs12->host_cr4; 1958 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1959 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1960 * evmcs->host_rip = vmcs12->host_rip; 1961 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1962 * evmcs->host_fs_base = vmcs12->host_fs_base; 1963 * evmcs->host_gs_base = vmcs12->host_gs_base; 1964 * evmcs->host_tr_base = vmcs12->host_tr_base; 1965 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1966 * evmcs->host_idtr_base = 
vmcs12->host_idtr_base; 1967 * evmcs->host_rsp = vmcs12->host_rsp; 1968 * sync_vmcs02_to_vmcs12() doesn't read these: 1969 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1970 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1971 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1972 * evmcs->ept_pointer = vmcs12->ept_pointer; 1973 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1974 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1975 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1976 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1977 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1978 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1979 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1980 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1981 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1982 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1983 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1984 * evmcs->page_fault_error_code_mask = 1985 * vmcs12->page_fault_error_code_mask; 1986 * evmcs->page_fault_error_code_match = 1987 * vmcs12->page_fault_error_code_match; 1988 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1989 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1990 * evmcs->tsc_offset = vmcs12->tsc_offset; 1991 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1992 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1993 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1994 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1995 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1996 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1997 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1998 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1999 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 2000 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 2001 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 2002 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 2003 * 2004 * Not present in struct vmcs12: 2005 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 2006 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 2007 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 2008 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 2009 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 2010 * evmcs->host_ssp = vmcs12->host_ssp; 2011 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 2012 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 2013 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 2014 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 2015 * evmcs->guest_ssp = vmcs12->guest_ssp; 2016 */ 2017 2018 evmcs->guest_es_selector = vmcs12->guest_es_selector; 2019 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 2020 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 2021 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 2022 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 2023 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 2024 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 2025 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 2026 2027 evmcs->guest_es_limit = vmcs12->guest_es_limit; 2028 evmcs->guest_cs_limit = 
vmcs12->guest_cs_limit; 2029 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 2030 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 2031 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 2032 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 2033 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 2034 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 2035 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 2036 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 2037 2038 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 2039 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 2040 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 2041 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 2042 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 2043 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 2044 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 2045 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 2046 2047 evmcs->guest_es_base = vmcs12->guest_es_base; 2048 evmcs->guest_cs_base = vmcs12->guest_cs_base; 2049 evmcs->guest_ss_base = vmcs12->guest_ss_base; 2050 evmcs->guest_ds_base = vmcs12->guest_ds_base; 2051 evmcs->guest_fs_base = vmcs12->guest_fs_base; 2052 evmcs->guest_gs_base = vmcs12->guest_gs_base; 2053 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 2054 evmcs->guest_tr_base = vmcs12->guest_tr_base; 2055 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 2056 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 2057 2058 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 2059 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 2060 2061 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 2062 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2063 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2064 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2065 2066 evmcs->guest_pending_dbg_exceptions = 2067 vmcs12->guest_pending_dbg_exceptions; 2068 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2069 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2070 2071 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2072 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2073 2074 evmcs->guest_cr0 = vmcs12->guest_cr0; 2075 evmcs->guest_cr3 = vmcs12->guest_cr3; 2076 evmcs->guest_cr4 = vmcs12->guest_cr4; 2077 evmcs->guest_dr7 = vmcs12->guest_dr7; 2078 2079 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2080 2081 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2082 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2083 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2084 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2085 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2086 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2087 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2088 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2089 2090 evmcs->exit_qualification = vmcs12->exit_qualification; 2091 2092 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2093 evmcs->guest_rsp = vmcs12->guest_rsp; 2094 evmcs->guest_rflags = vmcs12->guest_rflags; 2095 2096 evmcs->guest_interruptibility_info = 2097 vmcs12->guest_interruptibility_info; 2098 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2099 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2100 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2101 evmcs->vm_entry_exception_error_code = 2102 vmcs12->vm_entry_exception_error_code; 2103 
evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
2104
2105 evmcs->guest_rip = vmcs12->guest_rip;
2106
2107 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
2108
2109 return;
2110 #else /* CONFIG_KVM_HYPERV */
2111 KVM_BUG_ON(1, vmx->vcpu.kvm);
2112 #endif /* CONFIG_KVM_HYPERV */
2113 }
2114
2115 /*
2116 * This is the equivalent of the nested hypervisor executing the VMPTRLD
2117 * instruction.
2118 */
2119 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
2120 struct kvm_vcpu *vcpu, bool from_launch)
2121 {
2122 #ifdef CONFIG_KVM_HYPERV
2123 struct vcpu_vmx *vmx = to_vmx(vcpu);
2124 bool evmcs_gpa_changed = false;
2125 u64 evmcs_gpa;
2126
2127 if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
2128 return EVMPTRLD_DISABLED;
2129
2130 evmcs_gpa = nested_get_evmptr(vcpu);
2131 if (!evmptr_is_valid(evmcs_gpa)) {
2132 nested_release_evmcs(vcpu);
2133 return EVMPTRLD_DISABLED;
2134 }
2135
2136 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
2137 vmx->nested.current_vmptr = INVALID_GPA;
2138
2139 nested_release_evmcs(vcpu);
2140
2141 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2142 &vmx->nested.hv_evmcs_map))
2143 return EVMPTRLD_ERROR;
2144
2145 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2146
2147 /*
2148 * Currently, KVM only supports eVMCS version 1
2149 * (== KVM_EVMCS_VERSION), so the guest is expected to write that
2150 * value to the first u32 field of the eVMCS, which specifies the
2151 * eVMCS VersionNumber.
2152 *
2153 * The guest discovers the eVMCS versions supported by the host by
2154 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
2155 * expected to set this CPUID leaf according to the value
2156 * returned in vmcs_version from nested_enable_evmcs().
2157 *
2158 * However, it turns out that Microsoft Hyper-V fails to comply
2159 * with its own invented interface: when Hyper-V uses eVMCS, it
2160 * just sets the first u32 field of the eVMCS to the revision_id
2161 * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
2162 * number, i.e. one of the supported versions specified in
2163 * CPUID.0x4000000A.EAX[0:15].
2164 *
2165 * To work around this Hyper-V bug, accept either a supported
2166 * eVMCS version or the VMCS12 revision_id as valid values for the
2167 * first u32 field of the eVMCS.
2168 */
2169 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2170 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2171 nested_release_evmcs(vcpu);
2172 return EVMPTRLD_VMFAIL;
2173 }
2174
2175 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2176
2177 evmcs_gpa_changed = true;
2178 /*
2179 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
2180 * reloaded from the guest's memory (read-only fields, fields not
2181 * present in struct hv_enlightened_vmcs, ...). Make sure there
2182 * are no leftovers.
2183 */
2184 if (from_launch) {
2185 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2186 memset(vmcs12, 0, sizeof(*vmcs12));
2187 vmcs12->hdr.revision_id = VMCS12_REVISION;
2188 }
2189
2190 }
2191
2192 /*
2193 * Clean-fields data can't be used on VMLAUNCH or when switching
2194 * between different L2 guests, as KVM keeps a single vmcs12 per L1.
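 * (Hence the code below clears hv_clean_fields whenever this is a
 * VMLAUNCH or the eVMCS pointer has changed, forcing a full
 * copy_enlightened_to_vmcs12() on the next sync.)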
2195 */ 2196 if (from_launch || evmcs_gpa_changed) { 2197 vmx->nested.hv_evmcs->hv_clean_fields &= 2198 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2199 2200 vmx->nested.force_msr_bitmap_recalc = true; 2201 } 2202 2203 return EVMPTRLD_SUCCEEDED; 2204 #else 2205 return EVMPTRLD_DISABLED; 2206 #endif 2207 } 2208 2209 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2210 { 2211 struct vcpu_vmx *vmx = to_vmx(vcpu); 2212 2213 if (nested_vmx_is_evmptr12_valid(vmx)) 2214 copy_vmcs12_to_enlightened(vmx); 2215 else 2216 copy_vmcs12_to_shadow(vmx); 2217 2218 vmx->nested.need_vmcs12_to_shadow_sync = false; 2219 } 2220 2221 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2222 { 2223 struct vcpu_vmx *vmx = 2224 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2225 2226 vmx->nested.preemption_timer_expired = true; 2227 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2228 kvm_vcpu_kick(&vmx->vcpu); 2229 2230 return HRTIMER_NORESTART; 2231 } 2232 2233 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2234 { 2235 struct vcpu_vmx *vmx = to_vmx(vcpu); 2236 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2237 2238 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2239 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2240 2241 if (!vmx->nested.has_preemption_timer_deadline) { 2242 vmx->nested.preemption_timer_deadline = 2243 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2244 vmx->nested.has_preemption_timer_deadline = true; 2245 } 2246 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2247 } 2248 2249 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2250 u64 preemption_timeout) 2251 { 2252 struct vcpu_vmx *vmx = to_vmx(vcpu); 2253 2254 /* 2255 * A timer value of zero is architecturally guaranteed to cause 2256 * a VMExit prior to executing any instructions in the guest. 2257 */ 2258 if (preemption_timeout == 0) { 2259 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2260 return; 2261 } 2262 2263 if (vcpu->arch.virtual_tsc_khz == 0) 2264 return; 2265 2266 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2267 preemption_timeout *= 1000000; 2268 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2269 hrtimer_start(&vmx->nested.preemption_timer, 2270 ktime_add_ns(ktime_get(), preemption_timeout), 2271 HRTIMER_MODE_ABS_PINNED); 2272 } 2273 2274 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2275 { 2276 if (vmx->nested.nested_run_pending && 2277 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2278 return vmcs12->guest_ia32_efer; 2279 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2280 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2281 else 2282 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2283 } 2284 2285 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2286 { 2287 struct kvm *kvm = vmx->vcpu.kvm; 2288 2289 /* 2290 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2291 * according to L0's settings (vmcs12 is irrelevant here). Host 2292 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2293 * will be set as needed prior to VMLAUNCH/VMRESUME. 2294 */ 2295 if (vmx->nested.vmcs02_initialized) 2296 return; 2297 vmx->nested.vmcs02_initialized = true; 2298 2299 if (vmx->ve_info) 2300 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2301 2302 /* All VMFUNCs are currently emulated through L0 vmexits. 
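(E.g. an EPTP-switching VMFUNC executed by L2 traps to L0 and is emulated against vmcs12's EPTP list, so no VM function is ever enabled in vmcs02; hence VM_FUNCTION_CONTROL is simply zeroed below.)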
*/ 2303 if (cpu_has_vmx_vmfunc()) 2304 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2305 2306 if (cpu_has_vmx_posted_intr()) 2307 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2308 2309 if (cpu_has_vmx_msr_bitmap()) 2310 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2311 2312 /* 2313 * PML is emulated for L2, but never enabled in hardware as the MMU 2314 * handles A/D emulation. Disabling PML for L2 also avoids having to 2315 * deal with filtering out L2 GPAs from the buffer. 2316 */ 2317 if (enable_pml) { 2318 vmcs_write64(PML_ADDRESS, 0); 2319 vmcs_write16(GUEST_PML_INDEX, -1); 2320 } 2321 2322 if (cpu_has_vmx_encls_vmexit()) 2323 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2324 2325 if (kvm_notify_vmexit_enabled(kvm)) 2326 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2327 2328 /* 2329 * Set the MSR load/store lists to match L0's settings. Only the 2330 * addresses are constant (for vmcs02), the counts can change based 2331 * on L2's behavior, e.g. switching to/from long mode. 2332 */ 2333 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); 2334 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2335 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2336 2337 vmx_set_constant_host_state(vmx); 2338 } 2339 2340 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2341 struct vmcs12 *vmcs12) 2342 { 2343 prepare_vmcs02_constant_state(vmx); 2344 2345 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2346 2347 /* 2348 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 2349 * same VPID as the host. Emulate this behavior by using vpid01 for L2 2350 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter 2351 * and VM-Exit are architecturally required to flush VPID=0, but *only* 2352 * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the 2353 * required flushes), but doing so would cause KVM to over-flush. E.g. 2354 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, 2355 * and then runs L2 X again, then KVM can and should retain TLB entries 2356 * for VPID12=1. 2357 */ 2358 if (enable_vpid) { 2359 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2360 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2361 else 2362 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2363 } 2364 } 2365 2366 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2367 struct vmcs12 *vmcs12) 2368 { 2369 u32 exec_control; 2370 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2371 2372 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2373 prepare_vmcs02_early_rare(vmx, vmcs12); 2374 2375 /* 2376 * PIN CONTROLS 2377 */ 2378 exec_control = __pin_controls_get(vmcs01); 2379 exec_control |= (vmcs12->pin_based_vm_exec_control & 2380 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2381 2382 /* Posted interrupts setting is only taken from vmcs12. 
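vmcs12's notification vector is only recorded below for delivery emulation; the vector actually programmed into vmcs02's POSTED_INTR_NV is KVM's own POSTED_INTR_NESTED_VECTOR, set in prepare_vmcs02_constant_state().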
*/ 2383 vmx->nested.pi_pending = false; 2384 if (nested_cpu_has_posted_intr(vmcs12)) { 2385 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2386 } else { 2387 vmx->nested.posted_intr_nv = -1; 2388 exec_control &= ~PIN_BASED_POSTED_INTR; 2389 } 2390 pin_controls_set(vmx, exec_control); 2391 2392 /* 2393 * EXEC CONTROLS 2394 */ 2395 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2396 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2397 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2398 exec_control &= ~CPU_BASED_TPR_SHADOW; 2399 exec_control |= vmcs12->cpu_based_vm_exec_control; 2400 2401 if (exec_control & CPU_BASED_TPR_SHADOW) 2402 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2403 #ifdef CONFIG_X86_64 2404 else 2405 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2406 CPU_BASED_CR8_STORE_EXITING; 2407 #endif 2408 2409 /* 2410 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2411 * for I/O port accesses. 2412 */ 2413 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2414 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2415 2416 /* 2417 * This bit will be computed in nested_get_vmcs12_pages, because 2418 * we do not have access to L1's MSR bitmap yet. For now, keep 2419 * the same bit as before, hoping to avoid multiple VMWRITEs that 2420 * only set/clear this bit. 2421 */ 2422 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2423 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2424 2425 exec_controls_set(vmx, exec_control); 2426 2427 /* 2428 * SECONDARY EXEC CONTROLS 2429 */ 2430 if (cpu_has_secondary_exec_ctrls()) { 2431 exec_control = __secondary_exec_controls_get(vmcs01); 2432 2433 /* Take the following fields only from vmcs12 */ 2434 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2435 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2436 SECONDARY_EXEC_ENABLE_INVPCID | 2437 SECONDARY_EXEC_ENABLE_RDTSCP | 2438 SECONDARY_EXEC_ENABLE_XSAVES | 2439 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2440 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2441 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2442 SECONDARY_EXEC_ENABLE_VMFUNC | 2443 SECONDARY_EXEC_DESC); 2444 2445 if (nested_cpu_has(vmcs12, 2446 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2447 exec_control |= vmcs12->secondary_vm_exec_control; 2448 2449 /* PML is emulated and never enabled in hardware for L2. */ 2450 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2451 2452 /* VMCS shadowing for L2 is emulated for now */ 2453 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2454 2455 /* 2456 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2457 * will not have to rewrite the controls just for this bit. 2458 */ 2459 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2460 exec_control |= SECONDARY_EXEC_DESC; 2461 2462 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2463 vmcs_write16(GUEST_INTR_STATUS, 2464 vmcs12->guest_intr_status); 2465 2466 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2467 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2468 2469 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2470 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2471 2472 secondary_exec_controls_set(vmx, exec_control); 2473 } 2474 2475 /* 2476 * ENTRY CONTROLS 2477 * 2478 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2479 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2480 * on the related bits (if supported by the CPU) in the hope that 2481 * we can avoid VMWrites during vmx_set_efer(). 
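 * (The speculation mirrors nested_vmx_calc_efer(): IA32E_MODE is set iff
 * the computed guest EFER has LMA set, and LOAD_IA32_EFER is needed only
 * when the guest EFER differs from the host's value.)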
2482 * 2483 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2484 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2485 * do the same for L2. 2486 */ 2487 exec_control = __vm_entry_controls_get(vmcs01); 2488 exec_control |= (vmcs12->vm_entry_controls & 2489 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2490 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2491 if (cpu_has_load_ia32_efer()) { 2492 if (guest_efer & EFER_LMA) 2493 exec_control |= VM_ENTRY_IA32E_MODE; 2494 if (guest_efer != kvm_host.efer) 2495 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2496 } 2497 vm_entry_controls_set(vmx, exec_control); 2498 2499 /* 2500 * EXIT CONTROLS 2501 * 2502 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2503 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2504 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2505 */ 2506 exec_control = __vm_exit_controls_get(vmcs01); 2507 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2508 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2509 else 2510 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2511 vm_exit_controls_set(vmx, exec_control); 2512 2513 /* 2514 * Interrupt/Exception Fields 2515 */ 2516 if (vmx->nested.nested_run_pending) { 2517 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2518 vmcs12->vm_entry_intr_info_field); 2519 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2520 vmcs12->vm_entry_exception_error_code); 2521 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2522 vmcs12->vm_entry_instruction_len); 2523 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2524 vmcs12->guest_interruptibility_info); 2525 vmx->loaded_vmcs->nmi_known_unmasked = 2526 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2527 } else { 2528 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2529 } 2530 } 2531 2532 static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, 2533 u64 *ssp, u64 *ssp_tbl) 2534 { 2535 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2536 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2537 *s_cet = vmcs_readl(GUEST_S_CET); 2538 2539 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2540 *ssp = vmcs_readl(GUEST_SSP); 2541 *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); 2542 } 2543 } 2544 2545 static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, 2546 u64 ssp, u64 ssp_tbl) 2547 { 2548 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2549 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2550 vmcs_writel(GUEST_S_CET, s_cet); 2551 2552 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2553 vmcs_writel(GUEST_SSP, ssp); 2554 vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); 2555 } 2556 } 2557 2558 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2559 { 2560 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2561 2562 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2563 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2564 2565 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2566 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2567 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2568 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2569 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2570 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2571 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2572 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2573 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2574 vmcs_write32(GUEST_CS_LIMIT, 
vmcs12->guest_cs_limit); 2575 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2576 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2577 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2578 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2579 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2580 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2581 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2582 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2583 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2584 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2585 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2586 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2587 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2588 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2589 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2590 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2591 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2592 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2593 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2594 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2595 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2596 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2597 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2598 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2599 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2600 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2601 2602 vmx_segment_cache_clear(vmx); 2603 } 2604 2605 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2606 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2607 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2608 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2609 vmcs12->guest_pending_dbg_exceptions); 2610 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2611 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2612 2613 /* 2614 * L1 may access the L2's PDPTR, so save them to construct 2615 * vmcs12 2616 */ 2617 if (enable_ept) { 2618 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2619 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2620 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2621 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2622 } 2623 2624 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2625 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2626 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2627 } 2628 2629 if (nested_cpu_has_xsaves(vmcs12)) 2630 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2631 2632 /* 2633 * Whether page-faults are trapped is determined by a combination of 2634 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2635 * doesn't care about page faults then we should set all of these to 2636 * L1's desires. However, if L0 does care about (some) page faults, it 2637 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2638 * simply ask to exit on each and every L2 page fault. This is done by 2639 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2640 * Note that below we don't need special code to set EB.PF beyond the 2641 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2642 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2643 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 
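 * (Architecturally, a page fault with error code PFEC causes a VM-Exit
 * iff EB.PF == ((PFEC & PFEC_MASK) == PFEC_MATCH); with MASK=MATCH=0 the
 * comparison is always true, so EB.PF=1 makes every L2 #PF exit.)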
2644 */
2645 if (vmx_need_pf_intercept(&vmx->vcpu)) {
2646 /*
2647 * TODO: if both L0 and L1 need the same MASK and MATCH,
2648 * go ahead and use it?
2649 */
2650 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2651 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2652 } else {
2653 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2654 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2655 }
2656
2657 if (cpu_has_vmx_apicv()) {
2658 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2659 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2660 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2661 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2662 }
2663
2664 /*
2665 * If vmcs12 is configured to save TSC on exit via the auto-store list,
2666 * append the MSR to vmcs02's auto-store list so that KVM effectively
2667 * reads the TSC at the time of VM-Exit from L2. The saved value will be
2668 * propagated to vmcs12's list on nested VM-Exit.
2669 *
2670 * Don't increment the number of MSRs in the vCPU structure, as saving
2671 * TSC is specific to this particular incarnation of vmcs02, i.e. must
2672 * not bleed into vmcs01.
2673 */
2674 if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) &&
2675 !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) {
2676 vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr;
2677 vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC;
2678
2679 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1);
2680 } else {
2681 vmx->nested.tsc_autostore_slot = -1;
2682 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr);
2683 }
2684 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2685 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2686
2687 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
2688 vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
2689 vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);
2690
2691 set_cr4_guest_host_mask(vmx);
2692 }
2693
2694 /*
2695 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2696 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2697 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2698 * guest in a way that is appropriate both to L1's requests and to our own
2699 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2700 * function also has additional necessary side-effects, like setting various
2701 * vcpu->arch fields.
2702 * Returns 0 on success, -EINVAL on failure. On failure, the VM-entry failure
2703 * exit qualification code is assigned to *entry_failure_code.
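 * (In the normal VM-Enter path this is called from
 * nested_vmx_enter_non_root_mode(), after vmcs02 has been loaded as the
 * active VMCS.)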
2704 */ 2705 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2706 bool from_vmentry, 2707 enum vm_entry_failure_code *entry_failure_code) 2708 { 2709 struct vcpu_vmx *vmx = to_vmx(vcpu); 2710 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 2711 bool load_guest_pdptrs_vmcs12 = false; 2712 2713 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { 2714 prepare_vmcs02_rare(vmx, vmcs12); 2715 vmx->nested.dirty_vmcs12 = false; 2716 2717 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || 2718 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2719 } 2720 2721 if (vmx->nested.nested_run_pending && 2722 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2723 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2724 vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & 2725 vmx_get_supported_debugctl(vcpu, false)); 2726 } else { 2727 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2728 vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); 2729 } 2730 2731 if (!vmx->nested.nested_run_pending || 2732 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 2733 vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, 2734 vmx->nested.pre_vmenter_ssp, 2735 vmx->nested.pre_vmenter_ssp_tbl); 2736 2737 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2738 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2739 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2740 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2741 2742 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2743 * bitwise-or of what L1 wants to trap for L2, and what we want to 2744 * trap. Note that CR0.TS also needs updating - we do this later. 2745 */ 2746 vmx_update_exception_bitmap(vcpu); 2747 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2748 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2749 2750 if (vmx->nested.nested_run_pending && 2751 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2752 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2753 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2754 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2755 vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat); 2756 } 2757 2758 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2759 vcpu->arch.l1_tsc_offset, 2760 vmx_get_l2_tsc_offset(vcpu), 2761 vmx_get_l2_tsc_multiplier(vcpu)); 2762 2763 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2764 vcpu->arch.l1_tsc_scaling_ratio, 2765 vmx_get_l2_tsc_multiplier(vcpu)); 2766 2767 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2768 if (kvm_caps.has_tsc_control) 2769 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2770 2771 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2772 2773 if (nested_cpu_has_ept(vmcs12)) 2774 nested_ept_init_mmu_context(vcpu); 2775 2776 /* 2777 * Override the CR0/CR4 read shadows after setting the effective guest 2778 * CR0/CR4. The common helpers also set the shadows, but they don't 2779 * account for vmcs12's cr0/4_guest_host_mask. 
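/*
 * (nested_read_cr0() folds the two together, roughly
 * (guest_cr0 & ~cr0_guest_host_mask) | (cr0_read_shadow & cr0_guest_host_mask);
 * nested_read_cr4() does the same for CR4.)
 */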
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2782 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2783
2784 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2785 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2786
2787 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2788 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2789 vmx_set_efer(vcpu, vcpu->arch.efer);
2790
2791 /*
2792 * Guest state being invalid while unrestricted guest is disabled
2793 * means that L1 attempted VMEntry to L2 with invalid state:
2794 * fail the VMEntry.
2795 *
2796 * However, when force loading the guest state (SMM exit or
2797 * loading nested state after migration), it is possible to
2798 * have invalid guest state now, which will be fixed later by
2799 * restoring the L2 register state.
2800 */
2801 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2802 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2803 return -EINVAL;
2804 }
2805
2806 /* Load the guest's CR3 and set up the MMU, using either nested EPT or shadow page tables. */
2807 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2808 from_vmentry, entry_failure_code))
2809 return -EINVAL;
2810
2811 /*
2812 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2813 * on nested VM-Exit, which can occur without actually running L2 and
2814 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2815 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2816 * transition to HLT instead of running L2.
2817 */
2818 if (enable_ept)
2819 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2820
2821 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2822 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2823 is_pae_paging(vcpu)) {
2824 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2825 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2826 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2827 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2828 }
2829
2830 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2831 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
2832 WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2833 vmcs12->guest_ia32_perf_global_ctrl))) {
2834 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2835 return -EINVAL;
2836 }
2837
2838 kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2839 kvm_rip_write(vcpu, vmcs12->guest_rip);
2840
2841 /*
2842 * It was observed that genuine Hyper-V running in L1 doesn't reset
2843 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
2844 * bits when it changes a field in the eVMCS. Mark all fields as clean
2845 * here.
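 * (With every clean bit set, the next copy_enlightened_to_vmcs12() only
 * copies the field groups whose clean bit L1 has cleared in the meantime.)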
2846 */ 2847 if (nested_vmx_is_evmptr12_valid(vmx)) 2848 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2849 2850 return 0; 2851 } 2852 2853 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2854 { 2855 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2856 nested_cpu_has_virtual_nmis(vmcs12))) 2857 return -EINVAL; 2858 2859 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2860 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2861 return -EINVAL; 2862 2863 return 0; 2864 } 2865 2866 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2867 { 2868 struct vcpu_vmx *vmx = to_vmx(vcpu); 2869 2870 /* Check for memory type validity */ 2871 switch (new_eptp & VMX_EPTP_MT_MASK) { 2872 case VMX_EPTP_MT_UC: 2873 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2874 return false; 2875 break; 2876 case VMX_EPTP_MT_WB: 2877 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2878 return false; 2879 break; 2880 default: 2881 return false; 2882 } 2883 2884 /* Page-walk levels validity. */ 2885 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2886 case VMX_EPTP_PWL_5: 2887 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2888 return false; 2889 break; 2890 case VMX_EPTP_PWL_4: 2891 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2892 return false; 2893 break; 2894 default: 2895 return false; 2896 } 2897 2898 /* Reserved bits should not be set */ 2899 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2900 return false; 2901 2902 /* AD, if set, should be supported */ 2903 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2904 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2905 return false; 2906 } 2907 2908 return true; 2909 } 2910 2911 /* 2912 * Checks related to VM-Execution Control Fields 2913 */ 2914 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2915 struct vmcs12 *vmcs12) 2916 { 2917 struct vcpu_vmx *vmx = to_vmx(vcpu); 2918 2919 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2920 vmx->nested.msrs.pinbased_ctls_low, 2921 vmx->nested.msrs.pinbased_ctls_high)) || 2922 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2923 vmx->nested.msrs.procbased_ctls_low, 2924 vmx->nested.msrs.procbased_ctls_high))) 2925 return -EINVAL; 2926 2927 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2928 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2929 vmx->nested.msrs.secondary_ctls_low, 2930 vmx->nested.msrs.secondary_ctls_high))) 2931 return -EINVAL; 2932 2933 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2934 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2935 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2936 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2937 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2938 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2939 nested_vmx_check_nmi_controls(vmcs12) || 2940 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2941 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2942 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2943 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2944 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2945 return -EINVAL; 2946 2947 if (!nested_cpu_has_preemption_timer(vmcs12) && 2948 nested_cpu_has_save_preemption_timer(vmcs12)) 2949 return -EINVAL; 2950 2951 if (nested_cpu_has_ept(vmcs12) && 2952 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2953 return 
-EINVAL; 2954 2955 if (nested_cpu_has_vmfunc(vmcs12)) { 2956 if (CC(vmcs12->vm_function_control & 2957 ~vmx->nested.msrs.vmfunc_controls)) 2958 return -EINVAL; 2959 2960 if (nested_cpu_has_eptp_switching(vmcs12)) { 2961 if (CC(!nested_cpu_has_ept(vmcs12)) || 2962 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2963 return -EINVAL; 2964 } 2965 } 2966 2967 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && 2968 CC(!vmcs12->tsc_multiplier)) 2969 return -EINVAL; 2970 2971 return 0; 2972 } 2973 2974 /* 2975 * Checks related to VM-Exit Control Fields 2976 */ 2977 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2978 struct vmcs12 *vmcs12) 2979 { 2980 struct vcpu_vmx *vmx = to_vmx(vcpu); 2981 2982 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2983 vmx->nested.msrs.exit_ctls_low, 2984 vmx->nested.msrs.exit_ctls_high)) || 2985 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2986 return -EINVAL; 2987 2988 return 0; 2989 } 2990 2991 /* 2992 * Checks related to VM-Entry Control Fields 2993 */ 2994 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2995 struct vmcs12 *vmcs12) 2996 { 2997 struct vcpu_vmx *vmx = to_vmx(vcpu); 2998 2999 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 3000 vmx->nested.msrs.entry_ctls_low, 3001 vmx->nested.msrs.entry_ctls_high))) 3002 return -EINVAL; 3003 3004 /* 3005 * From the Intel SDM, volume 3: 3006 * Fields relevant to VM-entry event injection must be set properly. 3007 * These fields are the VM-entry interruption-information field, the 3008 * VM-entry exception error code, and the VM-entry instruction length. 3009 */ 3010 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 3011 u32 intr_info = vmcs12->vm_entry_intr_info_field; 3012 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 3013 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 3014 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 3015 bool urg = nested_cpu_has2(vmcs12, 3016 SECONDARY_EXEC_UNRESTRICTED_GUEST); 3017 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 3018 3019 /* VM-entry interruption-info field: interruption type */ 3020 if (CC(intr_type == INTR_TYPE_RESERVED) || 3021 CC(intr_type == INTR_TYPE_OTHER_EVENT && 3022 !nested_cpu_supports_monitor_trap_flag(vcpu))) 3023 return -EINVAL; 3024 3025 /* VM-entry interruption-info field: vector */ 3026 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 3027 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 3028 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 3029 return -EINVAL; 3030 3031 /* 3032 * Cannot deliver error code in real mode or if the interrupt 3033 * type is not hardware exception. For other cases, do the 3034 * consistency check only if the vCPU doesn't enumerate 3035 * VMX_BASIC_NO_HW_ERROR_CODE_CC. 
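 * (For example, vector 14/#PF architecturally delivers an error code
 * while vector 3/#BP does not; x86_exception_has_error_code() encodes
 * that per-vector rule.)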
3036 */
3037 if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) {
3038 if (CC(has_error_code))
3039 return -EINVAL;
3040 } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) {
3041 if (CC(has_error_code != x86_exception_has_error_code(vector)))
3042 return -EINVAL;
3043 }
3044
3045 /* VM-entry exception error code */
3046 if (CC(has_error_code &&
3047 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
3048 return -EINVAL;
3049
3050 /* VM-entry interruption-info field: reserved bits */
3051 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
3052 return -EINVAL;
3053
3054 /* VM-entry instruction length */
3055 switch (intr_type) {
3056 case INTR_TYPE_SOFT_EXCEPTION:
3057 case INTR_TYPE_SOFT_INTR:
3058 case INTR_TYPE_PRIV_SW_EXCEPTION:
3059 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
3060 CC(vmcs12->vm_entry_instruction_len == 0 &&
3061 CC(!nested_cpu_has_zero_length_injection(vcpu))))
3062 return -EINVAL;
3063 }
3064 }
3065
3066 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
3067 return -EINVAL;
3068
3069 return 0;
3070 }
3071
3072 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
3073 struct vmcs12 *vmcs12)
3074 {
3075 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
3076 nested_check_vm_exit_controls(vcpu, vmcs12) ||
3077 nested_check_vm_entry_controls(vcpu, vmcs12))
3078 return -EINVAL;
3079
3080 #ifdef CONFIG_KVM_HYPERV
3081 if (guest_cpu_cap_has_evmcs(vcpu))
3082 return nested_evmcs_check_controls(vmcs12);
3083 #endif
3084
3085 return 0;
3086 }
3087
3088 static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
3089 struct vmcs12 *vmcs12)
3090 {
3091 void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
3092 u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;
3093
3094 /*
3095 * Don't bother with the consistency checks if KVM isn't configured to
3096 * WARN on missed consistency checks, as KVM needs to rely on hardware
3097 * to fully detect an illegal vTPR vs. TPR threshold combination due to
3098 * the vTPR being writable by L1 at all times (it's an in-memory value,
3099 * not a VMCS field). I.e. even if the check passes now, it might fail
3100 * at the actual VM-Enter.
3101 *
3102 * Keying off the module param also allows treating an invalid vAPIC
3103 * mapping as a consistency check failure without increasing the risk
3104 * of breaking a "real" VM.
3105 */
3106 if (!warn_on_missed_cc)
3107 return 0;
3108
3109 if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
3110 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
3111 !nested_cpu_has_vid(vmcs12) &&
3112 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
3113 (CC(!vapic) ||
3114 CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
3115 return -EINVAL;
3116
3117 return 0;
3118 }
3119
3120 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
3121 struct vmcs12 *vmcs12)
3122 {
3123 #ifdef CONFIG_X86_64
3124 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
3125 !!(vcpu->arch.efer & EFER_LMA)))
3126 return -EINVAL;
3127 #endif
3128 return 0;
3129 }
3130
3131 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
3132 {
3133 /*
3134 * Check that the given linear address is canonical after a VM exit
3135 * from L2, based on the HOST_CR4.LA57 value that will be loaded for L1.
3136 */
3137 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ?
57 : 48; 3138 3139 return !__is_canonical_address(la, l1_address_bits_on_exit); 3140 } 3141 3142 static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, 3143 u64 ssp, u64 ssp_tbl) 3144 { 3145 if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || 3146 CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) 3147 return -EINVAL; 3148 3149 return 0; 3150 } 3151 3152 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3153 struct vmcs12 *vmcs12) 3154 { 3155 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3156 3157 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3158 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3159 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3160 return -EINVAL; 3161 3162 if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) 3163 return -EINVAL; 3164 3165 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3166 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3167 return -EINVAL; 3168 3169 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3170 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3171 return -EINVAL; 3172 3173 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3174 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3175 vmcs12->host_ia32_perf_global_ctrl))) 3176 return -EINVAL; 3177 3178 if (ia32e) { 3179 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3180 return -EINVAL; 3181 } else { 3182 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3183 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3184 CC((vmcs12->host_rip) >> 32)) 3185 return -EINVAL; 3186 } 3187 3188 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3189 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3190 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3191 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3192 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3193 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3194 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3195 CC(vmcs12->host_cs_selector == 0) || 3196 CC(vmcs12->host_tr_selector == 0) || 3197 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3198 return -EINVAL; 3199 3200 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3201 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3202 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3203 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3204 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3205 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3206 return -EINVAL; 3207 3208 /* 3209 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3210 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3211 * the values of the LMA and LME bits in the field must each be that of 3212 * the host address-space size VM-exit control. 
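 * (E.g. for a 64-bit L1 the "host address-space size" control is 1, so
 * host_ia32_efer must have both LMA and LME set; for a 32-bit L1 both
 * must be clear.)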
3213 */ 3214 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3215 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3216 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3217 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3218 return -EINVAL; 3219 } 3220 3221 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { 3222 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, 3223 vmcs12->host_ssp, 3224 vmcs12->host_ssp_tbl)) 3225 return -EINVAL; 3226 3227 /* 3228 * IA32_S_CET and SSP must be canonical if the host will 3229 * enter 64-bit mode after VM-exit; otherwise, higher 3230 * 32-bits must be all 0s. 3231 */ 3232 if (ia32e) { 3233 if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || 3234 CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) 3235 return -EINVAL; 3236 } else { 3237 if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) 3238 return -EINVAL; 3239 } 3240 } 3241 3242 return 0; 3243 } 3244 3245 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3246 struct vmcs12 *vmcs12) 3247 { 3248 struct vcpu_vmx *vmx = to_vmx(vcpu); 3249 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3250 struct vmcs_hdr hdr; 3251 3252 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3253 return 0; 3254 3255 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3256 return -EINVAL; 3257 3258 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3259 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3260 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3261 return -EINVAL; 3262 3263 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3264 offsetof(struct vmcs12, hdr), 3265 sizeof(hdr)))) 3266 return -EINVAL; 3267 3268 if (CC(hdr.revision_id != VMCS12_REVISION) || 3269 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3270 return -EINVAL; 3271 3272 return 0; 3273 } 3274 3275 /* 3276 * Checks related to Guest Non-register State 3277 */ 3278 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3279 { 3280 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3281 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3282 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3283 return -EINVAL; 3284 3285 return 0; 3286 } 3287 3288 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3289 struct vmcs12 *vmcs12, 3290 enum vm_entry_failure_code *entry_failure_code) 3291 { 3292 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3293 3294 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3295 3296 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3297 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3298 return -EINVAL; 3299 3300 if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) 3301 return -EINVAL; 3302 3303 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3304 (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || 3305 CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) 3306 return -EINVAL; 3307 3308 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3309 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3310 return -EINVAL; 3311 3312 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3313 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3314 return -EINVAL; 3315 } 3316 3317 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3318 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3319 vmcs12->guest_ia32_perf_global_ctrl))) 3320 return -EINVAL; 3321 3322 if (CC((vmcs12->guest_cr0 & 
(X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3323 return -EINVAL; 3324 3325 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3326 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3327 return -EINVAL; 3328 3329 /* 3330 * If the load IA32_EFER VM-entry control is 1, the following checks 3331 * are performed on the field for the IA32_EFER MSR: 3332 * - Bits reserved in the IA32_EFER MSR must be 0. 3333 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3334 * the IA-32e mode guest VM-entry control. It must also be identical 3335 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3336 * CR0.PG) is 1. 3337 */ 3338 if (to_vmx(vcpu)->nested.nested_run_pending && 3339 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3340 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3341 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3342 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3343 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3344 return -EINVAL; 3345 } 3346 3347 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3348 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3349 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3350 return -EINVAL; 3351 3352 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { 3353 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, 3354 vmcs12->guest_ssp, 3355 vmcs12->guest_ssp_tbl)) 3356 return -EINVAL; 3357 3358 /* 3359 * Guest SSP must have 63:N bits identical, rather than 3360 * be canonical (i.e., 63:N-1 bits identical), where N is 3361 * the CPU's maximum linear-address width. Similar to 3362 * is_noncanonical_msr_address(), use the host's 3363 * linear-address width. E.g., with N = 48 a canonical value only needs bits 63:47 to be identical, hence the N + 1 passed to __is_canonical_address() below. 3364 */ 3365 if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) 3366 return -EINVAL; 3367 } 3368 3369 if (nested_check_guest_non_reg_state(vmcs12)) 3370 return -EINVAL; 3371 3372 return 0; 3373 } 3374 3375 #ifdef CONFIG_KVM_HYPERV 3376 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3377 { 3378 struct vcpu_vmx *vmx = to_vmx(vcpu); 3379 3380 /* 3381 * hv_evmcs may end up not being mapped after migration (when 3382 * L2 was running), map it here to make sure vmcs12 changes are 3383 * properly reflected. 3384 */ 3385 if (guest_cpu_cap_has_evmcs(vcpu) && 3386 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3387 enum nested_evmptrld_status evmptrld_status = 3388 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3389 3390 if (evmptrld_status == EVMPTRLD_VMFAIL || 3391 evmptrld_status == EVMPTRLD_ERROR) 3392 return false; 3393 3394 /* 3395 * Post migration, VMCS12 always provides the most up-to-date 3396 * information, copy it to the eVMCS upon entry. 3397 */ 3398 vmx->nested.need_vmcs12_to_shadow_sync = true; 3399 } 3400 3401 return true; 3402 } 3403 #endif 3404 3405 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3406 { 3407 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3409 struct kvm_host_map *map; 3410 3411 if (!vcpu->arch.pdptrs_from_userspace && 3412 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3413 /* 3414 * Reload the guest's PDPTRs since after a migration 3415 * the guest CR3 might be restored prior to setting the nested 3416 * state, which can lead to a load of wrong PDPTRs.
3417 */ 3418 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3419 return false; 3420 } 3421 3422 3423 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3424 map = &vmx->nested.apic_access_page_map; 3425 3426 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3427 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3428 } else { 3429 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3430 __func__); 3431 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3432 vcpu->run->internal.suberror = 3433 KVM_INTERNAL_ERROR_EMULATION; 3434 vcpu->run->internal.ndata = 0; 3435 return false; 3436 } 3437 } 3438 3439 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3440 map = &vmx->nested.virtual_apic_map; 3441 3442 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3443 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3444 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3445 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3446 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3447 /* 3448 * The processor will never use the TPR shadow, simply 3449 * clear the bit from the execution control. Such a 3450 * configuration is useless, but it happens in tests. 3451 * For any other configuration, failing the vm entry is 3452 * _not_ what the processor does but it's basically the 3453 * only possibility we have. 3454 */ 3455 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3456 } else { 3457 /* 3458 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3459 * force VM-Entry to fail. 3460 */ 3461 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3462 } 3463 } 3464 3465 if (nested_cpu_has_posted_intr(vmcs12)) { 3466 map = &vmx->nested.pi_desc_map; 3467 3468 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3469 vmx->nested.pi_desc = 3470 (struct pi_desc *)(((void *)map->hva) + 3471 offset_in_page(vmcs12->posted_intr_desc_addr)); 3472 vmcs_write64(POSTED_INTR_DESC_ADDR, 3473 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3474 } else { 3475 /* 3476 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3477 * access the contents of the VMCS12 posted interrupt 3478 * descriptor. (Note that KVM may do this when it 3479 * should not, per the architectural specification.) 3480 */ 3481 vmx->nested.pi_desc = NULL; 3482 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3483 } 3484 } 3485 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3486 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3487 else 3488 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3489 3490 return true; 3491 } 3492 3493 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3494 { 3495 #ifdef CONFIG_KVM_HYPERV 3496 /* 3497 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3498 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3499 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3500 * migration. 
3501 */ 3502 if (!nested_get_evmcs_page(vcpu)) { 3503 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3504 __func__); 3505 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3506 vcpu->run->internal.suberror = 3507 KVM_INTERNAL_ERROR_EMULATION; 3508 vcpu->run->internal.ndata = 0; 3509 3510 return false; 3511 } 3512 #endif 3513 3514 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3515 return false; 3516 3517 return true; 3518 } 3519 3520 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3521 { 3522 struct vmcs12 *vmcs12; 3523 struct vcpu_vmx *vmx = to_vmx(vcpu); 3524 gpa_t dst; 3525 3526 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3527 return 0; 3528 3529 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3530 return 1; 3531 3532 /* 3533 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3534 * set is already checked as part of A/D emulation. 3535 */ 3536 vmcs12 = get_vmcs12(vcpu); 3537 if (!nested_cpu_has_pml(vmcs12)) 3538 return 0; 3539 3540 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3541 vmx->nested.pml_full = true; 3542 return 1; 3543 } 3544 3545 gpa &= ~0xFFFull; 3546 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3547 3548 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3549 offset_in_page(dst), sizeof(gpa))) 3550 return 0; 3551 3552 vmcs12->guest_pml_index--; 3553 3554 return 0; 3555 } 3556 3557 /* 3558 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3559 * for running VMX instructions (except VMXON, whose prerequisites are 3560 * slightly different). It also specifies what exception to inject otherwise. 3561 * Note that many of these exceptions have priority over VM exits, so they 3562 * don't have to be checked again here. 3563 */ 3564 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3565 { 3566 if (!to_vmx(vcpu)->nested.vmxon) { 3567 kvm_queue_exception(vcpu, UD_VECTOR); 3568 return 0; 3569 } 3570 3571 if (vmx_get_cpl(vcpu)) { 3572 kvm_inject_gp(vcpu, 0); 3573 return 0; 3574 } 3575 3576 return 1; 3577 } 3578 3579 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3580 struct vmcs12 *vmcs12); 3581 3582 /* 3583 * If from_vmentry is false, this is being called from state restore (either RSM 3584 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
3585 * 3586 * Returns: 3587 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3588 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3589 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3590 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3591 */ 3592 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3593 bool from_vmentry) 3594 { 3595 struct vcpu_vmx *vmx = to_vmx(vcpu); 3596 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3597 enum vm_entry_failure_code entry_failure_code; 3598 union vmx_exit_reason exit_reason = { 3599 .basic = EXIT_REASON_INVALID_STATE, 3600 .failed_vmentry = 1, 3601 }; 3602 u32 failed_index; 3603 3604 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3605 vmx->nested.current_vmptr, 3606 vmcs12->guest_rip, 3607 vmcs12->guest_intr_status, 3608 vmcs12->vm_entry_intr_info_field, 3609 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3610 vmcs12->ept_pointer, 3611 vmcs12->guest_cr3, 3612 KVM_ISA_VMX); 3613 3614 kvm_service_local_tlb_flush_requests(vcpu); 3615 3616 if (!vmx->nested.nested_run_pending || 3617 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3618 vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); 3619 if (kvm_mpx_supported() && 3620 (!vmx->nested.nested_run_pending || 3621 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3622 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3623 3624 if (!vmx->nested.nested_run_pending || 3625 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 3626 vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, 3627 &vmx->nested.pre_vmenter_ssp, 3628 &vmx->nested.pre_vmenter_ssp_tbl); 3629 3630 /* 3631 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the 3632 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but 3633 * not KVM, KVM must unwind its software model to the pre-VM-Entry host 3634 * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not 3635 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to 3636 * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the 3637 * unwind naturally setting arch.cr3 to the correct value. Smashing 3638 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind, 3639 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be 3640 * overwritten with a shadow CR3 prior to re-entering L1. 
3641 */ 3642 if (!enable_ept) 3643 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3644 3645 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3646 3647 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3648 3649 if (from_vmentry) { 3650 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3651 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3652 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3653 } 3654 3655 if (nested_vmx_check_controls_late(vcpu, vmcs12)) { 3656 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3657 return NVMX_VMENTRY_VMFAIL; 3658 } 3659 3660 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3661 &entry_failure_code)) { 3662 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3663 vmcs12->exit_qualification = entry_failure_code; 3664 goto vmentry_fail_vmexit; 3665 } 3666 } 3667 3668 enter_guest_mode(vcpu); 3669 3670 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3671 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3672 vmcs12->exit_qualification = entry_failure_code; 3673 goto vmentry_fail_vmexit_guest_mode; 3674 } 3675 3676 if (from_vmentry) { 3677 failed_index = nested_vmx_load_msr(vcpu, 3678 vmcs12->vm_entry_msr_load_addr, 3679 vmcs12->vm_entry_msr_load_count); 3680 if (failed_index) { 3681 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3682 vmcs12->exit_qualification = failed_index; 3683 goto vmentry_fail_vmexit_guest_mode; 3684 } 3685 } else { 3686 /* 3687 * The MMU is not initialized to point at the right entities yet and 3688 * "get pages" would need to read data from the guest (i.e. we will 3689 * need to perform gpa to hpa translation). Request a call 3690 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3691 * have already been set at vmentry time and should not be reset. 3692 */ 3693 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3694 } 3695 3696 /* 3697 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3698 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3699 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3700 * unconditionally. Take care to pull data from vmcs01 as appropriate, 3701 * e.g. when checking for interrupt windows, as vmcs02 is now loaded. 3702 */ 3703 if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | 3704 CPU_BASED_NMI_WINDOW_EXITING)) || 3705 kvm_apic_has_pending_init_or_sipi(vcpu) || 3706 kvm_apic_has_interrupt(vcpu)) 3707 kvm_make_request(KVM_REQ_EVENT, vcpu); 3708 3709 /* 3710 * Do not start the preemption timer hrtimer until after we know 3711 * we are successful, so that only nested_vmx_vmexit needs to cancel 3712 * the timer. 3713 */ 3714 vmx->nested.preemption_timer_expired = false; 3715 if (nested_cpu_has_preemption_timer(vmcs12)) { 3716 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3717 vmx_start_preemption_timer(vcpu, timer_value); 3718 } 3719 3720 /* 3721 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3722 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3723 * returned as far as L1 is concerned. It will only return (and set 3724 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3725 */ 3726 return NVMX_VMENTRY_SUCCESS; 3727 3728 /* 3729 * A failed consistency check that leads to a VMExit during L1's 3730 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3731 * 26.7 "VM-entry failures during or after loading guest state". 
3732 */ 3733 vmentry_fail_vmexit_guest_mode: 3734 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3735 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3736 leave_guest_mode(vcpu); 3737 3738 vmentry_fail_vmexit: 3739 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3740 3741 if (!from_vmentry) 3742 return NVMX_VMENTRY_VMEXIT; 3743 3744 load_vmcs12_host_state(vcpu, vmcs12); 3745 vmcs12->vm_exit_reason = exit_reason.full; 3746 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3747 vmx->nested.need_vmcs12_to_shadow_sync = true; 3748 return NVMX_VMENTRY_VMEXIT; 3749 } 3750 3751 /* 3752 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3753 * for running an L2 nested guest. 3754 */ 3755 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3756 { 3757 struct vmcs12 *vmcs12; 3758 enum nvmx_vmentry_status status; 3759 struct vcpu_vmx *vmx = to_vmx(vcpu); 3760 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3761 enum nested_evmptrld_status evmptrld_status; 3762 3763 if (!nested_vmx_check_permission(vcpu)) 3764 return 1; 3765 3766 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3767 if (evmptrld_status == EVMPTRLD_ERROR) { 3768 kvm_queue_exception(vcpu, UD_VECTOR); 3769 return 1; 3770 } 3771 3772 kvm_pmu_branch_retired(vcpu); 3773 3774 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3775 return nested_vmx_failInvalid(vcpu); 3776 3777 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3778 vmx->nested.current_vmptr == INVALID_GPA)) 3779 return nested_vmx_failInvalid(vcpu); 3780 3781 vmcs12 = get_vmcs12(vcpu); 3782 3783 /* 3784 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3785 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3786 * rather than RFLAGS.ZF, and no error number is stored to the 3787 * VM-instruction error field. 3788 */ 3789 if (CC(vmcs12->hdr.shadow_vmcs)) 3790 return nested_vmx_failInvalid(vcpu); 3791 3792 if (nested_vmx_is_evmptr12_valid(vmx)) { 3793 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3794 3795 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3796 /* Enlightened VMCS doesn't have launch state */ 3797 vmcs12->launch_state = !launch; 3798 } else if (enable_shadow_vmcs) { 3799 copy_shadow_to_vmcs12(vmx); 3800 } 3801 3802 /* 3803 * The nested entry process starts with enforcing various prerequisites 3804 * on vmcs12 as required by the Intel SDM, and act appropriately when 3805 * they fail: As the SDM explains, some conditions should cause the 3806 * instruction to fail, while others will cause the instruction to seem 3807 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3808 * To speed up the normal (success) code path, we should avoid checking 3809 * for misconfigurations which will anyway be caught by the processor 3810 * when using the merged vmcs02. 3811 */ 3812 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3813 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3814 3815 if (CC(vmcs12->launch_state == launch)) 3816 return nested_vmx_fail(vcpu, 3817 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3818 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3819 3820 if (nested_vmx_check_controls(vcpu, vmcs12)) 3821 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3822 3823 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3824 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3825 3826 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3827 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3828 3829 /* 3830 * We're finally done with prerequisite checking, and can start with 3831 * the nested entry. 3832 */ 3833 vmx->nested.nested_run_pending = 1; 3834 vmx->nested.has_preemption_timer_deadline = false; 3835 status = nested_vmx_enter_non_root_mode(vcpu, true); 3836 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3837 goto vmentry_failed; 3838 3839 /* Hide L1D cache contents from the nested guest. */ 3840 kvm_request_l1tf_flush_l1d(); 3841 3842 /* 3843 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3844 * also be used as part of restoring nVMX state for 3845 * snapshot restore (migration). 3846 * 3847 * In this flow, it is assumed that vmcs12 cache was 3848 * transferred as part of captured nVMX state and should 3849 * therefore not be read from guest memory (which may not 3850 * exist on destination host yet). 3851 */ 3852 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3853 3854 switch (vmcs12->guest_activity_state) { 3855 case GUEST_ACTIVITY_HLT: 3856 /* 3857 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3858 * awakened by event injection or by an NMI-window VM-exit or 3859 * by an interrupt-window VM-exit, halt the vcpu. 3860 */ 3861 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3862 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3863 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3864 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3865 vmx->nested.nested_run_pending = 0; 3866 return kvm_emulate_halt_noskip(vcpu); 3867 } 3868 break; 3869 case GUEST_ACTIVITY_WAIT_SIPI: 3870 vmx->nested.nested_run_pending = 0; 3871 kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 3872 break; 3873 default: 3874 break; 3875 } 3876 3877 return 1; 3878 3879 vmentry_failed: 3880 vmx->nested.nested_run_pending = 0; 3881 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3882 return 0; 3883 if (status == NVMX_VMENTRY_VMEXIT) 3884 return 1; 3885 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3886 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3887 } 3888 3889 /* 3890 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3891 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3892 * This function returns the new value we should put in vmcs12.guest_cr0. 3893 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3894 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3895 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3896 * didn't trap the bit, because if L1 did, so would L0). 3897 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3898 * been modified by L2, and L1 knows it. So just leave the old value of 3899 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3900 * isn't relevant, because if L0 traps this bit it can set it to anything. 3901 * 3. Bits that L1 didn't trap, but L0 did. 
L1 believes the guest could have 3902 * changed these bits, and therefore they need to be updated, but L0 3903 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3904 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3905 */ 3906 static inline unsigned long 3907 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3908 { 3909 return 3910 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3911 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3912 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3913 vcpu->arch.cr0_guest_owned_bits)); 3914 } 3915 3916 static inline unsigned long 3917 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3918 { 3919 return 3920 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3921 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3922 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3923 vcpu->arch.cr4_guest_owned_bits)); 3924 } 3925 3926 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3927 struct vmcs12 *vmcs12, 3928 u32 vm_exit_reason, u32 exit_intr_info) 3929 { 3930 u32 idt_vectoring; 3931 unsigned int nr; 3932 3933 /* 3934 * Per the SDM, VM-Exits due to double and triple faults are never 3935 * considered to occur during event delivery, even if the double/triple 3936 * fault is the result of an escalating vectoring issue. 3937 * 3938 * Note, the SDM qualifies the double fault behavior with "The original 3939 * event results in a double-fault exception". It's unclear why the 3940 * qualification exists since exits due to double fault can occur only 3941 * while vectoring a different exception (injected events are never 3942 * subject to interception), i.e. there's _always_ an original event. 3943 * 3944 * The SDM also uses NMI as a confusing example for the "original event 3945 * causes the VM exit directly" clause. NMI isn't special in any way, 3946 * the same rule applies to all events that cause an exit directly. 3947 * NMI is an odd choice for the example because NMIs can only occur on 3948 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
3949 */ 3950 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3951 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3952 is_double_fault(exit_intr_info))) { 3953 vmcs12->idt_vectoring_info_field = 0; 3954 } else if (vcpu->arch.exception.injected) { 3955 nr = vcpu->arch.exception.vector; 3956 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3957 3958 if (kvm_exception_is_soft(nr)) { 3959 vmcs12->vm_exit_instruction_len = 3960 vcpu->arch.event_exit_inst_len; 3961 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3962 } else 3963 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3964 3965 if (vcpu->arch.exception.has_error_code) { 3966 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3967 vmcs12->idt_vectoring_error_code = 3968 vcpu->arch.exception.error_code; 3969 } 3970 3971 vmcs12->idt_vectoring_info_field = idt_vectoring; 3972 } else if (vcpu->arch.nmi_injected) { 3973 vmcs12->idt_vectoring_info_field = 3974 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3975 } else if (vcpu->arch.interrupt.injected) { 3976 nr = vcpu->arch.interrupt.nr; 3977 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3978 3979 if (vcpu->arch.interrupt.soft) { 3980 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3981 vmcs12->vm_entry_instruction_len = 3982 vcpu->arch.event_exit_inst_len; 3983 } else 3984 idt_vectoring |= INTR_TYPE_EXT_INTR; 3985 3986 vmcs12->idt_vectoring_info_field = idt_vectoring; 3987 } else { 3988 vmcs12->idt_vectoring_info_field = 0; 3989 } 3990 } 3991 3992 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3993 { 3994 struct vcpu_vmx *vmx = to_vmx(vcpu); 3995 int max_irr; 3996 void *vapic_page; 3997 u16 status; 3998 3999 if (!vmx->nested.pi_pending) 4000 return 0; 4001 4002 if (!vmx->nested.pi_desc) 4003 goto mmio_needed; 4004 4005 vmx->nested.pi_pending = false; 4006 4007 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4008 return 0; 4009 4010 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4011 if (max_irr > 0) { 4012 vapic_page = vmx->nested.virtual_apic_map.hva; 4013 if (!vapic_page) 4014 goto mmio_needed; 4015 4016 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 4017 vapic_page, &max_irr); 4018 status = vmcs_read16(GUEST_INTR_STATUS); 4019 if ((u8)max_irr > ((u8)status & 0xff)) { 4020 status &= ~0xff; 4021 status |= (u8)max_irr; 4022 vmcs_write16(GUEST_INTR_STATUS, status); 4023 } 4024 } 4025 4026 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); 4027 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); 4028 return 0; 4029 4030 mmio_needed: 4031 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 4032 return -ENXIO; 4033 } 4034 4035 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 4036 { 4037 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 4038 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 4039 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4040 unsigned long exit_qual; 4041 4042 if (ex->has_payload) { 4043 exit_qual = ex->payload; 4044 } else if (ex->vector == PF_VECTOR) { 4045 exit_qual = vcpu->arch.cr2; 4046 } else if (ex->vector == DB_VECTOR) { 4047 exit_qual = vcpu->arch.dr6; 4048 exit_qual &= ~DR6_BT; 4049 exit_qual ^= DR6_ACTIVE_LOW; 4050 } else { 4051 exit_qual = 0; 4052 } 4053 4054 /* 4055 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 4056 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 4057 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 
4058 */ 4059 if (ex->has_error_code && is_protmode(vcpu)) { 4060 /* 4061 * Intel CPUs do not generate error codes with bits 31:16 set, 4062 * and more importantly VMX disallows setting bits 31:16 in the 4063 * injected error code for VM-Entry. Drop the bits to mimic 4064 * hardware and avoid inducing failure on nested VM-Entry if L1 4065 * chooses to inject the exception back to L2. AMD CPUs _do_ 4066 * generate "full" 32-bit error codes, so KVM allows userspace 4067 * to inject exception error codes with bits 31:16 set. 4068 */ 4069 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 4070 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 4071 } 4072 4073 if (kvm_exception_is_soft(ex->vector)) 4074 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 4075 else 4076 intr_info |= INTR_TYPE_HARD_EXCEPTION; 4077 4078 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 4079 vmx_get_nmi_mask(vcpu)) 4080 intr_info |= INTR_INFO_UNBLOCK_NMI; 4081 4082 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 4083 } 4084 4085 /* 4086 * Returns true if a debug trap is (likely) pending delivery. Infer the class 4087 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 4088 * Using the payload is flawed because code breakpoints (fault-like) and data 4089 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 4090 * this will return false positives if a to-be-injected code breakpoint #DB is 4091 * pending (from KVM's perspective, but not "pending" across an instruction 4092 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 4093 * too is trap-like. 4094 * 4095 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4096 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4097 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4098 * from the emulator (because such #DBs are fault-like and thus don't trigger 4099 * actions that fire on instruction retire). 4100 */ 4101 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4102 { 4103 if (!ex->pending || ex->vector != DB_VECTOR) 4104 return 0; 4105 4106 /* General Detect #DBs are always fault-like. */ 4107 return ex->payload & ~DR6_BD; 4108 } 4109 4110 /* 4111 * Returns true if there's a pending #DB exception that is lower priority than 4112 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4113 * KVM, but could theoretically be injected by userspace. Note, this code is 4114 * imperfect, see above. 4115 */ 4116 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4117 { 4118 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4119 } 4120 4121 /* 4122 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4123 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 4124 * represents these debug traps with a payload that is said to be compatible 4125 * with the 'pending debug exceptions' field, write the payload to the VMCS 4126 * field if a VM-exit is delivered before the debug trap. 
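 *
 * As an editorial illustration (bit positions per the SDM, not derived
 * from this file): the payload follows the DR6 layout, and the 'pending
 * debug exceptions' field mirrors its low bits, e.g. B0-B3 occupy bits
 * 3:0 and BS (single-step) occupies bit 14 in both encodings, so a
 * pending single-step #DB with payload DR6_BS can be forwarded
 * verbatim, roughly:
 *
 *	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, DR6_BS);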
4127 */ 4128 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4129 { 4130 unsigned long pending_dbg; 4131 4132 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4133 if (pending_dbg) 4134 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4135 } 4136 4137 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4138 { 4139 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4140 to_vmx(vcpu)->nested.preemption_timer_expired; 4141 } 4142 4143 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4144 { 4145 struct vcpu_vmx *vmx = to_vmx(vcpu); 4146 void *vapic = vmx->nested.virtual_apic_map.hva; 4147 int max_irr, vppr; 4148 4149 if (nested_vmx_preemption_timer_pending(vcpu) || 4150 vmx->nested.mtf_pending) 4151 return true; 4152 4153 /* 4154 * Virtual Interrupt Delivery doesn't require manual injection. Either 4155 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4156 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4157 * the interrupt from the PIR to RVI prior to entering the guest. 4158 */ 4159 if (for_injection) 4160 return false; 4161 4162 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4163 __vmx_interrupt_blocked(vcpu)) 4164 return false; 4165 4166 if (!vapic) 4167 return false; 4168 4169 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4170 4171 max_irr = vmx_get_rvi(); 4172 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4173 return true; 4174 4175 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4176 pi_test_on(vmx->nested.pi_desc)) { 4177 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4178 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4179 return true; 4180 } 4181 4182 return false; 4183 } 4184 4185 /* 4186 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4187 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4188 * and less minor edits to splice in the priority of VMX Non-Root specific 4189 * events, e.g. MTF and NMI/INTR-window exiting. 
4190 * 4191 * 1 Hardware Reset and Machine Checks 4192 * - RESET 4193 * - Machine Check 4194 * 4195 * 2 Trap on Task Switch 4196 * - T flag in TSS is set (on task switch) 4197 * 4198 * 3 External Hardware Interventions 4199 * - FLUSH 4200 * - STOPCLK 4201 * - SMI 4202 * - INIT 4203 * 4204 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4205 * 4206 * 4 Traps on Previous Instruction 4207 * - Breakpoints 4208 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4209 * breakpoint, or #DB due to a split-lock access) 4210 * 4211 * 4.3 VMX-preemption timer expired VM-exit 4212 * 4213 * 4.6 NMI-window exiting VM-exit[2] 4214 * 4215 * 5 Nonmaskable Interrupts (NMI) 4216 * 4217 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4218 * 4219 * 6 Maskable Hardware Interrupts 4220 * 4221 * 7 Code Breakpoint Fault 4222 * 4223 * 8 Faults from Fetching Next Instruction 4224 * - Code-Segment Limit Violation 4225 * - Code Page Fault 4226 * - Control protection exception (missing ENDBRANCH at target of indirect 4227 * call or jump) 4228 * 4229 * 9 Faults from Decoding Next Instruction 4230 * - Instruction length > 15 bytes 4231 * - Invalid Opcode 4232 * - Coprocessor Not Available 4233 * 4234 *10 Faults on Executing Instruction 4235 * - Overflow 4236 * - Bound error 4237 * - Invalid TSS 4238 * - Segment Not Present 4239 * - Stack fault 4240 * - General Protection 4241 * - Data Page Fault 4242 * - Alignment Check 4243 * - x86 FPU Floating-point exception 4244 * - SIMD floating-point exception 4245 * - Virtualization exception 4246 * - Control protection exception 4247 * 4248 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4249 * INIT signals, and higher priority events take priority over MTF VM exits. 4250 * MTF VM exits take priority over debug-trap exceptions and lower priority 4251 * events. 4252 * 4253 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4254 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4255 * timer take priority over VM exits caused by the "NMI-window exiting" 4256 * VM-execution control and lower priority events. 4257 * 4258 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4259 * caused by "NMI-window exiting". VM exits caused by this control take 4260 * priority over non-maskable interrupts (NMIs) and lower priority events. 4261 * 4262 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4263 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4264 * non-maskable interrupts (NMIs) and higher priority events take priority over 4265 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4266 * priority over external interrupts and lower priority events. 4267 */ 4268 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4269 { 4270 struct kvm_lapic *apic = vcpu->arch.apic; 4271 struct vcpu_vmx *vmx = to_vmx(vcpu); 4272 /* 4273 * Only a pending nested run blocks a pending exception. If there is a 4274 * previously injected event, the pending exception occurred while said 4275 * event was being delivered and thus needs to be handled. 4276 */ 4277 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4278 /* 4279 * Events that don't require injection, i.e. that are virtualized by 4280 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need 4281 * to regain control in order to deliver the event, and hardware will 4282 * handle event ordering, e.g. 
with respect to injected exceptions. 4283 * 4284 * But, new events (not exceptions) are only recognized at instruction 4285 * boundaries. If an event needs reinjection, then KVM is handling a 4286 * VM-Exit that occurred _during_ instruction execution; new events, 4287 * irrespective of whether or not they're injected, are blocked until 4288 * the instruction completes. 4289 */ 4290 bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); 4291 /* 4292 * Inject events are blocked by nested VM-Enter, as KVM is responsible 4293 * for managing priority between concurrent events, i.e. KVM needs to 4294 * wait until after VM-Enter completes to deliver injected events. 4295 */ 4296 bool block_nested_events = block_nested_exceptions || 4297 block_non_injected_events; 4298 4299 if (lapic_in_kernel(vcpu) && 4300 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4301 if (block_nested_events) 4302 return -EBUSY; 4303 nested_vmx_update_pending_dbg(vcpu); 4304 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4305 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4306 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4307 4308 /* MTF is discarded if the vCPU is in WFS. */ 4309 vmx->nested.mtf_pending = false; 4310 return 0; 4311 } 4312 4313 if (lapic_in_kernel(vcpu) && 4314 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4315 if (block_nested_events) 4316 return -EBUSY; 4317 4318 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4319 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4320 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4321 apic->sipi_vector & 0xFFUL); 4322 return 0; 4323 } 4324 /* Fallthrough, the SIPI is completely ignored. */ 4325 } 4326 4327 /* 4328 * Process exceptions that are higher priority than Monitor Trap Flag: 4329 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4330 * could theoretically come in from userspace), and ICEBP (INT1). 4331 * 4332 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4333 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4334 * across SMI/RSM as it should; that needs to be addressed in order to 4335 * prioritize SMI over MTF and trap-like #DBs. 
4336 */ 4337 if (vcpu->arch.exception_vmexit.pending && 4338 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4339 if (block_nested_exceptions) 4340 return -EBUSY; 4341 4342 nested_vmx_inject_exception_vmexit(vcpu); 4343 return 0; 4344 } 4345 4346 if (vcpu->arch.exception.pending && 4347 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4348 if (block_nested_exceptions) 4349 return -EBUSY; 4350 goto no_vmexit; 4351 } 4352 4353 if (vmx->nested.mtf_pending) { 4354 if (block_nested_events) 4355 return -EBUSY; 4356 nested_vmx_update_pending_dbg(vcpu); 4357 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4358 return 0; 4359 } 4360 4361 if (vcpu->arch.exception_vmexit.pending) { 4362 if (block_nested_exceptions) 4363 return -EBUSY; 4364 4365 nested_vmx_inject_exception_vmexit(vcpu); 4366 return 0; 4367 } 4368 4369 if (vcpu->arch.exception.pending) { 4370 if (block_nested_exceptions) 4371 return -EBUSY; 4372 goto no_vmexit; 4373 } 4374 4375 if (nested_vmx_preemption_timer_pending(vcpu)) { 4376 if (block_nested_events) 4377 return -EBUSY; 4378 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4379 return 0; 4380 } 4381 4382 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4383 if (block_nested_events) 4384 return -EBUSY; 4385 goto no_vmexit; 4386 } 4387 4388 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4389 if (block_nested_events) 4390 return -EBUSY; 4391 if (!nested_exit_on_nmi(vcpu)) 4392 goto no_vmexit; 4393 4394 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4395 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4396 INTR_INFO_VALID_MASK, 0); 4397 /* 4398 * The NMI-triggered VM exit counts as injection: 4399 * clear this one and block further NMIs. 4400 */ 4401 vcpu->arch.nmi_pending = 0; 4402 vmx_set_nmi_mask(vcpu, true); 4403 return 0; 4404 } 4405 4406 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4407 int irq; 4408 4409 if (!nested_exit_on_intr(vcpu)) { 4410 if (block_nested_events) 4411 return -EBUSY; 4412 4413 goto no_vmexit; 4414 } 4415 4416 if (!nested_exit_intr_ack_set(vcpu)) { 4417 if (block_nested_events) 4418 return -EBUSY; 4419 4420 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4421 return 0; 4422 } 4423 4424 irq = kvm_cpu_get_extint(vcpu); 4425 if (irq != -1) { 4426 if (block_nested_events) 4427 return -EBUSY; 4428 4429 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4430 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4431 return 0; 4432 } 4433 4434 irq = kvm_apic_has_interrupt(vcpu); 4435 if (WARN_ON_ONCE(irq < 0)) 4436 goto no_vmexit; 4437 4438 /* 4439 * If the IRQ is L2's PI notification vector, process posted 4440 * interrupts for L2 instead of injecting VM-Exit, as the 4441 * detection/morphing architecturally occurs when the IRQ is 4442 * delivered to the CPU. Note, only interrupts that are routed 4443 * through the local APIC trigger posted interrupt processing, 4444 * and enabling posted interrupts requires ACK-on-exit. 4445 */ 4446 if (irq == vmx->nested.posted_intr_nv) { 4447 /* 4448 * Nested posted interrupts are delivered via RVI, i.e. 4449 * aren't injected by KVM, and so can be queued even if 4450 * manual event injection is disallowed. 
4451 */ 4452 if (block_non_injected_events) 4453 return -EBUSY; 4454 4455 vmx->nested.pi_pending = true; 4456 kvm_apic_clear_irr(vcpu, irq); 4457 goto no_vmexit; 4458 } 4459 4460 if (block_nested_events) 4461 return -EBUSY; 4462 4463 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4464 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4465 4466 /* 4467 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4468 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4469 * if APICv is active. 4470 */ 4471 kvm_apic_ack_interrupt(vcpu, irq); 4472 return 0; 4473 } 4474 4475 no_vmexit: 4476 return vmx_complete_nested_posted_interrupt(vcpu); 4477 } 4478 4479 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4480 { 4481 ktime_t remaining = 4482 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4483 u64 value; 4484 4485 if (ktime_to_ns(remaining) <= 0) 4486 return 0; 4487 4488 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4489 do_div(value, 1000000); 4490 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4491 } 4492 4493 static bool is_vmcs12_ext_field(unsigned long field) 4494 { 4495 switch (field) { 4496 case GUEST_ES_SELECTOR: 4497 case GUEST_CS_SELECTOR: 4498 case GUEST_SS_SELECTOR: 4499 case GUEST_DS_SELECTOR: 4500 case GUEST_FS_SELECTOR: 4501 case GUEST_GS_SELECTOR: 4502 case GUEST_LDTR_SELECTOR: 4503 case GUEST_TR_SELECTOR: 4504 case GUEST_ES_LIMIT: 4505 case GUEST_CS_LIMIT: 4506 case GUEST_SS_LIMIT: 4507 case GUEST_DS_LIMIT: 4508 case GUEST_FS_LIMIT: 4509 case GUEST_GS_LIMIT: 4510 case GUEST_LDTR_LIMIT: 4511 case GUEST_TR_LIMIT: 4512 case GUEST_GDTR_LIMIT: 4513 case GUEST_IDTR_LIMIT: 4514 case GUEST_ES_AR_BYTES: 4515 case GUEST_DS_AR_BYTES: 4516 case GUEST_FS_AR_BYTES: 4517 case GUEST_GS_AR_BYTES: 4518 case GUEST_LDTR_AR_BYTES: 4519 case GUEST_TR_AR_BYTES: 4520 case GUEST_ES_BASE: 4521 case GUEST_CS_BASE: 4522 case GUEST_SS_BASE: 4523 case GUEST_DS_BASE: 4524 case GUEST_FS_BASE: 4525 case GUEST_GS_BASE: 4526 case GUEST_LDTR_BASE: 4527 case GUEST_TR_BASE: 4528 case GUEST_GDTR_BASE: 4529 case GUEST_IDTR_BASE: 4530 case GUEST_PENDING_DBG_EXCEPTIONS: 4531 case GUEST_BNDCFGS: 4532 return true; 4533 default: 4534 break; 4535 } 4536 4537 return false; 4538 } 4539 4540 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4541 struct vmcs12 *vmcs12) 4542 { 4543 struct vcpu_vmx *vmx = to_vmx(vcpu); 4544 4545 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4546 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4547 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4548 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4549 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4550 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4551 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4552 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4553 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4554 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4555 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4556 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4557 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4558 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4559 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4560 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4561 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4562 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4563 
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4564 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4565 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4566 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4567 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4568 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4569 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4570 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4571 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4572 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4573 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4574 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4575 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4576 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4577 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4578 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4579 vmcs12->guest_pending_dbg_exceptions = 4580 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4581 4582 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4583 } 4584 4585 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4586 struct vmcs12 *vmcs12) 4587 { 4588 struct vcpu_vmx *vmx = to_vmx(vcpu); 4589 int cpu; 4590 4591 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4592 return; 4593 4594 4595 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4596 4597 cpu = get_cpu(); 4598 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4599 vmx_vcpu_load_vmcs(vcpu, cpu); 4600 4601 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4602 4603 vmx->loaded_vmcs = &vmx->vmcs01; 4604 vmx_vcpu_load_vmcs(vcpu, cpu); 4605 put_cpu(); 4606 } 4607 4608 /* 4609 * Update the guest state fields of vmcs12 to reflect changes that 4610 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4611 * VM-entry controls is also updated, since this is really a guest 4612 * state bit.) 4613 */ 4614 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4615 { 4616 struct vcpu_vmx *vmx = to_vmx(vcpu); 4617 4618 if (nested_vmx_is_evmptr12_valid(vmx)) 4619 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4620 4621 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4622 !nested_vmx_is_evmptr12_valid(vmx); 4623 4624 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4625 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4626 4627 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4628 vmcs12->guest_rip = kvm_rip_read(vcpu); 4629 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4630 4631 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4632 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4633 4634 vmcs12->guest_interruptibility_info = 4635 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4636 4637 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4638 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4639 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4640 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4641 else 4642 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4643 4644 if (nested_cpu_has_preemption_timer(vmcs12) && 4645 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4646 !vmx->nested.nested_run_pending) 4647 vmcs12->vmx_preemption_timer_value = 4648 vmx_get_preemption_timer_value(vcpu); 4649 4650 /* 4651 * In some cases (usually, nested EPT), L2 is allowed to change its 4652 * own CR3 without exiting. If it has changed it, we must keep it. 
4653 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4654 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4655 * 4656 * Additionally, restore L2's PDPTR to vmcs12. 4657 */ 4658 if (enable_ept) { 4659 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4660 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4661 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4662 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4663 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4664 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4665 } 4666 } 4667 4668 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4669 4670 if (nested_cpu_has_vid(vmcs12)) 4671 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4672 4673 vmcs12->vm_entry_controls = 4674 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4675 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4676 4677 /* 4678 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4679 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4680 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4681 * vmcs02 doesn't strictly track vmcs12. 4682 */ 4683 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4684 vmcs12->guest_dr7 = vcpu->arch.dr7; 4685 4686 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4687 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4688 4689 vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, 4690 &vmcs12->guest_ssp, 4691 &vmcs12->guest_ssp_tbl); 4692 } 4693 4694 /* 4695 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4696 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4697 * and this function updates it to reflect the changes to the guest state while 4698 * L2 was running (and perhaps made some exits which were handled directly by L0 4699 * without going back to L1), and to reflect the exit reason. 4700 * Note that we do not have to copy all VMCS fields here, just those that 4701 * could have been changed by the L2 guest or the exit - i.e., the guest-state and 4702 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4703 * which already writes to vmcs12 directly. 4704 */ 4705 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4706 u32 vm_exit_reason, u32 exit_intr_info, 4707 unsigned long exit_qualification, u32 exit_insn_len) 4708 { 4709 /* update exit information fields: */ 4710 vmcs12->vm_exit_reason = vm_exit_reason; 4711 if (vmx_get_exit_reason(vcpu).enclave_mode) 4712 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4713 vmcs12->exit_qualification = exit_qualification; 4714 4715 /* 4716 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4717 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4718 * exit info fields are unmodified. 4719 */ 4720 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4721 vmcs12->launch_state = 1; 4722 4723 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4724 * instead of reading the real value. */ 4725 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4726 4727 /* 4728 * Transfer the event that L0 or L1 may have wanted to inject into * L2 to IDT_VECTORING_INFO_FIELD.
4730 */ 4731 vmcs12_save_pending_event(vcpu, vmcs12, 4732 vm_exit_reason, exit_intr_info); 4733 4734 vmcs12->vm_exit_intr_info = exit_intr_info; 4735 vmcs12->vm_exit_instruction_len = exit_insn_len; 4736 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4737 4738 /* 4739 * According to spec, there's no need to store the guest's 4740 * MSRs if the exit is due to a VM-entry failure that occurs 4741 * during or after loading the guest state. Since this exit 4742 * does not fall in that category, we need to save the MSRs. 4743 */ 4744 if (nested_vmx_store_msr(vcpu, 4745 vmcs12->vm_exit_msr_store_addr, 4746 vmcs12->vm_exit_msr_store_count)) 4747 nested_vmx_abort(vcpu, 4748 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4749 } 4750 } 4751 4752 /* 4753 * A part of what we need to do when the nested L2 guest exits and we want to 4754 * run its L1 parent, is to reset L1's guest state to the host state specified 4755 * in vmcs12. 4756 * This function is to be called not only on normal nested exit, but also on 4757 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4758 * Failures During or After Loading Guest State"). 4759 * This function should be called when the active VMCS is L1's (vmcs01). 4760 */ 4761 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4762 struct vmcs12 *vmcs12) 4763 { 4764 enum vm_entry_failure_code ignored; 4765 struct kvm_segment seg; 4766 4767 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4768 vcpu->arch.efer = vmcs12->host_ia32_efer; 4769 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4770 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4771 else 4772 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4773 vmx_set_efer(vcpu, vcpu->arch.efer); 4774 4775 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4776 kvm_rip_write(vcpu, vmcs12->host_rip); 4777 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4778 vmx_set_interrupt_shadow(vcpu, 0); 4779 4780 /* 4781 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4782 * actually changed, because vmx_set_cr0 refers to the EFER set above. 4783 * 4784 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4785 * (KVM doesn't change it). 4786 */ 4787 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4788 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4789 4790 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4791 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4792 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4793 4794 nested_ept_uninit_mmu_context(vcpu); 4795 4796 /* 4797 * Only the PDPTE load can fail, as the value of cr3 was checked on entry and 4798 * couldn't have changed. 4799 */ 4800 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4801 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4802 4803 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4804 4805 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4806 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4807 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4808 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4809 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4810 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4811 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4812 4813 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4814 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4815 vmcs_write64(GUEST_BNDCFGS, 0); 4816 4817 /* 4818 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set;
4819 * otherwise CET state should be retained across VM-exit, i.e., 4820 * guest values should be propagated from vmcs12 to vmcs01. 4821 */ 4822 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) 4823 vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, 4824 vmcs12->host_ssp_tbl); 4825 else 4826 vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, 4827 vmcs12->guest_ssp_tbl); 4828 4829 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4830 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4831 vcpu->arch.pat = vmcs12->host_ia32_pat; 4832 } 4833 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4834 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4835 WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4836 vmcs12->host_ia32_perf_global_ctrl)); 4837 4838 /* Set L1 segment info according to Intel SDM 4839 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4840 seg = (struct kvm_segment) { 4841 .base = 0, 4842 .limit = 0xFFFFFFFF, 4843 .selector = vmcs12->host_cs_selector, 4844 .type = 11, 4845 .present = 1, 4846 .s = 1, 4847 .g = 1 4848 }; 4849 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4850 seg.l = 1; 4851 else 4852 seg.db = 1; 4853 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4854 seg = (struct kvm_segment) { 4855 .base = 0, 4856 .limit = 0xFFFFFFFF, 4857 .type = 3, 4858 .present = 1, 4859 .s = 1, 4860 .db = 1, 4861 .g = 1 4862 }; 4863 seg.selector = vmcs12->host_ds_selector; 4864 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4865 seg.selector = vmcs12->host_es_selector; 4866 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4867 seg.selector = vmcs12->host_ss_selector; 4868 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4869 seg.selector = vmcs12->host_fs_selector; 4870 seg.base = vmcs12->host_fs_base; 4871 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4872 seg.selector = vmcs12->host_gs_selector; 4873 seg.base = vmcs12->host_gs_base; 4874 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4875 seg = (struct kvm_segment) { 4876 .base = vmcs12->host_tr_base, 4877 .limit = 0x67, 4878 .selector = vmcs12->host_tr_selector, 4879 .type = 11, 4880 .present = 1 4881 }; 4882 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4883 4884 memset(&seg, 0, sizeof(seg)); 4885 seg.unusable = 1; 4886 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4887 4888 kvm_set_dr(vcpu, 7, 0x400); 4889 vmx_guest_debugctl_write(vcpu, 0); 4890 4891 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4892 vmcs12->vm_exit_msr_load_count)) 4893 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4894 4895 to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4896 } 4897 4898 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4899 { 4900 struct vmx_uret_msr *efer_msr; 4901 unsigned int i; 4902 4903 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4904 return vmcs_read64(GUEST_IA32_EFER); 4905 4906 if (cpu_has_load_ia32_efer()) 4907 return kvm_host.efer; 4908 4909 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4910 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4911 return vmx->msr_autoload.guest.val[i].value; 4912 } 4913 4914 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4915 if (efer_msr) 4916 return efer_msr->data; 4917 4918 return kvm_host.efer; 4919 } 4920 4921 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4922 { 4923 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4924 struct vcpu_vmx *vmx = to_vmx(vcpu); 4925 struct vmx_msr_entry g, h; 4926 gpa_t gpa; 4927 u32 
i, j; 4928 4929 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4930 4931 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4932 /* 4933 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4934 * as vmcs01.GUEST_DR7 contains a userspace defined value 4935 * and vcpu->arch.dr7 is not squirreled away before the 4936 * nested VMENTER (not worth adding a variable in nested_vmx). 4937 */ 4938 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4939 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4940 else 4941 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4942 } 4943 4944 /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ 4945 vmx_reload_guest_debugctl(vcpu); 4946 4947 /* 4948 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4949 * handle a variety of side effects to KVM's software model. 4950 */ 4951 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4952 4953 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4954 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4955 4956 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4957 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4958 4959 nested_ept_uninit_mmu_context(vcpu); 4960 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4961 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4962 4963 /* 4964 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4965 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4966 * VMFail; like everything else, we just need to ensure our 4967 * software model is up-to-date. 4968 */ 4969 if (enable_ept && is_pae_paging(vcpu)) 4970 ept_save_pdptrs(vcpu); 4971 4972 kvm_mmu_reset_context(vcpu); 4973 4974 /* 4975 * This nasty bit of open coding is a compromise between blindly 4976 * loading L1's MSRs using the exit load lists (incorrect emulation 4977 * of VMFail), leaving the nested VM's MSRs in the software model 4978 * (incorrect behavior) and snapshotting the modified MSRs (too 4979 * expensive since the lists are unbounded by hardware). For each 4980 * MSR that was (prematurely) loaded from the nested VMEntry load 4981 * list, reload it from the exit load list if it exists and differs 4982 * from the guest value. The intent is to stuff host state as 4983 * silently as possible, not to fully process the exit load list.
4984 */ 4985 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4986 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4987 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4988 pr_debug_ratelimited( 4989 "%s read MSR index failed (%u, 0x%08llx)\n", 4990 __func__, i, gpa); 4991 goto vmabort; 4992 } 4993 4994 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4995 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4996 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4997 pr_debug_ratelimited( 4998 "%s read MSR failed (%u, 0x%08llx)\n", 4999 __func__, j, gpa); 5000 goto vmabort; 5001 } 5002 if (h.index != g.index) 5003 continue; 5004 if (h.value == g.value) 5005 break; 5006 5007 if (nested_vmx_load_msr_check(vcpu, &h)) { 5008 pr_debug_ratelimited( 5009 "%s check failed (%u, 0x%x, 0x%x)\n", 5010 __func__, j, h.index, h.reserved); 5011 goto vmabort; 5012 } 5013 5014 if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { 5015 pr_debug_ratelimited( 5016 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 5017 __func__, j, h.index, h.value); 5018 goto vmabort; 5019 } 5020 } 5021 } 5022 5023 return; 5024 5025 vmabort: 5026 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 5027 } 5028 5029 /* 5030 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 5031 * and modify vmcs12 to make it see what it would expect to see there if 5032 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 5033 */ 5034 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 5035 u32 exit_intr_info, unsigned long exit_qualification, 5036 u32 exit_insn_len) 5037 { 5038 struct vcpu_vmx *vmx = to_vmx(vcpu); 5039 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5040 5041 /* Pending MTF traps are discarded on VM-Exit. */ 5042 vmx->nested.mtf_pending = false; 5043 5044 /* trying to cancel vmlaunch/vmresume is a bug */ 5045 WARN_ON_ONCE(vmx->nested.nested_run_pending); 5046 5047 #ifdef CONFIG_KVM_HYPERV 5048 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 5049 /* 5050 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 5051 * Enlightened VMCS after migration and we still need to 5052 * do that when something is forcing L2->L1 exit prior to 5053 * the first L2 run. 5054 */ 5055 (void)nested_get_evmcs_page(vcpu); 5056 } 5057 #endif 5058 5059 /* Service pending TLB flush requests for L2 before switching to L1. */ 5060 kvm_service_local_tlb_flush_requests(vcpu); 5061 5062 /* 5063 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 5064 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 5065 * up-to-date before switching to L1. 5066 */ 5067 if (enable_ept && is_pae_paging(vcpu)) 5068 vmx_ept_load_pdptrs(vcpu); 5069 5070 leave_guest_mode(vcpu); 5071 5072 if (nested_cpu_has_preemption_timer(vmcs12)) 5073 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 5074 5075 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 5076 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 5077 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 5078 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 5079 } 5080 5081 if (likely(!vmx->fail)) { 5082 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5083 5084 if (vm_exit_reason != -1) 5085 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 5086 exit_intr_info, exit_qualification, 5087 exit_insn_len); 5088 5089 /* 5090 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 5091 * also be used to capture vmcs12 cache as part of 5092 * capturing nVMX state for snapshot (migration). 
5093 * 5094 * Otherwise, this flush will dirty guest memory at a 5095 * point it is already assumed by user-space to be 5096 * immutable. 5097 */ 5098 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 5099 } else { 5100 /* 5101 * The only expected VM-instruction error is "VM entry with 5102 * invalid control field(s)." Anything else indicates a 5103 * problem with L0. 5104 */ 5105 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5106 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5107 5108 /* VM-Fail at VM-Entry means KVM missed a consistency check. */ 5109 WARN_ON_ONCE(warn_on_missed_cc); 5110 } 5111 5112 /* 5113 * Drop events/exceptions that were queued for re-injection to L2 5114 * (picked up via vmx_complete_interrupts()), as well as exceptions 5115 * that were pending for L2. Note, this must NOT be hoisted above 5116 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5117 * be captured in vmcs12 (see vmcs12_save_pending_event()). 5118 */ 5119 vcpu->arch.nmi_injected = false; 5120 kvm_clear_exception_queue(vcpu); 5121 kvm_clear_interrupt_queue(vcpu); 5122 5123 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5124 5125 kvm_nested_vmexit_handle_ibrs(vcpu); 5126 5127 /* 5128 * Update any VMCS fields that might have changed while vmcs02 was the 5129 * active VMCS. The tracking is per-vCPU, not per-VMCS. 5130 */ 5131 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); 5132 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5133 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5134 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5135 if (kvm_caps.has_tsc_control) 5136 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5137 5138 nested_put_vmcs12_pages(vcpu); 5139 5140 if ((vm_exit_reason != -1) && 5141 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5142 vmx->nested.need_vmcs12_to_shadow_sync = true; 5143 5144 /* in case we halted in L2 */ 5145 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5146 5147 if (likely(!vmx->fail)) { 5148 if (vm_exit_reason != -1) 5149 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5150 vmcs12->exit_qualification, 5151 vmcs12->idt_vectoring_info_field, 5152 vmcs12->vm_exit_intr_info, 5153 vmcs12->vm_exit_intr_error_code, 5154 KVM_ISA_VMX); 5155 5156 load_vmcs12_host_state(vcpu, vmcs12); 5157 5158 /* 5159 * Process events if an injectable IRQ or NMI is pending, even 5160 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5161 * If an event became pending while L2 was active, KVM needs to 5162 * either inject the event or request an IRQ/NMI window. SMIs 5163 * don't need to be processed as SMM is mutually exclusive with 5164 * non-root mode. INIT/SIPI don't need to be checked as INIT 5165 * is blocked post-VMXON, and SIPIs are ignored. 5166 */ 5167 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5168 kvm_make_request(KVM_REQ_EVENT, vcpu); 5169 return; 5170 } 5171 5172 /* 5173 * After an early L2 VM-entry failure, we're now back 5174 * in L1 which thinks it just finished a VMLAUNCH or 5175 * VMRESUME instruction, so we need to set the failure 5176 * flag and the VM-instruction error field of the VMCS 5177 * accordingly, and skip the emulated instruction. 5178 */ 5179 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5180 5181 /* 5182 * Restore L1's host state to KVM's software model. 
We're here 5183 * because a consistency check was caught by hardware, which 5184 * means some amount of guest state has been propagated to KVM's 5185 * model and needs to be unwound to the host's state. 5186 */ 5187 nested_vmx_restore_host_state(vcpu); 5188 5189 vmx->fail = 0; 5190 } 5191 5192 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5193 { 5194 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5195 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5196 } 5197 5198 /* 5199 * Decode the memory-address operand of a vmx instruction, as recorded on an 5200 * exit caused by such an instruction (run by a guest hypervisor). 5201 * On success, returns 0. When the operand is invalid, returns 1 and throws 5202 * #UD, #GP, or #SS. 5203 */ 5204 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5205 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5206 { 5207 gva_t off; 5208 bool exn; 5209 struct kvm_segment s; 5210 5211 /* 5212 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5213 * Execution", on an exit, vmx_instruction_info holds most of the 5214 * addressing components of the operand. Only the displacement part 5215 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5216 * For how an actual address is calculated from all these components, 5217 * refer to Vol. 1, "Operand Addressing". 5218 */ 5219 int scaling = vmx_instruction_info & 3; 5220 int addr_size = (vmx_instruction_info >> 7) & 7; 5221 bool is_reg = vmx_instruction_info & (1u << 10); 5222 int seg_reg = (vmx_instruction_info >> 15) & 7; 5223 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5224 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5225 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5226 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5227 5228 if (is_reg) { 5229 kvm_queue_exception(vcpu, UD_VECTOR); 5230 return 1; 5231 } 5232 5233 /* Addr = segment_base + offset */ 5234 /* offset = base + [index * scale] + displacement */ 5235 off = exit_qualification; /* holds the displacement */ 5236 if (addr_size == 1) 5237 off = (gva_t)sign_extend64(off, 31); 5238 else if (addr_size == 0) 5239 off = (gva_t)sign_extend64(off, 15); 5240 if (base_is_valid) 5241 off += kvm_register_read(vcpu, base_reg); 5242 if (index_is_valid) 5243 off += kvm_register_read(vcpu, index_reg) << scaling; 5244 vmx_get_segment(vcpu, &s, seg_reg); 5245 5246 /* 5247 * The effective address, i.e. @off, of a memory operand is truncated 5248 * based on the address size of the instruction. Note that this is 5249 * the *effective address*, i.e. the address prior to accounting for 5250 * the segment's base. 5251 */ 5252 if (addr_size == 1) /* 32 bit */ 5253 off &= 0xffffffff; 5254 else if (addr_size == 0) /* 16 bit */ 5255 off &= 0xffff; 5256 5257 /* Checks for #GP/#SS exceptions. */ 5258 exn = false; 5259 if (is_long_mode(vcpu)) { 5260 /* 5261 * The virtual/linear address is never truncated in 64-bit 5262 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5263 * address when using FS/GS with a non-zero base. 5264 */ 5265 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5266 *ret = s.base + off; 5267 else 5268 *ret = off; 5269 5270 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5271 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5272 * non-canonical form. This is the only check on the memory 5273 * destination for long mode! 
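* Note, the exception vector is chosen at the bottom of the function: #SS(0) for an SS-relative access, #GP(0) for all other segments.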
5274 */ 5275 exn = is_noncanonical_address(*ret, vcpu, 0); 5276 } else { 5277 /* 5278 * When not in long mode, the virtual/linear address is 5279 * unconditionally truncated to 32 bits regardless of the 5280 * address size. 5281 */ 5282 *ret = (s.base + off) & 0xffffffff; 5283 5284 /* Protected mode: apply checks for segment validity in the 5285 * following order: 5286 * - segment type check (#GP(0) may be thrown) 5287 * - usability check (#GP(0)/#SS(0)) 5288 * - limit check (#GP(0)/#SS(0)) 5289 */ 5290 if (wr) 5291 /* #GP(0) if the destination operand is located in a 5292 * read-only data segment or any code segment. 5293 */ 5294 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5295 else 5296 /* #GP(0) if the source operand is located in an 5297 * execute-only code segment 5298 */ 5299 exn = ((s.type & 0xa) == 8); 5300 if (exn) { 5301 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5302 return 1; 5303 } 5304 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5305 */ 5306 exn = (s.unusable != 0); 5307 5308 /* 5309 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5310 * outside the segment limit. All CPUs that support VMX ignore 5311 * limit checks for flat segments, i.e. segments with base==0, 5312 * limit==0xffffffff and of type expand-up data or code. 5313 */ 5314 if (!(s.base == 0 && s.limit == 0xffffffff && 5315 ((s.type & 8) || !(s.type & 4)))) 5316 exn = exn || ((u64)off + len - 1 > s.limit); 5317 } 5318 if (exn) { 5319 kvm_queue_exception_e(vcpu, 5320 seg_reg == VCPU_SREG_SS ? 5321 SS_VECTOR : GP_VECTOR, 5322 0); 5323 return 1; 5324 } 5325 5326 return 0; 5327 } 5328 5329 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5330 int *ret) 5331 { 5332 gva_t gva; 5333 struct x86_exception e; 5334 int r; 5335 5336 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5337 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5338 sizeof(*vmpointer), &gva)) { 5339 *ret = 1; 5340 return -EINVAL; 5341 } 5342 5343 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5344 if (r != X86EMUL_CONTINUE) { 5345 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5346 return -EINVAL; 5347 } 5348 5349 return 0; 5350 } 5351 5352 /* 5353 * Allocate a shadow VMCS and associate it with the currently loaded 5354 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5355 * VMCS is also VMCLEARed, so that it is ready for use. 5356 */ 5357 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5358 { 5359 struct vcpu_vmx *vmx = to_vmx(vcpu); 5360 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5361 5362 /* 5363 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5364 * when L1 executes VMXOFF or the vCPU is forced out of nested 5365 * operation. VMXON faults if the CPU is already post-VMXON, so it 5366 * should be impossible to already have an allocated shadow VMCS. KVM 5367 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5368 * always be the loaded VMCS. 
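* Note, if the WARN fires, KVM returns the existing shadow VMCS (which may be NULL) rather than allocating over it.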
5369 */ 5370 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5371 return loaded_vmcs->shadow_vmcs; 5372 5373 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5374 if (loaded_vmcs->shadow_vmcs) 5375 vmcs_clear(loaded_vmcs->shadow_vmcs); 5376 5377 return loaded_vmcs->shadow_vmcs; 5378 } 5379 5380 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5381 { 5382 struct vcpu_vmx *vmx = to_vmx(vcpu); 5383 int r; 5384 5385 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5386 if (r < 0) 5387 goto out_vmcs02; 5388 5389 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5390 if (!vmx->nested.cached_vmcs12) 5391 goto out_cached_vmcs12; 5392 5393 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5394 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5395 if (!vmx->nested.cached_shadow_vmcs12) 5396 goto out_cached_shadow_vmcs12; 5397 5398 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5399 goto out_shadow_vmcs; 5400 5401 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5402 HRTIMER_MODE_ABS_PINNED); 5403 5404 vmx->nested.vpid02 = allocate_vpid(); 5405 5406 vmx->nested.vmcs02_initialized = false; 5407 vmx->nested.vmxon = true; 5408 5409 if (vmx_pt_mode_is_host_guest()) { 5410 vmx->pt_desc.guest.ctl = 0; 5411 pt_update_intercept_for_msr(vcpu); 5412 } 5413 5414 return 0; 5415 5416 out_shadow_vmcs: 5417 kfree(vmx->nested.cached_shadow_vmcs12); 5418 5419 out_cached_shadow_vmcs12: 5420 kfree(vmx->nested.cached_vmcs12); 5421 5422 out_cached_vmcs12: 5423 free_loaded_vmcs(&vmx->nested.vmcs02); 5424 5425 out_vmcs02: 5426 return -ENOMEM; 5427 } 5428 5429 /* Emulate the VMXON instruction. */ 5430 static int handle_vmxon(struct kvm_vcpu *vcpu) 5431 { 5432 int ret; 5433 gpa_t vmptr; 5434 uint32_t revision; 5435 struct vcpu_vmx *vmx = to_vmx(vcpu); 5436 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5437 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5438 5439 /* 5440 * Manually perform the CR4.VMXE check, as KVM must force CR4.VMXE=1 to enter 5441 * the guest and so cannot rely on hardware to perform the check, 5442 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5443 * for VMXON). 5444 * 5445 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5446 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5447 * force any of the relevant guest state. For a restricted guest, KVM 5448 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5449 * Real Mode, and so there's no need to check CR0.PE manually. 5450 */ 5451 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5452 kvm_queue_exception(vcpu, UD_VECTOR); 5453 return 1; 5454 } 5455 5456 /* 5457 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5458 * and has higher priority than the VM-Fail due to being post-VMXON, 5459 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5460 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5461 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5462 * VMX non-root. 5463 * 5464 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5465 * #UD checks (see above), is functionally ok because KVM doesn't allow 5466 * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's 5467 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5468 * missed by hardware due to shadowing CR0 and/or CR4.
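* Note, the CPL check below is performed before the VMXON pointer operand is read from memory, i.e. an unprivileged VMXON never touches its memory operand.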
5469 */ 5470 if (vmx_get_cpl(vcpu)) { 5471 kvm_inject_gp(vcpu, 0); 5472 return 1; 5473 } 5474 5475 if (vmx->nested.vmxon) 5476 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5477 5478 /* 5479 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5480 * only if the vCPU isn't already in VMX operation, i.e. effectively 5481 * have lower priority than the VM-Fail above. 5482 */ 5483 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5484 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5485 kvm_inject_gp(vcpu, 0); 5486 return 1; 5487 } 5488 5489 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5490 != VMXON_NEEDED_FEATURES) { 5491 kvm_inject_gp(vcpu, 0); 5492 return 1; 5493 } 5494 5495 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5496 return ret; 5497 5498 /* 5499 * SDM 3: 24.11.5 5500 * The first 4 bytes of VMXON region contain the supported 5501 * VMCS revision identifier 5502 * 5503 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5504 * which replaces physical address width with 32 5505 */ 5506 if (!page_address_valid(vcpu, vmptr)) 5507 return nested_vmx_failInvalid(vcpu); 5508 5509 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5510 revision != VMCS12_REVISION) 5511 return nested_vmx_failInvalid(vcpu); 5512 5513 vmx->nested.vmxon_ptr = vmptr; 5514 ret = enter_vmx_operation(vcpu); 5515 if (ret) 5516 return ret; 5517 5518 return nested_vmx_succeed(vcpu); 5519 } 5520 5521 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5522 { 5523 struct vcpu_vmx *vmx = to_vmx(vcpu); 5524 5525 if (vmx->nested.current_vmptr == INVALID_GPA) 5526 return; 5527 5528 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5529 5530 if (enable_shadow_vmcs) { 5531 /* copy to memory all shadowed fields in case 5532 they were modified */ 5533 copy_shadow_to_vmcs12(vmx); 5534 vmx_disable_shadow_vmcs(vmx); 5535 } 5536 vmx->nested.posted_intr_nv = -1; 5537 5538 /* Flush VMCS12 to guest memory */ 5539 kvm_vcpu_write_guest_page(vcpu, 5540 vmx->nested.current_vmptr >> PAGE_SHIFT, 5541 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5542 5543 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5544 5545 vmx->nested.current_vmptr = INVALID_GPA; 5546 } 5547 5548 /* Emulate the VMXOFF instruction */ 5549 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5550 { 5551 if (!nested_vmx_check_permission(vcpu)) 5552 return 1; 5553 5554 free_nested(vcpu); 5555 5556 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5557 kvm_make_request(KVM_REQ_EVENT, vcpu); 5558 5559 return nested_vmx_succeed(vcpu); 5560 } 5561 5562 /* Emulate the VMCLEAR instruction */ 5563 static int handle_vmclear(struct kvm_vcpu *vcpu) 5564 { 5565 struct vcpu_vmx *vmx = to_vmx(vcpu); 5566 u32 zero = 0; 5567 gpa_t vmptr; 5568 int r; 5569 5570 if (!nested_vmx_check_permission(vcpu)) 5571 return 1; 5572 5573 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5574 return r; 5575 5576 if (!page_address_valid(vcpu, vmptr)) 5577 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5578 5579 if (vmptr == vmx->nested.vmxon_ptr) 5580 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5581 5582 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5583 if (vmptr == vmx->nested.current_vmptr) 5584 nested_release_vmcs12(vcpu); 5585 5586 /* 5587 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5588 * for VMCLEAR includes a "ensure that data for VMCS referenced 5589 * by the operand is in memory" clause that guards writes to 5590 * memory, 
i.e. doing nothing for I/O is architecturally valid. 5591 * 5592 * FIXME: Suppress failures if and only if no memslot is found, 5593 * i.e. exit to userspace if __copy_to_user() fails. 5594 */ 5595 (void)kvm_vcpu_write_guest(vcpu, 5596 vmptr + offsetof(struct vmcs12, 5597 launch_state), 5598 &zero, sizeof(zero)); 5599 } 5600 5601 return nested_vmx_succeed(vcpu); 5602 } 5603 5604 /* Emulate the VMLAUNCH instruction */ 5605 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5606 { 5607 return nested_vmx_run(vcpu, true); 5608 } 5609 5610 /* Emulate the VMRESUME instruction */ 5611 static int handle_vmresume(struct kvm_vcpu *vcpu) 5612 { 5613 5614 return nested_vmx_run(vcpu, false); 5615 } 5616 5617 static int handle_vmread(struct kvm_vcpu *vcpu) 5618 { 5619 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5620 : get_vmcs12(vcpu); 5621 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5622 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5623 struct vcpu_vmx *vmx = to_vmx(vcpu); 5624 struct x86_exception e; 5625 unsigned long field; 5626 u64 value; 5627 gva_t gva = 0; 5628 short offset; 5629 int len, r; 5630 5631 if (!nested_vmx_check_permission(vcpu)) 5632 return 1; 5633 5634 /* Decode instruction info and find the field to read */ 5635 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5636 5637 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5638 /* 5639 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5640 * any VMREAD sets the ALU flags for VMfailInvalid. 5641 */ 5642 if (vmx->nested.current_vmptr == INVALID_GPA || 5643 (is_guest_mode(vcpu) && 5644 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5645 return nested_vmx_failInvalid(vcpu); 5646 5647 offset = get_vmcs12_field_offset(field); 5648 if (offset < 0) 5649 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5650 5651 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5652 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5653 5654 /* Read the field, zero-extended to a u64 value */ 5655 value = vmcs12_read_any(vmcs12, field, offset); 5656 } else { 5657 /* 5658 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5659 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5660 * unsupported. Unfortunately, certain versions of Windows 11 5661 * don't comply with this requirement which is not enforced in 5662 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5663 * workaround, as misbehaving guests will panic on VM-Fail. 5664 * Note, enlightened VMCS is incompatible with shadow VMCS so 5665 * all VMREADs from L2 should go to L1. 5666 */ 5667 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5668 return nested_vmx_failInvalid(vcpu); 5669 5670 offset = evmcs_field_offset(field, NULL); 5671 if (offset < 0) 5672 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5673 5674 /* Read the field, zero-extended to a u64 value */ 5675 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5676 } 5677 5678 /* 5679 * Now copy part of this value to register or memory, as requested. 5680 * Note that the number of bits actually copied is 32 or 64 depending 5681 * on the guest's mode (32 or 64 bit), not on the given field's length. 5682 */ 5683 if (instr_info & BIT(10)) { 5684 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5685 } else { 5686 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5687 if (get_vmx_mem_address(vcpu, exit_qualification, 5688 instr_info, true, len, &gva)) 5689 return 1; 5690 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5691 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5692 if (r != X86EMUL_CONTINUE) 5693 return kvm_handle_memory_failure(vcpu, r, &e); 5694 } 5695 5696 return nested_vmx_succeed(vcpu); 5697 } 5698 5699 static bool is_shadow_field_rw(unsigned long field) 5700 { 5701 switch (field) { 5702 #define SHADOW_FIELD_RW(x, y) case x: 5703 #include "vmcs_shadow_fields.h" 5704 return true; 5705 default: 5706 break; 5707 } 5708 return false; 5709 } 5710 5711 static bool is_shadow_field_ro(unsigned long field) 5712 { 5713 switch (field) { 5714 #define SHADOW_FIELD_RO(x, y) case x: 5715 #include "vmcs_shadow_fields.h" 5716 return true; 5717 default: 5718 break; 5719 } 5720 return false; 5721 } 5722 5723 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5724 { 5725 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5726 : get_vmcs12(vcpu); 5727 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5728 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5729 struct vcpu_vmx *vmx = to_vmx(vcpu); 5730 struct x86_exception e; 5731 unsigned long field; 5732 short offset; 5733 gva_t gva; 5734 int len, r; 5735 5736 /* 5737 * The value to write might be 32 or 64 bits, depending on L1's long 5738 * mode, and eventually we need to write that into a field of several 5739 * possible lengths. The code below first zero-extends the value to 64 5740 * bit (value), and then copies only the appropriate number of 5741 * bits into the vmcs12 field. 5742 */ 5743 u64 value = 0; 5744 5745 if (!nested_vmx_check_permission(vcpu)) 5746 return 1; 5747 5748 /* 5749 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5750 * any VMWRITE sets the ALU flags for VMfailInvalid. 5751 */ 5752 if (vmx->nested.current_vmptr == INVALID_GPA || 5753 (is_guest_mode(vcpu) && 5754 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5755 return nested_vmx_failInvalid(vcpu); 5756 5757 if (instr_info & BIT(10)) 5758 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5759 else { 5760 len = is_64_bit_mode(vcpu) ? 8 : 4; 5761 if (get_vmx_mem_address(vcpu, exit_qualification, 5762 instr_info, false, len, &gva)) 5763 return 1; 5764 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5765 if (r != X86EMUL_CONTINUE) 5766 return kvm_handle_memory_failure(vcpu, r, &e); 5767 } 5768 5769 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5770 5771 offset = get_vmcs12_field_offset(field); 5772 if (offset < 0) 5773 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5774 5775 /* 5776 * If the vCPU supports "VMWRITE to any supported field in the 5777 * VMCS," then the "read-only" fields are actually read/write. 5778 */ 5779 if (vmcs_field_readonly(field) && 5780 !nested_cpu_has_vmwrite_any_field(vcpu)) 5781 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5782 5783 /* 5784 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5785 * vmcs12, else we may crush a field or consume a stale value. 5786 */ 5787 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5788 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5789 5790 /* 5791 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5792 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5793 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 5794 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5795 * from L1 will return a different value than VMREAD from L2 (L1 sees 5796 * the stripped down value, L2 sees the full value as stored by KVM). 5797 */ 5798 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5799 value &= 0x1f0ff; 5800 5801 vmcs12_write_any(vmcs12, field, offset, value); 5802 5803 /* 5804 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5805 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5806 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5807 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5808 */ 5809 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5810 /* 5811 * L1 can read these fields without exiting, ensure the 5812 * shadow VMCS is up-to-date. 5813 */ 5814 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5815 preempt_disable(); 5816 vmcs_load(vmx->vmcs01.shadow_vmcs); 5817 5818 __vmcs_writel(field, value); 5819 5820 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5821 vmcs_load(vmx->loaded_vmcs->vmcs); 5822 preempt_enable(); 5823 } 5824 vmx->nested.dirty_vmcs12 = true; 5825 } 5826 5827 return nested_vmx_succeed(vcpu); 5828 } 5829 5830 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5831 { 5832 vmx->nested.current_vmptr = vmptr; 5833 if (enable_shadow_vmcs) { 5834 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5835 vmcs_write64(VMCS_LINK_POINTER, 5836 __pa(vmx->vmcs01.shadow_vmcs)); 5837 vmx->nested.need_vmcs12_to_shadow_sync = true; 5838 } 5839 vmx->nested.dirty_vmcs12 = true; 5840 vmx->nested.force_msr_bitmap_recalc = true; 5841 } 5842 5843 /* Emulate the VMPTRLD instruction */ 5844 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5845 { 5846 struct vcpu_vmx *vmx = to_vmx(vcpu); 5847 gpa_t vmptr; 5848 int r; 5849 5850 if (!nested_vmx_check_permission(vcpu)) 5851 return 1; 5852 5853 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5854 return r; 5855 5856 if (!page_address_valid(vcpu, vmptr)) 5857 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5858 5859 if (vmptr == vmx->nested.vmxon_ptr) 5860 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5861 5862 /* Forbid normal VMPTRLD if Enlightened version was used */ 5863 if (nested_vmx_is_evmptr12_valid(vmx)) 5864 return 1; 5865 5866 if (vmx->nested.current_vmptr != vmptr) { 5867 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5868 struct vmcs_hdr hdr; 5869 5870 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5871 /* 5872 * Reads from an unbacked page return all 1s, 5873 * which means that the 32 bits located at the 5874 * given physical address won't match the required 5875 * VMCS12_REVISION identifier. 5876 */ 5877 return nested_vmx_fail(vcpu, 5878 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5879 } 5880 5881 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5882 offsetof(struct vmcs12, hdr), 5883 sizeof(hdr))) { 5884 return nested_vmx_fail(vcpu, 5885 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5886 } 5887 5888 if (hdr.revision_id != VMCS12_REVISION || 5889 (hdr.shadow_vmcs && 5890 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5891 return nested_vmx_fail(vcpu, 5892 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5893 } 5894 5895 nested_release_vmcs12(vcpu); 5896 5897 /* 5898 * Load VMCS12 from guest memory since it is not already 5899 * cached. 
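* Note, a failed read here is reported to L1 as an incorrect VMCS revision ID, matching the unbacked-page case above, rather than being treated as a KVM-internal error.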
5900 */ 5901 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5902 VMCS12_SIZE)) { 5903 return nested_vmx_fail(vcpu, 5904 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5905 } 5906 5907 set_current_vmptr(vmx, vmptr); 5908 } 5909 5910 return nested_vmx_succeed(vcpu); 5911 } 5912 5913 /* Emulate the VMPTRST instruction */ 5914 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5915 { 5916 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5917 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5918 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5919 struct x86_exception e; 5920 gva_t gva; 5921 int r; 5922 5923 if (!nested_vmx_check_permission(vcpu)) 5924 return 1; 5925 5926 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5927 return 1; 5928 5929 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5930 true, sizeof(gpa_t), &gva)) 5931 return 1; 5932 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5933 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5934 sizeof(gpa_t), &e); 5935 if (r != X86EMUL_CONTINUE) 5936 return kvm_handle_memory_failure(vcpu, r, &e); 5937 5938 return nested_vmx_succeed(vcpu); 5939 } 5940 5941 /* Emulate the INVEPT instruction */ 5942 static int handle_invept(struct kvm_vcpu *vcpu) 5943 { 5944 struct vcpu_vmx *vmx = to_vmx(vcpu); 5945 u32 vmx_instruction_info, types; 5946 unsigned long type, roots_to_free; 5947 struct kvm_mmu *mmu; 5948 gva_t gva; 5949 struct x86_exception e; 5950 struct { 5951 u64 eptp, gpa; 5952 } operand; 5953 int i, r, gpr_index; 5954 5955 if (!(vmx->nested.msrs.secondary_ctls_high & 5956 SECONDARY_EXEC_ENABLE_EPT) || 5957 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5958 kvm_queue_exception(vcpu, UD_VECTOR); 5959 return 1; 5960 } 5961 5962 if (!nested_vmx_check_permission(vcpu)) 5963 return 1; 5964 5965 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5966 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5967 type = kvm_register_read(vcpu, gpr_index); 5968 5969 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5970 5971 if (type >= 32 || !(types & (1 << type))) 5972 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5973 5974 /* According to the Intel VMX instruction reference, the memory 5975 * operand is read even if it isn't needed (e.g., for type==global) 5976 */ 5977 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5978 vmx_instruction_info, false, sizeof(operand), &gva)) 5979 return 1; 5980 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5981 if (r != X86EMUL_CONTINUE) 5982 return kvm_handle_memory_failure(vcpu, r, &e); 5983 5984 /* 5985 * Nested EPT roots are always held through guest_mmu, 5986 * not root_mmu.
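* i.e. the roots freed below are the nested EPT roots that KVM tracks on L1's behalf.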
5987 */ 5988 mmu = &vcpu->arch.guest_mmu; 5989 5990 switch (type) { 5991 case VMX_EPT_EXTENT_CONTEXT: 5992 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5993 return nested_vmx_fail(vcpu, 5994 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5995 5996 roots_to_free = 0; 5997 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5998 operand.eptp)) 5999 roots_to_free |= KVM_MMU_ROOT_CURRENT; 6000 6001 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 6002 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 6003 mmu->prev_roots[i].pgd, 6004 operand.eptp)) 6005 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 6006 } 6007 break; 6008 case VMX_EPT_EXTENT_GLOBAL: 6009 roots_to_free = KVM_MMU_ROOTS_ALL; 6010 break; 6011 default: 6012 BUG(); 6013 break; 6014 } 6015 6016 if (roots_to_free) 6017 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 6018 6019 return nested_vmx_succeed(vcpu); 6020 } 6021 6022 static int handle_invvpid(struct kvm_vcpu *vcpu) 6023 { 6024 struct vcpu_vmx *vmx = to_vmx(vcpu); 6025 u32 vmx_instruction_info; 6026 unsigned long type, types; 6027 gva_t gva; 6028 struct x86_exception e; 6029 struct { 6030 u64 vpid; 6031 u64 gla; 6032 } operand; 6033 u16 vpid02; 6034 int r, gpr_index; 6035 6036 if (!(vmx->nested.msrs.secondary_ctls_high & 6037 SECONDARY_EXEC_ENABLE_VPID) || 6038 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 6039 kvm_queue_exception(vcpu, UD_VECTOR); 6040 return 1; 6041 } 6042 6043 if (!nested_vmx_check_permission(vcpu)) 6044 return 1; 6045 6046 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6047 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6048 type = kvm_register_read(vcpu, gpr_index); 6049 6050 types = (vmx->nested.msrs.vpid_caps & 6051 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 6052 6053 if (type >= 32 || !(types & (1 << type))) 6054 return nested_vmx_fail(vcpu, 6055 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6056 6057 /* according to the intel vmx instruction reference, the memory 6058 * operand is read even if it isn't needed (e.g., for type==global) 6059 */ 6060 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6061 vmx_instruction_info, false, sizeof(operand), &gva)) 6062 return 1; 6063 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6064 if (r != X86EMUL_CONTINUE) 6065 return kvm_handle_memory_failure(vcpu, r, &e); 6066 6067 if (operand.vpid >> 16) 6068 return nested_vmx_fail(vcpu, 6069 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6070 6071 /* 6072 * Always flush the effective vpid02, i.e. never flush the current VPID 6073 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6074 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6075 * irrelevant (and there may not be a loaded vmcs12). 6076 */ 6077 vpid02 = nested_get_vpid02(vcpu); 6078 switch (type) { 6079 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6080 /* 6081 * LAM doesn't apply to addresses that are inputs to TLB 6082 * invalidation. 
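* i.e. the supplied GLA is validated and used verbatim for the single-address flush below, without any LAM untagging.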
6083 */ 6084 if (!operand.vpid || 6085 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6086 return nested_vmx_fail(vcpu, 6087 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6088 vpid_sync_vcpu_addr(vpid02, operand.gla); 6089 break; 6090 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6091 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6092 if (!operand.vpid) 6093 return nested_vmx_fail(vcpu, 6094 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6095 vpid_sync_context(vpid02); 6096 break; 6097 case VMX_VPID_EXTENT_ALL_CONTEXT: 6098 vpid_sync_context(vpid02); 6099 break; 6100 default: 6101 WARN_ON_ONCE(1); 6102 return kvm_skip_emulated_instruction(vcpu); 6103 } 6104 6105 /* 6106 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6107 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6108 * roots as VPIDs are not tracked in the MMU role. 6109 * 6110 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6111 * an MMU when EPT is disabled. 6112 * 6113 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 6114 */ 6115 if (!enable_ept) 6116 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6117 6118 return nested_vmx_succeed(vcpu); 6119 } 6120 6121 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6122 struct vmcs12 *vmcs12) 6123 { 6124 u32 index = kvm_rcx_read(vcpu); 6125 u64 new_eptp; 6126 6127 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6128 return 1; 6129 if (index >= VMFUNC_EPTP_ENTRIES) 6130 return 1; 6131 6132 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6133 &new_eptp, index * 8, 8)) 6134 return 1; 6135 6136 /* 6137 * If the (L2) guest does a vmfunc to the currently 6138 * active ept pointer, we don't have to do anything else 6139 */ 6140 if (vmcs12->ept_pointer != new_eptp) { 6141 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6142 return 1; 6143 6144 vmcs12->ept_pointer = new_eptp; 6145 nested_ept_new_eptp(vcpu); 6146 6147 if (!nested_cpu_has_vpid(vmcs12)) 6148 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6149 } 6150 6151 return 0; 6152 } 6153 6154 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6155 { 6156 struct vcpu_vmx *vmx = to_vmx(vcpu); 6157 struct vmcs12 *vmcs12; 6158 u32 function = kvm_rax_read(vcpu); 6159 6160 /* 6161 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6162 * VMFUNC for nested VMs, but not for L1. 6163 */ 6164 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6165 kvm_queue_exception(vcpu, UD_VECTOR); 6166 return 1; 6167 } 6168 6169 vmcs12 = get_vmcs12(vcpu); 6170 6171 /* 6172 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6173 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 6174 */ 6175 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6176 kvm_queue_exception(vcpu, UD_VECTOR); 6177 return 1; 6178 } 6179 6180 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6181 goto fail; 6182 6183 switch (function) { 6184 case 0: 6185 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6186 goto fail; 6187 break; 6188 default: 6189 goto fail; 6190 } 6191 return kvm_skip_emulated_instruction(vcpu); 6192 6193 fail: 6194 /* 6195 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6196 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6197 * EXIT_REASON_VMFUNC as the exit reason. 
6198 */ 6199 nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, 6200 vmx_get_intr_info(vcpu), 6201 vmx_get_exit_qual(vcpu)); 6202 return 1; 6203 } 6204 6205 /* 6206 * Return true if an IO instruction with the specified port and size should cause 6207 * a VM-exit into L1. 6208 */ 6209 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6210 int size) 6211 { 6212 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6213 gpa_t bitmap, last_bitmap; 6214 u8 b; 6215 6216 last_bitmap = INVALID_GPA; 6217 b = -1; 6218 6219 while (size > 0) { 6220 if (port < 0x8000) 6221 bitmap = vmcs12->io_bitmap_a; 6222 else if (port < 0x10000) 6223 bitmap = vmcs12->io_bitmap_b; 6224 else 6225 return true; 6226 bitmap += (port & 0x7fff) / 8; 6227 6228 if (last_bitmap != bitmap) 6229 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6230 return true; 6231 if (b & (1 << (port & 7))) 6232 return true; 6233 6234 port++; 6235 size--; 6236 last_bitmap = bitmap; 6237 } 6238 6239 return false; 6240 } 6241 6242 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6243 struct vmcs12 *vmcs12) 6244 { 6245 unsigned long exit_qualification; 6246 unsigned short port; 6247 int size; 6248 6249 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6250 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6251 6252 exit_qualification = vmx_get_exit_qual(vcpu); 6253 6254 port = exit_qualification >> 16; 6255 size = (exit_qualification & 7) + 1; 6256 6257 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6258 } 6259 6260 /* 6261 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6262 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6263 * disinterest in the current event (read or write a specific MSR) by using an 6264 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6265 */ 6266 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6267 struct vmcs12 *vmcs12, 6268 union vmx_exit_reason exit_reason) 6269 { 6270 u32 msr_index; 6271 gpa_t bitmap; 6272 6273 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6274 return true; 6275 6276 if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || 6277 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6278 msr_index = vmx_get_exit_qual(vcpu); 6279 else 6280 msr_index = kvm_rcx_read(vcpu); 6281 6282 /* 6283 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6284 * for the four combinations of read/write and low/high MSR numbers. 6285 * First we need to figure out which of the four to use: 6286 */ 6287 bitmap = vmcs12->msr_bitmap; 6288 if (exit_reason.basic == EXIT_REASON_MSR_WRITE || 6289 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6290 bitmap += 2048; 6291 if (msr_index >= 0xc0000000) { 6292 msr_index -= 0xc0000000; 6293 bitmap += 1024; 6294 } 6295 6296 /* Then read the msr_index'th bit from this bitmap: */ 6297 if (msr_index < 1024*8) { 6298 unsigned char b; 6299 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6300 return true; 6301 return 1 & (b >> (msr_index & 7)); 6302 } else 6303 return true; /* let L1 handle the wrong parameter */ 6304 } 6305 6306 /* 6307 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6308 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6309 * intercept (via guest_host_mask etc.) the current event. 
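* For CR0/CR4, a bit set in the guest/host mask is owned by L1, so an access that would change such a bit relative to the read shadow is reflected, e.g. a MOV to CR0 that changes TS relative to cr0_read_shadow when cr0_guest_host_mask has X86_CR0_TS set.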
6310 */ 6311 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6312 struct vmcs12 *vmcs12) 6313 { 6314 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6315 int cr = exit_qualification & 15; 6316 int reg; 6317 unsigned long val; 6318 6319 switch ((exit_qualification >> 4) & 3) { 6320 case 0: /* mov to cr */ 6321 reg = (exit_qualification >> 8) & 15; 6322 val = kvm_register_read(vcpu, reg); 6323 switch (cr) { 6324 case 0: 6325 if (vmcs12->cr0_guest_host_mask & 6326 (val ^ vmcs12->cr0_read_shadow)) 6327 return true; 6328 break; 6329 case 3: 6330 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6331 return true; 6332 break; 6333 case 4: 6334 if (vmcs12->cr4_guest_host_mask & 6335 (vmcs12->cr4_read_shadow ^ val)) 6336 return true; 6337 break; 6338 case 8: 6339 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6340 return true; 6341 break; 6342 } 6343 break; 6344 case 2: /* clts */ 6345 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6346 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6347 return true; 6348 break; 6349 case 1: /* mov from cr */ 6350 switch (cr) { 6351 case 3: 6352 if (vmcs12->cpu_based_vm_exec_control & 6353 CPU_BASED_CR3_STORE_EXITING) 6354 return true; 6355 break; 6356 case 8: 6357 if (vmcs12->cpu_based_vm_exec_control & 6358 CPU_BASED_CR8_STORE_EXITING) 6359 return true; 6360 break; 6361 } 6362 break; 6363 case 3: /* lmsw */ 6364 /* 6365 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6366 * cr0. Other attempted changes are ignored, with no exit. 6367 */ 6368 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6369 if (vmcs12->cr0_guest_host_mask & 0xe & 6370 (val ^ vmcs12->cr0_read_shadow)) 6371 return true; 6372 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6373 !(vmcs12->cr0_read_shadow & 0x1) && 6374 (val & 0x1)) 6375 return true; 6376 break; 6377 } 6378 return false; 6379 } 6380 6381 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6382 struct vmcs12 *vmcs12) 6383 { 6384 u32 encls_leaf; 6385 6386 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6387 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6388 return false; 6389 6390 encls_leaf = kvm_rax_read(vcpu); 6391 if (encls_leaf > 62) 6392 encls_leaf = 63; 6393 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6394 } 6395 6396 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6397 struct vmcs12 *vmcs12, gpa_t bitmap) 6398 { 6399 u32 vmx_instruction_info; 6400 unsigned long field; 6401 u8 b; 6402 6403 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6404 return true; 6405 6406 /* Decode instruction info and find the field to access */ 6407 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6408 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6409 6410 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6411 if (field >> 15) 6412 return true; 6413 6414 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6415 return true; 6416 6417 return 1 & (b >> (field & 7)); 6418 } 6419 6420 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6421 { 6422 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6423 6424 if (nested_cpu_has_mtf(vmcs12)) 6425 return true; 6426 6427 /* 6428 * An MTF VM-exit may be injected into the guest by setting the 6429 * interruption-type to 7 (other event) and the vector field to 0. Such 6430 * is the case regardless of the 'monitor trap flag' VM-execution 6431 * control. 
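* hence the check of vm_entry_intr_info_field below even when the 'monitor trap flag' control is clear.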
6432 */ 6433 return entry_intr_info == (INTR_INFO_VALID_MASK 6434 | INTR_TYPE_OTHER_EVENT); 6435 } 6436 6437 /* 6438 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6439 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6440 */ 6441 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6442 union vmx_exit_reason exit_reason) 6443 { 6444 u32 intr_info; 6445 6446 switch ((u16)exit_reason.basic) { 6447 case EXIT_REASON_EXCEPTION_NMI: 6448 intr_info = vmx_get_intr_info(vcpu); 6449 if (is_nmi(intr_info)) 6450 return true; 6451 else if (is_page_fault(intr_info)) 6452 return vcpu->arch.apf.host_apf_flags || 6453 vmx_need_pf_intercept(vcpu); 6454 else if (is_debug(intr_info) && 6455 vcpu->guest_debug & 6456 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6457 return true; 6458 else if (is_breakpoint(intr_info) && 6459 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6460 return true; 6461 else if (is_alignment_check(intr_info) && 6462 !vmx_guest_inject_ac(vcpu)) 6463 return true; 6464 else if (is_ve_fault(intr_info)) 6465 return true; 6466 return false; 6467 case EXIT_REASON_EXTERNAL_INTERRUPT: 6468 return true; 6469 case EXIT_REASON_MCE_DURING_VMENTRY: 6470 return true; 6471 case EXIT_REASON_EPT_VIOLATION: 6472 /* 6473 * L0 always deals with the EPT violation. If nested EPT is 6474 * used, and the nested mmu code discovers that the address is 6475 * missing in the guest EPT table (EPT12), the EPT violation 6476 * will be injected with nested_ept_inject_page_fault() 6477 */ 6478 return true; 6479 case EXIT_REASON_EPT_MISCONFIG: 6480 /* 6481 * L2 never uses directly L1's EPT, but rather L0's own EPT 6482 * table (shadow on EPT) or a merged EPT table that L0 built 6483 * (EPT on EPT). So any problems with the structure of the 6484 * table is L0's fault. 6485 */ 6486 return true; 6487 case EXIT_REASON_PREEMPTION_TIMER: 6488 return true; 6489 case EXIT_REASON_PML_FULL: 6490 /* 6491 * PML is emulated for an L1 VMM and should never be enabled in 6492 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6493 */ 6494 return true; 6495 case EXIT_REASON_VMFUNC: 6496 /* VM functions are emulated through L2->L0 vmexits. */ 6497 return true; 6498 case EXIT_REASON_BUS_LOCK: 6499 /* 6500 * At present, bus lock VM exit is never exposed to L1. 6501 * Handle L2's bus locks in L0 directly. 6502 */ 6503 return true; 6504 #ifdef CONFIG_KVM_HYPERV 6505 case EXIT_REASON_VMCALL: 6506 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6507 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6508 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6509 kvm_hv_is_tlb_flush_hcall(vcpu); 6510 #endif 6511 default: 6512 break; 6513 } 6514 return false; 6515 } 6516 6517 /* 6518 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6519 * is_guest_mode (L2). 
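* Returning false means KVM (L0) handles the exit on L1's behalf.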
6520 */ 6521 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6522 union vmx_exit_reason exit_reason) 6523 { 6524 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6525 u32 intr_info; 6526 6527 switch ((u16)exit_reason.basic) { 6528 case EXIT_REASON_EXCEPTION_NMI: 6529 intr_info = vmx_get_intr_info(vcpu); 6530 if (is_nmi(intr_info)) 6531 return true; 6532 else if (is_page_fault(intr_info)) 6533 return true; 6534 return vmcs12->exception_bitmap & 6535 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6536 case EXIT_REASON_EXTERNAL_INTERRUPT: 6537 return nested_exit_on_intr(vcpu); 6538 case EXIT_REASON_TRIPLE_FAULT: 6539 return true; 6540 case EXIT_REASON_INTERRUPT_WINDOW: 6541 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6542 case EXIT_REASON_NMI_WINDOW: 6543 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6544 case EXIT_REASON_TASK_SWITCH: 6545 return true; 6546 case EXIT_REASON_CPUID: 6547 return true; 6548 case EXIT_REASON_HLT: 6549 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6550 case EXIT_REASON_INVD: 6551 return true; 6552 case EXIT_REASON_INVLPG: 6553 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6554 case EXIT_REASON_RDPMC: 6555 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6556 case EXIT_REASON_RDRAND: 6557 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6558 case EXIT_REASON_RDSEED: 6559 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6560 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6561 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6562 case EXIT_REASON_VMREAD: 6563 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6564 vmcs12->vmread_bitmap); 6565 case EXIT_REASON_VMWRITE: 6566 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6567 vmcs12->vmwrite_bitmap); 6568 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6569 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6570 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6571 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6572 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6573 /* 6574 * VMX instructions trap unconditionally. This allows L1 to 6575 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
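* Returning true forwards the VM-Exit to L1 so that L1 can emulate the VMX instruction for its own nested guest.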
6576 */ 6577 return true; 6578 case EXIT_REASON_CR_ACCESS: 6579 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6580 case EXIT_REASON_DR_ACCESS: 6581 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6582 case EXIT_REASON_IO_INSTRUCTION: 6583 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6584 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6585 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6586 case EXIT_REASON_MSR_READ: 6587 case EXIT_REASON_MSR_WRITE: 6588 case EXIT_REASON_MSR_READ_IMM: 6589 case EXIT_REASON_MSR_WRITE_IMM: 6590 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6591 case EXIT_REASON_INVALID_STATE: 6592 return true; 6593 case EXIT_REASON_MWAIT_INSTRUCTION: 6594 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6595 case EXIT_REASON_MONITOR_TRAP_FLAG: 6596 return nested_vmx_exit_handled_mtf(vmcs12); 6597 case EXIT_REASON_MONITOR_INSTRUCTION: 6598 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6599 case EXIT_REASON_PAUSE_INSTRUCTION: 6600 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6601 nested_cpu_has2(vmcs12, 6602 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6603 case EXIT_REASON_MCE_DURING_VMENTRY: 6604 return true; 6605 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6606 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6607 case EXIT_REASON_APIC_ACCESS: 6608 case EXIT_REASON_APIC_WRITE: 6609 case EXIT_REASON_EOI_INDUCED: 6610 /* 6611 * The controls for "virtualize APIC accesses," "APIC- 6612 * register virtualization," and "virtual-interrupt 6613 * delivery" only come from vmcs12. 6614 */ 6615 return true; 6616 case EXIT_REASON_INVPCID: 6617 return 6618 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6619 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6620 case EXIT_REASON_WBINVD: 6621 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6622 case EXIT_REASON_XSETBV: 6623 return true; 6624 case EXIT_REASON_XSAVES: 6625 case EXIT_REASON_XRSTORS: 6626 /* 6627 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize 6628 * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap 6629 * verbatim, i.e. any exit is due to L1's bitmap. WARN if 6630 * XSAVES isn't enabled, as the CPU is supposed to inject #UD 6631 * in that case, before consulting the XSS-bitmap. 6632 */ 6633 WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); 6634 return true; 6635 case EXIT_REASON_UMWAIT: 6636 case EXIT_REASON_TPAUSE: 6637 return nested_cpu_has2(vmcs12, 6638 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6639 case EXIT_REASON_ENCLS: 6640 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6641 case EXIT_REASON_NOTIFY: 6642 /* Notify VM exit is not exposed to L1 */ 6643 return false; 6644 case EXIT_REASON_SEAMCALL: 6645 case EXIT_REASON_TDCALL: 6646 /* 6647 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't 6648 * virtualized by KVM for L1 hypervisors, i.e. L1 should 6649 * never want or expect such an exit. 6650 */ 6651 return false; 6652 default: 6653 return true; 6654 } 6655 } 6656 6657 /* 6658 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6659 * reflected into L1. 
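* L0's wants take priority: the exit is reflected if and only if L0 doesn't want to handle it itself and L1 does want it.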

/*
 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}
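
/*
 * Illustrative layout of the blob produced for KVM_GET_NESTED_STATE by the
 * function below (per the standard kvm_nested_state ABI):
 *
 *	struct kvm_nested_state hdr	(flags, hdr.vmx.*)
 *	data.vmx[0].vmcs12		(VMCS12_SIZE bytes, if a vmcs12 is valid)
 *	data.vmx[0].shadow_vmcs12	(VMCS12_SIZE bytes, only when L2 is
 *					 active and vmcs12 uses VMCS shadowing)
 *
 * kvm_state.size accounts for exactly the pieces that are copied out.
 */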

static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (nested_vmx_is_evmptr12_set(vmx))
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (nested_vmx_is_evmptr12_valid(vmx))
				/*
				 * The L1 hypervisor is not obliged to keep the
				 * eVMCS clean fields data up-to-date while not
				 * in guest mode; 'hv_clean_fields' is only
				 * guaranteed to be accurate at VM-entry, so
				 * ignore it here and do a full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the
	 * size of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU. However, the code
		 * has since been changed such that the flag signals that
		 * vmcs12 should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow user
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet. EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}
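
/*
 * Illustrative recap of the VMCS field encoding consumed below: bits 9:1 of
 * an encoding hold the field's "index" (VMCS_FIELD_INDEX_SHIFT == 1), e.g.
 * encoding 0x4002 (CPU_BASED_VM_EXEC_CONTROL) has index 1. IA32_VMX_VMCS_ENUM
 * reports the highest index L1 may enumerate, which is what the helper below
 * computes from the vmcs12 field table.
 */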

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1. Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}
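
/*
 * Illustrative note: the (low, high) pairs built by these helpers are what
 * L1 sees when it reads the corresponding VMX capability MSR, roughly
 * low | ((u64)high << 32), i.e. the low half advertises must-be-1 bits and
 * the high half advertises may-be-1 bits.
 */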

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
		VM_ENTRY_LOAD_CET_STATE;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
		msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE;

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support,
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context. The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}
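
/*
 * Illustrative validity check (mirrors vmx_control_verify()): a vmcs12
 * control value 'val' is acceptable against one of the (low, high) pairs
 * built above iff
 *
 *	(val & low) == low && (val & ~high) == 0
 *
 * i.e. all must-be-1 bits are set and nothing outside the may-be-1 mask is.
 */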

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
	if (cpu_has_vmx_basic_no_hw_errcode_cc())
		msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}
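
/*
 * Worked example (illustrative): with the values above, cr0_fixed0 is
 * PE | PG | NE, so any CR0 used while in VMX operation must keep those three
 * bits set, while cr0/cr4_fixed1 come from hardware and bound which bits may
 * be set at all; vmx_umip_emulated() relaxes cr4_fixed1 so L1 may set
 * CR4.UMIP even when the CPU lacks native UMIP.
 */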

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	/*
	 * Note! The set of supported vmcs12 fields is consumed by both VMX
	 * MSR and shadow VMCS setup.
	 */
	nested_vmx_setup_vmcs12_fields();

	nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};