// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>

#include <asm/msr-index.h>
#include <asm/debugreg.h>

#include "kvm_emulate.h"
#include "trace.h"
#include "mmu.h"
#include "x86.h"
#include "smm.h"
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
#include "hyperv.h"

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
				       struct x86_exception *fault)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;

	if (vmcb->control.exit_code != SVM_EXIT_NPF) {
		/*
		 * TODO: track the cause of the nested page fault, and
		 * correctly fill in the high bits of exit_info_1.
		 */
		vmcb->control.exit_code = SVM_EXIT_NPF;
		vmcb->control.exit_info_1 = (1ULL << 32);
		vmcb->control.exit_info_2 = fault->address;
	}

	vmcb->control.exit_info_1 &= ~0xffffffffULL;
	vmcb->control.exit_info_1 |= fault->error_code;

	nested_svm_vmexit(svm);
}

static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 cr3 = svm->nested.ctl.nested_cr3;
	u64 pdpte;
	int ret;

	/*
	 * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores
	 * nCR3[4:0] when loading PDPTEs from memory.
	 */
	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
				       (cr3 & GENMASK(11, 5)) + index * 8, 8);
	if (ret)
		return 0;
	return pdpte;
}

static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.nested_cr3;
}

static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;

	/*
	 * The NPT format depends on L1's CR4 and EFER, which is in vmcb01.  Note,
	 * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
	 * vCPU state.  CR0.WP is explicitly ignored, while CR0.PG is required.
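	 *
	 * The get_guest_pgd/get_pdptr callbacks installed below route the
	 * nested MMU's guest table walks through L1's nCR3 (and, for PAE
	 * format NPT, L1's PDPTEs).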
	 */
	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
				svm->vmcb01.ptr->save.efer,
				svm->nested.ctl.nested_cr3);
	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
{
	if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
		return true;

	if (!nested_npt_enabled(svm))
		return true;

	if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
		return true;

	return false;
}

void recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_control_area *c, *h;
	struct vmcb_ctrl_area_cached *g;
	unsigned int i;

	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->vmcb01.ptr->control;
	g = &svm->nested.ctl;

	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] = h->intercepts[i];

	if (g->int_ctl & V_INTR_MASKING_MASK) {
		/*
		 * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
		 * disable intercept of CR8 writes as L2's CR8 does not affect
		 * any interrupt KVM may want to inject.
		 *
		 * Similarly, disable intercept of virtual interrupts (used to
		 * detect interrupt windows) if the saved RFLAGS.IF is '0', as
		 * the effective RFLAGS.IF for L1 interrupts will never be set
		 * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
		 */
		vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
		if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
			vmcb_clr_intercept(c, INTERCEPT_VINTR);
	}

	/*
	 * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
	 * flush feature is enabled.
	 */
	if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
		vmcb_clr_intercept(c, INTERCEPT_VMMCALL);

	for (i = 0; i < MAX_INTERCEPT; i++)
		c->intercepts[i] |= g->intercepts[i];

	/* If SMI is not intercepted, ignore guest SMI intercept as well  */
	if (!intercept_smi)
		vmcb_clr_intercept(c, INTERCEPT_SMI);

	if (nested_vmcb_needs_vls_intercept(svm)) {
		/*
		 * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
		 * we must intercept these instructions to correctly
		 * emulate them in case L1 doesn't intercept them.
		 */
		vmcb_set_intercept(c, INTERCEPT_VMLOAD);
		vmcb_set_intercept(c, INTERCEPT_VMSAVE);
	} else {
		WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
	}
}

/*
 * This array (and its actual size) holds the set of offsets (indexing by chunk
 * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM.  Note, the
 * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
 * based on CPUID features.  This array only tracks MSRs that *might* be passed
 * through to the guest.
 *
 * Hardcode the capacity of the array based on the maximum number of _offsets_.
 * MSRs are batched together, so there are fewer offsets than MSRs.
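 *
 * Each offset identifies one nsvm_msrpm_merge_t-sized chunk of the bitmap;
 * several MSRs from the list below can land in the same chunk, which is why
 * duplicate offsets are filtered out at init time.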
 */
static int nested_svm_msrpm_merge_offsets[10] __ro_after_init;
static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
typedef unsigned long nsvm_msrpm_merge_t;

int __init nested_svm_init_msrpm_merge_offsets(void)
{
	static const u32 merge_msrs[] __initconst = {
		MSR_STAR,
		MSR_IA32_SYSENTER_CS,
		MSR_IA32_SYSENTER_EIP,
		MSR_IA32_SYSENTER_ESP,
#ifdef CONFIG_X86_64
		MSR_GS_BASE,
		MSR_FS_BASE,
		MSR_KERNEL_GS_BASE,
		MSR_LSTAR,
		MSR_CSTAR,
		MSR_SYSCALL_MASK,
#endif
		MSR_IA32_SPEC_CTRL,
		MSR_IA32_PRED_CMD,
		MSR_IA32_FLUSH_CMD,
		MSR_IA32_APERF,
		MSR_IA32_MPERF,
		MSR_IA32_LASTBRANCHFROMIP,
		MSR_IA32_LASTBRANCHTOIP,
		MSR_IA32_LASTINTFROMIP,
		MSR_IA32_LASTINTTOIP,

		MSR_K7_PERFCTR0,
		MSR_K7_PERFCTR1,
		MSR_K7_PERFCTR2,
		MSR_K7_PERFCTR3,
		MSR_F15H_PERF_CTR0,
		MSR_F15H_PERF_CTR1,
		MSR_F15H_PERF_CTR2,
		MSR_F15H_PERF_CTR3,
		MSR_F15H_PERF_CTR4,
		MSR_F15H_PERF_CTR5,

		MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
		MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
		MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
		MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
	};
	int i, j;

	for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
		int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
		u32 offset;

		if (WARN_ON(bit_nr < 0))
			return -EIO;

		/*
		 * Merging is done in chunks to reduce the number of accesses
		 * to L1's bitmap.
		 */
		offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);

		for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
			if (nested_svm_msrpm_merge_offsets[j] == offset)
				break;
		}

		if (j < nested_svm_nr_msrpm_merge_offsets)
			continue;

		if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
			return -EIO;

		nested_svm_msrpm_merge_offsets[j] = offset;
		nested_svm_nr_msrpm_merge_offsets++;
	}

	return 0;
}

/*
 * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps.  The function
 * is optimized in that it only merges the parts where the KVM MSR permission
 * bitmap may contain zero bits.
 */
static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
	nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
	int i;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) is using Hyper-V emulation interface and
	 *   tells KVM (L0) there were no changes in MSR bitmap for L2.
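	 *
	 * The first two conditions are tracked via the force_msr_bitmap_recalc
	 * flag; the Hyper-V "clean" bit checked below covers the third.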
	 */
#ifdef CONFIG_KVM_HYPERV
	if (!svm->nested.force_msr_bitmap_recalc) {
		struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;

		if (kvm_hv_hypercall_enabled(vcpu) &&
		    hve->hv_enlightenments_control.msr_bitmap &&
		    (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
			goto set_msrpm_base_pa;
	}
#endif

	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return true;

	for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
		const int p = nested_svm_msrpm_merge_offsets[i];
		nsvm_msrpm_merge_t l1_val;
		gpa_t gpa;

		gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));

		if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
			return false;

		msrpm02[p] = msrpm01[p] | l1_val;
	}

	svm->nested.force_msr_bitmap_recalc = false;

#ifdef CONFIG_KVM_HYPERV
set_msrpm_base_pa:
#endif
	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));

	return true;
}

/*
 * Bits 11:0 of bitmap address are ignored by hardware
 */
static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
{
	u64 addr = PAGE_ALIGN(pa);

	return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
	       kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}

static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
					 struct vmcb_ctrl_area_cached *control)
{
	if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
		return false;

	if (CC(control->asid == 0))
		return false;

	if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
		return false;

	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
					   MSRPM_SIZE)))
		return false;
	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
					   IOPM_SIZE)))
		return false;

	if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
	       !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
		return false;
	}

	return true;
}

/* Common checks that apply to both L1 and L2 state.  */
static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
				     struct vmcb_save_area_cached *save)
{
	if (CC(!(save->efer & EFER_SVME)))
		return false;

	if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
	    CC(save->cr0 & ~0xffffffffULL))
		return false;

	if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
		return false;

	/*
	 * These checks are also performed by KVM_SET_SREGS,
	 * except that EFER.LMA is not checked by SVM against
	 * CR0.PG && EFER.LME.
	 */
	if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
		if (CC(!(save->cr4 & X86_CR4_PAE)) ||
		    CC(!(save->cr0 & X86_CR0_PE)) ||
		    CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3)))
			return false;
	}

	/* Note, SVM doesn't have any additional restrictions on CR4. */
	if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
		return false;

	if (CC(!kvm_valid_efer(vcpu, save->efer)))
		return false;

	return true;
}

static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_save_area_cached *save = &svm->nested.save;

	return __nested_vmcb_check_save(vcpu, save);
}

static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;

	return __nested_vmcb_check_controls(vcpu, ctl);
}

/*
 * If a feature is not advertised to L1, clear the corresponding vmcb12
 * intercept.
 */
#define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname)	\
do {										\
	if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname))			\
		vmcb12_clr_intercept(__control, INTERCEPT_##iname);		\
} while (0)

#define nested_svm_sanitize_intercept(__vcpu, __control, name)			\
	__nested_svm_sanitize_intercept(__vcpu, __control, name, name)

static
void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
					 struct vmcb_ctrl_area_cached *to,
					 struct vmcb_control_area *from)
{
	unsigned int i;

	for (i = 0; i < MAX_INTERCEPT; i++)
		to->intercepts[i] = from->intercepts[i];

	__nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV);
	nested_svm_sanitize_intercept(vcpu, to, INVPCID);
	nested_svm_sanitize_intercept(vcpu, to, RDTSCP);
	nested_svm_sanitize_intercept(vcpu, to, SKINIT);
	nested_svm_sanitize_intercept(vcpu, to, RDPRU);

	to->iopm_base_pa = from->iopm_base_pa;
	to->msrpm_base_pa = from->msrpm_base_pa;
	to->tsc_offset = from->tsc_offset;
	to->tlb_ctl = from->tlb_ctl;
	to->erap_ctl = from->erap_ctl;
	to->int_ctl = from->int_ctl;
	to->int_vector = from->int_vector;
	to->int_state = from->int_state;
	to->exit_code = from->exit_code;
	to->exit_info_1 = from->exit_info_1;
	to->exit_info_2 = from->exit_info_2;
	to->exit_int_info = from->exit_int_info;
	to->exit_int_info_err = from->exit_int_info_err;
	to->nested_ctl = from->nested_ctl;
	to->event_inj = from->event_inj;
	to->event_inj_err = from->event_inj_err;
	to->next_rip = from->next_rip;
	to->nested_cr3 = from->nested_cr3;
	to->virt_ext = from->virt_ext;
	to->pause_filter_count = from->pause_filter_count;
	to->pause_filter_thresh = from->pause_filter_thresh;

	/* Copy asid here because nested_vmcb_check_controls will check it.  */
	to->asid = from->asid;
	to->msrpm_base_pa &= ~0x0fffULL;
	to->iopm_base_pa &= ~0x0fffULL;

#ifdef CONFIG_KVM_HYPERV
	/* Hyper-V extensions (Enlightened VMCB) */
	if (kvm_hv_hypercall_enabled(vcpu)) {
		to->clean = from->clean;
		memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
		       sizeof(to->hv_enlightenments));
	}
#endif
}

void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
				       struct vmcb_control_area *control)
{
	__nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
}

static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
					     struct vmcb_save_area *from)
{
	/*
	 * Copy only fields that are validated, as we need them
	 * to avoid TOC/TOU races.
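	 *
	 * The cached copy, not the guest-writable vmcb12, is what the
	 * subsequent checks and the state load consume.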
	 */
	to->efer = from->efer;
	to->cr0 = from->cr0;
	to->cr3 = from->cr3;
	to->cr4 = from->cr4;

	to->dr6 = from->dr6;
	to->dr7 = from->dr7;
}

void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
				    struct vmcb_save_area *save)
{
	__nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
}

/*
 * Synchronize fields that are written by the processor, so that
 * they can be copied back into the vmcb12.
 */
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
{
	u32 mask;

	svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
	svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;

	/* Only a few fields of int_ctl are written by the processor.  */
	mask = V_IRQ_MASK | V_TPR_MASK;

	/*
	 * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
	 * virtual interrupts in order to request an interrupt window, as KVM
	 * has usurped vmcb02's int_ctl.  If an interrupt window opens before
	 * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
	 * If no window opens, V_IRQ will be correctly preserved in vmcb12's
	 * int_ctl (because it was never recognized while L2 was running).
	 */
	if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
	    !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
		mask &= ~V_IRQ_MASK;

	if (nested_vgif_enabled(svm))
		mask |= V_GIF_MASK;

	if (nested_vnmi_enabled(svm))
		mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;

	svm->nested.ctl.int_ctl &= ~mask;
	svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}

/*
 * Transfer any event that L0 or L1 wanted to inject into L2 to
 * EXIT_INT_INFO.
 */
static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
						struct vmcb *vmcb12)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 exit_int_info = 0;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.vector;
		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;

		if (vcpu->arch.exception.has_error_code) {
			exit_int_info |= SVM_EVTINJ_VALID_ERR;
			vmcb12->control.exit_int_info_err =
				vcpu->arch.exception.error_code;
		}

	} else if (vcpu->arch.nmi_injected) {
		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID;

		if (vcpu->arch.interrupt.soft)
			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
		else
			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
	}

	vmcb12->control.exit_int_info = exit_int_info;
}

static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled);

	/*
	 * TODO: optimize unconditional TLB flush/MMU sync.  A partial list of
	 * things to fix before this can be conditional:
	 *
	 *  - Flush TLBs for both L1 and L2 remote TLB flush
	 *  - Honor L1's request to flush an ASID on nested VMRUN
	 *  - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
	 *  - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
	 *  - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
	 *
	 * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
	 *     NPT guest-physical mappings on VMRUN.
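	 *
	 * Until the above is addressed, request a full MMU sync and a TLB
	 * flush of the current context on every nested transition.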
	 */
	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
	kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

/*
 * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
 * if we are emulating VM-Entry into a guest with NPT enabled.
 */
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_npt, bool reload_pdptrs)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3)))
		return -EINVAL;

	if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3)))
		return -EINVAL;

	vcpu->arch.cr3 = cr3;

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_npt)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
{
	if (!svm->nested.vmcb02.ptr)
		return;

	/* FIXME: merge g_pat from vmcb01 and vmcb12.  */
	svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
}

static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
	bool new_vmcb12 = false;
	struct vmcb *vmcb01 = svm->vmcb01.ptr;
	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	nested_vmcb02_compute_g_pat(svm);
	vmcb_mark_dirty(vmcb02, VMCB_NPT);

	/* Load the nested guest state */
	if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
		new_vmcb12 = true;
		svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
		svm->nested.force_msr_bitmap_recalc = true;
	}

	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
		vmcb02->save.es = vmcb12->save.es;
		vmcb02->save.cs = vmcb12->save.cs;
		vmcb02->save.ss = vmcb12->save.ss;
		vmcb02->save.ds = vmcb12->save.ds;
		vmcb02->save.cpl = vmcb12->save.cpl;
		vmcb_mark_dirty(vmcb02, VMCB_SEG);
	}

	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
		vmcb02->save.gdtr = vmcb12->save.gdtr;
		vmcb02->save.idtr = vmcb12->save.idtr;
		vmcb_mark_dirty(vmcb02, VMCB_DT);
	}

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
	    (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) {
		vmcb02->save.s_cet = vmcb12->save.s_cet;
		vmcb02->save.isst_addr = vmcb12->save.isst_addr;
		vmcb02->save.ssp = vmcb12->save.ssp;
		vmcb_mark_dirty(vmcb02, VMCB_CET);
	}

	kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);

	svm_set_efer(vcpu, svm->nested.save.efer);

	svm_set_cr0(vcpu, svm->nested.save.cr0);
	svm_set_cr4(vcpu, svm->nested.save.cr4);

	svm->vcpu.arch.cr2 = vmcb12->save.cr2;

	kvm_rax_write(vcpu, vmcb12->save.rax);
	kvm_rsp_write(vcpu, vmcb12->save.rsp);
	kvm_rip_write(vcpu, vmcb12->save.rip);

	/* In case we don't even reach vcpu_run, the fields are not updated */
	vmcb02->save.rax = vmcb12->save.rax;
	vmcb02->save.rsp = vmcb12->save.rsp;
	vmcb02->save.rip = vmcb12->save.rip;

	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
		vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
		svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
		vmcb_mark_dirty(vmcb02, VMCB_DR);
	}

	if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
		     (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
		/*
		 * Reserved bits of DEBUGCTL are ignored.  Be consistent with
		 * svm_set_msr's definition of reserved bits.
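		 *
		 * I.e. the reserved bits are simply cleared after vmcb12's
		 * LBR state is copied, rather than failing the nested VMRUN.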
		 */
		svm_copy_lbrs(vmcb02, vmcb12);
		vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
	} else {
		svm_copy_lbrs(vmcb02, vmcb01);
	}
	svm_update_lbrv(&svm->vcpu);
}

static inline bool is_evtinj_soft(u32 evtinj)
{
	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
	u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;

	if (!(evtinj & SVM_EVTINJ_VALID))
		return false;

	if (type == SVM_EVTINJ_TYPE_SOFT)
		return true;

	return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
}

static bool is_evtinj_nmi(u32 evtinj)
{
	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;

	if (!(evtinj & SVM_EVTINJ_VALID))
		return false;

	return type == SVM_EVTINJ_TYPE_NMI;
}

static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
					  unsigned long vmcb12_rip,
					  unsigned long vmcb12_csbase)
{
	u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
	u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;

	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct vmcb *vmcb01 = svm->vmcb01.ptr;
	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
	u32 pause_count12;
	u32 pause_thresh12;

	nested_svm_transition_tlb_flush(vcpu);

	/* Enter Guest-Mode */
	enter_guest_mode(vcpu);

	/*
	 * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info,
	 * exit_int_info_err, next_rip, insn_len, insn_bytes.
	 */

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
	    (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
		int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
	else
		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);

	if (vnmi) {
		if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
			svm->vcpu.arch.nmi_pending++;
			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
		}
		if (nested_vnmi_enabled(svm))
			int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
						V_NMI_ENABLE_MASK |
						V_NMI_BLOCKING_MASK);
	}

	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
	vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
	vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
	vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);

	/*
	 * Stash vmcb02's counter if the guest hasn't moved past the guilty
	 * instruction; otherwise, reset the counter to '0'.
	 *
	 * In order to detect if L2 has made forward progress or not, track the
	 * RIP at which a bus lock has occurred on a per-vmcb12 basis.  If the
	 * RIP has changed, the guest has clearly made forward progress even if
	 * bus_lock_counter remained '1', so reset bus_lock_counter to '0'.
	 * E.g. in the scenario where a bus lock happened in L1 before VMRUN,
	 * the bus lock firmly happened on an instruction in the past.  Even if
	 * vmcb01's counter is still '1' (because the guilty instruction got
	 * patched), the vCPU has clearly made forward progress and so KVM
	 * should reset vmcb02's counter to '0'.
	 *
	 * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
	 * to prevent the same guilty instruction from triggering a VM-Exit.
	 * E.g. if userspace rate-limits the vCPU, then it's entirely possible
	 * that L1's tick interrupt is pending by the time userspace re-runs
	 * the vCPU.  If KVM unconditionally clears the counter on VMRUN, then
	 * when L1 re-enters L2, the same instruction will trigger a VM-Exit
	 * and the entire cycle starts over.
	 */
	if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
		vmcb02->control.bus_lock_counter = 1;
	else
		vmcb02->control.bus_lock_counter = 0;

	/* Done at vmrun: asid.  */

	/* Also overwritten later if necessary.  */
	vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

	/* nested_cr3. */
	if (nested_npt_enabled(svm))
		nested_svm_init_mmu_context(vcpu);

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			svm->nested.ctl.tsc_offset,
			svm->tsc_ratio_msr);

	vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
	    svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
		nested_svm_update_tsc_ratio_msr(vcpu);

	vmcb02->control.int_ctl =
		(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
		(vmcb01->control.int_ctl & int_ctl_vmcb01_bits);

	vmcb02->control.int_vector = svm->nested.ctl.int_vector;
	vmcb02->control.int_state = svm->nested.ctl.int_state;
	vmcb02->control.event_inj = svm->nested.ctl.event_inj;
	vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;

	/*
	 * next_rip is consumed on VMRUN as the return address pushed on the
	 * stack for injected soft exceptions/interrupts.  If nrips is exposed
	 * to L1, take it verbatim from vmcb12.  If nrips is supported in
	 * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
	 * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
	 * prior to injecting the event).
	 */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
		vmcb02->control.next_rip = svm->nested.ctl.next_rip;
	else if (boot_cpu_has(X86_FEATURE_NRIPS))
		vmcb02->control.next_rip = vmcb12_rip;

	svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
	if (is_evtinj_soft(vmcb02->control.event_inj)) {
		svm->soft_int_injected = true;
		svm->soft_int_csbase = vmcb12_csbase;
		svm->soft_int_old_rip = vmcb12_rip;
		if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
			svm->soft_int_next_rip = svm->nested.ctl.next_rip;
		else
			svm->soft_int_next_rip = vmcb12_rip;
	}

	/* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */

	if (!nested_vmcb_needs_vls_intercept(svm))
		vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
		pause_count12 = svm->nested.ctl.pause_filter_count;
	else
		pause_count12 = 0;
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
		pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
	else
		pause_thresh12 = 0;
	if (kvm_pause_in_guest(svm->vcpu.kvm)) {
		/* use guest values since host doesn't intercept PAUSE */
		vmcb02->control.pause_filter_count = pause_count12;
		vmcb02->control.pause_filter_thresh = pause_thresh12;

	} else {
		/* start from host values otherwise */
		vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
		vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;

		/*
		 * ... but ensure filtering is disabled if so requested.
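		 * I.e. a zero count or threshold in vmcb12 overrides the
		 * host value when L1 intercepts PAUSE.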
		 */
		if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
			if (!pause_count12)
				vmcb02->control.pause_filter_count = 0;
			if (!pause_thresh12)
				vmcb02->control.pause_filter_thresh = 0;
		}
	}

	/*
	 * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to
	 * let L2 use a larger RAP since KVM will emulate the necessary clears,
	 * as it's possible L1 deliberately wants to restrict L2 to the legacy
	 * RAP size.  Unconditionally clear the RAP on nested VMRUN, as KVM is
	 * responsible for emulating the host vs. guest tags (L1 is the "host",
	 * L2 is the "guest").
	 */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
		vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl &
					    ERAP_CONTROL_ALLOW_LARGER_RAP) |
					   ERAP_CONTROL_CLEAR_RAP;

	/*
	 * Merge guest and host intercepts - must be called with vcpu in
	 * guest-mode to take effect.
	 */
	recalc_intercepts(svm);
}

static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
{
	/*
	 * Some VMCB state is shared between L1 and L2 and thus has to be
	 * moved at the time of nested vmrun and vmexit.
	 *
	 * VMLOAD/VMSAVE state would also belong in this category, but KVM
	 * always performs VMLOAD and VMSAVE from the VMCB01.
	 */
	to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
}

int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
			 struct vmcb *vmcb12, bool from_vmrun)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	trace_kvm_nested_vmenter(svm->vmcb->save.rip,
				 vmcb12_gpa,
				 vmcb12->save.rip,
				 vmcb12->control.int_ctl,
				 vmcb12->control.event_inj,
				 vmcb12->control.nested_ctl,
				 vmcb12->control.nested_cr3,
				 vmcb12->save.cr3,
				 KVM_ISA_SVM);

	trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
				    vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
				    vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
				    vmcb12->control.intercepts[INTERCEPT_WORD3],
				    vmcb12->control.intercepts[INTERCEPT_WORD4],
				    vmcb12->control.intercepts[INTERCEPT_WORD5]);

	svm->nested.vmcb12_gpa = vmcb12_gpa;

	WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);

	nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
	nested_vmcb02_prepare_save(svm, vmcb12);

	ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
				  nested_npt_enabled(svm), from_vmrun);
	if (ret)
		return ret;

	if (!from_vmrun)
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	svm_set_gif(svm, true);

	if (kvm_vcpu_apicv_active(vcpu))
		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);

	nested_svm_hv_update_vm_vp_ids(vcpu);

	return 0;
}

int nested_svm_vmrun(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;
	struct vmcb *vmcb12;
	struct kvm_host_map map;
	u64 vmcb12_gpa;
	struct vmcb *vmcb01 = svm->vmcb01.ptr;

	if (!svm->nested.hsave_msr) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (is_smm(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/* This fails when VP assist page is enabled but the supplied GPA is bogus */
	ret = kvm_hv_verify_vp_assist(vcpu);
	if (ret) {
		kvm_inject_gp(vcpu, 0);
		return ret;
	}

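	/* VMRUN takes the guest-physical address of vmcb12 in RAX. */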
	vmcb12_gpa = svm->vmcb->save.rax;
	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
	if (ret == -EINVAL) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	} else if (ret) {
		return kvm_skip_emulated_instruction(vcpu);
	}

	ret = kvm_skip_emulated_instruction(vcpu);

	vmcb12 = map.hva;

	if (WARN_ON_ONCE(!svm->nested.initialized))
		return -EINVAL;

	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);

	if (!nested_vmcb_check_save(vcpu) ||
	    !nested_vmcb_check_controls(vcpu)) {
		vmcb12->control.exit_code = SVM_EXIT_ERR;
		vmcb12->control.exit_info_1 = 0;
		vmcb12->control.exit_info_2 = 0;
		goto out;
	}

	/*
	 * Since vmcb01 is not in use, we can use it to store some of the L1
	 * state.
	 */
	vmcb01->save.efer = vcpu->arch.efer;
	vmcb01->save.cr0 = kvm_read_cr0(vcpu);
	vmcb01->save.cr4 = vcpu->arch.cr4;
	vmcb01->save.rflags = kvm_get_rflags(vcpu);
	vmcb01->save.rip = kvm_rip_read(vcpu);

	if (!npt_enabled)
		vmcb01->save.cr3 = kvm_read_cr3(vcpu);

	svm->nested.nested_run_pending = 1;

	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
		goto out_exit_err;

	if (nested_svm_merge_msrpm(vcpu))
		goto out;

out_exit_err:
	svm->nested.nested_run_pending = 0;
	svm->nmi_l1_to_l2 = false;
	svm->soft_int_injected = false;

	svm->vmcb->control.exit_code = SVM_EXIT_ERR;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);

out:
	kvm_vcpu_unmap(vcpu, &map);

	return ret;
}

/* Copy state save area fields which are handled by VMRUN */
void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
			  struct vmcb_save_area *from_save)
{
	to_save->es = from_save->es;
	to_save->cs = from_save->cs;
	to_save->ss = from_save->ss;
	to_save->ds = from_save->ds;
	to_save->gdtr = from_save->gdtr;
	to_save->idtr = from_save->idtr;
	to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
	to_save->efer = from_save->efer;
	to_save->cr0 = from_save->cr0;
	to_save->cr3 = from_save->cr3;
	to_save->cr4 = from_save->cr4;
	to_save->rax = from_save->rax;
	to_save->rsp = from_save->rsp;
	to_save->rip = from_save->rip;
	to_save->cpl = 0;

	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
		to_save->s_cet = from_save->s_cet;
		to_save->isst_addr = from_save->isst_addr;
		to_save->ssp = from_save->ssp;
	}
}

void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
	to_vmcb->save.fs = from_vmcb->save.fs;
	to_vmcb->save.gs = from_vmcb->save.gs;
	to_vmcb->save.tr = from_vmcb->save.tr;
	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
	to_vmcb->save.star = from_vmcb->save.star;
	to_vmcb->save.lstar = from_vmcb->save.lstar;
	to_vmcb->save.cstar = from_vmcb->save.cstar;
	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}

int nested_svm_vmexit(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct vmcb *vmcb01 = svm->vmcb01.ptr;
	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
	struct vmcb *vmcb12;
	struct kvm_host_map map;
	int rc;

	rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
	if (rc) {
		if (rc == -EINVAL)
			kvm_inject_gp(vcpu, 0);
		return 1;
	}

	vmcb12 = map.hva;

	/* Exit Guest-Mode */
	leave_guest_mode(vcpu);
	svm->nested.vmcb12_gpa = 0;
	WARN_ON_ONCE(svm->nested.nested_run_pending);

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	/* in case we halted in L2 */
	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);

	/* Give the current vmcb to the guest */

	vmcb12->save.es = vmcb02->save.es;
	vmcb12->save.cs = vmcb02->save.cs;
	vmcb12->save.ss = vmcb02->save.ss;
	vmcb12->save.ds = vmcb02->save.ds;
	vmcb12->save.gdtr = vmcb02->save.gdtr;
	vmcb12->save.idtr = vmcb02->save.idtr;
	vmcb12->save.efer = svm->vcpu.arch.efer;
	vmcb12->save.cr0 = kvm_read_cr0(vcpu);
	vmcb12->save.cr3 = kvm_read_cr3(vcpu);
	vmcb12->save.cr2 = vmcb02->save.cr2;
	vmcb12->save.cr4 = svm->vcpu.arch.cr4;
	vmcb12->save.rflags = kvm_get_rflags(vcpu);
	vmcb12->save.rip = kvm_rip_read(vcpu);
	vmcb12->save.rsp = kvm_rsp_read(vcpu);
	vmcb12->save.rax = kvm_rax_read(vcpu);
	vmcb12->save.dr7 = vmcb02->save.dr7;
	vmcb12->save.dr6 = svm->vcpu.arch.dr6;
	vmcb12->save.cpl = vmcb02->save.cpl;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
		vmcb12->save.s_cet = vmcb02->save.s_cet;
		vmcb12->save.isst_addr = vmcb02->save.isst_addr;
		vmcb12->save.ssp = vmcb02->save.ssp;
	}

	vmcb12->control.int_state = vmcb02->control.int_state;
	vmcb12->control.exit_code = vmcb02->control.exit_code;
	vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
	vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;

	if (!svm_is_vmrun_failure(vmcb12->control.exit_code))
		nested_save_pending_event_to_vmcb12(svm, vmcb12);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
		vmcb12->control.next_rip = vmcb02->control.next_rip;

	vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
	vmcb12->control.event_inj = svm->nested.ctl.event_inj;
	vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;

	if (!kvm_pause_in_guest(vcpu->kvm)) {
		vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
		vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
	}

	/*
	 * Invalidate bus_lock_rip unless KVM is still waiting for the guest
	 * to make forward progress before re-enabling bus lock detection.
	 */
	if (!vmcb02->control.bus_lock_counter)
		svm->nested.ctl.bus_lock_rip = INVALID_GPA;

	nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);

	kvm_nested_vmexit_handle_ibrs(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
		vmcb01->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;

	svm_switch_vmcb(svm, &svm->vmcb01);

	/*
	 * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
	 *
	 * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR:  If L1 doesn't
	 * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
	 * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
	 * virtual interrupt masking).
	 * Raise KVM_REQ_EVENT to ensure that KVM re-requests an interrupt
	 * window if necessary, which implicitly copies these bits from
	 * vmcb02 to vmcb01.
	 *
	 * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
	 * is stored in vmcb02, but its value doesn't need to be copied from/to
	 * vmcb01 because it is copied from/to the virtual APIC's TPR register
	 * on each VM entry/exit.
	 *
	 * V_GIF:  If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
	 * V_GIF.  However, GIF is architecturally clear on each VM exit, thus
	 * there is no need to copy V_GIF from vmcb02 to vmcb01.
	 */
	if (!nested_exit_on_intr(svm))
		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);

	if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
		     (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
		svm_copy_lbrs(vmcb12, vmcb02);
	else
		svm_copy_lbrs(vmcb01, vmcb02);

	svm_update_lbrv(vcpu);

	if (vnmi) {
		if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
			vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
		else
			vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;

		if (vcpu->arch.nmi_pending) {
			vcpu->arch.nmi_pending--;
			vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
		} else {
			vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
		}
	}

	/*
	 * On vmexit the GIF is set to false and
	 * no event can be injected in L1.
	 */
	svm_set_gif(svm, false);
	vmcb01->control.exit_int_info = 0;

	svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
	if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
		vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
		vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
	}

	if (kvm_caps.has_tsc_control &&
	    vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
		vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
		svm_write_tsc_multiplier(vcpu);
	}

	svm->nested.ctl.nested_cr3 = 0;

	/*
	 * Restore processor state that had been saved in vmcb01
	 */
	kvm_set_rflags(vcpu, vmcb01->save.rflags);
	svm_set_efer(vcpu, vmcb01->save.efer);
	svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
	svm_set_cr4(vcpu, vmcb01->save.cr4);
	kvm_rax_write(vcpu, vmcb01->save.rax);
	kvm_rsp_write(vcpu, vmcb01->save.rsp);
	kvm_rip_write(vcpu, vmcb01->save.rip);

	svm->vcpu.arch.dr7 = DR7_FIXED_1;
	kvm_update_dr7(&svm->vcpu);

	trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
				       vmcb12->control.exit_info_1,
				       vmcb12->control.exit_info_2,
				       vmcb12->control.exit_int_info,
				       vmcb12->control.exit_int_info_err,
				       KVM_ISA_SVM);

	kvm_vcpu_unmap(vcpu, &map);

	nested_svm_transition_tlb_flush(vcpu);

	nested_svm_uninit_mmu_context(vcpu);

	rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
	if (rc)
		return 1;

	/*
	 * Drop what we picked up for L2 via svm_complete_interrupts() so it
	 * doesn't end up in L1.
	 */
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	/*
	 * If we are here following the completion of a VMRUN that
	 * is being single-stepped, queue the pending #DB intercept
	 * right now so that it can be accounted for before we execute
	 * L1's next instruction.
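	 *
	 * RFLAGS.TF in the L1 state stashed in vmcb01 indicates that L1 was
	 * single-stepping the VMRUN instruction itself.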
	 */
	if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
		kvm_queue_exception(&(svm->vcpu), DB_VECTOR);

	/*
	 * Un-inhibit the AVIC right away, so that other vCPUs can start
	 * to benefit from it right away.
	 */
	if (kvm_apicv_activated(vcpu->kvm))
		__kvm_vcpu_update_apicv(vcpu);

	return 0;
}

static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
		return;

	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
}

int svm_allocate_nested(struct vcpu_svm *svm)
{
	struct page *vmcb02_page;

	if (svm->nested.initialized)
		return 0;

	vmcb02_page = snp_safe_alloc_page();
	if (!vmcb02_page)
		return -ENOMEM;
	svm->nested.vmcb02.ptr = page_address(vmcb02_page);
	svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);

	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->nested.msrpm)
		goto err_free_vmcb02;

	svm->nested.initialized = true;
	return 0;

err_free_vmcb02:
	__free_page(vmcb02_page);
	return -ENOMEM;
}

void svm_free_nested(struct vcpu_svm *svm)
{
	if (!svm->nested.initialized)
		return;

	if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
		svm_switch_vmcb(svm, &svm->vmcb01);

	svm_vcpu_free_msrpm(svm->nested.msrpm);
	svm->nested.msrpm = NULL;

	__free_page(virt_to_page(svm->nested.vmcb02.ptr));
	svm->nested.vmcb02.ptr = NULL;

	/*
	 * When last_vmcb12_gpa matches the current vmcb12 gpa,
	 * some vmcb12 fields are not loaded if they are marked clean
	 * in the vmcb12, since in this case they are up to date already.
	 *
	 * When the vmcb02 is freed, this optimization becomes invalid.
	 */
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	svm->nested.initialized = false;
}

void svm_leave_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (is_guest_mode(vcpu)) {
		svm->nested.nested_run_pending = 0;
		svm->nested.vmcb12_gpa = INVALID_GPA;

		leave_guest_mode(vcpu);

		svm_switch_vmcb(svm, &svm->vmcb01);

		nested_svm_uninit_mmu_context(vcpu);
		vmcb_mark_all_dirty(svm->vmcb);

		svm_set_gif(svm, true);

		if (kvm_apicv_activated(vcpu->kvm))
			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
	}

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}

static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
	gpa_t base = svm->nested.ctl.msrpm_base_pa;
	int write, bit_nr;
	u8 value, mask;
	u32 msr;

	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return NESTED_EXIT_HOST;

	msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	bit_nr = svm_msrpm_bit_nr(msr);
	write = svm->vmcb->control.exit_info_1 & 1;

	if (bit_nr < 0)
		return NESTED_EXIT_DONE;

	if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
				&value, sizeof(value)))
		return NESTED_EXIT_DONE;

	mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
	unsigned port, size, iopm_len;
	u16 val, mask;
	u8 start_bit;
	u64 gpa;

	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
		return NESTED_EXIT_HOST;

	port = svm->vmcb->control.exit_info_1 >> 16;
	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
		SVM_IOIO_SIZE_SHIFT;
	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
	start_bit = port % 8;
	iopm_len = (start_bit + size > 8) ? 2 : 1;
	mask = (0xf >> (4 - size)) << start_bit;
	val = 0;

	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
		return NESTED_EXIT_DONE;

	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

static int nested_svm_intercept(struct vcpu_svm *svm)
{
	u64 exit_code = svm->vmcb->control.exit_code;
	int vmexit = NESTED_EXIT_HOST;

	if (svm_is_vmrun_failure(exit_code))
		return NESTED_EXIT_DONE;

	switch (exit_code) {
	case SVM_EXIT_MSR:
		vmexit = nested_svm_exit_handled_msr(svm);
		break;
	case SVM_EXIT_IOIO:
		vmexit = nested_svm_intercept_ioio(svm);
		break;
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f:
		/*
		 * Host-intercepted exceptions have been checked already in
		 * nested_svm_exit_special.  There is nothing to do here,
		 * the vmexit is injected by svm_check_nested_events.
		 */
		vmexit = NESTED_EXIT_DONE;
		break;
	default:
		if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
			vmexit = NESTED_EXIT_DONE;
		break;
	}

	return vmexit;
}

int nested_svm_exit_handled(struct vcpu_svm *svm)
{
	int vmexit;

	vmexit = nested_svm_intercept(svm);

	if (vmexit == NESTED_EXIT_DONE)
		nested_svm_vmexit(svm);

	return vmexit;
}

int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
{
	if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (to_svm(vcpu)->vmcb->save.cpl) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return 0;
}

static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
}

static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;

	vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;

	if (ex->has_error_code)
		vmcb->control.exit_info_1 = ex->error_code;

	/*
	 * EXITINFO2 is undefined for all exception intercepts other
	 * than #PF.
	 */
	if (ex->vector == PF_VECTOR) {
		if (ex->has_payload)
			vmcb->control.exit_info_2 = ex->payload;
		else
			vmcb->control.exit_info_2 = vcpu->arch.cr2;
	} else if (ex->vector == DB_VECTOR) {
		/*
		 * See kvm_check_and_inject_events().
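		 *
		 * The #DB payload is folded into DR6 and DR7.GD is cleared
		 * here, before the exception is reflected into vmcb12.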
		 */
		kvm_deliver_exception_payload(vcpu, ex);

		if (vcpu->arch.dr7 & DR7_GD) {
			vcpu->arch.dr7 &= ~DR7_GD;
			kvm_update_dr7(vcpu);
		}
	} else {
		WARN_ON(ex->has_payload);
	}

	nested_svm_vmexit(svm);
}

static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
	return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}

static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct vcpu_svm *svm = to_svm(vcpu);
	/*
	 * Only a pending nested run blocks a pending exception.  If there is a
	 * previously injected event, the pending exception occurred while said
	 * event was being delivered and thus needs to be handled.
	 */
	bool block_nested_exceptions = svm->nested.nested_run_pending;
	/*
	 * New events (not exceptions) are only recognized at instruction
	 * boundaries.  If an event needs reinjection, then KVM is handling a
	 * VM-Exit that occurred _during_ instruction execution; new events are
	 * blocked until the instruction completes.
	 */
	bool block_nested_events = block_nested_exceptions ||
				   kvm_event_needs_reinjection(vcpu);

	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_init(svm))
			return 0;
		nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
		return 0;
	}

	if (vcpu->arch.exception_vmexit.pending) {
		if (block_nested_exceptions)
			return -EBUSY;
		nested_svm_inject_exception_vmexit(vcpu);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		if (block_nested_exceptions)
			return -EBUSY;
		return 0;
	}

#ifdef CONFIG_KVM_SMM
	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_smi(svm))
			return 0;
		nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
		return 0;
	}
#endif

	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(svm))
			return 0;
		nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_intr(svm))
			return 0;
		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
		nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
		return 0;
	}

	return 0;
}

int nested_svm_exit_special(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	switch (exit_code) {
	case SVM_EXIT_INTR:
	case SVM_EXIT_NMI:
	case SVM_EXIT_NPF:
		return NESTED_EXIT_HOST;
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);

		if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
		    excp_bits)
			return NESTED_EXIT_HOST;
		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
			 svm->vcpu.arch.apf.host_apf_flags)
			/* Trap async PF even if not shadowing */
			return NESTED_EXIT_HOST;
		break;
	}
	case SVM_EXIT_VMMCALL:
		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
		if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
		    nested_svm_l2_tlb_flush_enabled(vcpu) &&
		    kvm_hv_is_tlb_flush_hcall(vcpu))
			return NESTED_EXIT_HOST;
		break;
	default:
		break;
	}

	return NESTED_EXIT_CONTINUE;
}

void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	vcpu->arch.tsc_scaling_ratio =
		kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
					       svm->tsc_ratio_msr);
	svm_write_tsc_multiplier(vcpu);
}

/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
					      struct vmcb_ctrl_area_cached *from)
{
	unsigned int i;

	memset(dst, 0, sizeof(struct vmcb_control_area));

	for (i = 0; i < MAX_INTERCEPT; i++)
		dst->intercepts[i] = from->intercepts[i];

	dst->iopm_base_pa = from->iopm_base_pa;
	dst->msrpm_base_pa = from->msrpm_base_pa;
	dst->tsc_offset = from->tsc_offset;
	dst->asid = from->asid;
	dst->tlb_ctl = from->tlb_ctl;
	dst->erap_ctl = from->erap_ctl;
	dst->int_ctl = from->int_ctl;
	dst->int_vector = from->int_vector;
	dst->int_state = from->int_state;
	dst->exit_code = from->exit_code;
	dst->exit_info_1 = from->exit_info_1;
	dst->exit_info_2 = from->exit_info_2;
	dst->exit_int_info = from->exit_int_info;
	dst->exit_int_info_err = from->exit_int_info_err;
	dst->nested_ctl = from->nested_ctl;
	dst->event_inj = from->event_inj;
	dst->event_inj_err = from->event_inj_err;
	dst->next_rip = from->next_rip;
	dst->nested_cr3 = from->nested_cr3;
	dst->virt_ext = from->virt_ext;
	dst->pause_filter_count = from->pause_filter_count;
	dst->pause_filter_thresh = from->pause_filter_thresh;
	/* 'clean' and 'hv_enlightenments' are not changed by KVM */
}

static int svm_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_svm *svm;
	struct vmcb_control_area *ctl;
	unsigned long r;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_SVM,
		.size = sizeof(kvm_state),
	};
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];

	if (!vcpu)
		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;

	svm = to_svm(vcpu);

	if (user_data_size < kvm_state.size)
		goto out;

	/*
	 * First fill in the header and copy it out.
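	 * The vmcb12 control and save payload is appended below only when
	 * L2 is currently active.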
	 */
	if (is_guest_mode(vcpu)) {
		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

		if (svm->nested.nested_run_pending)
			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
	}

	if (gif_set(svm))
		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!is_guest_mode(vcpu))
		goto out;

	/*
	 * Copy over the full size of the VMCB rather than just the size
	 * of the structs.
	 */
	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
		return -EFAULT;

	ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
	if (!ctl)
		return -ENOMEM;

	nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
	r = copy_to_user(&user_vmcb->control, ctl,
			 sizeof(user_vmcb->control));
	kfree(ctl);
	if (r)
		return -EFAULT;

	if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
			 sizeof(user_vmcb->save)))
		return -EFAULT;
out:
	return kvm_state.size;
}

static int svm_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];
	struct vmcb_control_area *ctl;
	struct vmcb_save_area *save;
	struct vmcb_save_area_cached save_cached;
	struct vmcb_ctrl_area_cached ctl_cached;
	unsigned long cr0;
	int ret;

	BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
		     KVM_STATE_NESTED_SVM_VMCB_SIZE);

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
		return -EINVAL;

	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
				 KVM_STATE_NESTED_RUN_PENDING |
				 KVM_STATE_NESTED_GIF_SET))
		return -EINVAL;

	/*
	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
	 * If SVME is disabled, the only valid states are "none" and GIF=1
	 * (clearing SVME does NOT set GIF, i.e. GIF=0 is allowed).
	 */
	if (!(vcpu->arch.efer & EFER_SVME) && kvm_state->flags &&
	    kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
		return -EINVAL;

	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
		svm_leave_nested(vcpu);
		svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
		return 0;
	}

	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
		return -EINVAL;
	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
		return -EINVAL;

	ctl = memdup_user(&user_vmcb->control, sizeof(*ctl));
	if (IS_ERR(ctl))
		return PTR_ERR(ctl);

	save = memdup_user(&user_vmcb->save, sizeof(*save));
	if (IS_ERR(save)) {
		kfree(ctl);
		return PTR_ERR(save);
	}

	ret = -EINVAL;
	__nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
	if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
		goto out_free;

	/*
	 * Processor state contains L2 state.
	 * Check that it is valid for guest mode (see nested_vmcb_check_save).
	 */
	cr0 = kvm_read_cr0(vcpu);
	if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
		goto out_free;

	/*
	 * Validate host state saved from before VMRUN (see
	 * nested_svm_check_permissions).
	 */
	__nested_copy_vmcb_save_to_cache(&save_cached, save);
	if (!(save->cr0 & X86_CR0_PG) ||
	    !(save->cr0 & X86_CR0_PE) ||
	    (save->rflags & X86_EFLAGS_VM) ||
	    !__nested_vmcb_check_save(vcpu, &save_cached))
		goto out_free;

	/*
	 * All checks done, we can enter guest mode.  Userspace provides
	 * vmcb12.control, which will be combined with L1 and stored into
	 * vmcb02, and the L1 save state which we store in vmcb01.
	 * L2 registers if needed are moved from the current VMCB to VMCB02.
	 */

	if (is_guest_mode(vcpu))
		svm_leave_nested(vcpu);
	else
		svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;

	svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));

	svm->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;

	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
	nested_copy_vmcb_control_to_cache(svm, ctl);

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);

	/*
	 * While the nested guest CR3 is already checked and set by
	 * KVM_SET_SREGS, it was set before the nested state was loaded, thus
	 * the MMU might not be initialized correctly.  Set it again to fix
	 * this.
	 */
	ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
				  nested_npt_enabled(svm), false);
	if (ret)
		goto out_free;

	svm->nested.force_msr_bitmap_recalc = true;

	kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	ret = 0;
out_free:
	kfree(save);
	kfree(ctl);

	return ret;
}

static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
	if (WARN_ON(!is_guest_mode(vcpu)))
		return true;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
		/*
		 * Reload the guest's PDPTRs since after a migration
		 * the guest CR3 might be restored prior to setting the nested
		 * state which can lead to a load of wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
			return false;

	if (!nested_svm_merge_msrpm(vcpu)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;
		return false;
	}

	if (kvm_hv_verify_vp_assist(vcpu))
		return false;

	return true;
}

struct kvm_x86_nested_ops svm_nested_ops = {
	.leave_nested = svm_leave_nested,
	.is_exception_vmexit = nested_svm_is_exception_vmexit,
	.check_events = svm_check_nested_events,
	.triple_fault = nested_svm_triple_fault,
	.get_nested_state_pages = svm_get_nested_state_pages,
	.get_state = svm_get_nested_state,
	.set_state = svm_set_nested_state,
	.hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
};