// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>

#include <asm/irq_remapping.h>
#include <asm/msr.h>

#include "trace.h"
#include "lapic.h"
#include "x86.h"
#include "irq.h"
#include "svm.h"

/*
 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
 * be delivered, e.g. because the vCPU isn't running.  Use the vCPU's index
 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
 * lookup on the index, whereas vCPUs whose index doesn't match their ID need
 * to walk the entire xarray of vCPUs in the worst case scenario.
 *
 * For the vCPU index, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_IDX_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUIDX(x)	(x & AVIC_VCPU_IDX_MASK)

#define __AVIC_GATAG(vm_id, vcpu_idx)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
					 ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
#define AVIC_GATAG(vm_id, vcpu_idx)					\
({									\
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx);			\
									\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx));	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
	ga_tag;								\
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);

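/*
 * Illustrative GATag layout (a sketch only; the exact split depends on how
 * AVIC_PHYSICAL_MAX_INDEX_MASK is defined in asm/svm.h).  Assuming a 9-bit
 * physical max index mask, AVIC_VM_ID_SHIFT is 9 and the 32-bit GATag is:
 *
 *    31                      9 8        0
 *   +-------------------------+----------+
 *   |          VM ID          | vCPU idx |
 *   +-------------------------+----------+
 *
 * e.g. AVIC_GATAG(vm_id=0x2a, vcpu_idx=3) == (0x2a << 9) | 3 == 0x5403.
 */
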
#define AVIC_AUTO_MODE	-1

static int avic_param_set(const char *val, const struct kernel_param *kp)
{
	if (val && sysfs_streq(val, "auto")) {
		*(int *)kp->arg = AVIC_AUTO_MODE;
		return 0;
	}

	return param_set_bint(val, kp);
}

static const struct kernel_param_ops avic_ops = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = avic_param_set,
	.get = param_get_bool,
};

/*
 * Enable / disable AVIC.  In "auto" mode (default behavior), AVIC is enabled
 * for Zen4+ CPUs with x2AVIC, provided all other enablement criteria are met.
 */
static int avic = AVIC_AUTO_MODE;
module_param_cb(avic, &avic_ops, &avic, 0444);
__MODULE_PARM_TYPE(avic, "bool");

module_param(enable_ipiv, bool, 0444);

static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);

/*
 * This hash table is used to map a VM ID to a struct kvm_svm when handling an
 * AMD IOMMU GALog notification, in order to schedule in a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
static bool x2avic_enabled;


static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
					     bool intercept)
{
	static const u32 x2avic_passthrough_msrs[] = {
		X2APIC_MSR(APIC_ID),
		X2APIC_MSR(APIC_LVR),
		X2APIC_MSR(APIC_TASKPRI),
		X2APIC_MSR(APIC_ARBPRI),
		X2APIC_MSR(APIC_PROCPRI),
		X2APIC_MSR(APIC_EOI),
		X2APIC_MSR(APIC_RRR),
		X2APIC_MSR(APIC_LDR),
		X2APIC_MSR(APIC_DFR),
		X2APIC_MSR(APIC_SPIV),
		X2APIC_MSR(APIC_ISR),
		X2APIC_MSR(APIC_TMR),
		X2APIC_MSR(APIC_IRR),
		X2APIC_MSR(APIC_ESR),
		X2APIC_MSR(APIC_ICR),
		X2APIC_MSR(APIC_ICR2),

		/*
		 * Note!  Always intercept LVTT, as TSC-deadline timer mode
		 * isn't virtualized by hardware, and the CPU will generate a
		 * #GP instead of a #VMEXIT.
		 */
		X2APIC_MSR(APIC_LVTTHMR),
		X2APIC_MSR(APIC_LVTPC),
		X2APIC_MSR(APIC_LVT0),
		X2APIC_MSR(APIC_LVT1),
		X2APIC_MSR(APIC_LVTERR),
		X2APIC_MSR(APIC_TMICT),
		X2APIC_MSR(APIC_TMCCT),
		X2APIC_MSR(APIC_TDCR),
	};
	int i;

	if (intercept == svm->x2avic_msrs_intercepted)
		return;

	if (!x2avic_enabled)
		return;

	for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
		svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
					  MSR_TYPE_RW, intercept);

	svm->x2avic_msrs_intercepted = intercept;
}

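/*
 * Reflect AVIC activation in VMCB01: when the vCPU is in x2APIC mode and
 * x2AVIC is supported, enable x2AVIC, size the physical ID table for x2AVIC,
 * and pass through the x2APIC MSRs; otherwise fall back to (hybrid-)xAVIC
 * with the x2APIC MSRs intercepted.
 */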
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;

	/*
	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
	 * accesses, while interrupt injection to a running vCPU can be
	 * achieved using the AVIC doorbell.  KVM disables the APIC access page
	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
	 * AVIC in hybrid mode activates only the doorbell mechanism.
	 */
	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
		/* Disable MSR intercepts for x2APIC registers */
		avic_set_x2apic_msr_interception(svm, false);
	} else {
		/*
		 * Flush the TLB, the guest may have inserted a non-APIC
		 * mapping into the TLB while AVIC was disabled.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* For xAVIC and hybrid-xAVIC modes */
		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
		/* Enable MSR intercepts for x2APIC registers */
		avic_set_x2apic_msr_interception(svm, true);
	}
}

static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	/*
	 * If running nested and the guest uses its own MSR bitmap, there
	 * is no need to update L0's MSR bitmap.
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enable MSR intercepts for x2APIC registers */
	avic_set_x2apic_msr_interception(svm, true);
}

/*
 * Note: this function is called from the IOMMU driver to notify SVM to
 * schedule in a particular vCPU of a particular VM.
 */
int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
	trace_kvm_avic_ga_log(vm_id, vcpu_idx);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/*
	 * At this point, the IOMMU should have already set the pending bit in
	 * the vAPIC backing page.  So, we just need to schedule in the vCPU.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}

void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	free_page((unsigned long)kvm_svm->avic_logical_id_table);
	free_page((unsigned long)kvm_svm->avic_physical_id_table);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}

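/*
 * Allocate the per-VM AVIC tables (physical and logical APIC ID tables) and
 * assign the VM a non-zero AVIC VM ID, which is embedded in GATags and used
 * to find the VM when a GALog notification arrives.
 */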
int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!kvm_svm->avic_physical_id_table)
		goto free_avic;

	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!kvm_svm->avic_logical_id_table)
		goto free_avic;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use?  Only possible if wrapped at least once. */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
{
	return __sme_set(__pa(svm->vcpu.arch.apic->regs));
}

void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);

	vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
	vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;

	if (kvm_apicv_activated(svm->vcpu.kvm))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 id = vcpu->vcpu_id;
	u64 new_entry;

	/*
	 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
	 * hardware.  Immediately clear apicv_active, i.e. don't wait until the
	 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
	 * avic_vcpu_load() expects to be called if and only if the vCPU has
	 * fully initialized AVIC.
	 */
	if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
	    (id > X2AVIC_MAX_PHYSICAL_ID)) {
		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
		vcpu->arch.apic->apicv_active = false;
		return 0;
	}

	BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
		     (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);

	if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses the address in the
		 * AVIC_BACKING_PAGE pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	/* Note, fls64() returns the bit position, +1. */
	BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
		     fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));

	/* Set the AVIC backing page address in the physical APIC ID table. */
	new_entry = avic_get_backing_page_address(svm) |
		    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
	svm->avic_physical_id_entry = new_entry;

	/*
	 * Initialize the real table, as vCPUs must have a valid entry in order
	 * for broadcast IPIs to function correctly (broadcast IPIs ignore
	 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
	 */
	WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);

	return 0;
}

void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU.  But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}


static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}

static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
	 * i.e. APIC ID == vCPU ID.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}

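/*
 * Illustrative example (not from the original source): for an x2APIC IPI to
 * logical cluster 2 with destination bitmask 0x3, the fast path below passes
 * logid_index values (2 << 4) + 0 = 32 and (2 << 4) + 1 = 33.  For x2APIC,
 * those indices are also the target x2APIC IDs, as the read-only logical ID
 * is derived directly from the x2APIC ID; for xAPIC, the logical ID table is
 * consulted instead to recover the guest physical APIC ID.
 */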
static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index).  Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}

/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	u32 dest;

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = kvm_svm->avic_logical_id_table;

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}

static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event.  There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}

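/*
 * Handle an AVIC "incomplete IPI" #VMEXIT.  As consumed below, exit_info_1
 * holds the ICR value being written (low and high halves) and exit_info_2
 * holds the failure reason (bits 63:32) plus the index of the offending
 * destination entry.
 */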
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, i.e. ICR holds the correct value and RIP
		 * has been advanced; KVM is responsible only for emulating the
		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
		 * in which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vCPUs.  So, we just need to kick the appropriate vCPU.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
		/* Invalid IPI with vector < 16 */
		break;
	default:
		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
	}

	return 1;
}

unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
	return 0;
}

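/*
 * Locate the logical ID table entry for the given LDR.  In flat mode the
 * 8-bit logical ID indexes the table directly via its (single) set bit; in
 * cluster mode the entry is at (cluster << 2) + ffs(mask).  Worked example
 * (illustrative): cluster-mode logical ID 0x21 is cluster 2 with mask 0x1,
 * and thus maps to index (2 << 2) + 0 = 8.
 */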
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	return &kvm_svm->avic_logical_id_table[index];
}

static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}

static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use logical APIC ID table */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}

static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}

static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}

static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

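/*
 * Offsets in the list below are reported as "traps": the write has already
 * been committed to the vAPIC backing page and RIP has been advanced, so KVM
 * only needs to react to the new value (see avic_unaccel_trap_write()).
 * Accesses to any other offset arrive as faults and must be fully emulated.
 */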
static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	bool ret = false;

	switch (offset) {
	case APIC_ID:
	case APIC_EOI:
	case APIC_RRR:
	case APIC_LDR:
	case APIC_DFR:
	case APIC_SPIV:
	case APIC_ESR:
	case APIC_ICR:
	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_TMICT:
	case APIC_TDCR:
		ret = true;
		break;
	default:
		break;
	}
	return ret;
}

int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret = 0;
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handling Trap */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handling Fault */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}

int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	INIT_LIST_HEAD(&svm->ir_list);
	spin_lock_init(&svm->ir_list_lock);

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}

void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}

static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
	unsigned long flags;

	if (!vcpu)
		return;

	spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
	list_del(&irqfd->vcpu_list);
	spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}

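/*
 * (Re)configure the IOMMU IRTE for a posted interrupt when an irqfd is bound
 * to, moved between, or unbound from a vCPU: tear down any stale metadata on
 * the previous vCPU, then either program guest-mode posting for the new vCPU
 * or fall back to legacy remapped delivery.
 */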
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
			unsigned int host_irq, uint32_t guest_irq,
			struct kvm_vcpu *vcpu, u32 vector)
{
	/*
	 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
	 * from the *previous* vCPU's list.
	 */
	svm_ir_list_del(irqfd);

	if (vcpu) {
		/*
		 * Try to enable guest_mode in the IRTE, unless AVIC is
		 * inhibited, in which case configure the IRTE for legacy mode,
		 * but track the IRTE metadata so that it can be converted to
		 * guest mode if AVIC is enabled/uninhibited in the future.
		 */
		struct amd_iommu_pi_data pi_data = {
			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					     vcpu->vcpu_idx),
			.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
			.vector = vector,
		};
		struct vcpu_svm *svm = to_svm(vcpu);
		u64 entry;
		int ret;

		/*
		 * Prevent the vCPU from being scheduled out or migrated until
		 * the IRTE is updated and its metadata has been added to the
		 * list of IRQs being posted to the vCPU, to ensure the IRTE
		 * isn't programmed with stale pCPU/IsRunning information.
		 */
		guard(spinlock_irqsave)(&svm->ir_list_lock);

		/*
		 * Update the target pCPU for IOMMU doorbells if the vCPU is
		 * running.  If the vCPU is NOT running, i.e. is blocking or
		 * scheduled out, KVM will update the pCPU info when the vCPU
		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
		 */
		entry = svm->avic_physical_id_entry;
		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
		} else {
			pi_data.cpu = -1;
			pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
		}

		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
		if (ret)
			return ret;

		/*
		 * Revert to legacy mode if the IOMMU didn't provide metadata
		 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
		 * e.g. if the vCPU is migrated or AVIC is disabled.
		 */
		if (WARN_ON_ONCE(!pi_data.ir_data)) {
			irq_set_vcpu_affinity(host_irq, NULL);
			return -EIO;
		}

		irqfd->irq_bypass_data = pi_data.ir_data;
		list_add(&irqfd->vcpu_list, &svm->ir_list);
		return 0;
	}
	return irq_set_vcpu_affinity(host_irq, NULL);
}

enum avic_vcpu_action {
	/*
	 * There is no need to differentiate between activate and deactivate,
	 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
	 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
	 * being (de)activated.
	 */
	AVIC_TOGGLE_ON_OFF	= BIT(0),
	AVIC_ACTIVATE		= AVIC_TOGGLE_ON_OFF,
	AVIC_DEACTIVATE		= AVIC_TOGGLE_ON_OFF,

	/*
	 * No unique action is required to deal with a vCPU that stops/starts
	 * running.  A vCPU that starts running by definition stops blocking as
	 * well, and a vCPU that stops running can't have been blocking, i.e.
	 * doesn't need to toggle GALogIntr.
	 */
	AVIC_START_RUNNING	= 0,
	AVIC_STOP_RUNNING	= 0,

	/*
	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
	 * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
	 * sent to the vCPU.
	 */
	AVIC_START_BLOCKING	= BIT(1),
};

static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
					    enum avic_vcpu_action action)
{
	bool ga_log_intr = (action & AVIC_START_BLOCKING);
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_kernel_irqfd *irqfd;

	lockdep_assert_held(&svm->ir_list_lock);

	/*
	 * Here, we go through the per-vCPU ir_list to update all existing
	 * interrupt remapping table entries targeting this vCPU.
	 */
	if (list_empty(&svm->ir_list))
		return;

	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
		void *data = irqfd->irq_bypass_data;

		if (!(action & AVIC_TOGGLE_ON_OFF))
			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
		else if (cpu >= 0)
			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
		else
			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
	}
}

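/*
 * Mark the vCPU as running on @cpu: stamp the host physical APIC ID and set
 * IsRunning in the vCPU's cached physical ID entry (IsRunning is propagated
 * to the real table only when IPI virtualization is enabled), and retarget
 * any posted-interrupt IRTEs at the new pCPU.
 */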
static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
			     enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
		return;

	/*
	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
	 * _currently_ have assigned devices, as that can change.  Holding
	 * ir_list_lock ensures that either svm_ir_list_add() will consume
	 * up-to-date entry information, or that this task will wait until
	 * svm_ir_list_add() completes to set the new target pCPU.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	entry = svm->avic_physical_id_entry;
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	svm->avic_physical_id_entry = entry;

	/*
	 * If IPI virtualization is disabled, clear IsRunning when updating the
	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
	 * things going sideways if hardware peeks at the ID.
	 */
	if (!enable_ipiv)
		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted.  The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
}

static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry = svm->avic_physical_id_entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
		return;

	/*
	 * Take and hold the per-vCPU interrupt remapping lock while updating
	 * the Physical ID entry even though the lock doesn't protect against
	 * multiple writers (see above).  Holding ir_list_lock ensures that
	 * either svm_ir_list_add() will consume up-to-date entry information,
	 * or that this task will wait until svm_ir_list_add() completes to
	 * mark the vCPU as not running.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	avic_update_iommu_vcpu_affinity(vcpu, -1, action);

	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);

	/*
	 * Keep the previous APIC ID in the entry so that a rogue doorbell from
	 * hardware is at least restricted to a CPU associated with the vCPU.
	 */
	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	if (enable_ipiv)
		WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	/*
	 * Note!  Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
	 * it's a synthetic flag that usurps an unused should-be-zero bit.
	 */
	if (action & AVIC_START_BLOCKING)
		entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;

	svm->avic_physical_id_entry = entry;

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
	 * recursively.
	 */
	u64 entry = to_svm(vcpu)->avic_physical_id_entry;

	/*
	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
	 * vCPU is preempted while it's in the process of blocking.  WARN if
	 * the vCPU wasn't running and isn't blocking, as KVM shouldn't attempt
	 * to put the AVIC if it wasn't previously loaded.
	 */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
		if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
			return;

		/*
		 * The vCPU was preempted while blocking, ensure its IRTEs are
		 * configured to generate GA Log Interrupts.
		 */
		if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
			return;
	}

	__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
							   AVIC_STOP_RUNNING);
}

void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	if (!lapic_in_kernel(vcpu) || !enable_apicv)
		return;

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * During AVIC temporary deactivation, the guest could update
		 * the APIC ID, DFR and LDR registers, which would not be
		 * trapped by avic_unaccelerated_access_interception().  In
		 * this case, we need to check and update the AVIC logical
		 * APIC ID table accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		avic_activate_vmcb(svm);
	} else {
		avic_deactivate_vmcb(svm);
	}
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
}

void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	if (!enable_apicv)
		return;

	/* APICv should only be toggled on/off while the vCPU is running. */
	WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));

	avic_refresh_virtual_apic_mode(vcpu);

	if (kvm_vcpu_apicv_active(vcpu))
		__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
	else
		__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}

void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
	 * actually blocks.
	 *
	 * Note, any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
	 * this by checking vIRR one last time before blocking.  The memory
	 * barrier implicit in set_current_state orders writing IsRunning=0
	 * before reading the vIRR.  The processor needs a matching memory
	 * barrier on interrupt delivery between writing IRR and reading
	 * IsRunning; the lack of such a barrier might be the cause of
	 * erratum #1235.
	 *
	 * Clear IsRunning even if guest IRQs are disabled, i.e. even if KVM
	 * doesn't need to detect events for scheduling purposes.  The doorbell
	 * used to signal running vCPUs cannot be blocked, i.e. will perturb
	 * the CPU and cause noisy neighbor problems if the VM is sending
	 * interrupts to the vCPU while it's scheduled out.
	 */
	__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}

void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	avic_vcpu_load(vcpu, vcpu->cpu);
}

static bool __init avic_want_avic_enabled(void)
{
	/*
	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
	 * supported (to avoid enabling partial support by default, and because
	 * x2AVIC should be supported by all Zen4+ CPUs).  Explicitly check for
	 * families newer than 0x19, i.e. Zen5+, as the kernel's synthetic ZenX
	 * flags aren't inclusive of previous generations, i.e. the kernel will
	 * set at most one ZenX feature flag.
	 */
	if (avic == AVIC_AUTO_MODE)
		avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
		       (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4));

	if (!avic || !npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC))
			pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
		return false;
	}

	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
	    !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
		pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
		return false;
	}

	/*
	 * Print a scary message if AVIC is force enabled to make it abundantly
	 * clear that ignoring CPUID could have repercussions.  See the
	 * Revision Guide for the specific AMD processor for more details.
	 */
	if (!boot_cpu_has(X86_FEATURE_AVIC))
		pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");

	return true;
}

/*
 * Note:
 * - The module param avic enables both xAPIC and x2APIC modes.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool __init avic_hardware_setup(void)
{
	avic = avic_want_avic_enabled();
	if (!avic)
		return false;

	pr_info("AVIC enabled\n");

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled)
		pr_info("x2AVIC enabled\n");
	else
		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;

	/*
	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
	 * due to erratum 1235, which results in missed VM-Exits on the sender
	 * and thus missed wake events for blocking vCPUs due to the CPU
	 * failing to see a software update to clear IsRunning.
	 */
	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}