1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * AMD SVM support 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * Avi Kivity <avi@qumranet.com> 13 */ 14 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/kvm_types.h> 18 #include <linux/hashtable.h> 19 #include <linux/amd-iommu.h> 20 #include <linux/kvm_host.h> 21 #include <linux/kvm_irqfd.h> 22 #include <linux/sysfs.h> 23 24 #include <asm/irq_remapping.h> 25 #include <asm/msr.h> 26 27 #include "trace.h" 28 #include "lapic.h" 29 #include "x86.h" 30 #include "irq.h" 31 #include "svm.h" 32 33 /* 34 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that 35 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't 36 * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index 37 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast 38 * lookup on the index, where as vCPUs whose index doesn't match their ID need 39 * to walk the entire xarray of vCPUs in the worst case scenario. 40 * 41 * For the vCPU index, use however many bits are currently allowed for the max 42 * guest physical APIC ID (limited by the size of the physical ID table), and 43 * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the 44 * size of the GATag is defined by hardware (32 bits), but is an opaque value 45 * as far as hardware is concerned. 46 */ 47 #define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK 48 49 #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) 50 #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) 51 52 #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) 53 #define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK) 54 55 #define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ 56 ((vcpu_idx) & AVIC_VCPU_IDX_MASK)) 57 #define AVIC_GATAG(vm_id, vcpu_idx) \ 58 ({ \ 59 u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \ 60 \ 61 WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \ 62 WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ 63 ga_tag; \ 64 }) 65 66 static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); 67 68 #define AVIC_AUTO_MODE -1 69 70 static int avic_param_set(const char *val, const struct kernel_param *kp) 71 { 72 if (val && sysfs_streq(val, "auto")) { 73 *(int *)kp->arg = AVIC_AUTO_MODE; 74 return 0; 75 } 76 77 return param_set_bint(val, kp); 78 } 79 80 static int avic_param_get(char *buffer, const struct kernel_param *kp) 81 { 82 int val = *(int *)kp->arg; 83 84 if (val == AVIC_AUTO_MODE) 85 return sysfs_emit(buffer, "N\n"); 86 87 return param_get_bool(buffer, kp); 88 } 89 90 static const struct kernel_param_ops avic_ops = { 91 .flags = KERNEL_PARAM_OPS_FL_NOARG, 92 .set = avic_param_set, 93 .get = avic_param_get, 94 }; 95 96 /* 97 * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled 98 * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met). 99 */ 100 static int __ro_after_init avic = AVIC_AUTO_MODE; 101 module_param_cb(avic, &avic_ops, &avic, 0444); 102 __MODULE_PARM_TYPE(avic, "bool"); 103 104 module_param(enable_ipiv, bool, 0444); 105 106 static bool __ro_after_init force_avic; 107 module_param_unsafe(force_avic, bool, 0444); 108 109 /* Note: 110 * This hash table is used to map VM_ID to a struct kvm_svm, 111 * when handling AMD IOMMU GALOG notification to schedule in 112 * a particular vCPU. 113 */ 114 #define SVM_VM_DATA_HASH_BITS 8 115 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 116 static u32 next_vm_id = 0; 117 static bool next_vm_id_wrapped = 0; 118 static DEFINE_SPINLOCK(svm_vm_data_hash_lock); 119 static bool x2avic_enabled; 120 static u32 x2avic_max_physical_id; 121 122 static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, 123 bool intercept) 124 { 125 static const u32 x2avic_passthrough_msrs[] = { 126 X2APIC_MSR(APIC_ID), 127 X2APIC_MSR(APIC_LVR), 128 X2APIC_MSR(APIC_TASKPRI), 129 X2APIC_MSR(APIC_ARBPRI), 130 X2APIC_MSR(APIC_PROCPRI), 131 X2APIC_MSR(APIC_EOI), 132 X2APIC_MSR(APIC_RRR), 133 X2APIC_MSR(APIC_LDR), 134 X2APIC_MSR(APIC_DFR), 135 X2APIC_MSR(APIC_SPIV), 136 X2APIC_MSR(APIC_ISR), 137 X2APIC_MSR(APIC_TMR), 138 X2APIC_MSR(APIC_IRR), 139 X2APIC_MSR(APIC_ESR), 140 X2APIC_MSR(APIC_ICR), 141 X2APIC_MSR(APIC_ICR2), 142 143 /* 144 * Note! Always intercept LVTT, as TSC-deadline timer mode 145 * isn't virtualized by hardware, and the CPU will generate a 146 * #GP instead of a #VMEXIT. 147 */ 148 X2APIC_MSR(APIC_LVTTHMR), 149 X2APIC_MSR(APIC_LVTPC), 150 X2APIC_MSR(APIC_LVT0), 151 X2APIC_MSR(APIC_LVT1), 152 X2APIC_MSR(APIC_LVTERR), 153 X2APIC_MSR(APIC_TMICT), 154 X2APIC_MSR(APIC_TMCCT), 155 X2APIC_MSR(APIC_TDCR), 156 }; 157 int i; 158 159 if (intercept == svm->x2avic_msrs_intercepted) 160 return; 161 162 if (!x2avic_enabled) 163 return; 164 165 for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) 166 svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], 167 MSR_TYPE_RW, intercept); 168 169 svm->x2avic_msrs_intercepted = intercept; 170 } 171 172 static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu) 173 { 174 u32 arch_max; 175 176 /* 177 * Return the largest size (x2APIC) when querying without a vCPU, e.g. 178 * to allocate the per-VM table.. 179 */ 180 if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic))) 181 arch_max = x2avic_max_physical_id; 182 else 183 arch_max = AVIC_MAX_PHYSICAL_ID; 184 185 /* 186 * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID 187 * plus one, so the max possible APIC ID is one less than that. 188 */ 189 return min(kvm->arch.max_vcpu_ids - 1, arch_max); 190 } 191 192 static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) 193 { 194 return __avic_get_max_physical_id(vcpu->kvm, vcpu); 195 } 196 197 static void avic_activate_vmcb(struct vcpu_svm *svm) 198 { 199 struct vmcb *vmcb = svm->vmcb01.ptr; 200 struct kvm_vcpu *vcpu = &svm->vcpu; 201 202 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); 203 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; 204 vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu); 205 vmcb->control.int_ctl |= AVIC_ENABLE_MASK; 206 207 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 208 209 /* 210 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR 211 * accesses, while interrupt injection to a running vCPU can be 212 * achieved using AVIC doorbell. KVM disables the APIC access page 213 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling 214 * AVIC in hybrid mode activates only the doorbell mechanism. 215 */ 216 if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { 217 vmcb->control.int_ctl |= X2APIC_MODE_MASK; 218 219 /* Disabling MSR intercept for x2APIC registers */ 220 avic_set_x2apic_msr_interception(svm, false); 221 } else { 222 /* 223 * Flush the TLB, the guest may have inserted a non-APIC 224 * mapping into the TLB while AVIC was disabled. 225 */ 226 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); 227 228 /* Enabling MSR intercept for x2APIC registers */ 229 avic_set_x2apic_msr_interception(svm, true); 230 } 231 } 232 233 static void avic_deactivate_vmcb(struct vcpu_svm *svm) 234 { 235 struct vmcb *vmcb = svm->vmcb01.ptr; 236 237 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); 238 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; 239 240 if (!is_sev_es_guest(&svm->vcpu)) 241 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 242 243 /* 244 * If running nested and the guest uses its own MSR bitmap, there 245 * is no need to update L0's msr bitmap 246 */ 247 if (is_guest_mode(&svm->vcpu) && 248 vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) 249 return; 250 251 /* Enabling MSR intercept for x2APIC registers */ 252 avic_set_x2apic_msr_interception(svm, true); 253 } 254 255 /* Note: 256 * This function is called from IOMMU driver to notify 257 * SVM to schedule in a particular vCPU of a particular VM. 258 */ 259 static int avic_ga_log_notifier(u32 ga_tag) 260 { 261 unsigned long flags; 262 struct kvm_svm *kvm_svm; 263 struct kvm_vcpu *vcpu = NULL; 264 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); 265 u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag); 266 267 pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx); 268 trace_kvm_avic_ga_log(vm_id, vcpu_idx); 269 270 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 271 hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { 272 if (kvm_svm->avic_vm_id != vm_id) 273 continue; 274 vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx); 275 break; 276 } 277 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 278 279 /* Note: 280 * At this point, the IOMMU should have already set the pending 281 * bit in the vAPIC backing page. So, we just need to schedule 282 * in the vcpu. 283 */ 284 if (vcpu) 285 kvm_vcpu_wake_up(vcpu); 286 287 return 0; 288 } 289 290 static int avic_get_physical_id_table_order(struct kvm *kvm) 291 { 292 /* Provision for the maximum physical ID supported in x2avic mode */ 293 return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64)); 294 } 295 296 int avic_alloc_physical_id_table(struct kvm *kvm) 297 { 298 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 299 300 if (!irqchip_in_kernel(kvm) || !enable_apicv) 301 return 0; 302 303 if (kvm_svm->avic_physical_id_table) 304 return 0; 305 306 kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 307 avic_get_physical_id_table_order(kvm)); 308 if (!kvm_svm->avic_physical_id_table) 309 return -ENOMEM; 310 311 return 0; 312 } 313 314 void avic_vm_destroy(struct kvm *kvm) 315 { 316 unsigned long flags; 317 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 318 319 if (!enable_apicv) 320 return; 321 322 free_page((unsigned long)kvm_svm->avic_logical_id_table); 323 free_pages((unsigned long)kvm_svm->avic_physical_id_table, 324 avic_get_physical_id_table_order(kvm)); 325 326 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 327 hash_del(&kvm_svm->hnode); 328 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 329 } 330 331 int avic_vm_init(struct kvm *kvm) 332 { 333 unsigned long flags; 334 int err = -ENOMEM; 335 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 336 struct kvm_svm *k2; 337 u32 vm_id; 338 339 if (!enable_apicv) 340 return 0; 341 342 kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 343 if (!kvm_svm->avic_logical_id_table) 344 goto free_avic; 345 346 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 347 again: 348 vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; 349 if (vm_id == 0) { /* id is 1-based, zero is not okay */ 350 next_vm_id_wrapped = 1; 351 goto again; 352 } 353 /* Is it still in use? Only possible if wrapped at least once */ 354 if (next_vm_id_wrapped) { 355 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) { 356 if (k2->avic_vm_id == vm_id) 357 goto again; 358 } 359 } 360 kvm_svm->avic_vm_id = vm_id; 361 hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id); 362 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 363 364 return 0; 365 366 free_avic: 367 avic_vm_destroy(kvm); 368 return err; 369 } 370 371 static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) 372 { 373 return __sme_set(__pa(svm->vcpu.arch.apic->regs)); 374 } 375 376 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) 377 { 378 struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); 379 380 vmcb->control.avic_backing_page = avic_get_backing_page_address(svm); 381 vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table)); 382 vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table)); 383 vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; 384 385 if (kvm_vcpu_apicv_active(&svm->vcpu)) 386 avic_activate_vmcb(svm); 387 else 388 avic_deactivate_vmcb(svm); 389 } 390 391 static int avic_init_backing_page(struct kvm_vcpu *vcpu) 392 { 393 u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID; 394 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 395 struct vcpu_svm *svm = to_svm(vcpu); 396 u32 id = vcpu->vcpu_id; 397 u64 new_entry; 398 399 /* 400 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC 401 * hardware. Immediately clear apicv_active, i.e. don't wait until the 402 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as 403 * avic_vcpu_load() expects to be called if and only if the vCPU has 404 * fully initialized AVIC. 405 */ 406 if (id > max_id) { 407 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); 408 vcpu->arch.apic->apicv_active = false; 409 return 0; 410 } 411 412 BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE || 413 (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE); 414 415 if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) 416 return -EINVAL; 417 418 if (kvm_apicv_activated(vcpu->kvm)) { 419 int ret; 420 421 /* 422 * Note, AVIC hardware walks the nested page table to check 423 * permissions, but does not use the SPA address specified in 424 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE 425 * pointer field of the VMCB. 426 */ 427 ret = kvm_alloc_apic_access_page(vcpu->kvm); 428 if (ret) 429 return ret; 430 } 431 432 /* Note, fls64() returns the bit position, +1. */ 433 BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > 434 fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); 435 436 /* Setting AVIC backing page address in the phy APIC ID table */ 437 new_entry = avic_get_backing_page_address(svm) | 438 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; 439 svm->avic_physical_id_entry = new_entry; 440 441 /* 442 * Initialize the real table, as vCPUs must have a valid entry in order 443 * for broadcast IPIs to function correctly (broadcast IPIs ignore 444 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit). 445 */ 446 WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); 447 448 return 0; 449 } 450 451 void avic_ring_doorbell(struct kvm_vcpu *vcpu) 452 { 453 /* 454 * Note, the vCPU could get migrated to a different pCPU at any point, 455 * which could result in signalling the wrong/previous pCPU. But if 456 * that happens the vCPU is guaranteed to do a VMRUN (after being 457 * migrated) and thus will process pending interrupts, i.e. a doorbell 458 * is not needed (and the spurious one is harmless). 459 */ 460 int cpu = READ_ONCE(vcpu->cpu); 461 462 if (cpu != get_cpu()) { 463 wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); 464 trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); 465 } 466 put_cpu(); 467 } 468 469 470 static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) 471 { 472 vcpu->arch.apic->irr_pending = true; 473 svm_complete_interrupt_delivery(vcpu, 474 icrl & APIC_MODE_MASK, 475 icrl & APIC_INT_LEVELTRIG, 476 icrl & APIC_VECTOR_MASK); 477 } 478 479 static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id, 480 u32 icrl) 481 { 482 /* 483 * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID, 484 * i.e. APIC ID == vCPU ID. 485 */ 486 struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id); 487 488 /* Once again, nothing to do if the target vCPU doesn't exist. */ 489 if (unlikely(!target_vcpu)) 490 return; 491 492 avic_kick_vcpu(target_vcpu, icrl); 493 } 494 495 static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table, 496 u32 logid_index, u32 icrl) 497 { 498 u32 physical_id; 499 500 if (avic_logical_id_table) { 501 u32 logid_entry = avic_logical_id_table[logid_index]; 502 503 /* Nothing to do if the logical destination is invalid. */ 504 if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) 505 return; 506 507 physical_id = logid_entry & 508 AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; 509 } else { 510 /* 511 * For x2APIC, the logical APIC ID is a read-only value that is 512 * derived from the x2APIC ID, thus the x2APIC ID can be found 513 * by reversing the calculation (stored in logid_index). Note, 514 * bits 31:20 of the x2APIC ID aren't propagated to the logical 515 * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS. 516 */ 517 physical_id = logid_index; 518 } 519 520 avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl); 521 } 522 523 /* 524 * A fast-path version of avic_kick_target_vcpus(), which attempts to match 525 * destination APIC ID to vCPU without looping through all vCPUs. 526 */ 527 static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, 528 u32 icrl, u32 icrh, u32 index) 529 { 530 int dest_mode = icrl & APIC_DEST_MASK; 531 int shorthand = icrl & APIC_SHORT_MASK; 532 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 533 u32 dest; 534 535 if (shorthand != APIC_DEST_NOSHORT) 536 return -EINVAL; 537 538 if (apic_x2apic_mode(source)) 539 dest = icrh; 540 else 541 dest = GET_XAPIC_DEST_FIELD(icrh); 542 543 if (dest_mode == APIC_DEST_PHYSICAL) { 544 /* broadcast destination, use slow path */ 545 if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST) 546 return -EINVAL; 547 if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) 548 return -EINVAL; 549 550 if (WARN_ON_ONCE(dest != index)) 551 return -EINVAL; 552 553 avic_kick_vcpu_by_physical_id(kvm, dest, icrl); 554 } else { 555 u32 *avic_logical_id_table; 556 unsigned long bitmap, i; 557 u32 cluster; 558 559 if (apic_x2apic_mode(source)) { 560 /* 16 bit dest mask, 16 bit cluster id */ 561 bitmap = dest & 0xFFFF; 562 cluster = (dest >> 16) << 4; 563 } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { 564 /* 8 bit dest mask*/ 565 bitmap = dest; 566 cluster = 0; 567 } else { 568 /* 4 bit desk mask, 4 bit cluster id */ 569 bitmap = dest & 0xF; 570 cluster = (dest >> 4) << 2; 571 } 572 573 /* Nothing to do if there are no destinations in the cluster. */ 574 if (unlikely(!bitmap)) 575 return 0; 576 577 if (apic_x2apic_mode(source)) 578 avic_logical_id_table = NULL; 579 else 580 avic_logical_id_table = kvm_svm->avic_logical_id_table; 581 582 /* 583 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical 584 * IDs, thus each bit in the destination is guaranteed to map 585 * to at most one vCPU. 586 */ 587 for_each_set_bit(i, &bitmap, 16) 588 avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table, 589 cluster + i, icrl); 590 } 591 592 return 0; 593 } 594 595 static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, 596 u32 icrl, u32 icrh, u32 index) 597 { 598 u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh); 599 unsigned long i; 600 struct kvm_vcpu *vcpu; 601 602 if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index)) 603 return; 604 605 trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index); 606 607 /* 608 * Wake any target vCPUs that are blocking, i.e. waiting for a wake 609 * event. There's no need to signal doorbells, as hardware has handled 610 * vCPUs that were in guest at the time of the IPI, and vCPUs that have 611 * since entered the guest will have processed pending IRQs at VMRUN. 612 */ 613 kvm_for_each_vcpu(i, vcpu, kvm) { 614 if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, 615 dest, icrl & APIC_DEST_MASK)) 616 avic_kick_vcpu(vcpu, icrl); 617 } 618 } 619 620 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) 621 { 622 struct vcpu_svm *svm = to_svm(vcpu); 623 u32 icrh = svm->vmcb->control.exit_info_1 >> 32; 624 u32 icrl = svm->vmcb->control.exit_info_1; 625 u32 id = svm->vmcb->control.exit_info_2 >> 32; 626 u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK; 627 struct kvm_lapic *apic = vcpu->arch.apic; 628 629 trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); 630 631 switch (id) { 632 case AVIC_IPI_FAILURE_INVALID_TARGET: 633 case AVIC_IPI_FAILURE_INVALID_INT_TYPE: 634 /* 635 * Emulate IPIs that are not handled by AVIC hardware, which 636 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over 637 * if _any_ targets are invalid, e.g. if the logical mode mask 638 * is a superset of running vCPUs. 639 * 640 * The exit is a trap, e.g. ICR holds the correct value and RIP 641 * has been advanced, KVM is responsible only for emulating the 642 * IPI. Sadly, hardware may sometimes leave the BUSY flag set, 643 * in which case KVM needs to emulate the ICR write as well in 644 * order to clear the BUSY flag. 645 */ 646 if (icrl & APIC_ICR_BUSY) 647 kvm_apic_write_nodecode(vcpu, APIC_ICR); 648 else 649 kvm_apic_send_ipi(apic, icrl, icrh); 650 break; 651 case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: 652 /* 653 * At this point, we expect that the AVIC HW has already 654 * set the appropriate IRR bits on the valid target 655 * vcpus. So, we just need to kick the appropriate vcpu. 656 */ 657 avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); 658 break; 659 case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: 660 WARN_ONCE(1, "Invalid backing page\n"); 661 break; 662 case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR: 663 /* Invalid IPI with vector < 16 */ 664 break; 665 default: 666 vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n"); 667 } 668 669 return 1; 670 } 671 672 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) 673 { 674 if (is_guest_mode(vcpu)) 675 return APICV_INHIBIT_REASON_NESTED; 676 return 0; 677 } 678 679 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) 680 { 681 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 682 u32 cluster, index; 683 684 ldr = GET_APIC_LOGICAL_ID(ldr); 685 686 if (flat) { 687 cluster = 0; 688 } else { 689 cluster = (ldr >> 4); 690 if (cluster >= 0xf) 691 return NULL; 692 ldr &= 0xf; 693 } 694 if (!ldr || !is_power_of_2(ldr)) 695 return NULL; 696 697 index = __ffs(ldr); 698 if (WARN_ON_ONCE(index > 7)) 699 return NULL; 700 index += (cluster << 2); 701 702 return &kvm_svm->avic_logical_id_table[index]; 703 } 704 705 static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) 706 { 707 bool flat; 708 u32 *entry, new_entry; 709 710 flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; 711 entry = avic_get_logical_id_entry(vcpu, ldr, flat); 712 if (!entry) 713 return; 714 715 new_entry = READ_ONCE(*entry); 716 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; 717 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); 718 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; 719 WRITE_ONCE(*entry, new_entry); 720 } 721 722 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) 723 { 724 struct vcpu_svm *svm = to_svm(vcpu); 725 bool flat = svm->dfr_reg == APIC_DFR_FLAT; 726 u32 *entry; 727 728 /* Note: x2AVIC does not use logical APIC ID table */ 729 if (apic_x2apic_mode(vcpu->arch.apic)) 730 return; 731 732 entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); 733 if (entry) 734 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); 735 } 736 737 static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) 738 { 739 struct vcpu_svm *svm = to_svm(vcpu); 740 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); 741 u32 id = kvm_xapic_id(vcpu->arch.apic); 742 743 /* AVIC does not support LDR update for x2APIC */ 744 if (apic_x2apic_mode(vcpu->arch.apic)) 745 return; 746 747 if (ldr == svm->ldr_reg) 748 return; 749 750 avic_invalidate_logical_id_entry(vcpu); 751 752 svm->ldr_reg = ldr; 753 avic_ldr_write(vcpu, id, ldr); 754 } 755 756 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) 757 { 758 struct vcpu_svm *svm = to_svm(vcpu); 759 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); 760 761 if (svm->dfr_reg == dfr) 762 return; 763 764 avic_invalidate_logical_id_entry(vcpu); 765 svm->dfr_reg = dfr; 766 } 767 768 static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) 769 { 770 u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 & 771 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 772 773 switch (offset) { 774 case APIC_LDR: 775 avic_handle_ldr_update(vcpu); 776 break; 777 case APIC_DFR: 778 avic_handle_dfr_update(vcpu); 779 break; 780 case APIC_RRR: 781 /* Ignore writes to Read Remote Data, it's read-only. */ 782 return 1; 783 default: 784 break; 785 } 786 787 kvm_apic_write_nodecode(vcpu, offset); 788 return 1; 789 } 790 791 static bool is_avic_unaccelerated_access_trap(u32 offset) 792 { 793 bool ret = false; 794 795 switch (offset) { 796 case APIC_ID: 797 case APIC_EOI: 798 case APIC_RRR: 799 case APIC_LDR: 800 case APIC_DFR: 801 case APIC_SPIV: 802 case APIC_ESR: 803 case APIC_ICR: 804 case APIC_LVTT: 805 case APIC_LVTTHMR: 806 case APIC_LVTPC: 807 case APIC_LVT0: 808 case APIC_LVT1: 809 case APIC_LVTERR: 810 case APIC_TMICT: 811 case APIC_TDCR: 812 ret = true; 813 break; 814 default: 815 break; 816 } 817 return ret; 818 } 819 820 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu) 821 { 822 struct vcpu_svm *svm = to_svm(vcpu); 823 int ret = 0; 824 u32 offset = svm->vmcb->control.exit_info_1 & 825 AVIC_UNACCEL_ACCESS_OFFSET_MASK; 826 u32 vector = svm->vmcb->control.exit_info_2 & 827 AVIC_UNACCEL_ACCESS_VECTOR_MASK; 828 bool write = (svm->vmcb->control.exit_info_1 >> 32) & 829 AVIC_UNACCEL_ACCESS_WRITE_MASK; 830 bool trap = is_avic_unaccelerated_access_trap(offset); 831 832 trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset, 833 trap, write, vector); 834 if (trap) { 835 /* Handling Trap */ 836 WARN_ONCE(!write, "svm: Handling trap read.\n"); 837 ret = avic_unaccel_trap_write(vcpu); 838 } else { 839 /* Handling Fault */ 840 ret = kvm_emulate_instruction(vcpu, 0); 841 } 842 843 return ret; 844 } 845 846 int avic_init_vcpu(struct vcpu_svm *svm) 847 { 848 int ret; 849 struct kvm_vcpu *vcpu = &svm->vcpu; 850 851 INIT_LIST_HEAD(&svm->ir_list); 852 raw_spin_lock_init(&svm->ir_list_lock); 853 854 if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) 855 return 0; 856 857 ret = avic_init_backing_page(vcpu); 858 if (ret) 859 return ret; 860 861 svm->dfr_reg = APIC_DFR_FLAT; 862 863 return ret; 864 } 865 866 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) 867 { 868 avic_handle_dfr_update(vcpu); 869 avic_handle_ldr_update(vcpu); 870 } 871 872 static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) 873 { 874 struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; 875 unsigned long flags; 876 877 if (!vcpu) 878 return; 879 880 raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); 881 list_del(&irqfd->vcpu_list); 882 raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); 883 } 884 885 int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, 886 unsigned int host_irq, uint32_t guest_irq, 887 struct kvm_vcpu *vcpu, u32 vector) 888 { 889 /* 890 * If the IRQ was affined to a different vCPU, remove the IRTE metadata 891 * from the *previous* vCPU's list. 892 */ 893 svm_ir_list_del(irqfd); 894 895 if (vcpu) { 896 /* 897 * Try to enable guest_mode in IRTE, unless AVIC is inhibited, 898 * in which case configure the IRTE for legacy mode, but track 899 * the IRTE metadata so that it can be converted to guest mode 900 * if AVIC is enabled/uninhibited in the future. 901 */ 902 struct amd_iommu_pi_data pi_data = { 903 .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, 904 vcpu->vcpu_idx), 905 .is_guest_mode = kvm_vcpu_apicv_active(vcpu), 906 .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), 907 .vector = vector, 908 }; 909 struct vcpu_svm *svm = to_svm(vcpu); 910 u64 entry; 911 int ret; 912 913 /* 914 * Prevent the vCPU from being scheduled out or migrated until 915 * the IRTE is updated and its metadata has been added to the 916 * list of IRQs being posted to the vCPU, to ensure the IRTE 917 * isn't programmed with stale pCPU/IsRunning information. 918 */ 919 guard(raw_spinlock_irqsave)(&svm->ir_list_lock); 920 921 /* 922 * Update the target pCPU for IOMMU doorbells if the vCPU is 923 * running. If the vCPU is NOT running, i.e. is blocking or 924 * scheduled out, KVM will update the pCPU info when the vCPU 925 * is awakened and/or scheduled in. See also avic_vcpu_load(). 926 */ 927 entry = svm->avic_physical_id_entry; 928 if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) { 929 pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; 930 } else { 931 pi_data.cpu = -1; 932 pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; 933 } 934 935 ret = irq_set_vcpu_affinity(host_irq, &pi_data); 936 if (ret) 937 return ret; 938 939 /* 940 * Revert to legacy mode if the IOMMU didn't provide metadata 941 * for the IRTE, which KVM needs to keep the IRTE up-to-date, 942 * e.g. if the vCPU is migrated or AVIC is disabled. 943 */ 944 if (WARN_ON_ONCE(!pi_data.ir_data)) { 945 irq_set_vcpu_affinity(host_irq, NULL); 946 return -EIO; 947 } 948 949 irqfd->irq_bypass_data = pi_data.ir_data; 950 list_add(&irqfd->vcpu_list, &svm->ir_list); 951 return 0; 952 } 953 return irq_set_vcpu_affinity(host_irq, NULL); 954 } 955 956 enum avic_vcpu_action { 957 /* 958 * There is no need to differentiate between activate and deactivate, 959 * as KVM only refreshes AVIC state when the vCPU is scheduled in and 960 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is 961 * being (de)activated. 962 */ 963 AVIC_TOGGLE_ON_OFF = BIT(0), 964 AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF, 965 AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF, 966 967 /* 968 * No unique action is required to deal with a vCPU that stops/starts 969 * running. A vCPU that starts running by definition stops blocking as 970 * well, and a vCPU that stops running can't have been blocking, i.e. 971 * doesn't need to toggle GALogIntr. 972 */ 973 AVIC_START_RUNNING = 0, 974 AVIC_STOP_RUNNING = 0, 975 976 /* 977 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag 978 * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is 979 * sent to the vCPU. 980 */ 981 AVIC_START_BLOCKING = BIT(1), 982 }; 983 984 static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, 985 enum avic_vcpu_action action) 986 { 987 bool ga_log_intr = (action & AVIC_START_BLOCKING); 988 struct vcpu_svm *svm = to_svm(vcpu); 989 struct kvm_kernel_irqfd *irqfd; 990 991 lockdep_assert_held(&svm->ir_list_lock); 992 993 /* 994 * Here, we go through the per-vcpu ir_list to update all existing 995 * interrupt remapping table entry targeting this vcpu. 996 */ 997 if (list_empty(&svm->ir_list)) 998 return; 999 1000 list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { 1001 void *data = irqfd->irq_bypass_data; 1002 1003 if (!(action & AVIC_TOGGLE_ON_OFF)) 1004 WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr)); 1005 else if (cpu >= 0) 1006 WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr)); 1007 else 1008 WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); 1009 } 1010 } 1011 1012 static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, 1013 enum avic_vcpu_action action) 1014 { 1015 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 1016 int h_physical_id = kvm_cpu_get_apicid(cpu); 1017 struct vcpu_svm *svm = to_svm(vcpu); 1018 unsigned long flags; 1019 u64 entry; 1020 1021 lockdep_assert_preemption_disabled(); 1022 1023 if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) 1024 return; 1025 1026 if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= 1027 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) 1028 return; 1029 1030 /* 1031 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't 1032 * _currently_ have assigned devices, as that can change. Holding 1033 * ir_list_lock ensures that either svm_ir_list_add() will consume 1034 * up-to-date entry information, or that this task will wait until 1035 * svm_ir_list_add() completes to set the new target pCPU. 1036 */ 1037 raw_spin_lock_irqsave(&svm->ir_list_lock, flags); 1038 1039 entry = svm->avic_physical_id_entry; 1040 WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); 1041 1042 entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK | 1043 AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); 1044 entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); 1045 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1046 1047 svm->avic_physical_id_entry = entry; 1048 1049 /* 1050 * If IPI virtualization is disabled, clear IsRunning when updating the 1051 * actual Physical ID table, so that the CPU never sees IsRunning=1. 1052 * Keep the APIC ID up-to-date in the entry to minimize the chances of 1053 * things going sideways if hardware peeks at the ID. 1054 */ 1055 if (!enable_ipiv) 1056 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1057 1058 WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); 1059 1060 avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); 1061 1062 raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); 1063 } 1064 1065 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1066 { 1067 /* 1068 * No need to update anything if the vCPU is blocking, i.e. if the vCPU 1069 * is being scheduled in after being preempted. The CPU entries in the 1070 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. 1071 * If the vCPU was migrated, its new CPU value will be stuffed when the 1072 * vCPU unblocks. 1073 */ 1074 if (kvm_vcpu_is_blocking(vcpu)) 1075 return; 1076 1077 __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING); 1078 } 1079 1080 static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) 1081 { 1082 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 1083 struct vcpu_svm *svm = to_svm(vcpu); 1084 unsigned long flags; 1085 u64 entry = svm->avic_physical_id_entry; 1086 1087 lockdep_assert_preemption_disabled(); 1088 1089 if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= 1090 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) 1091 return; 1092 1093 /* 1094 * Take and hold the per-vCPU interrupt remapping lock while updating 1095 * the Physical ID entry even though the lock doesn't protect against 1096 * multiple writers (see above). Holding ir_list_lock ensures that 1097 * either svm_ir_list_add() will consume up-to-date entry information, 1098 * or that this task will wait until svm_ir_list_add() completes to 1099 * mark the vCPU as not running. 1100 */ 1101 raw_spin_lock_irqsave(&svm->ir_list_lock, flags); 1102 1103 avic_update_iommu_vcpu_affinity(vcpu, -1, action); 1104 1105 WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); 1106 1107 /* 1108 * Keep the previous APIC ID in the entry so that a rogue doorbell from 1109 * hardware is at least restricted to a CPU associated with the vCPU. 1110 */ 1111 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1112 1113 if (enable_ipiv) 1114 WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); 1115 1116 /* 1117 * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as 1118 * it's a synthetic flag that usurps an unused should-be-zero bit. 1119 */ 1120 if (action & AVIC_START_BLOCKING) 1121 entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; 1122 1123 svm->avic_physical_id_entry = entry; 1124 1125 raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); 1126 } 1127 1128 void avic_vcpu_put(struct kvm_vcpu *vcpu) 1129 { 1130 /* 1131 * Note, reading the Physical ID entry outside of ir_list_lock is safe 1132 * as only the pCPU that has loaded (or is loading) the vCPU is allowed 1133 * to modify the entry, and preemption is disabled. I.e. the vCPU 1134 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run 1135 * recursively. 1136 */ 1137 u64 entry = to_svm(vcpu)->avic_physical_id_entry; 1138 1139 /* 1140 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the 1141 * vCPU is preempted while its in the process of blocking. WARN if the 1142 * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put 1143 * the AVIC if it wasn't previously loaded. 1144 */ 1145 if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) { 1146 if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu))) 1147 return; 1148 1149 /* 1150 * The vCPU was preempted while blocking, ensure its IRTEs are 1151 * configured to generate GA Log Interrupts. 1152 */ 1153 if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR)))) 1154 return; 1155 } 1156 1157 __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING : 1158 AVIC_STOP_RUNNING); 1159 } 1160 1161 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) 1162 { 1163 struct vcpu_svm *svm = to_svm(vcpu); 1164 struct vmcb *vmcb = svm->vmcb01.ptr; 1165 1166 if (!lapic_in_kernel(vcpu) || !enable_apicv) 1167 return; 1168 1169 if (kvm_vcpu_apicv_active(vcpu)) { 1170 /** 1171 * During AVIC temporary deactivation, guest could update 1172 * APIC ID, DFR and LDR registers, which would not be trapped 1173 * by avic_unaccelerated_access_interception(). In this case, 1174 * we need to check and update the AVIC logical APIC ID table 1175 * accordingly before re-activating. 1176 */ 1177 avic_apicv_post_state_restore(vcpu); 1178 avic_activate_vmcb(svm); 1179 } else { 1180 avic_deactivate_vmcb(svm); 1181 } 1182 vmcb_mark_dirty(vmcb, VMCB_AVIC); 1183 } 1184 1185 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 1186 { 1187 if (!enable_apicv) 1188 return; 1189 1190 /* APICv should only be toggled on/off while the vCPU is running. */ 1191 WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu)); 1192 1193 avic_refresh_virtual_apic_mode(vcpu); 1194 1195 if (kvm_vcpu_apicv_active(vcpu)) 1196 __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE); 1197 else 1198 __avic_vcpu_put(vcpu, AVIC_DEACTIVATE); 1199 } 1200 1201 void avic_vcpu_blocking(struct kvm_vcpu *vcpu) 1202 { 1203 if (!kvm_vcpu_apicv_active(vcpu)) 1204 return; 1205 1206 /* 1207 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU 1208 * actually blocks. 1209 * 1210 * Note, any IRQs that arrive before IsRunning=0 will not cause an 1211 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles 1212 * this by checking vIRR one last time before blocking. The memory 1213 * barrier implicit in set_current_state orders writing IsRunning=0 1214 * before reading the vIRR. The processor needs a matching memory 1215 * barrier on interrupt delivery between writing IRR and reading 1216 * IsRunning; the lack of this barrier might be the cause of errata #1235). 1217 * 1218 * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM 1219 * doesn't need to detect events for scheduling purposes. The doorbell 1220 * used to signal running vCPUs cannot be blocked, i.e. will perturb the 1221 * CPU and cause noisy neighbor problems if the VM is sending interrupts 1222 * to the vCPU while it's scheduled out. 1223 */ 1224 __avic_vcpu_put(vcpu, AVIC_START_BLOCKING); 1225 } 1226 1227 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) 1228 { 1229 if (!kvm_vcpu_apicv_active(vcpu)) 1230 return; 1231 1232 avic_vcpu_load(vcpu, vcpu->cpu); 1233 } 1234 1235 static bool __init avic_want_avic_enabled(void) 1236 { 1237 /* 1238 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is 1239 * supported (to avoid enabling partial support by default, and because 1240 * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for 1241 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags 1242 * aren't inclusive of previous generations, i.e. the kernel will set 1243 * at most one ZenX feature flag. 1244 */ 1245 if (avic == AVIC_AUTO_MODE) 1246 avic = boot_cpu_has(X86_FEATURE_X2AVIC) && 1247 (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A); 1248 1249 if (!avic || !npt_enabled) 1250 return false; 1251 1252 /* AVIC is a prerequisite for x2AVIC. */ 1253 if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { 1254 if (boot_cpu_has(X86_FEATURE_X2AVIC)) 1255 pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n"); 1256 return false; 1257 } 1258 1259 if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && 1260 !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { 1261 pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); 1262 return false; 1263 } 1264 1265 /* 1266 * Print a scary message if AVIC is force enabled to make it abundantly 1267 * clear that ignoring CPUID could have repercussions. See Revision 1268 * Guide for specific AMD processor for more details. 1269 */ 1270 if (!boot_cpu_has(X86_FEATURE_AVIC)) 1271 pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n"); 1272 1273 return true; 1274 } 1275 1276 /* 1277 * Note: 1278 * - The module param avic enable both xAPIC and x2APIC mode. 1279 * - Hypervisor can support both xAVIC and x2AVIC in the same guest. 1280 * - The mode can be switched at run-time. 1281 */ 1282 bool __init avic_hardware_setup(void) 1283 { 1284 avic = avic_want_avic_enabled(); 1285 if (!avic) 1286 return false; 1287 1288 pr_info("AVIC enabled\n"); 1289 1290 /* AVIC is a prerequisite for x2AVIC. */ 1291 x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); 1292 if (x2avic_enabled) { 1293 if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT)) 1294 x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID; 1295 else 1296 x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID; 1297 pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1); 1298 } else { 1299 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; 1300 } 1301 1302 /* 1303 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) 1304 * due to erratum 1235, which results in missed VM-Exits on the sender 1305 * and thus missed wake events for blocking vCPUs due to the CPU 1306 * failing to see a software update to clear IsRunning. 1307 */ 1308 enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17; 1309 1310 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1311 1312 return true; 1313 } 1314 1315 void avic_hardware_unsetup(void) 1316 { 1317 if (avic) 1318 amd_iommu_register_ga_log_notifier(NULL); 1319 } 1320