// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>

#include <asm/irq_remapping.h>
#include <asm/msr.h>

#include "trace.h"
#include "lapic.h"
#include "x86.h"
#include "irq.h"
#include "svm.h"

/*
 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
 * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
 * lookup on the index, whereas vCPUs whose index doesn't match their ID need
 * to walk the entire xarray of vCPUs in the worst case scenario.
 *
 * For the vCPU index, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_IDX_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUIDX(x)	(x & AVIC_VCPU_IDX_MASK)

#define __AVIC_GATAG(vm_id, vcpu_idx)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
					 ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
#define AVIC_GATAG(vm_id, vcpu_idx)					\
({									\
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx);			\
									\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx));	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
	ga_tag;								\
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);

#define AVIC_AUTO_MODE	-1

static int avic_param_set(const char *val, const struct kernel_param *kp)
{
	if (val && sysfs_streq(val, "auto")) {
		*(int *)kp->arg = AVIC_AUTO_MODE;
		return 0;
	}

	return param_set_bint(val, kp);
}

static const struct kernel_param_ops avic_ops = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = avic_param_set,
	.get = param_get_bool,
};

/*
 * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled
 * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met).
 */
static int avic = AVIC_AUTO_MODE;
module_param_cb(avic, &avic_ops, &avic, 0444);
__MODULE_PARM_TYPE(avic, "bool");

module_param(enable_ipiv, bool, 0444);

static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
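/*
 * Illustrative usage (an example, not taken from the code above): "avic"
 * accepts the usual boolean values as well as "auto", e.g.
 *
 *   modprobe kvm-amd avic=auto              # default: Zen4+ with x2AVIC
 *   modprobe kvm-amd avic=1 force_avic=1    # ignore a missing CPUID bit
 *
 * force_avic is declared with module_param_unsafe(), so setting it taints
 * the kernel.
 */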
/* Note:
 * This hash table is used to map VM_ID to a struct kvm_svm, when handling
 * AMD IOMMU GALog notifications to schedule in a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
static bool x2avic_enabled;
static u32 x2avic_max_physical_id;

static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
					     bool intercept)
{
	static const u32 x2avic_passthrough_msrs[] = {
		X2APIC_MSR(APIC_ID),
		X2APIC_MSR(APIC_LVR),
		X2APIC_MSR(APIC_TASKPRI),
		X2APIC_MSR(APIC_ARBPRI),
		X2APIC_MSR(APIC_PROCPRI),
		X2APIC_MSR(APIC_EOI),
		X2APIC_MSR(APIC_RRR),
		X2APIC_MSR(APIC_LDR),
		X2APIC_MSR(APIC_DFR),
		X2APIC_MSR(APIC_SPIV),
		X2APIC_MSR(APIC_ISR),
		X2APIC_MSR(APIC_TMR),
		X2APIC_MSR(APIC_IRR),
		X2APIC_MSR(APIC_ESR),
		X2APIC_MSR(APIC_ICR),
		X2APIC_MSR(APIC_ICR2),

		/*
		 * Note! Always intercept LVTT, as TSC-deadline timer mode
		 * isn't virtualized by hardware, and the CPU will generate a
		 * #GP instead of a #VMEXIT.
		 */
		X2APIC_MSR(APIC_LVTTHMR),
		X2APIC_MSR(APIC_LVTPC),
		X2APIC_MSR(APIC_LVT0),
		X2APIC_MSR(APIC_LVT1),
		X2APIC_MSR(APIC_LVTERR),
		X2APIC_MSR(APIC_TMICT),
		X2APIC_MSR(APIC_TMCCT),
		X2APIC_MSR(APIC_TDCR),
	};
	int i;

	if (intercept == svm->x2avic_msrs_intercepted)
		return;

	if (!x2avic_enabled)
		return;

	for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
		svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
					  MSR_TYPE_RW, intercept);

	svm->x2avic_msrs_intercepted = intercept;
}

static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
	u32 arch_max;

	/*
	 * Return the largest size (x2APIC) when querying without a vCPU, e.g.
	 * to allocate the per-VM table.
	 */
	if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
		arch_max = x2avic_max_physical_id;
	else
		arch_max = AVIC_MAX_PHYSICAL_ID;

	/*
	 * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
	 * plus one, so the max possible APIC ID is one less than that.
	 */
	return min(kvm->arch.max_vcpu_ids - 1, arch_max);
}

static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
{
	return __avic_get_max_physical_id(vcpu->kvm, vcpu);
}
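/*
 * The max physical ID above also dictates how big the per-VM Physical ID
 * table must be: (max_id + 1) entries of sizeof(u64) bytes each, rounded
 * up to whole pages by avic_get_physical_id_table_order() below.
 */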
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);

	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
	vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);

	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;

	/*
	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
	 * accesses, while interrupt injection to a running vCPU can be
	 * achieved using AVIC doorbell. KVM disables the APIC access page
	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
	 * AVIC in hybrid mode activates only the doorbell mechanism.
	 */
	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
		vmcb->control.int_ctl |= X2APIC_MODE_MASK;

		/* Disabling MSR intercept for x2APIC registers */
		avic_set_x2apic_msr_interception(svm, false);
	} else {
		/*
		 * Flush the TLB, the guest may have inserted a non-APIC
		 * mapping into the TLB while AVIC was disabled.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* Enabling MSR intercept for x2APIC registers */
		avic_set_x2apic_msr_interception(svm, true);
	}
}

static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	/*
	 * If running nested and the guest uses its own MSR bitmap, there
	 * is no need to update L0's msr bitmap
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enabling MSR intercept for x2APIC registers */
	avic_set_x2apic_msr_interception(svm, true);
}

/* Note:
 * This function is called from IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
static int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
	trace_kvm_avic_ga_log(vm_id, vcpu_idx);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/* Note:
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page. So, we just need to schedule
	 * in the vcpu.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}

static int avic_get_physical_id_table_order(struct kvm *kvm)
{
	/* Provision for the maximum physical ID supported in x2avic mode */
	return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
}

int avic_alloc_physical_id_table(struct kvm *kvm)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!irqchip_in_kernel(kvm) || !enable_apicv)
		return 0;

	if (kvm_svm->avic_physical_id_table)
		return 0;

	kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
								   avic_get_physical_id_table_order(kvm));
	if (!kvm_svm->avic_physical_id_table)
		return -ENOMEM;

	return 0;
}

void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	free_page((unsigned long)kvm_svm->avic_logical_id_table);
	free_pages((unsigned long)kvm_svm->avic_physical_id_table,
		   avic_get_physical_id_table_order(kvm));

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
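/*
 * Each VM is assigned an arbitrary AVIC VM ID below. The VM ID, combined
 * with a vCPU index, forms the GATag that is programmed into IRTEs, and
 * avic_ga_log_notifier() uses svm_vm_data_hash to reverse that mapping
 * when a GA log entry arrives.
 */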
int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!kvm_svm->avic_logical_id_table)
		goto free_avic;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use? Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
{
	return __sme_set(__pa(svm->vcpu.arch.apic->regs));
}

void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);

	vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
	vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;

	if (kvm_apicv_activated(svm->vcpu.kvm))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 id = vcpu->vcpu_id;
	u64 new_entry;

	/*
	 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
	 * hardware. Immediately clear apicv_active, i.e. don't wait until the
	 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
	 * avic_vcpu_load() expects to be called if and only if the vCPU has
	 * fully initialized AVIC.
	 */
	if (id > max_id) {
		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
		vcpu->arch.apic->apicv_active = false;
		return 0;
	}

	BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
		     (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);

	if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
		 * pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	/* Note, fls64() returns the bit position, +1. */
	BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
		     fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));

	/* Setting AVIC backing page address in the phy APIC ID table */
	new_entry = avic_get_backing_page_address(svm) |
		    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
	svm->avic_physical_id_entry = new_entry;

	/*
	 * Initialize the real table, as vCPUs must have a valid entry in order
	 * for broadcast IPIs to function correctly (broadcast IPIs ignore
	 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
	 */
	WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);

	return 0;
}
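/*
 * At this point the vCPU's Physical ID entry holds only the backing page
 * address and the Valid bit. The host physical APIC ID, IsRunning, and the
 * synthetic GALogIntr flag are managed later by avic_vcpu_load(),
 * avic_vcpu_put() and friends.
 */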
void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU. But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}

static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}

static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
	 * i.e. AVIC is active only if APIC ID == vCPU ID.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}

static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index). Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}
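/*
 * Worked example for the xAPIC cluster-mode decode (see
 * avic_kick_target_vcpus_fast() below and avic_get_logical_id_entry()): a
 * logical destination of 0x21 encodes cluster 2 with member bit 0 set,
 * which yields logical ID table index (2 << 2) + 0 = 8.
 */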
/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	u32 dest;

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = kvm_svm->avic_logical_id_table;

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}

static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event. There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}
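/*
 * As decoded below, for AVIC_INCOMPLETE_IPI exits exit_info_1 holds the ICR
 * value (low/high halves) and exit_info_2 holds the failure reason in bits
 * 63:32 plus the physical ID table index of the offending target in the low
 * bits.
 */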
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, e.g. ICR holds the correct value and RIP
		 * has been advanced, KVM is responsible only for emulating the
		 * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
		 * in which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vcpus. So, we just need to kick the appropriate vcpu.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
		/* Invalid IPI with vector < 16 */
		break;
	default:
		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
	}

	return 1;
}

unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
	return 0;
}

static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	return &kvm_svm->avic_logical_id_table[index];
}

static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}

static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use logical APIC ID table */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}

static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}

static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	bool ret = false;

	switch (offset) {
	case APIC_ID:
	case APIC_EOI:
	case APIC_RRR:
	case APIC_LDR:
	case APIC_DFR:
	case APIC_SPIV:
	case APIC_ESR:
	case APIC_ICR:
	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_TMICT:
	case APIC_TDCR:
		ret = true;
		break;
	default:
		break;
	}
	return ret;
}

int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret = 0;
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handling Trap */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handling Fault */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}

int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	INIT_LIST_HEAD(&svm->ir_list);
	raw_spin_lock_init(&svm->ir_list_lock);

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}

void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}

static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
	unsigned long flags;

	if (!vcpu)
		return;

	raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
	list_del(&irqfd->vcpu_list);
	raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
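/*
 * Update the IOMMU IRTE for an irqbypass producer (e.g. an assigned device
 * interrupt): when a vCPU target is provided, (re)configure the IRTE for
 * posted delivery to that vCPU; when vcpu is NULL, revert the IRTE to
 * legacy remapped mode.
 */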
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
			unsigned int host_irq, uint32_t guest_irq,
			struct kvm_vcpu *vcpu, u32 vector)
{
	/*
	 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
	 * from the *previous* vCPU's list.
	 */
	svm_ir_list_del(irqfd);

	if (vcpu) {
		/*
		 * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
		 * in which case configure the IRTE for legacy mode, but track
		 * the IRTE metadata so that it can be converted to guest mode
		 * if AVIC is enabled/uninhibited in the future.
		 */
		struct amd_iommu_pi_data pi_data = {
			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					     vcpu->vcpu_idx),
			.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
			.vector = vector,
		};
		struct vcpu_svm *svm = to_svm(vcpu);
		u64 entry;
		int ret;

		/*
		 * Prevent the vCPU from being scheduled out or migrated until
		 * the IRTE is updated and its metadata has been added to the
		 * list of IRQs being posted to the vCPU, to ensure the IRTE
		 * isn't programmed with stale pCPU/IsRunning information.
		 */
		guard(raw_spinlock_irqsave)(&svm->ir_list_lock);

		/*
		 * Update the target pCPU for IOMMU doorbells if the vCPU is
		 * running. If the vCPU is NOT running, i.e. is blocking or
		 * scheduled out, KVM will update the pCPU info when the vCPU
		 * is awakened and/or scheduled in. See also avic_vcpu_load().
		 */
		entry = svm->avic_physical_id_entry;
		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
		} else {
			pi_data.cpu = -1;
			pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
		}

		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
		if (ret)
			return ret;

		/*
		 * Revert to legacy mode if the IOMMU didn't provide metadata
		 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
		 * e.g. if the vCPU is migrated or AVIC is disabled.
		 */
		if (WARN_ON_ONCE(!pi_data.ir_data)) {
			irq_set_vcpu_affinity(host_irq, NULL);
			return -EIO;
		}

		irqfd->irq_bypass_data = pi_data.ir_data;
		list_add(&irqfd->vcpu_list, &svm->ir_list);
		return 0;
	}
	return irq_set_vcpu_affinity(host_irq, NULL);
}

enum avic_vcpu_action {
	/*
	 * There is no need to differentiate between activate and deactivate,
	 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
	 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
	 * being (de)activated.
	 */
	AVIC_TOGGLE_ON_OFF	= BIT(0),
	AVIC_ACTIVATE		= AVIC_TOGGLE_ON_OFF,
	AVIC_DEACTIVATE		= AVIC_TOGGLE_ON_OFF,

	/*
	 * No unique action is required to deal with a vCPU that stops/starts
	 * running. A vCPU that starts running by definition stops blocking as
	 * well, and a vCPU that stops running can't have been blocking, i.e.
	 * doesn't need to toggle GALogIntr.
	 */
	AVIC_START_RUNNING	= 0,
	AVIC_STOP_RUNNING	= 0,

	/*
	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
	 * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
	 * sent to the vCPU.
	 */
	AVIC_START_BLOCKING	= BIT(1),
};
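/*
 * Rough mapping from action to IOMMU update (see below): start/stop running
 * and start blocking retarget an existing guest-mode IRTE via
 * amd_iommu_update_ga(), whereas (de)activation switches the IRTE between
 * guest mode and legacy remapped mode via amd_iommu_activate_guest_mode()
 * or amd_iommu_deactivate_guest_mode().
 */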
static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
					    enum avic_vcpu_action action)
{
	bool ga_log_intr = (action & AVIC_START_BLOCKING);
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_kernel_irqfd *irqfd;

	lockdep_assert_held(&svm->ir_list_lock);

	/*
	 * Here, we go through the per-vCPU ir_list to update all existing
	 * interrupt remapping table entries targeting this vCPU.
	 */
	if (list_empty(&svm->ir_list))
		return;

	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
		void *data = irqfd->irq_bypass_data;

		if (!(action & AVIC_TOGGLE_ON_OFF))
			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
		else if (cpu >= 0)
			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
		else
			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
	}
}

static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
			     enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
			 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
		return;

	/*
	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
	 * _currently_ have assigned devices, as that can change. Holding
	 * ir_list_lock ensures that either svm_ir_list_add() will consume
	 * up-to-date entry information, or that this task will wait until
	 * svm_ir_list_add() completes to set the new target pCPU.
	 */
	raw_spin_lock_irqsave(&svm->ir_list_lock, flags);

	entry = svm->avic_physical_id_entry;
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	svm->avic_physical_id_entry = entry;

	/*
	 * If IPI virtualization is disabled, clear IsRunning when updating the
	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
	 * things going sideways if hardware peeks at the ID.
	 */
	if (!enable_ipiv)
		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);

	raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted. The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRunning is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
}

static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry = svm->avic_physical_id_entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
			 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
		return;

	/*
	 * Take and hold the per-vCPU interrupt remapping lock while updating
	 * the Physical ID entry even though the lock doesn't protect against
	 * multiple writers (see above). Holding ir_list_lock ensures that
	 * either svm_ir_list_add() will consume up-to-date entry information,
	 * or that this task will wait until svm_ir_list_add() completes to
	 * mark the vCPU as not running.
	 */
	raw_spin_lock_irqsave(&svm->ir_list_lock, flags);

	avic_update_iommu_vcpu_affinity(vcpu, -1, action);

	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);

	/*
	 * Keep the previous APIC ID in the entry so that a rogue doorbell from
	 * hardware is at least restricted to a CPU associated with the vCPU.
	 */
	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	if (enable_ipiv)
		WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	/*
	 * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
	 * it's a synthetic flag that usurps an unused should-be-zero bit.
	 */
	if (action & AVIC_START_BLOCKING)
		entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;

	svm->avic_physical_id_entry = entry;

	raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
	 * to modify the entry, and preemption is disabled. I.e. the vCPU
	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
	 * recursively.
	 */
	u64 entry = to_svm(vcpu)->avic_physical_id_entry;

	/*
	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
	 * vCPU is preempted while it's in the process of blocking. WARN if the
	 * vCPU wasn't running and isn't blocking, as KVM shouldn't attempt to
	 * put the AVIC if it wasn't previously loaded.
	 */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
		if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
			return;

		/*
		 * The vCPU was preempted while blocking, ensure its IRTEs are
		 * configured to generate GA Log Interrupts.
		 */
		if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
			return;
	}

	__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
							   AVIC_STOP_RUNNING);
}
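/*
 * Summary of the transitions driven by the helpers above: a non-blocking
 * load sets IsRunning (propagated to the Physical ID table only when IPI
 * virtualization is enabled); a put due to blocking clears IsRunning and
 * sets the synthetic GALogIntr flag so posted interrupts generate GA log
 * entries that wake the vCPU; a put due to preemption only clears IsRunning.
 */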
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	if (!lapic_in_kernel(vcpu) || !enable_apicv)
		return;

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * During AVIC temporary deactivation, the guest could update
		 * the APIC ID, DFR and LDR registers, which would not be
		 * trapped by avic_unaccelerated_access_interception(). In this
		 * case, we need to check and update the AVIC logical APIC ID
		 * table accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		avic_activate_vmcb(svm);
	} else {
		avic_deactivate_vmcb(svm);
	}
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
}

void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	if (!enable_apicv)
		return;

	/* APICv should only be toggled on/off while the vCPU is running. */
	WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));

	avic_refresh_virtual_apic_mode(vcpu);

	if (kvm_vcpu_apicv_active(vcpu))
		__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
	else
		__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}

void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
	 * actually blocks.
	 *
	 * Note, any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
	 * this by checking vIRR one last time before blocking. The memory
	 * barrier implicit in set_current_state orders writing IsRunning=0
	 * before reading the vIRR. The processor needs a matching memory
	 * barrier on interrupt delivery between writing IRR and reading
	 * IsRunning (the lack of this barrier might be the cause of errata
	 * #1235).
	 *
	 * Clear IsRunning even if guest IRQs are disabled, i.e. even if KVM
	 * doesn't need to detect events for scheduling purposes. The doorbell
	 * used to signal running vCPUs cannot be blocked, i.e. will perturb the
	 * CPU and cause noisy neighbor problems if the VM is sending interrupts
	 * to the vCPU while it's scheduled out.
	 */
	__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}

void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	avic_vcpu_load(vcpu, vcpu->cpu);
}

static bool __init avic_want_avic_enabled(void)
{
	/*
	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
	 * supported (to avoid enabling partial support by default, and because
	 * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for
	 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags
	 * aren't inclusive of previous generations, i.e. the kernel will set
	 * at most one ZenX feature flag.
	 */
	if (avic == AVIC_AUTO_MODE)
		avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
		       (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A);

	if (!avic || !npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC))
			pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
		return false;
	}

	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
	    !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
		pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
		return false;
	}

	/*
	 * Print a scary message if AVIC is force enabled to make it abundantly
	 * clear that ignoring CPUID could have repercussions. See the Revision
	 * Guide for the specific AMD processor for more details.
	 */
	if (!boot_cpu_has(X86_FEATURE_AVIC))
		pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");

	return true;
}
/*
 * Note:
 * - The module param "avic" enables AVIC for both xAPIC and x2APIC modes.
 * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool __init avic_hardware_setup(void)
{
	avic = avic_want_avic_enabled();
	if (!avic)
		return false;

	pr_info("AVIC enabled\n");

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled) {
		if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
			x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
		else
			x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
		pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
	} else {
		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
	}

	/*
	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
	 * due to erratum 1235, which results in missed VM-Exits on the sender
	 * and thus missed wake events for blocking vCPUs due to the CPU
	 * failing to see a software update to clear IsRunning.
	 */
	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}

void avic_hardware_unsetup(void)
{
	if (avic)
		amd_iommu_register_ga_log_notifier(NULL);
}