1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * AMD SVM support 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * Avi Kivity <avi@qumranet.com> 13 */ 14 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/kvm_types.h> 18 #include <linux/hashtable.h> 19 #include <linux/amd-iommu.h> 20 #include <linux/kvm_host.h> 21 #include <linux/kvm_irqfd.h> 22 23 #include <asm/irq_remapping.h> 24 #include <asm/msr.h> 25 26 #include "trace.h" 27 #include "lapic.h" 28 #include "x86.h" 29 #include "irq.h" 30 #include "svm.h" 31 32 /* 33 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that 34 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't 35 * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index 36 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast 37 * lookup on the index, where as vCPUs whose index doesn't match their ID need 38 * to walk the entire xarray of vCPUs in the worst case scenario. 39 * 40 * For the vCPU index, use however many bits are currently allowed for the max 41 * guest physical APIC ID (limited by the size of the physical ID table), and 42 * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the 43 * size of the GATag is defined by hardware (32 bits), but is an opaque value 44 * as far as hardware is concerned. 
45 */ 46 #define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK 47 48 #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) 49 #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) 50 51 #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) 52 #define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK) 53 54 #define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ 55 ((vcpu_idx) & AVIC_VCPU_IDX_MASK)) 56 #define AVIC_GATAG(vm_id, vcpu_idx) \ 57 ({ \ 58 u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \ 59 \ 60 WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \ 61 WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ 62 ga_tag; \ 63 }) 64 65 static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); 66 67 #define AVIC_AUTO_MODE -1 68 69 static int avic_param_set(const char *val, const struct kernel_param *kp) 70 { 71 if (val && sysfs_streq(val, "auto")) { 72 *(int *)kp->arg = AVIC_AUTO_MODE; 73 return 0; 74 } 75 76 return param_set_bint(val, kp); 77 } 78 79 static const struct kernel_param_ops avic_ops = { 80 .flags = KERNEL_PARAM_OPS_FL_NOARG, 81 .set = avic_param_set, 82 .get = param_get_bool, 83 }; 84 85 /* 86 * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled 87 * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met). 88 */ 89 static int avic = AVIC_AUTO_MODE; 90 module_param_cb(avic, &avic_ops, &avic, 0444); 91 __MODULE_PARM_TYPE(avic, "bool"); 92 93 module_param(enable_ipiv, bool, 0444); 94 95 static bool force_avic; 96 module_param_unsafe(force_avic, bool, 0444); 97 98 /* Note: 99 * This hash table is used to map VM_ID to a struct kvm_svm, 100 * when handling AMD IOMMU GALOG notification to schedule in 101 * a particular vCPU. 
102 */ 103 #define SVM_VM_DATA_HASH_BITS 8 104 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 105 static u32 next_vm_id = 0; 106 static bool next_vm_id_wrapped = 0; 107 static DEFINE_SPINLOCK(svm_vm_data_hash_lock); 108 static bool x2avic_enabled; 109 static u32 x2avic_max_physical_id; 110 111 static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, 112 bool intercept) 113 { 114 static const u32 x2avic_passthrough_msrs[] = { 115 X2APIC_MSR(APIC_ID), 116 X2APIC_MSR(APIC_LVR), 117 X2APIC_MSR(APIC_TASKPRI), 118 X2APIC_MSR(APIC_ARBPRI), 119 X2APIC_MSR(APIC_PROCPRI), 120 X2APIC_MSR(APIC_EOI), 121 X2APIC_MSR(APIC_RRR), 122 X2APIC_MSR(APIC_LDR), 123 X2APIC_MSR(APIC_DFR), 124 X2APIC_MSR(APIC_SPIV), 125 X2APIC_MSR(APIC_ISR), 126 X2APIC_MSR(APIC_TMR), 127 X2APIC_MSR(APIC_IRR), 128 X2APIC_MSR(APIC_ESR), 129 X2APIC_MSR(APIC_ICR), 130 X2APIC_MSR(APIC_ICR2), 131 132 /* 133 * Note! Always intercept LVTT, as TSC-deadline timer mode 134 * isn't virtualized by hardware, and the CPU will generate a 135 * #GP instead of a #VMEXIT. 136 */ 137 X2APIC_MSR(APIC_LVTTHMR), 138 X2APIC_MSR(APIC_LVTPC), 139 X2APIC_MSR(APIC_LVT0), 140 X2APIC_MSR(APIC_LVT1), 141 X2APIC_MSR(APIC_LVTERR), 142 X2APIC_MSR(APIC_TMICT), 143 X2APIC_MSR(APIC_TMCCT), 144 X2APIC_MSR(APIC_TDCR), 145 }; 146 int i; 147 148 if (intercept == svm->x2avic_msrs_intercepted) 149 return; 150 151 if (!x2avic_enabled) 152 return; 153 154 for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) 155 svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], 156 MSR_TYPE_RW, intercept); 157 158 svm->x2avic_msrs_intercepted = intercept; 159 } 160 161 static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu) 162 { 163 u32 arch_max; 164 165 /* 166 * Return the largest size (x2APIC) when querying without a vCPU, e.g. 167 * to allocate the per-VM table.. 
168 */ 169 if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic))) 170 arch_max = x2avic_max_physical_id; 171 else 172 arch_max = AVIC_MAX_PHYSICAL_ID; 173 174 /* 175 * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID 176 * plus one, so the max possible APIC ID is one less than that. 177 */ 178 return min(kvm->arch.max_vcpu_ids - 1, arch_max); 179 } 180 181 static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) 182 { 183 return __avic_get_max_physical_id(vcpu->kvm, vcpu); 184 } 185 186 static void avic_activate_vmcb(struct vcpu_svm *svm) 187 { 188 struct vmcb *vmcb = svm->vmcb01.ptr; 189 struct kvm_vcpu *vcpu = &svm->vcpu; 190 191 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); 192 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; 193 vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu); 194 vmcb->control.int_ctl |= AVIC_ENABLE_MASK; 195 196 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 197 198 /* 199 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR 200 * accesses, while interrupt injection to a running vCPU can be 201 * achieved using AVIC doorbell. KVM disables the APIC access page 202 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling 203 * AVIC in hybrid mode activates only the doorbell mechanism. 204 */ 205 if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { 206 vmcb->control.int_ctl |= X2APIC_MODE_MASK; 207 208 /* Disabling MSR intercept for x2APIC registers */ 209 avic_set_x2apic_msr_interception(svm, false); 210 } else { 211 /* 212 * Flush the TLB, the guest may have inserted a non-APIC 213 * mapping into the TLB while AVIC was disabled. 
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* Enabling MSR intercept for x2APIC registers */
		avic_set_x2apic_msr_interception(svm, true);
	}
}

static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	if (!sev_es_guest(svm->vcpu.kvm))
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	/*
	 * If running nested and the guest uses its own MSR bitmap, there
	 * is no need to update L0's msr bitmap
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enabling MSR intercept for x2APIC registers */
	avic_set_x2apic_msr_interception(svm, true);
}

/* Note:
 * This function is called from IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
static int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
	trace_kvm_avic_ga_log(vm_id, vcpu_idx);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/* Note:
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page. So, we just need to schedule
	 * in the vcpu.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}

static int avic_get_physical_id_table_order(struct kvm *kvm)
{
	/* Provision for the maximum physical ID supported in x2avic mode */
	return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
}

/* Allocate the per-VM physical APIC ID table, sized per the order above. */
int avic_alloc_physical_id_table(struct kvm *kvm)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!irqchip_in_kernel(kvm) || !enable_apicv)
		return 0;

	/* Already allocated, e.g. by a previous call for the same VM. */
	if (kvm_svm->avic_physical_id_table)
		return 0;

	kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
								   avic_get_physical_id_table_order(kvm));
	if (!kvm_svm->avic_physical_id_table)
		return -ENOMEM;

	return 0;
}

/* Free the VM's AVIC tables and drop the VM from the GALog lookup hash. */
void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	free_page((unsigned long)kvm_svm->avic_logical_id_table);
	free_pages((unsigned long)kvm_svm->avic_physical_id_table,
		   avic_get_physical_id_table_order(kvm));

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}

/*
 * Allocate the logical APIC ID table and assign the VM a unique AVIC VM ID
 * for GATag encoding (registered in the GALog lookup hash).
 */
int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!kvm_svm->avic_logical_id_table)
		goto free_avic;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use?
	   Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

/* Physical (SME-tagged) address of the vCPU's virtual APIC backing page. */
static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
{
	return __sme_set(__pa(svm->vcpu.arch.apic->regs));
}

void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);

	vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
	vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;

	if (kvm_vcpu_apicv_active(&svm->vcpu))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 id = vcpu->vcpu_id;
	u64 new_entry;

	/*
	 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
	 * hardware.  Immediately clear apicv_active, i.e. don't wait until the
	 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
	 * avic_vcpu_load() expects to be called if and only if the vCPU has
	 * fully initialized AVIC.
	 */
	if (id > max_id) {
		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
		vcpu->arch.apic->apicv_active = false;
		return 0;
	}

	BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
		     (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);

	if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
		 * pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	/* Note, fls64() returns the bit position, +1. */
	BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
		     fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));

	/* Setting AVIC backing page address in the phy APIC ID table */
	new_entry = avic_get_backing_page_address(svm) |
		    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
	svm->avic_physical_id_entry = new_entry;

	/*
	 * Initialize the real table, as vCPUs must have a valid entry in order
	 * for broadcast IPIs to function correctly (broadcast IPIs ignore
	 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
	 */
	WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);

	return 0;
}

void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU.  But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}

/* Mark an IRQ pending in the vAPIC and complete delivery to the target. */
static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}

static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
	 * i.e. APIC ID == vCPU ID.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}

static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index).  Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}

/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	u32 dest;

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = kvm_svm->avic_logical_id_table;

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}

static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event.  There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}

int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, e.g. ICR holds the correct value and RIP
		 * has been advanced, KVM is responsible only for emulating the
		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
		 * in which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vcpus.  So, we just need to kick the appropriate vcpu.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
		/* Invalid IPI with vector < 16 */
		break;
	default:
		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
	}

	return 1;
}

unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
	return 0;
}

/*
 * Map an xAPIC LDR value to its entry in the logical APIC ID table, or NULL
 * if the LDR doesn't correspond to exactly one valid table slot.
 */
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	/* Exactly one bit must be set in the (masked) LDR. */
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	return &kvm_svm->avic_logical_id_table[index];
}

static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}

static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use logical APIC ID table */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}

static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}

static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}

static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	bool ret = false;

	switch (offset) {
	case APIC_ID:
	case APIC_EOI:
	case APIC_RRR:
	case APIC_LDR:
	case APIC_DFR:
	case APIC_SPIV:
	case APIC_ESR:
	case APIC_ICR:
	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_TMICT:
	case APIC_TDCR:
		ret = true;
		break;
	default:
		break;
	}
	return ret;
}

int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret = 0;
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handling Trap */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handling Fault */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}

int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	INIT_LIST_HEAD(&svm->ir_list);
	raw_spin_lock_init(&svm->ir_list_lock);

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}

void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}

/* Remove the irqfd from its previously-affined vCPU's IR list, if any. */
static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
	unsigned long flags;

	if (!vcpu)
		return;

	raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
	list_del(&irqfd->vcpu_list);
	raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}

int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
			unsigned int host_irq, uint32_t guest_irq,
			struct kvm_vcpu *vcpu, u32 vector)
{
	/*
	 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
	 * from the *previous* vCPU's list.
	 */
	svm_ir_list_del(irqfd);

	if (vcpu) {
		/*
		 * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
		 * in which case configure the IRTE for legacy mode, but track
		 * the IRTE metadata so that it can be converted to guest mode
		 * if AVIC is enabled/uninhibited in the future.
		 */
		struct amd_iommu_pi_data pi_data = {
			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					     vcpu->vcpu_idx),
			.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
			.vector = vector,
		};
		struct vcpu_svm *svm = to_svm(vcpu);
		u64 entry;
		int ret;

		/*
		 * Prevent the vCPU from being scheduled out or migrated until
		 * the IRTE is updated and its metadata has been added to the
		 * list of IRQs being posted to the vCPU, to ensure the IRTE
		 * isn't programmed with stale pCPU/IsRunning information.
		 */
		guard(raw_spinlock_irqsave)(&svm->ir_list_lock);

		/*
		 * Update the target pCPU for IOMMU doorbells if the vCPU is
		 * running.  If the vCPU is NOT running, i.e. is blocking or
		 * scheduled out, KVM will update the pCPU info when the vCPU
		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
		 */
		entry = svm->avic_physical_id_entry;
		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
		} else {
			pi_data.cpu = -1;
			pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
		}

		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
		if (ret)
			return ret;

		/*
		 * Revert to legacy mode if the IOMMU didn't provide metadata
		 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
		 * e.g. if the vCPU is migrated or AVIC is disabled.
		 */
		if (WARN_ON_ONCE(!pi_data.ir_data)) {
			irq_set_vcpu_affinity(host_irq, NULL);
			return -EIO;
		}

		irqfd->irq_bypass_data = pi_data.ir_data;
		list_add(&irqfd->vcpu_list, &svm->ir_list);
		return 0;
	}
	return irq_set_vcpu_affinity(host_irq, NULL);
}

enum avic_vcpu_action {
	/*
	 * There is no need to differentiate between activate and deactivate,
	 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
	 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
	 * being (de)activated.
	 */
	AVIC_TOGGLE_ON_OFF	= BIT(0),
	AVIC_ACTIVATE		= AVIC_TOGGLE_ON_OFF,
	AVIC_DEACTIVATE		= AVIC_TOGGLE_ON_OFF,

	/*
	 * No unique action is required to deal with a vCPU that stops/starts
	 * running.  A vCPU that starts running by definition stops blocking as
	 * well, and a vCPU that stops running can't have been blocking, i.e.
	 * doesn't need to toggle GALogIntr.
	 */
	AVIC_START_RUNNING	= 0,
	AVIC_STOP_RUNNING	= 0,

	/*
	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
	 * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
	 * sent to the vCPU.
	 */
	AVIC_START_BLOCKING	= BIT(1),
};

static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
					    enum avic_vcpu_action action)
{
	bool ga_log_intr = (action & AVIC_START_BLOCKING);
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_kernel_irqfd *irqfd;

	lockdep_assert_held(&svm->ir_list_lock);

	/*
	 * Here, we go through the per-vcpu ir_list to update all existing
	 * interrupt remapping table entry targeting this vcpu.
	 */
	if (list_empty(&svm->ir_list))
		return;

	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
		void *data = irqfd->irq_bypass_data;

		if (!(action & AVIC_TOGGLE_ON_OFF))
			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
		else if (cpu >= 0)
			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
		else
			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
	}
}

static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
			     enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	/* The vCPU's entry must lie within the allocated Physical ID table. */
	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
			 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
		return;

	/*
	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
	 * _currently_ have assigned devices, as that can change.  Holding
	 * ir_list_lock ensures that either svm_ir_list_add() will consume
	 * up-to-date entry information, or that this task will wait until
	 * svm_ir_list_add() completes to set the new target pCPU.
	 */
	raw_spin_lock_irqsave(&svm->ir_list_lock, flags);

	entry = svm->avic_physical_id_entry;
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	svm->avic_physical_id_entry = entry;

	/*
	 * If IPI virtualization is disabled, clear IsRunning when updating the
	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
	 * things going sideways if hardware peeks at the ID.
	 */
	if (!enable_ipiv)
		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);

	raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted.  The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
1062 */ 1063 if (kvm_vcpu_is_blocking(vcpu)) 1064 return; 1065 1066 __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING); 1067 } 1068 1069 static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) 1070 { 1071 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 1072 struct vcpu_svm *svm = to_svm(vcpu); 1073 unsigned long flags; 1074 u64 entry = svm->avic_physical_id_entry; 1075 1076 lockdep_assert_preemption_disabled(); 1077 1078 if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= 1079 PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) 1080 return; 1081 1082 /* 1083 * Take and hold the per-vCPU interrupt remapping lock while updating 1084 * the Physical ID entry even though the lock doesn't protect against 1085 * multiple writers (see above). Holding ir_list_lock ensures that 1086 * either svm_ir_list_add() will consume up-to-date entry information, 1087 * or that this task will wait until svm_ir_list_add() completes to 1088 * mark the vCPU as not running. 1089 */ 1090 raw_spin_lock_irqsave(&svm->ir_list_lock, flags); 1091 1092 avic_update_iommu_vcpu_affinity(vcpu, -1, action); 1093 1094 WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); 1095 1096 /* 1097 * Keep the previous APIC ID in the entry so that a rogue doorbell from 1098 * hardware is at least restricted to a CPU associated with the vCPU. 1099 */ 1100 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1101 1102 if (enable_ipiv) 1103 WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); 1104 1105 /* 1106 * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as 1107 * it's a synthetic flag that usurps an unused should-be-zero bit. 
1108 */ 1109 if (action & AVIC_START_BLOCKING) 1110 entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; 1111 1112 svm->avic_physical_id_entry = entry; 1113 1114 raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); 1115 } 1116 1117 void avic_vcpu_put(struct kvm_vcpu *vcpu) 1118 { 1119 /* 1120 * Note, reading the Physical ID entry outside of ir_list_lock is safe 1121 * as only the pCPU that has loaded (or is loading) the vCPU is allowed 1122 * to modify the entry, and preemption is disabled. I.e. the vCPU 1123 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run 1124 * recursively. 1125 */ 1126 u64 entry = to_svm(vcpu)->avic_physical_id_entry; 1127 1128 /* 1129 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the 1130 * vCPU is preempted while its in the process of blocking. WARN if the 1131 * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put 1132 * the AVIC if it wasn't previously loaded. 1133 */ 1134 if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) { 1135 if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu))) 1136 return; 1137 1138 /* 1139 * The vCPU was preempted while blocking, ensure its IRTEs are 1140 * configured to generate GA Log Interrupts. 1141 */ 1142 if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR)))) 1143 return; 1144 } 1145 1146 __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING : 1147 AVIC_STOP_RUNNING); 1148 } 1149 1150 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) 1151 { 1152 struct vcpu_svm *svm = to_svm(vcpu); 1153 struct vmcb *vmcb = svm->vmcb01.ptr; 1154 1155 if (!lapic_in_kernel(vcpu) || !enable_apicv) 1156 return; 1157 1158 if (kvm_vcpu_apicv_active(vcpu)) { 1159 /** 1160 * During AVIC temporary deactivation, guest could update 1161 * APIC ID, DFR and LDR registers, which would not be trapped 1162 * by avic_unaccelerated_access_interception(). 
In this case, 1163 * we need to check and update the AVIC logical APIC ID table 1164 * accordingly before re-activating. 1165 */ 1166 avic_apicv_post_state_restore(vcpu); 1167 avic_activate_vmcb(svm); 1168 } else { 1169 avic_deactivate_vmcb(svm); 1170 } 1171 vmcb_mark_dirty(vmcb, VMCB_AVIC); 1172 } 1173 1174 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 1175 { 1176 if (!enable_apicv) 1177 return; 1178 1179 /* APICv should only be toggled on/off while the vCPU is running. */ 1180 WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu)); 1181 1182 avic_refresh_virtual_apic_mode(vcpu); 1183 1184 if (kvm_vcpu_apicv_active(vcpu)) 1185 __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE); 1186 else 1187 __avic_vcpu_put(vcpu, AVIC_DEACTIVATE); 1188 } 1189 1190 void avic_vcpu_blocking(struct kvm_vcpu *vcpu) 1191 { 1192 if (!kvm_vcpu_apicv_active(vcpu)) 1193 return; 1194 1195 /* 1196 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU 1197 * actually blocks. 1198 * 1199 * Note, any IRQs that arrive before IsRunning=0 will not cause an 1200 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles 1201 * this by checking vIRR one last time before blocking. The memory 1202 * barrier implicit in set_current_state orders writing IsRunning=0 1203 * before reading the vIRR. The processor needs a matching memory 1204 * barrier on interrupt delivery between writing IRR and reading 1205 * IsRunning; the lack of this barrier might be the cause of errata #1235). 1206 * 1207 * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM 1208 * doesn't need to detect events for scheduling purposes. The doorbell 1209 * used to signal running vCPUs cannot be blocked, i.e. will perturb the 1210 * CPU and cause noisy neighbor problems if the VM is sending interrupts 1211 * to the vCPU while it's scheduled out. 
1212 */ 1213 __avic_vcpu_put(vcpu, AVIC_START_BLOCKING); 1214 } 1215 1216 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) 1217 { 1218 if (!kvm_vcpu_apicv_active(vcpu)) 1219 return; 1220 1221 avic_vcpu_load(vcpu, vcpu->cpu); 1222 } 1223 1224 static bool __init avic_want_avic_enabled(void) 1225 { 1226 /* 1227 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is 1228 * supported (to avoid enabling partial support by default, and because 1229 * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for 1230 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags 1231 * aren't inclusive of previous generations, i.e. the kernel will set 1232 * at most one ZenX feature flag. 1233 */ 1234 if (avic == AVIC_AUTO_MODE) 1235 avic = boot_cpu_has(X86_FEATURE_X2AVIC) && 1236 (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A); 1237 1238 if (!avic || !npt_enabled) 1239 return false; 1240 1241 /* AVIC is a prerequisite for x2AVIC. */ 1242 if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { 1243 if (boot_cpu_has(X86_FEATURE_X2AVIC)) 1244 pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n"); 1245 return false; 1246 } 1247 1248 if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && 1249 !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { 1250 pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); 1251 return false; 1252 } 1253 1254 /* 1255 * Print a scary message if AVIC is force enabled to make it abundantly 1256 * clear that ignoring CPUID could have repercussions. See Revision 1257 * Guide for specific AMD processor for more details. 1258 */ 1259 if (!boot_cpu_has(X86_FEATURE_AVIC)) 1260 pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n"); 1261 1262 return true; 1263 } 1264 1265 /* 1266 * Note: 1267 * - The module param avic enable both xAPIC and x2APIC mode. 1268 * - Hypervisor can support both xAVIC and x2AVIC in the same guest. 
 * - The mode can be switched at run-time.
 */
bool __init avic_hardware_setup(void)
{
	/* Returns true iff AVIC is enabled; callers gate AVIC paths on this. */
	avic = avic_want_avic_enabled();
	if (!avic)
		return false;

	pr_info("AVIC enabled\n");

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled) {
		/*
		 * The extended x2AVIC capability allows a larger (4K table)
		 * maximum physical APIC ID, i.e. more vCPUs.
		 */
		if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
			x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
		else
			x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
		pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
	} else {
		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
	}

	/*
	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
	 * due to erratum 1235, which results in missed VM-Exits on the sender
	 * and thus missed wake events for blocking vCPUs due to the CPU
	 * failing to see a software update to clear IsRunning.
	 */
	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;

	/* Get notified when the IOMMU logs an undeliverable interrupt. */
	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}

void avic_hardware_unsetup(void)
{
	/* Unregister the GA Log notifier iff setup registered it. */
	if (avic)
		amd_iommu_register_ga_log_notifier(NULL);
}