// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>

#include <asm/irq_remapping.h>
#include <asm/msr.h>

#include "trace.h"
#include "lapic.h"
#include "x86.h"
#include "irq.h"
#include "svm.h"

/*
 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
 * be delivered, e.g. because the vCPU isn't running.  Use the vCPU's index
 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
 * lookup on the index, whereas vCPUs whose index doesn't match their ID need
 * to walk the entire xarray of vCPUs in the worst case scenario.
 *
 * For the vCPU index, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs.  Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_IDX_MASK		AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT		HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK			(GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUIDX(x)	(x & AVIC_VCPU_IDX_MASK)

#define __AVIC_GATAG(vm_id, vcpu_idx)	((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
					 ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
#define AVIC_GATAG(vm_id, vcpu_idx)					\
({									\
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx);			\
									\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx));	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id));		\
	ga_tag;								\
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
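
/*
 * Worked example, assuming AVIC_PHYSICAL_MAX_INDEX_MASK is the 9-bit 0x1ff
 * mask: AVIC_VM_ID_SHIFT is then 9 and a GATag is laid out as
 * vm_id[31:9] | vcpu_idx[8:0].  E.g. AVIC_GATAG(0x2, 0x5) yields 0x405,
 * which decodes back to vm_id = 0x2 and vcpu_idx = 0x5 via
 * AVIC_GATAG_TO_VMID() and AVIC_GATAG_TO_VCPUIDX().
 */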

static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);

/* Note:
 * This hash table is used to map VM_ID to a struct kvm_svm,
 * when handling AMD IOMMU GALOG notification to schedule in
 * a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;

static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;

	/*
	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
	 * accesses, while interrupt injection to a running vCPU can be
	 * achieved using AVIC doorbell.  KVM disables the APIC access page
	 * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
	 * AVIC in hybrid mode activates only the doorbell mechanism.
	 */
	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
		/* Disabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, false);
	} else {
		/*
		 * Flush the TLB, the guest may have inserted a non-APIC
		 * mapping into the TLB while AVIC was disabled.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);

		/* For xAVIC and hybrid-xAVIC modes */
		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
		/* Enabling MSR intercept for x2APIC registers */
		svm_set_x2apic_msr_interception(svm, true);
	}
}

static void avic_deactivate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;

	/*
	 * If running nested and the guest uses its own MSR bitmap, there
	 * is no need to update L0's MSR bitmap.
	 */
	if (is_guest_mode(&svm->vcpu) &&
	    vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
		return;

	/* Enabling MSR intercept for x2APIC registers */
	svm_set_x2apic_msr_interception(svm, true);
}
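
/*
 * Rough summary of the (de)activation paths above:
 *   - xAVIC, and hybrid-AVIC when x2AVIC isn't supported: AVIC_ENABLE set,
 *     x2APIC MSRs intercepted (and emulated by KVM in the hybrid case).
 *   - x2AVIC (x2avic_enabled and the guest APIC in x2APIC mode): AVIC_ENABLE
 *     and X2APIC_MODE set, x2APIC MSR intercepts disabled.
 *   - Deactivated/inhibited: neither bit set, x2APIC MSRs intercepted again
 *     unless a nested guest owns the MSR bitmap.
 */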

/* Note:
 * This function is called from IOMMU driver to notify
 * SVM to schedule in a particular vCPU of a particular VM.
 */
int avic_ga_log_notifier(u32 ga_tag)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
	trace_kvm_avic_ga_log(vm_id, vcpu_idx);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/* Note:
	 * At this point, the IOMMU should have already set the pending
	 * bit in the vAPIC backing page.  So, we just need to schedule
	 * in the vcpu.
	 */
	if (vcpu)
		kvm_vcpu_wake_up(vcpu);

	return 0;
}

void avic_vm_destroy(struct kvm *kvm)
{
	unsigned long flags;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	if (!enable_apicv)
		return;

	free_page((unsigned long)kvm_svm->avic_logical_id_table);
	free_page((unsigned long)kvm_svm->avic_physical_id_table);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_del(&kvm_svm->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}

int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!kvm_svm->avic_physical_id_table)
		goto free_avic;

	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!kvm_svm->avic_logical_id_table)
		goto free_avic;

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use? Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	return 0;

free_avic:
	avic_vm_destroy(kvm);
	return err;
}

static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
{
	return __sme_set(__pa(svm->vcpu.arch.apic->regs));
}

void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);

	vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
	vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;

	if (kvm_apicv_activated(svm->vcpu.kvm))
		avic_activate_vmcb(svm);
	else
		avic_deactivate_vmcb(svm);
}
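
/*
 * Note, the backing page and table pointers programmed above are system
 * physical addresses with the SME encryption bit applied via __sme_set(),
 * as they are consumed directly by hardware, while avic_vapic_bar holds the
 * guest-physical APIC base (APIC_DEFAULT_PHYS_BASE, 0xfee00000) whose
 * accesses AVIC accelerates.
 */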

static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 id = vcpu->vcpu_id;
	u64 new_entry;

	/*
	 * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
	 * hardware.  Immediately clear apicv_active, i.e. don't wait until the
	 * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
	 * avic_vcpu_load() expects to be called if and only if the vCPU has
	 * fully initialized AVIC.
	 */
	if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
	    (id > X2AVIC_MAX_PHYSICAL_ID)) {
		kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
		vcpu->arch.apic->apicv_active = false;
		return 0;
	}

	BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
		     (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);

	if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
		return -EINVAL;

	if (kvm_apicv_activated(vcpu->kvm)) {
		int ret;

		/*
		 * Note, AVIC hardware walks the nested page table to check
		 * permissions, but does not use the SPA address specified in
		 * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
		 * pointer field of the VMCB.
		 */
		ret = kvm_alloc_apic_access_page(vcpu->kvm);
		if (ret)
			return ret;
	}

	/* Note, fls64() returns the bit position, +1. */
	BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
		     fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));

	/* Setting AVIC backing page address in the phy APIC ID table */
	new_entry = avic_get_backing_page_address(svm) |
		    AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
	svm->avic_physical_id_entry = new_entry;

	/*
	 * Initialize the real table, as vCPUs must have a valid entry in order
	 * for broadcast IPIs to function correctly (broadcast IPIs ignore
	 * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
	 */
	WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);

	return 0;
}
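
/*
 * For reference, a physical ID table entry (and the cached copy in
 * svm->avic_physical_id_entry) packs the host physical APIC ID in its low
 * bits, the backing page's physical address in the middle bits, and the
 * IsRunning and Valid flags at the top (see the AVIC_PHYSICAL_ID_ENTRY_*
 * masks).  avic_init_backing_page() fills in the address and Valid bit; the
 * host APIC ID and IsRunning are managed by __avic_vcpu_load() and
 * __avic_vcpu_put() below.
 */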

void avic_ring_doorbell(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point,
	 * which could result in signalling the wrong/previous pCPU.  But if
	 * that happens the vCPU is guaranteed to do a VMRUN (after being
	 * migrated) and thus will process pending interrupts, i.e. a doorbell
	 * is not needed (and the spurious one is harmless).
	 */
	int cpu = READ_ONCE(vcpu->cpu);

	if (cpu != get_cpu()) {
		wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
		trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
	}
	put_cpu();
}

static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
{
	vcpu->arch.apic->irr_pending = true;
	svm_complete_interrupt_delivery(vcpu,
					icrl & APIC_MODE_MASK,
					icrl & APIC_INT_LEVELTRIG,
					icrl & APIC_VECTOR_MASK);
}

static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
					  u32 icrl)
{
	/*
	 * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
	 * i.e. APIC ID == vCPU ID.
	 */
	struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);

	/* Once again, nothing to do if the target vCPU doesn't exist. */
	if (unlikely(!target_vcpu))
		return;

	avic_kick_vcpu(target_vcpu, icrl);
}

static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
					 u32 logid_index, u32 icrl)
{
	u32 physical_id;

	if (avic_logical_id_table) {
		u32 logid_entry = avic_logical_id_table[logid_index];

		/* Nothing to do if the logical destination is invalid. */
		if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
			return;

		physical_id = logid_entry &
			      AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	} else {
		/*
		 * For x2APIC, the logical APIC ID is a read-only value that is
		 * derived from the x2APIC ID, thus the x2APIC ID can be found
		 * by reversing the calculation (stored in logid_index).  Note,
		 * bits 31:20 of the x2APIC ID aren't propagated to the logical
		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
		 */
		physical_id = logid_index;
	}

	avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
}

/*
 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
 * destination APIC ID to vCPU without looping through all vCPUs.
 */
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
				       u32 icrl, u32 icrh, u32 index)
{
	int dest_mode = icrl & APIC_DEST_MASK;
	int shorthand = icrl & APIC_SHORT_MASK;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	u32 dest;

	if (shorthand != APIC_DEST_NOSHORT)
		return -EINVAL;

	if (apic_x2apic_mode(source))
		dest = icrh;
	else
		dest = GET_XAPIC_DEST_FIELD(icrh);

	if (dest_mode == APIC_DEST_PHYSICAL) {
		/* broadcast destination, use slow path */
		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
			return -EINVAL;
		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
			return -EINVAL;

		if (WARN_ON_ONCE(dest != index))
			return -EINVAL;

		avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
	} else {
		u32 *avic_logical_id_table;
		unsigned long bitmap, i;
		u32 cluster;

		if (apic_x2apic_mode(source)) {
			/* 16 bit dest mask, 16 bit cluster id */
			bitmap = dest & 0xFFFF;
			cluster = (dest >> 16) << 4;
		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
			/* 8 bit dest mask */
			bitmap = dest;
			cluster = 0;
		} else {
			/* 4 bit dest mask, 4 bit cluster id */
			bitmap = dest & 0xF;
			cluster = (dest >> 4) << 2;
		}

		/* Nothing to do if there are no destinations in the cluster. */
		if (unlikely(!bitmap))
			return 0;

		if (apic_x2apic_mode(source))
			avic_logical_id_table = NULL;
		else
			avic_logical_id_table = kvm_svm->avic_logical_id_table;

		/*
		 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
		 * IDs, thus each bit in the destination is guaranteed to map
		 * to at most one vCPU.
		 */
		for_each_set_bit(i, &bitmap, 16)
			avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
						     cluster + i, icrl);
	}

	return 0;
}
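
/*
 * Worked example of the logical-mode decoding above: an xAPIC cluster-mode
 * destination of 0x23 has cluster field 0x2 and 4-bit mask 0x3, so "cluster"
 * becomes 8 and bits 0 and 1 select logical ID table indices 8 and 9.  An
 * x2APIC destination of 0x0001000a has cluster 1 and bitmap 0xa (bits 1 and
 * 3), which resolves directly to x2APIC IDs 17 and 19.
 */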

static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
				   u32 icrl, u32 icrh, u32 index)
{
	u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
	unsigned long i;
	struct kvm_vcpu *vcpu;

	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
		return;

	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);

	/*
	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
	 * event.  There's no need to signal doorbells, as hardware has handled
	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
	 * since entered the guest will have processed pending IRQs at VMRUN.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
					dest, icrl & APIC_DEST_MASK))
			avic_kick_vcpu(vcpu, icrl);
	}
}

int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
	u32 icrl = svm->vmcb->control.exit_info_1;
	u32 id = svm->vmcb->control.exit_info_2 >> 32;
	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);

	switch (id) {
	case AVIC_IPI_FAILURE_INVALID_TARGET:
	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
		/*
		 * Emulate IPIs that are not handled by AVIC hardware, which
		 * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
		 * if _any_ targets are invalid, e.g. if the logical mode mask
		 * is a superset of running vCPUs.
		 *
		 * The exit is a trap, e.g. ICR holds the correct value and RIP
		 * has been advanced, KVM is responsible only for emulating the
		 * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
		 * in which case KVM needs to emulate the ICR write as well in
		 * order to clear the BUSY flag.
		 */
		if (icrl & APIC_ICR_BUSY)
			kvm_apic_write_nodecode(vcpu, APIC_ICR);
		else
			kvm_apic_send_ipi(apic, icrl, icrh);
		break;
	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
		/*
		 * At this point, we expect that the AVIC HW has already
		 * set the appropriate IRR bits on the valid target
		 * vcpus.  So, we just need to kick the appropriate vcpu.
		 */
		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
		break;
	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
		WARN_ONCE(1, "Invalid backing page\n");
		break;
	case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
		/* Invalid IPI with vector < 16 */
		break;
	default:
		vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
	}

	return 1;
}

unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu))
		return APICV_INHIBIT_REASON_NESTED;
	return 0;
}

static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	u32 cluster, index;

	ldr = GET_APIC_LOGICAL_ID(ldr);

	if (flat) {
		cluster = 0;
	} else {
		cluster = (ldr >> 4);
		if (cluster >= 0xf)
			return NULL;
		ldr &= 0xf;
	}
	if (!ldr || !is_power_of_2(ldr))
		return NULL;

	index = __ffs(ldr);
	if (WARN_ON_ONCE(index > 7))
		return NULL;
	index += (cluster << 2);

	return &kvm_svm->avic_logical_id_table[index];
}

static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
	bool flat;
	u32 *entry, new_entry;

	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
	if (!entry)
		return;

	new_entry = READ_ONCE(*entry);
	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
	WRITE_ONCE(*entry, new_entry);
}

static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
	u32 *entry;

	/* Note: x2AVIC does not use logical APIC ID table */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
	if (entry)
		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}

static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
	u32 id = kvm_xapic_id(vcpu->arch.apic);

	/* AVIC does not support LDR update for x2APIC */
	if (apic_x2apic_mode(vcpu->arch.apic))
		return;

	if (ldr == svm->ldr_reg)
		return;

	avic_invalidate_logical_id_entry(vcpu);

	svm->ldr_reg = ldr;
	avic_ldr_write(vcpu, id, ldr);
}

static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);

	if (svm->dfr_reg == dfr)
		return;

	avic_invalidate_logical_id_entry(vcpu);
	svm->dfr_reg = dfr;
}

static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;

	switch (offset) {
	case APIC_LDR:
		avic_handle_ldr_update(vcpu);
		break;
	case APIC_DFR:
		avic_handle_dfr_update(vcpu);
		break;
	case APIC_RRR:
		/* Ignore writes to Read Remote Data, it's read-only. */
		return 1;
	default:
		break;
	}

	kvm_apic_write_nodecode(vcpu, offset);
	return 1;
}

static bool is_avic_unaccelerated_access_trap(u32 offset)
{
	bool ret = false;

	switch (offset) {
	case APIC_ID:
	case APIC_EOI:
	case APIC_RRR:
	case APIC_LDR:
	case APIC_DFR:
	case APIC_SPIV:
	case APIC_ESR:
	case APIC_ICR:
	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
	case APIC_TMICT:
	case APIC_TDCR:
		ret = true;
		break;
	default:
		break;
	}
	return ret;
}
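
/*
 * Unaccelerated APIC accesses come in two flavors: "trap" exits for the
 * registers listed above, where hardware has already committed the write to
 * the backing page and advanced RIP, so KVM only needs to post-process the
 * new value, and "fault" exits for everything else, where KVM must decode
 * and emulate the access itself.
 */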

int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret = 0;
	u32 offset = svm->vmcb->control.exit_info_1 &
		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
	u32 vector = svm->vmcb->control.exit_info_2 &
		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
	bool trap = is_avic_unaccelerated_access_trap(offset);

	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
					    trap, write, vector);
	if (trap) {
		/* Handling Trap */
		WARN_ONCE(!write, "svm: Handling trap read.\n");
		ret = avic_unaccel_trap_write(vcpu);
	} else {
		/* Handling Fault */
		ret = kvm_emulate_instruction(vcpu, 0);
	}

	return ret;
}

int avic_init_vcpu(struct vcpu_svm *svm)
{
	int ret;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	INIT_LIST_HEAD(&svm->ir_list);
	spin_lock_init(&svm->ir_list_lock);

	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
		return 0;

	ret = avic_init_backing_page(vcpu);
	if (ret)
		return ret;

	svm->dfr_reg = APIC_DFR_FLAT;

	return ret;
}

void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	avic_handle_dfr_update(vcpu);
	avic_handle_ldr_update(vcpu);
}

static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
	unsigned long flags;

	if (!vcpu)
		return;

	spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
	list_del(&irqfd->vcpu_list);
	spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}

int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
			unsigned int host_irq, uint32_t guest_irq,
			struct kvm_vcpu *vcpu, u32 vector)
{
	/*
	 * If the IRQ was affined to a different vCPU, remove the IRTE metadata
	 * from the *previous* vCPU's list.
	 */
	svm_ir_list_del(irqfd);

	if (vcpu) {
		/*
		 * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
		 * in which case configure the IRTE for legacy mode, but track
		 * the IRTE metadata so that it can be converted to guest mode
		 * if AVIC is enabled/uninhibited in the future.
		 */
		struct amd_iommu_pi_data pi_data = {
			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
					     vcpu->vcpu_idx),
			.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
			.vector = vector,
		};
		struct vcpu_svm *svm = to_svm(vcpu);
		u64 entry;
		int ret;

		/*
		 * Prevent the vCPU from being scheduled out or migrated until
		 * the IRTE is updated and its metadata has been added to the
		 * list of IRQs being posted to the vCPU, to ensure the IRTE
		 * isn't programmed with stale pCPU/IsRunning information.
		 */
		guard(spinlock_irqsave)(&svm->ir_list_lock);

		/*
		 * Update the target pCPU for IOMMU doorbells if the vCPU is
		 * running.  If the vCPU is NOT running, i.e. is blocking or
		 * scheduled out, KVM will update the pCPU info when the vCPU
		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
		 */
		entry = svm->avic_physical_id_entry;
		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
		} else {
			pi_data.cpu = -1;
			pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
		}

		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
		if (ret)
			return ret;

		/*
		 * Revert to legacy mode if the IOMMU didn't provide metadata
		 * for the IRTE, which KVM needs to keep the IRTE up-to-date,
		 * e.g. if the vCPU is migrated or AVIC is disabled.
		 */
		if (WARN_ON_ONCE(!pi_data.ir_data)) {
			irq_set_vcpu_affinity(host_irq, NULL);
			return -EIO;
		}

		irqfd->irq_bypass_data = pi_data.ir_data;
		list_add(&irqfd->vcpu_list, &svm->ir_list);
		return 0;
	}
	return irq_set_vcpu_affinity(host_irq, NULL);
}
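
/*
 * High-level flow for device posted interrupts: once an IRTE is in guest
 * mode, the IOMMU sets the vector in the target vCPU's vAPIC backing page
 * and rings the doorbell if IsRunning is set; otherwise it can log a GA
 * event, in which case avic_ga_log_notifier() wakes the target vCPU.
 */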

enum avic_vcpu_action {
	/*
	 * There is no need to differentiate between activate and deactivate,
	 * as KVM only refreshes AVIC state when the vCPU is scheduled in and
	 * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
	 * being (de)activated.
	 */
	AVIC_TOGGLE_ON_OFF	= BIT(0),
	AVIC_ACTIVATE		= AVIC_TOGGLE_ON_OFF,
	AVIC_DEACTIVATE		= AVIC_TOGGLE_ON_OFF,

	/*
	 * No unique action is required to deal with a vCPU that stops/starts
	 * running.  A vCPU that starts running by definition stops blocking as
	 * well, and a vCPU that stops running can't have been blocking, i.e.
	 * doesn't need to toggle GALogIntr.
	 */
	AVIC_START_RUNNING	= 0,
	AVIC_STOP_RUNNING	= 0,

	/*
	 * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
	 * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
	 * sent to the vCPU.
	 */
	AVIC_START_BLOCKING	= BIT(1),
};

static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
					    enum avic_vcpu_action action)
{
	bool ga_log_intr = (action & AVIC_START_BLOCKING);
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_kernel_irqfd *irqfd;

	lockdep_assert_held(&svm->ir_list_lock);

	/*
	 * Here, we go through the per-vCPU ir_list to update all existing
	 * interrupt remapping table entries targeting this vCPU.
	 */
	if (list_empty(&svm->ir_list))
		return;

	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
		void *data = irqfd->irq_bypass_data;

		if (!(action & AVIC_TOGGLE_ON_OFF))
			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
		else if (cpu >= 0)
			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
		else
			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
	}
}

static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
			     enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	int h_physical_id = kvm_cpu_get_apicid(cpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
		return;

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
		return;

	/*
	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
	 * _currently_ have assigned devices, as that can change.  Holding
	 * ir_list_lock ensures that either svm_ir_list_add() will consume
	 * up-to-date entry information, or that this task will wait until
	 * svm_ir_list_add() completes to set the new target pCPU.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	entry = svm->avic_physical_id_entry;
	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	svm->avic_physical_id_entry = entry;

	/*
	 * If IPI virtualization is disabled, clear IsRunning when updating the
	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
	 * things going sideways if hardware peeks at the ID.
	 */
	if (!enable_ipiv)
		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/*
	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
	 * is being scheduled in after being preempted.  The CPU entries in the
	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
	 * If the vCPU was migrated, its new CPU value will be stuffed when the
	 * vCPU unblocks.
	 */
	if (kvm_vcpu_is_blocking(vcpu))
		return;

	__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
}

static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags;
	u64 entry = svm->avic_physical_id_entry;

	lockdep_assert_preemption_disabled();

	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
		return;

	/*
	 * Take and hold the per-vCPU interrupt remapping lock while updating
	 * the Physical ID entry even though the lock doesn't protect against
	 * multiple writers (see above).  Holding ir_list_lock ensures that
	 * either svm_ir_list_add() will consume up-to-date entry information,
	 * or that this task will wait until svm_ir_list_add() completes to
	 * mark the vCPU as not running.
	 */
	spin_lock_irqsave(&svm->ir_list_lock, flags);

	avic_update_iommu_vcpu_affinity(vcpu, -1, action);

	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);

	/*
	 * Keep the previous APIC ID in the entry so that a rogue doorbell from
	 * hardware is at least restricted to a CPU associated with the vCPU.
	 */
	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

	if (enable_ipiv)
		WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);

	/*
	 * Note!  Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
	 * it's a synthetic flag that usurps an unused should-be-zero bit.
	 */
	if (action & AVIC_START_BLOCKING)
		entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;

	svm->avic_physical_id_entry = entry;

	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}

void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	/*
	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
	 * recursively.
	 */
	u64 entry = to_svm(vcpu)->avic_physical_id_entry;

	/*
	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
	 * vCPU is preempted while it's in the process of blocking.  WARN if
	 * the vCPU wasn't running and isn't blocking, as KVM shouldn't attempt
	 * to put the AVIC if it wasn't previously loaded.
	 */
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
		if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
			return;

		/*
		 * The vCPU was preempted while blocking, ensure its IRTEs are
		 * configured to generate GA Log Interrupts.
		 */
		if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
			return;
	}

	__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
							   AVIC_STOP_RUNNING);
}

void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	if (!lapic_in_kernel(vcpu) || !enable_apicv)
		return;

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * While AVIC is temporarily deactivated, the guest could
		 * update the APIC ID, DFR and LDR registers, which would not
		 * be trapped by avic_unaccelerated_access_interception().  In
		 * this case, we need to check and update the AVIC logical
		 * APIC ID table accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		avic_activate_vmcb(svm);
	} else {
		avic_deactivate_vmcb(svm);
	}
	vmcb_mark_dirty(vmcb, VMCB_AVIC);
}

void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	if (!enable_apicv)
		return;

	/* APICv should only be toggled on/off while the vCPU is running. */
	WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));

	avic_refresh_virtual_apic_mode(vcpu);

	if (kvm_vcpu_apicv_active(vcpu))
		__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
	else
		__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}

void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
	 * actually blocks.
	 *
	 * Note, any IRQs that arrive before IsRunning=0 will not cause an
	 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
	 * this by checking vIRR one last time before blocking.  The memory
	 * barrier implicit in set_current_state orders writing IsRunning=0
	 * before reading the vIRR.  The processor needs a matching memory
	 * barrier on interrupt delivery between writing IRR and reading
	 * IsRunning; the lack of this barrier might be the cause of errata
	 * #1235.
	 *
	 * Clear IsRunning even if guest IRQs are disabled, i.e. even if KVM
	 * doesn't need to detect events for scheduling purposes.  The doorbell
	 * used to signal running vCPUs cannot be blocked, i.e. will perturb
	 * the CPU and cause noisy neighbor problems if the VM is sending
	 * interrupts to the vCPU while it's scheduled out.
	 */
	__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}

void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	avic_vcpu_load(vcpu, vcpu->cpu);
}
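
/*
 * Recap of the vCPU lifecycle hooks above: avic_vcpu_load() stuffs the host
 * APIC ID and sets IsRunning in the physical ID entry, avic_vcpu_put()
 * clears IsRunning, and avic_vcpu_blocking()/avic_vcpu_unblocking()
 * additionally toggle GA log interrupts in the associated IRTEs so that a
 * posted interrupt can wake a blocking vCPU.
 */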

/*
 * Note:
 * - The module param avic enables both xAPIC and x2APIC mode.
 * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
 * - The mode can be switched at run-time.
 */
bool avic_hardware_setup(void)
{
	if (!npt_enabled)
		return false;

	/* AVIC is a prerequisite for x2AVIC. */
	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
		if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
			pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled");
			pr_warn(FW_BUG "Try enabling AVIC using the force_avic option");
		}
		return false;
	}

	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
	    !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
		pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
		return false;
	}

	if (boot_cpu_has(X86_FEATURE_AVIC)) {
		pr_info("AVIC enabled\n");
	} else if (force_avic) {
		/*
		 * Some older systems do not advertise AVIC support.
		 * See Revision Guide for specific AMD processor for more detail.
		 */
		pr_warn("AVIC is not supported in CPUID but force enabled");
		pr_warn("Your system might crash and burn");
	}

	/* AVIC is a prerequisite for x2AVIC. */
	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
	if (x2avic_enabled)
		pr_info("x2AVIC enabled\n");

	/*
	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
	 * due to erratum 1235, which results in missed VM-Exits on the sender
	 * and thus missed wake events for blocking vCPUs due to the CPU
	 * failing to see a software update to clear IsRunning.
	 */
	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;

	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

	return true;
}