1 // SPDX-License-Identifier: GPL-2.0-only 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/kvm_host.h> 5 6 #include <asm/irq_remapping.h> 7 #include <asm/cpu.h> 8 9 #include "lapic.h" 10 #include "irq.h" 11 #include "posted_intr.h" 12 #include "trace.h" 13 #include "vmx.h" 14 15 /* 16 * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() 17 * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when 18 * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. 19 * The vCPUs posted interrupt descriptor is updated at the same time to set its 20 * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices 21 * wake the target vCPUs. vCPUs are removed from the list and the notification 22 * vector is reset when the vCPU is scheduled in. 23 */ 24 static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); 25 /* 26 * Protect the per-CPU list with a per-CPU spinlock to handle task migration. 27 * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the 28 * ->sched_in() path will need to take the vCPU off the list of the _previous_ 29 * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will 30 * occur if a wakeup IRQ arrives and attempts to acquire the lock. 31 */ 32 static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); 33 34 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) 35 { 36 return &(to_vmx(vcpu)->pi_desc); 37 } 38 39 static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) 40 { 41 /* 42 * PID.ON can be set at any time by a different vCPU or by hardware, 43 * e.g. a device. PID.control must be written atomically, and the 44 * update must be retried with a fresh snapshot an ON change causes 45 * the cmpxchg to fail. 46 */ 47 if (!try_cmpxchg64(&pi_desc->control, pold, new)) 48 return -EBUSY; 49 50 return 0; 51 } 52 53 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) 54 { 55 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 56 struct vcpu_vmx *vmx = to_vmx(vcpu); 57 struct pi_desc old, new; 58 unsigned long flags; 59 unsigned int dest; 60 61 /* 62 * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and 63 * PI.SN up-to-date even if there is no assigned device or if APICv is 64 * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. 65 */ 66 if (!enable_apicv || !lapic_in_kernel(vcpu)) 67 return; 68 69 /* 70 * If the vCPU wasn't on the wakeup list and wasn't migrated, then the 71 * full update can be skipped as neither the vector nor the destination 72 * needs to be changed. 73 */ 74 if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { 75 /* 76 * Clear SN if it was set due to being preempted. Again, do 77 * this even if there is no assigned device for simplicity. 78 */ 79 if (pi_test_and_clear_sn(pi_desc)) 80 goto after_clear_sn; 81 return; 82 } 83 84 local_irq_save(flags); 85 86 /* 87 * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup 88 * list of the _previous_ pCPU, which will not be the same as the 89 * current pCPU if the task was migrated. 90 */ 91 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { 92 raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 93 list_del(&vmx->pi_wakeup_list); 94 raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 95 } 96 97 dest = cpu_physical_id(cpu); 98 if (!x2apic_mode) 99 dest = (dest << 8) & 0xFF00; 100 101 old.control = READ_ONCE(pi_desc->control); 102 do { 103 new.control = old.control; 104 105 /* 106 * Clear SN (as above) and refresh the destination APIC ID to 107 * handle task migration (@cpu != vcpu->cpu). 108 */ 109 new.ndst = dest; 110 new.sn = 0; 111 112 /* 113 * Restore the notification vector; in the blocking case, the 114 * descriptor was modified on "put" to use the wakeup vector. 115 */ 116 new.nv = POSTED_INTR_VECTOR; 117 } while (pi_try_set_control(pi_desc, &old.control, new.control)); 118 119 local_irq_restore(flags); 120 121 after_clear_sn: 122 123 /* 124 * Clear SN before reading the bitmap. The VT-d firmware 125 * writes the bitmap and reads SN atomically (5.2.3 in the 126 * spec), so it doesn't really have a memory barrier that 127 * pairs with this, but we cannot do that and we need one. 128 */ 129 smp_mb__after_atomic(); 130 131 if (!pi_is_pir_empty(pi_desc)) 132 pi_set_on(pi_desc); 133 } 134 135 static bool vmx_can_use_vtd_pi(struct kvm *kvm) 136 { 137 return irqchip_in_kernel(kvm) && enable_apicv && 138 kvm_arch_has_assigned_device(kvm) && 139 irq_remapping_cap(IRQ_POSTING_CAP); 140 } 141 142 /* 143 * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set 144 * WAKEUP as the notification vector in the PI descriptor. 145 */ 146 static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) 147 { 148 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 149 struct vcpu_vmx *vmx = to_vmx(vcpu); 150 struct pi_desc old, new; 151 unsigned long flags; 152 153 local_irq_save(flags); 154 155 raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 156 list_add_tail(&vmx->pi_wakeup_list, 157 &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); 158 raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 159 160 WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); 161 162 old.control = READ_ONCE(pi_desc->control); 163 do { 164 /* set 'NV' to 'wakeup vector' */ 165 new.control = old.control; 166 new.nv = POSTED_INTR_WAKEUP_VECTOR; 167 } while (pi_try_set_control(pi_desc, &old.control, new.control)); 168 169 /* 170 * Send a wakeup IPI to this CPU if an interrupt may have been posted 171 * before the notification vector was updated, in which case the IRQ 172 * will arrive on the non-wakeup vector. An IPI is needed as calling 173 * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not 174 * enabled until it is safe to call try_to_wake_up() on the task being 175 * scheduled out). 176 */ 177 if (pi_test_on(&new)) 178 apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); 179 180 local_irq_restore(flags); 181 } 182 183 static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) 184 { 185 /* 186 * The default posted interrupt vector does nothing when 187 * invoked outside guest mode. Return whether a blocked vCPU 188 * can be the target of posted interrupts, as is the case when 189 * using either IPI virtualization or VT-d PI, so that the 190 * notification vector is switched to the one that calls 191 * back to the pi_wakeup_handler() function. 192 */ 193 return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); 194 } 195 196 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) 197 { 198 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 199 200 if (!vmx_needs_pi_wakeup(vcpu)) 201 return; 202 203 if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) 204 pi_enable_wakeup_handler(vcpu); 205 206 /* 207 * Set SN when the vCPU is preempted. Note, the vCPU can both be seen 208 * as blocking and preempted, e.g. if it's preempted between setting 209 * its wait state and manually scheduling out. 210 */ 211 if (vcpu->preempted) 212 pi_set_sn(pi_desc); 213 } 214 215 /* 216 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 217 */ 218 void pi_wakeup_handler(void) 219 { 220 int cpu = smp_processor_id(); 221 struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); 222 raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); 223 struct vcpu_vmx *vmx; 224 225 raw_spin_lock(spinlock); 226 list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { 227 228 if (pi_test_on(&vmx->pi_desc)) 229 kvm_vcpu_wake_up(&vmx->vcpu); 230 } 231 raw_spin_unlock(spinlock); 232 } 233 234 void __init pi_init_cpu(int cpu) 235 { 236 INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); 237 raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); 238 } 239 240 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) 241 { 242 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 243 244 return pi_test_on(pi_desc) || 245 (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); 246 } 247 248 249 /* 250 * Bail out of the block loop if the VM has an assigned 251 * device, but the blocking vCPU didn't reconfigure the 252 * PI.NV to the wakeup vector, i.e. the assigned device 253 * came along after the initial check in vmx_vcpu_pi_put(). 254 */ 255 void vmx_pi_start_assignment(struct kvm *kvm) 256 { 257 if (!irq_remapping_cap(IRQ_POSTING_CAP)) 258 return; 259 260 kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); 261 } 262 263 /* 264 * vmx_pi_update_irte - set IRTE for Posted-Interrupts 265 * 266 * @kvm: kvm 267 * @host_irq: host irq of the interrupt 268 * @guest_irq: gsi of the interrupt 269 * @set: set or unset PI 270 * returns 0 on success, < 0 on failure 271 */ 272 int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, 273 uint32_t guest_irq, bool set) 274 { 275 struct kvm_kernel_irq_routing_entry *e; 276 struct kvm_irq_routing_table *irq_rt; 277 struct kvm_lapic_irq irq; 278 struct kvm_vcpu *vcpu; 279 struct vcpu_data vcpu_info; 280 int idx, ret = 0; 281 282 if (!vmx_can_use_vtd_pi(kvm)) 283 return 0; 284 285 idx = srcu_read_lock(&kvm->irq_srcu); 286 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 287 if (guest_irq >= irq_rt->nr_rt_entries || 288 hlist_empty(&irq_rt->map[guest_irq])) { 289 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", 290 guest_irq, irq_rt->nr_rt_entries); 291 goto out; 292 } 293 294 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 295 if (e->type != KVM_IRQ_ROUTING_MSI) 296 continue; 297 /* 298 * VT-d PI cannot support posting multicast/broadcast 299 * interrupts to a vCPU, we still use interrupt remapping 300 * for these kind of interrupts. 301 * 302 * For lowest-priority interrupts, we only support 303 * those with single CPU as the destination, e.g. user 304 * configures the interrupts via /proc/irq or uses 305 * irqbalance to make the interrupts single-CPU. 306 * 307 * We will support full lowest-priority interrupt later. 308 * 309 * In addition, we can only inject generic interrupts using 310 * the PI mechanism, refuse to route others through it. 311 */ 312 313 kvm_set_msi_irq(kvm, e, &irq); 314 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 315 !kvm_irq_is_postable(&irq)) { 316 /* 317 * Make sure the IRTE is in remapped mode if 318 * we don't handle it in posted mode. 319 */ 320 ret = irq_set_vcpu_affinity(host_irq, NULL); 321 if (ret < 0) { 322 printk(KERN_INFO 323 "failed to back to remapped mode, irq: %u\n", 324 host_irq); 325 goto out; 326 } 327 328 continue; 329 } 330 331 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); 332 vcpu_info.vector = irq.vector; 333 334 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, 335 vcpu_info.vector, vcpu_info.pi_desc_addr, set); 336 337 if (set) 338 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); 339 else 340 ret = irq_set_vcpu_affinity(host_irq, NULL); 341 342 if (ret < 0) { 343 printk(KERN_INFO "%s: failed to update PI IRTE\n", 344 __func__); 345 goto out; 346 } 347 } 348 349 ret = 0; 350 out: 351 srcu_read_unlock(&kvm->irq_srcu, idx); 352 return ret; 353 } 354