// SPDX-License-Identifier: GPL-2.0-only

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/uaccess.h>

#include <kvm/arm_vgic.h>

#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "vgic.h"

#define ICH_LRN(n)	(ICH_LR0_EL2 + (n))
#define ICH_AP0RN(n)	(ICH_AP0R0_EL2 + (n))
#define ICH_AP1RN(n)	(ICH_AP1R0_EL2 + (n))

struct mi_state {
	u16	eisr;
	u16	elrsr;
	bool	pend;
};

/*
 * The shadow registers loaded to the hardware when running a L2 guest
 * with the virtual IMO/FMO bits set.
 */
struct shadow_if {
	struct vgic_v3_cpu_if	cpuif;
	unsigned long		lr_map;
};

static DEFINE_PER_CPU(struct shadow_if, shadow_if);

static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
{
	return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
}

/*
 * Nesting GICv3 support
 *
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
 * completely controls the interrupts injected via the list registers.
 * Consequently, most of the state that is modified by the guest (by ACK-ing
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
 * keep a semi-consistent view of the interrupts.
 *
 * This still applies for a NV guest, but only while "InHost" (either
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
 *
 * When running a L2 guest ("not InHost"), things are radically different,
 * as the L1 guest is in charge of provisioning the interrupts via its own
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
 * page. This means that the flow described above does not apply (there is
 * no state to rebuild in the L0 hypervisor), and that most things happen
 * on L2 load/put:
 *
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
 *   per-CPU data structure that is used to populate the actual LRs. This is
 *   an extra copy that we could avoid, but life is short. In the process,
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
 *   on the host, should the host consider it a HW one. This allows the HW
 *   deactivation to take its course, such as for the timer.
 *
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
 * - there is nothing to do on L2 entry apart from enabling the vgic, as
 *   everything will have happened on load. However, this is the point where
 *   we detect that an interrupt is targeting L1 and prepare the grand
 *   switcheroo.
 *
 * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate
 *   the corresponding L1 interrupt. The L0 active state will be cleared by
 *   the HW if the L1 interrupt was itself backed by a HW interrupt.
 *
 * Maintenance Interrupt (MI) management:
 *
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
 * used as a handover point between L2 and L1.
 *
 * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending,
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
 *   run and process the MI.
 *
 * - the L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
 *   state must be computed at each entry/exit of the guest, much like we do
 *   it for the PMU interrupt.
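 *
 *   As a (simplified) sketch of what this computation amounts to, the level
 *   of the L1 MI is:
 *
 *     level = ICH_HCR_EL2.En && (ICH_MISR_EL2 != 0)
 *
 *   with both values derived from the in-memory (VNCR) view of the L1
 *   registers; see vgic_v3_nested_update_mi().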
 *
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
 *   quality of emulation is poor: L1 can set up the vgic so that an MI would
 *   immediately fire, and not observe anything until the next exit.
 *   Similarly, a pending MI is not immediately disabled by clearing
 *   ICH_HCR_EL2.En. Reading ICH_MISR_EL2 (which always traps, giving L0 a
 *   chance to recompute the state) would do the trick, for example.
 *
 * System register emulation:
 *
 * We get two classes of registers:
 *
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
 *   them, and L0 doesn't see a thing.
 *
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
 *   that are built on the fly based on the in-memory state.
 *
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
 * and a NV L2 would either access the VNCR page provided by L1 (memory-based
 * registers), or see the access redirected to L1 (registers that trap)
 * thanks to NV being set by L1.
 */

bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
	u64 xmo;

	if (is_nested_ctxt(vcpu)) {
		xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
		WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
			  "Separate virtual IRQ/FIQ settings not supported\n");

		return !!xmo;
	}

	return false;
}

static struct shadow_if *get_shadow_if(void)
{
	return this_cpu_ptr(&shadow_if);
}

static bool lr_triggers_eoi(u64 lr)
{
	return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
}

static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
{
	u16 eisr = 0, elrsr = 0;
	bool pend = false;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (lr_triggers_eoi(lr))
			eisr |= BIT(i);
		if (!(lr & ICH_LR_STATE))
			elrsr |= BIT(i);
		pend |= (lr & ICH_LR_PENDING_BIT);
	}

	mi_state->eisr = eisr;
	mi_state->elrsr = elrsr;
	mi_state->pend = pend;
}

u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.eisr;
}

u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.elrsr;
}

u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;
	u64 reg = 0, hcr, vmcr;

	hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);

	vgic_compute_mi_state(vcpu, &mi_state);

	if (mi_state.eisr)
		reg |= ICH_MISR_EL2_EOI;

	if (hcr & ICH_HCR_EL2_UIE) {
		int used_lrs = kvm_vgic_global_state.nr_lr;

		used_lrs -= hweight16(mi_state.elrsr);
		reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
	}

	if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
		reg |= ICH_MISR_EL2_LRENP;

	if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
		reg |= ICH_MISR_EL2_NP;

	if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_EL2_VENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0E;

	if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_EL2_VENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0D;

	if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_EL2_VENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1E;

	if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_EL2_VENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1D;

	return reg;
}

static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
{
	struct vgic_irq *irq;

	if (!(lr & ICH_LR_HW))
		return lr;

	/* We have the HW bit set, check for validity of pINTID */
	irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	/* If there was no real mapping, nuke the HW bit */
	if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
		lr &= ~ICH_LR_HW;

	/* Translate the virtual mapping to the real one, even if invalid */
	if (irq) {
		lr &= ~ICH_LR_PHYS_ID_MASK;
		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
		vgic_put_irq(vcpu->kvm, irq);
	}

	return lr;
}

/*
 * For LRs which have the HW bit set, such as timer interrupts, we modify
 * them to have the host hardware interrupt number instead of the virtual
 * one programmed by the guest hypervisor.
 */
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
				     struct vgic_v3_cpu_if *s_cpu_if)
{
	struct shadow_if *shadow_if;

	shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
	shadow_if->lr_map = 0;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (!(lr & ICH_LR_STATE))
			continue;

		lr = translate_lr_pintid(vcpu, lr);

		s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
		shadow_if->lr_map |= BIT(i);
	}

	s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}

void vgic_v3_flush_nested(struct kvm_vcpu *vcpu)
{
	u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);

	write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2);
}

void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	int i;

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		u64 val, host_lr, lr;

		host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));

		/* Propagate the new LR state */
		lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
		val = lr & ~ICH_LR_STATE;
		val |= host_lr & ICH_LR_STATE;
		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);

		/*
		 * Deactivation of a HW interrupt: the LR must have the HW
		 * bit set, have been in a non-invalid state before the run,
		 * and now be in an invalid state. If any of that doesn't
		 * hold, we're done with this LR.
		 */
		if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) &&
		      !(host_lr & ICH_LR_STATE)))
			continue;

		/*
		 * If we had a HW lr programmed by the guest hypervisor, we
		 * need to emulate the HW effect between the guest hypervisor
		 * and the nested guest.
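		 *
		 * What L1 programmed as the "physical" INTID in this LR is
		 * really one of its own virtual interrupts: deactivate it so
		 * that L1 observes the deactivation performed by L2.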
		 */
		vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	}

	/* We need these to be synchronised to generate the MI */
	__vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2));
	__vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount_MASK);
	__vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount_MASK);

	write_sysreg_s(0, SYS_ICH_HCR_EL2);
	isb();

	vgic_v3_nested_update_mi(vcpu);
}

static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
					struct vgic_v3_cpu_if *s_cpu_if)
{
	struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
	int i;

	s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
	s_cpu_if->vgic_sre = host_if->vgic_sre;

	for (i = 0; i < 4; i++) {
		s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
		s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
	}

	vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
}

void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;

	BUG_ON(!vgic_state_is_nested(vcpu));

	vgic_v3_create_shadow_state(vcpu, cpu_if);

	__vgic_v3_restore_vmcr_aprs(cpu_if);
	__vgic_v3_activate_traps(cpu_if);

	for (int i = 0; i < cpu_if->used_lrs; i++)
		__gic_v3_set_lr(cpu_if->vgic_lr[i], i);

	/*
	 * Propagate the number of used LRs for the benefit of the HYP
	 * GICv3 emulation code. Yes, this is a pretty sorry hack.
	 */
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
}

void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
	int i;

	__vgic_v3_save_aprs(s_cpu_if);

	for (i = 0; i < 4; i++) {
		__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
		__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
	}

	for (i = 0; i < s_cpu_if->used_lrs; i++)
		__gic_v3_set_lr(0, i);

	__vgic_v3_deactivate_traps(s_cpu_if);

	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

/*
 * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
 * then we need to forward this to L1 so that it can re-sync the appropriate
 * LRs and sample level-triggered interrupts again.
 */
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
{
	bool state = read_sysreg_s(SYS_ICH_MISR_EL2);

	/* This will force a switch back to L1 if the level is high */
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, state, vcpu);

	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
}

void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
	bool level;

	level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu);
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}