xref: /illumos-gate/usr/src/uts/intel/io/vmm/io/vlapic.c (revision 29219719c034367724cbf77434175b3c4e681e43)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 * Copyright (c) 2019 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2014 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/cpuset.h>

#include <x86/specialreg.h>
#include <x86/apicreg.h>

#include <machine/clock.h>

#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmm_lapic.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vlapic.h"
#include "vlapic_priv.h"
#include "vioapic.h"


/*
 * The 4 high bits of a given interrupt vector represent its priority.  The same
 * is true for the contents of the TPR when it is used to calculate the ultimate
 * PPR of an APIC - the 4 high bits hold the priority.
 */
#define	PRIO(x)			((x) & 0xf0)
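/*
 * For example, PRIO(0x4a) == 0x40: vectors 0x40-0x4f all share priority
 * class 4, and a TPR of 0x4a masks that entire class and everything below it.
 */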

#define	VLAPIC_VERSION		(16)

/*
 * The 'vlapic->timer_lock' is used to provide mutual exclusion between the
 * vlapic_callout_handler() and vcpu accesses to:
 * - timer_cur_freq, timer_period, timer_fire_when
 * - timer LVT register
 */
#define	VLAPIC_TIMER_LOCK(vlapic)	mutex_enter(&((vlapic)->timer_lock))
#define	VLAPIC_TIMER_UNLOCK(vlapic)	mutex_exit(&((vlapic)->timer_lock))
#define	VLAPIC_TIMER_LOCKED(vlapic)	MUTEX_HELD(&((vlapic)->timer_lock))

/*
 * APIC timer frequency:
 * - arbitrary but chosen to be in the ballpark of contemporary hardware.
 * - power-of-two to avoid loss of precision when calculating times
 */
#define	VLAPIC_BUS_FREQ		(128 * 1024 * 1024)
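/*
 * For example, with the divide-by-2 DCR setting the timer ticks at
 * 64 * 1024 * 1024 Hz, so an initial count of 0x1000000 (16777216 ticks)
 * expires after exactly 0.25 seconds.
 */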

#define	APICBASE_ADDR_MASK	0xfffffffffffff000UL

static void vlapic_set_error(struct vlapic *, uint32_t, bool);
static void vlapic_callout_handler(void *arg);

#ifdef __ISRVEC_DEBUG
static void vlapic_isrstk_accept(struct vlapic *, int);
static void vlapic_isrstk_eoi(struct vlapic *, int);
static void vlapic_isrstk_verify(const struct vlapic *);
#endif /* __ISRVEC_DEBUG */


static __inline bool
vlapic_x2mode(const struct vlapic *vlapic)
{
	return ((vlapic->msr_apicbase & APICBASE_X2APIC) != 0);
}

static __inline bool
vlapic_hw_disabled(const struct vlapic *vlapic)
{
	return ((vlapic->msr_apicbase & APICBASE_ENABLED) == 0);
}

static __inline bool
vlapic_sw_disabled(const struct vlapic *vlapic)
{
	const struct LAPIC *lapic = vlapic->apic_page;

	return ((lapic->svr & APIC_SVR_ENABLE) == 0);
}

static __inline bool
vlapic_enabled(const struct vlapic *vlapic)
{
	return (!vlapic_hw_disabled(vlapic) && !vlapic_sw_disabled(vlapic));
}

static __inline uint32_t
vlapic_get_id(struct vlapic *vlapic)
{

	if (vlapic_x2mode(vlapic))
		return (vlapic->vcpuid);
	else
		return (vlapic->vcpuid << 24);
}

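/*
 * In x2APIC mode the LDR is derived from the APIC ID: bits 31:16 hold the
 * cluster (ID >> 4) and the low 16 bits hold a one-hot logical ID within
 * the cluster.  For example, APIC ID 0x26 yields an LDR of 0x20040
 * (cluster 2, bit 6).
 */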
static uint32_t
x2apic_ldr(struct vlapic *vlapic)
{
	int apicid;
	uint32_t ldr;

	apicid = vlapic_get_id(vlapic);
	ldr = 1 << (apicid & 0xf);
	ldr |= (apicid & 0xffff0) << 12;
	return (ldr);
}

void
vlapic_dfr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	if (vlapic_x2mode(vlapic)) {
		VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x",
		    lapic->dfr);
		lapic->dfr = 0;
		return;
	}

	lapic->dfr &= APIC_DFR_MODEL_MASK;
	lapic->dfr |= APIC_DFR_RESERVED;
}

void
vlapic_ldr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;

	/* LDR is read-only in x2apic mode */
	if (vlapic_x2mode(vlapic)) {
		VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x",
		    lapic->ldr);
		lapic->ldr = x2apic_ldr(vlapic);
	} else {
		lapic->ldr &= ~APIC_LDR_RESERVED;
		VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
	}
}

void
vlapic_id_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	/*
	 * We don't allow the ID register to be modified so reset it back to
	 * its default value.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
}

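/*
 * Bits 0, 1, and 3 of the DCR (hence the 0xB mask) select the bus-clock
 * divisor; e.g. APIC_TDCR_16 requests that the timer count at one sixteenth
 * of the bus frequency.
 */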
static int
vlapic_timer_divisor(uint32_t dcr)
{
	switch (dcr & 0xB) {
	case APIC_TDCR_1:
		return (1);
	case APIC_TDCR_2:
		return (2);
	case APIC_TDCR_4:
		return (4);
	case APIC_TDCR_8:
		return (8);
	case APIC_TDCR_16:
		return (16);
	case APIC_TDCR_32:
		return (32);
	case APIC_TDCR_64:
		return (64);
	case APIC_TDCR_128:
		return (128);
	default:
		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
	}
}

#if 0
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
	    *lvt & APIC_LVTT_M);
}
#endif

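/*
 * The CCR is not decremented in the background.  Instead, it is computed on
 * demand from the absolute callout deadline: the remaining time multiplied by
 * the current timer frequency, or 0 once the deadline has passed.
 */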
static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	uint32_t ccr;

	ccr = 0;
	lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_active(&vlapic->callout)) {
		/*
		 * If the timer is scheduled to expire in the future then
		 * compute the value of 'ccr' based on the remaining time.
		 */

		const hrtime_t now = gethrtime();
		if (vlapic->timer_fire_when > now) {
			ccr += hrt_freq_count(vlapic->timer_fire_when - now,
			    vlapic->timer_cur_freq);
		}
	}
	KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %x, "
	    "icr_timer is %x", ccr, lapic->icr_timer));
	VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
	    ccr, lapic->icr_timer);
	VLAPIC_TIMER_UNLOCK(vlapic);
	return (ccr);
}

void
vlapic_dcr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	int divisor;

	lapic = vlapic->apic_page;
	VLAPIC_TIMER_LOCK(vlapic);

	divisor = vlapic_timer_divisor(lapic->dcr_timer);
	VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d",
	    lapic->dcr_timer, divisor);

	/*
	 * Update the timer frequency and the timer period.
	 *
	 * XXX changes to the frequency divider will not take effect until
	 * the timer is reloaded.
	 */
	vlapic->timer_cur_freq = VLAPIC_BUS_FREQ / divisor;
	vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq,
	    lapic->icr_timer);

	VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_esr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->esr = vlapic->esr_pending;
	vlapic->esr_pending = 0;
}

vcpu_notify_t
vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
	struct LAPIC *lapic;
	uint32_t *irrptr, *tmrptr, mask, tmr;
	int idx;

	KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));

	lapic = vlapic->apic_page;
	if (!(lapic->svr & APIC_SVR_ENABLE)) {
		/* ignore interrupt on software-disabled APIC */
		return (VCPU_NOTIFY_NONE);
	}

	if (vector < 16) {
		vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR,
		    false);

		/*
		 * If the error LVT is configured to interrupt the vCPU, it will
		 * have delivered a notification through that mechanism.
		 */
		return (VCPU_NOTIFY_NONE);
	}

	if (vlapic->ops.set_intr_ready) {
		return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
	}

	idx = (vector / 32) * 4;
	mask = 1 << (vector % 32);
	tmrptr = &lapic->tmr0;
	irrptr = &lapic->irr0;

	/*
	 * Update TMR for requested vector, if necessary.
	 * This must be done prior to asserting the bit in IRR so that the
	 * proper TMR state is always visible before the to-be-queued interrupt
	 * can be injected.
	 */
	tmr = atomic_load_acq_32(&tmrptr[idx]);
	if ((tmr & mask) != (level ? mask : 0)) {
		if (level) {
			atomic_set_int(&tmrptr[idx], mask);
		} else {
			atomic_clear_int(&tmrptr[idx], mask);
		}
	}

	/* Now set the bit in IRR */
	atomic_set_int(&irrptr[idx], mask);

	return (VCPU_NOTIFY_EXIT);
}

static __inline uint32_t *
vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	int		i;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		return (&lapic->lvt_cmci);
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
		return ((&lapic->lvt_timer) + i);
	default:
		panic("vlapic_get_lvt: invalid LVT\n");
	}
}

static __inline int
lvt_off_to_idx(uint32_t offset)
{
	int index;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		index = APIC_LVT_CMCI;
		break;
	case APIC_OFFSET_TIMER_LVT:
		index = APIC_LVT_TIMER;
		break;
	case APIC_OFFSET_THERM_LVT:
		index = APIC_LVT_THERMAL;
		break;
	case APIC_OFFSET_PERF_LVT:
		index = APIC_LVT_PMC;
		break;
	case APIC_OFFSET_LINT0_LVT:
		index = APIC_LVT_LINT0;
		break;
	case APIC_OFFSET_LINT1_LVT:
		index = APIC_LVT_LINT1;
		break;
	case APIC_OFFSET_ERROR_LVT:
		index = APIC_LVT_ERROR;
		break;
	default:
		index = -1;
		break;
	}
	KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
	    "invalid lvt index %d for offset %x", index, offset));

	return (index);
}

static __inline uint32_t
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
	int idx;
	uint32_t val;

	idx = lvt_off_to_idx(offset);
	val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
	return (val);
}

void
vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
{
	uint32_t *lvtptr, mask, val;
	struct LAPIC *lapic;
	int idx;

	lapic = vlapic->apic_page;
	lvtptr = vlapic_get_lvtptr(vlapic, offset);
	val = *lvtptr;
	idx = lvt_off_to_idx(offset);

	if (!(lapic->svr & APIC_SVR_ENABLE))
		val |= APIC_LVT_M;
	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
	switch (offset) {
	case APIC_OFFSET_TIMER_LVT:
		mask |= APIC_LVTT_TM;
		break;
	case APIC_OFFSET_ERROR_LVT:
		break;
	case APIC_OFFSET_LINT0_LVT:
	case APIC_OFFSET_LINT1_LVT:
		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
		/* FALLTHROUGH */
	default:
		mask |= APIC_LVT_DM;
		break;
	}
	val &= mask;
	*lvtptr = val;
	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
}

static void
vlapic_mask_lvts(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	lapic->lvt_cmci |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);

	lapic->lvt_timer |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);

	lapic->lvt_thermal |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);

	lapic->lvt_pcint |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);

	lapic->lvt_lint0 |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);

	lapic->lvt_lint1 |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);

	lapic->lvt_error |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
}

static int
vlapic_fire_lvt(struct vlapic *vlapic, uint_t lvt)
{
	uint32_t mode, reg, vec;
	vcpu_notify_t notify;

	reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]);

	if (reg & APIC_LVT_M)
		return (0);
	vec = reg & APIC_LVT_VECTOR;
	mode = reg & APIC_LVT_DM;

	switch (mode) {
	case APIC_LVT_DM_FIXED:
		if (vec < 16) {
			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
			    lvt == APIC_LVT_ERROR);
			return (0);
		}
		notify = vlapic_set_intr_ready(vlapic, vec, false);
		vcpu_notify_event_type(vlapic->vm, vlapic->vcpuid, notify);
		break;
	case APIC_LVT_DM_NMI:
		(void) vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
		break;
	case APIC_LVT_DM_EXTINT:
		(void) vm_inject_extint(vlapic->vm, vlapic->vcpuid);
		break;
	default:
		/* Other modes ignored */
		return (0);
	}
	return (1);
}

static uint_t
vlapic_active_isr(struct vlapic *vlapic)
{
	int i;
	uint32_t *isrp;

	isrp = &vlapic->apic_page->isr7;

	for (i = 7; i >= 0; i--, isrp -= 4) {
		uint32_t reg = *isrp;

		if (reg != 0) {
			uint_t vec = (i * 32) + bsrl(reg);

			if (vec < 16) {
				/*
				 * Truncate the illegal low vectors to a value
				 * of 0, indicating that no active ISR was
				 * found.
				 */
				return (0);
			}
			return (vec);
		}
	}

	return (0);
}

/*
 * After events which might arbitrarily change the value of PPR, such as a TPR
 * write or an EOI, calculate that new PPR value and store it in the APIC page.
 */
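/*
 * For example, with TPR = 0x30 and vector 0x41 in-service, PRIO(0x41) exceeds
 * PRIO(0x30), so the PPR becomes 0x40 and all class-4 (and lower) vectors are
 * masked until the in-service interrupt is EOIed.
 */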
static void
vlapic_update_ppr(struct vlapic *vlapic)
{
	int isrvec, tpr, ppr;

	isrvec = vlapic_active_isr(vlapic);
	tpr = vlapic->apic_page->tpr;

	/*
	 * Algorithm adopted from section "Interrupt, Task and Processor
	 * Priority" in Intel Architecture Manual Vol 3a.
	 */
	if (PRIO(tpr) >= PRIO(isrvec)) {
		ppr = tpr;
	} else {
		ppr = PRIO(isrvec);
	}

	vlapic->apic_page->ppr = ppr;
	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}

/*
 * When a vector is asserted in ISR as in-service, the PPR must be raised to the
 * priority of that vector, as the vCPU would have been at a lower priority in
 * order for the vector to be accepted.
 */
static void
vlapic_raise_ppr(struct vlapic *vlapic, int vec)
{
	struct LAPIC *lapic = vlapic->apic_page;
	int ppr;

	ppr = PRIO(vec);

#ifdef __ISRVEC_DEBUG
	KASSERT(vec >= 16 && vec < 256, ("invalid vector %d", vec));
	KASSERT(ppr > lapic->tpr, ("ppr %x <= tpr %x", ppr, lapic->tpr));
	KASSERT(ppr > lapic->ppr, ("ppr %x <= old ppr %x", ppr, lapic->ppr));
	KASSERT(vec == (int)vlapic_active_isr(vlapic), ("ISR missing for ppr"));
#endif /* __ISRVEC_DEBUG */

	lapic->ppr = ppr;
	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}

void
vlapic_sync_tpr(struct vlapic *vlapic)
{
	vlapic_update_ppr(vlapic);
}

static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");

static void
vlapic_process_eoi(struct vlapic *vlapic)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*isrptr, *tmrptr;
	int		i;
	uint_t		idx, bitpos, vector;

	isrptr = &lapic->isr0;
	tmrptr = &lapic->tmr0;

	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		if (isrptr[idx] != 0) {
			bitpos = bsrl(isrptr[idx]);
			vector = i * 32 + bitpos;

			isrptr[idx] &= ~(1 << bitpos);
			VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d",
			    vector);
			VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
#ifdef __ISRVEC_DEBUG
			vlapic_isrstk_eoi(vlapic, vector);
#endif
			vlapic_update_ppr(vlapic);
			if ((tmrptr[idx] & (1 << bitpos)) != 0) {
				vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
				    vector);
			}
			return;
		}
	}
	VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI");
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1);
}

static __inline int
vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{

	return (lvt & mask);
}

static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
	uint32_t lvt;

	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);

	return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
}

static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");

static void
vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error)
{

	vlapic->esr_pending |= mask;

	/*
	 * Avoid infinite recursion if the error LVT itself is configured with
	 * an illegal vector.
	 */
	if (lvt_error)
		return;

	if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");

static void
vlapic_fire_timer(struct vlapic *vlapic)
{
	ASSERT(VLAPIC_TIMER_LOCKED(vlapic));

	if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) {
		VLAPIC_CTR0(vlapic, "vlapic timer fired");
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_CMC,
	"corrected machine check interrupts generated by vlapic");

void
vlapic_fire_cmci(struct vlapic *vlapic)
{

	if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
	}
}

static VMM_STAT_ARRAY(LVTS_TRIGGERED, VLAPIC_MAXLVT_INDEX + 1,
	"lvts triggered");

int
vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
{
	if (!vlapic_enabled(vlapic)) {
		/*
		 * When the local APIC is global/hardware disabled,
		 * LINT[1:0] pins are configured as INTR and NMI pins,
		 * respectively.
		 */
		switch (vector) {
			case APIC_LVT_LINT0:
				(void) vm_inject_extint(vlapic->vm,
				    vlapic->vcpuid);
				break;
			case APIC_LVT_LINT1:
				(void) vm_inject_nmi(vlapic->vm,
				    vlapic->vcpuid);
				break;
			default:
				break;
		}
		return (0);
	}

	switch (vector) {
	case APIC_LVT_LINT0:
	case APIC_LVT_LINT1:
	case APIC_LVT_TIMER:
	case APIC_LVT_ERROR:
	case APIC_LVT_PMC:
	case APIC_LVT_THERMAL:
	case APIC_LVT_CMCI:
		if (vlapic_fire_lvt(vlapic, vector)) {
			vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
			    LVTS_TRIGGERED, vector, 1);
		}
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

static void
vlapic_callout_reset(struct vlapic *vlapic)
{
	callout_reset_hrtime(&vlapic->callout, vlapic->timer_fire_when,
	    vlapic_callout_handler, vlapic, C_ABSOLUTE);
}

static void
vlapic_callout_handler(void *arg)
{
	struct vlapic *vlapic = arg;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_pending(&vlapic->callout))	/* callout was reset */
		goto done;

	if (!callout_active(&vlapic->callout))	/* callout was stopped */
		goto done;

	callout_deactivate(&vlapic->callout);

	vlapic_fire_timer(vlapic);

	if (vlapic_periodic_timer(vlapic)) {
		/*
		 * Compute the delta between when the timer was supposed to
		 * fire and the present time.  We can depend on the fact that
		 * cyclics (which underlie these callouts) will never be called
		 * early.
		 */
		const hrtime_t now = gethrtime();
		const hrtime_t delta = now - vlapic->timer_fire_when;
		if (delta >= vlapic->timer_period) {
			/*
			 * If we are so behind that we have missed an entire
			 * timer period, reset the time base rather than
			 * attempting to catch up.
			 */
			vlapic->timer_fire_when = now + vlapic->timer_period;
		} else {
			vlapic->timer_fire_when += vlapic->timer_period;
		}
		vlapic_callout_reset(vlapic);
	}
done:
	VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_icrtmr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq,
	    lapic->icr_timer);
	if (vlapic->timer_period != 0) {
		vlapic->timer_fire_when = gethrtime() + vlapic->timer_period;
		vlapic_callout_reset(vlapic);
	} else {
		vlapic->timer_fire_when = 0;
		callout_stop(&vlapic->callout);
	}
	VLAPIC_TIMER_UNLOCK(vlapic);
}

/*
 * This function populates 'dmask' with the set of vcpus that match the
 * addressing specified by the (dest, phys, lowprio) tuple.
 *
 * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
 * or xAPIC (8-bit) destination field.
 */
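/*
 * For example, in the xAPIC "Flat Model" an MDA of 0x03 selects every active
 * vcpu whose LDR (bits 31:24) has bit 0 or bit 1 set, since logical
 * destinations match on any intersecting bit.
 */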
void
vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
    bool lowprio, bool x2apic_dest)
{
	struct vlapic *vlapic;
	uint32_t dfr, ldr, ldest, cluster;
	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
	cpuset_t amask;
	int vcpuid;

	if ((x2apic_dest && dest == 0xffffffff) ||
	    (!x2apic_dest && dest == 0xff)) {
		/*
		 * Broadcast in both logical and physical modes.
		 */
		*dmask = vm_active_cpus(vm);
		return;
	}

	if (phys) {
		/*
		 * Physical mode: destination is APIC ID.
		 */
		CPU_ZERO(dmask);
		vcpuid = vm_apicid2vcpuid(vm, dest);
		amask = vm_active_cpus(vm);
		if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask))
			CPU_SET(vcpuid, dmask);
	} else {
		/*
		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
		 * bitmask. This model is only available in the xAPIC mode.
		 */
		mda_flat_ldest = dest & 0xff;

		/*
		 * In the "Cluster Model" the MDA is used to identify a
		 * specific cluster and a set of APICs in that cluster.
		 */
		if (x2apic_dest) {
			mda_cluster_id = dest >> 16;
			mda_cluster_ldest = dest & 0xffff;
		} else {
			mda_cluster_id = (dest >> 4) & 0xf;
			mda_cluster_ldest = dest & 0xf;
		}

		/*
		 * Logical mode: match each APIC that has a bit set
		 * in its LDR that matches a bit in the ldest.
		 */
		CPU_ZERO(dmask);
		amask = vm_active_cpus(vm);
		while ((vcpuid = CPU_FFS(&amask)) != 0) {
			vcpuid--;
			CPU_CLR(vcpuid, &amask);

			vlapic = vm_lapic(vm, vcpuid);
			dfr = vlapic->apic_page->dfr;
			ldr = vlapic->apic_page->ldr;

			if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_FLAT) {
				ldest = ldr >> 24;
				mda_ldest = mda_flat_ldest;
			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_CLUSTER) {
				if (vlapic_x2mode(vlapic)) {
					cluster = ldr >> 16;
					ldest = ldr & 0xffff;
				} else {
					cluster = ldr >> 28;
					ldest = (ldr >> 24) & 0xf;
				}
				if (cluster != mda_cluster_id)
					continue;
				mda_ldest = mda_cluster_ldest;
			} else {
				/*
				 * Guest has configured a bad logical
				 * model for this vcpu - skip it.
				 */
				VLAPIC_CTR1(vlapic, "vlapic has bad logical "
				    "model %x - cannot deliver interrupt", dfr);
				continue;
			}

			if ((mda_ldest & ldest) != 0) {
				CPU_SET(vcpuid, dmask);
				if (lowprio)
					break;
			}
		}
	}
}

static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu");
static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu");

static void
vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
{
	struct LAPIC *lapic = vlapic->apic_page;

	if (lapic->tpr != val) {
		VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed "
		    "from %#x to %#x", lapic->tpr, val);
		lapic->tpr = val;
		vlapic_update_ppr(vlapic);
	}
}

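/*
 * CR8 mirrors the high nibble (priority class) of the TPR: a CR8 value of N
 * corresponds to a TPR of N << 4.
 */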
void
vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
{
	uint8_t tpr;

	if (val & ~0xf) {
		vm_inject_gp(vlapic->vm, vlapic->vcpuid);
		return;
	}

	tpr = val << 4;
	vlapic_set_tpr(vlapic, tpr);
}

uint64_t
vlapic_get_cr8(struct vlapic *vlapic)
{
	const struct LAPIC *lapic = vlapic->apic_page;

	return (lapic->tpr >> 4);
}

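/*
 * Writes to ICR_LOW trigger IPI delivery.  The 64-bit ICR value carries the
 * destination in its high word: the full 32 bits in x2APIC mode, or only
 * bits 63:56 (the xAPIC destination byte) otherwise, hence the differing
 * shifts below.
 */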
void
vlapic_icrlo_write_handler(struct vlapic *vlapic)
{
	int i;
	cpuset_t dmask;
	uint64_t icrval;
	uint32_t dest, vec, mode, dsh;
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
	icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;

	if (vlapic_x2mode(vlapic))
		dest = icrval >> 32;
	else
		dest = icrval >> (32 + 24);
	vec = icrval & APIC_VECTOR_MASK;
	mode = icrval & APIC_DELMODE_MASK;
	dsh = icrval & APIC_DEST_MASK;

	if (mode == APIC_DELMODE_FIXED && vec < 16) {
		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false);
		return;
	}
	if (mode == APIC_DELMODE_INIT &&
	    (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
		/* No work required to deassert INIT */
		return;
	}
	if ((mode == APIC_DELMODE_STARTUP || mode == APIC_DELMODE_INIT) &&
	    !(dsh == APIC_DEST_DESTFLD || dsh == APIC_DEST_ALLESELF)) {
		/*
		 * While Intel makes no mention of restrictions for destination
		 * shorthand when sending INIT or SIPI, AMD requires either a
		 * specific destination or all-excluding self.  Common use seems
		 * to be restricted to those two cases.  Until handling is in
		 * place to halt a guest which makes such a frivolous request,
		 * we will ignore them.
		 */
		return;
	}

	switch (dsh) {
	case APIC_DEST_DESTFLD:
		vlapic_calcdest(vlapic->vm, &dmask, dest,
		    (icrval & APIC_DESTMODE_LOG) == 0, false,
		    vlapic_x2mode(vlapic));
		break;
	case APIC_DEST_SELF:
		CPU_SETOF(vlapic->vcpuid, &dmask);
		break;
	case APIC_DEST_ALLISELF:
		dmask = vm_active_cpus(vlapic->vm);
		break;
	case APIC_DEST_ALLESELF:
		dmask = vm_active_cpus(vlapic->vm);
		CPU_CLR(vlapic->vcpuid, &dmask);
		break;
	default:
		/*
		 * All possible delivery notations are covered above.
		 * We should never end up here.
		 */
		panic("unknown delivery shorthand: %x", dsh);
	}

	while ((i = CPU_FFS(&dmask)) != 0) {
		i--;
		CPU_CLR(i, &dmask);
		switch (mode) {
		case APIC_DELMODE_FIXED:
			(void) lapic_intr_edge(vlapic->vm, i, vec);
			vmm_stat_incr(vlapic->vm, vlapic->vcpuid,
			    VLAPIC_IPI_SEND, 1);
			vmm_stat_incr(vlapic->vm, i,
			    VLAPIC_IPI_RECV, 1);
			break;
		case APIC_DELMODE_NMI:
			(void) vm_inject_nmi(vlapic->vm, i);
			break;
		case APIC_DELMODE_INIT:
			(void) vm_inject_init(vlapic->vm, i);
			break;
		case APIC_DELMODE_STARTUP:
			(void) vm_inject_sipi(vlapic->vm, i, vec);
			break;
		case APIC_DELMODE_LOWPRIO:
		case APIC_DELMODE_SMI:
		default:
			/* Unhandled IPI modes (for now) */
			break;
		}
	}
}

void
vlapic_self_ipi_handler(struct vlapic *vlapic, uint32_t val)
{
	const int vec = val & 0xff;

	/* self-IPI is only exposed via x2APIC */
	ASSERT(vlapic_x2mode(vlapic));

	(void) lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_SEND, 1);
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_RECV, 1);
	VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec);
}

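/*
 * Scan the IRR from the highest vector down; the first bit found is
 * deliverable only if its priority class exceeds that of the PPR.  For
 * example, a PPR of 0x40 blocks a pending vector 0x45 but allows 0x51.
 */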
int
vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	int		 idx, i, bitpos, vector;
	uint32_t	*irrptr, val;

	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}

	irrptr = &lapic->irr0;

	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		val = atomic_load_acq_int(&irrptr[idx]);
		bitpos = fls(val);
		if (bitpos != 0) {
			vector = i * 32 + (bitpos - 1);
			if (PRIO(vector) > PRIO(lapic->ppr)) {
				VLAPIC_CTR1(vlapic, "pending intr %d", vector);
				if (vecptr != NULL)
					*vecptr = vector;
				return (1);
			} else
				break;
		}
	}
	return (0);
}

void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*irrptr, *isrptr;
	int		idx;

	KASSERT(vector >= 16 && vector < 256, ("invalid vector %d", vector));

	if (vlapic->ops.intr_accepted)
		return ((*vlapic->ops.intr_accepted)(vlapic, vector));

	/*
	 * clear the ready bit for vector being accepted in irr
	 * and set the vector as in service in isr.
	 */
	idx = (vector / 32) * 4;

	irrptr = &lapic->irr0;
	atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
	VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");

	isrptr = &lapic->isr0;
	isrptr[idx] |= 1 << (vector % 32);
	VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");

	/*
	 * The only way a fresh vector could be accepted into ISR is if it was
	 * of a higher priority than the current PPR.  With that vector now
	 * in-service, the PPR must be raised.
	 */
	vlapic_raise_ppr(vlapic, vector);

#ifdef __ISRVEC_DEBUG
	vlapic_isrstk_accept(vlapic, vector);
#endif
}

void
vlapic_svr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	uint32_t old, new, changed;

	lapic = vlapic->apic_page;

	new = lapic->svr;
	old = vlapic->svr_last;
	vlapic->svr_last = new;

	changed = old ^ new;
	if ((changed & APIC_SVR_ENABLE) != 0) {
		if ((new & APIC_SVR_ENABLE) == 0) {
			/*
			 * The apic is now disabled so stop the apic timer
			 * and mask all the LVT entries.
			 */
			VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
			VLAPIC_TIMER_LOCK(vlapic);
			callout_stop(&vlapic->callout);
			VLAPIC_TIMER_UNLOCK(vlapic);
			vlapic_mask_lvts(vlapic);
		} else {
			/*
			 * The apic is now enabled so restart the apic timer
			 * if it is configured in periodic mode.
			 */
			VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
			if (vlapic_periodic_timer(vlapic))
				vlapic_icrtmr_write_handler(vlapic);
		}
	}
}

static bool
vlapic_read(struct vlapic *vlapic, uint16_t offset, uint32_t *outp)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *reg;
	int i;

	ASSERT3U(offset & 0x3, ==, 0);
	ASSERT3U(offset, <, PAGESIZE);
	ASSERT3P(outp, !=, NULL);

	uint32_t data = 0;
	switch (offset) {
	case APIC_OFFSET_ID:
		data = lapic->id;
		break;
	case APIC_OFFSET_VER:
		data = lapic->version;
		break;
	case APIC_OFFSET_TPR:
		data = lapic->tpr;
		break;
	case APIC_OFFSET_APR:
		data = lapic->apr;
		break;
	case APIC_OFFSET_PPR:
		data = lapic->ppr;
		break;
	case APIC_OFFSET_LDR:
		data = lapic->ldr;
		break;
	case APIC_OFFSET_DFR:
		data = lapic->dfr;
		break;
	case APIC_OFFSET_SVR:
		data = lapic->svr;
		break;
	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
		i = (offset - APIC_OFFSET_ISR0) >> 2;
		reg = &lapic->isr0;
		data = *(reg + i);
		break;
	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
		i = (offset - APIC_OFFSET_TMR0) >> 2;
		reg = &lapic->tmr0;
		data = *(reg + i);
		break;
	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
		i = (offset - APIC_OFFSET_IRR0) >> 2;
		reg = &lapic->irr0;
		data = atomic_load_acq_int(reg + i);
		break;
	case APIC_OFFSET_ESR:
		data = lapic->esr;
		break;
	case APIC_OFFSET_ICR_LOW:
		data = lapic->icr_lo;
		break;
	case APIC_OFFSET_ICR_HI:
		data = lapic->icr_hi;
		break;
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		data = vlapic_get_lvt(vlapic, offset);
#ifdef INVARIANTS
		reg = vlapic_get_lvtptr(vlapic, offset);
		ASSERT3U(data, ==, *reg);
#endif
		break;
	case APIC_OFFSET_TIMER_ICR:
		data = lapic->icr_timer;
		break;
	case APIC_OFFSET_TIMER_CCR:
		data = vlapic_get_ccr(vlapic);
		break;
	case APIC_OFFSET_TIMER_DCR:
		data = lapic->dcr_timer;
		break;
	case APIC_OFFSET_RRR:
		data = 0;
		break;

	case APIC_OFFSET_SELF_IPI:
	case APIC_OFFSET_EOI:
		/* Write-only register */
		*outp = 0;
		return (false);

	default:
		/* Invalid register */
		*outp = 0;
		return (false);
	}

	*outp = data;
	return (true);
}

static bool
vlapic_write(struct vlapic *vlapic, uint16_t offset, uint32_t data)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*regptr;

	ASSERT3U(offset & 0xf, ==, 0);
	ASSERT3U(offset, <, PAGESIZE);

	switch (offset) {
	case APIC_OFFSET_ID:
		lapic->id = data;
		vlapic_id_write_handler(vlapic);
		break;
	case APIC_OFFSET_TPR:
		vlapic_set_tpr(vlapic, data & 0xff);
		break;
	case APIC_OFFSET_EOI:
		vlapic_process_eoi(vlapic);
		break;
	case APIC_OFFSET_LDR:
		lapic->ldr = data;
		vlapic_ldr_write_handler(vlapic);
		break;
	case APIC_OFFSET_DFR:
		lapic->dfr = data;
		vlapic_dfr_write_handler(vlapic);
		break;
	case APIC_OFFSET_SVR:
		lapic->svr = data;
		vlapic_svr_write_handler(vlapic);
		break;
	case APIC_OFFSET_ICR_LOW:
		lapic->icr_lo = data;
		vlapic_icrlo_write_handler(vlapic);
		break;
	case APIC_OFFSET_ICR_HI:
		lapic->icr_hi = data;
		break;
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		regptr = vlapic_get_lvtptr(vlapic, offset);
		*regptr = data;
		vlapic_lvt_write_handler(vlapic, offset);
		break;
	case APIC_OFFSET_TIMER_ICR:
		lapic->icr_timer = data;
		vlapic_icrtmr_write_handler(vlapic);
		break;

	case APIC_OFFSET_TIMER_DCR:
		lapic->dcr_timer = data;
		vlapic_dcr_write_handler(vlapic);
		break;

	case APIC_OFFSET_ESR:
		vlapic_esr_write_handler(vlapic);
		break;

	case APIC_OFFSET_SELF_IPI:
		if (vlapic_x2mode(vlapic))
			vlapic_self_ipi_handler(vlapic, data);
		break;

	case APIC_OFFSET_VER:
	case APIC_OFFSET_APR:
	case APIC_OFFSET_PPR:
	case APIC_OFFSET_RRR:
	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
	case APIC_OFFSET_TIMER_CCR:
		/* Read-only register */
		return (false);

	default:
		/* Invalid register */
		return (false);
	}

	return (true);
}

void
vlapic_reset(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *isrptr, *tmrptr, *irrptr;

	/* Reset any timer-related state first */
	VLAPIC_TIMER_LOCK(vlapic);
	callout_stop(&vlapic->callout);
	lapic->icr_timer = 0;
	lapic->ccr_timer = 0;
	VLAPIC_TIMER_UNLOCK(vlapic);
	lapic->dcr_timer = 0;
	vlapic_dcr_write_handler(vlapic);

	/*
	 * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so
	 * it is not leftover after the reset.  This is performed after the APIC
	 * timer has been stopped, in case it happened to fire just prior to
	 * being deactivated.
	 */
	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}

	vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
	if (vlapic->vcpuid == 0)
		vlapic->msr_apicbase |= APICBASE_BSP;

	lapic->id = vlapic_get_id(vlapic);
	lapic->version = VLAPIC_VERSION;
	lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);

	lapic->tpr = 0;
	lapic->apr = 0;
	lapic->ppr = 0;

#ifdef __ISRVEC_DEBUG
	/* With the PPR cleared, the isrvec tracking should be reset too */
	vlapic->isrvec_stk_top = 0;
#endif

	lapic->eoi = 0;
	lapic->ldr = 0;
	lapic->dfr = 0xffffffff;
	lapic->svr = APIC_SVR_VECTOR;
	vlapic->svr_last = lapic->svr;

	isrptr = &lapic->isr0;
	tmrptr = &lapic->tmr0;
	irrptr = &lapic->irr0;
	for (uint_t i = 0; i < 8; i++) {
		atomic_store_rel_int(&isrptr[i * 4], 0);
		atomic_store_rel_int(&tmrptr[i * 4], 0);
		atomic_store_rel_int(&irrptr[i * 4], 0);
	}

	lapic->esr = 0;
	vlapic->esr_pending = 0;
	lapic->icr_lo = 0;
	lapic->icr_hi = 0;

	lapic->lvt_cmci = 0;
	lapic->lvt_timer = 0;
	lapic->lvt_thermal = 0;
	lapic->lvt_pcint = 0;
	lapic->lvt_lint0 = 0;
	lapic->lvt_lint1 = 0;
	lapic->lvt_error = 0;
	vlapic_mask_lvts(vlapic);
}

void
vlapic_init(struct vlapic *vlapic)
{
	KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
	KASSERT(vlapic->vcpuid >= 0 &&
	    vlapic->vcpuid < vm_get_maxcpus(vlapic->vm),
	    ("vlapic_init: vcpuid is not initialized"));
	KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
	    "initialized"));

	/*
	 * If the vlapic is configured in x2apic mode then it will be
	 * accessed in the critical section via the MSR emulation code.
	 *
	 * Therefore the timer mutex must be a spinlock because blockable
	 * mutexes cannot be acquired in a critical section.
	 */
	mutex_init(&vlapic->timer_lock, NULL, MUTEX_ADAPTIVE, NULL);
	callout_init(&vlapic->callout, 1);

	vlapic_reset(vlapic);
}

void
vlapic_cleanup(struct vlapic *vlapic)
{
	callout_drain(&vlapic->callout);
	mutex_destroy(&vlapic->timer_lock);
}

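/*
 * MMIO access to the local APIC goes through the 4 KiB page at the APIC base
 * address, with registers placed at a 16-byte stride.  Only aligned 32-bit
 * accesses are architecturally defined, so narrower or misaligned reads are
 * shifted into position below.
 */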
1462 vlapic_mmio_read(struct vlapic *vlapic, uint64_t gpa, uint64_t *valp,
1463     uint_t size)
1464 {
1465 	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
1466 	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);
1467 
1468 	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
1469 	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
1470 		*valp = UINT64_MAX;
1471 		return (0);
1472 	}
1473 
1474 	const uint16_t off = gpa - DEFAULT_APIC_BASE;
1475 	uint32_t raw = 0;
1476 	(void) vlapic_read(vlapic, off & ~0xf, &raw);
1477 
1478 	/* Shift and mask reads which are small and/or unaligned */
1479 	const uint8_t align = off & 0xf;
1480 	if (align < 4) {
1481 		*valp = (uint64_t)raw << (align * 8);
1482 	} else {
1483 		*valp = 0;
1484 	}
1485 
1486 	return (0);
1487 }
1488 
1489 int
1490 vlapic_mmio_write(struct vlapic *vlapic, uint64_t gpa, uint64_t val,
1491     uint_t size)
1492 {
1493 	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
1494 	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);
1495 
1496 	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
1497 	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
1498 		return (0);
1499 	}
1500 
1501 	const uint16_t off = gpa - DEFAULT_APIC_BASE;
	/* Ignore writes which are not 32 bits wide and 16-byte aligned */
	if ((off & 0xf) != 0 || size != 4) {
		return (0);
	}

	(void) vlapic_write(vlapic, off, (uint32_t)val);
	return (0);
}

/* Should attempts to change the APIC base address be rejected with a #GP?  */
int vlapic_gp_on_addr_change = 1;

static vm_msr_result_t
vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
{
	const uint64_t diff = vlapic->msr_apicbase ^ val;

	/*
	 * Until the LAPIC emulation for switching between xAPIC and x2APIC
	 * modes is more polished, it will remain off-limits from being altered
	 * by the guest.
	 */
	const uint64_t reserved_bits = APICBASE_RESERVED | APICBASE_X2APIC |
	    APICBASE_BSP;
	if ((diff & reserved_bits) != 0) {
		return (VMR_GP);
	}

	/* We do not presently allow the LAPIC access address to be modified. */
	if ((diff & APICBASE_ADDR_MASK) != 0) {
		/*
		 * Explicitly rebuffing such requests with a #GP is the most
		 * straightforward way to handle the situation, but certain
		 * consumers (such as the KVM unit tests) may balk at the
		 * otherwise unexpected exception.
		 */
		if (vlapic_gp_on_addr_change) {
			return (VMR_GP);
		}

		/* If silence is required, just ignore the address change. */
		val = (val & ~APICBASE_ADDR_MASK) | DEFAULT_APIC_BASE;
	}

	vlapic->msr_apicbase = val;
	return (VMR_OK);
}

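/*
 * x2APIC MSRs map onto the xAPIC register layout at a 16-byte stride:
 * register offset = (MSR - 0x800) << 4.  For example, MSR 0x808 (the TPR)
 * maps to APIC_OFFSET_TPR (0x80).
 */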
static __inline uint16_t
vlapic_msr_to_regoff(uint32_t msr)
{
	ASSERT3U(msr, >=, MSR_APIC_000);
	ASSERT3U(msr, <, (MSR_APIC_000 + 0x100));

	return ((msr - MSR_APIC_000) << 4);
}

bool
vlapic_owned_msr(uint32_t msr)
{
	if (msr == MSR_APICBASE) {
		return (true);
	}
	if (msr >= MSR_APIC_000 &&
	    msr < (MSR_APIC_000 + 0x100)) {
		return (true);
	}
	return (false);
}

vm_msr_result_t
vlapic_rdmsr(struct vlapic *vlapic, uint32_t msr, uint64_t *valp)
{
	ASSERT(vlapic_owned_msr(msr));
	ASSERT3P(valp, !=, NULL);

	if (msr == MSR_APICBASE) {
		*valp = vlapic->msr_apicbase;
		return (VMR_OK);
	}

	/* #GP for x2APIC MSR accesses in xAPIC mode */
	if (!vlapic_x2mode(vlapic)) {
		return (VMR_GP);
	}

	uint64_t out = 0;
	const uint16_t reg = vlapic_msr_to_regoff(msr);
	switch (reg) {
	case APIC_OFFSET_ICR_LOW: {
		/* Read from ICR register gets entire (64-bit) value */
		uint32_t low = 0, high = 0;
		bool valid;

		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_HI, &high);
		VERIFY(valid);
		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_LOW, &low);
		VERIFY(valid);

		*valp = ((uint64_t)high << 32) | low;
		return (VMR_OK);
		}
	case APIC_OFFSET_ICR_HI:
		/* Already covered by ICR_LOW */
		return (VMR_GP);
	default:
		break;
	}
	if (!vlapic_read(vlapic, reg, (uint32_t *)&out)) {
		return (VMR_GP);
	}
	*valp = out;
	return (VMR_OK);
}

vm_msr_result_t
vlapic_wrmsr(struct vlapic *vlapic, uint32_t msr, uint64_t val)
{
	ASSERT(vlapic_owned_msr(msr));

	if (msr == MSR_APICBASE) {
		return (vlapic_set_apicbase(vlapic, val));
	}

	/* #GP for x2APIC MSR accesses in xAPIC mode */
	if (!vlapic_x2mode(vlapic)) {
		return (VMR_GP);
	}

	const uint16_t reg = vlapic_msr_to_regoff(msr);
	switch (reg) {
	case APIC_OFFSET_ICR_LOW: {
		/* Write to ICR register sets entire (64-bit) value */
		bool valid;

		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_HI, val >> 32);
		VERIFY(valid);
		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_LOW, val);
		VERIFY(valid);
		return (VMR_OK);
		}
	case APIC_OFFSET_ICR_HI:
		/* Already covered by ICR_LOW */
		return (VMR_GP);
	case APIC_OFFSET_ESR:
		/* Only 0 may be written from x2APIC mode */
		if (val != 0) {
			return (VMR_GP);
		}
		break;
	default:
		break;
	}
	if (!vlapic_write(vlapic, reg, val)) {
		return (VMR_GP);
	}
	return (VMR_OK);
}

void
vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	struct vlapic *vlapic;
	struct LAPIC *lapic;

	vlapic = vm_lapic(vm, vcpuid);

	if (state == X2APIC_DISABLED)
		vlapic->msr_apicbase &= ~APICBASE_X2APIC;
	else
		vlapic->msr_apicbase |= APICBASE_X2APIC;

	/*
	 * Reset the local APIC registers whose values are mode-dependent.
	 *
	 * XXX this works because the APIC mode can be changed only at vcpu
	 * initialization time.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
	if (vlapic_x2mode(vlapic)) {
		lapic->ldr = x2apic_ldr(vlapic);
		lapic->dfr = 0;
	} else {
		lapic->ldr = 0;
		lapic->dfr = 0xffffffff;
	}

	if (state == X2APIC_ENABLED) {
		if (vlapic->ops.enable_x2apic_mode)
			(*vlapic->ops.enable_x2apic_mode)(vlapic);
	}
}

void
vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
    int delmode, int vec)
{
	bool lowprio;
	int vcpuid;
	cpuset_t dmask;

	if (delmode != IOART_DELFIXED &&
	    delmode != IOART_DELLOPRI &&
	    delmode != IOART_DELEXINT) {
		VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode);
		return;
	}
	lowprio = (delmode == IOART_DELLOPRI);

	/*
	 * We don't provide any virtual interrupt redirection hardware so
	 * all interrupts originating from the ioapic or MSI specify the
	 * 'dest' in the legacy xAPIC format.
	 */
	vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);

	while ((vcpuid = CPU_FFS(&dmask)) != 0) {
		vcpuid--;
		CPU_CLR(vcpuid, &dmask);
		if (delmode == IOART_DELEXINT) {
			(void) vm_inject_extint(vm, vcpuid);
		} else {
			(void) lapic_set_intr(vm, vcpuid, vec, level);
		}
	}
}

void
vlapic_post_intr(struct vlapic *vlapic, int hostcpu)
{
	/*
	 * Post an interrupt to the vcpu currently running on 'hostcpu'.
	 *
	 * This is done by leveraging features like Posted Interrupts (Intel)
	 * or the Doorbell MSR (AMD AVIC) that avoid a VM exit.
	 *
	 * If neither of these features is available then fall back to
	 * sending an IPI to 'hostcpu'.
	 */
	if (vlapic->ops.post_intr)
		(*vlapic->ops.post_intr)(vlapic, hostcpu);
	else
		poke_cpu(hostcpu);
}

void
vlapic_localize_resources(struct vlapic *vlapic)
{
	vmm_glue_callout_localize(&vlapic->callout);
}

#ifdef __ISRVEC_DEBUG
static void
vlapic_isrstk_eoi(struct vlapic *vlapic, int vector)
{
	if (vlapic->isrvec_stk_top <= 0) {
		panic("invalid vlapic isrvec_stk_top %d",
		    vlapic->isrvec_stk_top);
	}
	vlapic->isrvec_stk_top--;
	vlapic_isrstk_verify(vlapic);
}

static void
vlapic_isrstk_accept(struct vlapic *vlapic, int vector)
{
	int stk_top;

	vlapic->isrvec_stk_top++;

	stk_top = vlapic->isrvec_stk_top;
	if (stk_top >= ISRVEC_STK_SIZE)
		panic("isrvec_stk_top overflow %d", stk_top);

	vlapic->isrvec_stk[stk_top] = vector;
	vlapic_isrstk_verify(vlapic);
}

static void
vlapic_isrstk_dump(const struct vlapic *vlapic)
{
	int i;
	uint32_t *isrptr;

	isrptr = &vlapic->apic_page->isr0;
	for (i = 0; i < 8; i++)
		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);

	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
}

static void
vlapic_isrstk_verify(const struct vlapic *vlapic)
{
	int i, lastprio, curprio, vector, idx;
	uint32_t *isrptr;

	/*
	 * Note: The value at index 0 in isrvec_stk is always 0.
	 *
	 * It is a placeholder for the value of ISR vector when no bits are set
	 * in the ISRx registers.
	 */
	if (vlapic->isrvec_stk_top == 0 && vlapic->isrvec_stk[0] != 0) {
		panic("isrvec_stk is corrupted: %d", vlapic->isrvec_stk[0]);
	}

	/*
	 * Make sure that the priority of the nested interrupts is
	 * always increasing.
	 */
	lastprio = -1;
	for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
		curprio = PRIO(vlapic->isrvec_stk[i]);
		if (curprio <= lastprio) {
			vlapic_isrstk_dump(vlapic);
			panic("isrvec_stk does not satisfy invariant");
		}
		lastprio = curprio;
	}

	/*
	 * Make sure that each bit set in the ISRx registers has a
	 * corresponding entry on the isrvec stack.
	 */
	i = 1;
	isrptr = &vlapic->apic_page->isr0;
	for (vector = 0; vector < 256; vector++) {
		idx = (vector / 32) * 4;
		if (isrptr[idx] & (1 << (vector % 32))) {
			if (i > vlapic->isrvec_stk_top ||
			    vlapic->isrvec_stk[i] != vector) {
				vlapic_isrstk_dump(vlapic);
				panic("ISR and isrvec_stk out of sync");
			}
			i++;
		}
	}
}
#endif