/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 * Copyright (c) 2019 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2014 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/cpuset.h>

#include <x86/specialreg.h>
#include <x86/apicreg.h>

#include <machine/clock.h>

#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"

#include "vlapic.h"
#include "vlapic_priv.h"
#include "vioapic.h"


/*
 * The 4 high bits of a given interrupt vector represent its priority.  The same
 * is true for the contents of the TPR when it is used to calculate the ultimate
 * PPR of an APIC - the 4 high bits hold the priority.
 */
#define	PRIO(x)			((x) & 0xf0)
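
/*
 * For example, PRIO(0x45) == 0x40: vector 0x45 shares a priority class with
 * vectors 0x40-0x4f, outranks any vector below 0x40, and is outranked by
 * vectors 0x50 and above.
 */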

#define	VLAPIC_VERSION		(16)

/*
 * The 'vlapic->timer_lock' is used to provide mutual exclusion between the
 * vlapic_callout_handler() and vcpu accesses to:
 * - timer_cur_freq, timer_period, timer_fire_when
 * - timer LVT register
 */
#define	VLAPIC_TIMER_LOCK(vlapic)	mutex_enter(&((vlapic)->timer_lock))
#define	VLAPIC_TIMER_UNLOCK(vlapic)	mutex_exit(&((vlapic)->timer_lock))
#define	VLAPIC_TIMER_LOCKED(vlapic)	MUTEX_HELD(&((vlapic)->timer_lock))

/*
 * APIC timer frequency:
 * - arbitrary but chosen to be in the ballpark of contemporary hardware.
 * - power-of-two to avoid loss of precision when calculating times
 */
#define	VLAPIC_BUS_FREQ		(128 * 1024 * 1024)
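
/*
 * For example, at the divide-by-2 setting the timer ticks at
 * VLAPIC_BUS_FREQ / 2 = 67108864 Hz, so an initial count of 0x400000
 * (4194304) ticks would expire in exactly 1/16 of a second.
 */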

#define	APICBASE_ADDR_MASK	0xfffffffffffff000UL

static void vlapic_set_error(struct vlapic *, uint32_t, bool);
static void vlapic_callout_handler(void *arg);

#ifdef __ISRVEC_DEBUG
static void vlapic_isrstk_accept(struct vlapic *, int);
static void vlapic_isrstk_eoi(struct vlapic *, int);
static void vlapic_isrstk_verify(const struct vlapic *);
#endif /* __ISRVEC_DEBUG */


static __inline bool
vlapic_x2mode(const struct vlapic *vlapic)
{
	return ((vlapic->msr_apicbase & APICBASE_X2APIC) != 0);
}

static __inline bool
vlapic_hw_disabled(const struct vlapic *vlapic)
{
	return ((vlapic->msr_apicbase & APICBASE_ENABLED) == 0);
}

static __inline bool
vlapic_sw_disabled(const struct vlapic *vlapic)
{
	const struct LAPIC *lapic = vlapic->apic_page;

	return ((lapic->svr & APIC_SVR_ENABLE) == 0);
}

static __inline bool
vlapic_enabled(const struct vlapic *vlapic)
{
	return (!vlapic_hw_disabled(vlapic) && !vlapic_sw_disabled(vlapic));
}

static __inline uint32_t
vlapic_get_id(struct vlapic *vlapic)
{

	if (vlapic_x2mode(vlapic))
		return (vlapic->vcpuid);
	else
		return (vlapic->vcpuid << 24);
}

static uint32_t
x2apic_ldr(struct vlapic *vlapic)
{
	int apicid;
	uint32_t ldr;

	apicid = vlapic_get_id(vlapic);
	ldr = 1 << (apicid & 0xf);
	ldr |= (apicid & 0xffff0) << 12;
	return (ldr);
}
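
/*
 * For example, an x2APIC ID of 0x25 yields a logical ID of 1 << 5 = 0x0020
 * in the low half and a cluster ID of 0x25 >> 4 = 0x2 in the high half,
 * for an LDR of 0x00020020.
 */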

void
vlapic_dfr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	if (vlapic_x2mode(vlapic)) {
		/* Ignore write to DFR in x2APIC mode */
		lapic->dfr = 0;
		return;
	}

	lapic->dfr &= APIC_DFR_MODEL_MASK;
	lapic->dfr |= APIC_DFR_RESERVED;
}

void
vlapic_ldr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;

	/* LDR is read-only in x2apic mode */
	if (vlapic_x2mode(vlapic)) {
		/* Ignore write to LDR in x2APIC mode */
		lapic->ldr = x2apic_ldr(vlapic);
	} else {
		lapic->ldr &= ~APIC_LDR_RESERVED;
	}
}

void
vlapic_id_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	/*
	 * We don't allow the ID register to be modified so reset it back to
	 * its default value.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
}

static int
vlapic_timer_divisor(uint32_t dcr)
{
	switch (dcr & 0xB) {
	case APIC_TDCR_1:
		return (1);
	case APIC_TDCR_2:
		return (2);
	case APIC_TDCR_4:
		return (4);
	case APIC_TDCR_8:
		return (8);
	case APIC_TDCR_16:
		return (16);
	case APIC_TDCR_32:
		return (32);
	case APIC_TDCR_64:
		return (64);
	case APIC_TDCR_128:
		return (128);
	default:
		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
	}
}
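
/*
 * The divisor is encoded in bits 0, 1 and 3 of the DCR (bit 2 is reserved),
 * hence the 0xB mask above.  For example, a DCR value of 0x9 (binary 1001)
 * selects divide-by-64, while 0xB (binary 1011) selects divide-by-1.
 */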

#if 0
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
	    *lvt & APIC_LVTT_M);
}
#endif

static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	uint32_t ccr;

	ccr = 0;
	lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_active(&vlapic->callout)) {
		/*
		 * If the timer is scheduled to expire in the future then
		 * compute the value of 'ccr' based on the remaining time.
		 */

		const hrtime_t now = gethrtime();
		if (vlapic->timer_fire_when > now) {
			ccr += hrt_freq_count(vlapic->timer_fire_when - now,
			    vlapic->timer_cur_freq);
		}
	}
	KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %x, "
	    "icr_timer is %x", ccr, lapic->icr_timer));
	VLAPIC_TIMER_UNLOCK(vlapic);
	return (ccr);
}
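
/*
 * For illustration, assuming hrt_freq_count() counts how many ticks of
 * timer_cur_freq fit in the remaining interval: with the timer running at
 * 67108864 Hz (divide-by-2) and 1/32 of a second left before
 * timer_fire_when, the CCR would read 67108864 / 32 = 0x200000 ticks.
 */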

void
vlapic_dcr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	int divisor;

	lapic = vlapic->apic_page;
	VLAPIC_TIMER_LOCK(vlapic);

	divisor = vlapic_timer_divisor(lapic->dcr_timer);

	/*
	 * Update the timer frequency and the timer period.
	 *
	 * XXX changes to the frequency divider will not take effect until
	 * the timer is reloaded.
	 */
	vlapic->timer_cur_freq = VLAPIC_BUS_FREQ / divisor;
	vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq,
	    lapic->icr_timer);

	VLAPIC_TIMER_UNLOCK(vlapic);
}
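
/*
 * For example, an icr_timer of 0x400000 with divide-by-2 configured gives
 * timer_cur_freq = 67108864 Hz and, assuming hrt_freq_interval() expresses
 * the count as nanoseconds, a timer_period of
 * 4194304 / 67108864 s = 62500000 ns.
 */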

void
vlapic_esr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->esr = vlapic->esr_pending;
	vlapic->esr_pending = 0;
}

vcpu_notify_t
vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
	struct LAPIC *lapic;
	uint32_t *irrptr, *tmrptr, mask, tmr;
	int idx;

	KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));

	lapic = vlapic->apic_page;
	if (!(lapic->svr & APIC_SVR_ENABLE)) {
		/* ignore interrupt on software-disabled APIC */
		return (VCPU_NOTIFY_NONE);
	}

	if (vector < 16) {
		vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR,
		    false);

		/*
		 * If the error LVT is configured to interrupt the vCPU, it will
		 * have delivered a notification through that mechanism.
		 */
		return (VCPU_NOTIFY_NONE);
	}

	if (vlapic->ops.set_intr_ready) {
		return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
	}

	idx = (vector / 32) * 4;
	mask = 1 << (vector % 32);
	tmrptr = &lapic->tmr0;
	irrptr = &lapic->irr0;

	/*
	 * Update TMR for requested vector, if necessary.
	 * This must be done prior to asserting the bit in IRR so that the
	 * proper TMR state is always visible before the to-be-queued interrupt
	 * can be injected.
	 */
	tmr = atomic_load_acq_32(&tmrptr[idx]);
	if ((tmr & mask) != (level ? mask : 0)) {
		if (level) {
			atomic_set_int(&tmrptr[idx], mask);
		} else {
			atomic_clear_int(&tmrptr[idx], mask);
		}
	}

	/* Now set the bit in IRR */
	atomic_set_int(&irrptr[idx], mask);

	return (VCPU_NOTIFY_EXIT);
}
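
/*
 * For example, vector 0x45 (decimal 69) lands in IRR/TMR word 69 / 32 = 2,
 * which is array index 2 * 4 = 8 given the 16-byte register stride, with a
 * mask of 1 << (69 % 32) = 1 << 5.
 */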

static __inline uint32_t *
vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	int		i;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		return (&lapic->lvt_cmci);
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
		return ((&lapic->lvt_timer) + i);
	default:
		panic("vlapic_get_lvt: invalid LVT\n");
	}
}

static __inline int
lvt_off_to_idx(uint32_t offset)
{
	int index;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		index = APIC_LVT_CMCI;
		break;
	case APIC_OFFSET_TIMER_LVT:
		index = APIC_LVT_TIMER;
		break;
	case APIC_OFFSET_THERM_LVT:
		index = APIC_LVT_THERMAL;
		break;
	case APIC_OFFSET_PERF_LVT:
		index = APIC_LVT_PMC;
		break;
	case APIC_OFFSET_LINT0_LVT:
		index = APIC_LVT_LINT0;
		break;
	case APIC_OFFSET_LINT1_LVT:
		index = APIC_LVT_LINT1;
		break;
	case APIC_OFFSET_ERROR_LVT:
		index = APIC_LVT_ERROR;
		break;
	default:
		index = -1;
		break;
	}
	KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
	    "invalid lvt index %d for offset %x", index, offset));

	return (index);
}

static __inline uint32_t
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
	int idx;
	uint32_t val;

	idx = lvt_off_to_idx(offset);
	val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
	return (val);
}

void
vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
{
	uint32_t *lvtptr, mask, val;
	struct LAPIC *lapic;
	int idx;

	lapic = vlapic->apic_page;
	lvtptr = vlapic_get_lvtptr(vlapic, offset);
	val = *lvtptr;
	idx = lvt_off_to_idx(offset);

	if (!(lapic->svr & APIC_SVR_ENABLE))
		val |= APIC_LVT_M;
	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
	switch (offset) {
	case APIC_OFFSET_TIMER_LVT:
		mask |= APIC_LVTT_TM;
		break;
	case APIC_OFFSET_ERROR_LVT:
		break;
	case APIC_OFFSET_LINT0_LVT:
	case APIC_OFFSET_LINT1_LVT:
		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
		/* FALLTHROUGH */
	default:
		mask |= APIC_LVT_DM;
		break;
	}
	val &= mask;
	*lvtptr = val;
	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
}

static void
vlapic_mask_lvts(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	lapic->lvt_cmci |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);

	lapic->lvt_timer |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);

	lapic->lvt_thermal |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);

	lapic->lvt_pcint |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);

	lapic->lvt_lint0 |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);

	lapic->lvt_lint1 |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);

	lapic->lvt_error |= APIC_LVT_M;
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
}

static int
vlapic_fire_lvt(struct vlapic *vlapic, uint_t lvt)
{
	uint32_t mode, reg, vec;
	vcpu_notify_t notify;

	reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]);

	if (reg & APIC_LVT_M)
		return (0);
	vec = reg & APIC_LVT_VECTOR;
	mode = reg & APIC_LVT_DM;

	switch (mode) {
	case APIC_LVT_DM_FIXED:
		if (vec < 16) {
			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
			    lvt == APIC_LVT_ERROR);
			return (0);
		}
		notify = vlapic_set_intr_ready(vlapic, vec, false);
		vcpu_notify_event_type(vlapic->vm, vlapic->vcpuid, notify);
		break;
	case APIC_LVT_DM_NMI:
		(void) vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
		break;
	case APIC_LVT_DM_EXTINT:
		(void) vm_inject_extint(vlapic->vm, vlapic->vcpuid);
		break;
	default:
		/* Other modes ignored */
		return (0);
	}
	return (1);
}

static uint_t
vlapic_active_isr(struct vlapic *vlapic)
{
	int i;
	uint32_t *isrp;

	isrp = &vlapic->apic_page->isr7;

	for (i = 7; i >= 0; i--, isrp -= 4) {
		uint32_t reg = *isrp;

		if (reg != 0) {
			uint_t vec = (i * 32) + bsrl(reg);

			if (vec < 16) {
				/*
				 * Truncate the illegal low vectors to a value
				 * of 0, indicating that no active ISR was
				 * found.
				 */
				return (0);
			}
			return (vec);
		}
	}

	return (0);
}

/*
 * After events which might arbitrarily change the value of PPR, such as a TPR
 * write or an EOI, calculate that new PPR value and store it in the APIC page.
 */
static void
vlapic_update_ppr(struct vlapic *vlapic)
{
	int isrvec, tpr, ppr;

	isrvec = vlapic_active_isr(vlapic);
	tpr = vlapic->apic_page->tpr;

	/*
	 * Algorithm adopted from section "Interrupt, Task and Processor
	 * Priority" in Intel Architecture Manual Vol 3a.
	 */
	if (PRIO(tpr) >= PRIO(isrvec)) {
		ppr = tpr;
	} else {
		ppr = PRIO(isrvec);
	}

	vlapic->apic_page->ppr = ppr;
}
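
/*
 * For example, with TPR = 0x3f and vector 0x45 in-service: PRIO(0x3f) = 0x30
 * is below PRIO(0x45) = 0x40, so the PPR becomes 0x40.  Raising the TPR to
 * 0x51 would instead yield a PPR of 0x51 (the full TPR, not just its class).
 */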

/*
 * When a vector is asserted in ISR as in-service, the PPR must be raised to the
 * priority of that vector, as the vCPU would have been at a lower priority in
 * order for the vector to be accepted.
 */
static void
vlapic_raise_ppr(struct vlapic *vlapic, int vec)
{
	struct LAPIC *lapic = vlapic->apic_page;
	int ppr;

	ppr = PRIO(vec);

#ifdef __ISRVEC_DEBUG
	KASSERT(vec >= 16 && vec < 256, ("invalid vector %d", vec));
	KASSERT(ppr > lapic->tpr, ("ppr %x <= tpr %x", ppr, lapic->tpr));
	KASSERT(ppr > lapic->ppr, ("ppr %x <= old ppr %x", ppr, lapic->ppr));
	KASSERT(vec == (int)vlapic_active_isr(vlapic), ("ISR missing for ppr"));
#endif /* __ISRVEC_DEBUG */

	lapic->ppr = ppr;
}

void
vlapic_sync_tpr(struct vlapic *vlapic)
{
	vlapic_update_ppr(vlapic);
}

static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");

static void
vlapic_process_eoi(struct vlapic *vlapic)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*isrptr, *tmrptr;
	int		i;
	uint_t		idx, bitpos, vector;

	isrptr = &lapic->isr0;
	tmrptr = &lapic->tmr0;

	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		if (isrptr[idx] != 0) {
			bitpos = bsrl(isrptr[idx]);
			vector = i * 32 + bitpos;

			isrptr[idx] &= ~(1 << bitpos);
#ifdef __ISRVEC_DEBUG
			vlapic_isrstk_eoi(vlapic, vector);
#endif
			vlapic_update_ppr(vlapic);
			if ((tmrptr[idx] & (1 << bitpos)) != 0) {
				vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
				    vector);
			}
			return;
		}
	}
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1);
}

static __inline int
vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{

	return (lvt & mask);
}

static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
	uint32_t lvt;

	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);

	return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
}

static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");

static void
vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error)
{

	vlapic->esr_pending |= mask;

	/*
	 * Avoid infinite recursion if the error LVT itself is configured with
	 * an illegal vector.
	 */
	if (lvt_error)
		return;

	if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");

static void
vlapic_fire_timer(struct vlapic *vlapic)
{
	ASSERT(VLAPIC_TIMER_LOCKED(vlapic));

	if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_CMC,
	"corrected machine check interrupts generated by vlapic");

void
vlapic_fire_cmci(struct vlapic *vlapic)
{

	if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
	}
}

static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1,
	"lvts triggered");

int
vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
{
	if (!vlapic_enabled(vlapic)) {
		/*
		 * When the local APIC is global/hardware disabled,
		 * LINT[1:0] pins are configured as INTR and NMI pins,
		 * respectively.
		 */
		switch (vector) {
			case APIC_LVT_LINT0:
				(void) vm_inject_extint(vlapic->vm,
				    vlapic->vcpuid);
				break;
			case APIC_LVT_LINT1:
				(void) vm_inject_nmi(vlapic->vm,
				    vlapic->vcpuid);
				break;
			default:
				break;
		}
		return (0);
	}

	switch (vector) {
	case APIC_LVT_LINT0:
	case APIC_LVT_LINT1:
	case APIC_LVT_TIMER:
	case APIC_LVT_ERROR:
	case APIC_LVT_PMC:
	case APIC_LVT_THERMAL:
	case APIC_LVT_CMCI:
		if (vlapic_fire_lvt(vlapic, vector)) {
			vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
			    LVTS_TRIGGERRED, vector, 1);
		}
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

static void
vlapic_callout_reset(struct vlapic *vlapic)
{
	callout_reset_hrtime(&vlapic->callout, vlapic->timer_fire_when,
	    vlapic_callout_handler, vlapic, C_ABSOLUTE);
}

static void
vlapic_callout_handler(void *arg)
{
	struct vlapic *vlapic = arg;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_pending(&vlapic->callout))	/* callout was reset */
		goto done;

	if (!callout_active(&vlapic->callout))	/* callout was stopped */
		goto done;

	callout_deactivate(&vlapic->callout);

	vlapic_fire_timer(vlapic);

	if (vlapic_periodic_timer(vlapic)) {
		/*
		 * Compute the delta between when the timer was supposed to
		 * fire and the present time.  We can depend on the fact that
		 * cyclics (which underlie these callouts) will never be called
		 * early.
		 */
		const hrtime_t now = gethrtime();
		const hrtime_t delta = now - vlapic->timer_fire_when;
		if (delta >= vlapic->timer_period) {
			/*
			 * If we are so behind that we have missed an entire
			 * timer period, reset the time base rather than
			 * attempting to catch up.
			 */
			vlapic->timer_fire_when = now + vlapic->timer_period;
		} else {
			vlapic->timer_fire_when += vlapic->timer_period;
		}
		vlapic_callout_reset(vlapic);
	}
done:
	VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_icrtmr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq,
	    lapic->icr_timer);
	if (vlapic->timer_period != 0) {
		vlapic->timer_fire_when = gethrtime() + vlapic->timer_period;
		vlapic_callout_reset(vlapic);
	} else {
		vlapic->timer_fire_when = 0;
		callout_stop(&vlapic->callout);
	}
	VLAPIC_TIMER_UNLOCK(vlapic);
}

/*
 * This function populates 'dmask' with the set of vcpus that match the
 * addressing specified by the (dest, phys, lowprio) tuple.
 *
 * 'x2apic_dest' specifies whether 'dest' is interpreted as an x2APIC (32-bit)
 * or xAPIC (8-bit) destination field.
 */
void
vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
    bool lowprio, bool x2apic_dest)
{
	struct vlapic *vlapic;
	uint32_t dfr, ldr, ldest, cluster;
	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
	cpuset_t amask;
	int vcpuid;

	if ((x2apic_dest && dest == 0xffffffff) ||
	    (!x2apic_dest && dest == 0xff)) {
		/*
		 * Broadcast in both logical and physical modes.
		 */
		*dmask = vm_active_cpus(vm);
		return;
	}

	if (phys) {
		/*
		 * Physical mode: destination is APIC ID.
		 */
		CPU_ZERO(dmask);
		vcpuid = vm_apicid2vcpuid(vm, dest);
		amask = vm_active_cpus(vm);
		if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask))
			CPU_SET(vcpuid, dmask);
	} else {
		/*
		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
		 * bitmask. This model is only available in the xAPIC mode.
		 */
		mda_flat_ldest = dest & 0xff;

		/*
		 * In the "Cluster Model" the MDA is used to identify a
		 * specific cluster and a set of APICs in that cluster.
		 */
		if (x2apic_dest) {
			mda_cluster_id = dest >> 16;
			mda_cluster_ldest = dest & 0xffff;
		} else {
			mda_cluster_id = (dest >> 4) & 0xf;
			mda_cluster_ldest = dest & 0xf;
		}

		/*
		 * Logical mode: match each APIC that has a bit set
		 * in its LDR that matches a bit in the ldest.
		 */
		CPU_ZERO(dmask);
		amask = vm_active_cpus(vm);
		while ((vcpuid = CPU_FFS(&amask)) != 0) {
			vcpuid--;
			CPU_CLR(vcpuid, &amask);

			vlapic = vm_lapic(vm, vcpuid);
			dfr = vlapic->apic_page->dfr;
			ldr = vlapic->apic_page->ldr;

			if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_FLAT) {
				ldest = ldr >> 24;
				mda_ldest = mda_flat_ldest;
			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_CLUSTER) {
				if (vlapic_x2mode(vlapic)) {
					cluster = ldr >> 16;
					ldest = ldr & 0xffff;
				} else {
					cluster = ldr >> 28;
					ldest = (ldr >> 24) & 0xf;
				}
				if (cluster != mda_cluster_id)
					continue;
				mda_ldest = mda_cluster_ldest;
			} else {
				/*
				 * Guest has configured a bad logical
				 * model for this vcpu - skip it.
				 */
				continue;
			}

			if ((mda_ldest & ldest) != 0) {
				CPU_SET(vcpuid, dmask);
				if (lowprio)
					break;
			}
		}
	}
}
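
/*
 * For example, in the xAPIC cluster model an MDA of 0x2c names cluster 2
 * with a logical destination mask of 0xc, matching any active vCPU whose
 * LDR holds cluster ID 2 (ldr >> 28) and either bit 2 or bit 3 set in its
 * logical ID ((ldr >> 24) & 0xf).
 */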

static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu");
static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu");

static void
vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
{
	struct LAPIC *lapic = vlapic->apic_page;

	if (lapic->tpr != val) {
		lapic->tpr = val;
		vlapic_update_ppr(vlapic);
	}
}

void
vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
{
	uint8_t tpr;

	if (val & ~0xf) {
		vm_inject_gp(vlapic->vm, vlapic->vcpuid);
		return;
	}

	tpr = val << 4;
	vlapic_set_tpr(vlapic, tpr);
}

uint64_t
vlapic_get_cr8(struct vlapic *vlapic)
{
	const struct LAPIC *lapic = vlapic->apic_page;

	return (lapic->tpr >> 4);
}
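
/*
 * CR8 and the TPR carry the same priority class, offset by four bits: a
 * write of 0x9 to CR8 sets the TPR to 0x90, and a TPR of 0x90 reads back
 * through CR8 as 0x9.
 */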

void
vlapic_icrlo_write_handler(struct vlapic *vlapic)
{
	int i;
	cpuset_t dmask;
	uint64_t icrval;
	uint32_t dest, vec, mode, dsh;
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
	icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;

	if (vlapic_x2mode(vlapic))
		dest = icrval >> 32;
	else
		dest = icrval >> (32 + 24);
	vec = icrval & APIC_VECTOR_MASK;
	mode = icrval & APIC_DELMODE_MASK;
	dsh = icrval & APIC_DEST_MASK;

	if (mode == APIC_DELMODE_FIXED && vec < 16) {
		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false);
		return;
	}
	if (mode == APIC_DELMODE_INIT &&
	    (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
		/* No work required to deassert INIT */
		return;
	}
	if ((mode == APIC_DELMODE_STARTUP || mode == APIC_DELMODE_INIT) &&
	    !(dsh == APIC_DEST_DESTFLD || dsh == APIC_DEST_ALLESELF)) {
		/*
		 * While Intel makes no mention of restrictions for destination
		 * shorthand when sending INIT or SIPI, AMD requires either a
		 * specific destination or all-excluding self.  Common use seems
		 * to be restricted to those two cases.  Until handling is in
		 * place to halt a guest which makes such a frivolous request,
		 * we will ignore them.
		 */
		return;
	}

	switch (dsh) {
	case APIC_DEST_DESTFLD:
		vlapic_calcdest(vlapic->vm, &dmask, dest,
		    (icrval & APIC_DESTMODE_LOG) == 0, false,
		    vlapic_x2mode(vlapic));
		break;
	case APIC_DEST_SELF:
		CPU_SETOF(vlapic->vcpuid, &dmask);
		break;
	case APIC_DEST_ALLISELF:
		dmask = vm_active_cpus(vlapic->vm);
		break;
	case APIC_DEST_ALLESELF:
		dmask = vm_active_cpus(vlapic->vm);
		CPU_CLR(vlapic->vcpuid, &dmask);
		break;
	default:
		/*
		 * All possible destination shorthands are covered above.
		 * We should never end up here.
		 */
		panic("unknown delivery shorthand: %x", dsh);
	}

	while ((i = CPU_FFS(&dmask)) != 0) {
		i--;
		CPU_CLR(i, &dmask);
		switch (mode) {
		case APIC_DELMODE_FIXED:
			(void) lapic_intr_edge(vlapic->vm, i, vec);
			vmm_stat_incr(vlapic->vm, vlapic->vcpuid,
			    VLAPIC_IPI_SEND, 1);
			vmm_stat_incr(vlapic->vm, i,
			    VLAPIC_IPI_RECV, 1);
			break;
		case APIC_DELMODE_NMI:
			(void) vm_inject_nmi(vlapic->vm, i);
			break;
		case APIC_DELMODE_INIT:
			(void) vm_inject_init(vlapic->vm, i);
			break;
		case APIC_DELMODE_STARTUP:
			(void) vm_inject_sipi(vlapic->vm, i, vec);
			break;
		case APIC_DELMODE_LOWPRIO:
		case APIC_DELMODE_SMI:
		default:
			/* Unhandled IPI modes (for now) */
			break;
		}
	}
}
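
/*
 * For example, in xAPIC mode an icr_lo of 0x000000c5 paired with an icr_hi
 * of 0x01000000 requests fixed delivery of vector 0xc5, in physical mode
 * with no shorthand (dsh == APIC_DEST_DESTFLD), to the APIC with ID 1.
 */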

void
vlapic_self_ipi_handler(struct vlapic *vlapic, uint32_t val)
{
	const int vec = val & 0xff;

	/* self-IPI is only exposed via x2APIC */
	ASSERT(vlapic_x2mode(vlapic));

	(void) lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_SEND, 1);
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_RECV, 1);
}

int
vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	int		 idx, i, bitpos, vector;
	uint32_t	*irrptr, val;

	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}

	irrptr = &lapic->irr0;

	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		val = atomic_load_acq_int(&irrptr[idx]);
		bitpos = fls(val);
		if (bitpos != 0) {
			vector = i * 32 + (bitpos - 1);
			if (PRIO(vector) > PRIO(lapic->ppr)) {
				if (vecptr != NULL)
					*vecptr = vector;
				return (1);
			} else
				break;
		}
	}
	return (0);
}

void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*irrptr, *isrptr;
	int		idx;

	KASSERT(vector >= 16 && vector < 256, ("invalid vector %d", vector));

	if (vlapic->ops.intr_accepted)
		return ((*vlapic->ops.intr_accepted)(vlapic, vector));

	/*
	 * Clear the ready bit for the vector being accepted in the IRR and
	 * mark the vector as in-service in the ISR.
	 */
	idx = (vector / 32) * 4;

	irrptr = &lapic->irr0;
	atomic_clear_int(&irrptr[idx], 1 << (vector % 32));

	isrptr = &lapic->isr0;
	isrptr[idx] |= 1 << (vector % 32);

	/*
	 * The only way a fresh vector could be accepted into ISR is if it was
	 * of a higher priority than the current PPR.  With that vector now
	 * in-service, the PPR must be raised.
	 */
	vlapic_raise_ppr(vlapic, vector);

#ifdef __ISRVEC_DEBUG
	vlapic_isrstk_accept(vlapic, vector);
#endif
}

void
vlapic_svr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	uint32_t old, new, changed;

	lapic = vlapic->apic_page;

	new = lapic->svr;
	old = vlapic->svr_last;
	vlapic->svr_last = new;

	changed = old ^ new;
	if ((changed & APIC_SVR_ENABLE) != 0) {
		if ((new & APIC_SVR_ENABLE) == 0) {
			/*
			 * The apic is now disabled so stop the apic timer
			 * and mask all the LVT entries.
			 */
			VLAPIC_TIMER_LOCK(vlapic);
			callout_stop(&vlapic->callout);
			VLAPIC_TIMER_UNLOCK(vlapic);
			vlapic_mask_lvts(vlapic);
		} else {
			/*
			 * The apic is now enabled so restart the apic timer
			 * if it is configured in periodic mode.
			 */
			if (vlapic_periodic_timer(vlapic))
				vlapic_icrtmr_write_handler(vlapic);
		}
	}
}

static bool
vlapic_read(struct vlapic *vlapic, uint16_t offset, uint32_t *outp)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *reg;
	int i;

	ASSERT3U(offset & 0x3, ==, 0);
	ASSERT3U(offset, <, PAGESIZE);
	ASSERT3P(outp, !=, NULL);

	uint32_t data = 0;
	switch (offset) {
	case APIC_OFFSET_ID:
		data = lapic->id;
		break;
	case APIC_OFFSET_VER:
		data = lapic->version;
		break;
	case APIC_OFFSET_TPR:
		data = lapic->tpr;
		break;
	case APIC_OFFSET_APR:
		data = lapic->apr;
		break;
	case APIC_OFFSET_PPR:
		data = lapic->ppr;
		break;
	case APIC_OFFSET_LDR:
		data = lapic->ldr;
		break;
	case APIC_OFFSET_DFR:
		data = lapic->dfr;
		break;
	case APIC_OFFSET_SVR:
		data = lapic->svr;
		break;
	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
		i = (offset - APIC_OFFSET_ISR0) >> 2;
		reg = &lapic->isr0;
		data = *(reg + i);
		break;
	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
		i = (offset - APIC_OFFSET_TMR0) >> 2;
		reg = &lapic->tmr0;
		data = *(reg + i);
		break;
	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
		i = (offset - APIC_OFFSET_IRR0) >> 2;
		reg = &lapic->irr0;
		data = atomic_load_acq_int(reg + i);
		break;
	case APIC_OFFSET_ESR:
		data = lapic->esr;
		break;
	case APIC_OFFSET_ICR_LOW:
		data = lapic->icr_lo;
		break;
	case APIC_OFFSET_ICR_HI:
		data = lapic->icr_hi;
		break;
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		data = vlapic_get_lvt(vlapic, offset);
#ifdef INVARIANTS
		reg = vlapic_get_lvtptr(vlapic, offset);
		ASSERT3U(data, ==, *reg);
#endif
		break;
	case APIC_OFFSET_TIMER_ICR:
		data = lapic->icr_timer;
		break;
	case APIC_OFFSET_TIMER_CCR:
		data = vlapic_get_ccr(vlapic);
		break;
	case APIC_OFFSET_TIMER_DCR:
		data = lapic->dcr_timer;
		break;
	case APIC_OFFSET_RRR:
		data = 0;
		break;

	case APIC_OFFSET_SELF_IPI:
	case APIC_OFFSET_EOI:
		/* Write-only register */
		*outp = 0;
		return (false);

	default:
		/* Invalid register */
		*outp = 0;
		return (false);
	}

	*outp = data;
	return (true);
}

static bool
vlapic_write(struct vlapic *vlapic, uint16_t offset, uint32_t data)
{
	struct LAPIC	*lapic = vlapic->apic_page;
	uint32_t	*regptr;

	ASSERT3U(offset & 0xf, ==, 0);
	ASSERT3U(offset, <, PAGESIZE);

	switch (offset) {
	case APIC_OFFSET_ID:
		lapic->id = data;
		vlapic_id_write_handler(vlapic);
		break;
	case APIC_OFFSET_TPR:
		vlapic_set_tpr(vlapic, data & 0xff);
		break;
	case APIC_OFFSET_EOI:
		vlapic_process_eoi(vlapic);
		break;
	case APIC_OFFSET_LDR:
		lapic->ldr = data;
		vlapic_ldr_write_handler(vlapic);
		break;
	case APIC_OFFSET_DFR:
		lapic->dfr = data;
		vlapic_dfr_write_handler(vlapic);
		break;
	case APIC_OFFSET_SVR:
		lapic->svr = data;
		vlapic_svr_write_handler(vlapic);
		break;
	case APIC_OFFSET_ICR_LOW:
		lapic->icr_lo = data;
		vlapic_icrlo_write_handler(vlapic);
		break;
	case APIC_OFFSET_ICR_HI:
		lapic->icr_hi = data;
		break;
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		regptr = vlapic_get_lvtptr(vlapic, offset);
		*regptr = data;
		vlapic_lvt_write_handler(vlapic, offset);
		break;
	case APIC_OFFSET_TIMER_ICR:
		lapic->icr_timer = data;
		vlapic_icrtmr_write_handler(vlapic);
		break;

	case APIC_OFFSET_TIMER_DCR:
		lapic->dcr_timer = data;
		vlapic_dcr_write_handler(vlapic);
		break;

	case APIC_OFFSET_ESR:
		vlapic_esr_write_handler(vlapic);
		break;

	case APIC_OFFSET_SELF_IPI:
		if (vlapic_x2mode(vlapic))
			vlapic_self_ipi_handler(vlapic, data);
		break;

	case APIC_OFFSET_VER:
	case APIC_OFFSET_APR:
	case APIC_OFFSET_PPR:
	case APIC_OFFSET_RRR:
	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
	case APIC_OFFSET_TIMER_CCR:
		/* Read-only register */
		return (false);

	default:
		/* Invalid register */
		return (false);
	}

	return (true);
}

void
vlapic_reset(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *isrptr, *tmrptr, *irrptr;

	/* Reset any timer-related state first */
	VLAPIC_TIMER_LOCK(vlapic);
	callout_stop(&vlapic->callout);
	lapic->icr_timer = 0;
	lapic->ccr_timer = 0;
	VLAPIC_TIMER_UNLOCK(vlapic);
	lapic->dcr_timer = 0;
	vlapic_dcr_write_handler(vlapic);

	/*
	 * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so
	 * it is not leftover after the reset.  This is performed after the APIC
	 * timer has been stopped, in case it happened to fire just prior to
	 * being deactivated.
	 */
	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}

	vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
	if (vlapic->vcpuid == 0)
		vlapic->msr_apicbase |= APICBASE_BSP;

	lapic->id = vlapic_get_id(vlapic);
	lapic->version = VLAPIC_VERSION;
	lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);

	lapic->tpr = 0;
	lapic->apr = 0;
	lapic->ppr = 0;

#ifdef __ISRVEC_DEBUG
	/* With the PPR cleared, the isrvec tracking should be reset too */
	vlapic->isrvec_stk_top = 0;
#endif

	lapic->eoi = 0;
	lapic->ldr = 0;
	lapic->dfr = 0xffffffff;
	lapic->svr = APIC_SVR_VECTOR;
	vlapic->svr_last = lapic->svr;

	isrptr = &lapic->isr0;
	tmrptr = &lapic->tmr0;
	irrptr = &lapic->irr0;
	for (uint_t i = 0; i < 8; i++) {
		atomic_store_rel_int(&isrptr[i * 4], 0);
		atomic_store_rel_int(&tmrptr[i * 4], 0);
		atomic_store_rel_int(&irrptr[i * 4], 0);
	}

	lapic->esr = 0;
	vlapic->esr_pending = 0;
	lapic->icr_lo = 0;
	lapic->icr_hi = 0;

	lapic->lvt_cmci = 0;
	lapic->lvt_timer = 0;
	lapic->lvt_thermal = 0;
	lapic->lvt_pcint = 0;
	lapic->lvt_lint0 = 0;
	lapic->lvt_lint1 = 0;
	lapic->lvt_error = 0;
	vlapic_mask_lvts(vlapic);
}

void
vlapic_init(struct vlapic *vlapic)
{
	KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
	KASSERT(vlapic->vcpuid >= 0 &&
	    vlapic->vcpuid < vm_get_maxcpus(vlapic->vm),
	    ("vlapic_init: vcpuid is not initialized"));
	KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
	    "initialized"));

	/*
	 * If the vlapic is configured in x2apic mode then it can be accessed
	 * in a critical section via the MSR emulation code.  The FreeBSD
	 * origin of this code used a spin mutex here for that reason; note
	 * that this port initializes the timer lock as an adaptive mutex
	 * instead.
	 */
	mutex_init(&vlapic->timer_lock, NULL, MUTEX_ADAPTIVE, NULL);
	callout_init(&vlapic->callout, 1);

	vlapic_reset(vlapic);
}

void
vlapic_cleanup(struct vlapic *vlapic)
{
	callout_drain(&vlapic->callout);
	mutex_destroy(&vlapic->timer_lock);
}

int
vlapic_mmio_read(struct vlapic *vlapic, uint64_t gpa, uint64_t *valp,
    uint_t size)
{
	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);

	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
		*valp = UINT64_MAX;
		return (0);
	}

	const uint16_t off = gpa - DEFAULT_APIC_BASE;
	uint32_t raw = 0;
	(void) vlapic_read(vlapic, off & ~0xf, &raw);

	/* Shift and mask reads which are small and/or unaligned */
	const uint8_t align = off & 0xf;
	if (align < 4) {
		*valp = (uint64_t)raw << (align * 8);
	} else {
		*valp = 0;
	}

	return (0);
}
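
/*
 * For example, a read at gpa 0xfee00082 has off = 0x82: the register at
 * offset 0x80 (the TPR) is fetched and shifted left by 2 * 8 = 16 bits,
 * while any access with align >= 4 (past the 32-bit register) reads as 0.
 */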

int
vlapic_mmio_write(struct vlapic *vlapic, uint64_t gpa, uint64_t val,
    uint_t size)
{
	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);

	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
		return (0);
	}

	const uint16_t off = gpa - DEFAULT_APIC_BASE;
	/* Ignore writes which are not 32 bits wide and 16-byte aligned */
	if ((off & 0xf) != 0 || size != 4) {
		return (0);
	}

	(void) vlapic_write(vlapic, off, (uint32_t)val);
	return (0);
}

/* Should attempts to change the APIC base address be rejected with a #GP? */
int vlapic_gp_on_addr_change = 1;

static vm_msr_result_t
vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
{
	const uint64_t diff = vlapic->msr_apicbase ^ val;

	/*
	 * Until the LAPIC emulation for switching between xAPIC and x2APIC
	 * modes is more polished, it will remain off-limits from being altered
	 * by the guest.
	 */
	const uint64_t reserved_bits = APICBASE_RESERVED | APICBASE_X2APIC |
	    APICBASE_BSP;
	if ((diff & reserved_bits) != 0) {
		return (VMR_GP);
	}

	/* We do not presently allow the LAPIC access address to be modified. */
	if ((diff & APICBASE_ADDR_MASK) != 0) {
		/*
		 * Explicitly rebuffing such requests with a #GP is the most
		 * straightforward way to handle the situation, but certain
		 * consumers (such as the KVM unit tests) may balk at the
		 * otherwise unexpected exception.
		 */
		if (vlapic_gp_on_addr_change) {
			return (VMR_GP);
		}

		/* If silence is required, just ignore the address change. */
		val = (val & ~APICBASE_ADDR_MASK) | DEFAULT_APIC_BASE;
	}

	vlapic->msr_apicbase = val;
	return (VMR_OK);
}

static __inline uint16_t
vlapic_msr_to_regoff(uint32_t msr)
{
	ASSERT3U(msr, >=, MSR_APIC_000);
	ASSERT3U(msr, <, (MSR_APIC_000 + 0x100));

	return ((msr - MSR_APIC_000) << 4);
}
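
/*
 * For example, MSR_APIC_000 is 0x800, so the x2APIC ICR MSR (0x830) maps to
 * register offset (0x830 - 0x800) << 4 = 0x300, i.e. APIC_OFFSET_ICR_LOW.
 */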

bool
vlapic_owned_msr(uint32_t msr)
{
	if (msr == MSR_APICBASE) {
		return (true);
	}
	if (msr >= MSR_APIC_000 &&
	    msr < (MSR_APIC_000 + 0x100)) {
		return (true);
	}
	return (false);
}

vm_msr_result_t
vlapic_rdmsr(struct vlapic *vlapic, uint32_t msr, uint64_t *valp)
{
	ASSERT(vlapic_owned_msr(msr));
	ASSERT3P(valp, !=, NULL);

	if (msr == MSR_APICBASE) {
		*valp = vlapic->msr_apicbase;
		return (VMR_OK);
	}

	/* #GP for x2APIC MSR accesses in xAPIC mode */
	if (!vlapic_x2mode(vlapic)) {
		return (VMR_GP);
	}

	uint64_t out = 0;
	const uint16_t reg = vlapic_msr_to_regoff(msr);
	switch (reg) {
	case APIC_OFFSET_ICR_LOW: {
		/* Read from ICR register gets entire (64-bit) value */
		uint32_t low = 0, high = 0;
		bool valid;

		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_HI, &high);
		VERIFY(valid);
		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_LOW, &low);
		VERIFY(valid);

		*valp = ((uint64_t)high << 32) | low;
		return (VMR_OK);
		}
	case APIC_OFFSET_ICR_HI:
		/* Already covered by ICR_LOW */
		return (VMR_GP);
	default:
		break;
	}
	if (!vlapic_read(vlapic, reg, (uint32_t *)&out)) {
		return (VMR_GP);
	}
	*valp = out;
	return (VMR_OK);
}

vm_msr_result_t
vlapic_wrmsr(struct vlapic *vlapic, uint32_t msr, uint64_t val)
{
	ASSERT(vlapic_owned_msr(msr));

	if (msr == MSR_APICBASE) {
		return (vlapic_set_apicbase(vlapic, val));
	}

	/* #GP for x2APIC MSR accesses in xAPIC mode */
	if (!vlapic_x2mode(vlapic)) {
		return (VMR_GP);
	}

	const uint16_t reg = vlapic_msr_to_regoff(msr);
	switch (reg) {
	case APIC_OFFSET_ICR_LOW: {
		/* Write to ICR register sets entire (64-bit) value */
		bool valid;

		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_HI, val >> 32);
		VERIFY(valid);
		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_LOW, val);
		VERIFY(valid);
		return (VMR_OK);
		}
	case APIC_OFFSET_ICR_HI:
		/* Already covered by ICR_LOW */
		return (VMR_GP);
	case APIC_OFFSET_ESR:
		/* Only 0 may be written from x2APIC mode */
		if (val != 0) {
			return (VMR_GP);
		}
		break;
	default:
		break;
	}
	if (!vlapic_write(vlapic, reg, val)) {
		return (VMR_GP);
	}
	return (VMR_OK);
}

void
vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	struct vlapic *vlapic;
	struct LAPIC *lapic;

	vlapic = vm_lapic(vm, vcpuid);

	if (state == X2APIC_DISABLED)
		vlapic->msr_apicbase &= ~APICBASE_X2APIC;
	else
		vlapic->msr_apicbase |= APICBASE_X2APIC;

	/*
	 * Reset the local APIC registers whose values are mode-dependent.
	 *
	 * XXX this works because the APIC mode can be changed only at vcpu
	 * initialization time.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
	if (vlapic_x2mode(vlapic)) {
		lapic->ldr = x2apic_ldr(vlapic);
		lapic->dfr = 0;
	} else {
		lapic->ldr = 0;
		lapic->dfr = 0xffffffff;
	}

	if (state == X2APIC_ENABLED) {
		if (vlapic->ops.enable_x2apic_mode)
			(*vlapic->ops.enable_x2apic_mode)(vlapic);
	}
}

void
vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
    int delmode, int vec)
{
	bool lowprio;
	int vcpuid;
	cpuset_t dmask;

	if (delmode != IOART_DELFIXED &&
	    delmode != IOART_DELLOPRI &&
	    delmode != IOART_DELEXINT) {
		/* Invalid delivery mode */
		return;
	}
	lowprio = (delmode == IOART_DELLOPRI);

	/*
	 * We don't provide any virtual interrupt redirection hardware so
	 * all interrupts originating from the ioapic or MSI specify the
	 * 'dest' in the legacy xAPIC format.
	 */
	vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);

	while ((vcpuid = CPU_FFS(&dmask)) != 0) {
		vcpuid--;
		CPU_CLR(vcpuid, &dmask);
		if (delmode == IOART_DELEXINT) {
			(void) vm_inject_extint(vm, vcpuid);
		} else {
			(void) lapic_set_intr(vm, vcpuid, vec, level);
		}
	}
}

void
vlapic_post_intr(struct vlapic *vlapic, int hostcpu)
{
	/*
	 * Post an interrupt to the vcpu currently running on 'hostcpu'.
	 *
	 * This is done by leveraging features like Posted Interrupts (Intel)
	 * or the Doorbell MSR (AMD AVIC) that avoid a VM exit.
	 *
	 * If neither of these features is available then fall back to
	 * sending an IPI to 'hostcpu'.
	 */
	if (vlapic->ops.post_intr)
		(*vlapic->ops.post_intr)(vlapic, hostcpu);
	else
		poke_cpu(hostcpu);
}

void
vlapic_localize_resources(struct vlapic *vlapic)
{
	vmm_glue_callout_localize(&vlapic->callout);
}

#ifdef __ISRVEC_DEBUG
static void
vlapic_isrstk_eoi(struct vlapic *vlapic, int vector)
{
	if (vlapic->isrvec_stk_top <= 0) {
		panic("invalid vlapic isrvec_stk_top %d",
		    vlapic->isrvec_stk_top);
	}
	vlapic->isrvec_stk_top--;
	vlapic_isrstk_verify(vlapic);
}

static void
vlapic_isrstk_accept(struct vlapic *vlapic, int vector)
{
	int stk_top;

	vlapic->isrvec_stk_top++;

	stk_top = vlapic->isrvec_stk_top;
	if (stk_top >= ISRVEC_STK_SIZE)
		panic("isrvec_stk_top overflow %d", stk_top);

	vlapic->isrvec_stk[stk_top] = vector;
	vlapic_isrstk_verify(vlapic);
}

static void
vlapic_isrstk_dump(const struct vlapic *vlapic)
{
	int i;
	uint32_t *isrptr;

	isrptr = &vlapic->apic_page->isr0;
	for (i = 0; i < 8; i++)
		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);

	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
}

static void
vlapic_isrstk_verify(const struct vlapic *vlapic)
{
	int i, lastprio, curprio, vector, idx;
	uint32_t *isrptr;

	/*
	 * Note: The value at index 0 in isrvec_stk is always 0.
	 *
	 * It is a placeholder for the value of the ISR vector when no bits
	 * are set in the ISRx registers.
	 */
	if (vlapic->isrvec_stk_top == 0 && vlapic->isrvec_stk[0] != 0) {
		panic("isrvec_stk is corrupted: %d", vlapic->isrvec_stk[0]);
	}

	/*
	 * Make sure that the priority of the nested interrupts is
	 * always increasing.
	 */
	lastprio = -1;
	for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
		curprio = PRIO(vlapic->isrvec_stk[i]);
		if (curprio <= lastprio) {
			vlapic_isrstk_dump(vlapic);
			panic("isrvec_stk does not satisfy invariant");
		}
		lastprio = curprio;
	}

	/*
	 * Make sure that each bit set in the ISRx registers has a
	 * corresponding entry on the isrvec stack.
	 */
	i = 1;
	isrptr = &vlapic->apic_page->isr0;
	for (vector = 0; vector < 256; vector++) {
		idx = (vector / 32) * 4;
		if (isrptr[idx] & (1 << (vector % 32))) {
			if (i > vlapic->isrvec_stk_top ||
			    vlapic->isrvec_stk[i] != vector) {
				vlapic_isrstk_dump(vlapic);
				panic("ISR and isrvec_stk out of sync");
			}
			i++;
		}
	}
}
#endif