xref: /illumos-gate/usr/src/uts/i86pc/io/pcplusmp/apic_common.c (revision 95faac55ed9158a0f593df1059de9fffbe33c5b4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright 2019, Joyent, Inc.
27  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
28  * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org>
29  */
30 
31 /*
32  * PSMI 1.1 extensions are supported only in 2.6 and later versions.
33  * PSMI 1.2 extensions are supported only in 2.7 and later versions.
34  * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
35  * PSMI 1.5 extensions are supported in Solaris Nevada.
36  * PSMI 1.6 extensions are supported in Solaris Nevada.
37  * PSMI 1.7 extensions are supported in Solaris Nevada.
38  */
39 #define	PSMI_1_7
40 
41 #include <sys/processor.h>
42 #include <sys/time.h>
43 #include <sys/psm.h>
44 #include <sys/smp_impldefs.h>
45 #include <sys/cram.h>
46 #include <sys/acpi/acpi.h>
47 #include <sys/acpica.h>
48 #include <sys/psm_common.h>
49 #include <sys/apic.h>
50 #include <sys/pit.h>
51 #include <sys/ddi.h>
52 #include <sys/sunddi.h>
53 #include <sys/ddi_impldefs.h>
54 #include <sys/pci.h>
55 #include <sys/promif.h>
56 #include <sys/x86_archext.h>
57 #include <sys/cpc_impl.h>
58 #include <sys/uadmin.h>
59 #include <sys/panic.h>
60 #include <sys/debug.h>
61 #include <sys/archsystm.h>
62 #include <sys/trap.h>
63 #include <sys/machsystm.h>
64 #include <sys/sysmacros.h>
65 #include <sys/cpuvar.h>
66 #include <sys/rm_platter.h>
67 #include <sys/privregs.h>
68 #include <sys/note.h>
69 #include <sys/pci_intr_lib.h>
70 #include <sys/spl.h>
71 #include <sys/clock.h>
72 #include <sys/dditypes.h>
73 #include <sys/sunddi.h>
74 #include <sys/x_call.h>
75 #include <sys/reboot.h>
76 #include <sys/hpet.h>
77 #include <sys/apic_common.h>
78 #include <sys/apic_timer.h>
79 
80 static void	apic_record_ioapic_rdt(void *intrmap_private,
81 		    ioapic_rdt_t *irdt);
82 static void	apic_record_msi(void *intrmap_private, msi_regs_t *mregs);
83 
84 /*
85  * Common routines between pcplusmp & apix (taken from apic.c).
86  */
87 
88 int	apic_clkinit(int);
89 hrtime_t apic_gethrtime(void);
90 void	apic_send_ipi(int, int);
91 void	apic_set_idlecpu(processorid_t);
92 void	apic_unset_idlecpu(processorid_t);
93 void	apic_shutdown(int, int);
94 void	apic_preshutdown(int, int);
95 processorid_t	apic_get_next_processorid(processorid_t);
96 
97 hrtime_t apic_gettime();
98 
99 enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP;
100 
101 /* Now the ones for Dynamic Interrupt distribution */
102 int	apic_enable_dynamic_migration = 0;
103 
104 /* maximum loop count when sending Start IPIs. */
105 int apic_sipi_max_loop_count = 0x1000;
106 
107 /*
108  * These variables are frequently accessed in apic_intr_enter(),
109  * apic_intr_exit and apic_setspl, so group them together
110  */
111 volatile uint32_t *apicadr =  NULL;	/* virtual addr of local APIC	*/
112 int apic_setspl_delay = 1;		/* apic_setspl - delay enable	*/
113 int apic_clkvect;
114 
115 /* vector at which error interrupts come in */
116 int apic_errvect;
117 int apic_enable_error_intr = 1;
118 int apic_error_display_delay = 100;
119 
120 /* vector at which performance counter overflow interrupts come in */
121 int apic_cpcovf_vect;
122 int apic_enable_cpcovf_intr = 1;
123 
124 /* vector at which CMCI interrupts come in */
125 int apic_cmci_vect;
126 extern void cmi_cmci_trap(void);
127 
128 lock_t apic_mode_switch_lock;
129 
130 int apic_pir_vect;
131 
132 /*
133  * Patchable global variables.
134  */
135 int	apic_forceload = 0;
136 
137 int	apic_coarse_hrtime = 1;		/* 0 - use accurate slow gethrtime() */
138 
139 int	apic_flat_model = 0;		/* 0 - clustered. 1 - flat */
140 int	apic_panic_on_nmi = 0;
141 int	apic_panic_on_apic_error = 0;
142 
143 int	apic_verbose = 0;	/* 0x1ff */
144 
145 #ifdef DEBUG
146 int	apic_debug = 0;
147 int	apic_restrict_vector = 0;
148 
149 int	apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE];
150 int	apic_debug_msgbufindex = 0;
151 
152 #endif /* DEBUG */
153 
154 uint_t apic_nticks = 0;
155 uint_t apic_skipped_redistribute = 0;
156 
157 uint_t last_count_read = 0;
158 lock_t	apic_gethrtime_lock;
159 volatile int	apic_hrtime_stamp = 0;
160 volatile hrtime_t apic_nsec_since_boot = 0;
161 
162 static	hrtime_t	apic_last_hrtime = 0;
163 int		apic_hrtime_error = 0;
164 int		apic_remote_hrterr = 0;
165 int		apic_num_nmis = 0;
166 int		apic_apic_error = 0;
167 int		apic_num_apic_errors = 0;
168 int		apic_num_cksum_errors = 0;
169 
170 int	apic_error = 0;
171 
172 static	int	apic_cmos_ssb_set = 0;
173 
174 /* use to make sure only one cpu handles the nmi */
175 lock_t	apic_nmi_lock;
176 /* use to make sure only one cpu handles the error interrupt */
177 lock_t	apic_error_lock;
178 
/*
 * Byte sequence for an "aspen" BMC, stored as (cntl, data) pairs.  Per the
 * per-entry notes, the first eight entries form a SET_WATCHDOG_TIMER request
 * and the last two a RESET_WATCHDOG_TIMER request.
 * NOTE(review): the consumer of this table is outside this chunk; presumably
 * each pair is written in array order -- confirm against the user.
 */
179 static	struct {
180 	uchar_t	cntl;
181 	uchar_t	data;
182 } aspen_bmc[] = {
183 	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
184 	{ CC_SMS_WR_NEXT,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
185 	{ CC_SMS_WR_NEXT,	0x84 },		/* DataByte 1: SMS/OS no log */
186 	{ CC_SMS_WR_NEXT,	0x2 },		/* DataByte 2: Power Down */
187 	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 3: no pre-timeout */
188 	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 4: timer expir. */
189 	{ CC_SMS_WR_NEXT,	0xa },		/* DataByte 5: init countdown */
190 	{ CC_SMS_WR_END,	0x0 },		/* DataByte 6: init countdown */
191 
192 	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
193 	{ CC_SMS_WR_END,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
194 };
195 
/*
 * Byte sequence for a "sitka" BMC, stored as (port, data) pairs.  Unlike
 * aspen_bmc, each entry names the register (command or data) that receives
 * the byte.  Per the per-entry notes, the first ten entries form a
 * SET_WATCHDOG_TIMER request and the last four a RESET_WATCHDOG_TIMER
 * request; SMS_WRITE_END is issued just before the final data byte of each.
 * NOTE(review): the consumer of this table is outside this chunk; presumably
 * each pair is written in array order -- confirm against the user.
 */
196 static	struct {
197 	int	port;
198 	uchar_t	data;
199 } sitka_bmc[] = {
200 	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
201 	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
202 	{ SMS_DATA_REGISTER,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
203 	{ SMS_DATA_REGISTER,	0x84 },		/* DataByte 1: SMS/OS no log */
204 	{ SMS_DATA_REGISTER,	0x2 },		/* DataByte 2: Power Down */
205 	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 3: no pre-timeout */
206 	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 4: timer expir. */
207 	{ SMS_DATA_REGISTER,	0xa },		/* DataByte 5: init countdown */
208 	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
209 	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 6: init countdown */
210 
211 	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
212 	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
213 	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
214 	{ SMS_DATA_REGISTER,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
215 };
216 
217 /* Patchable global variables. */
218 int		apic_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
219 uint32_t	apic_divide_reg_init = 0;	/* 0 - divide by 2 */
220 
221 /* default apic ops without interrupt remapping */
/*
 * The initializer is positional, so entry order must match the member order
 * of apic_intrmap_ops_t.  The first five members are filled with
 * return_instr cast to each member's function-pointer type; the last two
 * point at the static apic_record_ioapic_rdt() and apic_record_msi()
 * routines declared near the top of this file.  apic_vt_ops starts out
 * pointing at this table.
 */
222 static apic_intrmap_ops_t apic_nointrmap_ops = {
223 	(int (*)(int))return_instr,
224 	(void (*)(int))return_instr,
225 	(void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr,
226 	(void (*)(void *, void *, uint16_t, int))return_instr,
227 	(void (*)(void **))return_instr,
228 	apic_record_ioapic_rdt,
229 	apic_record_msi,
230 };
231 
232 apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
233 apic_cpus_info_t	*apic_cpus = NULL;
234 cpuset_t	apic_cpumask;
235 uint_t		apic_picinit_called;
236 
237 /* Flag to indicate that we need to shut down all processors */
238 static uint_t	apic_shutdown_processors;
239 
240 /*
241  * Probe the ioapic method for apix module. Called in apic_probe_common()
242  */
243 int
244 apic_ioapic_method_probe()
245 {
246 	if (apix_enable == 0)
247 		return (PSM_SUCCESS);
248 
249 	/*
250 	 * Set IOAPIC EOI handling method. The priority from low to high is:
251 	 *	1. IOxAPIC: with EOI register
252 	 *	2. IOMMU interrupt mapping
253 	 *	3. Mask-Before-EOI method for systems without boot
254 	 *	interrupt routing, such as systems with only one IOAPIC;
255 	 *	NVIDIA CK8-04/MCP55 systems; systems with bridge solution
256 	 *	which disables the boot interrupt routing already.
257 	 *	4. Directed EOI
258 	 */
259 	if (apic_io_ver[0] >= 0x20)
260 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC;
261 	if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max))
262 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK;
263 	if (apic_directed_EOI_supported())
264 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI;
265 
266 	/* fall back to pcplusmp */
267 	if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) {
268 		/* make sure apix is after pcplusmp in /etc/mach */
269 		apix_enable = 0; /* go ahead with pcplusmp install next */
270 		return (PSM_FAILURE);
271 	}
272 
273 	return (PSM_SUCCESS);
274 }
275 
276 /*
277  * handler for APIC Error interrupt. Just print a warning and continue
278  */
279 int
280 apic_error_intr()
281 {
282 	uint_t	error0, error1, error;
283 	uint_t	i;
284 
285 	/*
286 	 * We need to write before read as per 7.4.17 of system prog manual.
287 	 * We do both and or the results to be safe
288 	 */
289 	error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
290 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
291 	error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
292 	error = error0 | error1;
293 
294 	/*
295 	 * Clear the APIC error status (do this on all cpus that enter here)
296 	 * (two writes are required due to the semantics of accessing the
297 	 * error status register.)
298 	 */
299 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
300 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
301 
302 	/*
303 	 * Prevent more than 1 CPU from handling error interrupt causing
304 	 * double printing (interleave of characters from multiple
305 	 * CPU's when using prom_printf)
306 	 */
307 	if (lock_try(&apic_error_lock) == 0)
308 		return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
309 	if (error) {
310 #if	DEBUG
311 		if (apic_debug)
312 			debug_enter("pcplusmp: APIC Error interrupt received");
313 #endif /* DEBUG */
314 		if (apic_panic_on_apic_error)
315 			cmn_err(CE_PANIC,
316 			    "APIC Error interrupt on CPU %d. Status = %x",
317 			    psm_get_cpu_id(), error);
318 		else {
319 			if ((error & ~APIC_CS_ERRORS) == 0) {
320 				/* cksum error only */
321 				apic_error |= APIC_ERR_APIC_ERROR;
322 				apic_apic_error |= error;
323 				apic_num_apic_errors++;
324 				apic_num_cksum_errors++;
325 			} else {
326 				/*
327 				 * prom_printf is the best shot we have of
328 				 * something which is problem free from
329 				 * high level/NMI type of interrupts
330 				 */
331 				prom_printf("APIC Error interrupt on CPU %d. "
332 				    "Status 0 = %x, Status 1 = %x\n",
333 				    psm_get_cpu_id(), error0, error1);
334 				apic_error |= APIC_ERR_APIC_ERROR;
335 				apic_apic_error |= error;
336 				apic_num_apic_errors++;
337 				for (i = 0; i < apic_error_display_delay; i++) {
338 					tenmicrosec();
339 				}
340 				/*
341 				 * provide more delay next time limited to
342 				 * roughly 1 clock tick time
343 				 */
344 				if (apic_error_display_delay < 500)
345 					apic_error_display_delay *= 2;
346 			}
347 		}
348 		lock_clear(&apic_error_lock);
349 		return (DDI_INTR_CLAIMED);
350 	} else {
351 		lock_clear(&apic_error_lock);
352 		return (DDI_INTR_UNCLAIMED);
353 	}
354 }
355 
356 /*
357  * Turn off the mask bit in the performance counter Local Vector Table entry.
358  */
359 void
360 apic_cpcovf_mask_clear(void)
361 {
362 	apic_reg_ops->apic_write(APIC_PCINT_VECT,
363 	    (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK));
364 }
365 
366 static int
367 apic_cmci_enable(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
368     xc_arg_t arg3 __unused)
369 {
370 	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
371 	return (0);
372 }
373 
374 static int
375 apic_cmci_disable(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
376     xc_arg_t arg3 __unused)
377 {
378 	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK);
379 	return (0);
380 }
381 
382 void
383 apic_cmci_setup(processorid_t cpuid, boolean_t enable)
384 {
385 	cpuset_t	cpu_set;
386 
387 	CPUSET_ONLY(cpu_set, cpuid);
388 
389 	if (enable) {
390 		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
391 		    (xc_func_t)apic_cmci_enable);
392 	} else {
393 		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
394 		    (xc_func_t)apic_cmci_disable);
395 	}
396 }
397 
398 static void
399 apic_disable_local_apic(void)
400 {
401 	apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL);
402 	apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK);
403 
404 	/* local intr reg 0 */
405 	apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK);
406 
407 	/* disable NMI */
408 	apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK);
409 
410 	/* and error interrupt */
411 	apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK);
412 
413 	/* and perf counter intr */
414 	apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK);
415 
416 	apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR);
417 }
418 
419 static void
420 apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
421 {
422 	int		loop_count;
423 	uint32_t	vector;
424 	uint_t		apicid;
425 	ulong_t		iflag;
426 
427 	apicid =  apic_cpus[cpun].aci_local_id;
428 
429 	/*
430 	 * Interrupts on current CPU will be disabled during the
431 	 * steps in order to avoid unwanted side effects from
432 	 * executing interrupt handlers on a problematic BIOS.
433 	 */
434 	iflag = intr_clear();
435 
436 	if (start) {
437 		outb(CMOS_ADDR, SSB);
438 		outb(CMOS_DATA, BIOS_SHUTDOWN);
439 	}
440 
441 	/*
442 	 * According to X2APIC specification in section '2.3.5.1' of
443 	 * Interrupt Command Register Semantics, the semantics of
444 	 * programming the Interrupt Command Register to dispatch an interrupt
445 	 * is simplified. A single MSR write to the 64-bit ICR is required
446 	 * for dispatching an interrupt. Specifically, with the 64-bit MSR
447 	 * interface to ICR, system software is not required to check the
448 	 * status of the delivery status bit prior to writing to the ICR
449 	 * to send an IPI. With the removal of the Delivery Status bit,
450 	 * system software no longer has a reason to read the ICR. It remains
451 	 * readable only to aid in debugging.
452 	 */
453 #ifdef	DEBUG
454 	APIC_AV_PENDING_SET();
455 #else
456 	if (apic_mode == LOCAL_APIC) {
457 		APIC_AV_PENDING_SET();
458 	}
459 #endif /* DEBUG */
460 
461 	/* for integrated - make sure there is one INIT IPI in buffer */
462 	/* for external - it will wake up the cpu */
463 	apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);
464 
465 	/* If only 1 CPU is installed, PENDING bit will not go low */
466 	for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
467 		if (apic_mode == LOCAL_APIC &&
468 		    apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
469 			apic_ret();
470 		else
471 			break;
472 	}
473 
474 	apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
475 	drv_usecwait(20000);		/* 20 milli sec */
476 
477 	if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
478 		/* integrated apic */
479 
480 		vector = (rm_platter_pa >> MMU_PAGESHIFT) &
481 		    (APIC_VECTOR_MASK | APIC_IPL_MASK);
482 
483 		/* to offset the INIT IPI queue up in the buffer */
484 		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
485 		drv_usecwait(200);		/* 20 micro sec */
486 
487 		/*
488 		 * send the second SIPI (Startup IPI) as recommended by Intel
489 		 * software development manual.
490 		 */
491 		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
492 		drv_usecwait(200);	/* 20 micro sec */
493 	}
494 
495 	intr_restore(iflag);
496 }
497 
498 /*ARGSUSED1*/
499 int
500 apic_cpu_start(processorid_t cpun, caddr_t arg __unused)
501 {
502 	ASSERT(MUTEX_HELD(&cpu_lock));
503 
504 	if (!apic_cpu_in_range(cpun)) {
505 		return (EINVAL);
506 	}
507 
508 	/*
509 	 * Switch to apic_common_send_ipi for safety during starting other CPUs.
510 	 */
511 	if (apic_mode == LOCAL_X2APIC) {
512 		apic_switch_ipi_callback(B_TRUE);
513 	}
514 
515 	apic_cmos_ssb_set = 1;
516 	apic_cpu_send_SIPI(cpun, B_TRUE);
517 
518 	return (0);
519 }
520 
521 /*
522  * Put CPU into halted state with interrupts disabled.
523  */
524 /*ARGSUSED1*/
525 int
526 apic_cpu_stop(processorid_t cpun, caddr_t arg __unused)
527 {
528 	int		rc;
529 	cpu_t		*cp;
530 	extern cpuset_t cpu_ready_set;
531 	extern void cpu_idle_intercept_cpu(cpu_t *cp);
532 
533 	ASSERT(MUTEX_HELD(&cpu_lock));
534 
535 	if (!apic_cpu_in_range(cpun)) {
536 		return (EINVAL);
537 	}
538 	if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) {
539 		return (ENOTSUP);
540 	}
541 
542 	cp = cpu_get(cpun);
543 	ASSERT(cp != NULL);
544 	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
545 	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
546 	ASSERT((cp->cpu_flags & CPU_ENABLE) == 0);
547 
548 	/* Clear CPU_READY flag to disable cross calls. */
549 	cp->cpu_flags &= ~CPU_READY;
550 	CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);
551 	rc = xc_flush_cpu(cp);
552 	if (rc != 0) {
553 		CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
554 		cp->cpu_flags |= CPU_READY;
555 		return (rc);
556 	}
557 
558 	/* Intercept target CPU at a safe point before powering it off. */
559 	cpu_idle_intercept_cpu(cp);
560 
561 	apic_cpu_send_SIPI(cpun, B_FALSE);
562 	cp->cpu_flags &= ~CPU_RUNNING;
563 
564 	return (0);
565 }
566 
567 int
568 apic_cpu_ops(psm_cpu_request_t *reqp)
569 {
570 	if (reqp == NULL) {
571 		return (EINVAL);
572 	}
573 
574 	switch (reqp->pcr_cmd) {
575 	case PSM_CPU_ADD:
576 		return (apic_cpu_add(reqp));
577 
578 	case PSM_CPU_REMOVE:
579 		return (apic_cpu_remove(reqp));
580 
581 	case PSM_CPU_STOP:
582 		return (apic_cpu_stop(reqp->req.cpu_stop.cpuid,
583 		    reqp->req.cpu_stop.ctx));
584 
585 	default:
586 		return (ENOTSUP);
587 	}
588 }
589 
590 #ifdef	DEBUG
591 int	apic_break_on_cpu = 9;
592 int	apic_stretch_interrupts = 0;
593 int	apic_stretch_ISR = 1 << 3;	/* IPL of 3 matches nothing now */
594 #endif /* DEBUG */
595 
596 /*
597  * generates an interprocessor interrupt to another CPU. Any changes made to
598  * this routine must be accompanied by similar changes to
599  * apic_common_send_ipi().
600  */
601 void
602 apic_send_ipi(int cpun, int ipl)
603 {
604 	int vector;
605 	ulong_t flag;
606 
607 	vector = apic_resv_vector[ipl];
608 
609 	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
610 
611 	flag = intr_clear();
612 
613 	APIC_AV_PENDING_SET();
614 
615 	apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
616 	    vector);
617 
618 	intr_restore(flag);
619 }
620 
621 void
622 apic_send_pir_ipi(processorid_t cpun)
623 {
624 	const int vector = apic_pir_vect;
625 	ulong_t flag;
626 
627 	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
628 
629 	flag = intr_clear();
630 
631 	/* Self-IPI for inducing PIR makes no sense. */
632 	if ((cpun != psm_get_cpu_id())) {
633 		APIC_AV_PENDING_SET();
634 		apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
635 		    vector);
636 	}
637 
638 	intr_restore(flag);
639 }
640 
641 int
642 apic_get_pir_ipivect(void)
643 {
644 	return (apic_pir_vect);
645 }
646 
647 void
648 apic_set_idlecpu(processorid_t cpun __unused)
649 {
650 }
651 
652 void
653 apic_unset_idlecpu(processorid_t cpun __unused)
654 {
655 }
656 
657 
/*
 * Empty function used as the body of the busy-wait loops in this file.
 * NOTE(review): presumably it exists so that each spin iteration makes a
 * real out-of-line call -- confirm before removing or inlining.
 */
void
apic_ret()
{
	/* deliberately empty */
}
662 
663 /*
664  * If apic_coarse_hrtime == 1, then apic_gettime() is used instead of
665  * apic_gethrtime().  This is used for performance instead of accuracy.
666  */
667 
668 hrtime_t
669 apic_gettime()
670 {
671 	int old_hrtime_stamp;
672 	hrtime_t temp;
673 
674 	/*
675 	 * In one-shot mode, we do not keep time, so if anyone
676 	 * calls psm_gettime() directly, we vector over to
677 	 * gethrtime().
678 	 * one-shot mode MUST NOT be enabled if this psm is the source of
679 	 * hrtime.
680 	 */
681 
682 	if (apic_oneshot)
683 		return (gethrtime());
684 
685 
686 gettime_again:
687 	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
688 		apic_ret();
689 
690 	temp = apic_nsec_since_boot;
691 
692 	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
693 		goto gettime_again;
694 	}
695 	return (temp);
696 }
697 
698 /*
699  * Here we return the number of nanoseconds since booting.  Note every
700  * clock interrupt increments apic_nsec_since_boot by the appropriate
701  * amount.
 *
 * The result is apic_nsec_since_boot plus the time represented by the ticks
 * consumed so far from the current timer count (apic_hertz_count minus the
 * current count of CPU 0's local APIC timer).  Interrupts are disabled and
 * apic_gethrtime_lock is held from the point the lock is taken until return.
 *
 * The value returned is never smaller than the previous value handed out
 * (apic_last_hrtime).  When the remote read of CPU 0's counter fails, the
 * previous value is returned and apic_remote_hrterr is bumped.
702  */
703 hrtime_t
704 apic_gethrtime(void)
705 {
706 	int curr_timeval, countval, elapsed_ticks;
707 	int old_hrtime_stamp, status;
708 	hrtime_t temp;
709 	uint32_t cpun;
710 	ulong_t oflags;
711 
712 	/*
713 	 * In one-shot mode, we do not keep time, so if anyone
714 	 * calls psm_gethrtime() directly, we vector over to
715 	 * gethrtime().
716 	 * one-shot mode MUST NOT be enabled if this psm is the source of
717 	 * hrtime.
718 	 */
719 
720 	if (apic_oneshot)
721 		return (gethrtime());
722 
723 	oflags = intr_clear();	/* prevent migration */
724 
	/*
	 * cpun is the local APIC ID of the executing CPU; in LOCAL_APIC
	 * mode the ID has to be shifted down out of the register value.
	 */
725 	cpun = apic_reg_ops->apic_read(APIC_LID_REG);
726 	if (apic_mode == LOCAL_APIC)
727 		cpun >>= APIC_ID_BIT_OFFSET;
728 
729 	lock_set(&apic_gethrtime_lock);
730 
731 gethrtime_again:
	/*
	 * Spin while the stamp is odd.
	 * NOTE(review): presumably the clock handler makes the stamp odd
	 * while it updates apic_nsec_since_boot -- confirm in the handler.
	 */
732 	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
733 		apic_ret();
734 
735 	/*
736 	 * Check to see which CPU we are on.  Note the time is kept on
737 	 * the local APIC of CPU 0.  If on CPU 0, simply read the current
738 	 * counter.  If on another CPU, issue a remote read command to CPU 0.
739 	 */
740 	if (cpun == apic_cpus[0].aci_local_id) {
741 		countval = apic_reg_ops->apic_read(APIC_CURR_COUNT);
742 	} else {
743 #ifdef	DEBUG
744 		APIC_AV_PENDING_SET();
745 #else
746 		if (apic_mode == LOCAL_APIC)
747 			APIC_AV_PENDING_SET();
748 #endif /* DEBUG */
749 
750 		apic_reg_ops->apic_write_int_cmd(
751 		    apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE);
752 
		/* Poll the ICR until the remote read is no longer pending. */
753 		while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1))
754 		    & AV_READ_PENDING) {
755 			apic_ret();
756 		}
757 
758 		if (status & AV_REMOTE_STATUS)	/* 1 = valid */
759 			countval = apic_reg_ops->apic_read(APIC_REMOTE_READ);
760 		else {	/* 0 = invalid */
761 			apic_remote_hrterr++;
762 			/*
763 			 * return last hrtime right now, will need more
764 			 * testing if change to retry
765 			 */
766 			temp = apic_last_hrtime;
767 
768 			lock_clear(&apic_gethrtime_lock);
769 
770 			intr_restore(oflags);
771 
772 			return (temp);
773 		}
774 	}
	/*
	 * A count larger than the previous reading is replaced by 0;
	 * otherwise it becomes the new last_count_read.
	 * NOTE(review): presumably a larger value means the down-counter
	 * reloaded since the last read -- confirm.
	 */
775 	if (countval > last_count_read)
776 		countval = 0;
777 	else
778 		last_count_read = countval;
779 
780 	elapsed_ticks = apic_hertz_count - countval;
781 
782 	curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks);
783 	temp = apic_nsec_since_boot + curr_timeval;
784 
785 	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
786 		/* we might have clobbered last_count_read. Restore it */
787 		last_count_read = apic_hertz_count;
788 		goto gethrtime_again;
789 	}
790 
	/* Never hand out a value smaller than the previous one. */
791 	if (temp < apic_last_hrtime) {
792 		/* return last hrtime if error occurs */
793 		apic_hrtime_error++;
794 		temp = apic_last_hrtime;
795 	}
796 	else
797 		apic_last_hrtime = temp;
798 
799 	lock_clear(&apic_gethrtime_lock);
800 	intr_restore(oflags);
801 
802 	return (temp);
803 }
804 
805 /* apic NMI handler */
806 uint_t
807 apic_nmi_intr(caddr_t arg __unused, caddr_t arg1 __unused)
808 {
809 	nmi_action_t action = nmi_action;
810 
811 	if (apic_shutdown_processors) {
812 		apic_disable_local_apic();
813 		return (DDI_INTR_CLAIMED);
814 	}
815 
816 	apic_error |= APIC_ERR_NMI;
817 
818 	if (!lock_try(&apic_nmi_lock))
819 		return (DDI_INTR_CLAIMED);
820 	apic_num_nmis++;
821 
822 	/*
823 	 * "nmi_action" always over-rides the older way of doing this, unless we
824 	 * can't actually drop into kmdb when requested.
825 	 */
826 	if (action == NMI_ACTION_KMDB && !psm_debugger())
827 		action = NMI_ACTION_UNSET;
828 
829 	if (action == NMI_ACTION_UNSET) {
830 		if (apic_kmdb_on_nmi && psm_debugger())
831 			action = NMI_ACTION_KMDB;
832 		else if (apic_panic_on_nmi)
833 			action = NMI_ACTION_PANIC;
834 		else
835 			action = NMI_ACTION_IGNORE;
836 	}
837 
838 	switch (action) {
839 	case NMI_ACTION_IGNORE:
840 		/*
841 		 * prom_printf is the best shot we have of something which is
842 		 * problem free from high level/NMI type of interrupts
843 		 */
844 		prom_printf("NMI received\n");
845 		break;
846 
847 	case NMI_ACTION_PANIC:
848 		/* Keep panic from entering kmdb. */
849 		nopanicdebug = 1;
850 		panic("NMI received\n");
851 		break;
852 
853 	case NMI_ACTION_KMDB:
854 	default:
855 		debug_enter("NMI received: entering kmdb\n");
856 		break;
857 	}
858 
859 	lock_clear(&apic_nmi_lock);
860 	return (DDI_INTR_CLAIMED);
861 }
862 
863 processorid_t
864 apic_get_next_processorid(processorid_t cpu_id)
865 {
866 
867 	int i;
868 
869 	if (cpu_id == -1)
870 		return ((processorid_t)0);
871 
872 	for (i = cpu_id + 1; i < NCPU; i++) {
873 		if (apic_cpu_in_range(i))
874 			return (i);
875 	}
876 
877 	return ((processorid_t)-1);
878 }
879 
880 int
881 apic_cpu_add(psm_cpu_request_t *reqp)
882 {
883 	int i, rv = 0;
884 	ulong_t iflag;
885 	boolean_t first = B_TRUE;
886 	uchar_t localver = 0;
887 	uint32_t localid, procid;
888 	processorid_t cpuid = (processorid_t)-1;
889 	mach_cpu_add_arg_t *ap;
890 
891 	ASSERT(reqp != NULL);
892 	reqp->req.cpu_add.cpuid = (processorid_t)-1;
893 
894 	/* Check whether CPU hotplug is supported. */
895 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
896 		return (ENOTSUP);
897 	}
898 
899 	ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp;
900 	switch (ap->type) {
901 	case MACH_CPU_ARG_LOCAL_APIC:
902 		localid = ap->arg.apic.apic_id;
903 		procid = ap->arg.apic.proc_id;
904 		if (localid >= 255 || procid > 255) {
905 			cmn_err(CE_WARN,
906 			    "!apic: apicid(%u) or procid(%u) is invalid.",
907 			    localid, procid);
908 			return (EINVAL);
909 		}
910 		break;
911 
912 	case MACH_CPU_ARG_LOCAL_X2APIC:
913 		localid = ap->arg.apic.apic_id;
914 		procid = ap->arg.apic.proc_id;
915 		if (localid >= UINT32_MAX) {
916 			cmn_err(CE_WARN,
917 			    "!apic: x2apicid(%u) is invalid.", localid);
918 			return (EINVAL);
919 		} else if (localid >= 255 && apic_mode == LOCAL_APIC) {
920 			cmn_err(CE_WARN, "!apic: system is in APIC mode, "
921 			    "can't support x2APIC processor.");
922 			return (ENOTSUP);
923 		}
924 		break;
925 
926 	default:
927 		cmn_err(CE_WARN,
928 		    "!apic: unknown argument type %d to apic_cpu_add().",
929 		    ap->type);
930 		return (EINVAL);
931 	}
932 
933 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
934 	iflag = intr_clear();
935 	lock_set(&apic_ioapic_lock);
936 
937 	/* Check whether local APIC id already exists. */
938 	for (i = 0; i < apic_nproc; i++) {
939 		if (!CPU_IN_SET(apic_cpumask, i))
940 			continue;
941 		if (apic_cpus[i].aci_local_id == localid) {
942 			lock_clear(&apic_ioapic_lock);
943 			intr_restore(iflag);
944 			cmn_err(CE_WARN,
945 			    "!apic: local apic id %u already exists.",
946 			    localid);
947 			return (EEXIST);
948 		} else if (apic_cpus[i].aci_processor_id == procid) {
949 			lock_clear(&apic_ioapic_lock);
950 			intr_restore(iflag);
951 			cmn_err(CE_WARN,
952 			    "!apic: processor id %u already exists.",
953 			    (int)procid);
954 			return (EEXIST);
955 		}
956 
957 		/*
958 		 * There's no local APIC version number available in MADT table,
959 		 * so assume that all CPUs are homogeneous and use local APIC
960 		 * version number of the first existing CPU.
961 		 */
962 		if (first) {
963 			first = B_FALSE;
964 			localver = apic_cpus[i].aci_local_ver;
965 		}
966 	}
967 	ASSERT(first == B_FALSE);
968 
969 	/*
970 	 * Try to assign the same cpuid if APIC id exists in the dirty cache.
971 	 */
972 	for (i = 0; i < apic_max_nproc; i++) {
973 		if (CPU_IN_SET(apic_cpumask, i)) {
974 			ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0);
975 			continue;
976 		}
977 		ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE);
978 		if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) &&
979 		    apic_cpus[i].aci_local_id == localid &&
980 		    apic_cpus[i].aci_processor_id == procid) {
981 			cpuid = i;
982 			break;
983 		}
984 	}
985 
986 	/* Avoid the dirty cache and allocate fresh slot if possible. */
987 	if (cpuid == (processorid_t)-1) {
988 		for (i = 0; i < apic_max_nproc; i++) {
989 			if ((apic_cpus[i].aci_status & APIC_CPU_FREE) &&
990 			    (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) {
991 				cpuid = i;
992 				break;
993 			}
994 		}
995 	}
996 
997 	/* Try to find any free slot as last resort. */
998 	if (cpuid == (processorid_t)-1) {
999 		for (i = 0; i < apic_max_nproc; i++) {
1000 			if (apic_cpus[i].aci_status & APIC_CPU_FREE) {
1001 				cpuid = i;
1002 				break;
1003 			}
1004 		}
1005 	}
1006 
1007 	if (cpuid == (processorid_t)-1) {
1008 		lock_clear(&apic_ioapic_lock);
1009 		intr_restore(iflag);
1010 		cmn_err(CE_NOTE,
1011 		    "!apic: failed to allocate cpu id for processor %u.",
1012 		    procid);
1013 		rv = EAGAIN;
1014 	} else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) {
1015 		lock_clear(&apic_ioapic_lock);
1016 		intr_restore(iflag);
1017 		cmn_err(CE_NOTE,
1018 		    "!apic: failed to build mapping for processor %u.",
1019 		    procid);
1020 		rv = EBUSY;
1021 	} else {
1022 		ASSERT(cpuid >= 0 && cpuid < NCPU);
1023 		ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus);
1024 		bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0]));
1025 		apic_cpus[cpuid].aci_processor_id = procid;
1026 		apic_cpus[cpuid].aci_local_id = localid;
1027 		apic_cpus[cpuid].aci_local_ver = localver;
1028 		CPUSET_ATOMIC_ADD(apic_cpumask, cpuid);
1029 		if (cpuid >= apic_nproc) {
1030 			apic_nproc = cpuid + 1;
1031 		}
1032 		lock_clear(&apic_ioapic_lock);
1033 		intr_restore(iflag);
1034 		reqp->req.cpu_add.cpuid = cpuid;
1035 	}
1036 
1037 	return (rv);
1038 }
1039 
1040 int
1041 apic_cpu_remove(psm_cpu_request_t *reqp)
1042 {
1043 	int i;
1044 	ulong_t iflag;
1045 	processorid_t cpuid;
1046 
1047 	/* Check whether CPU hotplug is supported. */
1048 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
1049 		return (ENOTSUP);
1050 	}
1051 
1052 	cpuid = reqp->req.cpu_remove.cpuid;
1053 
1054 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
1055 	iflag = intr_clear();
1056 	lock_set(&apic_ioapic_lock);
1057 
1058 	if (!apic_cpu_in_range(cpuid)) {
1059 		lock_clear(&apic_ioapic_lock);
1060 		intr_restore(iflag);
1061 		cmn_err(CE_WARN,
1062 		    "!apic: cpuid %d doesn't exist in apic_cpus array.",
1063 		    cpuid);
1064 		return (ENODEV);
1065 	}
1066 	ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0);
1067 
1068 	if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) {
1069 		lock_clear(&apic_ioapic_lock);
1070 		intr_restore(iflag);
1071 		return (ENOENT);
1072 	}
1073 
1074 	if (cpuid == apic_nproc - 1) {
1075 		/*
1076 		 * We are removing the highest numbered cpuid so we need to
1077 		 * find the next highest cpuid as the new value for apic_nproc.
1078 		 */
1079 		for (i = apic_nproc; i > 0; i--) {
1080 			if (CPU_IN_SET(apic_cpumask, i - 1)) {
1081 				apic_nproc = i;
1082 				break;
1083 			}
1084 		}
1085 		/* at least one CPU left */
1086 		ASSERT(i > 0);
1087 	}
1088 	CPUSET_ATOMIC_DEL(apic_cpumask, cpuid);
1089 	/* mark slot as free and keep it in the dirty cache */
1090 	apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY;
1091 
1092 	lock_clear(&apic_ioapic_lock);
1093 	intr_restore(iflag);
1094 
1095 	return (0);
1096 }
1097 
1098 /*
1099  * Return the number of ticks the APIC decrements in SF nanoseconds.
1100  * The fixed-frequency PIT (aka 8254) is used for the measurement.
1101  */
1102 static uint64_t
1103 apic_calibrate_impl()
1104 {
1105 	uint8_t		pit_tick_lo;
1106 	uint16_t	pit_tick, target_pit_tick, pit_ticks_adj;
1107 	uint32_t	pit_ticks;
1108 	uint32_t	start_apic_tick, end_apic_tick, apic_ticks;
1109 	ulong_t		iflag;
1110 
1111 	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
1112 	apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);
1113 
1114 	iflag = intr_clear();
1115 
1116 	/*
1117 	 * Put the PIT in mode 0, "Interrupt On Terminal Count":
1118 	 */
1119 	outb(PITCTL_PORT, PIT_C0 | PIT_LOADMODE | PIT_ENDSIGMODE);
1120 
1121 	/*
1122 	 * The PIT counts down and then the counter value wraps around.  Load
1123 	 * the maximum counter value:
1124 	 */
1125 	outb(PITCTR0_PORT, 0xFF);
1126 	outb(PITCTR0_PORT, 0xFF);
1127 
1128 	do {
1129 		pit_tick_lo = inb(PITCTR0_PORT);
1130 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1131 	} while (pit_tick < APIC_TIME_MIN ||
1132 	    pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);
1133 
1134 	/*
1135 	 * Wait for the PIT to decrement by 5 ticks to ensure
1136 	 * we didn't start in the middle of a tick.
1137 	 * Compare with 0x10 for the wrap around case.
1138 	 */
1139 	target_pit_tick = pit_tick - 5;
1140 	do {
1141 		pit_tick_lo = inb(PITCTR0_PORT);
1142 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1143 	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
1144 
1145 	start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
1146 
1147 	/*
1148 	 * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
1149 	 */
1150 	target_pit_tick = pit_tick - APIC_TIME_COUNT;
1151 	do {
1152 		pit_tick_lo = inb(PITCTR0_PORT);
1153 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1154 	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
1155 
1156 	end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
1157 
1158 	intr_restore(iflag);
1159 
1160 	apic_ticks = start_apic_tick - end_apic_tick;
1161 
1162 	/* The PIT might have decremented by more ticks than planned */
1163 	pit_ticks_adj = target_pit_tick - pit_tick;
1164 	/* total number of PIT ticks corresponding to apic_ticks */
1165 	pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;
1166 
1167 	/*
1168 	 * Determine the number of nanoseconds per APIC clock tick
1169 	 * and then determine how many APIC ticks to interrupt at the
1170 	 * desired frequency
1171 	 * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
1172 	 * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
1173 	 * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
1174 	 * apic_ticks_per_SFns =
1175 	 * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
1176 	 */
1177 	return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
1178 }
1179 
1180 /*
1181  * It was found empirically that 5 measurements seem sufficient to give a good
1182  * accuracy. Most spurious measurements are higher than the target value thus
1183  * we eliminate up to 2/5 spurious measurements.
1184  */
1185 #define	APIC_CALIBRATE_MEASUREMENTS		5
1186 
1187 #define	APIC_CALIBRATE_PERCENT_OFF_WARNING	10
1188 
1189 /*
1190  * Return the number of ticks the APIC decrements in SF nanoseconds.
1191  * Several measurements are taken to filter out outliers.
1192  */
1193 uint64_t
1194 apic_calibrate()
1195 {
1196 	uint64_t	measurements[APIC_CALIBRATE_MEASUREMENTS];
1197 	int		median_idx;
1198 	uint64_t	median;
1199 
1200 	/*
1201 	 * When running under a virtual machine, the emulated PIT and APIC
1202 	 * counters do not always return the right values and can roll over.
1203 	 * Those spurious measurements are relatively rare but could
1204 	 * significantly affect the calibration.
1205 	 * Therefore we take several measurements and then keep the median.
1206 	 * The median is preferred to the average here as we only want to
1207 	 * discard outliers.
1208 	 */
1209 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++)
1210 		measurements[i] = apic_calibrate_impl();
1211 
1212 	/*
1213 	 * sort results and retrieve median.
1214 	 */
1215 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
1216 		for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
1217 			if (measurements[j] < measurements[i]) {
1218 				uint64_t tmp = measurements[i];
1219 				measurements[i] = measurements[j];
1220 				measurements[j] = tmp;
1221 			}
1222 		}
1223 	}
1224 	median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
1225 	median = measurements[median_idx];
1226 
1227 #if (APIC_CALIBRATE_MEASUREMENTS >= 3)
1228 	/*
1229 	 * Check that measurements are consistent. Post a warning
1230 	 * if the three middle values are not close to each other.
1231 	 */
1232 	uint64_t delta_warn = median *
1233 	    APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
1234 	if ((median - measurements[median_idx - 1]) > delta_warn ||
1235 	    (measurements[median_idx + 1] - median) > delta_warn) {
1236 		cmn_err(CE_WARN, "apic_calibrate measurements lack "
1237 		    "precision: %llu, %llu, %llu.",
1238 		    (u_longlong_t)measurements[median_idx - 1],
1239 		    (u_longlong_t)median,
1240 		    (u_longlong_t)measurements[median_idx + 1]);
1241 	}
1242 #endif
1243 
1244 	return (median);
1245 }
1246 
1247 /*
1248  * Initialise the APIC timer on the local APIC of CPU 0 to the desired
1249  * frequency.  Note at this stage in the boot sequence, the boot processor
1250  * is the only active processor.
1251  * hertz value of 0 indicates a one-shot mode request.  In this case
1252  * the function returns the resolution (in nanoseconds) for the hardware
1253  * timer interrupt.  If one-shot mode capability is not available,
1254  * the return value will be 0. apic_enable_oneshot is a global switch
1255  * for disabling the functionality.
1256  * A non-zero positive value for hertz indicates a periodic mode request.
1257  * In this case the hardware will be programmed to generate clock interrupts
1258  * at hertz frequency and returns the resolution of interrupts in
1259  * nanosecond.
1260  */
1261 
1262 int
1263 apic_clkinit(int hertz)
1264 {
1265 	int		ret;
1266 
1267 	apic_int_busy_mark = (apic_int_busy_mark *
1268 	    apic_sample_factor_redistribution) / 100;
1269 	apic_int_free_mark = (apic_int_free_mark *
1270 	    apic_sample_factor_redistribution) / 100;
1271 	apic_diff_for_redistribution = (apic_diff_for_redistribution *
1272 	    apic_sample_factor_redistribution) / 100;
1273 
1274 	ret = apic_timer_init(hertz);
1275 	return (ret);
1276 
1277 }
1278 
1279 /*
1280  * apic_preshutdown:
1281  * Called early in shutdown whilst we can still access filesystems to do
1282  * things like loading modules which will be required to complete shutdown
1283  * after filesystems are all unmounted.
1284  */
1285 void
1286 apic_preshutdown(int cmd __unused, int fcn __unused)
1287 {
1288 	APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n",
1289 	    cmd, fcn, apic_poweroff_method, apic_enable_acpi));
1290 }
1291 
1292 void
1293 apic_shutdown(int cmd, int fcn)
1294 {
1295 	int restarts, attempts;
1296 	int i;
1297 	uchar_t	byte;
1298 	ulong_t iflag;
1299 
1300 	hpet_acpi_fini();
1301 
1302 	/* Send NMI to all CPUs except self to do per processor shutdown */
1303 	iflag = intr_clear();
1304 #ifdef	DEBUG
1305 	APIC_AV_PENDING_SET();
1306 #else
1307 	if (apic_mode == LOCAL_APIC)
1308 		APIC_AV_PENDING_SET();
1309 #endif /* DEBUG */
1310 	apic_shutdown_processors = 1;
1311 	apic_reg_ops->apic_write(APIC_INT_CMD1,
1312 	    AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF);
1313 
1314 	/* restore cmos shutdown byte before reboot */
1315 	if (apic_cmos_ssb_set) {
1316 		outb(CMOS_ADDR, SSB);
1317 		outb(CMOS_DATA, 0);
1318 	}
1319 
1320 	ioapic_disable_redirection();
1321 
1322 	/*	disable apic mode if imcr present	*/
1323 	if (apic_imcrp) {
1324 		outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
1325 		outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC);
1326 	}
1327 
1328 	apic_disable_local_apic();
1329 
1330 	intr_restore(iflag);
1331 
1332 	/* remainder of function is for shutdown cases only */
1333 	if (cmd != A_SHUTDOWN)
1334 		return;
1335 
1336 	/*
1337 	 * Switch system back into Legacy-Mode if using ACPI and
1338 	 * not powering-off.  Some BIOSes need to remain in ACPI-mode
1339 	 * for power-off to succeed (Dell Dimension 4600)
1340 	 * Do not disable ACPI while doing fastreboot
1341 	 */
1342 	if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT)
1343 		(void) AcpiDisable();
1344 
1345 	if (fcn == AD_FASTREBOOT) {
1346 		apic_reg_ops->apic_write(APIC_INT_CMD1,
1347 		    AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF);
1348 	}
1349 
1350 	/* remainder of function is for shutdown+poweroff case only */
1351 	if (fcn != AD_POWEROFF)
1352 		return;
1353 
1354 	switch (apic_poweroff_method) {
1355 		case APIC_POWEROFF_VIA_RTC:
1356 
1357 			/* select the extended NVRAM bank in the RTC */
1358 			outb(CMOS_ADDR, RTC_REGA);
1359 			byte = inb(CMOS_DATA);
1360 			outb(CMOS_DATA, (byte | EXT_BANK));
1361 
1362 			outb(CMOS_ADDR, PFR_REG);
1363 
1364 			/* for Predator must toggle the PAB bit */
1365 			byte = inb(CMOS_DATA);
1366 
1367 			/*
1368 			 * clear power active bar, wakeup alarm and
1369 			 * kickstart
1370 			 */
1371 			byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG);
1372 			outb(CMOS_DATA, byte);
1373 
1374 			/* delay before next write */
1375 			drv_usecwait(1000);
1376 
1377 			/* for S40 the following would suffice */
1378 			byte = inb(CMOS_DATA);
1379 
1380 			/* power active bar control bit */
1381 			byte |= PAB_CBIT;
1382 			outb(CMOS_DATA, byte);
1383 
1384 			break;
1385 
1386 		case APIC_POWEROFF_VIA_ASPEN_BMC:
1387 			restarts = 0;
1388 restart_aspen_bmc:
1389 			if (++restarts == 3)
1390 				break;
1391 			attempts = 0;
1392 			do {
1393 				byte = inb(MISMIC_FLAG_REGISTER);
1394 				byte &= MISMIC_BUSY_MASK;
1395 				if (byte != 0) {
1396 					drv_usecwait(1000);
1397 					if (attempts >= 3)
1398 						goto restart_aspen_bmc;
1399 					++attempts;
1400 				}
1401 			} while (byte != 0);
1402 			outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS);
1403 			byte = inb(MISMIC_FLAG_REGISTER);
1404 			byte |= 0x1;
1405 			outb(MISMIC_FLAG_REGISTER, byte);
1406 			i = 0;
1407 			for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0]));
1408 			    i++) {
1409 				attempts = 0;
1410 				do {
1411 					byte = inb(MISMIC_FLAG_REGISTER);
1412 					byte &= MISMIC_BUSY_MASK;
1413 					if (byte != 0) {
1414 						drv_usecwait(1000);
1415 						if (attempts >= 3)
1416 							goto restart_aspen_bmc;
1417 						++attempts;
1418 					}
1419 				} while (byte != 0);
1420 				outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl);
1421 				outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data);
1422 				byte = inb(MISMIC_FLAG_REGISTER);
1423 				byte |= 0x1;
1424 				outb(MISMIC_FLAG_REGISTER, byte);
1425 			}
1426 			break;
1427 
1428 		case APIC_POWEROFF_VIA_SITKA_BMC:
1429 			restarts = 0;
1430 restart_sitka_bmc:
1431 			if (++restarts == 3)
1432 				break;
1433 			attempts = 0;
1434 			do {
1435 				byte = inb(SMS_STATUS_REGISTER);
1436 				byte &= SMS_STATE_MASK;
1437 				if ((byte == SMS_READ_STATE) ||
1438 				    (byte == SMS_WRITE_STATE)) {
1439 					drv_usecwait(1000);
1440 					if (attempts >= 3)
1441 						goto restart_sitka_bmc;
1442 					++attempts;
1443 				}
1444 			} while ((byte == SMS_READ_STATE) ||
1445 			    (byte == SMS_WRITE_STATE));
1446 			outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS);
1447 			i = 0;
1448 			for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0]));
1449 			    i++) {
1450 				attempts = 0;
1451 				do {
1452 					byte = inb(SMS_STATUS_REGISTER);
1453 					byte &= SMS_IBF_MASK;
1454 					if (byte != 0) {
1455 						drv_usecwait(1000);
1456 						if (attempts >= 3)
1457 							goto restart_sitka_bmc;
1458 						++attempts;
1459 					}
1460 				} while (byte != 0);
1461 				outb(sitka_bmc[i].port, sitka_bmc[i].data);
1462 			}
1463 			break;
1464 
1465 		case APIC_POWEROFF_NONE:
1466 
1467 			/* If no APIC direct method, we will try using ACPI */
1468 			if (apic_enable_acpi) {
1469 				if (acpi_poweroff() == 1)
1470 					return;
1471 			} else
1472 				return;
1473 
1474 			break;
1475 	}
1476 	/*
1477 	 * Wait a limited time here for power to go off.
1478 	 * If the power does not go off, then there was a
1479 	 * problem and we should continue to the halt which
1480 	 * prints a message for the user to press a key to
1481 	 * reboot.
1482 	 */
1483 	drv_usecwait(7000000); /* wait seven seconds */
1484 
1485 }
1486 
1487 cyclic_id_t apic_cyclic_id;
1488 
1489 /*
1490  * The following functions are in the platform specific file so that they
1491  * can be different functions depending on whether we are running on
1492  * bare metal or a hypervisor.
1493  */
1494 
/*
 * Map `len' bytes of APIC register space at physical address `addr' via
 * psm_map_phys(), passing `flags' through, and return the resulting
 * pointer for memory-mapped access.
 */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
	void *mapped = (void *)psm_map_phys(addr, len, flags);

	return (mapped);
}
1503 
/*
 * Map an I/O APIC for memory-mapped access.  Here this is simply
 * mapin_apic() with the same arguments.
 */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	uint32_t *mapped = mapin_apic(addr, len, flags);

	return (mapped);
}
1509 
1510 /*
1511  * unmap an apic
1512  */
1513 void
1514 mapout_apic(caddr_t addr, size_t len)
1515 {
1516 	psm_unmap_phys(addr, len);
1517 }
1518 
1519 void
1520 mapout_ioapic(caddr_t addr, size_t len)
1521 {
1522 	mapout_apic(addr, len);
1523 }
1524 
1525 uint32_t
1526 ioapic_read(int ioapic_ix, uint32_t reg)
1527 {
1528 	volatile uint32_t *ioapic;
1529 
1530 	ioapic = apicioadr[ioapic_ix];
1531 	ioapic[APIC_IO_REG] = reg;
1532 	return (ioapic[APIC_IO_DATA]);
1533 }
1534 
1535 void
1536 ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value)
1537 {
1538 	volatile uint32_t *ioapic;
1539 
1540 	ioapic = apicioadr[ioapic_ix];
1541 	ioapic[APIC_IO_REG] = reg;
1542 	ioapic[APIC_IO_DATA] = value;
1543 }
1544 
1545 void
1546 ioapic_write_eoi(int ioapic_ix, uint32_t value)
1547 {
1548 	volatile uint32_t *ioapic;
1549 
1550 	ioapic = apicioadr[ioapic_ix];
1551 	ioapic[APIC_IO_EOI] = value;
1552 }
1553 
1554 /*
1555  * Round-robin algorithm to find the next CPU with interrupts enabled.
1556  * It can't share the same static variable apic_next_bind_cpu with
1557  * apic_get_next_bind_cpu(), since that will cause all interrupts to be
1558  * bound to CPU1 at boot time.  During boot, only CPU0 is online with
1559  * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu()
1560  * are called.  However, the pcplusmp driver assumes that there will be
1561  * boot_ncpus CPUs configured eventually so it tries to distribute all
1562  * interrupts among CPU0 - CPU[boot_ncpus - 1].  Thus to prevent all
1563  * interrupts being targetted at CPU1, we need to use a dedicated static
1564  * variable for find_next_cpu() instead of sharing apic_next_bind_cpu.
1565  */
1566 
1567 processorid_t
1568 apic_find_cpu(int flag)
1569 {
1570 	int i;
1571 	static processorid_t acid = 0;
1572 
1573 	/* Find the first CPU with the passed-in flag set */
1574 	for (i = 0; i < apic_nproc; i++) {
1575 		if (++acid >= apic_nproc) {
1576 			acid = 0;
1577 		}
1578 		if (apic_cpu_in_range(acid) &&
1579 		    (apic_cpus[acid].aci_status & flag)) {
1580 			break;
1581 		}
1582 	}
1583 
1584 	ASSERT((apic_cpus[acid].aci_status & flag) != 0);
1585 	return (acid);
1586 }
1587 
1588 void
1589 apic_intrmap_init(int apic_mode)
1590 {
1591 	int suppress_brdcst_eoi = 0;
1592 
1593 	/*
1594 	 * Intel Software Developer's Manual 3A, 10.12.7:
1595 	 *
1596 	 * Routing of device interrupts to local APIC units operating in
1597 	 * x2APIC mode requires use of the interrupt-remapping architecture
1598 	 * specified in the Intel Virtualization Technology for Directed
1599 	 * I/O, Revision 1.3.  Because of this, BIOS must enumerate support
1600 	 * for and software must enable this interrupt remapping with
1601 	 * Extended Interrupt Mode Enabled before it enabling x2APIC mode in
1602 	 * the local APIC units.
1603 	 *
1604 	 *
1605 	 * In other words, to use the APIC in x2APIC mode, we need interrupt
1606 	 * remapping.  Since we don't start up the IOMMU by default, we
1607 	 * won't be able to do any interrupt remapping and therefore have to
1608 	 * use the APIC in traditional 'local APIC' mode with memory mapped
1609 	 * I/O.
1610 	 */
1611 
1612 	if (psm_vt_ops != NULL) {
1613 		if (((apic_intrmap_ops_t *)psm_vt_ops)->
1614 		    apic_intrmap_init(apic_mode) == DDI_SUCCESS) {
1615 
1616 			apic_vt_ops = psm_vt_ops;
1617 
1618 			/*
1619 			 * We leverage the interrupt remapping engine to
1620 			 * suppress broadcast EOI; thus we must send the
1621 			 * directed EOI with the directed-EOI handler.
1622 			 */
1623 			if (apic_directed_EOI_supported() == 0) {
1624 				suppress_brdcst_eoi = 1;
1625 			}
1626 
1627 			apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);
1628 
1629 			if (apic_detect_x2apic()) {
1630 				apic_enable_x2apic();
1631 			}
1632 
1633 			if (apic_directed_EOI_supported() == 0) {
1634 				apic_set_directed_EOI_handler();
1635 			}
1636 		}
1637 	}
1638 }
1639 
1640 static void
1641 apic_record_ioapic_rdt(void *intrmap_private __unused, ioapic_rdt_t *irdt)
1642 {
1643 	irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
1644 }
1645 
1646 static void
1647 apic_record_msi(void *intrmap_private __unused, msi_regs_t *mregs)
1648 {
1649 	mregs->mr_addr = MSI_ADDR_HDR |
1650 	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
1651 	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
1652 	    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
1653 	mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
1654 	    mregs->mr_data;
1655 }
1656 
/*
 * Functions from apic_introp.c
 *
 * These functions are used by apic_intr_ops().
 */
1662 
/*
 * MSI support flag: whether MSI is supported at the APIC level.
 * May also be patched through /etc/system.
 *
 *  0 = not yet known (default); apic_check_msi_support() must be called
 *      to find out, and the flag is then set accordingly
 *  1 = supported
 * -1 = not supported
 */
int	apic_support_msi = 0;

/* Multiple vector support for MSI-X */
int	apic_msix_enable = 1;

/* Multiple vector support for MSI */
int	apic_multi_msi_enable = 1;
1680 
1681 /*
1682  * Check whether the system supports MSI.
1683  *
1684  * MSI is required for PCI-E and for PCI versions later than 2.2, so if we find
1685  * a PCI-E bus or we find a PCI bus whose version we know is >= 2.2, then we
1686  * return PSM_SUCCESS to indicate this system supports MSI.
1687  *
1688  * (Currently the only way we check whether a given PCI bus supports >= 2.2 is
1689  * by detecting if we are running inside the KVM hypervisor, which guarantees
1690  * this version number.)
1691  */
1692 int
1693 apic_check_msi_support()
1694 {
1695 	dev_info_t *cdip;
1696 	char dev_type[16];
1697 	int dev_len;
1698 	int hwenv = get_hwenv();
1699 
1700 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n"));
1701 
1702 	/*
1703 	 * check whether the first level children of root_node have
1704 	 * PCI-E or PCI capability.
1705 	 */
1706 	for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL;
1707 	    cdip = ddi_get_next_sibling(cdip)) {
1708 
1709 		DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p,"
1710 		    " driver: %s, binding: %s, nodename: %s\n", (void *)cdip,
1711 		    ddi_driver_name(cdip), ddi_binding_name(cdip),
1712 		    ddi_node_name(cdip)));
1713 		dev_len = sizeof (dev_type);
1714 		if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1715 		    "device_type", (caddr_t)dev_type, &dev_len)
1716 		    != DDI_PROP_SUCCESS)
1717 			continue;
1718 		if (strcmp(dev_type, "pciex") == 0)
1719 			return (PSM_SUCCESS);
1720 		if (strcmp(dev_type, "pci") == 0 &&
1721 		    (hwenv == HW_KVM || hwenv == HW_BHYVE))
1722 			return (PSM_SUCCESS);
1723 	}
1724 
1725 	/* MSI is not supported on this system */
1726 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' "
1727 	    "device_type found\n"));
1728 	return (PSM_FAILURE);
1729 }
1730 
1731 /*
1732  * apic_pci_msi_unconfigure:
1733  *
1734  * This and next two interfaces are copied from pci_intr_lib.c
1735  * Do ensure that these two files stay in sync.
1736  * These needed to be copied over here to avoid a deadlock situation on
1737  * certain mp systems that use MSI interrupts.
1738  *
1739  * IMPORTANT regards next three interfaces:
1740  * i) are called only for MSI/X interrupts.
1741  * ii) called with interrupts disabled, and must not block
1742  */
1743 void
1744 apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum)
1745 {
1746 	ushort_t		msi_ctrl;
1747 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1748 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1749 
1750 	ASSERT((handle != NULL) && (cap_ptr != 0));
1751 
1752 	if (type == DDI_INTR_TYPE_MSI) {
1753 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1754 		msi_ctrl &= (~PCI_MSI_MME_MASK);
1755 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1756 		pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0);
1757 
1758 		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1759 			pci_config_put16(handle,
1760 			    cap_ptr + PCI_MSI_64BIT_DATA, 0);
1761 			pci_config_put32(handle,
1762 			    cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0);
1763 		} else {
1764 			pci_config_put16(handle,
1765 			    cap_ptr + PCI_MSI_32BIT_DATA, 0);
1766 		}
1767 
1768 	} else if (type == DDI_INTR_TYPE_MSIX) {
1769 		uintptr_t	off;
1770 		uint32_t	mask;
1771 		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(rdip);
1772 
1773 		ASSERT(msix_p != NULL);
1774 
1775 		/* Offset into "inum"th entry in the MSI-X table & mask it */
1776 		off = (uintptr_t)msix_p->msix_tbl_addr + (inum *
1777 		    PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET;
1778 
1779 		mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off);
1780 
1781 		ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1));
1782 
1783 		/* Offset into the "inum"th entry in the MSI-X table */
1784 		off = (uintptr_t)msix_p->msix_tbl_addr +
1785 		    (inum * PCI_MSIX_VECTOR_SIZE);
1786 
1787 		/* Reset the "data" and "addr" bits */
1788 		ddi_put32(msix_p->msix_tbl_hdl,
1789 		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0);
1790 		ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0);
1791 	}
1792 }
1793 
1794 /*
1795  * apic_pci_msi_disable_mode:
1796  */
1797 void
1798 apic_pci_msi_disable_mode(dev_info_t *rdip, int type)
1799 {
1800 	ushort_t		msi_ctrl;
1801 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1802 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1803 
1804 	ASSERT((handle != NULL) && (cap_ptr != 0));
1805 
1806 	if (type == DDI_INTR_TYPE_MSI) {
1807 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1808 		if (!(msi_ctrl & PCI_MSI_ENABLE_BIT))
1809 			return;
1810 
1811 		msi_ctrl &= ~PCI_MSI_ENABLE_BIT;	/* MSI disable */
1812 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1813 
1814 	} else if (type == DDI_INTR_TYPE_MSIX) {
1815 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL);
1816 		if (msi_ctrl & PCI_MSIX_ENABLE_BIT) {
1817 			msi_ctrl &= ~PCI_MSIX_ENABLE_BIT;
1818 			pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL,
1819 			    msi_ctrl);
1820 		}
1821 	}
1822 }
1823 
1824 uint32_t
1825 apic_get_localapicid(uint32_t cpuid)
1826 {
1827 	ASSERT(cpuid < apic_nproc && apic_cpus != NULL);
1828 
1829 	return (apic_cpus[cpuid].aci_local_id);
1830 }
1831 
1832 uchar_t
1833 apic_get_ioapicid(uchar_t ioapicindex)
1834 {
1835 	ASSERT(ioapicindex < MAX_IO_APIC);
1836 
1837 	return (apic_io_id[ioapicindex]);
1838 }
1839