xref: /illumos-gate/usr/src/uts/i86pc/io/pcplusmp/apic_common.c (revision 14b24e2b79293068c8e016a69ef1d872fb5e2fd5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
27  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
28  */
29 
30 /*
31  * PSMI 1.1 extensions are supported only in 2.6 and later versions.
32  * PSMI 1.2 extensions are supported only in 2.7 and later versions.
33  * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
34  * PSMI 1.5 extensions are supported in Solaris Nevada.
35  * PSMI 1.6 extensions are supported in Solaris Nevada.
36  * PSMI 1.7 extensions are supported in Solaris Nevada.
37  */
38 #define	PSMI_1_7
39 
40 #include <sys/processor.h>
41 #include <sys/time.h>
42 #include <sys/psm.h>
43 #include <sys/smp_impldefs.h>
44 #include <sys/cram.h>
45 #include <sys/acpi/acpi.h>
46 #include <sys/acpica.h>
47 #include <sys/psm_common.h>
48 #include <sys/apic.h>
49 #include <sys/pit.h>
50 #include <sys/ddi.h>
51 #include <sys/sunddi.h>
52 #include <sys/ddi_impldefs.h>
53 #include <sys/pci.h>
54 #include <sys/promif.h>
55 #include <sys/x86_archext.h>
56 #include <sys/cpc_impl.h>
57 #include <sys/uadmin.h>
58 #include <sys/panic.h>
59 #include <sys/debug.h>
60 #include <sys/archsystm.h>
61 #include <sys/trap.h>
62 #include <sys/machsystm.h>
63 #include <sys/sysmacros.h>
64 #include <sys/cpuvar.h>
65 #include <sys/rm_platter.h>
66 #include <sys/privregs.h>
67 #include <sys/note.h>
68 #include <sys/pci_intr_lib.h>
69 #include <sys/spl.h>
70 #include <sys/clock.h>
71 #include <sys/dditypes.h>
72 #include <sys/sunddi.h>
73 #include <sys/x_call.h>
74 #include <sys/reboot.h>
75 #include <sys/hpet.h>
76 #include <sys/apic_common.h>
77 #include <sys/apic_timer.h>
78 
/*
 * Forward declarations of the two non-stub callbacks installed in
 * apic_nointrmap_ops further down in this file.
 */
static void	apic_record_ioapic_rdt(void *intrmap_private,
		    ioapic_rdt_t *irdt);
static void	apic_record_msi(void *intrmap_private, msi_regs_t *mregs);

/*
 * Common routines between pcplusmp & apix (taken from apic.c).
 */

int	apic_clkinit(int);
hrtime_t apic_gethrtime(void);
void	apic_send_ipi(int, int);
void	apic_set_idlecpu(processorid_t);
void	apic_unset_idlecpu(processorid_t);
void	apic_shutdown(int, int);
void	apic_preshutdown(int, int);
processorid_t	apic_get_next_processorid(processorid_t);

/* Coarse time source; see the definition of apic_gettime() in this file. */
hrtime_t apic_gettime();
97 
/*
 * IOAPIC EOI handling method chosen by apic_ioapic_method_probe().  The
 * initial APIC_MUL_IOAPIC_PCPLUSMP value doubles as "nothing selected": if it
 * is still set after probing, the probe fails and pcplusmp is used instead.
 */
enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP;

/* Now the ones for Dynamic Interrupt distribution */
int	apic_enable_dynamic_migration = 0;

/*
 * maximum loop count when sending Start IPIs.  apic_cpu_send_SIPI() polls
 * the delivery-pending bit at most this many times after the INIT IPI.
 */
int apic_sipi_max_loop_count = 0x1000;

/*
 * These variables are frequently accessed in apic_intr_enter(),
 * apic_intr_exit and apic_setspl, so group them together
 */
volatile uint32_t *apicadr =  NULL;	/* virtual addr of local APIC	*/
int apic_setspl_delay = 1;		/* apic_setspl - delay enable	*/
int apic_clkvect;

/* vector at which error interrupts come in */
int apic_errvect;
int apic_enable_error_intr = 1;
/*
 * Number of tenmicrosec() delays apic_error_intr() performs after printing
 * an error; it is doubled on each printed error while it is below 500.
 */
int apic_error_display_delay = 100;

/* vector at which performance counter overflow interrupts come in */
int apic_cpcovf_vect;
int apic_enable_cpcovf_intr = 1;

/*
 * vector at which CMCI interrupts come in; written to APIC_CMCI_VECT by
 * apic_cmci_enable() and, with AV_MASK set, by apic_cmci_disable().
 */
int apic_cmci_vect;
extern int cmi_enable_cmci;
extern void cmi_cmci_trap(void);

kmutex_t cmci_cpu_setup_lock;	/* protects cmci_cpu_setup_registered */
int cmci_cpu_setup_registered;

lock_t apic_mode_switch_lock;
132 
/*
 * Patchable global variables.
 */
int	apic_forceload = 0;

int	apic_coarse_hrtime = 1;		/* 0 - use accurate slow gethrtime() */

int	apic_flat_model = 0;		/* 0 - clustered. 1 - flat */
/* non-zero: apic_nmi_intr() panics on NMI (unless it enters kmdb first) */
int	apic_panic_on_nmi = 0;
/* non-zero: apic_error_intr() panics instead of recording/printing */
int	apic_panic_on_apic_error = 0;

int	apic_verbose = 0;	/* 0x1ff */

#ifdef DEBUG
/* non-zero: apic_error_intr() calls debug_enter() when an error is seen */
int	apic_debug = 0;
int	apic_restrict_vector = 0;

int	apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE];
int	apic_debug_msgbufindex = 0;

#endif /* DEBUG */
154 
uint_t apic_nticks = 0;
uint_t apic_skipped_redistribute = 0;

/*
 * State shared by apic_gettime() and apic_gethrtime().
 *
 * last_count_read:	last timer count accepted by apic_gethrtime().
 * apic_gethrtime_lock:	serializes apic_gethrtime() callers.
 * apic_hrtime_stamp:	readers spin while this is odd and retry if it changed
 *			while they sampled apic_nsec_since_boot.
 * apic_nsec_since_boot: base time to which the elapsed timer ticks are added.
 */
uint_t last_count_read = 0;
lock_t	apic_gethrtime_lock;
volatile int	apic_hrtime_stamp = 0;
volatile hrtime_t apic_nsec_since_boot = 0;

/* last value returned by apic_gethrtime(); used to keep it monotonic */
static	hrtime_t	apic_last_hrtime = 0;
int		apic_hrtime_error = 0;	/* times hrtime would have gone back */
int		apic_remote_hrterr = 0;	/* failed remote reads of the count */
int		apic_num_nmis = 0;	/* NMIs handled by apic_nmi_intr() */
int		apic_apic_error = 0;	/* OR of all error status bits seen */
int		apic_num_apic_errors = 0; /* APIC error interrupts recorded */
int		apic_num_cksum_errors = 0; /* of those, checksum-only errors */

/* accumulated APIC_ERR_* flags (APIC_ERR_APIC_ERROR, APIC_ERR_NMI) */
int	apic_error = 0;

/* set to 1 by apic_cpu_start() before the startup IPI sequence is sent */
static	int	apic_cmos_ssb_set = 0;

/* use to make sure only one cpu handles the nmi */
lock_t	apic_nmi_lock;
/* use to make sure only one cpu handles the error interrupt */
lock_t	apic_error_lock;
179 
/*
 * Sequence of (control, data) byte pairs forming two BMC messages:
 * SET_WATCHDOG_TIMER followed by RESET_WATCHDOG_TIMER.
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably written to the BMC on the shutdown path -- confirm.
 */
static	struct {
	uchar_t	cntl;
	uchar_t	data;
} aspen_bmc[] = {
	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_NEXT,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ CC_SMS_WR_NEXT,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ CC_SMS_WR_NEXT,	0x2 },		/* DataByte 2: Power Down */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 3: no pre-timeout */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 4: timer expir. */
	{ CC_SMS_WR_NEXT,	0xa },		/* DataByte 5: init countdown */
	{ CC_SMS_WR_END,	0x0 },		/* DataByte 6: init countdown */

	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_END,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
196 
/*
 * Sequence of (port, data) pairs forming the same two BMC messages as
 * aspen_bmc (SET_WATCHDOG_TIMER, then RESET_WATCHDOG_TIMER), expressed as
 * writes to the SMS command and data registers.
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably written out on the shutdown path -- confirm.
 */
static	struct {
	int	port;
	uchar_t	data;
} sitka_bmc[] = {
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_DATA_REGISTER,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ SMS_DATA_REGISTER,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ SMS_DATA_REGISTER,	0x2 },		/* DataByte 2: Power Down */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 3: no pre-timeout */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 4: timer expir. */
	{ SMS_DATA_REGISTER,	0xa },		/* DataByte 5: init countdown */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 6: init countdown */

	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
217 
/* Patchable global variables. */
int		apic_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
uint32_t	apic_divide_reg_init = 0;	/* 0 - divide by 2 */

/*
 * default apic ops without interrupt remapping.  The first five entries are
 * return_instr cast to the signature of the slot they fill; only the last
 * two entries are real functions declared in this file.
 */
static apic_intrmap_ops_t apic_nointrmap_ops = {
	(int (*)(int))return_instr,
	(void (*)(int))return_instr,
	(void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr,
	(void (*)(void *, void *, uint16_t, int))return_instr,
	(void (*)(void **))return_instr,
	apic_record_ioapic_rdt,
	apic_record_msi,
};

/* ops vector in use; starts out pointing at the no-remapping defaults */
apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
/* per-CPU APIC information, indexed by cpu id */
apic_cpus_info_t	*apic_cpus = NULL;
/* set of cpu ids whose apic_cpus[] slot is occupied (see apic_cpu_add()) */
cpuset_t	apic_cpumask;
uint_t		apic_picinit_called;

/*
 * Flag to indicate that we need to shut down all processors.  When set,
 * apic_nmi_intr() only disables the local APIC and returns.
 */
static uint_t	apic_shutdown_processors;
240 
241 /*
242  * Probe the ioapic method for apix module. Called in apic_probe_common()
243  */
244 int
245 apic_ioapic_method_probe()
246 {
247 	if (apix_enable == 0)
248 		return (PSM_SUCCESS);
249 
250 	/*
251 	 * Set IOAPIC EOI handling method. The priority from low to high is:
252 	 * 	1. IOxAPIC: with EOI register
253 	 * 	2. IOMMU interrupt mapping
254 	 *	3. Mask-Before-EOI method for systems without boot
255 	 *	interrupt routing, such as systems with only one IOAPIC;
256 	 *	NVIDIA CK8-04/MCP55 systems; systems with bridge solution
257 	 *	which disables the boot interrupt routing already.
258 	 * 	4. Directed EOI
259 	 */
260 	if (apic_io_ver[0] >= 0x20)
261 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC;
262 	if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max))
263 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK;
264 	if (apic_directed_EOI_supported())
265 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI;
266 
267 	/* fall back to pcplusmp */
268 	if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) {
269 		/* make sure apix is after pcplusmp in /etc/mach */
270 		apix_enable = 0; /* go ahead with pcplusmp install next */
271 		return (PSM_FAILURE);
272 	}
273 
274 	return (PSM_SUCCESS);
275 }
276 
277 /*
278  * handler for APIC Error interrupt. Just print a warning and continue
279  */
280 int
281 apic_error_intr()
282 {
283 	uint_t	error0, error1, error;
284 	uint_t	i;
285 
286 	/*
287 	 * We need to write before read as per 7.4.17 of system prog manual.
288 	 * We do both and or the results to be safe
289 	 */
290 	error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
291 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
292 	error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
293 	error = error0 | error1;
294 
295 	/*
296 	 * Clear the APIC error status (do this on all cpus that enter here)
297 	 * (two writes are required due to the semantics of accessing the
298 	 * error status register.)
299 	 */
300 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
301 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
302 
303 	/*
304 	 * Prevent more than 1 CPU from handling error interrupt causing
305 	 * double printing (interleave of characters from multiple
306 	 * CPU's when using prom_printf)
307 	 */
308 	if (lock_try(&apic_error_lock) == 0)
309 		return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
310 	if (error) {
311 #if	DEBUG
312 		if (apic_debug)
313 			debug_enter("pcplusmp: APIC Error interrupt received");
314 #endif /* DEBUG */
315 		if (apic_panic_on_apic_error)
316 			cmn_err(CE_PANIC,
317 			    "APIC Error interrupt on CPU %d. Status = %x",
318 			    psm_get_cpu_id(), error);
319 		else {
320 			if ((error & ~APIC_CS_ERRORS) == 0) {
321 				/* cksum error only */
322 				apic_error |= APIC_ERR_APIC_ERROR;
323 				apic_apic_error |= error;
324 				apic_num_apic_errors++;
325 				apic_num_cksum_errors++;
326 			} else {
327 				/*
328 				 * prom_printf is the best shot we have of
329 				 * something which is problem free from
330 				 * high level/NMI type of interrupts
331 				 */
332 				prom_printf("APIC Error interrupt on CPU %d. "
333 				    "Status 0 = %x, Status 1 = %x\n",
334 				    psm_get_cpu_id(), error0, error1);
335 				apic_error |= APIC_ERR_APIC_ERROR;
336 				apic_apic_error |= error;
337 				apic_num_apic_errors++;
338 				for (i = 0; i < apic_error_display_delay; i++) {
339 					tenmicrosec();
340 				}
341 				/*
342 				 * provide more delay next time limited to
343 				 * roughly 1 clock tick time
344 				 */
345 				if (apic_error_display_delay < 500)
346 					apic_error_display_delay *= 2;
347 			}
348 		}
349 		lock_clear(&apic_error_lock);
350 		return (DDI_INTR_CLAIMED);
351 	} else {
352 		lock_clear(&apic_error_lock);
353 		return (DDI_INTR_UNCLAIMED);
354 	}
355 }
356 
/*
 * Turn off the mask bit in the performance counter Local Vector Table entry.
 *
 * This is a read-modify-write of APIC_PCINT_VECT: the current value is read,
 * APIC_LVT_MASK is cleared and the result is written back, leaving the other
 * bits of the entry as they were.
 */
void
apic_cpcovf_mask_clear(void)
{
	apic_reg_ops->apic_write(APIC_PCINT_VECT,
	    (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK));
}
366 
/*
 * Cross-call handler (see cmci_cpu_setup()): program the CMCI LVT entry of
 * the executing CPU with apic_cmci_vect, without AV_MASK.  The three
 * arguments are ignored.  Always returns 0.
 */
/*ARGSUSED*/
static int
apic_cmci_enable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
	return (0);
}
374 
/*
 * Cross-call handler (see cmci_cpu_setup()): program the CMCI LVT entry of
 * the executing CPU with apic_cmci_vect and AV_MASK set.  The three
 * arguments are ignored.  Always returns 0.
 */
/*ARGSUSED*/
static int
apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK);
	return (0);
}
382 
383 /*ARGSUSED*/
384 int
385 cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
386 {
387 	cpuset_t	cpu_set;
388 
389 	CPUSET_ONLY(cpu_set, cpuid);
390 
391 	switch (what) {
392 		case CPU_ON:
393 			xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
394 			    (xc_func_t)apic_cmci_enable);
395 			break;
396 
397 		case CPU_OFF:
398 			xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
399 			    (xc_func_t)apic_cmci_disable);
400 			break;
401 
402 		default:
403 			break;
404 	}
405 
406 	return (0);
407 }
408 
409 static void
410 apic_disable_local_apic(void)
411 {
412 	apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL);
413 	apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK);
414 
415 	/* local intr reg 0 */
416 	apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK);
417 
418 	/* disable NMI */
419 	apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK);
420 
421 	/* and error interrupt */
422 	apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK);
423 
424 	/* and perf counter intr */
425 	apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK);
426 
427 	apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR);
428 }
429 
/*
 * Send the INIT IPI sequence and, for CPUs with an integrated local APIC
 * (aci_local_ver >= APIC_INTEGRATED_VERS), two Startup IPIs to CPU `cpun'.
 *
 * cpun:	index into apic_cpus[] of the target CPU.
 * start:	when true, the CMOS shutdown status byte (SSB) is first set
 *		to BIOS_SHUTDOWN.  The IPI sequence itself is the same for
 *		both values of `start'.
 *
 * Interrupts are disabled on the calling CPU for the whole sequence.
 */
static void
apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
{
	int		loop_count;
	uint32_t	vector;
	uint_t		apicid;
	ulong_t		iflag;

	apicid =  apic_cpus[cpun].aci_local_id;

	/*
	 * Interrupts on current CPU will be disabled during the
	 * steps in order to avoid unwanted side effects from
	 * executing interrupt handlers on a problematic BIOS.
	 */
	iflag = intr_clear();

	if (start) {
		outb(CMOS_ADDR, SSB);
		outb(CMOS_DATA, BIOS_SHUTDOWN);
	}

	/*
	 * According to X2APIC specification in section '2.3.5.1' of
	 * Interrupt Command Register Semantics, the semantics of
	 * programming the Interrupt Command Register to dispatch an interrupt
	 * is simplified. A single MSR write to the 64-bit ICR is required
	 * for dispatching an interrupt. Specifically, with the 64-bit MSR
	 * interface to ICR, system software is not required to check the
	 * status of the delivery status bit prior to writing to the ICR
	 * to send an IPI. With the removal of the Delivery Status bit,
	 * system software no longer has a reason to read the ICR. It remains
	 * readable only to aid in debugging.
	 */
#ifdef	DEBUG
	APIC_AV_PENDING_SET();
#else
	if (apic_mode == LOCAL_APIC) {
		APIC_AV_PENDING_SET();
	}
#endif /* DEBUG */

	/* for integrated - make sure there is one INIT IPI in buffer */
	/* for external - it will wake up the cpu */
	apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);

	/*
	 * If only 1 CPU is installed, PENDING bit will not go low, so the
	 * wait is bounded by apic_sipi_max_loop_count.  The pending bit is
	 * only polled in LOCAL_APIC mode; otherwise the loop exits at once.
	 */
	for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
		if (apic_mode == LOCAL_APIC &&
		    apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
			apic_ret();
		else
			break;
	}

	apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
	drv_usecwait(20000);		/* 20 milli sec */

	if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
		/* integrated apic */

		/* the startup vector is the page number of rm_platter_pa */
		vector = (rm_platter_pa >> MMU_PAGESHIFT) &
		    (APIC_VECTOR_MASK | APIC_IPL_MASK);

		/* to offset the INIT IPI queue up in the buffer */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);		/* 200 micro sec */

		/*
		 * send the second SIPI (Startup IPI) as recommended by Intel
		 * software development manual.
		 */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);	/* 200 micro sec */
	}

	intr_restore(iflag);
}
508 
509 /*ARGSUSED1*/
510 int
511 apic_cpu_start(processorid_t cpun, caddr_t arg)
512 {
513 	ASSERT(MUTEX_HELD(&cpu_lock));
514 
515 	if (!apic_cpu_in_range(cpun)) {
516 		return (EINVAL);
517 	}
518 
519 	/*
520 	 * Switch to apic_common_send_ipi for safety during starting other CPUs.
521 	 */
522 	if (apic_mode == LOCAL_X2APIC) {
523 		apic_switch_ipi_callback(B_TRUE);
524 	}
525 
526 	apic_cmos_ssb_set = 1;
527 	apic_cpu_send_SIPI(cpun, B_TRUE);
528 
529 	return (0);
530 }
531 
532 /*
533  * Put CPU into halted state with interrupts disabled.
534  */
535 /*ARGSUSED1*/
536 int
537 apic_cpu_stop(processorid_t cpun, caddr_t arg)
538 {
539 	int		rc;
540 	cpu_t 		*cp;
541 	extern cpuset_t cpu_ready_set;
542 	extern void cpu_idle_intercept_cpu(cpu_t *cp);
543 
544 	ASSERT(MUTEX_HELD(&cpu_lock));
545 
546 	if (!apic_cpu_in_range(cpun)) {
547 		return (EINVAL);
548 	}
549 	if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) {
550 		return (ENOTSUP);
551 	}
552 
553 	cp = cpu_get(cpun);
554 	ASSERT(cp != NULL);
555 	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
556 	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
557 	ASSERT((cp->cpu_flags & CPU_ENABLE) == 0);
558 
559 	/* Clear CPU_READY flag to disable cross calls. */
560 	cp->cpu_flags &= ~CPU_READY;
561 	CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);
562 	rc = xc_flush_cpu(cp);
563 	if (rc != 0) {
564 		CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
565 		cp->cpu_flags |= CPU_READY;
566 		return (rc);
567 	}
568 
569 	/* Intercept target CPU at a safe point before powering it off. */
570 	cpu_idle_intercept_cpu(cp);
571 
572 	apic_cpu_send_SIPI(cpun, B_FALSE);
573 	cp->cpu_flags &= ~CPU_RUNNING;
574 
575 	return (0);
576 }
577 
578 int
579 apic_cpu_ops(psm_cpu_request_t *reqp)
580 {
581 	if (reqp == NULL) {
582 		return (EINVAL);
583 	}
584 
585 	switch (reqp->pcr_cmd) {
586 	case PSM_CPU_ADD:
587 		return (apic_cpu_add(reqp));
588 
589 	case PSM_CPU_REMOVE:
590 		return (apic_cpu_remove(reqp));
591 
592 	case PSM_CPU_STOP:
593 		return (apic_cpu_stop(reqp->req.cpu_stop.cpuid,
594 		    reqp->req.cpu_stop.ctx));
595 
596 	default:
597 		return (ENOTSUP);
598 	}
599 }
600 
/*
 * Debug-only tunables.
 * NOTE(review): none of these is referenced in the part of the file visible
 * here; their consumers are presumably elsewhere -- confirm before changing.
 */
#ifdef	DEBUG
int	apic_break_on_cpu = 9;
int	apic_stretch_interrupts = 0;
int	apic_stretch_ISR = 1 << 3;	/* IPL of 3 matches nothing now */
#endif /* DEBUG */
606 
607 /*
608  * generates an interprocessor interrupt to another CPU. Any changes made to
609  * this routine must be accompanied by similar changes to
610  * apic_common_send_ipi().
611  */
612 void
613 apic_send_ipi(int cpun, int ipl)
614 {
615 	int vector;
616 	ulong_t flag;
617 
618 	vector = apic_resv_vector[ipl];
619 
620 	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
621 
622 	flag = intr_clear();
623 
624 	APIC_AV_PENDING_SET();
625 
626 	apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
627 	    vector);
628 
629 	intr_restore(flag);
630 }
631 
632 
/*
 * Intentionally empty: nothing is done here when `cpun' becomes idle.
 */
/*ARGSUSED*/
void
apic_set_idlecpu(processorid_t cpun)
{
}
638 
/*
 * Intentionally empty: nothing is done here when `cpun' stops being idle.
 */
/*ARGSUSED*/
void
apic_unset_idlecpu(processorid_t cpun)
{
}
644 
645 
/*
 * Empty function.  It is called as the body of the busy-wait loops in this
 * file (e.g. in apic_gettime() and apic_cpu_send_SIPI()).
 */
void
apic_ret()
{
}
650 
651 /*
652  * If apic_coarse_time == 1, then apic_gettime() is used instead of
653  * apic_gethrtime().  This is used for performance instead of accuracy.
654  */
655 
656 hrtime_t
657 apic_gettime()
658 {
659 	int old_hrtime_stamp;
660 	hrtime_t temp;
661 
662 	/*
663 	 * In one-shot mode, we do not keep time, so if anyone
664 	 * calls psm_gettime() directly, we vector over to
665 	 * gethrtime().
666 	 * one-shot mode MUST NOT be enabled if this psm is the source of
667 	 * hrtime.
668 	 */
669 
670 	if (apic_oneshot)
671 		return (gethrtime());
672 
673 
674 gettime_again:
675 	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
676 		apic_ret();
677 
678 	temp = apic_nsec_since_boot;
679 
680 	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
681 		goto gettime_again;
682 	}
683 	return (temp);
684 }
685 
/*
 * Here we return the number of nanoseconds since booting.  Note every
 * clock interrupt increments apic_nsec_since_boot by the appropriate
 * amount.
 *
 * The result is apic_nsec_since_boot plus the time corresponding to the
 * ticks elapsed in the current timer period.  The timer count is read
 * directly when running on the CPU whose local APIC id matches
 * apic_cpus[0].aci_local_id, and through an APIC remote read otherwise.
 * The function runs with interrupts disabled and apic_gethrtime_lock held,
 * and never returns a value smaller than the last one it returned.
 */
hrtime_t
apic_gethrtime(void)
{
	int curr_timeval, countval, elapsed_ticks;
	int old_hrtime_stamp, status;
	hrtime_t temp;
	uint32_t cpun;
	ulong_t oflags;

	/*
	 * In one-shot mode, we do not keep time, so if anyone
	 * calls psm_gethrtime() directly, we vector over to
	 * gethrtime().
	 * one-shot mode MUST NOT be enabled if this psm is the source of
	 * hrtime.
	 */

	if (apic_oneshot)
		return (gethrtime());

	oflags = intr_clear();	/* prevent migration */

	/* in LOCAL_APIC mode the id sits above APIC_ID_BIT_OFFSET */
	cpun = apic_reg_ops->apic_read(APIC_LID_REG);
	if (apic_mode == LOCAL_APIC)
		cpun >>= APIC_ID_BIT_OFFSET;

	lock_set(&apic_gethrtime_lock);

gethrtime_again:
	/* wait until the stamp is even and remember it for the check below */
	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
		apic_ret();

	/*
	 * Check to see which CPU we are on.  Note the time is kept on
	 * the local APIC of CPU 0.  If on CPU 0, simply read the current
	 * counter.  If on another CPU, issue a remote read command to CPU 0.
	 */
	if (cpun == apic_cpus[0].aci_local_id) {
		countval = apic_reg_ops->apic_read(APIC_CURR_COUNT);
	} else {
#ifdef	DEBUG
		APIC_AV_PENDING_SET();
#else
		if (apic_mode == LOCAL_APIC)
			APIC_AV_PENDING_SET();
#endif /* DEBUG */

		apic_reg_ops->apic_write_int_cmd(
		    apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE);

		/* spin until the remote read is no longer pending */
		while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1))
		    & AV_READ_PENDING) {
			apic_ret();
		}

		if (status & AV_REMOTE_STATUS)	/* 1 = valid */
			countval = apic_reg_ops->apic_read(APIC_REMOTE_READ);
		else {	/* 0 = invalid */
			apic_remote_hrterr++;
			/*
			 * return last hrtime right now, will need more
			 * testing if change to retry
			 */
			temp = apic_last_hrtime;

			lock_clear(&apic_gethrtime_lock);

			intr_restore(oflags);

			return (temp);
		}
	}
	/*
	 * A count larger than the last one read is replaced by 0, which
	 * makes elapsed_ticks below equal to apic_hertz_count; otherwise the
	 * count becomes the new last_count_read.
	 */
	if (countval > last_count_read)
		countval = 0;
	else
		last_count_read = countval;

	elapsed_ticks = apic_hertz_count - countval;

	curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks);
	temp = apic_nsec_since_boot + curr_timeval;

	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
		/* we might have clobbered last_count_read. Restore it */
		last_count_read = apic_hertz_count;
		goto gethrtime_again;
	}

	/* never let the returned time go backwards */
	if (temp < apic_last_hrtime) {
		/* return last hrtime if error occurs */
		apic_hrtime_error++;
		temp = apic_last_hrtime;
	}
	else
		apic_last_hrtime = temp;

	lock_clear(&apic_gethrtime_lock);
	intr_restore(oflags);

	return (temp);
}
792 
793 /* apic NMI handler */
794 /*ARGSUSED*/
795 void
796 apic_nmi_intr(caddr_t arg, struct regs *rp)
797 {
798 	if (apic_shutdown_processors) {
799 		apic_disable_local_apic();
800 		return;
801 	}
802 
803 	apic_error |= APIC_ERR_NMI;
804 
805 	if (!lock_try(&apic_nmi_lock))
806 		return;
807 	apic_num_nmis++;
808 
809 	if (apic_kmdb_on_nmi && psm_debugger()) {
810 		debug_enter("NMI received: entering kmdb\n");
811 	} else if (apic_panic_on_nmi) {
812 		/* Keep panic from entering kmdb. */
813 		nopanicdebug = 1;
814 		panic("NMI received\n");
815 	} else {
816 		/*
817 		 * prom_printf is the best shot we have of something which is
818 		 * problem free from high level/NMI type of interrupts
819 		 */
820 		prom_printf("NMI received\n");
821 	}
822 
823 	lock_clear(&apic_nmi_lock);
824 }
825 
826 processorid_t
827 apic_get_next_processorid(processorid_t cpu_id)
828 {
829 
830 	int i;
831 
832 	if (cpu_id == -1)
833 		return ((processorid_t)0);
834 
835 	for (i = cpu_id + 1; i < NCPU; i++) {
836 		if (apic_cpu_in_range(i))
837 			return (i);
838 	}
839 
840 	return ((processorid_t)-1);
841 }
842 
843 int
844 apic_cpu_add(psm_cpu_request_t *reqp)
845 {
846 	int i, rv = 0;
847 	ulong_t iflag;
848 	boolean_t first = B_TRUE;
849 	uchar_t localver = 0;
850 	uint32_t localid, procid;
851 	processorid_t cpuid = (processorid_t)-1;
852 	mach_cpu_add_arg_t *ap;
853 
854 	ASSERT(reqp != NULL);
855 	reqp->req.cpu_add.cpuid = (processorid_t)-1;
856 
857 	/* Check whether CPU hotplug is supported. */
858 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
859 		return (ENOTSUP);
860 	}
861 
862 	ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp;
863 	switch (ap->type) {
864 	case MACH_CPU_ARG_LOCAL_APIC:
865 		localid = ap->arg.apic.apic_id;
866 		procid = ap->arg.apic.proc_id;
867 		if (localid >= 255 || procid > 255) {
868 			cmn_err(CE_WARN,
869 			    "!apic: apicid(%u) or procid(%u) is invalid.",
870 			    localid, procid);
871 			return (EINVAL);
872 		}
873 		break;
874 
875 	case MACH_CPU_ARG_LOCAL_X2APIC:
876 		localid = ap->arg.apic.apic_id;
877 		procid = ap->arg.apic.proc_id;
878 		if (localid >= UINT32_MAX) {
879 			cmn_err(CE_WARN,
880 			    "!apic: x2apicid(%u) is invalid.", localid);
881 			return (EINVAL);
882 		} else if (localid >= 255 && apic_mode == LOCAL_APIC) {
883 			cmn_err(CE_WARN, "!apic: system is in APIC mode, "
884 			    "can't support x2APIC processor.");
885 			return (ENOTSUP);
886 		}
887 		break;
888 
889 	default:
890 		cmn_err(CE_WARN,
891 		    "!apic: unknown argument type %d to apic_cpu_add().",
892 		    ap->type);
893 		return (EINVAL);
894 	}
895 
896 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
897 	iflag = intr_clear();
898 	lock_set(&apic_ioapic_lock);
899 
900 	/* Check whether local APIC id already exists. */
901 	for (i = 0; i < apic_nproc; i++) {
902 		if (!CPU_IN_SET(apic_cpumask, i))
903 			continue;
904 		if (apic_cpus[i].aci_local_id == localid) {
905 			lock_clear(&apic_ioapic_lock);
906 			intr_restore(iflag);
907 			cmn_err(CE_WARN,
908 			    "!apic: local apic id %u already exists.",
909 			    localid);
910 			return (EEXIST);
911 		} else if (apic_cpus[i].aci_processor_id == procid) {
912 			lock_clear(&apic_ioapic_lock);
913 			intr_restore(iflag);
914 			cmn_err(CE_WARN,
915 			    "!apic: processor id %u already exists.",
916 			    (int)procid);
917 			return (EEXIST);
918 		}
919 
920 		/*
921 		 * There's no local APIC version number available in MADT table,
922 		 * so assume that all CPUs are homogeneous and use local APIC
923 		 * version number of the first existing CPU.
924 		 */
925 		if (first) {
926 			first = B_FALSE;
927 			localver = apic_cpus[i].aci_local_ver;
928 		}
929 	}
930 	ASSERT(first == B_FALSE);
931 
932 	/*
933 	 * Try to assign the same cpuid if APIC id exists in the dirty cache.
934 	 */
935 	for (i = 0; i < apic_max_nproc; i++) {
936 		if (CPU_IN_SET(apic_cpumask, i)) {
937 			ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0);
938 			continue;
939 		}
940 		ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE);
941 		if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) &&
942 		    apic_cpus[i].aci_local_id == localid &&
943 		    apic_cpus[i].aci_processor_id == procid) {
944 			cpuid = i;
945 			break;
946 		}
947 	}
948 
949 	/* Avoid the dirty cache and allocate fresh slot if possible. */
950 	if (cpuid == (processorid_t)-1) {
951 		for (i = 0; i < apic_max_nproc; i++) {
952 			if ((apic_cpus[i].aci_status & APIC_CPU_FREE) &&
953 			    (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) {
954 				cpuid = i;
955 				break;
956 			}
957 		}
958 	}
959 
960 	/* Try to find any free slot as last resort. */
961 	if (cpuid == (processorid_t)-1) {
962 		for (i = 0; i < apic_max_nproc; i++) {
963 			if (apic_cpus[i].aci_status & APIC_CPU_FREE) {
964 				cpuid = i;
965 				break;
966 			}
967 		}
968 	}
969 
970 	if (cpuid == (processorid_t)-1) {
971 		lock_clear(&apic_ioapic_lock);
972 		intr_restore(iflag);
973 		cmn_err(CE_NOTE,
974 		    "!apic: failed to allocate cpu id for processor %u.",
975 		    procid);
976 		rv = EAGAIN;
977 	} else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) {
978 		lock_clear(&apic_ioapic_lock);
979 		intr_restore(iflag);
980 		cmn_err(CE_NOTE,
981 		    "!apic: failed to build mapping for processor %u.",
982 		    procid);
983 		rv = EBUSY;
984 	} else {
985 		ASSERT(cpuid >= 0 && cpuid < NCPU);
986 		ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus);
987 		bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0]));
988 		apic_cpus[cpuid].aci_processor_id = procid;
989 		apic_cpus[cpuid].aci_local_id = localid;
990 		apic_cpus[cpuid].aci_local_ver = localver;
991 		CPUSET_ATOMIC_ADD(apic_cpumask, cpuid);
992 		if (cpuid >= apic_nproc) {
993 			apic_nproc = cpuid + 1;
994 		}
995 		lock_clear(&apic_ioapic_lock);
996 		intr_restore(iflag);
997 		reqp->req.cpu_add.cpuid = cpuid;
998 	}
999 
1000 	return (rv);
1001 }
1002 
1003 int
1004 apic_cpu_remove(psm_cpu_request_t *reqp)
1005 {
1006 	int i;
1007 	ulong_t iflag;
1008 	processorid_t cpuid;
1009 
1010 	/* Check whether CPU hotplug is supported. */
1011 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
1012 		return (ENOTSUP);
1013 	}
1014 
1015 	cpuid = reqp->req.cpu_remove.cpuid;
1016 
1017 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
1018 	iflag = intr_clear();
1019 	lock_set(&apic_ioapic_lock);
1020 
1021 	if (!apic_cpu_in_range(cpuid)) {
1022 		lock_clear(&apic_ioapic_lock);
1023 		intr_restore(iflag);
1024 		cmn_err(CE_WARN,
1025 		    "!apic: cpuid %d doesn't exist in apic_cpus array.",
1026 		    cpuid);
1027 		return (ENODEV);
1028 	}
1029 	ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0);
1030 
1031 	if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) {
1032 		lock_clear(&apic_ioapic_lock);
1033 		intr_restore(iflag);
1034 		return (ENOENT);
1035 	}
1036 
1037 	if (cpuid == apic_nproc - 1) {
1038 		/*
1039 		 * We are removing the highest numbered cpuid so we need to
1040 		 * find the next highest cpuid as the new value for apic_nproc.
1041 		 */
1042 		for (i = apic_nproc; i > 0; i--) {
1043 			if (CPU_IN_SET(apic_cpumask, i - 1)) {
1044 				apic_nproc = i;
1045 				break;
1046 			}
1047 		}
1048 		/* at least one CPU left */
1049 		ASSERT(i > 0);
1050 	}
1051 	CPUSET_ATOMIC_DEL(apic_cpumask, cpuid);
1052 	/* mark slot as free and keep it in the dirty cache */
1053 	apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY;
1054 
1055 	lock_clear(&apic_ioapic_lock);
1056 	intr_restore(iflag);
1057 
1058 	return (0);
1059 }
1060 
/*
 * Return the number of ticks the APIC decrements in SF nanoseconds.
 * The fixed-frequency PIT (aka 8254) is used for the measurement.
 *
 * The APIC timer is started from APIC_MAXVAL and its current count is
 * sampled before and after the PIT counter has decremented by (at least)
 * APIC_TIME_COUNT ticks.  Interrupts are disabled during the measurement.
 */
static uint64_t
apic_calibrate_impl()
{
	uint8_t		pit_tick_lo;
	uint16_t	pit_tick, target_pit_tick, pit_ticks_adj;
	uint32_t	pit_ticks;
	uint32_t	start_apic_tick, end_apic_tick, apic_ticks;
	ulong_t		iflag;

	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
	apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);

	iflag = intr_clear();

	/*
	 * Read the 16-bit PIT count (low byte first, then high byte) until
	 * it is at least APIC_TIME_MIN and its low byte lies strictly
	 * between APIC_LB_MIN and APIC_LB_MAX.
	 */
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick < APIC_TIME_MIN ||
	    pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);

	/*
	 * Wait for the PIT to decrement by 5 ticks to ensure
	 * we didn't start in the middle of a tick.
	 * Compare with 0x10 for the wrap around case.
	 */
	target_pit_tick = pit_tick - 5;
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

	start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	/*
	 * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
	 */
	target_pit_tick = pit_tick - APIC_TIME_COUNT;
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

	end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	intr_restore(iflag);

	/* the APIC timer counts down, hence start minus end */
	apic_ticks = start_apic_tick - end_apic_tick;

	/* The PIT might have decremented by more ticks than planned */
	pit_ticks_adj = target_pit_tick - pit_tick;
	/* total number of PIT ticks corresponding to apic_ticks */
	pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;

	/*
	 * Determine the number of APIC clock ticks per SF nanoseconds:
	 * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
	 * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
	 * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
	 * apic_ticks_per_SFns =
	 * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
	 */
	return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
}
1130 
1131 /*
1132  * It was found empirically that 5 measurements seem sufficient to give a good
1133  * accuracy. Most spurious measurements are higher than the target value thus
1134  * we eliminate up to 2/5 spurious measurements.
1135  */
1136 #define	APIC_CALIBRATE_MEASUREMENTS		5
1137 
1138 #define	APIC_CALIBRATE_PERCENT_OFF_WARNING	10
1139 
1140 /*
1141  * Return the number of ticks the APIC decrements in SF nanoseconds.
1142  * Several measurements are taken to filter out outliers.
1143  */
1144 uint64_t
1145 apic_calibrate()
1146 {
1147 	uint64_t	measurements[APIC_CALIBRATE_MEASUREMENTS];
1148 	int		median_idx;
1149 	uint64_t	median;
1150 
1151 	/*
1152 	 * When running under a virtual machine, the emulated PIT and APIC
1153 	 * counters do not always return the right values and can roll over.
1154 	 * Those spurious measurements are relatively rare but could
1155 	 * significantly affect the calibration.
1156 	 * Therefore we take several measurements and then keep the median.
1157 	 * The median is preferred to the average here as we only want to
1158 	 * discard outliers.
1159 	 */
1160 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++)
1161 		measurements[i] = apic_calibrate_impl();
1162 
1163 	/*
1164 	 * sort results and retrieve median.
1165 	 */
1166 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
1167 		for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
1168 			if (measurements[j] < measurements[i]) {
1169 				uint64_t tmp = measurements[i];
1170 				measurements[i] = measurements[j];
1171 				measurements[j] = tmp;
1172 			}
1173 		}
1174 	}
1175 	median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
1176 	median = measurements[median_idx];
1177 
1178 #if (APIC_CALIBRATE_MEASUREMENTS >= 3)
1179 	/*
1180 	 * Check that measurements are consistent. Post a warning
1181 	 * if the three middle values are not close to each other.
1182 	 */
1183 	uint64_t delta_warn = median *
1184 	    APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
1185 	if ((median - measurements[median_idx - 1]) > delta_warn ||
1186 	    (measurements[median_idx + 1] - median) > delta_warn) {
1187 		cmn_err(CE_WARN, "apic_calibrate measurements lack "
1188 		    "precision: %llu, %llu, %llu.",
1189 		    (u_longlong_t)measurements[median_idx - 1],
1190 		    (u_longlong_t)median,
1191 		    (u_longlong_t)measurements[median_idx + 1]);
1192 	}
1193 #endif
1194 
1195 	return (median);
1196 }
1197 
1198 /*
1199  * Initialise the APIC timer on the local APIC of CPU 0 to the desired
1200  * frequency.  Note at this stage in the boot sequence, the boot processor
1201  * is the only active processor.
1202  * hertz value of 0 indicates a one-shot mode request.  In this case
1203  * the function returns the resolution (in nanoseconds) for the hardware
1204  * timer interrupt.  If one-shot mode capability is not available,
1205  * the return value will be 0. apic_enable_oneshot is a global switch
1206  * for disabling the functionality.
1207  * A non-zero positive value for hertz indicates a periodic mode request.
1208  * In this case the hardware will be programmed to generate clock interrupts
1209  * at hertz frequency and returns the resolution of interrupts in
1210  * nanosecond.
1211  */
1212 
1213 int
1214 apic_clkinit(int hertz)
1215 {
1216 	int		ret;
1217 
1218 	apic_int_busy_mark = (apic_int_busy_mark *
1219 	    apic_sample_factor_redistribution) / 100;
1220 	apic_int_free_mark = (apic_int_free_mark *
1221 	    apic_sample_factor_redistribution) / 100;
1222 	apic_diff_for_redistribution = (apic_diff_for_redistribution *
1223 	    apic_sample_factor_redistribution) / 100;
1224 
1225 	ret = apic_timer_init(hertz);
1226 	return (ret);
1227 
1228 }
1229 
1230 /*
1231  * apic_preshutdown:
1232  * Called early in shutdown whilst we can still access filesystems to do
1233  * things like loading modules which will be required to complete shutdown
1234  * after filesystems are all unmounted.
1235  */
1236 void
1237 apic_preshutdown(int cmd, int fcn)
1238 {
1239 	APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n",
1240 	    cmd, fcn, apic_poweroff_method, apic_enable_acpi));
1241 }
1242 
1243 void
1244 apic_shutdown(int cmd, int fcn)
1245 {
1246 	int restarts, attempts;
1247 	int i;
1248 	uchar_t	byte;
1249 	ulong_t iflag;
1250 
1251 	hpet_acpi_fini();
1252 
1253 	/* Send NMI to all CPUs except self to do per processor shutdown */
1254 	iflag = intr_clear();
1255 #ifdef	DEBUG
1256 	APIC_AV_PENDING_SET();
1257 #else
1258 	if (apic_mode == LOCAL_APIC)
1259 		APIC_AV_PENDING_SET();
1260 #endif /* DEBUG */
1261 	apic_shutdown_processors = 1;
1262 	apic_reg_ops->apic_write(APIC_INT_CMD1,
1263 	    AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF);
1264 
1265 	/* restore cmos shutdown byte before reboot */
1266 	if (apic_cmos_ssb_set) {
1267 		outb(CMOS_ADDR, SSB);
1268 		outb(CMOS_DATA, 0);
1269 	}
1270 
1271 	ioapic_disable_redirection();
1272 
1273 	/*	disable apic mode if imcr present	*/
1274 	if (apic_imcrp) {
1275 		outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
1276 		outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC);
1277 	}
1278 
1279 	apic_disable_local_apic();
1280 
1281 	intr_restore(iflag);
1282 
1283 	/* remainder of function is for shutdown cases only */
1284 	if (cmd != A_SHUTDOWN)
1285 		return;
1286 
1287 	/*
1288 	 * Switch system back into Legacy-Mode if using ACPI and
1289 	 * not powering-off.  Some BIOSes need to remain in ACPI-mode
1290 	 * for power-off to succeed (Dell Dimension 4600)
1291 	 * Do not disable ACPI while doing fastreboot
1292 	 */
1293 	if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT)
1294 		(void) AcpiDisable();
1295 
1296 	if (fcn == AD_FASTREBOOT) {
1297 		apic_reg_ops->apic_write(APIC_INT_CMD1,
1298 		    AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF);
1299 	}
1300 
1301 	/* remainder of function is for shutdown+poweroff case only */
1302 	if (fcn != AD_POWEROFF)
1303 		return;
1304 
1305 	switch (apic_poweroff_method) {
1306 		case APIC_POWEROFF_VIA_RTC:
1307 
1308 			/* select the extended NVRAM bank in the RTC */
1309 			outb(CMOS_ADDR, RTC_REGA);
1310 			byte = inb(CMOS_DATA);
1311 			outb(CMOS_DATA, (byte | EXT_BANK));
1312 
1313 			outb(CMOS_ADDR, PFR_REG);
1314 
1315 			/* for Predator must toggle the PAB bit */
1316 			byte = inb(CMOS_DATA);
1317 
1318 			/*
1319 			 * clear power active bar, wakeup alarm and
1320 			 * kickstart
1321 			 */
1322 			byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG);
1323 			outb(CMOS_DATA, byte);
1324 
1325 			/* delay before next write */
1326 			drv_usecwait(1000);
1327 
1328 			/* for S40 the following would suffice */
1329 			byte = inb(CMOS_DATA);
1330 
1331 			/* power active bar control bit */
1332 			byte |= PAB_CBIT;
1333 			outb(CMOS_DATA, byte);
1334 
1335 			break;
1336 
1337 		case APIC_POWEROFF_VIA_ASPEN_BMC:
1338 			restarts = 0;
1339 restart_aspen_bmc:
1340 			if (++restarts == 3)
1341 				break;
1342 			attempts = 0;
1343 			do {
1344 				byte = inb(MISMIC_FLAG_REGISTER);
1345 				byte &= MISMIC_BUSY_MASK;
1346 				if (byte != 0) {
1347 					drv_usecwait(1000);
1348 					if (attempts >= 3)
1349 						goto restart_aspen_bmc;
1350 					++attempts;
1351 				}
1352 			} while (byte != 0);
1353 			outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS);
1354 			byte = inb(MISMIC_FLAG_REGISTER);
1355 			byte |= 0x1;
1356 			outb(MISMIC_FLAG_REGISTER, byte);
1357 			i = 0;
1358 			for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0]));
1359 			    i++) {
1360 				attempts = 0;
1361 				do {
1362 					byte = inb(MISMIC_FLAG_REGISTER);
1363 					byte &= MISMIC_BUSY_MASK;
1364 					if (byte != 0) {
1365 						drv_usecwait(1000);
1366 						if (attempts >= 3)
1367 							goto restart_aspen_bmc;
1368 						++attempts;
1369 					}
1370 				} while (byte != 0);
1371 				outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl);
1372 				outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data);
1373 				byte = inb(MISMIC_FLAG_REGISTER);
1374 				byte |= 0x1;
1375 				outb(MISMIC_FLAG_REGISTER, byte);
1376 			}
1377 			break;
1378 
1379 		case APIC_POWEROFF_VIA_SITKA_BMC:
1380 			restarts = 0;
1381 restart_sitka_bmc:
1382 			if (++restarts == 3)
1383 				break;
1384 			attempts = 0;
1385 			do {
1386 				byte = inb(SMS_STATUS_REGISTER);
1387 				byte &= SMS_STATE_MASK;
1388 				if ((byte == SMS_READ_STATE) ||
1389 				    (byte == SMS_WRITE_STATE)) {
1390 					drv_usecwait(1000);
1391 					if (attempts >= 3)
1392 						goto restart_sitka_bmc;
1393 					++attempts;
1394 				}
1395 			} while ((byte == SMS_READ_STATE) ||
1396 			    (byte == SMS_WRITE_STATE));
1397 			outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS);
1398 			i = 0;
1399 			for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0]));
1400 			    i++) {
1401 				attempts = 0;
1402 				do {
1403 					byte = inb(SMS_STATUS_REGISTER);
1404 					byte &= SMS_IBF_MASK;
1405 					if (byte != 0) {
1406 						drv_usecwait(1000);
1407 						if (attempts >= 3)
1408 							goto restart_sitka_bmc;
1409 						++attempts;
1410 					}
1411 				} while (byte != 0);
1412 				outb(sitka_bmc[i].port, sitka_bmc[i].data);
1413 			}
1414 			break;
1415 
1416 		case APIC_POWEROFF_NONE:
1417 
1418 			/* If no APIC direct method, we will try using ACPI */
1419 			if (apic_enable_acpi) {
1420 				if (acpi_poweroff() == 1)
1421 					return;
1422 			} else
1423 				return;
1424 
1425 			break;
1426 	}
1427 	/*
1428 	 * Wait a limited time here for power to go off.
1429 	 * If the power does not go off, then there was a
1430 	 * problem and we should continue to the halt which
1431 	 * prints a message for the user to press a key to
1432 	 * reboot.
1433 	 */
1434 	drv_usecwait(7000000); /* wait seven seconds */
1435 
1436 }
1437 
/*
 * Cyclic identifier kept at file scope.  It is not referenced in this part
 * of the file.
 * NOTE(review): presumably the handle of an APIC-related cyclic that is
 * registered elsewhere -- confirm against its users.
 */
cyclic_id_t apic_cyclic_id;
1439 
1440 /*
1441  * The following functions are in the platform specific file so that they
1442  * can be different functions depending on whether we are running on
1443  * bare metal or a hypervisor.
1444  */
1445 
/*
 * Map an apic for memory-mapped access.
 *
 * Hands addr, len and flags straight to psm_map_phys() and returns the
 * resulting address as a pointer to 32-bit registers.
 */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
	void *va = (void *)psm_map_phys(addr, len, flags);

	return (va);
}
1454 
/*
 * Map an I/O apic for memory-mapped access; in this file this is simply
 * mapin_apic() under another name.
 */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	uint32_t *va = mapin_apic(addr, len, flags);

	return (va);
}
1460 
1461 /*
1462  * unmap an apic
1463  */
1464 void
1465 mapout_apic(caddr_t addr, size_t len)
1466 {
1467 	psm_unmap_phys(addr, len);
1468 }
1469 
1470 void
1471 mapout_ioapic(caddr_t addr, size_t len)
1472 {
1473 	mapout_apic(addr, len);
1474 }
1475 
1476 uint32_t
1477 ioapic_read(int ioapic_ix, uint32_t reg)
1478 {
1479 	volatile uint32_t *ioapic;
1480 
1481 	ioapic = apicioadr[ioapic_ix];
1482 	ioapic[APIC_IO_REG] = reg;
1483 	return (ioapic[APIC_IO_DATA]);
1484 }
1485 
1486 void
1487 ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value)
1488 {
1489 	volatile uint32_t *ioapic;
1490 
1491 	ioapic = apicioadr[ioapic_ix];
1492 	ioapic[APIC_IO_REG] = reg;
1493 	ioapic[APIC_IO_DATA] = value;
1494 }
1495 
1496 void
1497 ioapic_write_eoi(int ioapic_ix, uint32_t value)
1498 {
1499 	volatile uint32_t *ioapic;
1500 
1501 	ioapic = apicioadr[ioapic_ix];
1502 	ioapic[APIC_IO_EOI] = value;
1503 }
1504 
1505 /*
1506  * Round-robin algorithm to find the next CPU with interrupts enabled.
1507  * It can't share the same static variable apic_next_bind_cpu with
1508  * apic_get_next_bind_cpu(), since that will cause all interrupts to be
1509  * bound to CPU1 at boot time.  During boot, only CPU0 is online with
1510  * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu()
1511  * are called.  However, the pcplusmp driver assumes that there will be
1512  * boot_ncpus CPUs configured eventually so it tries to distribute all
1513  * interrupts among CPU0 - CPU[boot_ncpus - 1].  Thus to prevent all
1514  * interrupts being targetted at CPU1, we need to use a dedicated static
1515  * variable for find_next_cpu() instead of sharing apic_next_bind_cpu.
1516  */
1517 
1518 processorid_t
1519 apic_find_cpu(int flag)
1520 {
1521 	int i;
1522 	static processorid_t acid = 0;
1523 
1524 	/* Find the first CPU with the passed-in flag set */
1525 	for (i = 0; i < apic_nproc; i++) {
1526 		if (++acid >= apic_nproc) {
1527 			acid = 0;
1528 		}
1529 		if (apic_cpu_in_range(acid) &&
1530 		    (apic_cpus[acid].aci_status & flag)) {
1531 			break;
1532 		}
1533 	}
1534 
1535 	ASSERT((apic_cpus[acid].aci_status & flag) != 0);
1536 	return (acid);
1537 }
1538 
1539 void
1540 apic_intrmap_init(int apic_mode)
1541 {
1542 	int suppress_brdcst_eoi = 0;
1543 
1544 	/*
1545 	 * Intel Software Developer's Manual 3A, 10.12.7:
1546 	 *
1547 	 * Routing of device interrupts to local APIC units operating in
1548 	 * x2APIC mode requires use of the interrupt-remapping architecture
1549 	 * specified in the Intel Virtualization Technology for Directed
1550 	 * I/O, Revision 1.3.  Because of this, BIOS must enumerate support
1551 	 * for and software must enable this interrupt remapping with
1552 	 * Extended Interrupt Mode Enabled before it enabling x2APIC mode in
1553 	 * the local APIC units.
1554 	 *
1555 	 *
1556 	 * In other words, to use the APIC in x2APIC mode, we need interrupt
1557 	 * remapping.  Since we don't start up the IOMMU by default, we
1558 	 * won't be able to do any interrupt remapping and therefore have to
1559 	 * use the APIC in traditional 'local APIC' mode with memory mapped
1560 	 * I/O.
1561 	 */
1562 
1563 	if (psm_vt_ops != NULL) {
1564 		if (((apic_intrmap_ops_t *)psm_vt_ops)->
1565 		    apic_intrmap_init(apic_mode) == DDI_SUCCESS) {
1566 
1567 			apic_vt_ops = psm_vt_ops;
1568 
1569 			/*
1570 			 * We leverage the interrupt remapping engine to
1571 			 * suppress broadcast EOI; thus we must send the
1572 			 * directed EOI with the directed-EOI handler.
1573 			 */
1574 			if (apic_directed_EOI_supported() == 0) {
1575 				suppress_brdcst_eoi = 1;
1576 			}
1577 
1578 			apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);
1579 
1580 			if (apic_detect_x2apic()) {
1581 				apic_enable_x2apic();
1582 			}
1583 
1584 			if (apic_directed_EOI_supported() == 0) {
1585 				apic_set_directed_EOI_handler();
1586 			}
1587 		}
1588 	}
1589 }
1590 
1591 /*ARGSUSED*/
1592 static void
1593 apic_record_ioapic_rdt(void *intrmap_private, ioapic_rdt_t *irdt)
1594 {
1595 	irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
1596 }
1597 
1598 /*ARGSUSED*/
1599 static void
1600 apic_record_msi(void *intrmap_private, msi_regs_t *mregs)
1601 {
1602 	mregs->mr_addr = MSI_ADDR_HDR |
1603 	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
1604 	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
1605 	    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
1606 	mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
1607 	    mregs->mr_data;
1608 }
1609 
1610 /*
1611  * Functions from apic_introp.c
1612  *
1613  * Those functions are used by apic_intr_ops().
1614  */
1615 
/*
 * MSI support flag:
 * reflects whether MSI is supported at the APIC level.
 * It can also be patched through /etc/system.
 *
 *  0 = default value - not yet known; apic_check_msi_support() has to be
 *      called to find out, and the flag is then set accordingly
 *  1 = supported
 * -1 = not supported
 */
int	apic_support_msi = 0;

/* Multiple vector support for MSI-X (initialised to 1) */
int	apic_msix_enable = 1;

/* Multiple vector support for MSI (initialised to 1) */
int	apic_multi_msi_enable = 1;
1633 
1634 /*
1635  * Check whether the system supports MSI.
1636  *
1637  * MSI is required for PCI-E and for PCI versions later than 2.2, so if we find
1638  * a PCI-E bus or we find a PCI bus whose version we know is >= 2.2, then we
1639  * return PSM_SUCCESS to indicate this system supports MSI.
1640  *
1641  * (Currently the only way we check whether a given PCI bus supports >= 2.2 is
1642  * by detecting if we are running inside the KVM hypervisor, which guarantees
1643  * this version number.)
1644  */
1645 int
1646 apic_check_msi_support()
1647 {
1648 	dev_info_t *cdip;
1649 	char dev_type[16];
1650 	int dev_len;
1651 
1652 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n"));
1653 
1654 	/*
1655 	 * check whether the first level children of root_node have
1656 	 * PCI-E or PCI capability.
1657 	 */
1658 	for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL;
1659 	    cdip = ddi_get_next_sibling(cdip)) {
1660 
1661 		DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p,"
1662 		    " driver: %s, binding: %s, nodename: %s\n", (void *)cdip,
1663 		    ddi_driver_name(cdip), ddi_binding_name(cdip),
1664 		    ddi_node_name(cdip)));
1665 		dev_len = sizeof (dev_type);
1666 		if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1667 		    "device_type", (caddr_t)dev_type, &dev_len)
1668 		    != DDI_PROP_SUCCESS)
1669 			continue;
1670 		if (strcmp(dev_type, "pciex") == 0)
1671 			return (PSM_SUCCESS);
1672 		if (strcmp(dev_type, "pci") == 0 && get_hwenv() == HW_KVM)
1673 			return (PSM_SUCCESS);
1674 	}
1675 
1676 	/* MSI is not supported on this system */
1677 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' "
1678 	    "device_type found\n"));
1679 	return (PSM_FAILURE);
1680 }
1681 
1682 /*
1683  * apic_pci_msi_unconfigure:
1684  *
1685  * This and next two interfaces are copied from pci_intr_lib.c
1686  * Do ensure that these two files stay in sync.
1687  * These needed to be copied over here to avoid a deadlock situation on
1688  * certain mp systems that use MSI interrupts.
1689  *
1690  * IMPORTANT regards next three interfaces:
1691  * i) are called only for MSI/X interrupts.
1692  * ii) called with interrupts disabled, and must not block
1693  */
1694 void
1695 apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum)
1696 {
1697 	ushort_t		msi_ctrl;
1698 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1699 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1700 
1701 	ASSERT((handle != NULL) && (cap_ptr != 0));
1702 
1703 	if (type == DDI_INTR_TYPE_MSI) {
1704 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1705 		msi_ctrl &= (~PCI_MSI_MME_MASK);
1706 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1707 		pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0);
1708 
1709 		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1710 			pci_config_put16(handle,
1711 			    cap_ptr + PCI_MSI_64BIT_DATA, 0);
1712 			pci_config_put32(handle,
1713 			    cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0);
1714 		} else {
1715 			pci_config_put16(handle,
1716 			    cap_ptr + PCI_MSI_32BIT_DATA, 0);
1717 		}
1718 
1719 	} else if (type == DDI_INTR_TYPE_MSIX) {
1720 		uintptr_t	off;
1721 		uint32_t	mask;
1722 		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(rdip);
1723 
1724 		ASSERT(msix_p != NULL);
1725 
1726 		/* Offset into "inum"th entry in the MSI-X table & mask it */
1727 		off = (uintptr_t)msix_p->msix_tbl_addr + (inum *
1728 		    PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET;
1729 
1730 		mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off);
1731 
1732 		ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1));
1733 
1734 		/* Offset into the "inum"th entry in the MSI-X table */
1735 		off = (uintptr_t)msix_p->msix_tbl_addr +
1736 		    (inum * PCI_MSIX_VECTOR_SIZE);
1737 
1738 		/* Reset the "data" and "addr" bits */
1739 		ddi_put32(msix_p->msix_tbl_hdl,
1740 		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0);
1741 		ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0);
1742 	}
1743 }
1744 
1745 /*
1746  * apic_pci_msi_disable_mode:
1747  */
1748 void
1749 apic_pci_msi_disable_mode(dev_info_t *rdip, int type)
1750 {
1751 	ushort_t		msi_ctrl;
1752 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1753 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1754 
1755 	ASSERT((handle != NULL) && (cap_ptr != 0));
1756 
1757 	if (type == DDI_INTR_TYPE_MSI) {
1758 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1759 		if (!(msi_ctrl & PCI_MSI_ENABLE_BIT))
1760 			return;
1761 
1762 		msi_ctrl &= ~PCI_MSI_ENABLE_BIT;	/* MSI disable */
1763 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1764 
1765 	} else if (type == DDI_INTR_TYPE_MSIX) {
1766 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL);
1767 		if (msi_ctrl & PCI_MSIX_ENABLE_BIT) {
1768 			msi_ctrl &= ~PCI_MSIX_ENABLE_BIT;
1769 			pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL,
1770 			    msi_ctrl);
1771 		}
1772 	}
1773 }
1774 
1775 uint32_t
1776 apic_get_localapicid(uint32_t cpuid)
1777 {
1778 	ASSERT(cpuid < apic_nproc && apic_cpus != NULL);
1779 
1780 	return (apic_cpus[cpuid].aci_local_id);
1781 }
1782 
1783 uchar_t
1784 apic_get_ioapicid(uchar_t ioapicindex)
1785 {
1786 	ASSERT(ioapicindex < MAX_IO_APIC);
1787 
1788 	return (apic_io_id[ioapicindex]);
1789 }
1790