xref: /illumos-gate/usr/src/uts/i86pc/io/pcplusmp/apic_common.c (revision aa9ef484c6f8ecee85dfefdb4970c50cfa2db302)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright 2018 Joyent, Inc.
27  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
28  */
29 
30 /*
31  * PSMI 1.1 extensions are supported only in 2.6 and later versions.
32  * PSMI 1.2 extensions are supported only in 2.7 and later versions.
33  * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
34  * PSMI 1.5 extensions are supported in Solaris Nevada.
35  * PSMI 1.6 extensions are supported in Solaris Nevada.
36  * PSMI 1.7 extensions are supported in Solaris Nevada.
37  */
38 #define	PSMI_1_7
39 
40 #include <sys/processor.h>
41 #include <sys/time.h>
42 #include <sys/psm.h>
43 #include <sys/smp_impldefs.h>
44 #include <sys/cram.h>
45 #include <sys/acpi/acpi.h>
46 #include <sys/acpica.h>
47 #include <sys/psm_common.h>
48 #include <sys/apic.h>
49 #include <sys/pit.h>
50 #include <sys/ddi.h>
51 #include <sys/sunddi.h>
52 #include <sys/ddi_impldefs.h>
53 #include <sys/pci.h>
54 #include <sys/promif.h>
55 #include <sys/x86_archext.h>
56 #include <sys/cpc_impl.h>
57 #include <sys/uadmin.h>
58 #include <sys/panic.h>
59 #include <sys/debug.h>
60 #include <sys/archsystm.h>
61 #include <sys/trap.h>
62 #include <sys/machsystm.h>
63 #include <sys/sysmacros.h>
64 #include <sys/cpuvar.h>
65 #include <sys/rm_platter.h>
66 #include <sys/privregs.h>
67 #include <sys/note.h>
68 #include <sys/pci_intr_lib.h>
69 #include <sys/spl.h>
70 #include <sys/clock.h>
71 #include <sys/dditypes.h>
72 #include <sys/sunddi.h>
73 #include <sys/x_call.h>
74 #include <sys/reboot.h>
75 #include <sys/hpet.h>
76 #include <sys/apic_common.h>
77 #include <sys/apic_timer.h>
78 
/*
 * Forward declarations of the two recording callbacks that occupy the last
 * two slots of the apic_nointrmap_ops table defined further down.
 */
static void	apic_record_ioapic_rdt(void *intrmap_private,
		    ioapic_rdt_t *irdt);
static void	apic_record_msi(void *intrmap_private, msi_regs_t *mregs);
82 
/*
 * Common routines between pcplusmp & apix (taken from apic.c).
 */

int	apic_clkinit(int);
hrtime_t apic_gethrtime(void);
void	apic_send_ipi(int, int);
void	apic_set_idlecpu(processorid_t);
void	apic_unset_idlecpu(processorid_t);
void	apic_shutdown(int, int);
void	apic_preshutdown(int, int);
processorid_t	apic_get_next_processorid(processorid_t);

/* coarse (cheaper, less accurate) counterpart of apic_gethrtime() */
hrtime_t apic_gettime();
97 
/*
 * IOAPIC EOI handling method for the apix module. Starts out as the pcplusmp
 * method and is updated by apic_ioapic_method_probe().
 */
enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP;

/* Now the ones for Dynamic Interrupt distribution */
int	apic_enable_dynamic_migration = 0;

/* maximum loop count when sending Start IPIs. */
int apic_sipi_max_loop_count = 0x1000;

/*
 * These variables are frequently accessed in apic_intr_enter(),
 * apic_intr_exit and apic_setspl, so group them together
 */
volatile uint32_t *apicadr =  NULL;	/* virtual addr of local APIC	*/
int apic_setspl_delay = 1;		/* apic_setspl - delay enable	*/
int apic_clkvect;

/* vector at which error interrupts come in */
int apic_errvect;
int apic_enable_error_intr = 1;
/*
 * Number of tenmicrosec() calls apic_error_intr() makes after printing an
 * error; doubled by that routine while it is below 500.
 */
int apic_error_display_delay = 100;

/* vector at which performance counter overflow interrupts come in */
int apic_cpcovf_vect;
int apic_enable_cpcovf_intr = 1;

/* vector at which CMCI interrupts come in */
int apic_cmci_vect;
extern int cmi_enable_cmci;
extern void cmi_cmci_trap(void);

kmutex_t cmci_cpu_setup_lock;	/* protects cmci_cpu_setup_registered */
int cmci_cpu_setup_registered;

/* NOTE(review): users of this lock were not located; confirm its purpose */
lock_t apic_mode_switch_lock;

/* vector sent by apic_send_pir_ipi(); reported by apic_get_pir_ipivect() */
int apic_pir_vect;
134 
/*
 * Patchable global variables.
 */
int	apic_forceload = 0;

int	apic_coarse_hrtime = 1;		/* 0 - use accurate slow gethrtime() */

int	apic_flat_model = 0;		/* 0 - clustered. 1 - flat */
int	apic_panic_on_nmi = 0;		/* non-zero: apic_nmi_intr() panics */
/* non-zero: apic_error_intr() panics instead of recording the error */
int	apic_panic_on_apic_error = 0;

int	apic_verbose = 0;	/* 0x1ff */

#ifdef DEBUG
/* non-zero: apic_error_intr() calls debug_enter() when an error is seen */
int	apic_debug = 0;
int	apic_restrict_vector = 0;

int	apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE];
int	apic_debug_msgbufindex = 0;

#endif /* DEBUG */
156 
uint_t apic_nticks = 0;
uint_t apic_skipped_redistribute = 0;

/* most recent APIC current-count value accepted by apic_gethrtime() */
uint_t last_count_read = 0;
/* serializes the body of apic_gethrtime() */
lock_t	apic_gethrtime_lock;
/*
 * Update stamp for apic_nsec_since_boot. Readers spin while bit 0 is set and
 * retry if the stamp changes underneath them.
 * NOTE(review): presumably advanced by the clock interrupt - confirm.
 */
volatile int	apic_hrtime_stamp = 0;
volatile hrtime_t apic_nsec_since_boot = 0;

/* last value handed out by apic_gethrtime(); used to keep time monotonic */
static	hrtime_t	apic_last_hrtime = 0;
/* times apic_gethrtime() computed a value below apic_last_hrtime */
int		apic_hrtime_error = 0;
/* times the remote read of CPU 0's counter came back invalid */
int		apic_remote_hrterr = 0;
/* NMIs processed by apic_nmi_intr() while holding apic_nmi_lock */
int		apic_num_nmis = 0;
/* OR of all error status bits recorded by apic_error_intr() */
int		apic_apic_error = 0;
int		apic_num_apic_errors = 0;
int		apic_num_cksum_errors = 0;

/* bitmask of APIC_ERR_* conditions seen so far */
int	apic_error = 0;

/* set to 1 by apic_cpu_start() */
static	int	apic_cmos_ssb_set = 0;

/* use to make sure only one cpu handles the nmi */
lock_t	apic_nmi_lock;
/* use to make sure only one cpu handles the error interrupt */
lock_t	apic_error_lock;
181 
/*
 * Table of (control, data) byte pairs. Per the per-entry comments the first
 * group encodes a SET_WATCHDOG_TIMER command and the second group a
 * RESET_WATCHDOG_TIMER command.
 * NOTE(review): the consumer of this table is outside the visible part of the
 * file (presumably the shutdown path) - confirm before changing entries.
 */
static	struct {
	uchar_t	cntl;
	uchar_t	data;
} aspen_bmc[] = {
	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_NEXT,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ CC_SMS_WR_NEXT,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ CC_SMS_WR_NEXT,	0x2 },		/* DataByte 2: Power Down */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 3: no pre-timeout */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 4: timer expir. */
	{ CC_SMS_WR_NEXT,	0xa },		/* DataByte 5: init countdown */
	{ CC_SMS_WR_END,	0x0 },		/* DataByte 6: init countdown */

	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_END,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
198 
/*
 * Table of (port, data) pairs carrying the same two commands as aspen_bmc
 * (SET_WATCHDOG_TIMER, then RESET_WATCHDOG_TIMER), expressed as writes to the
 * SMS command and data registers.
 * NOTE(review): the consumer of this table is outside the visible part of the
 * file (presumably the shutdown path) - confirm before changing entries.
 */
static	struct {
	int	port;
	uchar_t	data;
} sitka_bmc[] = {
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_DATA_REGISTER,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ SMS_DATA_REGISTER,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ SMS_DATA_REGISTER,	0x2 },		/* DataByte 2: Power Down */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 3: no pre-timeout */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 4: timer expir. */
	{ SMS_DATA_REGISTER,	0xa },		/* DataByte 5: init countdown */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 6: init countdown */

	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
219 
/* Patchable global variables. */
/* consulted by apic_nmi_intr() before the panic/print alternatives */
int		apic_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
/* value written to APIC_DIVIDE_REG during calibration */
uint32_t	apic_divide_reg_init = 0;	/* 0 - divide by 2 */
223 
/*
 * default apic ops without interrupt remapping
 *
 * The first five slots are filled with return_instr cast to each slot's
 * function-pointer type; the last two record the IOAPIC RDT entry and the
 * MSI registers via the static helpers declared at the top of the file.
 */
static apic_intrmap_ops_t apic_nointrmap_ops = {
	(int (*)(int))return_instr,
	(void (*)(int))return_instr,
	(void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr,
	(void (*)(void *, void *, uint16_t, int))return_instr,
	(void (*)(void **))return_instr,
	apic_record_ioapic_rdt,
	apic_record_msi,
};
234 
/* active interrupt-remapping ops; defaults to the no-remapping table */
apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
/* per-CPU APIC information, indexed by cpuid */
apic_cpus_info_t	*apic_cpus = NULL;
/* set of cpuids whose apic_cpus[] slot is in use (see apic_cpu_add/remove) */
cpuset_t	apic_cpumask;
uint_t		apic_picinit_called;

/* Flag to indicate that we need to shut down all processors */
static uint_t	apic_shutdown_processors;
242 
243 /*
244  * Probe the ioapic method for apix module. Called in apic_probe_common()
245  */
246 int
247 apic_ioapic_method_probe()
248 {
249 	if (apix_enable == 0)
250 		return (PSM_SUCCESS);
251 
252 	/*
253 	 * Set IOAPIC EOI handling method. The priority from low to high is:
254 	 *	1. IOxAPIC: with EOI register
255 	 *	2. IOMMU interrupt mapping
256 	 *	3. Mask-Before-EOI method for systems without boot
257 	 *	interrupt routing, such as systems with only one IOAPIC;
258 	 *	NVIDIA CK8-04/MCP55 systems; systems with bridge solution
259 	 *	which disables the boot interrupt routing already.
260 	 *	4. Directed EOI
261 	 */
262 	if (apic_io_ver[0] >= 0x20)
263 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC;
264 	if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max))
265 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK;
266 	if (apic_directed_EOI_supported())
267 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI;
268 
269 	/* fall back to pcplusmp */
270 	if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) {
271 		/* make sure apix is after pcplusmp in /etc/mach */
272 		apix_enable = 0; /* go ahead with pcplusmp install next */
273 		return (PSM_FAILURE);
274 	}
275 
276 	return (PSM_SUCCESS);
277 }
278 
279 /*
280  * handler for APIC Error interrupt. Just print a warning and continue
281  */
282 int
283 apic_error_intr()
284 {
285 	uint_t	error0, error1, error;
286 	uint_t	i;
287 
288 	/*
289 	 * We need to write before read as per 7.4.17 of system prog manual.
290 	 * We do both and or the results to be safe
291 	 */
292 	error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
293 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
294 	error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
295 	error = error0 | error1;
296 
297 	/*
298 	 * Clear the APIC error status (do this on all cpus that enter here)
299 	 * (two writes are required due to the semantics of accessing the
300 	 * error status register.)
301 	 */
302 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
303 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
304 
305 	/*
306 	 * Prevent more than 1 CPU from handling error interrupt causing
307 	 * double printing (interleave of characters from multiple
308 	 * CPU's when using prom_printf)
309 	 */
310 	if (lock_try(&apic_error_lock) == 0)
311 		return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
312 	if (error) {
313 #if	DEBUG
314 		if (apic_debug)
315 			debug_enter("pcplusmp: APIC Error interrupt received");
316 #endif /* DEBUG */
317 		if (apic_panic_on_apic_error)
318 			cmn_err(CE_PANIC,
319 			    "APIC Error interrupt on CPU %d. Status = %x",
320 			    psm_get_cpu_id(), error);
321 		else {
322 			if ((error & ~APIC_CS_ERRORS) == 0) {
323 				/* cksum error only */
324 				apic_error |= APIC_ERR_APIC_ERROR;
325 				apic_apic_error |= error;
326 				apic_num_apic_errors++;
327 				apic_num_cksum_errors++;
328 			} else {
329 				/*
330 				 * prom_printf is the best shot we have of
331 				 * something which is problem free from
332 				 * high level/NMI type of interrupts
333 				 */
334 				prom_printf("APIC Error interrupt on CPU %d. "
335 				    "Status 0 = %x, Status 1 = %x\n",
336 				    psm_get_cpu_id(), error0, error1);
337 				apic_error |= APIC_ERR_APIC_ERROR;
338 				apic_apic_error |= error;
339 				apic_num_apic_errors++;
340 				for (i = 0; i < apic_error_display_delay; i++) {
341 					tenmicrosec();
342 				}
343 				/*
344 				 * provide more delay next time limited to
345 				 * roughly 1 clock tick time
346 				 */
347 				if (apic_error_display_delay < 500)
348 					apic_error_display_delay *= 2;
349 			}
350 		}
351 		lock_clear(&apic_error_lock);
352 		return (DDI_INTR_CLAIMED);
353 	} else {
354 		lock_clear(&apic_error_lock);
355 		return (DDI_INTR_UNCLAIMED);
356 	}
357 }
358 
/*
 * Turn off the mask bit in the performance counter Local Vector Table entry.
 *
 * Done as a read-modify-write of APIC_PCINT_VECT so that every bit other
 * than APIC_LVT_MASK keeps its current value.
 */
void
apic_cpcovf_mask_clear(void)
{
	apic_reg_ops->apic_write(APIC_PCINT_VECT,
	    (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK));
}
368 
/*
 * Cross-call handler used by cmci_cpu_setup(): write apic_cmci_vect, without
 * the AV_MASK bit, to the CMCI LVT entry of the CPU it runs on. The three
 * arguments are unused. Always returns 0.
 */
/*ARGSUSED*/
static int
apic_cmci_enable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
	return (0);
}
376 
/*
 * Cross-call handler used by cmci_cpu_setup(): write apic_cmci_vect with the
 * AV_MASK bit set to the CMCI LVT entry of the CPU it runs on. The three
 * arguments are unused. Always returns 0.
 */
/*ARGSUSED*/
static int
apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK);
	return (0);
}
384 
385 /*ARGSUSED*/
386 int
387 cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
388 {
389 	cpuset_t	cpu_set;
390 
391 	CPUSET_ONLY(cpu_set, cpuid);
392 
393 	switch (what) {
394 		case CPU_ON:
395 			xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
396 			    (xc_func_t)apic_cmci_enable);
397 			break;
398 
399 		case CPU_OFF:
400 			xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
401 			    (xc_func_t)apic_cmci_disable);
402 			break;
403 
404 		default:
405 			break;
406 	}
407 
408 	return (0);
409 }
410 
/*
 * Quiesce the local APIC of the calling CPU: set the task register to
 * APIC_MASK_ALL, write AV_MASK to the timer, LINT0, LINT1 (NMI), error and
 * performance counter LVT entries, and finally write APIC_SPUR_INTR to the
 * spurious interrupt register. Called from apic_nmi_intr() when
 * apic_shutdown_processors is set.
 */
static void
apic_disable_local_apic(void)
{
	apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL);
	apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK);

	/* local intr reg 0 */
	apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK);

	/* disable NMI */
	apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK);

	/* and error interrupt */
	apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK);

	/* and perf counter intr */
	apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK);

	apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR);
}
431 
/*
 * Send the INIT / Startup IPI sequence to the CPU in slot cpun.
 *
 *	cpun	index into apic_cpus[] of the target CPU
 *	start	B_TRUE when the CPU is being started: BIOS_SHUTDOWN is then
 *		written to the CMOS SSB byte before the IPIs are sent.
 *		apic_cpu_stop() passes B_FALSE.
 *
 * The sequence is: INIT assert, bounded wait for the pending bit to clear
 * (xAPIC mode only), INIT de-assert, 20ms delay and, for integrated APICs,
 * two Startup IPIs whose vector is the page number of rm_platter_pa, each
 * followed by a 200us delay. Interrupts are disabled on the calling CPU for
 * the whole sequence.
 */
static void
apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
{
	int		loop_count;
	uint32_t	vector;
	uint_t		apicid;
	ulong_t		iflag;

	apicid =  apic_cpus[cpun].aci_local_id;

	/*
	 * Interrupts on current CPU will be disabled during the
	 * steps in order to avoid unwanted side effects from
	 * executing interrupt handlers on a problematic BIOS.
	 */
	iflag = intr_clear();

	if (start) {
		outb(CMOS_ADDR, SSB);
		outb(CMOS_DATA, BIOS_SHUTDOWN);
	}

	/*
	 * According to X2APIC specification in section '2.3.5.1' of
	 * Interrupt Command Register Semantics, the semantics of
	 * programming the Interrupt Command Register to dispatch an interrupt
	 * is simplified. A single MSR write to the 64-bit ICR is required
	 * for dispatching an interrupt. Specifically, with the 64-bit MSR
	 * interface to ICR, system software is not required to check the
	 * status of the delivery status bit prior to writing to the ICR
	 * to send an IPI. With the removal of the Delivery Status bit,
	 * system software no longer has a reason to read the ICR. It remains
	 * readable only to aid in debugging.
	 */
#ifdef	DEBUG
	APIC_AV_PENDING_SET();
#else
	if (apic_mode == LOCAL_APIC) {
		APIC_AV_PENDING_SET();
	}
#endif /* DEBUG */

	/* for integrated - make sure there is one INIT IPI in buffer */
	/* for external - it will wake up the cpu */
	apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);

	/*
	 * If only 1 CPU is installed, PENDING bit will not go low, hence the
	 * bound of apic_sipi_max_loop_count iterations. In x2APIC mode the
	 * loop exits on its first pass.
	 */
	for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
		if (apic_mode == LOCAL_APIC &&
		    apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
			apic_ret();
		else
			break;
	}

	apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
	drv_usecwait(20000);		/* 20 milli sec */

	if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
		/* integrated apic */

		/* the Startup IPI vector is the page number of the platter */
		vector = (rm_platter_pa >> MMU_PAGESHIFT) &
		    (APIC_VECTOR_MASK | APIC_IPL_MASK);

		/* to offset the INIT IPI queue up in the buffer */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);		/* 200 micro sec */

		/*
		 * send the second SIPI (Startup IPI) as recommended by Intel
		 * software development manual.
		 */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);	/* 200 micro sec */
	}

	intr_restore(iflag);
}
510 
511 /*ARGSUSED1*/
512 int
513 apic_cpu_start(processorid_t cpun, caddr_t arg)
514 {
515 	ASSERT(MUTEX_HELD(&cpu_lock));
516 
517 	if (!apic_cpu_in_range(cpun)) {
518 		return (EINVAL);
519 	}
520 
521 	/*
522 	 * Switch to apic_common_send_ipi for safety during starting other CPUs.
523 	 */
524 	if (apic_mode == LOCAL_X2APIC) {
525 		apic_switch_ipi_callback(B_TRUE);
526 	}
527 
528 	apic_cmos_ssb_set = 1;
529 	apic_cpu_send_SIPI(cpun, B_TRUE);
530 
531 	return (0);
532 }
533 
/*
 * Put CPU into halted state with interrupts disabled.
 *
 *	cpun	id of the CPU to stop; it must already be offline, quiesced
 *		and not enabled (asserted below)
 *	arg	unused
 *
 * The caller must hold cpu_lock. Returns 0 on success, EINVAL if cpun is not
 * a known CPU, ENOTSUP if the CPU's local APIC is older than
 * APIC_INTEGRATED_VERS, or the error from xc_flush_cpu(), in which case the
 * CPU is put back into the ready set.
 */
/*ARGSUSED1*/
int
apic_cpu_stop(processorid_t cpun, caddr_t arg)
{
	int		rc;
	cpu_t		*cp;
	extern cpuset_t cpu_ready_set;
	extern void cpu_idle_intercept_cpu(cpu_t *cp);

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!apic_cpu_in_range(cpun)) {
		return (EINVAL);
	}
	if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) {
		return (ENOTSUP);
	}

	cp = cpu_get(cpun);
	ASSERT(cp != NULL);
	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
	ASSERT((cp->cpu_flags & CPU_ENABLE) == 0);

	/* Clear CPU_READY flag to disable cross calls. */
	cp->cpu_flags &= ~CPU_READY;
	CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);
	rc = xc_flush_cpu(cp);
	if (rc != 0) {
		/* undo the two steps above so the CPU stays usable */
		CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
		cp->cpu_flags |= CPU_READY;
		return (rc);
	}

	/* Intercept target CPU at a safe point before powering it off. */
	cpu_idle_intercept_cpu(cp);

	/* B_FALSE: do not touch the CMOS shutdown byte */
	apic_cpu_send_SIPI(cpun, B_FALSE);
	cp->cpu_flags &= ~CPU_RUNNING;

	return (0);
}
579 
580 int
581 apic_cpu_ops(psm_cpu_request_t *reqp)
582 {
583 	if (reqp == NULL) {
584 		return (EINVAL);
585 	}
586 
587 	switch (reqp->pcr_cmd) {
588 	case PSM_CPU_ADD:
589 		return (apic_cpu_add(reqp));
590 
591 	case PSM_CPU_REMOVE:
592 		return (apic_cpu_remove(reqp));
593 
594 	case PSM_CPU_STOP:
595 		return (apic_cpu_stop(reqp->req.cpu_stop.cpuid,
596 		    reqp->req.cpu_stop.ctx));
597 
598 	default:
599 		return (ENOTSUP);
600 	}
601 }
602 
#ifdef	DEBUG
/*
 * DEBUG-only tunables.
 * NOTE(review): their consumers were not located; confirm usage before
 * changing the defaults.
 */
int	apic_break_on_cpu = 9;
int	apic_stretch_interrupts = 0;
int	apic_stretch_ISR = 1 << 3;	/* IPL of 3 matches nothing now */
#endif /* DEBUG */
608 
/*
 * generates an interprocessor interrupt to another CPU. Any changes made to
 * this routine must be accompanied by similar changes to
 * apic_common_send_ipi().
 *
 *	cpun	index into apic_cpus[] of the target CPU
 *	ipl	priority level; selects the vector via apic_resv_vector[]
 *
 * Interrupts are disabled on the calling CPU around the command write.
 */
void
apic_send_ipi(int cpun, int ipl)
{
	int vector;
	ulong_t flag;

	vector = apic_resv_vector[ipl];

	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));

	flag = intr_clear();

	/* NOTE(review): presumably waits out a still-pending IPI - confirm */
	APIC_AV_PENDING_SET();

	apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
	    vector);

	intr_restore(flag);
}
633 
/*
 * Send the apic_pir_vect vector to the CPU in slot cpun. Nothing is sent when
 * cpun is the calling CPU. Interrupts are disabled on the calling CPU for the
 * duration, which also keeps the self-check and the send on the same CPU.
 */
void
apic_send_pir_ipi(processorid_t cpun)
{
	const int vector = apic_pir_vect;
	ulong_t flag;

	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));

	flag = intr_clear();

	/* Self-IPI for inducing PIR makes no sense. */
	if ((cpun != psm_get_cpu_id())) {
		APIC_AV_PENDING_SET();
		apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
		    vector);
	}

	intr_restore(flag);
}
653 
/*
 * Return the vector that apic_send_pir_ipi() sends (apic_pir_vect).
 */
int
apic_get_pir_ipivect(void)
{
	return (apic_pir_vect);
}
659 
/*
 * Empty implementation; the cpun argument is ignored.
 */
/*ARGSUSED*/
void
apic_set_idlecpu(processorid_t cpun)
{
}
665 
/*
 * Empty implementation; the cpun argument is ignored.
 */
/*ARGSUSED*/
void
apic_unset_idlecpu(processorid_t cpun)
{
}
671 
672 
/*
 * Empty function. It is called from the body of the busy-wait loops in this
 * file (apic_cpu_send_SIPI(), apic_gettime() and apic_gethrtime()).
 */
void
apic_ret()
{
}
677 
678 /*
679  * If apic_coarse_time == 1, then apic_gettime() is used instead of
680  * apic_gethrtime().  This is used for performance instead of accuracy.
681  */
682 
683 hrtime_t
684 apic_gettime()
685 {
686 	int old_hrtime_stamp;
687 	hrtime_t temp;
688 
689 	/*
690 	 * In one-shot mode, we do not keep time, so if anyone
691 	 * calls psm_gettime() directly, we vector over to
692 	 * gethrtime().
693 	 * one-shot mode MUST NOT be enabled if this psm is the source of
694 	 * hrtime.
695 	 */
696 
697 	if (apic_oneshot)
698 		return (gethrtime());
699 
700 
701 gettime_again:
702 	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
703 		apic_ret();
704 
705 	temp = apic_nsec_since_boot;
706 
707 	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
708 		goto gettime_again;
709 	}
710 	return (temp);
711 }
712 
/*
 * Here we return the number of nanoseconds since booting.  Note every
 * clock interrupt increments apic_nsec_since_boot by the appropriate
 * amount.
 *
 * The value is apic_nsec_since_boot plus the time elapsed in the current
 * timer period, derived from the current-count register of CPU 0's local
 * APIC (read remotely when running on another CPU). The result never goes
 * below the previously returned value. The whole computation runs with
 * interrupts disabled and apic_gethrtime_lock held.
 */
hrtime_t
apic_gethrtime(void)
{
	int curr_timeval, countval, elapsed_ticks;
	int old_hrtime_stamp, status;
	hrtime_t temp;
	uint32_t cpun;
	ulong_t oflags;

	/*
	 * In one-shot mode, we do not keep time, so if anyone
	 * calls psm_gethrtime() directly, we vector over to
	 * gethrtime().
	 * one-shot mode MUST NOT be enabled if this psm is the source of
	 * hrtime.
	 */

	if (apic_oneshot)
		return (gethrtime());

	oflags = intr_clear();	/* prevent migration */

	/* local APIC id of this CPU; xAPIC mode keeps it in the high bits */
	cpun = apic_reg_ops->apic_read(APIC_LID_REG);
	if (apic_mode == LOCAL_APIC)
		cpun >>= APIC_ID_BIT_OFFSET;

	lock_set(&apic_gethrtime_lock);

gethrtime_again:
	/* an odd stamp means apic_nsec_since_boot is being updated: wait */
	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
		apic_ret();

	/*
	 * Check to see which CPU we are on.  Note the time is kept on
	 * the local APIC of CPU 0.  If on CPU 0, simply read the current
	 * counter.  If on another CPU, issue a remote read command to CPU 0.
	 */
	if (cpun == apic_cpus[0].aci_local_id) {
		countval = apic_reg_ops->apic_read(APIC_CURR_COUNT);
	} else {
#ifdef	DEBUG
		APIC_AV_PENDING_SET();
#else
		if (apic_mode == LOCAL_APIC)
			APIC_AV_PENDING_SET();
#endif /* DEBUG */

		apic_reg_ops->apic_write_int_cmd(
		    apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE);

		/* spin until the remote read is no longer pending */
		while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1))
		    & AV_READ_PENDING) {
			apic_ret();
		}

		if (status & AV_REMOTE_STATUS)	/* 1 = valid */
			countval = apic_reg_ops->apic_read(APIC_REMOTE_READ);
		else {	/* 0 = invalid */
			apic_remote_hrterr++;
			/*
			 * return last hrtime right now, will need more
			 * testing if change to retry
			 */
			temp = apic_last_hrtime;

			lock_clear(&apic_gethrtime_lock);

			intr_restore(oflags);

			return (temp);
		}
	}
	/*
	 * A count above the last one read is replaced by 0; otherwise it
	 * becomes the new last_count_read.
	 */
	if (countval > last_count_read)
		countval = 0;
	else
		last_count_read = countval;

	elapsed_ticks = apic_hertz_count - countval;

	curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks);
	temp = apic_nsec_since_boot + curr_timeval;

	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
		/* we might have clobbered last_count_read. Restore it */
		last_count_read = apic_hertz_count;
		goto gethrtime_again;
	}

	/* never hand out a value below the previous one */
	if (temp < apic_last_hrtime) {
		/* return last hrtime if error occurs */
		apic_hrtime_error++;
		temp = apic_last_hrtime;
	}
	else
		apic_last_hrtime = temp;

	lock_clear(&apic_gethrtime_lock);
	intr_restore(oflags);

	return (temp);
}
819 
820 /* apic NMI handler */
821 /*ARGSUSED*/
822 void
823 apic_nmi_intr(caddr_t arg, struct regs *rp)
824 {
825 	if (apic_shutdown_processors) {
826 		apic_disable_local_apic();
827 		return;
828 	}
829 
830 	apic_error |= APIC_ERR_NMI;
831 
832 	if (!lock_try(&apic_nmi_lock))
833 		return;
834 	apic_num_nmis++;
835 
836 	if (apic_kmdb_on_nmi && psm_debugger()) {
837 		debug_enter("NMI received: entering kmdb\n");
838 	} else if (apic_panic_on_nmi) {
839 		/* Keep panic from entering kmdb. */
840 		nopanicdebug = 1;
841 		panic("NMI received\n");
842 	} else {
843 		/*
844 		 * prom_printf is the best shot we have of something which is
845 		 * problem free from high level/NMI type of interrupts
846 		 */
847 		prom_printf("NMI received\n");
848 	}
849 
850 	lock_clear(&apic_nmi_lock);
851 }
852 
853 processorid_t
854 apic_get_next_processorid(processorid_t cpu_id)
855 {
856 
857 	int i;
858 
859 	if (cpu_id == -1)
860 		return ((processorid_t)0);
861 
862 	for (i = cpu_id + 1; i < NCPU; i++) {
863 		if (apic_cpu_in_range(i))
864 			return (i);
865 	}
866 
867 	return ((processorid_t)-1);
868 }
869 
870 int
871 apic_cpu_add(psm_cpu_request_t *reqp)
872 {
873 	int i, rv = 0;
874 	ulong_t iflag;
875 	boolean_t first = B_TRUE;
876 	uchar_t localver = 0;
877 	uint32_t localid, procid;
878 	processorid_t cpuid = (processorid_t)-1;
879 	mach_cpu_add_arg_t *ap;
880 
881 	ASSERT(reqp != NULL);
882 	reqp->req.cpu_add.cpuid = (processorid_t)-1;
883 
884 	/* Check whether CPU hotplug is supported. */
885 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
886 		return (ENOTSUP);
887 	}
888 
889 	ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp;
890 	switch (ap->type) {
891 	case MACH_CPU_ARG_LOCAL_APIC:
892 		localid = ap->arg.apic.apic_id;
893 		procid = ap->arg.apic.proc_id;
894 		if (localid >= 255 || procid > 255) {
895 			cmn_err(CE_WARN,
896 			    "!apic: apicid(%u) or procid(%u) is invalid.",
897 			    localid, procid);
898 			return (EINVAL);
899 		}
900 		break;
901 
902 	case MACH_CPU_ARG_LOCAL_X2APIC:
903 		localid = ap->arg.apic.apic_id;
904 		procid = ap->arg.apic.proc_id;
905 		if (localid >= UINT32_MAX) {
906 			cmn_err(CE_WARN,
907 			    "!apic: x2apicid(%u) is invalid.", localid);
908 			return (EINVAL);
909 		} else if (localid >= 255 && apic_mode == LOCAL_APIC) {
910 			cmn_err(CE_WARN, "!apic: system is in APIC mode, "
911 			    "can't support x2APIC processor.");
912 			return (ENOTSUP);
913 		}
914 		break;
915 
916 	default:
917 		cmn_err(CE_WARN,
918 		    "!apic: unknown argument type %d to apic_cpu_add().",
919 		    ap->type);
920 		return (EINVAL);
921 	}
922 
923 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
924 	iflag = intr_clear();
925 	lock_set(&apic_ioapic_lock);
926 
927 	/* Check whether local APIC id already exists. */
928 	for (i = 0; i < apic_nproc; i++) {
929 		if (!CPU_IN_SET(apic_cpumask, i))
930 			continue;
931 		if (apic_cpus[i].aci_local_id == localid) {
932 			lock_clear(&apic_ioapic_lock);
933 			intr_restore(iflag);
934 			cmn_err(CE_WARN,
935 			    "!apic: local apic id %u already exists.",
936 			    localid);
937 			return (EEXIST);
938 		} else if (apic_cpus[i].aci_processor_id == procid) {
939 			lock_clear(&apic_ioapic_lock);
940 			intr_restore(iflag);
941 			cmn_err(CE_WARN,
942 			    "!apic: processor id %u already exists.",
943 			    (int)procid);
944 			return (EEXIST);
945 		}
946 
947 		/*
948 		 * There's no local APIC version number available in MADT table,
949 		 * so assume that all CPUs are homogeneous and use local APIC
950 		 * version number of the first existing CPU.
951 		 */
952 		if (first) {
953 			first = B_FALSE;
954 			localver = apic_cpus[i].aci_local_ver;
955 		}
956 	}
957 	ASSERT(first == B_FALSE);
958 
959 	/*
960 	 * Try to assign the same cpuid if APIC id exists in the dirty cache.
961 	 */
962 	for (i = 0; i < apic_max_nproc; i++) {
963 		if (CPU_IN_SET(apic_cpumask, i)) {
964 			ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0);
965 			continue;
966 		}
967 		ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE);
968 		if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) &&
969 		    apic_cpus[i].aci_local_id == localid &&
970 		    apic_cpus[i].aci_processor_id == procid) {
971 			cpuid = i;
972 			break;
973 		}
974 	}
975 
976 	/* Avoid the dirty cache and allocate fresh slot if possible. */
977 	if (cpuid == (processorid_t)-1) {
978 		for (i = 0; i < apic_max_nproc; i++) {
979 			if ((apic_cpus[i].aci_status & APIC_CPU_FREE) &&
980 			    (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) {
981 				cpuid = i;
982 				break;
983 			}
984 		}
985 	}
986 
987 	/* Try to find any free slot as last resort. */
988 	if (cpuid == (processorid_t)-1) {
989 		for (i = 0; i < apic_max_nproc; i++) {
990 			if (apic_cpus[i].aci_status & APIC_CPU_FREE) {
991 				cpuid = i;
992 				break;
993 			}
994 		}
995 	}
996 
997 	if (cpuid == (processorid_t)-1) {
998 		lock_clear(&apic_ioapic_lock);
999 		intr_restore(iflag);
1000 		cmn_err(CE_NOTE,
1001 		    "!apic: failed to allocate cpu id for processor %u.",
1002 		    procid);
1003 		rv = EAGAIN;
1004 	} else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) {
1005 		lock_clear(&apic_ioapic_lock);
1006 		intr_restore(iflag);
1007 		cmn_err(CE_NOTE,
1008 		    "!apic: failed to build mapping for processor %u.",
1009 		    procid);
1010 		rv = EBUSY;
1011 	} else {
1012 		ASSERT(cpuid >= 0 && cpuid < NCPU);
1013 		ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus);
1014 		bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0]));
1015 		apic_cpus[cpuid].aci_processor_id = procid;
1016 		apic_cpus[cpuid].aci_local_id = localid;
1017 		apic_cpus[cpuid].aci_local_ver = localver;
1018 		CPUSET_ATOMIC_ADD(apic_cpumask, cpuid);
1019 		if (cpuid >= apic_nproc) {
1020 			apic_nproc = cpuid + 1;
1021 		}
1022 		lock_clear(&apic_ioapic_lock);
1023 		intr_restore(iflag);
1024 		reqp->req.cpu_add.cpuid = cpuid;
1025 	}
1026 
1027 	return (rv);
1028 }
1029 
1030 int
1031 apic_cpu_remove(psm_cpu_request_t *reqp)
1032 {
1033 	int i;
1034 	ulong_t iflag;
1035 	processorid_t cpuid;
1036 
1037 	/* Check whether CPU hotplug is supported. */
1038 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
1039 		return (ENOTSUP);
1040 	}
1041 
1042 	cpuid = reqp->req.cpu_remove.cpuid;
1043 
1044 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
1045 	iflag = intr_clear();
1046 	lock_set(&apic_ioapic_lock);
1047 
1048 	if (!apic_cpu_in_range(cpuid)) {
1049 		lock_clear(&apic_ioapic_lock);
1050 		intr_restore(iflag);
1051 		cmn_err(CE_WARN,
1052 		    "!apic: cpuid %d doesn't exist in apic_cpus array.",
1053 		    cpuid);
1054 		return (ENODEV);
1055 	}
1056 	ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0);
1057 
1058 	if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) {
1059 		lock_clear(&apic_ioapic_lock);
1060 		intr_restore(iflag);
1061 		return (ENOENT);
1062 	}
1063 
1064 	if (cpuid == apic_nproc - 1) {
1065 		/*
1066 		 * We are removing the highest numbered cpuid so we need to
1067 		 * find the next highest cpuid as the new value for apic_nproc.
1068 		 */
1069 		for (i = apic_nproc; i > 0; i--) {
1070 			if (CPU_IN_SET(apic_cpumask, i - 1)) {
1071 				apic_nproc = i;
1072 				break;
1073 			}
1074 		}
1075 		/* at least one CPU left */
1076 		ASSERT(i > 0);
1077 	}
1078 	CPUSET_ATOMIC_DEL(apic_cpumask, cpuid);
1079 	/* mark slot as free and keep it in the dirty cache */
1080 	apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY;
1081 
1082 	lock_clear(&apic_ioapic_lock);
1083 	intr_restore(iflag);
1084 
1085 	return (0);
1086 }
1087 
1088 /*
1089  * Return the number of ticks the APIC decrements in SF nanoseconds.
1090  * The fixed-frequency PIT (aka 8254) is used for the measurement.
1091  */
1092 static uint64_t
1093 apic_calibrate_impl()
1094 {
1095 	uint8_t		pit_tick_lo;
1096 	uint16_t	pit_tick, target_pit_tick, pit_ticks_adj;
1097 	uint32_t	pit_ticks;
1098 	uint32_t	start_apic_tick, end_apic_tick, apic_ticks;
1099 	ulong_t		iflag;
1100 
1101 	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
1102 	apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);
1103 
1104 	iflag = intr_clear();
1105 
1106 	do {
1107 		pit_tick_lo = inb(PITCTR0_PORT);
1108 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1109 	} while (pit_tick < APIC_TIME_MIN ||
1110 	    pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);
1111 
1112 	/*
1113 	 * Wait for the PIT to decrement by 5 ticks to ensure
1114 	 * we didn't start in the middle of a tick.
1115 	 * Compare with 0x10 for the wrap around case.
1116 	 */
1117 	target_pit_tick = pit_tick - 5;
1118 	do {
1119 		pit_tick_lo = inb(PITCTR0_PORT);
1120 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1121 	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
1122 
1123 	start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
1124 
1125 	/*
1126 	 * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
1127 	 */
1128 	target_pit_tick = pit_tick - APIC_TIME_COUNT;
1129 	do {
1130 		pit_tick_lo = inb(PITCTR0_PORT);
1131 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1132 	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
1133 
1134 	end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
1135 
1136 	intr_restore(iflag);
1137 
1138 	apic_ticks = start_apic_tick - end_apic_tick;
1139 
1140 	/* The PIT might have decremented by more ticks than planned */
1141 	pit_ticks_adj = target_pit_tick - pit_tick;
1142 	/* total number of PIT ticks corresponding to apic_ticks */
1143 	pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;
1144 
1145 	/*
1146 	 * Determine the number of nanoseconds per APIC clock tick
1147 	 * and then determine how many APIC ticks to interrupt at the
1148 	 * desired frequency
1149 	 * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
1150 	 * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
1151 	 * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
1152 	 * apic_ticks_per_SFns =
1153 	 * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
1154 	 */
1155 	return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
1156 }
1157 
1158 /*
1159  * It was found empirically that 5 measurements seem sufficient to give a good
1160  * accuracy. Most spurious measurements are higher than the target value thus
1161  * we eliminate up to 2/5 spurious measurements.
1162  */
1163 #define	APIC_CALIBRATE_MEASUREMENTS		5
1164 
1165 #define	APIC_CALIBRATE_PERCENT_OFF_WARNING	10
1166 
1167 /*
1168  * Return the number of ticks the APIC decrements in SF nanoseconds.
1169  * Several measurements are taken to filter out outliers.
1170  */
1171 uint64_t
1172 apic_calibrate()
1173 {
1174 	uint64_t	measurements[APIC_CALIBRATE_MEASUREMENTS];
1175 	int		median_idx;
1176 	uint64_t	median;
1177 
1178 	/*
1179 	 * When running under a virtual machine, the emulated PIT and APIC
1180 	 * counters do not always return the right values and can roll over.
1181 	 * Those spurious measurements are relatively rare but could
1182 	 * significantly affect the calibration.
1183 	 * Therefore we take several measurements and then keep the median.
1184 	 * The median is preferred to the average here as we only want to
1185 	 * discard outliers.
1186 	 */
1187 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++)
1188 		measurements[i] = apic_calibrate_impl();
1189 
1190 	/*
1191 	 * sort results and retrieve median.
1192 	 */
1193 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
1194 		for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
1195 			if (measurements[j] < measurements[i]) {
1196 				uint64_t tmp = measurements[i];
1197 				measurements[i] = measurements[j];
1198 				measurements[j] = tmp;
1199 			}
1200 		}
1201 	}
1202 	median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
1203 	median = measurements[median_idx];
1204 
1205 #if (APIC_CALIBRATE_MEASUREMENTS >= 3)
1206 	/*
1207 	 * Check that measurements are consistent. Post a warning
1208 	 * if the three middle values are not close to each other.
1209 	 */
1210 	uint64_t delta_warn = median *
1211 	    APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
1212 	if ((median - measurements[median_idx - 1]) > delta_warn ||
1213 	    (measurements[median_idx + 1] - median) > delta_warn) {
1214 		cmn_err(CE_WARN, "apic_calibrate measurements lack "
1215 		    "precision: %llu, %llu, %llu.",
1216 		    (u_longlong_t)measurements[median_idx - 1],
1217 		    (u_longlong_t)median,
1218 		    (u_longlong_t)measurements[median_idx + 1]);
1219 	}
1220 #endif
1221 
1222 	return (median);
1223 }
1224 
1225 /*
1226  * Initialise the APIC timer on the local APIC of CPU 0 to the desired
1227  * frequency.  Note at this stage in the boot sequence, the boot processor
1228  * is the only active processor.
1229  * hertz value of 0 indicates a one-shot mode request.  In this case
1230  * the function returns the resolution (in nanoseconds) for the hardware
1231  * timer interrupt.  If one-shot mode capability is not available,
1232  * the return value will be 0. apic_enable_oneshot is a global switch
1233  * for disabling the functionality.
1234  * A non-zero positive value for hertz indicates a periodic mode request.
1235  * In this case the hardware will be programmed to generate clock interrupts
1236  * at hertz frequency and returns the resolution of interrupts in
1237  * nanosecond.
1238  */
1239 
1240 int
1241 apic_clkinit(int hertz)
1242 {
1243 	int		ret;
1244 
1245 	apic_int_busy_mark = (apic_int_busy_mark *
1246 	    apic_sample_factor_redistribution) / 100;
1247 	apic_int_free_mark = (apic_int_free_mark *
1248 	    apic_sample_factor_redistribution) / 100;
1249 	apic_diff_for_redistribution = (apic_diff_for_redistribution *
1250 	    apic_sample_factor_redistribution) / 100;
1251 
1252 	ret = apic_timer_init(hertz);
1253 	return (ret);
1254 
1255 }
1256 
1257 /*
1258  * apic_preshutdown:
1259  * Called early in shutdown whilst we can still access filesystems to do
1260  * things like loading modules which will be required to complete shutdown
1261  * after filesystems are all unmounted.
1262  */
1263 void
1264 apic_preshutdown(int cmd, int fcn)
1265 {
1266 	APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n",
1267 	    cmd, fcn, apic_poweroff_method, apic_enable_acpi));
1268 }
1269 
/*
 * apic_shutdown:
 * Quiesce the interrupt hardware for a shutdown/reboot and, for the
 * cmd == A_SHUTDOWN, fcn == AD_POWEROFF case, attempt to power the machine
 * off using the method selected by apic_poweroff_method.
 *
 * Always performed: hpet_acpi_fini() is called, then with interrupts
 * disabled apic_shutdown_processors is set, an NMI is sent to every other
 * CPU, the CMOS shutdown byte is zeroed if apic_cmos_ssb_set, I/O APIC
 * redirection is disabled, the IMCR is switched to PIC if apic_imcrp, and
 * the local APIC is disabled.
 *
 * cmd: only A_SHUTDOWN proceeds past the common part above.
 * fcn: AD_FASTREBOOT additionally sends a reset IPI to all other CPUs;
 *      AD_POWEROFF runs the power-off sequence; ACPI is disabled for any
 *      value other than these two when apic_enable_acpi is set.
 *
 * The function returns in every case; when a power-off was attempted it
 * returns only after waiting seven seconds for the power to go off.
 */
1270 void
1271 apic_shutdown(int cmd, int fcn)
1272 {
1273 	int restarts, attempts;
1274 	int i;
1275 	uchar_t	byte;
1276 	ulong_t iflag;
1277 
1278 	hpet_acpi_fini();
1279 
1280 	/* Send NMI to all CPUs except self to do per processor shutdown */
1281 	iflag = intr_clear();
	/*
	 * DEBUG kernels always run APIC_AV_PENDING_SET(); non-DEBUG kernels
	 * only do so when apic_mode is LOCAL_APIC.
	 */
1282 #ifdef	DEBUG
1283 	APIC_AV_PENDING_SET();
1284 #else
1285 	if (apic_mode == LOCAL_APIC)
1286 		APIC_AV_PENDING_SET();
1287 #endif /* DEBUG */
1288 	apic_shutdown_processors = 1;
1289 	apic_reg_ops->apic_write(APIC_INT_CMD1,
1290 	    AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF);
1291 
1292 	/* restore cmos shutdown byte before reboot */
1293 	if (apic_cmos_ssb_set) {
1294 		outb(CMOS_ADDR, SSB);
1295 		outb(CMOS_DATA, 0);
1296 	}
1297 
1298 	ioapic_disable_redirection();
1299 
1300 	/*	disable apic mode if imcr present	*/
1301 	if (apic_imcrp) {
1302 		outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
1303 		outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC);
1304 	}
1305 
1306 	apic_disable_local_apic();
1307 
1308 	intr_restore(iflag);
1309 
1310 	/* remainder of function is for shutdown cases only */
1311 	if (cmd != A_SHUTDOWN)
1312 		return;
1313 
1314 	/*
1315 	 * Switch system back into Legacy-Mode if using ACPI and
1316 	 * not powering-off.  Some BIOSes need to remain in ACPI-mode
1317 	 * for power-off to succeed (Dell Dimension 4600)
1318 	 * Do not disable ACPI while doing fastreboot
1319 	 */
1320 	if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT)
1321 		(void) AcpiDisable();
1322 
	/* fast reboot: send a reset IPI to every CPU except this one */
1323 	if (fcn == AD_FASTREBOOT) {
1324 		apic_reg_ops->apic_write(APIC_INT_CMD1,
1325 		    AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF);
1326 	}
1327 
1328 	/* remainder of function is for shutdown+poweroff case only */
1329 	if (fcn != AD_POWEROFF)
1330 		return;
1331 
	/*
	 * There is no default case: any method value not listed below falls
	 * straight through to the seven second wait after the switch.
	 */
1332 	switch (apic_poweroff_method) {
1333 		case APIC_POWEROFF_VIA_RTC:
1334 
1335 			/* select the extended NVRAM bank in the RTC */
1336 			outb(CMOS_ADDR, RTC_REGA);
1337 			byte = inb(CMOS_DATA);
1338 			outb(CMOS_DATA, (byte | EXT_BANK));
1339 
1340 			outb(CMOS_ADDR, PFR_REG);
1341 
1342 			/* for Predator must toggle the PAB bit */
1343 			byte = inb(CMOS_DATA);
1344 
1345 			/*
1346 			 * clear power active bar, wakeup alarm and
1347 			 * kickstart
1348 			 */
1349 			byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG);
1350 			outb(CMOS_DATA, byte);
1351 
1352 			/* delay before next write */
1353 			drv_usecwait(1000);
1354 
1355 			/* for S40 the following would suffice */
1356 			byte = inb(CMOS_DATA);
1357 
1358 			/* power active bar control bit */
1359 			byte |= PAB_CBIT;
1360 			outb(CMOS_DATA, byte);
1361 
1362 			break;
1363 
1364 		case APIC_POWEROFF_VIA_ASPEN_BMC:
			/*
			 * Retry protocol: every step first polls the flag
			 * register until MISMIC_BUSY_MASK reads clear,
			 * waiting 1000us after each busy read.  The fourth
			 * busy read within one poll restarts the whole
			 * sequence.  The third arrival at the restart label
			 * gives up, so at most two passes are attempted.
			 */
1365 			restarts = 0;
1366 restart_aspen_bmc:
1367 			if (++restarts == 3)
1368 				break;
1369 			attempts = 0;
1370 			do {
1371 				byte = inb(MISMIC_FLAG_REGISTER);
1372 				byte &= MISMIC_BUSY_MASK;
1373 				if (byte != 0) {
1374 					drv_usecwait(1000);
1375 					if (attempts >= 3)
1376 						goto restart_aspen_bmc;
1377 					++attempts;
1378 				}
1379 			} while (byte != 0);
			/* issue CC_SMS_GET_STATUS, then set bit 0 of the flag register */
1380 			outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS);
1381 			byte = inb(MISMIC_FLAG_REGISTER);
1382 			byte |= 0x1;
1383 			outb(MISMIC_FLAG_REGISTER, byte);
			/* send each (cntl, data) pair of the aspen_bmc table in order */
1384 			i = 0;
1385 			for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0]));
1386 			    i++) {
1387 				attempts = 0;
1388 				do {
1389 					byte = inb(MISMIC_FLAG_REGISTER);
1390 					byte &= MISMIC_BUSY_MASK;
1391 					if (byte != 0) {
1392 						drv_usecwait(1000);
1393 						if (attempts >= 3)
1394 							goto restart_aspen_bmc;
1395 						++attempts;
1396 					}
1397 				} while (byte != 0);
1398 				outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl);
1399 				outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data);
1400 				byte = inb(MISMIC_FLAG_REGISTER);
1401 				byte |= 0x1;
1402 				outb(MISMIC_FLAG_REGISTER, byte);
1403 			}
1404 			break;
1405 
1406 		case APIC_POWEROFF_VIA_SITKA_BMC:
			/*
			 * Same retry protocol as the Aspen case: at most two
			 * passes, and a poll restarts the sequence on its
			 * fourth not-ready read.  The initial poll waits
			 * while the status state is SMS_READ_STATE or
			 * SMS_WRITE_STATE; the per-entry polls wait while
			 * SMS_IBF_MASK is set.
			 */
1407 			restarts = 0;
1408 restart_sitka_bmc:
1409 			if (++restarts == 3)
1410 				break;
1411 			attempts = 0;
1412 			do {
1413 				byte = inb(SMS_STATUS_REGISTER);
1414 				byte &= SMS_STATE_MASK;
1415 				if ((byte == SMS_READ_STATE) ||
1416 				    (byte == SMS_WRITE_STATE)) {
1417 					drv_usecwait(1000);
1418 					if (attempts >= 3)
1419 						goto restart_sitka_bmc;
1420 					++attempts;
1421 				}
1422 			} while ((byte == SMS_READ_STATE) ||
1423 			    (byte == SMS_WRITE_STATE));
1424 			outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS);
			/* send each (port, data) pair of the sitka_bmc table in order */
1425 			i = 0;
1426 			for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0]));
1427 			    i++) {
1428 				attempts = 0;
1429 				do {
1430 					byte = inb(SMS_STATUS_REGISTER);
1431 					byte &= SMS_IBF_MASK;
1432 					if (byte != 0) {
1433 						drv_usecwait(1000);
1434 						if (attempts >= 3)
1435 							goto restart_sitka_bmc;
1436 						++attempts;
1437 					}
1438 				} while (byte != 0);
1439 				outb(sitka_bmc[i].port, sitka_bmc[i].data);
1440 			}
1441 			break;
1442 
1443 		case APIC_POWEROFF_NONE:
1444 
1445 			/* If no APIC direct method, we will try using ACPI */
			/*
			 * Return at once when ACPI is not enabled or when
			 * acpi_poweroff() returns 1; any other return value
			 * falls through to the wait below.
			 */
1446 			if (apic_enable_acpi) {
1447 				if (acpi_poweroff() == 1)
1448 					return;
1449 			} else
1450 				return;
1451 
1452 			break;
1453 	}
1454 	/*
1455 	 * Wait a limited time here for power to go off.
1456 	 * If the power does not go off, then there was a
1457 	 * problem and we should continue to the halt which
1458 	 * prints a message for the user to press a key to
1459 	 * reboot.
1460 	 */
1461 	drv_usecwait(7000000); /* wait seven seconds */
1462 
1463 }
1464 
/*
 * Cyclic identifier kept at file scope.  Nothing in this part of the file
 * reads or writes it.
 * NOTE(review): presumably the handle of a cyclic installed by the APIC
 * code elsewhere -- confirm against its users.
 */
1465 cyclic_id_t apic_cyclic_id;
1466 
1467 /*
1468  * The following functions are in the platform specific file so that they
1469  * can be different functions depending on whether we are running on
1470  * bare metal or a hypervisor.
1471  */
1472 
/*
 * Map an APIC for memory-mapped access.
 *
 * addr, len and flags are handed unchanged to psm_map_phys(); whatever it
 * returns is given back to the caller as a pointer to 32-bit registers.
 */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
	uint32_t *regs = (void *)psm_map_phys(addr, len, flags);

	return (regs);
}
1481 
/*
 * Map an I/O APIC for memory-mapped access.  This simply defers to
 * mapin_apic() with the same arguments.
 */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	uint32_t *regs = mapin_apic(addr, len, flags);

	return (regs);
}
1487 
1488 /*
1489  * unmap an apic
1490  */
1491 void
1492 mapout_apic(caddr_t addr, size_t len)
1493 {
1494 	psm_unmap_phys(addr, len);
1495 }
1496 
1497 void
1498 mapout_ioapic(caddr_t addr, size_t len)
1499 {
1500 	mapout_apic(addr, len);
1501 }
1502 
1503 uint32_t
1504 ioapic_read(int ioapic_ix, uint32_t reg)
1505 {
1506 	volatile uint32_t *ioapic;
1507 
1508 	ioapic = apicioadr[ioapic_ix];
1509 	ioapic[APIC_IO_REG] = reg;
1510 	return (ioapic[APIC_IO_DATA]);
1511 }
1512 
1513 void
1514 ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value)
1515 {
1516 	volatile uint32_t *ioapic;
1517 
1518 	ioapic = apicioadr[ioapic_ix];
1519 	ioapic[APIC_IO_REG] = reg;
1520 	ioapic[APIC_IO_DATA] = value;
1521 }
1522 
1523 void
1524 ioapic_write_eoi(int ioapic_ix, uint32_t value)
1525 {
1526 	volatile uint32_t *ioapic;
1527 
1528 	ioapic = apicioadr[ioapic_ix];
1529 	ioapic[APIC_IO_EOI] = value;
1530 }
1531 
1532 /*
1533  * Round-robin algorithm to find the next CPU with interrupts enabled.
1534  * It can't share the same static variable apic_next_bind_cpu with
1535  * apic_get_next_bind_cpu(), since that will cause all interrupts to be
1536  * bound to CPU1 at boot time.  During boot, only CPU0 is online with
1537  * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu()
1538  * are called.  However, the pcplusmp driver assumes that there will be
1539  * boot_ncpus CPUs configured eventually so it tries to distribute all
1540  * interrupts among CPU0 - CPU[boot_ncpus - 1].  Thus to prevent all
1541  * interrupts being targetted at CPU1, we need to use a dedicated static
1542  * variable for find_next_cpu() instead of sharing apic_next_bind_cpu.
1543  */
1544 
1545 processorid_t
1546 apic_find_cpu(int flag)
1547 {
1548 	int i;
1549 	static processorid_t acid = 0;
1550 
1551 	/* Find the first CPU with the passed-in flag set */
1552 	for (i = 0; i < apic_nproc; i++) {
1553 		if (++acid >= apic_nproc) {
1554 			acid = 0;
1555 		}
1556 		if (apic_cpu_in_range(acid) &&
1557 		    (apic_cpus[acid].aci_status & flag)) {
1558 			break;
1559 		}
1560 	}
1561 
1562 	ASSERT((apic_cpus[acid].aci_status & flag) != 0);
1563 	return (acid);
1564 }
1565 
1566 void
1567 apic_intrmap_init(int apic_mode)
1568 {
1569 	int suppress_brdcst_eoi = 0;
1570 
1571 	/*
1572 	 * Intel Software Developer's Manual 3A, 10.12.7:
1573 	 *
1574 	 * Routing of device interrupts to local APIC units operating in
1575 	 * x2APIC mode requires use of the interrupt-remapping architecture
1576 	 * specified in the Intel Virtualization Technology for Directed
1577 	 * I/O, Revision 1.3.  Because of this, BIOS must enumerate support
1578 	 * for and software must enable this interrupt remapping with
1579 	 * Extended Interrupt Mode Enabled before it enabling x2APIC mode in
1580 	 * the local APIC units.
1581 	 *
1582 	 *
1583 	 * In other words, to use the APIC in x2APIC mode, we need interrupt
1584 	 * remapping.  Since we don't start up the IOMMU by default, we
1585 	 * won't be able to do any interrupt remapping and therefore have to
1586 	 * use the APIC in traditional 'local APIC' mode with memory mapped
1587 	 * I/O.
1588 	 */
1589 
1590 	if (psm_vt_ops != NULL) {
1591 		if (((apic_intrmap_ops_t *)psm_vt_ops)->
1592 		    apic_intrmap_init(apic_mode) == DDI_SUCCESS) {
1593 
1594 			apic_vt_ops = psm_vt_ops;
1595 
1596 			/*
1597 			 * We leverage the interrupt remapping engine to
1598 			 * suppress broadcast EOI; thus we must send the
1599 			 * directed EOI with the directed-EOI handler.
1600 			 */
1601 			if (apic_directed_EOI_supported() == 0) {
1602 				suppress_brdcst_eoi = 1;
1603 			}
1604 
1605 			apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);
1606 
1607 			if (apic_detect_x2apic()) {
1608 				apic_enable_x2apic();
1609 			}
1610 
1611 			if (apic_directed_EOI_supported() == 0) {
1612 				apic_set_directed_EOI_handler();
1613 			}
1614 		}
1615 	}
1616 }
1617 
1618 /*ARGSUSED*/
1619 static void
1620 apic_record_ioapic_rdt(void *intrmap_private, ioapic_rdt_t *irdt)
1621 {
1622 	irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
1623 }
1624 
1625 /*ARGSUSED*/
1626 static void
1627 apic_record_msi(void *intrmap_private, msi_regs_t *mregs)
1628 {
1629 	mregs->mr_addr = MSI_ADDR_HDR |
1630 	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
1631 	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
1632 	    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
1633 	mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
1634 	    mregs->mr_data;
1635 }
1636 
1637 /*
1638  * Functions from apic_introp.c
1639  *
1640  * Those functions are used by apic_intr_ops().
1641  */
1642 
1643 /*
1644  * MSI support flag:
1645  * reflects whether MSI is supported at APIC level
1646  * it can also be patched through /etc/system
1647  *
1648  *  0 = default value - don't know and need to call apic_check_msi_support()
1649  *      to find out then set it accordingly
1650  *  1 = supported
1651  * -1 = not supported
1652  */
1653 int	apic_support_msi = 0;
1654 
1655 /* Multiple vector support for MSI-X */
/* Defaults to 1; the code that consults it is outside this part of the file. */
1656 int	apic_msix_enable = 1;
1657 
1658 /* Multiple vector support for MSI */
/* Defaults to 1; the code that consults it is outside this part of the file. */
1659 int	apic_multi_msi_enable = 1;
1660 
1661 /*
1662  * Check whether the system supports MSI.
1663  *
1664  * MSI is required for PCI-E and for PCI versions later than 2.2, so if we find
1665  * a PCI-E bus or we find a PCI bus whose version we know is >= 2.2, then we
1666  * return PSM_SUCCESS to indicate this system supports MSI.
1667  *
1668  * (Currently the only way we check whether a given PCI bus supports >= 2.2 is
1669  * by detecting if we are running inside the KVM hypervisor, which guarantees
1670  * this version number.)
1671  */
1672 int
1673 apic_check_msi_support()
1674 {
1675 	dev_info_t *cdip;
1676 	char dev_type[16];
1677 	int dev_len;
1678 	int hwenv = get_hwenv();
1679 
1680 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n"));
1681 
1682 	/*
1683 	 * check whether the first level children of root_node have
1684 	 * PCI-E or PCI capability.
1685 	 */
1686 	for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL;
1687 	    cdip = ddi_get_next_sibling(cdip)) {
1688 
1689 		DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p,"
1690 		    " driver: %s, binding: %s, nodename: %s\n", (void *)cdip,
1691 		    ddi_driver_name(cdip), ddi_binding_name(cdip),
1692 		    ddi_node_name(cdip)));
1693 		dev_len = sizeof (dev_type);
1694 		if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1695 		    "device_type", (caddr_t)dev_type, &dev_len)
1696 		    != DDI_PROP_SUCCESS)
1697 			continue;
1698 		if (strcmp(dev_type, "pciex") == 0)
1699 			return (PSM_SUCCESS);
1700 		if (strcmp(dev_type, "pci") == 0 &&
1701 		    (hwenv == HW_KVM || hwenv == HW_BHYVE))
1702 			return (PSM_SUCCESS);
1703 	}
1704 
1705 	/* MSI is not supported on this system */
1706 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' "
1707 	    "device_type found\n"));
1708 	return (PSM_FAILURE);
1709 }
1710 
1711 /*
1712  * apic_pci_msi_unconfigure:
1713  *
1714  * This and next two interfaces are copied from pci_intr_lib.c
1715  * Do ensure that these two files stay in sync.
1716  * These needed to be copied over here to avoid a deadlock situation on
1717  * certain mp systems that use MSI interrupts.
1718  *
1719  * IMPORTANT regards next three interfaces:
1720  * i) are called only for MSI/X interrupts.
1721  * ii) called with interrupts disabled, and must not block
1722  */
1723 void
1724 apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum)
1725 {
1726 	ushort_t		msi_ctrl;
1727 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1728 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1729 
1730 	ASSERT((handle != NULL) && (cap_ptr != 0));
1731 
1732 	if (type == DDI_INTR_TYPE_MSI) {
1733 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1734 		msi_ctrl &= (~PCI_MSI_MME_MASK);
1735 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1736 		pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0);
1737 
1738 		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1739 			pci_config_put16(handle,
1740 			    cap_ptr + PCI_MSI_64BIT_DATA, 0);
1741 			pci_config_put32(handle,
1742 			    cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0);
1743 		} else {
1744 			pci_config_put16(handle,
1745 			    cap_ptr + PCI_MSI_32BIT_DATA, 0);
1746 		}
1747 
1748 	} else if (type == DDI_INTR_TYPE_MSIX) {
1749 		uintptr_t	off;
1750 		uint32_t	mask;
1751 		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(rdip);
1752 
1753 		ASSERT(msix_p != NULL);
1754 
1755 		/* Offset into "inum"th entry in the MSI-X table & mask it */
1756 		off = (uintptr_t)msix_p->msix_tbl_addr + (inum *
1757 		    PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET;
1758 
1759 		mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off);
1760 
1761 		ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1));
1762 
1763 		/* Offset into the "inum"th entry in the MSI-X table */
1764 		off = (uintptr_t)msix_p->msix_tbl_addr +
1765 		    (inum * PCI_MSIX_VECTOR_SIZE);
1766 
1767 		/* Reset the "data" and "addr" bits */
1768 		ddi_put32(msix_p->msix_tbl_hdl,
1769 		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0);
1770 		ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0);
1771 	}
1772 }
1773 
1774 /*
1775  * apic_pci_msi_disable_mode:
1776  */
1777 void
1778 apic_pci_msi_disable_mode(dev_info_t *rdip, int type)
1779 {
1780 	ushort_t		msi_ctrl;
1781 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1782 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1783 
1784 	ASSERT((handle != NULL) && (cap_ptr != 0));
1785 
1786 	if (type == DDI_INTR_TYPE_MSI) {
1787 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1788 		if (!(msi_ctrl & PCI_MSI_ENABLE_BIT))
1789 			return;
1790 
1791 		msi_ctrl &= ~PCI_MSI_ENABLE_BIT;	/* MSI disable */
1792 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1793 
1794 	} else if (type == DDI_INTR_TYPE_MSIX) {
1795 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL);
1796 		if (msi_ctrl & PCI_MSIX_ENABLE_BIT) {
1797 			msi_ctrl &= ~PCI_MSIX_ENABLE_BIT;
1798 			pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL,
1799 			    msi_ctrl);
1800 		}
1801 	}
1802 }
1803 
1804 uint32_t
1805 apic_get_localapicid(uint32_t cpuid)
1806 {
1807 	ASSERT(cpuid < apic_nproc && apic_cpus != NULL);
1808 
1809 	return (apic_cpus[cpuid].aci_local_id);
1810 }
1811 
1812 uchar_t
1813 apic_get_ioapicid(uchar_t ioapicindex)
1814 {
1815 	ASSERT(ioapicindex < MAX_IO_APIC);
1816 
1817 	return (apic_io_id[ioapicindex]);
1818 }
1819