xref: /illumos-gate/usr/src/uts/i86pc/io/pcplusmp/apic_common.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright 2019, Joyent, Inc.
27  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
28  */
29 
30 /*
31  * PSMI 1.1 extensions are supported only in 2.6 and later versions.
32  * PSMI 1.2 extensions are supported only in 2.7 and later versions.
33  * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
34  * PSMI 1.5 extensions are supported in Solaris Nevada.
35  * PSMI 1.6 extensions are supported in Solaris Nevada.
36  * PSMI 1.7 extensions are supported in Solaris Nevada.
37  */
38 #define	PSMI_1_7
39 
40 #include <sys/processor.h>
41 #include <sys/time.h>
42 #include <sys/psm.h>
43 #include <sys/smp_impldefs.h>
44 #include <sys/cram.h>
45 #include <sys/acpi/acpi.h>
46 #include <sys/acpica.h>
47 #include <sys/psm_common.h>
48 #include <sys/apic.h>
49 #include <sys/pit.h>
50 #include <sys/ddi.h>
51 #include <sys/sunddi.h>
52 #include <sys/ddi_impldefs.h>
53 #include <sys/pci.h>
54 #include <sys/promif.h>
55 #include <sys/x86_archext.h>
56 #include <sys/cpc_impl.h>
57 #include <sys/uadmin.h>
58 #include <sys/panic.h>
59 #include <sys/debug.h>
60 #include <sys/archsystm.h>
61 #include <sys/trap.h>
62 #include <sys/machsystm.h>
63 #include <sys/sysmacros.h>
64 #include <sys/cpuvar.h>
65 #include <sys/rm_platter.h>
66 #include <sys/privregs.h>
67 #include <sys/note.h>
68 #include <sys/pci_intr_lib.h>
69 #include <sys/spl.h>
70 #include <sys/clock.h>
71 #include <sys/dditypes.h>
72 #include <sys/sunddi.h>
73 #include <sys/x_call.h>
74 #include <sys/reboot.h>
75 #include <sys/hpet.h>
76 #include <sys/apic_common.h>
77 #include <sys/apic_timer.h>
78 
/*
 * Default "record" hooks for interrupt remapping.  Both are installed in
 * apic_nointrmap_ops, the ops vector used when no interrupt remapping is
 * in effect.  They are static, so their definitions live in this file.
 */
static void	apic_record_ioapic_rdt(void *intrmap_private,
		    ioapic_rdt_t *irdt);
static void	apic_record_msi(void *intrmap_private, msi_regs_t *mregs);

/*
 * Common routines between pcplusmp & apix (taken from apic.c).
 */

int	apic_clkinit(int);
hrtime_t apic_gethrtime(void);
void	apic_send_ipi(int, int);
void	apic_set_idlecpu(processorid_t);
void	apic_unset_idlecpu(processorid_t);
void	apic_shutdown(int, int);
void	apic_preshutdown(int, int);
processorid_t	apic_get_next_processorid(processorid_t);

/* Coarse time source; see apic_coarse_hrtime. */
hrtime_t apic_gettime();
97 
/*
 * IOAPIC EOI handling method for apix.  apic_ioapic_method_probe() may
 * replace the initial value; if it is still APIC_MUL_IOAPIC_PCPLUSMP after
 * probing, apix is disabled in favour of pcplusmp.
 */
enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP;

/* Now the ones for Dynamic Interrupt distribution */
int	apic_enable_dynamic_migration = 0;

/* maximum loop count when sending Start IPIs. */
int apic_sipi_max_loop_count = 0x1000;

/*
 * These variables are frequently accessed in apic_intr_enter(),
 * apic_intr_exit and apic_setspl, so group them together
 */
volatile uint32_t *apicadr =  NULL;	/* virtual addr of local APIC	*/
int apic_setspl_delay = 1;		/* apic_setspl - delay enable	*/
int apic_clkvect;

/* vector at which error interrupts come in */
int apic_errvect;
int apic_enable_error_intr = 1;
/*
 * Number of tenmicrosec() delays apic_error_intr() inserts after printing
 * an error; it is doubled after each such print while it is below 500.
 */
int apic_error_display_delay = 100;

/* vector at which performance counter overflow interrupts come in */
int apic_cpcovf_vect;
int apic_enable_cpcovf_intr = 1;

/* vector at which CMCI interrupts come in */
int apic_cmci_vect;
extern void cmi_cmci_trap(void);

lock_t apic_mode_switch_lock;

/* vector sent by apic_send_pir_ipi(), returned by apic_get_pir_ipivect() */
int apic_pir_vect;

/*
 * Patchable global variables.
 */
int	apic_forceload = 0;

int	apic_coarse_hrtime = 1;		/* 0 - use accurate slow gethrtime() */

int	apic_flat_model = 0;		/* 0 - clustered. 1 - flat */
int	apic_panic_on_nmi = 0;
int	apic_panic_on_apic_error = 0;

int	apic_verbose = 0;	/* 0x1ff */

#ifdef DEBUG
int	apic_debug = 0;
int	apic_restrict_vector = 0;

int	apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE];
int	apic_debug_msgbufindex = 0;

#endif /* DEBUG */

uint_t apic_nticks = 0;
uint_t apic_skipped_redistribute = 0;

/*
 * State shared by apic_gettime() and apic_gethrtime().  Readers spin while
 * apic_hrtime_stamp is odd and retry if it changed while they were reading
 * apic_nsec_since_boot.  apic_gethrtime() additionally takes
 * apic_gethrtime_lock and tracks the last timer count it accepted in
 * last_count_read.
 */
uint_t last_count_read = 0;
lock_t	apic_gethrtime_lock;
volatile int	apic_hrtime_stamp = 0;
volatile hrtime_t apic_nsec_since_boot = 0;

/* last value handed out by apic_gethrtime(); results never go below it */
static	hrtime_t	apic_last_hrtime = 0;
/* times apic_gethrtime() computed a value below apic_last_hrtime */
int		apic_hrtime_error = 0;
/* times the remote read of the timer count came back invalid */
int		apic_remote_hrterr = 0;
/* NMIs processed by apic_nmi_intr() while holding apic_nmi_lock */
int		apic_num_nmis = 0;
/* OR of all error status values seen by apic_error_intr() */
int		apic_apic_error = 0;
int		apic_num_apic_errors = 0;
int		apic_num_cksum_errors = 0;

/* OR of APIC_ERR_* flags (APIC_ERR_NMI, APIC_ERR_APIC_ERROR) */
int	apic_error = 0;

/* set to 1 by apic_cpu_start() before it sends the startup IPIs */
static	int	apic_cmos_ssb_set = 0;

/* use to make sure only one cpu handles the nmi */
lock_t	apic_nmi_lock;
/* use to make sure only one cpu handles the error interrupt */
lock_t	apic_error_lock;
177 
/*
 * Sequence of (control, data) byte pairs describing two BMC requests: a
 * SET_WATCHDOG_TIMER command with its six data bytes, followed by a
 * RESET_WATCHDOG_TIMER command.  Each request begins with a
 * CC_SMS_WR_START entry and finishes with a CC_SMS_WR_END entry.
 * NOTE(review): the code that plays this table out is not in this part of
 * the file - confirm the consumer before changing any entry.
 */
static	struct {
	uchar_t	cntl;
	uchar_t	data;
} aspen_bmc[] = {
	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_NEXT,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ CC_SMS_WR_NEXT,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ CC_SMS_WR_NEXT,	0x2 },		/* DataByte 2: Power Down */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 3: no pre-timeout */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 4: timer expir. */
	{ CC_SMS_WR_NEXT,	0xa },		/* DataByte 5: init countdown */
	{ CC_SMS_WR_END,	0x0 },		/* DataByte 6: init countdown */

	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_END,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
194 
/*
 * Sequence of (port, data) writes describing the same two BMC requests as
 * aspen_bmc[] (SET_WATCHDOG_TIMER, then RESET_WATCHDOG_TIMER), expressed
 * as values for SMS_COMMAND_REGISTER and SMS_DATA_REGISTER.  Each request
 * opens with SMS_WRITE_START on the command register, and SMS_WRITE_END is
 * issued on the command register just before the final data byte.
 * NOTE(review): the code that plays this table out is not in this part of
 * the file - confirm the consumer before changing any entry.
 */
static	struct {
	int	port;
	uchar_t	data;
} sitka_bmc[] = {
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_DATA_REGISTER,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ SMS_DATA_REGISTER,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ SMS_DATA_REGISTER,	0x2 },		/* DataByte 2: Power Down */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 3: no pre-timeout */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 4: timer expir. */
	{ SMS_DATA_REGISTER,	0xa },		/* DataByte 5: init countdown */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 6: init countdown */

	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
215 
/* Patchable global variables. */
int		apic_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
/* value apic_calibrate_impl() writes to APIC_DIVIDE_REG */
uint32_t	apic_divide_reg_init = 0;	/* 0 - divide by 2 */

/*
 * default apic ops without interrupt remapping
 *
 * The first five hooks are all return_instr cast to the hook's signature
 * (NOTE(review): presumably a do-nothing stub - confirm); only the two
 * "record" hooks have real implementations in this file.
 */
static apic_intrmap_ops_t apic_nointrmap_ops = {
	(int (*)(int))return_instr,
	(void (*)(int))return_instr,
	(void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr,
	(void (*)(void *, void *, uint16_t, int))return_instr,
	(void (*)(void **))return_instr,
	apic_record_ioapic_rdt,
	apic_record_msi,
};

/* active interrupt-remapping ops; starts out as the no-remapping set */
apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
/* per-CPU APIC information, indexed by cpuid */
apic_cpus_info_t	*apic_cpus = NULL;
/* cpuids whose apic_cpus[] slot is in use (see apic_cpu_add/remove) */
cpuset_t	apic_cpumask;
uint_t		apic_picinit_called;

/*
 * Flag to indicate that we need to shut down all processors.  While it is
 * non-zero, apic_nmi_intr() only disables the local APIC and returns.
 */
static uint_t	apic_shutdown_processors;
238 
239 /*
240  * Probe the ioapic method for apix module. Called in apic_probe_common()
241  */
242 int
243 apic_ioapic_method_probe()
244 {
245 	if (apix_enable == 0)
246 		return (PSM_SUCCESS);
247 
248 	/*
249 	 * Set IOAPIC EOI handling method. The priority from low to high is:
250 	 *	1. IOxAPIC: with EOI register
251 	 *	2. IOMMU interrupt mapping
252 	 *	3. Mask-Before-EOI method for systems without boot
253 	 *	interrupt routing, such as systems with only one IOAPIC;
254 	 *	NVIDIA CK8-04/MCP55 systems; systems with bridge solution
255 	 *	which disables the boot interrupt routing already.
256 	 *	4. Directed EOI
257 	 */
258 	if (apic_io_ver[0] >= 0x20)
259 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC;
260 	if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max))
261 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK;
262 	if (apic_directed_EOI_supported())
263 		apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI;
264 
265 	/* fall back to pcplusmp */
266 	if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) {
267 		/* make sure apix is after pcplusmp in /etc/mach */
268 		apix_enable = 0; /* go ahead with pcplusmp install next */
269 		return (PSM_FAILURE);
270 	}
271 
272 	return (PSM_SUCCESS);
273 }
274 
275 /*
276  * handler for APIC Error interrupt. Just print a warning and continue
277  */
278 int
279 apic_error_intr()
280 {
281 	uint_t	error0, error1, error;
282 	uint_t	i;
283 
284 	/*
285 	 * We need to write before read as per 7.4.17 of system prog manual.
286 	 * We do both and or the results to be safe
287 	 */
288 	error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
289 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
290 	error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
291 	error = error0 | error1;
292 
293 	/*
294 	 * Clear the APIC error status (do this on all cpus that enter here)
295 	 * (two writes are required due to the semantics of accessing the
296 	 * error status register.)
297 	 */
298 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
299 	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
300 
301 	/*
302 	 * Prevent more than 1 CPU from handling error interrupt causing
303 	 * double printing (interleave of characters from multiple
304 	 * CPU's when using prom_printf)
305 	 */
306 	if (lock_try(&apic_error_lock) == 0)
307 		return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
308 	if (error) {
309 #if	DEBUG
310 		if (apic_debug)
311 			debug_enter("pcplusmp: APIC Error interrupt received");
312 #endif /* DEBUG */
313 		if (apic_panic_on_apic_error)
314 			cmn_err(CE_PANIC,
315 			    "APIC Error interrupt on CPU %d. Status = %x",
316 			    psm_get_cpu_id(), error);
317 		else {
318 			if ((error & ~APIC_CS_ERRORS) == 0) {
319 				/* cksum error only */
320 				apic_error |= APIC_ERR_APIC_ERROR;
321 				apic_apic_error |= error;
322 				apic_num_apic_errors++;
323 				apic_num_cksum_errors++;
324 			} else {
325 				/*
326 				 * prom_printf is the best shot we have of
327 				 * something which is problem free from
328 				 * high level/NMI type of interrupts
329 				 */
330 				prom_printf("APIC Error interrupt on CPU %d. "
331 				    "Status 0 = %x, Status 1 = %x\n",
332 				    psm_get_cpu_id(), error0, error1);
333 				apic_error |= APIC_ERR_APIC_ERROR;
334 				apic_apic_error |= error;
335 				apic_num_apic_errors++;
336 				for (i = 0; i < apic_error_display_delay; i++) {
337 					tenmicrosec();
338 				}
339 				/*
340 				 * provide more delay next time limited to
341 				 * roughly 1 clock tick time
342 				 */
343 				if (apic_error_display_delay < 500)
344 					apic_error_display_delay *= 2;
345 			}
346 		}
347 		lock_clear(&apic_error_lock);
348 		return (DDI_INTR_CLAIMED);
349 	} else {
350 		lock_clear(&apic_error_lock);
351 		return (DDI_INTR_UNCLAIMED);
352 	}
353 }
354 
/*
 * Turn off the mask bit in the performance counter Local Vector Table entry.
 *
 * The current APIC_PCINT_VECT value is read, APIC_LVT_MASK is cleared in
 * it, and the result is written back, so all other bits of the entry keep
 * the values that were just read.
 */
void
apic_cpcovf_mask_clear(void)
{
	apic_reg_ops->apic_write(APIC_PCINT_VECT,
	    (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK));
}
364 
/*
 * Cross-call handler handed to xc_call() by apic_cmci_setup(): programs
 * APIC_CMCI_VECT with apic_cmci_vect.  The three arguments are ignored and
 * the return value is always 0.
 */
/*ARGSUSED*/
static int
apic_cmci_enable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
	return (0);
}
372 
/*
 * Cross-call handler handed to xc_call() by apic_cmci_setup(): programs
 * APIC_CMCI_VECT with apic_cmci_vect or'ed with AV_MASK.  The three
 * arguments are ignored and the return value is always 0.
 */
/*ARGSUSED*/
static int
apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK);
	return (0);
}
380 
381 void
382 apic_cmci_setup(processorid_t cpuid, boolean_t enable)
383 {
384 	cpuset_t	cpu_set;
385 
386 	CPUSET_ONLY(cpu_set, cpuid);
387 
388 	if (enable) {
389 		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
390 		    (xc_func_t)apic_cmci_enable);
391 	} else {
392 		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
393 		    (xc_func_t)apic_cmci_disable);
394 	}
395 }
396 
/*
 * Quiesce the local APIC through apic_reg_ops: the task register is set
 * to APIC_MASK_ALL, then AV_MASK is written to the timer, both local
 * interrupt, error and performance counter vector registers, and finally
 * APIC_SPUR_INTR is written to the spurious interrupt register.
 *
 * Called from apic_nmi_intr() when apic_shutdown_processors is set.
 */
static void
apic_disable_local_apic(void)
{
	apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL);
	apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK);

	/* local intr reg 0 */
	apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK);

	/* disable NMI */
	apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK);

	/* and error interrupt */
	apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK);

	/* and perf counter intr */
	apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK);

	/*
	 * Only the spurious vector is written here.
	 * NOTE(review): presumably this leaves the APIC software-enable
	 * bit clear - confirm against the register layout.
	 */
	apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR);
}
417 
/*
 * Send the INIT / Startup IPI sequence to one CPU.
 *
 * cpun		index into apic_cpus[] of the target; its aci_local_id is
 *		used as the IPI destination
 * start	when B_TRUE, the CMOS shutdown status byte (SSB) is set to
 *		BIOS_SHUTDOWN before any IPI is sent; apic_cpu_stop() passes
 *		B_FALSE and skips that step
 *
 * The sequence is: INIT assert, bounded wait for the pending bit, INIT
 * deassert, 20ms delay and, for integrated APICs only, two Startup IPIs
 * each followed by a 200us delay.  Interrupts are disabled on the calling
 * CPU for the whole sequence.
 */
static void
apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
{
	int		loop_count;
	uint32_t	vector;
	uint_t		apicid;
	ulong_t		iflag;

	apicid =  apic_cpus[cpun].aci_local_id;

	/*
	 * Interrupts on current CPU will be disabled during the
	 * steps in order to avoid unwanted side effects from
	 * executing interrupt handlers on a problematic BIOS.
	 */
	iflag = intr_clear();

	if (start) {
		outb(CMOS_ADDR, SSB);
		outb(CMOS_DATA, BIOS_SHUTDOWN);
	}

	/*
	 * According to X2APIC specification in section '2.3.5.1' of
	 * Interrupt Command Register Semantics, the semantics of
	 * programming the Interrupt Command Register to dispatch an interrupt
	 * is simplified. A single MSR write to the 64-bit ICR is required
	 * for dispatching an interrupt. Specifically, with the 64-bit MSR
	 * interface to ICR, system software is not required to check the
	 * status of the delivery status bit prior to writing to the ICR
	 * to send an IPI. With the removal of the Delivery Status bit,
	 * system software no longer has a reason to read the ICR. It remains
	 * readable only to aid in debugging.
	 */
#ifdef	DEBUG
	APIC_AV_PENDING_SET();
#else
	if (apic_mode == LOCAL_APIC) {
		APIC_AV_PENDING_SET();
	}
#endif /* DEBUG */

	/* for integrated - make sure there is one INIT IPI in buffer */
	/* for external - it will wake up the cpu */
	apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);

	/*
	 * If only 1 CPU is installed, PENDING bit will not go low, so the
	 * wait is bounded by apic_sipi_max_loop_count.  The pending bit is
	 * only polled in LOCAL_APIC mode.
	 */
	for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
		if (apic_mode == LOCAL_APIC &&
		    apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
			apic_ret();
		else
			break;
	}

	apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
	drv_usecwait(20000);		/* 20 milli sec */

	if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
		/* integrated apic */

		/* the Startup IPI vector is the page number of rm_platter_pa */
		vector = (rm_platter_pa >> MMU_PAGESHIFT) &
		    (APIC_VECTOR_MASK | APIC_IPL_MASK);

		/* to offset the INIT IPI queue up in the buffer */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);		/* 200 micro sec */

		/*
		 * send the second SIPI (Startup IPI) as recommended by Intel
		 * software development manual.
		 */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);	/* 200 micro sec */
	}

	intr_restore(iflag);
}
496 
497 /*ARGSUSED1*/
498 int
499 apic_cpu_start(processorid_t cpun, caddr_t arg)
500 {
501 	ASSERT(MUTEX_HELD(&cpu_lock));
502 
503 	if (!apic_cpu_in_range(cpun)) {
504 		return (EINVAL);
505 	}
506 
507 	/*
508 	 * Switch to apic_common_send_ipi for safety during starting other CPUs.
509 	 */
510 	if (apic_mode == LOCAL_X2APIC) {
511 		apic_switch_ipi_callback(B_TRUE);
512 	}
513 
514 	apic_cmos_ssb_set = 1;
515 	apic_cpu_send_SIPI(cpun, B_TRUE);
516 
517 	return (0);
518 }
519 
520 /*
521  * Put CPU into halted state with interrupts disabled.
522  */
523 /*ARGSUSED1*/
524 int
525 apic_cpu_stop(processorid_t cpun, caddr_t arg)
526 {
527 	int		rc;
528 	cpu_t		*cp;
529 	extern cpuset_t cpu_ready_set;
530 	extern void cpu_idle_intercept_cpu(cpu_t *cp);
531 
532 	ASSERT(MUTEX_HELD(&cpu_lock));
533 
534 	if (!apic_cpu_in_range(cpun)) {
535 		return (EINVAL);
536 	}
537 	if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) {
538 		return (ENOTSUP);
539 	}
540 
541 	cp = cpu_get(cpun);
542 	ASSERT(cp != NULL);
543 	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
544 	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
545 	ASSERT((cp->cpu_flags & CPU_ENABLE) == 0);
546 
547 	/* Clear CPU_READY flag to disable cross calls. */
548 	cp->cpu_flags &= ~CPU_READY;
549 	CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);
550 	rc = xc_flush_cpu(cp);
551 	if (rc != 0) {
552 		CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
553 		cp->cpu_flags |= CPU_READY;
554 		return (rc);
555 	}
556 
557 	/* Intercept target CPU at a safe point before powering it off. */
558 	cpu_idle_intercept_cpu(cp);
559 
560 	apic_cpu_send_SIPI(cpun, B_FALSE);
561 	cp->cpu_flags &= ~CPU_RUNNING;
562 
563 	return (0);
564 }
565 
566 int
567 apic_cpu_ops(psm_cpu_request_t *reqp)
568 {
569 	if (reqp == NULL) {
570 		return (EINVAL);
571 	}
572 
573 	switch (reqp->pcr_cmd) {
574 	case PSM_CPU_ADD:
575 		return (apic_cpu_add(reqp));
576 
577 	case PSM_CPU_REMOVE:
578 		return (apic_cpu_remove(reqp));
579 
580 	case PSM_CPU_STOP:
581 		return (apic_cpu_stop(reqp->req.cpu_stop.cpuid,
582 		    reqp->req.cpu_stop.ctx));
583 
584 	default:
585 		return (ENOTSUP);
586 	}
587 }
588 
#ifdef	DEBUG
/*
 * Debug-only tunables; they exist only when DEBUG is defined.
 * NOTE(review): the code that reads them is not in this part of the file -
 * confirm their use before changing the defaults.
 */
int	apic_break_on_cpu = 9;
int	apic_stretch_interrupts = 0;
int	apic_stretch_ISR = 1 << 3;	/* IPL of 3 matches nothing now */
#endif /* DEBUG */
594 
595 /*
596  * generates an interprocessor interrupt to another CPU. Any changes made to
597  * this routine must be accompanied by similar changes to
598  * apic_common_send_ipi().
599  */
600 void
601 apic_send_ipi(int cpun, int ipl)
602 {
603 	int vector;
604 	ulong_t flag;
605 
606 	vector = apic_resv_vector[ipl];
607 
608 	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
609 
610 	flag = intr_clear();
611 
612 	APIC_AV_PENDING_SET();
613 
614 	apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
615 	    vector);
616 
617 	intr_restore(flag);
618 }
619 
620 void
621 apic_send_pir_ipi(processorid_t cpun)
622 {
623 	const int vector = apic_pir_vect;
624 	ulong_t flag;
625 
626 	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
627 
628 	flag = intr_clear();
629 
630 	/* Self-IPI for inducing PIR makes no sense. */
631 	if ((cpun != psm_get_cpu_id())) {
632 		APIC_AV_PENDING_SET();
633 		apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
634 		    vector);
635 	}
636 
637 	intr_restore(flag);
638 }
639 
/*
 * Return apic_pir_vect, the vector that apic_send_pir_ipi() sends.
 */
int
apic_get_pir_ipivect(void)
{
	return (apic_pir_vect);
}
645 
/*
 * Intentionally empty: this implementation takes no action for cpun.
 */
/*ARGSUSED*/
void
apic_set_idlecpu(processorid_t cpun)
{
}
651 
/*
 * Intentionally empty: this implementation takes no action for cpun.
 */
/*ARGSUSED*/
void
apic_unset_idlecpu(processorid_t cpun)
{
}
657 
658 
/*
 * Empty function used as the body of the busy-wait loops in this file
 * (for example while polling the pending bit or the hrtime stamp).
 * Declared with an explicit (void) parameter list so that the definition
 * also serves as a prototype.
 */
void
apic_ret(void)
{
}
663 
664 /*
665  * If apic_coarse_time == 1, then apic_gettime() is used instead of
666  * apic_gethrtime().  This is used for performance instead of accuracy.
667  */
668 
669 hrtime_t
670 apic_gettime()
671 {
672 	int old_hrtime_stamp;
673 	hrtime_t temp;
674 
675 	/*
676 	 * In one-shot mode, we do not keep time, so if anyone
677 	 * calls psm_gettime() directly, we vector over to
678 	 * gethrtime().
679 	 * one-shot mode MUST NOT be enabled if this psm is the source of
680 	 * hrtime.
681 	 */
682 
683 	if (apic_oneshot)
684 		return (gethrtime());
685 
686 
687 gettime_again:
688 	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
689 		apic_ret();
690 
691 	temp = apic_nsec_since_boot;
692 
693 	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
694 		goto gettime_again;
695 	}
696 	return (temp);
697 }
698 
/*
 * Here we return the number of nanoseconds since booting.  Note every
 * clock interrupt increments apic_nsec_since_boot by the appropriate
 * amount.
 *
 * The result is apic_nsec_since_boot plus the time represented by the
 * ticks elapsed in the current timer period (apic_hertz_count minus the
 * current count).  The whole computation runs with interrupts disabled and
 * apic_gethrtime_lock held, and is retried if apic_hrtime_stamp changes
 * underneath it.  The value returned is never below the previously
 * returned one (apic_last_hrtime).
 */
hrtime_t
apic_gethrtime(void)
{
	int curr_timeval, countval, elapsed_ticks;
	int old_hrtime_stamp, status;
	hrtime_t temp;
	uint32_t cpun;
	ulong_t oflags;

	/*
	 * In one-shot mode, we do not keep time, so if anyone
	 * calls psm_gethrtime() directly, we vector over to
	 * gethrtime().
	 * one-shot mode MUST NOT be enabled if this psm is the source of
	 * hrtime.
	 */

	if (apic_oneshot)
		return (gethrtime());

	oflags = intr_clear();	/* prevent migration */

	/*
	 * cpun is the local APIC id of the executing CPU, not a cpuid; in
	 * LOCAL_APIC mode the id has to be shifted out of the register value.
	 */
	cpun = apic_reg_ops->apic_read(APIC_LID_REG);
	if (apic_mode == LOCAL_APIC)
		cpun >>= APIC_ID_BIT_OFFSET;

	lock_set(&apic_gethrtime_lock);

gethrtime_again:
	/* wait for the stamp to become even before sampling */
	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
		apic_ret();

	/*
	 * Check to see which CPU we are on.  Note the time is kept on
	 * the local APIC of CPU 0.  If on CPU 0, simply read the current
	 * counter.  If on another CPU, issue a remote read command to CPU 0.
	 */
	if (cpun == apic_cpus[0].aci_local_id) {
		countval = apic_reg_ops->apic_read(APIC_CURR_COUNT);
	} else {
#ifdef	DEBUG
		APIC_AV_PENDING_SET();
#else
		if (apic_mode == LOCAL_APIC)
			APIC_AV_PENDING_SET();
#endif /* DEBUG */

		apic_reg_ops->apic_write_int_cmd(
		    apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE);

		/* spin until the remote read is no longer pending */
		while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1))
		    & AV_READ_PENDING) {
			apic_ret();
		}

		if (status & AV_REMOTE_STATUS)	/* 1 = valid */
			countval = apic_reg_ops->apic_read(APIC_REMOTE_READ);
		else {	/* 0 = invalid */
			apic_remote_hrterr++;
			/*
			 * return last hrtime right now, will need more
			 * testing if change to retry
			 */
			temp = apic_last_hrtime;

			lock_clear(&apic_gethrtime_lock);

			intr_restore(oflags);

			return (temp);
		}
	}
	/*
	 * A count above the last accepted one is replaced by 0 and is not
	 * recorded.  NOTE(review): presumably this means the down-counter
	 * has reloaded before the clock interrupt was processed - confirm.
	 */
	if (countval > last_count_read)
		countval = 0;
	else
		last_count_read = countval;

	elapsed_ticks = apic_hertz_count - countval;

	curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks);
	temp = apic_nsec_since_boot + curr_timeval;

	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
		/* we might have clobbered last_count_read. Restore it */
		last_count_read = apic_hertz_count;
		goto gethrtime_again;
	}

	if (temp < apic_last_hrtime) {
		/* return last hrtime if error occurs */
		apic_hrtime_error++;
		temp = apic_last_hrtime;
	}
	else
		apic_last_hrtime = temp;

	lock_clear(&apic_gethrtime_lock);
	intr_restore(oflags);

	return (temp);
}
805 
806 /* apic NMI handler */
807 /*ARGSUSED*/
808 void
809 apic_nmi_intr(caddr_t arg, struct regs *rp)
810 {
811 	nmi_action_t action = nmi_action;
812 
813 	if (apic_shutdown_processors) {
814 		apic_disable_local_apic();
815 		return;
816 	}
817 
818 	apic_error |= APIC_ERR_NMI;
819 
820 	if (!lock_try(&apic_nmi_lock))
821 		return;
822 	apic_num_nmis++;
823 
824 	/*
825 	 * "nmi_action" always over-rides the older way of doing this, unless we
826 	 * can't actually drop into kmdb when requested.
827 	 */
828 	if (action == NMI_ACTION_KMDB && !psm_debugger())
829 		action = NMI_ACTION_UNSET;
830 
831 	if (action == NMI_ACTION_UNSET) {
832 		if (apic_kmdb_on_nmi && psm_debugger())
833 			action = NMI_ACTION_KMDB;
834 		else if (apic_panic_on_nmi)
835 			action = NMI_ACTION_PANIC;
836 		else
837 			action = NMI_ACTION_IGNORE;
838 	}
839 
840 	switch (action) {
841 	case NMI_ACTION_IGNORE:
842 		/*
843 		 * prom_printf is the best shot we have of something which is
844 		 * problem free from high level/NMI type of interrupts
845 		 */
846 		prom_printf("NMI received\n");
847 		break;
848 
849 	case NMI_ACTION_PANIC:
850 		/* Keep panic from entering kmdb. */
851 		nopanicdebug = 1;
852 		panic("NMI received\n");
853 		break;
854 
855 	case NMI_ACTION_KMDB:
856 	default:
857 		debug_enter("NMI received: entering kmdb\n");
858 		break;
859 	}
860 
861 	lock_clear(&apic_nmi_lock);
862 }
863 
864 processorid_t
865 apic_get_next_processorid(processorid_t cpu_id)
866 {
867 
868 	int i;
869 
870 	if (cpu_id == -1)
871 		return ((processorid_t)0);
872 
873 	for (i = cpu_id + 1; i < NCPU; i++) {
874 		if (apic_cpu_in_range(i))
875 			return (i);
876 	}
877 
878 	return ((processorid_t)-1);
879 }
880 
881 int
882 apic_cpu_add(psm_cpu_request_t *reqp)
883 {
884 	int i, rv = 0;
885 	ulong_t iflag;
886 	boolean_t first = B_TRUE;
887 	uchar_t localver = 0;
888 	uint32_t localid, procid;
889 	processorid_t cpuid = (processorid_t)-1;
890 	mach_cpu_add_arg_t *ap;
891 
892 	ASSERT(reqp != NULL);
893 	reqp->req.cpu_add.cpuid = (processorid_t)-1;
894 
895 	/* Check whether CPU hotplug is supported. */
896 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
897 		return (ENOTSUP);
898 	}
899 
900 	ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp;
901 	switch (ap->type) {
902 	case MACH_CPU_ARG_LOCAL_APIC:
903 		localid = ap->arg.apic.apic_id;
904 		procid = ap->arg.apic.proc_id;
905 		if (localid >= 255 || procid > 255) {
906 			cmn_err(CE_WARN,
907 			    "!apic: apicid(%u) or procid(%u) is invalid.",
908 			    localid, procid);
909 			return (EINVAL);
910 		}
911 		break;
912 
913 	case MACH_CPU_ARG_LOCAL_X2APIC:
914 		localid = ap->arg.apic.apic_id;
915 		procid = ap->arg.apic.proc_id;
916 		if (localid >= UINT32_MAX) {
917 			cmn_err(CE_WARN,
918 			    "!apic: x2apicid(%u) is invalid.", localid);
919 			return (EINVAL);
920 		} else if (localid >= 255 && apic_mode == LOCAL_APIC) {
921 			cmn_err(CE_WARN, "!apic: system is in APIC mode, "
922 			    "can't support x2APIC processor.");
923 			return (ENOTSUP);
924 		}
925 		break;
926 
927 	default:
928 		cmn_err(CE_WARN,
929 		    "!apic: unknown argument type %d to apic_cpu_add().",
930 		    ap->type);
931 		return (EINVAL);
932 	}
933 
934 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
935 	iflag = intr_clear();
936 	lock_set(&apic_ioapic_lock);
937 
938 	/* Check whether local APIC id already exists. */
939 	for (i = 0; i < apic_nproc; i++) {
940 		if (!CPU_IN_SET(apic_cpumask, i))
941 			continue;
942 		if (apic_cpus[i].aci_local_id == localid) {
943 			lock_clear(&apic_ioapic_lock);
944 			intr_restore(iflag);
945 			cmn_err(CE_WARN,
946 			    "!apic: local apic id %u already exists.",
947 			    localid);
948 			return (EEXIST);
949 		} else if (apic_cpus[i].aci_processor_id == procid) {
950 			lock_clear(&apic_ioapic_lock);
951 			intr_restore(iflag);
952 			cmn_err(CE_WARN,
953 			    "!apic: processor id %u already exists.",
954 			    (int)procid);
955 			return (EEXIST);
956 		}
957 
958 		/*
959 		 * There's no local APIC version number available in MADT table,
960 		 * so assume that all CPUs are homogeneous and use local APIC
961 		 * version number of the first existing CPU.
962 		 */
963 		if (first) {
964 			first = B_FALSE;
965 			localver = apic_cpus[i].aci_local_ver;
966 		}
967 	}
968 	ASSERT(first == B_FALSE);
969 
970 	/*
971 	 * Try to assign the same cpuid if APIC id exists in the dirty cache.
972 	 */
973 	for (i = 0; i < apic_max_nproc; i++) {
974 		if (CPU_IN_SET(apic_cpumask, i)) {
975 			ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0);
976 			continue;
977 		}
978 		ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE);
979 		if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) &&
980 		    apic_cpus[i].aci_local_id == localid &&
981 		    apic_cpus[i].aci_processor_id == procid) {
982 			cpuid = i;
983 			break;
984 		}
985 	}
986 
987 	/* Avoid the dirty cache and allocate fresh slot if possible. */
988 	if (cpuid == (processorid_t)-1) {
989 		for (i = 0; i < apic_max_nproc; i++) {
990 			if ((apic_cpus[i].aci_status & APIC_CPU_FREE) &&
991 			    (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) {
992 				cpuid = i;
993 				break;
994 			}
995 		}
996 	}
997 
998 	/* Try to find any free slot as last resort. */
999 	if (cpuid == (processorid_t)-1) {
1000 		for (i = 0; i < apic_max_nproc; i++) {
1001 			if (apic_cpus[i].aci_status & APIC_CPU_FREE) {
1002 				cpuid = i;
1003 				break;
1004 			}
1005 		}
1006 	}
1007 
1008 	if (cpuid == (processorid_t)-1) {
1009 		lock_clear(&apic_ioapic_lock);
1010 		intr_restore(iflag);
1011 		cmn_err(CE_NOTE,
1012 		    "!apic: failed to allocate cpu id for processor %u.",
1013 		    procid);
1014 		rv = EAGAIN;
1015 	} else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) {
1016 		lock_clear(&apic_ioapic_lock);
1017 		intr_restore(iflag);
1018 		cmn_err(CE_NOTE,
1019 		    "!apic: failed to build mapping for processor %u.",
1020 		    procid);
1021 		rv = EBUSY;
1022 	} else {
1023 		ASSERT(cpuid >= 0 && cpuid < NCPU);
1024 		ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus);
1025 		bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0]));
1026 		apic_cpus[cpuid].aci_processor_id = procid;
1027 		apic_cpus[cpuid].aci_local_id = localid;
1028 		apic_cpus[cpuid].aci_local_ver = localver;
1029 		CPUSET_ATOMIC_ADD(apic_cpumask, cpuid);
1030 		if (cpuid >= apic_nproc) {
1031 			apic_nproc = cpuid + 1;
1032 		}
1033 		lock_clear(&apic_ioapic_lock);
1034 		intr_restore(iflag);
1035 		reqp->req.cpu_add.cpuid = cpuid;
1036 	}
1037 
1038 	return (rv);
1039 }
1040 
1041 int
1042 apic_cpu_remove(psm_cpu_request_t *reqp)
1043 {
1044 	int i;
1045 	ulong_t iflag;
1046 	processorid_t cpuid;
1047 
1048 	/* Check whether CPU hotplug is supported. */
1049 	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
1050 		return (ENOTSUP);
1051 	}
1052 
1053 	cpuid = reqp->req.cpu_remove.cpuid;
1054 
1055 	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
1056 	iflag = intr_clear();
1057 	lock_set(&apic_ioapic_lock);
1058 
1059 	if (!apic_cpu_in_range(cpuid)) {
1060 		lock_clear(&apic_ioapic_lock);
1061 		intr_restore(iflag);
1062 		cmn_err(CE_WARN,
1063 		    "!apic: cpuid %d doesn't exist in apic_cpus array.",
1064 		    cpuid);
1065 		return (ENODEV);
1066 	}
1067 	ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0);
1068 
1069 	if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) {
1070 		lock_clear(&apic_ioapic_lock);
1071 		intr_restore(iflag);
1072 		return (ENOENT);
1073 	}
1074 
1075 	if (cpuid == apic_nproc - 1) {
1076 		/*
1077 		 * We are removing the highest numbered cpuid so we need to
1078 		 * find the next highest cpuid as the new value for apic_nproc.
1079 		 */
1080 		for (i = apic_nproc; i > 0; i--) {
1081 			if (CPU_IN_SET(apic_cpumask, i - 1)) {
1082 				apic_nproc = i;
1083 				break;
1084 			}
1085 		}
1086 		/* at least one CPU left */
1087 		ASSERT(i > 0);
1088 	}
1089 	CPUSET_ATOMIC_DEL(apic_cpumask, cpuid);
1090 	/* mark slot as free and keep it in the dirty cache */
1091 	apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY;
1092 
1093 	lock_clear(&apic_ioapic_lock);
1094 	intr_restore(iflag);
1095 
1096 	return (0);
1097 }
1098 
1099 /*
1100  * Return the number of ticks the APIC decrements in SF nanoseconds.
1101  * The fixed-frequency PIT (aka 8254) is used for the measurement.
1102  */
1103 static uint64_t
1104 apic_calibrate_impl()
1105 {
1106 	uint8_t		pit_tick_lo;
1107 	uint16_t	pit_tick, target_pit_tick, pit_ticks_adj;
1108 	uint32_t	pit_ticks;
1109 	uint32_t	start_apic_tick, end_apic_tick, apic_ticks;
1110 	ulong_t		iflag;
1111 
1112 	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
1113 	apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);
1114 
1115 	iflag = intr_clear();
1116 
1117 	do {
1118 		pit_tick_lo = inb(PITCTR0_PORT);
1119 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1120 	} while (pit_tick < APIC_TIME_MIN ||
1121 	    pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);
1122 
1123 	/*
1124 	 * Wait for the PIT to decrement by 5 ticks to ensure
1125 	 * we didn't start in the middle of a tick.
1126 	 * Compare with 0x10 for the wrap around case.
1127 	 */
1128 	target_pit_tick = pit_tick - 5;
1129 	do {
1130 		pit_tick_lo = inb(PITCTR0_PORT);
1131 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1132 	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
1133 
1134 	start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
1135 
1136 	/*
1137 	 * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
1138 	 */
1139 	target_pit_tick = pit_tick - APIC_TIME_COUNT;
1140 	do {
1141 		pit_tick_lo = inb(PITCTR0_PORT);
1142 		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
1143 	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
1144 
1145 	end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
1146 
1147 	intr_restore(iflag);
1148 
1149 	apic_ticks = start_apic_tick - end_apic_tick;
1150 
1151 	/* The PIT might have decremented by more ticks than planned */
1152 	pit_ticks_adj = target_pit_tick - pit_tick;
1153 	/* total number of PIT ticks corresponding to apic_ticks */
1154 	pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;
1155 
1156 	/*
1157 	 * Determine the number of nanoseconds per APIC clock tick
1158 	 * and then determine how many APIC ticks to interrupt at the
1159 	 * desired frequency
1160 	 * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
1161 	 * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
1162 	 * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
1163 	 * apic_ticks_per_SFns =
1164 	 * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
1165 	 */
1166 	return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
1167 }
1168 
1169 /*
1170  * It was found empirically that 5 measurements seem sufficient to give a good
1171  * accuracy. Most spurious measurements are higher than the target value thus
1172  * we eliminate up to 2/5 spurious measurements.
1173  */
1174 #define	APIC_CALIBRATE_MEASUREMENTS		5
1175 
1176 #define	APIC_CALIBRATE_PERCENT_OFF_WARNING	10
1177 
1178 /*
1179  * Return the number of ticks the APIC decrements in SF nanoseconds.
1180  * Several measurements are taken to filter out outliers.
1181  */
1182 uint64_t
1183 apic_calibrate()
1184 {
1185 	uint64_t	measurements[APIC_CALIBRATE_MEASUREMENTS];
1186 	int		median_idx;
1187 	uint64_t	median;
1188 
1189 	/*
1190 	 * When running under a virtual machine, the emulated PIT and APIC
1191 	 * counters do not always return the right values and can roll over.
1192 	 * Those spurious measurements are relatively rare but could
1193 	 * significantly affect the calibration.
1194 	 * Therefore we take several measurements and then keep the median.
1195 	 * The median is preferred to the average here as we only want to
1196 	 * discard outliers.
1197 	 */
1198 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++)
1199 		measurements[i] = apic_calibrate_impl();
1200 
1201 	/*
1202 	 * sort results and retrieve median.
1203 	 */
1204 	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
1205 		for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
1206 			if (measurements[j] < measurements[i]) {
1207 				uint64_t tmp = measurements[i];
1208 				measurements[i] = measurements[j];
1209 				measurements[j] = tmp;
1210 			}
1211 		}
1212 	}
1213 	median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
1214 	median = measurements[median_idx];
1215 
1216 #if (APIC_CALIBRATE_MEASUREMENTS >= 3)
1217 	/*
1218 	 * Check that measurements are consistent. Post a warning
1219 	 * if the three middle values are not close to each other.
1220 	 */
1221 	uint64_t delta_warn = median *
1222 	    APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
1223 	if ((median - measurements[median_idx - 1]) > delta_warn ||
1224 	    (measurements[median_idx + 1] - median) > delta_warn) {
1225 		cmn_err(CE_WARN, "apic_calibrate measurements lack "
1226 		    "precision: %llu, %llu, %llu.",
1227 		    (u_longlong_t)measurements[median_idx - 1],
1228 		    (u_longlong_t)median,
1229 		    (u_longlong_t)measurements[median_idx + 1]);
1230 	}
1231 #endif
1232 
1233 	return (median);
1234 }
1235 
1236 /*
1237  * Initialise the APIC timer on the local APIC of CPU 0 to the desired
1238  * frequency.  Note at this stage in the boot sequence, the boot processor
1239  * is the only active processor.
1240  * hertz value of 0 indicates a one-shot mode request.  In this case
1241  * the function returns the resolution (in nanoseconds) for the hardware
1242  * timer interrupt.  If one-shot mode capability is not available,
1243  * the return value will be 0. apic_enable_oneshot is a global switch
1244  * for disabling the functionality.
1245  * A non-zero positive value for hertz indicates a periodic mode request.
1246  * In this case the hardware will be programmed to generate clock interrupts
1247  * at hertz frequency and returns the resolution of interrupts in
1248  * nanosecond.
1249  */
1250 
1251 int
1252 apic_clkinit(int hertz)
1253 {
1254 	int		ret;
1255 
1256 	apic_int_busy_mark = (apic_int_busy_mark *
1257 	    apic_sample_factor_redistribution) / 100;
1258 	apic_int_free_mark = (apic_int_free_mark *
1259 	    apic_sample_factor_redistribution) / 100;
1260 	apic_diff_for_redistribution = (apic_diff_for_redistribution *
1261 	    apic_sample_factor_redistribution) / 100;
1262 
1263 	ret = apic_timer_init(hertz);
1264 	return (ret);
1265 
1266 }
1267 
1268 /*
1269  * apic_preshutdown:
1270  * Called early in shutdown whilst we can still access filesystems to do
1271  * things like loading modules which will be required to complete shutdown
1272  * after filesystems are all unmounted.
1273  */
1274 void
1275 apic_preshutdown(int cmd, int fcn)
1276 {
1277 	APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n",
1278 	    cmd, fcn, apic_poweroff_method, apic_enable_acpi));
1279 }
1280 
1281 void
1282 apic_shutdown(int cmd, int fcn)
1283 {
1284 	int restarts, attempts;
1285 	int i;
1286 	uchar_t	byte;
1287 	ulong_t iflag;
1288 
1289 	hpet_acpi_fini();
1290 
1291 	/* Send NMI to all CPUs except self to do per processor shutdown */
1292 	iflag = intr_clear();
1293 #ifdef	DEBUG
1294 	APIC_AV_PENDING_SET();
1295 #else
1296 	if (apic_mode == LOCAL_APIC)
1297 		APIC_AV_PENDING_SET();
1298 #endif /* DEBUG */
1299 	apic_shutdown_processors = 1;
1300 	apic_reg_ops->apic_write(APIC_INT_CMD1,
1301 	    AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF);
1302 
1303 	/* restore cmos shutdown byte before reboot */
1304 	if (apic_cmos_ssb_set) {
1305 		outb(CMOS_ADDR, SSB);
1306 		outb(CMOS_DATA, 0);
1307 	}
1308 
1309 	ioapic_disable_redirection();
1310 
1311 	/*	disable apic mode if imcr present	*/
1312 	if (apic_imcrp) {
1313 		outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
1314 		outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC);
1315 	}
1316 
1317 	apic_disable_local_apic();
1318 
1319 	intr_restore(iflag);
1320 
1321 	/* remainder of function is for shutdown cases only */
1322 	if (cmd != A_SHUTDOWN)
1323 		return;
1324 
1325 	/*
1326 	 * Switch system back into Legacy-Mode if using ACPI and
1327 	 * not powering-off.  Some BIOSes need to remain in ACPI-mode
1328 	 * for power-off to succeed (Dell Dimension 4600)
1329 	 * Do not disable ACPI while doing fastreboot
1330 	 */
1331 	if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT)
1332 		(void) AcpiDisable();
1333 
1334 	if (fcn == AD_FASTREBOOT) {
1335 		apic_reg_ops->apic_write(APIC_INT_CMD1,
1336 		    AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF);
1337 	}
1338 
1339 	/* remainder of function is for shutdown+poweroff case only */
1340 	if (fcn != AD_POWEROFF)
1341 		return;
1342 
1343 	switch (apic_poweroff_method) {
1344 		case APIC_POWEROFF_VIA_RTC:
1345 
1346 			/* select the extended NVRAM bank in the RTC */
1347 			outb(CMOS_ADDR, RTC_REGA);
1348 			byte = inb(CMOS_DATA);
1349 			outb(CMOS_DATA, (byte | EXT_BANK));
1350 
1351 			outb(CMOS_ADDR, PFR_REG);
1352 
1353 			/* for Predator must toggle the PAB bit */
1354 			byte = inb(CMOS_DATA);
1355 
1356 			/*
1357 			 * clear power active bar, wakeup alarm and
1358 			 * kickstart
1359 			 */
1360 			byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG);
1361 			outb(CMOS_DATA, byte);
1362 
1363 			/* delay before next write */
1364 			drv_usecwait(1000);
1365 
1366 			/* for S40 the following would suffice */
1367 			byte = inb(CMOS_DATA);
1368 
1369 			/* power active bar control bit */
1370 			byte |= PAB_CBIT;
1371 			outb(CMOS_DATA, byte);
1372 
1373 			break;
1374 
1375 		case APIC_POWEROFF_VIA_ASPEN_BMC:
1376 			restarts = 0;
1377 restart_aspen_bmc:
1378 			if (++restarts == 3)
1379 				break;
1380 			attempts = 0;
1381 			do {
1382 				byte = inb(MISMIC_FLAG_REGISTER);
1383 				byte &= MISMIC_BUSY_MASK;
1384 				if (byte != 0) {
1385 					drv_usecwait(1000);
1386 					if (attempts >= 3)
1387 						goto restart_aspen_bmc;
1388 					++attempts;
1389 				}
1390 			} while (byte != 0);
1391 			outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS);
1392 			byte = inb(MISMIC_FLAG_REGISTER);
1393 			byte |= 0x1;
1394 			outb(MISMIC_FLAG_REGISTER, byte);
1395 			i = 0;
1396 			for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0]));
1397 			    i++) {
1398 				attempts = 0;
1399 				do {
1400 					byte = inb(MISMIC_FLAG_REGISTER);
1401 					byte &= MISMIC_BUSY_MASK;
1402 					if (byte != 0) {
1403 						drv_usecwait(1000);
1404 						if (attempts >= 3)
1405 							goto restart_aspen_bmc;
1406 						++attempts;
1407 					}
1408 				} while (byte != 0);
1409 				outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl);
1410 				outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data);
1411 				byte = inb(MISMIC_FLAG_REGISTER);
1412 				byte |= 0x1;
1413 				outb(MISMIC_FLAG_REGISTER, byte);
1414 			}
1415 			break;
1416 
1417 		case APIC_POWEROFF_VIA_SITKA_BMC:
1418 			restarts = 0;
1419 restart_sitka_bmc:
1420 			if (++restarts == 3)
1421 				break;
1422 			attempts = 0;
1423 			do {
1424 				byte = inb(SMS_STATUS_REGISTER);
1425 				byte &= SMS_STATE_MASK;
1426 				if ((byte == SMS_READ_STATE) ||
1427 				    (byte == SMS_WRITE_STATE)) {
1428 					drv_usecwait(1000);
1429 					if (attempts >= 3)
1430 						goto restart_sitka_bmc;
1431 					++attempts;
1432 				}
1433 			} while ((byte == SMS_READ_STATE) ||
1434 			    (byte == SMS_WRITE_STATE));
1435 			outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS);
1436 			i = 0;
1437 			for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0]));
1438 			    i++) {
1439 				attempts = 0;
1440 				do {
1441 					byte = inb(SMS_STATUS_REGISTER);
1442 					byte &= SMS_IBF_MASK;
1443 					if (byte != 0) {
1444 						drv_usecwait(1000);
1445 						if (attempts >= 3)
1446 							goto restart_sitka_bmc;
1447 						++attempts;
1448 					}
1449 				} while (byte != 0);
1450 				outb(sitka_bmc[i].port, sitka_bmc[i].data);
1451 			}
1452 			break;
1453 
1454 		case APIC_POWEROFF_NONE:
1455 
1456 			/* If no APIC direct method, we will try using ACPI */
1457 			if (apic_enable_acpi) {
1458 				if (acpi_poweroff() == 1)
1459 					return;
1460 			} else
1461 				return;
1462 
1463 			break;
1464 	}
1465 	/*
1466 	 * Wait a limited time here for power to go off.
1467 	 * If the power does not go off, then there was a
1468 	 * problem and we should continue to the halt which
1469 	 * prints a message for the user to press a key to
1470 	 * reboot.
1471 	 */
1472 	drv_usecwait(7000000); /* wait seven seconds */
1473 
1474 }
1475 
1476 cyclic_id_t apic_cyclic_id;
1477 
1478 /*
1479  * The following functions are in the platform specific file so that they
1480  * can be different functions depending on whether we are running on
1481  * bare metal or a hypervisor.
1482  */
1483 
/*
 * Map len bytes of APIC register space at physical address addr, using the
 * given mapping flags, and return the resulting pointer for memory-mapped
 * access.
 */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
	uint32_t *regs;

	regs = (void *)psm_map_phys(addr, len, flags);
	return (regs);
}
1492 
/*
 * Map an I/O APIC for memory-mapped access.  This simply defers to
 * mapin_apic().
 */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	uint32_t *regs = mapin_apic(addr, len, flags);

	return (regs);
}
1498 
1499 /*
1500  * unmap an apic
1501  */
1502 void
1503 mapout_apic(caddr_t addr, size_t len)
1504 {
1505 	psm_unmap_phys(addr, len);
1506 }
1507 
1508 void
1509 mapout_ioapic(caddr_t addr, size_t len)
1510 {
1511 	mapout_apic(addr, len);
1512 }
1513 
1514 uint32_t
1515 ioapic_read(int ioapic_ix, uint32_t reg)
1516 {
1517 	volatile uint32_t *ioapic;
1518 
1519 	ioapic = apicioadr[ioapic_ix];
1520 	ioapic[APIC_IO_REG] = reg;
1521 	return (ioapic[APIC_IO_DATA]);
1522 }
1523 
1524 void
1525 ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value)
1526 {
1527 	volatile uint32_t *ioapic;
1528 
1529 	ioapic = apicioadr[ioapic_ix];
1530 	ioapic[APIC_IO_REG] = reg;
1531 	ioapic[APIC_IO_DATA] = value;
1532 }
1533 
1534 void
1535 ioapic_write_eoi(int ioapic_ix, uint32_t value)
1536 {
1537 	volatile uint32_t *ioapic;
1538 
1539 	ioapic = apicioadr[ioapic_ix];
1540 	ioapic[APIC_IO_EOI] = value;
1541 }
1542 
1543 /*
1544  * Round-robin algorithm to find the next CPU with interrupts enabled.
1545  * It can't share the same static variable apic_next_bind_cpu with
1546  * apic_get_next_bind_cpu(), since that will cause all interrupts to be
1547  * bound to CPU1 at boot time.  During boot, only CPU0 is online with
1548  * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu()
1549  * are called.  However, the pcplusmp driver assumes that there will be
1550  * boot_ncpus CPUs configured eventually so it tries to distribute all
1551  * interrupts among CPU0 - CPU[boot_ncpus - 1].  Thus to prevent all
1552  * interrupts being targetted at CPU1, we need to use a dedicated static
1553  * variable for find_next_cpu() instead of sharing apic_next_bind_cpu.
1554  */
1555 
1556 processorid_t
1557 apic_find_cpu(int flag)
1558 {
1559 	int i;
1560 	static processorid_t acid = 0;
1561 
1562 	/* Find the first CPU with the passed-in flag set */
1563 	for (i = 0; i < apic_nproc; i++) {
1564 		if (++acid >= apic_nproc) {
1565 			acid = 0;
1566 		}
1567 		if (apic_cpu_in_range(acid) &&
1568 		    (apic_cpus[acid].aci_status & flag)) {
1569 			break;
1570 		}
1571 	}
1572 
1573 	ASSERT((apic_cpus[acid].aci_status & flag) != 0);
1574 	return (acid);
1575 }
1576 
1577 void
1578 apic_intrmap_init(int apic_mode)
1579 {
1580 	int suppress_brdcst_eoi = 0;
1581 
1582 	/*
1583 	 * Intel Software Developer's Manual 3A, 10.12.7:
1584 	 *
1585 	 * Routing of device interrupts to local APIC units operating in
1586 	 * x2APIC mode requires use of the interrupt-remapping architecture
1587 	 * specified in the Intel Virtualization Technology for Directed
1588 	 * I/O, Revision 1.3.  Because of this, BIOS must enumerate support
1589 	 * for and software must enable this interrupt remapping with
1590 	 * Extended Interrupt Mode Enabled before it enabling x2APIC mode in
1591 	 * the local APIC units.
1592 	 *
1593 	 *
1594 	 * In other words, to use the APIC in x2APIC mode, we need interrupt
1595 	 * remapping.  Since we don't start up the IOMMU by default, we
1596 	 * won't be able to do any interrupt remapping and therefore have to
1597 	 * use the APIC in traditional 'local APIC' mode with memory mapped
1598 	 * I/O.
1599 	 */
1600 
1601 	if (psm_vt_ops != NULL) {
1602 		if (((apic_intrmap_ops_t *)psm_vt_ops)->
1603 		    apic_intrmap_init(apic_mode) == DDI_SUCCESS) {
1604 
1605 			apic_vt_ops = psm_vt_ops;
1606 
1607 			/*
1608 			 * We leverage the interrupt remapping engine to
1609 			 * suppress broadcast EOI; thus we must send the
1610 			 * directed EOI with the directed-EOI handler.
1611 			 */
1612 			if (apic_directed_EOI_supported() == 0) {
1613 				suppress_brdcst_eoi = 1;
1614 			}
1615 
1616 			apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);
1617 
1618 			if (apic_detect_x2apic()) {
1619 				apic_enable_x2apic();
1620 			}
1621 
1622 			if (apic_directed_EOI_supported() == 0) {
1623 				apic_set_directed_EOI_handler();
1624 			}
1625 		}
1626 	}
1627 }
1628 
1629 /*ARGSUSED*/
1630 static void
1631 apic_record_ioapic_rdt(void *intrmap_private, ioapic_rdt_t *irdt)
1632 {
1633 	irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
1634 }
1635 
1636 /*ARGSUSED*/
1637 static void
1638 apic_record_msi(void *intrmap_private, msi_regs_t *mregs)
1639 {
1640 	mregs->mr_addr = MSI_ADDR_HDR |
1641 	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
1642 	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
1643 	    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
1644 	mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
1645 	    mregs->mr_data;
1646 }
1647 
1648 /*
1649  * Functions from apic_introp.c
1650  *
1651  * Those functions are used by apic_intr_ops().
1652  */
1653 
1654 /*
1655  * MSI support flag:
1656  * reflects whether MSI is supported at APIC level
1657  * it can also be patched through /etc/system
1658  *
1659  *  0 = default value - don't know and need to call apic_check_msi_support()
1660  *      to find out then set it accordingly
1661  *  1 = supported
1662  * -1 = not supported
1663  */
1664 int	apic_support_msi = 0;
1665 
1666 /* Multiple vector support for MSI-X */
1667 int	apic_msix_enable = 1;
1668 
1669 /* Multiple vector support for MSI */
1670 int	apic_multi_msi_enable = 1;
1671 
1672 /*
1673  * Check whether the system supports MSI.
1674  *
1675  * MSI is required for PCI-E and for PCI versions later than 2.2, so if we find
1676  * a PCI-E bus or we find a PCI bus whose version we know is >= 2.2, then we
1677  * return PSM_SUCCESS to indicate this system supports MSI.
1678  *
1679  * (Currently the only way we check whether a given PCI bus supports >= 2.2 is
1680  * by detecting if we are running inside the KVM hypervisor, which guarantees
1681  * this version number.)
1682  */
1683 int
1684 apic_check_msi_support()
1685 {
1686 	dev_info_t *cdip;
1687 	char dev_type[16];
1688 	int dev_len;
1689 	int hwenv = get_hwenv();
1690 
1691 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n"));
1692 
1693 	/*
1694 	 * check whether the first level children of root_node have
1695 	 * PCI-E or PCI capability.
1696 	 */
1697 	for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL;
1698 	    cdip = ddi_get_next_sibling(cdip)) {
1699 
1700 		DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p,"
1701 		    " driver: %s, binding: %s, nodename: %s\n", (void *)cdip,
1702 		    ddi_driver_name(cdip), ddi_binding_name(cdip),
1703 		    ddi_node_name(cdip)));
1704 		dev_len = sizeof (dev_type);
1705 		if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1706 		    "device_type", (caddr_t)dev_type, &dev_len)
1707 		    != DDI_PROP_SUCCESS)
1708 			continue;
1709 		if (strcmp(dev_type, "pciex") == 0)
1710 			return (PSM_SUCCESS);
1711 		if (strcmp(dev_type, "pci") == 0 &&
1712 		    (hwenv == HW_KVM || hwenv == HW_BHYVE))
1713 			return (PSM_SUCCESS);
1714 	}
1715 
1716 	/* MSI is not supported on this system */
1717 	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' "
1718 	    "device_type found\n"));
1719 	return (PSM_FAILURE);
1720 }
1721 
1722 /*
1723  * apic_pci_msi_unconfigure:
1724  *
1725  * This and next two interfaces are copied from pci_intr_lib.c
1726  * Do ensure that these two files stay in sync.
1727  * These needed to be copied over here to avoid a deadlock situation on
1728  * certain mp systems that use MSI interrupts.
1729  *
1730  * IMPORTANT regards next three interfaces:
1731  * i) are called only for MSI/X interrupts.
1732  * ii) called with interrupts disabled, and must not block
1733  */
1734 void
1735 apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum)
1736 {
1737 	ushort_t		msi_ctrl;
1738 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1739 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1740 
1741 	ASSERT((handle != NULL) && (cap_ptr != 0));
1742 
1743 	if (type == DDI_INTR_TYPE_MSI) {
1744 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1745 		msi_ctrl &= (~PCI_MSI_MME_MASK);
1746 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1747 		pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0);
1748 
1749 		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1750 			pci_config_put16(handle,
1751 			    cap_ptr + PCI_MSI_64BIT_DATA, 0);
1752 			pci_config_put32(handle,
1753 			    cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0);
1754 		} else {
1755 			pci_config_put16(handle,
1756 			    cap_ptr + PCI_MSI_32BIT_DATA, 0);
1757 		}
1758 
1759 	} else if (type == DDI_INTR_TYPE_MSIX) {
1760 		uintptr_t	off;
1761 		uint32_t	mask;
1762 		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(rdip);
1763 
1764 		ASSERT(msix_p != NULL);
1765 
1766 		/* Offset into "inum"th entry in the MSI-X table & mask it */
1767 		off = (uintptr_t)msix_p->msix_tbl_addr + (inum *
1768 		    PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET;
1769 
1770 		mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off);
1771 
1772 		ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1));
1773 
1774 		/* Offset into the "inum"th entry in the MSI-X table */
1775 		off = (uintptr_t)msix_p->msix_tbl_addr +
1776 		    (inum * PCI_MSIX_VECTOR_SIZE);
1777 
1778 		/* Reset the "data" and "addr" bits */
1779 		ddi_put32(msix_p->msix_tbl_hdl,
1780 		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0);
1781 		ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0);
1782 	}
1783 }
1784 
1785 /*
1786  * apic_pci_msi_disable_mode:
1787  */
1788 void
1789 apic_pci_msi_disable_mode(dev_info_t *rdip, int type)
1790 {
1791 	ushort_t		msi_ctrl;
1792 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1793 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1794 
1795 	ASSERT((handle != NULL) && (cap_ptr != 0));
1796 
1797 	if (type == DDI_INTR_TYPE_MSI) {
1798 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1799 		if (!(msi_ctrl & PCI_MSI_ENABLE_BIT))
1800 			return;
1801 
1802 		msi_ctrl &= ~PCI_MSI_ENABLE_BIT;	/* MSI disable */
1803 		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1804 
1805 	} else if (type == DDI_INTR_TYPE_MSIX) {
1806 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL);
1807 		if (msi_ctrl & PCI_MSIX_ENABLE_BIT) {
1808 			msi_ctrl &= ~PCI_MSIX_ENABLE_BIT;
1809 			pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL,
1810 			    msi_ctrl);
1811 		}
1812 	}
1813 }
1814 
1815 uint32_t
1816 apic_get_localapicid(uint32_t cpuid)
1817 {
1818 	ASSERT(cpuid < apic_nproc && apic_cpus != NULL);
1819 
1820 	return (apic_cpus[cpuid].aci_local_id);
1821 }
1822 
1823 uchar_t
1824 apic_get_ioapicid(uchar_t ioapicindex)
1825 {
1826 	ASSERT(ioapicindex < MAX_IO_APIC);
1827 
1828 	return (apic_io_id[ioapicindex]);
1829 }
1830