xref: /illumos-gate/usr/src/uts/i86pc/io/hpet_acpi.c (revision 71f3ceb939e47627273608fb7ea4b3aa1c3b37e7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2024 Oxide Computer Company
24  * Copyright 2020 Joyent, Inc.
25  */
26 
27 #include <sys/hpet_acpi.h>
28 #include <sys/hpet.h>
29 #include <sys/bitmap.h>
30 #include <sys/inttypes.h>
31 #include <sys/time.h>
32 #include <sys/sunddi.h>
33 #include <sys/ksynch.h>
34 #include <sys/apic.h>
35 #include <sys/callb.h>
36 #include <sys/clock.h>
37 #include <sys/archsystm.h>
38 #include <sys/cpupart.h>
39 #include <sys/x86_archext.h>
40 #include <sys/prom_debug.h>
41 #include <sys/psm.h>
42 #include <sys/bootconf.h>
43 
44 static int hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags);
45 static boolean_t hpet_install_proxy(void);
46 static boolean_t hpet_callback(int code);
47 static boolean_t hpet_cpr(int code);
48 static boolean_t hpet_resume(void);
49 static void hpet_cst_callback(uint32_t code);
50 static boolean_t hpet_deep_idle_config(int code);
51 static int hpet_validate_table(ACPI_TABLE_HPET *hpet_table);
52 static boolean_t hpet_checksum_table(unsigned char *table, unsigned int len);
53 static void *hpet_memory_map(ACPI_TABLE_HPET *hpet_table);
54 static int hpet_start_main_counter(hpet_info_t *hip);
55 static int hpet_stop_main_counter(hpet_info_t *hip);
56 static uint64_t hpet_read_main_counter_value(hpet_info_t *hip);
57 static uint64_t hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value);
58 static uint64_t hpet_read_gen_cap(hpet_info_t *hip);
59 static uint64_t hpet_read_gen_config(hpet_info_t *hip);
60 static uint64_t hpet_read_gen_intrpt_stat(hpet_info_t *hip);
61 static uint64_t hpet_read_timer_N_config(hpet_info_t *hip, uint_t n);
62 static hpet_TN_conf_cap_t hpet_convert_timer_N_config(uint64_t conf);
63 static void hpet_write_gen_config(hpet_info_t *hip, uint64_t l);
64 static void hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l);
65 static void hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l);
66 static void hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l);
67 static void hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n);
68 static void hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n);
69 static int hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip);
70 static int hpet_timer_available(uint32_t allocated_timers, uint32_t n);
71 static void hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n);
72 static void hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n,
73     uint32_t interrupt);
74 static uint_t hpet_isr(caddr_t, caddr_t);
75 static uint32_t hpet_install_interrupt_handler(avfunc func, int vector);
76 static void hpet_uninstall_interrupt_handler(void);
77 static void hpet_expire_all(void);
78 static boolean_t hpet_guaranteed_schedule(hrtime_t required_wakeup_time);
79 static boolean_t hpet_use_hpet_timer(hrtime_t *expire);
80 static void hpet_use_lapic_timer(hrtime_t expire);
81 static void hpet_init_proxy_data(void);
82 
83 /*
84  * hpet_state_lock is used to synchronize disabling/enabling deep c-states
85  * and to synchronize suspend/resume.
86  */
87 static kmutex_t		hpet_state_lock;
88 static struct hpet_state {
89 	boolean_t	proxy_installed;	/* CBE proxy interrupt setup */
90 	boolean_t	cpr;			/* currently in CPR */
91 	boolean_t	cpu_deep_idle;		/* user enable/disable */
92 	boolean_t	uni_cstate;		/* disable if only one cstate */
93 } hpet_state = { B_FALSE, B_FALSE, B_TRUE, B_TRUE};
94 
95 uint64_t hpet_spin_check = HPET_SPIN_CHECK;
96 uint64_t hpet_spin_timeout = HPET_SPIN_TIMEOUT;
97 uint64_t hpet_idle_spin_timeout = HPET_SPIN_TIMEOUT;
98 uint64_t hpet_isr_spin_timeout = HPET_SPIN_TIMEOUT;
99 
100 static kmutex_t		hpet_proxy_lock;	/* lock for lAPIC proxy data */
101 /*
102  * hpet_proxy_users is a per-cpu array.
103  */
104 static hpet_proxy_t	*hpet_proxy_users;	/* one per CPU */
105 
106 static boolean_t	hpet_early_init_failed;
107 
108 ACPI_TABLE_HPET		*hpet_table;		/* ACPI HPET table */
109 hpet_info_t		hpet_info;		/* Human readable Information */
110 
111 static hrtime_t (*apic_timer_stop_count_fn)(void);
112 static void (*apic_timer_restart_fn)(hrtime_t);
113 
114 /*
115  * Provide HPET access from unix.so.
116  * Set up pointers to access symbols in pcplusmp.
117  */
118 static void
119 hpet_establish_hooks(void)
120 {
121 	hpet.install_proxy = &hpet_install_proxy;
122 	hpet.callback = &hpet_callback;
123 	hpet.use_hpet_timer = &hpet_use_hpet_timer;
124 	hpet.use_lapic_timer = &hpet_use_lapic_timer;
125 }
126 
127 /*
128  * Initialize the HPET early in the boot process if it is both present
129  * and needed to calibrate the TSC. This initializes the HPET enough to
130  * allow the main counter to be read for calibration purposes.
131  *
132  * If the HPET is not needed early in the boot process, but is needed later
133  * by ACPI, this will be called at that time to start the initialization
134  * process.
135  */
136 int
137 hpet_early_init(void)
138 {
139 	extern hrtime_t tsc_read(void);
140 	void		*la;
141 	uint64_t	ret;
142 	uint_t		num_timers;
143 	uint_t		ti;
144 
145 	PRM_POINT("Initializing the HPET...");
146 
147 	/* If we tried and failed, don't try again. */
148 	if (hpet_early_init_failed) {
149 		PRM_POINT("Prior HPET initialization failed, aborting...");
150 		return (DDI_FAILURE);
151 	}
152 
153 	/* No need to initialize again if we already succeeded */
154 	if (hpet.supported >= HPET_TIMER_SUPPORT)
155 		return (DDI_SUCCESS);
156 
157 	(void) memset(&hpet_info, 0, sizeof (hpet_info));
158 	hpet.supported = HPET_NO_SUPPORT;
159 
160 	/*
161 	 * Once called, we assume initialization fails unless we complete all
162 	 * the early init tasks.
163 	 */
164 	hpet_early_init_failed = B_TRUE;
165 
166 	if ((get_hwenv() & HW_XEN_HVM) != 0) {
167 		/*
168 		 * In some AWS EC2 guests, though the HPET is advertised via
169 		 * ACPI, programming the interrupt on the non-legacy timer can
170 		 * result in an immediate reset of the instance.  It is not
171 		 * currently possible to tell whether this is an instance with
172 		 * broken HPET emulation or not, so we simply disable it across
173 		 * the board.
174 		 */
175 		PRM_POINT("will not program HPET in Xen HVM");
176 		return (DDI_FAILURE);
177 	}
178 
179 	/*
180 	 * If there are any HPET tables, we should have mapped and stored
181 	 * the address of the first table while building up the boot
182 	 * properties.
183 	 *
184 	 * Systems with a large numbers of HPET timer blocks may have
185 	 * multiple HPET tables (each HPET table can contain at most 32 timer
186 	 * blocks). Most x86 systems have 1 HPET table with 3 counters (it
187 	 * appears multiple HPET timers was largely seen on Itanium systems).
188 	 * illumos currently only uses the first HPET table, so we do not need
189 	 * to be concerned about additional tables.
190 	 */
191 	if (BOP_GETPROPLEN(bootops, "hpet-table") != 8 ||
192 	    BOP_GETPROP(bootops, "hpet-table", (void *)&hpet_table) != 0) {
193 		cmn_err(CE_NOTE, "!hpet_acpi: unable to get ACPI HPET table");
194 		return (DDI_FAILURE);
195 	}
196 
197 	if (hpet_validate_table(hpet_table) != AE_OK) {
198 		cmn_err(CE_NOTE, "!hpet_acpi: invalid HPET table");
199 		return (DDI_FAILURE);
200 	}
201 
202 	PRM_POINT("hpet_memory_map()");
203 	la = hpet_memory_map(hpet_table);
204 	PRM_DEBUG(la);
205 	if (la == NULL) {
206 		cmn_err(CE_NOTE, "!hpet_acpi: memory map HPET failed");
207 		return (DDI_FAILURE);
208 	}
209 	hpet_info.logical_address = la;
210 
211 	PRM_POINT("hpet_read_gen_cap()");
212 	ret = hpet_read_gen_cap(&hpet_info);
213 	PRM_DEBUG(ret);
214 	hpet_info.gen_cap.counter_clk_period = HPET_GCAP_CNTR_CLK_PERIOD(ret);
215 	hpet_info.gen_cap.vendor_id = HPET_GCAP_VENDOR_ID(ret);
216 	hpet_info.gen_cap.leg_route_cap = HPET_GCAP_LEG_ROUTE_CAP(ret);
217 	hpet_info.gen_cap.count_size_cap = HPET_GCAP_CNT_SIZE_CAP(ret);
218 	/*
219 	 * Hardware contains the last timer's number.
220 	 * Add 1 to get the number of timers.
221 	 */
222 	hpet_info.gen_cap.num_tim_cap = HPET_GCAP_NUM_TIM_CAP(ret) + 1;
223 	hpet_info.gen_cap.rev_id = HPET_GCAP_REV_ID(ret);
224 
225 	if (hpet_info.gen_cap.counter_clk_period > HPET_MAX_CLK_PERIOD) {
226 		cmn_err(CE_NOTE, "!hpet_acpi: COUNTER_CLK_PERIOD 0x%lx > 0x%lx",
227 		    (long)hpet_info.gen_cap.counter_clk_period,
228 		    (long)HPET_MAX_CLK_PERIOD);
229 		return (DDI_FAILURE);
230 	}
231 
232 	num_timers = (uint_t)hpet_info.gen_cap.num_tim_cap;
233 	PRM_DEBUG(num_timers);
234 	if ((num_timers < 3) || (num_timers > 32)) {
235 		cmn_err(CE_NOTE, "!hpet_acpi: invalid number of HPET timers "
236 		    "%lx", (long)num_timers);
237 		return (DDI_FAILURE);
238 	}
239 	hpet_info.timer_n_config = (hpet_TN_conf_cap_t *)kmem_zalloc(
240 	    num_timers * sizeof (uint64_t), KM_SLEEP);
241 
242 	PRM_POINT("hpet_read_gen_config()");
243 	ret = hpet_read_gen_config(&hpet_info);
244 	hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
245 	hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
246 
247 	/*
248 	 * illumos does not use the HPET Legacy Replacement Route capabilities.
249 	 * This feature has been off by default on test systems.
250 	 * The HPET spec does not specify if Legacy Replacement Route is
251 	 * on or off by default, so we explicitly set it off here.
252 	 * It should not matter which mode the HPET is in since we use
253 	 * the first available non-legacy replacement timer: timer 2.
254 	 */
255 	PRM_POINT("hpet_read_gen_config()");
256 	(void) hpet_set_leg_rt_cnf(&hpet_info, 0);
257 
258 	PRM_POINT("hpet_read_gen_config() again");
259 	ret = hpet_read_gen_config(&hpet_info);
260 	hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
261 	hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
262 
263 	hpet_info.gen_intrpt_stat = hpet_read_gen_intrpt_stat(&hpet_info);
264 	hpet_info.main_counter_value = hpet_read_main_counter_value(&hpet_info);
265 
266 	PRM_POINT("disable timer loop...");
267 	for (ti = 0; ti < num_timers; ++ti) {
268 		ret = hpet_read_timer_N_config(&hpet_info, ti);
269 		/*
270 		 * Make sure no timers are enabled (think fast reboot or
271 		 * virtual hardware).
272 		 */
273 		if (ret & HPET_TIMER_N_INT_ENB_CNF_BIT) {
274 			hpet_disable_timer(&hpet_info, ti);
275 			ret &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
276 		}
277 
278 		hpet_info.timer_n_config[ti] = hpet_convert_timer_N_config(ret);
279 	}
280 	PRM_POINT("disable timer loop complete");
281 
282 	/*
283 	 * Be aware the Main Counter may need to be initialized in the future
284 	 * if it is used for more than just Deep C-State support.
285 	 * The HPET's Main Counter does not need to be initialize to a specific
286 	 * value before starting it for use to wake up CPUs from Deep C-States.
287 	 */
288 	PRM_POINT("hpet_start_main_counter()");
289 	if (hpet_start_main_counter(&hpet_info) != AE_OK) {
290 		cmn_err(CE_NOTE, "!hpet_acpi: hpet_start_main_counter failed");
291 		return (DDI_FAILURE);
292 	}
293 
294 	hpet_info.period = hpet_info.gen_cap.counter_clk_period;
295 	/*
296 	 * Read main counter twice to record HPET latency for debugging.
297 	 */
298 	PRM_POINT("TSC and HPET reads:");
299 	hpet_info.tsc[0] = tsc_read();
300 	hpet_info.hpet_main_counter_reads[0] =
301 	    hpet_read_main_counter_value(&hpet_info);
302 	hpet_info.tsc[1] = tsc_read();
303 	hpet_info.hpet_main_counter_reads[1] =
304 	    hpet_read_main_counter_value(&hpet_info);
305 	hpet_info.tsc[2] = tsc_read();
306 
307 	PRM_DEBUG(hpet_info.hpet_main_counter_reads[0]);
308 	PRM_DEBUG(hpet_info.hpet_main_counter_reads[1]);
309 	PRM_DEBUG(hpet_info.tsc[0]);
310 	PRM_DEBUG(hpet_info.tsc[1]);
311 	PRM_DEBUG(hpet_info.tsc[2]);
312 
313 	ret = hpet_read_gen_config(&hpet_info);
314 	hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
315 	hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
316 
317 	/*
318 	 * HPET main counter reads are supported now.
319 	 */
320 	hpet.supported = HPET_TIMER_SUPPORT;
321 	hpet_early_init_failed = B_FALSE;
322 
323 	PRM_POINT("HPET main counter configured for reading...");
324 	return (DDI_SUCCESS);
325 }
326 
327 /*
328  * Called by acpi_init() to set up HPET interrupts and fully initialize the
329  * HPET.
330  */
331 int
332 hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags, hrtime_t (*stop_fn)(void),
333     void (*restart_fn)(hrtime_t))
334 {
335 	extern int	idle_cpu_no_deep_c;
336 	extern int	cpuid_deep_cstates_supported(void);
337 
338 	PRM_POINT("Completing HPET initialization...");
339 
340 	if (hpet_early_init() != DDI_SUCCESS) {
341 		PRM_POINT("Early HPET initialization failed; aborting...");
342 		return (DDI_FAILURE);
343 	}
344 
345 	/*
346 	 * These functions reside in either pcplusmp or apix, and allow
347 	 * the HPET to proxy the LAPIC.
348 	 */
349 	apic_timer_stop_count_fn = stop_fn;
350 	apic_timer_restart_fn = restart_fn;
351 
352 	hpet_establish_hooks();
353 
354 	if (idle_cpu_no_deep_c ||
355 	    !cpuid_deep_cstates_supported() ||
356 	    cpuid_arat_supported()) {
357 		/*
358 		 * If Deep C-States are disabled or not supported, then we do
359 		 * not need to program the HPET at all as it will not
360 		 * subsequently be used.
361 		 *
362 		 * Otherwise we may need the HPET depending on hardware support.
363 		 * If the hardware indicates that LAPIC timers are always
364 		 * active, we won't need to use the HPET proxy timer. Skip
365 		 * programming it in that case as well.
366 		 */
367 		PRM_POINT("no need to program the HPET");
368 		return (DDI_FAILURE);
369 	}
370 
371 	return (hpet_init_proxy(hpet_vect, hpet_flags));
372 }
373 
374 void
375 hpet_acpi_fini(void)
376 {
377 	if (hpet.supported == HPET_NO_SUPPORT)
378 		return;
379 	if (hpet.supported >= HPET_TIMER_SUPPORT)
380 		(void) hpet_stop_main_counter(&hpet_info);
381 	if (hpet.supported > HPET_TIMER_SUPPORT)
382 		hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
383 }
384 
385 /*
386  * Do initial setup to use a HPET timer as a proxy for Deep C-state stalled
387  * LAPIC Timers.  Get a free HPET timer that supports I/O APIC routed interrupt.
388  * Setup data to handle the timer's ISR, and add the timer's interrupt.
389  *
390  * The ddi cannot be use to allocate the HPET timer's interrupt.
391  * ioapic_init_intr() in mp_platform_common() later sets up the I/O APIC
392  * to handle the HPET timer's interrupt.
393  *
394  * Note: FSB (MSI) interrupts are not currently supported by Intel HPETs as of
395  * ICH9.  The HPET spec allows for MSI.  In the future MSI may be prefered.
396  */
397 static int
398 hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags)
399 {
400 	PRM_POINT("hpet_get_IOAPIC_intr_capable_timer()");
401 	if (hpet_get_IOAPIC_intr_capable_timer(&hpet_info) == -1) {
402 		cmn_err(CE_WARN, "!hpet_acpi: get ioapic intr failed.");
403 		return (DDI_FAILURE);
404 	}
405 
406 	hpet_init_proxy_data();
407 
408 	PRM_POINT("hpet_install_interrupt_handler()");
409 	if (hpet_install_interrupt_handler(&hpet_isr,
410 	    hpet_info.cstate_timer.intr) != AE_OK) {
411 		cmn_err(CE_WARN, "!hpet_acpi: install interrupt failed.");
412 		return (DDI_FAILURE);
413 	}
414 	*hpet_vect = hpet_info.cstate_timer.intr;
415 	hpet_flags->intr_el = INTR_EL_LEVEL;
416 	hpet_flags->intr_po = INTR_PO_ACTIVE_HIGH;
417 	hpet_flags->bustype = BUS_PCI;		/*  we *do* conform to PCI */
418 
419 	/*
420 	 * Avoid a possibly stuck interrupt by programing the HPET's timer here
421 	 * before the I/O APIC is programmed to handle this interrupt.
422 	 */
423 	PRM_POINT("hpet_timer_set_up()");
424 	hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
425 	    hpet_info.cstate_timer.intr);
426 	PRM_POINT("back from hpet_timer_set_up()");
427 
428 	/*
429 	 * All HPET functionality is supported.
430 	 */
431 	hpet.supported = HPET_FULL_SUPPORT;
432 	PRM_POINT("HPET full support");
433 	return (DDI_SUCCESS);
434 }
435 
436 /*
437  * Called by kernel if it can support Deep C-States.
438  */
439 static boolean_t
440 hpet_install_proxy(void)
441 {
442 	if (hpet_state.proxy_installed == B_TRUE)
443 		return (B_TRUE);
444 
445 	if (hpet.supported != HPET_FULL_SUPPORT)
446 		return (B_FALSE);
447 
448 	hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
449 	hpet_state.proxy_installed = B_TRUE;
450 
451 	return (B_TRUE);
452 }
453 
454 /*
455  * Remove the interrupt that was added with add_avintr() in
456  * hpet_install_interrupt_handler().
457  */
458 static void
459 hpet_uninstall_interrupt_handler(void)
460 {
461 	rem_avintr(NULL, CBE_HIGH_PIL, &hpet_isr, hpet_info.cstate_timer.intr);
462 }
463 
464 static int
465 hpet_validate_table(ACPI_TABLE_HPET *hpet_table)
466 {
467 	ACPI_TABLE_HEADER	*table_header = (ACPI_TABLE_HEADER *)hpet_table;
468 
469 	if (table_header->Length != sizeof (ACPI_TABLE_HPET)) {
470 		cmn_err(CE_WARN, "!hpet_validate_table: Length %lx != sizeof ("
471 		    "ACPI_TABLE_HPET) %lx.",
472 		    (unsigned long)((ACPI_TABLE_HEADER *)hpet_table)->Length,
473 		    (unsigned long)sizeof (ACPI_TABLE_HPET));
474 		return (AE_ERROR);
475 	}
476 
477 	if (!ACPI_COMPARE_NAME(table_header->Signature, ACPI_SIG_HPET)) {
478 		cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET table "
479 		    "signature");
480 		return (AE_ERROR);
481 	}
482 
483 	if (!hpet_checksum_table((unsigned char *)hpet_table,
484 	    (unsigned int)table_header->Length)) {
485 		cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET checksum");
486 		return (AE_ERROR);
487 	}
488 
489 	/*
490 	 * Sequence should be table number - 1.  We are using table 1.
491 	 */
492 	if (hpet_table->Sequence != HPET_TABLE_1 - 1) {
493 		cmn_err(CE_WARN, "!hpet_validate_table: Invalid Sequence %lx",
494 		    (long)hpet_table->Sequence);
495 		return (AE_ERROR);
496 	}
497 
498 	return (AE_OK);
499 }
500 
501 static boolean_t
502 hpet_checksum_table(unsigned char *table, unsigned int length)
503 {
504 	unsigned char	checksum = 0;
505 	int		i;
506 
507 	for (i = 0; i < length; ++i, ++table)
508 		checksum += *table;
509 
510 	return (checksum == 0);
511 }
512 
513 static void *
514 hpet_memory_map(ACPI_TABLE_HPET *hpet_table)
515 {
516 	return (psm_map_new(hpet_table->Address.Address, (size_t)HPET_SIZE,
517 	    PSM_PROT_WRITE | PSM_PROT_READ));
518 }
519 
520 static int
521 hpet_start_main_counter(hpet_info_t *hip)
522 {
523 	uint64_t	*gcr_ptr;
524 	uint64_t	gcr;
525 
526 	gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
527 	gcr = *gcr_ptr;
528 
529 	gcr |= HPET_GCFR_ENABLE_CNF;
530 	*gcr_ptr = gcr;
531 	gcr = *gcr_ptr;
532 
533 	return (gcr & HPET_GCFR_ENABLE_CNF ? AE_OK : ~AE_OK);
534 }
535 
536 static int
537 hpet_stop_main_counter(hpet_info_t *hip)
538 {
539 	uint64_t	*gcr_ptr;
540 	uint64_t	gcr;
541 
542 	gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
543 	gcr = *gcr_ptr;
544 
545 	gcr &= ~HPET_GCFR_ENABLE_CNF;
546 	*gcr_ptr = gcr;
547 	gcr = *gcr_ptr;
548 
549 	return (gcr & HPET_GCFR_ENABLE_CNF ? ~AE_OK : AE_OK);
550 }
551 
552 boolean_t
553 hpet_timer_is_readable(void)
554 {
555 	return ((hpet.supported >= HPET_TIMER_SUPPORT) ? B_TRUE : B_FALSE);
556 }
557 
558 uint64_t
559 hpet_read_timer(void)
560 {
561 	return (hpet_read_main_counter_value(&hpet_info));
562 }
563 
564 /*
565  * Set the Legacy Replacement Route bit.
566  * This should be called before setting up timers.
567  * The HPET specification is silent regarding setting this after timers are
568  * programmed.
569  */
570 static uint64_t
571 hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value)
572 {
573 	uint64_t gen_conf = hpet_read_gen_config(hip);
574 
575 	switch (new_value) {
576 	case 0:
577 		gen_conf &= ~HPET_GCFR_LEG_RT_CNF;
578 		break;
579 
580 	case HPET_GCFR_LEG_RT_CNF:
581 		gen_conf |= HPET_GCFR_LEG_RT_CNF;
582 		break;
583 
584 	default:
585 		ASSERT(new_value == 0 || new_value == HPET_GCFR_LEG_RT_CNF);
586 		break;
587 	}
588 	hpet_write_gen_config(hip, gen_conf);
589 	return (gen_conf);
590 }
591 
592 static uint64_t
593 hpet_read_gen_cap(hpet_info_t *hip)
594 {
595 	return (*(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address));
596 }
597 
598 static uint64_t
599 hpet_read_gen_config(hpet_info_t *hip)
600 {
601 	return (*(uint64_t *)
602 	    HPET_GEN_CONFIG_ADDRESS(hip->logical_address));
603 }
604 
605 static uint64_t
606 hpet_read_gen_intrpt_stat(hpet_info_t *hip)
607 {
608 	hip->gen_intrpt_stat = *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(
609 	    hip->logical_address);
610 	return (hip->gen_intrpt_stat);
611 }
612 
613 static uint64_t
614 hpet_read_timer_N_config(hpet_info_t *hip, uint_t n)
615 {
616 	uint64_t conf = *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS(
617 	    hip->logical_address, n);
618 	hip->timer_n_config[n] = hpet_convert_timer_N_config(conf);
619 	return (conf);
620 }
621 
622 static hpet_TN_conf_cap_t
623 hpet_convert_timer_N_config(uint64_t conf)
624 {
625 	hpet_TN_conf_cap_t cc = { 0 };
626 
627 	cc.int_route_cap = HPET_TIMER_N_INT_ROUTE_CAP(conf);
628 	cc.fsb_int_del_cap = HPET_TIMER_N_FSB_INT_DEL_CAP(conf);
629 	cc.fsb_int_en_cnf = HPET_TIMER_N_FSB_EN_CNF(conf);
630 	cc.int_route_cnf = HPET_TIMER_N_INT_ROUTE_CNF(conf);
631 	cc.mode32_cnf = HPET_TIMER_N_MODE32_CNF(conf);
632 	cc.val_set_cnf = HPET_TIMER_N_VAL_SET_CNF(conf);
633 	cc.size_cap = HPET_TIMER_N_SIZE_CAP(conf);
634 	cc.per_int_cap = HPET_TIMER_N_PER_INT_CAP(conf);
635 	cc.type_cnf = HPET_TIMER_N_TYPE_CNF(conf);
636 	cc.int_enb_cnf = HPET_TIMER_N_INT_ENB_CNF(conf);
637 	cc.int_type_cnf = HPET_TIMER_N_INT_TYPE_CNF(conf);
638 
639 	return (cc);
640 }
641 
642 static uint64_t
643 hpet_read_main_counter_value(hpet_info_t *hip)
644 {
645 	uint64_t	value;
646 	uint32_t	*counter;
647 	uint32_t	high1, high2, low;
648 
649 	counter = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address);
650 
651 	/*
652 	 * 32-bit main counters
653 	 */
654 	if (hip->gen_cap.count_size_cap == 0) {
655 		value = (uint64_t)*counter;
656 		hip->main_counter_value = value;
657 		return (value);
658 	}
659 
660 	/*
661 	 * HPET spec claims a 64-bit read can be split into two 32-bit reads
662 	 * by the hardware connection to the HPET.
663 	 */
664 	high2 = counter[1];
665 	do {
666 		high1 = high2;
667 		low = counter[0];
668 		high2 = counter[1];
669 	} while (high2 != high1);
670 
671 	value = ((uint64_t)high1 << 32) | low;
672 	hip->main_counter_value = value;
673 	return (value);
674 }
675 
676 static void
677 hpet_write_gen_config(hpet_info_t *hip, uint64_t l)
678 {
679 	*(uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address) = l;
680 }
681 
682 static void
683 hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l)
684 {
685 	*(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(hip->logical_address) = l;
686 }
687 
688 static void
689 hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t conf)
690 {
691 	/*
692 	 * The configuration register size is not affected by the size
693 	 * capability; it is always a 64-bit value.  The top 32-bit half of
694 	 * this register is always read-only so we constrain our write to the
695 	 * bottom half.
696 	 */
697 	uint32_t *confaddr = (uint32_t *)HPET_TIMER_N_CONF_ADDRESS(
698 	    hip->logical_address, n);
699 	uint32_t conf32 = 0xFFFFFFFF & conf;
700 
701 	PRM_DEBUG(n);
702 	PRM_DEBUG(conf);
703 	PRM_DEBUG(conf32);
704 
705 	*confaddr = conf32;
706 
707 	PRM_POINT("write done");
708 }
709 
710 static void
711 hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l)
712 {
713 	*(uint64_t *)HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n) = l;
714 }
715 
716 static void
717 hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n)
718 {
719 	uint64_t l;
720 
721 	l = hpet_read_timer_N_config(hip, timer_n);
722 	l &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
723 	hpet_write_timer_N_config(hip, timer_n, l);
724 }
725 
726 static void
727 hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n)
728 {
729 	uint64_t l;
730 
731 	l = hpet_read_timer_N_config(hip, timer_n);
732 	l |= HPET_TIMER_N_INT_ENB_CNF_BIT;
733 	hpet_write_timer_N_config(hip, timer_n, l);
734 }
735 
736 /*
737  * Add the interrupt handler for I/O APIC interrupt number (interrupt line).
738  *
739  * The I/O APIC line (vector) is programmed in ioapic_init_intr() called
740  * from apic_picinit() psm_ops apic_ops entry point after we return from
741  * apic_init() psm_ops entry point.
742  */
743 static uint32_t
744 hpet_install_interrupt_handler(avfunc func, int vector)
745 {
746 	uint32_t retval;
747 
748 	retval = add_avintr(NULL, CBE_HIGH_PIL, func, "HPET Timer",
749 	    vector, NULL, NULL, NULL, NULL);
750 	if (retval == 0) {
751 		cmn_err(CE_WARN, "!hpet_acpi: add_avintr() failed");
752 		return (AE_BAD_PARAMETER);
753 	}
754 	return (AE_OK);
755 }
756 
757 /*
758  * The HPET timers specify which I/O APIC interrupts they can be routed to.
759  * Find the first available non-legacy-replacement timer and its I/O APIC irq.
760  * Supported I/O APIC IRQs are specified in the int_route_cap bitmap in each
761  * timer's timer_n_config register.
762  */
763 static int
764 hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip)
765 {
766 	int timer;
767 	int intr;
768 
769 	for (timer = HPET_FIRST_NON_LEGACY_TIMER;
770 	    timer < hip->gen_cap.num_tim_cap; ++timer) {
771 		if (!hpet_timer_available(hip->allocated_timers, timer))
772 			continue;
773 
774 		intr = lowbit(hip->timer_n_config[timer].int_route_cap) - 1;
775 
776 		PRM_DEBUG(timer);
777 		PRM_DEBUG(intr);
778 
779 		if (intr >= 0) {
780 			hpet_timer_alloc(&hip->allocated_timers, timer);
781 			hip->cstate_timer.timer = timer;
782 			hip->cstate_timer.intr = intr;
783 			return (timer);
784 		}
785 	}
786 
787 	return (-1);
788 }
789 
/*
 * Mark timer n as used by setting bit n of *allocated_timers.
 *
 * n must be less than 32.  The shift is done on an unsigned value so that
 * timer 31, which is legal when the HPET implements 32 timers, does not
 * shift into the sign bit of an int.
 */
static void
hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n)
{
	*allocated_timers |= (uint32_t)1 << n;
}
798 
/*
 * Check if timer n is available, i.e. bit n of allocated_timers is clear.
 * Returns non-zero if the timer is free and 0 if it is already allocated.
 * No mutual exclusion because only one thread uses this.
 *
 * n must be less than 32.  The shift is done on an unsigned value so that
 * timer 31, which is legal when the HPET implements 32 timers, does not
 * shift into the sign bit of an int.
 */
static int
hpet_timer_available(uint32_t allocated_timers, uint32_t n)
{
	return ((allocated_timers & ((uint32_t)1 << n)) == 0);
}
808 
809 /*
810  * Setup timer N to route its interrupt to I/O APIC.
811  */
812 static void
813 hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, uint32_t interrupt)
814 {
815 	uint64_t conf;
816 
817 	PRM_DEBUG(timer_n);
818 	PRM_DEBUG(interrupt);
819 
820 	PRM_POINT("hpet_read_timer_N_config()");
821 	conf = hpet_read_timer_N_config(hip, timer_n);
822 	PRM_DEBUG(conf);
823 
824 	/*
825 	 * Caller is required to verify this interrupt route is supported.
826 	 */
827 	ASSERT(HPET_TIMER_N_INT_ROUTE_CAP(conf) & (1 << interrupt));
828 
829 	conf &= ~HPET_TIMER_N_FSB_EN_CNF_BIT;	/* use IOAPIC */
830 	conf |= HPET_TIMER_N_INT_ROUTE_SHIFT(interrupt);
831 	conf &= ~HPET_TIMER_N_TYPE_CNF_BIT;	/* non periodic */
832 	conf &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;	/* disabled */
833 	conf |= HPET_TIMER_N_INT_TYPE_CNF_BIT;	/* Level Triggered */
834 
835 	PRM_POINT("hpet_write_timer_N_config()");
836 	PRM_DEBUG(conf);
837 	hpet_write_timer_N_config(hip, timer_n, conf);
838 	PRM_POINT("back from hpet_write_timer_N_config()");
839 }
840 
841 /*
842  * The HPET's Main Counter is not stopped before programming an HPET timer.
843  * This will allow the HPET to be used as a time source.
844  * The programmed timer interrupt may occur before this function returns.
845  * Callers must block interrupts before calling this function if they must
846  * guarantee the interrupt is handled after this function returns.
847  *
848  * Return 0 if main counter is less than timer after enabling timer.
849  * The interrupt was programmed, but it may fire before this returns.
850  * Return !0 if main counter is greater than timer after enabling timer.
851  * In other words: the timer will not fire, and we do not know if it did fire.
852  *
853  * delta is in HPET ticks.
854  *
855  * Writing a 64-bit value to a 32-bit register will "wrap around".
856  * A 32-bit HPET timer will wrap around in a little over 5 minutes.
857  */
858 int
859 hpet_timer_program(hpet_info_t *hip, uint32_t timer, uint64_t delta)
860 {
861 	uint64_t time, program;
862 
863 	program = hpet_read_main_counter_value(hip);
864 	program += delta;
865 	hpet_write_timer_N_comp(hip, timer, program);
866 
867 	time = hpet_read_main_counter_value(hip);
868 	if (time < program)
869 		return (AE_OK);
870 
871 	return (AE_TIME);
872 }
873 
874 /*
875  * CPR and power policy-change callback entry point.
876  */
877 boolean_t
878 hpet_callback(int code)
879 {
880 	switch (code) {
881 	case PM_DEFAULT_CPU_DEEP_IDLE:
882 		/*FALLTHROUGH*/
883 	case PM_ENABLE_CPU_DEEP_IDLE:
884 		/*FALLTHROUGH*/
885 	case PM_DISABLE_CPU_DEEP_IDLE:
886 		return (hpet_deep_idle_config(code));
887 
888 	case CB_CODE_CPR_RESUME:
889 		/*FALLTHROUGH*/
890 	case CB_CODE_CPR_CHKPT:
891 		return (hpet_cpr(code));
892 
893 	case CST_EVENT_MULTIPLE_CSTATES:
894 		hpet_cst_callback(CST_EVENT_MULTIPLE_CSTATES);
895 		return (B_TRUE);
896 
897 	case CST_EVENT_ONE_CSTATE:
898 		hpet_cst_callback(CST_EVENT_ONE_CSTATE);
899 		return (B_TRUE);
900 
901 	default:
902 		cmn_err(CE_NOTE, "!hpet_callback: invalid code %d\n", code);
903 		return (B_FALSE);
904 	}
905 }
906 
907 /*
908  * According to the HPET spec 1.0a: the Operating System must save and restore
909  * HPET event timer hardware context through ACPI sleep state transitions.
910  * Timer registers (including the main counter) may not be preserved through
911  * ACPI S3, S4, or S5 sleep states.  This code does not not support S1 nor S2.
912  *
913  * Current HPET state is already in hpet.supported and
914  * hpet_state.proxy_installed.  hpet_info contains the proxy interrupt HPET
915  * Timer state.
916  *
917  * Future projects beware: the HPET Main Counter is undefined after ACPI S3 or
918  * S4, and it is not saved/restored here.  Future projects cannot expect the
919  * Main Counter to be monotomically (or accurately) increasing across CPR.
920  *
921  * Note: the CPR Checkpoint path later calls pause_cpus() which ensures all
922  * CPUs are awake and in a spin loop before the system suspends.  The HPET is
923  * not needed for Deep C-state wakeup when CPUs are in cpu_pause().
924  * It is safe to leave the HPET running as the system suspends; we just
925  * disable the timer from generating interrupts here.
926  */
927 static boolean_t
928 hpet_cpr(int code)
929 {
930 	ulong_t		intr, dead_count = 0;
931 	hrtime_t	dead = gethrtime() + hpet_spin_timeout;
932 	boolean_t	ret = B_TRUE;
933 
934 	mutex_enter(&hpet_state_lock);
935 	switch (code) {
936 	case CB_CODE_CPR_CHKPT:
937 		if (hpet_state.proxy_installed == B_FALSE)
938 			break;
939 
940 		hpet_state.cpr = B_TRUE;
941 
942 		intr = intr_clear();
943 		while (!mutex_tryenter(&hpet_proxy_lock)) {
944 			/*
945 			 * spin
946 			 */
947 			intr_restore(intr);
948 			if (dead_count++ > hpet_spin_check) {
949 				dead_count = 0;
950 				if (gethrtime() > dead) {
951 					hpet_state.cpr = B_FALSE;
952 					mutex_exit(&hpet_state_lock);
953 					cmn_err(CE_NOTE, "!hpet_cpr: deadman");
954 					return (B_FALSE);
955 				}
956 			}
957 			intr = intr_clear();
958 		}
959 		hpet_expire_all();
960 		mutex_exit(&hpet_proxy_lock);
961 		intr_restore(intr);
962 
963 		hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
964 		break;
965 
966 	case CB_CODE_CPR_RESUME:
967 		if (hpet_resume() == B_TRUE)
968 			hpet_state.cpr = B_FALSE;
969 		else
970 			cmn_err(CE_NOTE, "!hpet_resume failed.");
971 		break;
972 
973 	default:
974 		cmn_err(CE_NOTE, "!hpet_cpr: invalid code %d\n", code);
975 		ret = B_FALSE;
976 		break;
977 	}
978 	mutex_exit(&hpet_state_lock);
979 	return (ret);
980 }
981 
982 /*
983  * Assume the HPET stopped in Suspend state and timer state was lost.
984  */
985 static boolean_t
986 hpet_resume(void)
987 {
988 	if (hpet.supported != HPET_TIMER_SUPPORT)
989 		return (B_TRUE);
990 
991 	/*
992 	 * The HPET spec does not specify if Legacy Replacement Route is
993 	 * on or off by default, so we set it off here.
994 	 */
995 	(void) hpet_set_leg_rt_cnf(&hpet_info, 0);
996 
997 	if (hpet_start_main_counter(&hpet_info) != AE_OK) {
998 		cmn_err(CE_NOTE, "!hpet_resume: start main counter failed");
999 		hpet.supported = HPET_NO_SUPPORT;
1000 		if (hpet_state.proxy_installed == B_TRUE) {
1001 			hpet_state.proxy_installed = B_FALSE;
1002 			hpet_uninstall_interrupt_handler();
1003 		}
1004 		return (B_FALSE);
1005 	}
1006 
1007 	if (hpet_state.proxy_installed == B_FALSE)
1008 		return (B_TRUE);
1009 
1010 	hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
1011 	    hpet_info.cstate_timer.intr);
1012 	if (hpet_state.cpu_deep_idle == B_TRUE)
1013 		hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
1014 
1015 	return (B_TRUE);
1016 }
1017 
1018 /*
1019  * Callback to enable/disable Deep C-States based on power.conf setting.
1020  */
1021 static boolean_t
1022 hpet_deep_idle_config(int code)
1023 {
1024 	ulong_t		intr, dead_count = 0;
1025 	hrtime_t	dead = gethrtime() + hpet_spin_timeout;
1026 	boolean_t	ret = B_TRUE;
1027 
1028 	mutex_enter(&hpet_state_lock);
1029 	switch (code) {
1030 	case PM_DEFAULT_CPU_DEEP_IDLE:
1031 		/*FALLTHROUGH*/
1032 	case PM_ENABLE_CPU_DEEP_IDLE:
1033 
1034 		if (hpet_state.cpu_deep_idle == B_TRUE)
1035 			break;
1036 
1037 		if (hpet_state.proxy_installed == B_FALSE) {
1038 			ret = B_FALSE;  /* Deep C-States not supported */
1039 			break;
1040 		}
1041 
1042 		hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
1043 		hpet_state.cpu_deep_idle = B_TRUE;
1044 		break;
1045 
1046 	case PM_DISABLE_CPU_DEEP_IDLE:
1047 
1048 		if ((hpet_state.cpu_deep_idle == B_FALSE) ||
1049 		    (hpet_state.proxy_installed == B_FALSE))
1050 			break;
1051 
1052 		/*
1053 		 * The order of these operations is important to avoid
1054 		 * lost wakeups: Set a flag to refuse all future LAPIC Timer
1055 		 * proxy requests, then wake up all CPUs from deep C-state,
1056 		 * and finally disable the HPET interrupt-generating timer.
1057 		 */
1058 		hpet_state.cpu_deep_idle = B_FALSE;
1059 
1060 		intr = intr_clear();
1061 		while (!mutex_tryenter(&hpet_proxy_lock)) {
1062 			/*
1063 			 * spin
1064 			 */
1065 			intr_restore(intr);
1066 			if (dead_count++ > hpet_spin_check) {
1067 				dead_count = 0;
1068 				if (gethrtime() > dead) {
1069 					hpet_state.cpu_deep_idle = B_TRUE;
1070 					mutex_exit(&hpet_state_lock);
1071 					cmn_err(CE_NOTE,
1072 					    "!hpet_deep_idle_config: deadman");
1073 					return (B_FALSE);
1074 				}
1075 			}
1076 			intr = intr_clear();
1077 		}
1078 		hpet_expire_all();
1079 		mutex_exit(&hpet_proxy_lock);
1080 		intr_restore(intr);
1081 
1082 		hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
1083 		break;
1084 
1085 	default:
1086 		cmn_err(CE_NOTE, "!hpet_deep_idle_config: invalid code %d\n",
1087 		    code);
1088 		ret = B_FALSE;
1089 		break;
1090 	}
1091 	mutex_exit(&hpet_state_lock);
1092 
1093 	return (ret);
1094 }
1095 
1096 /*
1097  * Callback for _CST c-state change notifications.
1098  */
1099 static void
1100 hpet_cst_callback(uint32_t code)
1101 {
1102 	ulong_t		intr, dead_count = 0;
1103 	hrtime_t	dead = gethrtime() + hpet_spin_timeout;
1104 
1105 	switch (code) {
1106 	case CST_EVENT_ONE_CSTATE:
1107 		hpet_state.uni_cstate = B_TRUE;
1108 		intr = intr_clear();
1109 		while (!mutex_tryenter(&hpet_proxy_lock)) {
1110 			/*
1111 			 * spin
1112 			 */
1113 			intr_restore(intr);
1114 			if (dead_count++ > hpet_spin_check) {
1115 				dead_count = 0;
1116 				if (gethrtime() > dead) {
1117 					hpet_expire_all();
1118 					cmn_err(CE_NOTE,
1119 					    "!hpet_cst_callback: deadman");
1120 					return;
1121 				}
1122 			}
1123 			intr = intr_clear();
1124 		}
1125 		hpet_expire_all();
1126 		mutex_exit(&hpet_proxy_lock);
1127 		intr_restore(intr);
1128 		break;
1129 
1130 	case CST_EVENT_MULTIPLE_CSTATES:
1131 		hpet_state.uni_cstate = B_FALSE;
1132 		break;
1133 
1134 	default:
1135 		cmn_err(CE_NOTE, "!hpet_cst_callback: invalid code %d\n", code);
1136 		break;
1137 	}
1138 }
1139 
1140 /*
1141  * Interrupt Service Routine for HPET I/O-APIC-generated interrupts.
1142  * Used to wakeup CPUs from Deep C-state when their Local APIC Timer stops.
1143  * This ISR runs on one CPU which pokes other CPUs out of Deep C-state as
1144  * needed.
1145  */
1146 static uint_t
1147 hpet_isr(caddr_t arg __unused, caddr_t arg1 __unused)
1148 {
1149 	uint64_t	timer_status;
1150 	uint64_t	timer_mask;
1151 	ulong_t		intr, dead_count = 0;
1152 	hrtime_t	dead = gethrtime() + hpet_isr_spin_timeout;
1153 
1154 	timer_mask = HPET_INTR_STATUS_MASK(hpet_info.cstate_timer.timer);
1155 
1156 	/*
1157 	 * We are using a level-triggered interrupt.
1158 	 * HPET sets timer's General Interrupt Status Register bit N.
1159 	 * ISR checks this bit to see if it needs servicing.
1160 	 * ISR then clears this bit by writing 1 to that bit.
1161 	 */
1162 	timer_status = hpet_read_gen_intrpt_stat(&hpet_info);
1163 	if (!(timer_status & timer_mask))
1164 		return (DDI_INTR_UNCLAIMED);
1165 	hpet_write_gen_intrpt_stat(&hpet_info, timer_mask);
1166 
1167 	/*
1168 	 * Do not touch ISR data structures before checking the HPET's General
1169 	 * Interrupt Status register.  The General Interrupt Status register
1170 	 * will not be set by hardware until after timer interrupt generation
1171 	 * is enabled by software.  Software allocates necessary data
1172 	 * structures before enabling timer interrupts.  ASSERT the software
1173 	 * data structures required to handle this interrupt are initialized.
1174 	 */
1175 	ASSERT(hpet_proxy_users != NULL);
1176 
1177 	/*
1178 	 * CPUs in deep c-states do not enable interrupts until after
1179 	 * performing idle cleanup which includes descheduling themselves from
1180 	 * the HPET.  The CPU running this ISR will NEVER find itself in the
1181 	 * proxy list.  A lost wakeup may occur if this is false.
1182 	 */
1183 	ASSERT(hpet_proxy_users[CPU->cpu_id] == HPET_INFINITY);
1184 
1185 	/*
1186 	 * Higher level interrupts may deadlock with CPUs going idle if this
1187 	 * ISR is prempted while holding hpet_proxy_lock.
1188 	 */
1189 	intr = intr_clear();
1190 	while (!mutex_tryenter(&hpet_proxy_lock)) {
1191 		/*
1192 		 * spin
1193 		 */
1194 		intr_restore(intr);
1195 		if (dead_count++ > hpet_spin_check) {
1196 			dead_count = 0;
1197 			if (gethrtime() > dead) {
1198 				hpet_expire_all();
1199 				return (DDI_INTR_CLAIMED);
1200 			}
1201 		}
1202 		intr = intr_clear();
1203 	}
1204 	(void) hpet_guaranteed_schedule(HPET_INFINITY);
1205 	mutex_exit(&hpet_proxy_lock);
1206 	intr_restore(intr);
1207 
1208 	return (DDI_INTR_CLAIMED);
1209 }
1210 
1211 /*
1212  * Used when disabling the HPET Timer interrupt.  CPUs in Deep C-state must be
1213  * woken up because they can no longer rely on the HPET's Timer to wake them.
1214  * We do not need to wait for CPUs to wakeup.
1215  */
1216 static void
1217 hpet_expire_all(void)
1218 {
1219 	processorid_t	id;
1220 
1221 	for (id = 0; id < max_ncpus; ++id) {
1222 		if (hpet_proxy_users[id] != HPET_INFINITY) {
1223 			hpet_proxy_users[id] = HPET_INFINITY;
1224 			if (id != CPU->cpu_id)
1225 				poke_cpu(id);
1226 		}
1227 	}
1228 }
1229 
1230 /*
1231  * To avoid missed wakeups this function must guarantee either the HPET timer
1232  * was successfully programmed to the next expire time or there are no waiting
1233  * CPUs.
1234  *
1235  * Callers cannot enter C2 or deeper if the HPET could not be programmed to
1236  * generate its next interrupt to happen at required_wakeup_time or sooner.
1237  * Returns B_TRUE if the HPET was programmed to interrupt by
1238  * required_wakeup_time, B_FALSE if not.
1239  */
1240 static boolean_t
1241 hpet_guaranteed_schedule(hrtime_t required_wakeup_time)
1242 {
1243 	hrtime_t	now, next_proxy_time;
1244 	processorid_t	id, next_proxy_id;
1245 	int		proxy_timer = hpet_info.cstate_timer.timer;
1246 	boolean_t	done = B_FALSE;
1247 
1248 	ASSERT(mutex_owned(&hpet_proxy_lock));
1249 
1250 	/*
1251 	 * Loop until we successfully program the HPET,
1252 	 * or no CPUs are scheduled to use the HPET as a proxy.
1253 	 */
1254 	do {
1255 		/*
1256 		 * Wake all CPUs that expired before now.
1257 		 * Find the next CPU to wake up and next HPET program time.
1258 		 */
1259 		now = gethrtime();
1260 		next_proxy_time = HPET_INFINITY;
1261 		next_proxy_id = CPU->cpu_id;
1262 		for (id = 0; id < max_ncpus; ++id) {
1263 			if (hpet_proxy_users[id] < now) {
1264 				hpet_proxy_users[id] = HPET_INFINITY;
1265 				if (id != CPU->cpu_id)
1266 					poke_cpu(id);
1267 			} else if (hpet_proxy_users[id] < next_proxy_time) {
1268 				next_proxy_time = hpet_proxy_users[id];
1269 				next_proxy_id = id;
1270 			}
1271 		}
1272 
1273 		if (next_proxy_time == HPET_INFINITY) {
1274 			done = B_TRUE;
1275 			/*
1276 			 * There are currently no CPUs using the HPET's Timer
1277 			 * as a proxy for their LAPIC Timer.  The HPET's Timer
1278 			 * does not need to be programmed.
1279 			 *
1280 			 * Letting the HPET timer wrap around to the current
1281 			 * time is the longest possible timeout.
1282 			 * A 64-bit timer will wrap around in ~ 2^44 seconds.
1283 			 * A 32-bit timer will wrap around in ~ 2^12 seconds.
1284 			 *
1285 			 * Disabling the HPET's timer interrupt requires a
1286 			 * (relatively expensive) write to the HPET.
1287 			 * Instead we do nothing.
1288 			 *
1289 			 * We are gambling some CPU will attempt to enter a
1290 			 * deep c-state before the timer wraps around.
1291 			 * We assume one spurious interrupt in a little over an
1292 			 * hour has less performance impact than writing to the
1293 			 * HPET's timer disable bit every time all CPUs wakeup
1294 			 * from deep c-state.
1295 			 */
1296 
1297 		} else {
1298 			/*
1299 			 * Idle CPUs disable interrupts before programming the
1300 			 * HPET to prevent a lost wakeup if the HPET
1301 			 * interrupts the idle cpu before it can enter a
1302 			 * Deep C-State.
1303 			 */
1304 			if (hpet_timer_program(&hpet_info, proxy_timer,
1305 			    HRTIME_TO_HPET_TICKS(next_proxy_time - gethrtime()))
1306 			    != AE_OK) {
1307 				/*
1308 				 * We could not program the HPET to wakeup the
1309 				 * next CPU.  We must wake the CPU ourself to
1310 				 * avoid a lost wakeup.
1311 				 */
1312 				hpet_proxy_users[next_proxy_id] = HPET_INFINITY;
1313 				if (next_proxy_id != CPU->cpu_id)
1314 					poke_cpu(next_proxy_id);
1315 			} else {
1316 				done = B_TRUE;
1317 			}
1318 		}
1319 
1320 	} while (!done);
1321 
1322 	return (next_proxy_time <= required_wakeup_time);
1323 }
1324 
1325 /*
1326  * Use an HPET timer to act as this CPU's proxy local APIC timer.
1327  * Used in deep c-states C2 and above while the CPU's local APIC timer stalls.
1328  * Called by the idle thread with interrupts enabled.
1329  * Always returns with interrupts disabled.
1330  *
1331  * There are 3 possible outcomes from this function:
1332  * 1. The Local APIC Timer was already disabled before this function was called.
1333  *	LAPIC TIMER	: disabled
1334  *	HPET		: not scheduled to wake this CPU
1335  *	*lapic_expire	: (hrtime_t)HPET_INFINITY
1336  *	Returns		: B_TRUE
1337  * 2. Successfully programmed the HPET to act as a LAPIC Timer proxy.
1338  *	LAPIC TIMER	: disabled
1339  *	HPET		: scheduled to wake this CPU
1340  *	*lapic_expire	: hrtime_t when LAPIC timer would have expired
1341  *	Returns		: B_TRUE
1342  * 3. Failed to programmed the HPET to act as a LAPIC Timer proxy.
1343  *	LAPIC TIMER	: enabled
1344  *	HPET		: not scheduled to wake this CPU
1345  *	*lapic_expire	: (hrtime_t)HPET_INFINITY
1346  *	Returns		: B_FALSE
1347  *
1348  * The idle thread cannot enter Deep C-State in case 3.
1349  * The idle thread must re-enable & re-program the LAPIC_TIMER in case 2.
1350  */
1351 static boolean_t
1352 hpet_use_hpet_timer(hrtime_t *lapic_expire)
1353 {
1354 	hrtime_t	now, expire, dead;
1355 	uint64_t	lapic_count, dead_count;
1356 	cpupart_t	*cpu_part;
1357 	processorid_t	cpu_sid;
1358 	processorid_t	cpu_id = CPU->cpu_id;
1359 	processorid_t	id;
1360 	boolean_t	rslt;
1361 	boolean_t	hset_update;
1362 
1363 	cpu_part = CPU->cpu_part;
1364 	cpu_sid = CPU->cpu_seqid;
1365 
1366 	ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
1367 
1368 	/*
1369 	 * A critical section exists between when the HPET is programmed
1370 	 * to interrupt the CPU and when this CPU enters an idle state.
1371 	 * Interrupts must be blocked during that time to prevent lost
1372 	 * CBE wakeup interrupts from either LAPIC or HPET.
1373 	 *
1374 	 * Must block interrupts before acquiring hpet_proxy_lock to prevent
1375 	 * a deadlock with the ISR if the ISR runs on this CPU after the
1376 	 * idle thread acquires the mutex but before it clears interrupts.
1377 	 */
1378 	ASSERT(!interrupts_enabled());
1379 	lapic_count = apic_timer_stop_count_fn();
1380 	now = gethrtime();
1381 	dead = now + hpet_idle_spin_timeout;
1382 	*lapic_expire = expire = now + lapic_count;
1383 	if (lapic_count == (hrtime_t)-1) {
1384 		/*
1385 		 * LAPIC timer is currently disabled.
1386 		 * Will not use the HPET as a LAPIC Timer proxy.
1387 		 */
1388 		*lapic_expire = (hrtime_t)HPET_INFINITY;
1389 		return (B_TRUE);
1390 	}
1391 
1392 	/*
1393 	 * Serialize hpet_proxy data structure manipulation.
1394 	 */
1395 	dead_count = 0;
1396 	while (!mutex_tryenter(&hpet_proxy_lock)) {
1397 		/*
1398 		 * spin
1399 		 */
1400 		apic_timer_restart_fn(expire);
1401 		sti();
1402 		cli();
1403 
1404 		if (dead_count++ > hpet_spin_check) {
1405 			dead_count = 0;
1406 			hset_update = (((CPU->cpu_flags & CPU_OFFLINE) == 0) &&
1407 			    (ncpus > 1));
1408 			if (hset_update &&
1409 			    !bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
1410 				*lapic_expire = (hrtime_t)HPET_INFINITY;
1411 				return (B_FALSE);
1412 			}
1413 		}
1414 
1415 		lapic_count = apic_timer_stop_count_fn();
1416 		now = gethrtime();
1417 		*lapic_expire = expire = now + lapic_count;
1418 		if (lapic_count == (hrtime_t)-1) {
1419 			/*
1420 			 * LAPIC timer is currently disabled.
1421 			 * Will not use the HPET as a LAPIC Timer proxy.
1422 			 */
1423 			*lapic_expire = (hrtime_t)HPET_INFINITY;
1424 			return (B_TRUE);
1425 		}
1426 		if (now > dead) {
1427 			apic_timer_restart_fn(expire);
1428 			*lapic_expire = (hrtime_t)HPET_INFINITY;
1429 			return (B_FALSE);
1430 		}
1431 	}
1432 
1433 	if ((hpet_state.cpr == B_TRUE) ||
1434 	    (hpet_state.cpu_deep_idle == B_FALSE) ||
1435 	    (hpet_state.proxy_installed == B_FALSE) ||
1436 	    (hpet_state.uni_cstate == B_TRUE)) {
1437 		mutex_exit(&hpet_proxy_lock);
1438 		apic_timer_restart_fn(expire);
1439 		*lapic_expire = (hrtime_t)HPET_INFINITY;
1440 		return (B_FALSE);
1441 	}
1442 
1443 	hpet_proxy_users[cpu_id] = expire;
1444 
1445 	/*
1446 	 * We are done if another cpu is scheduled on the HPET with an
1447 	 * expire time before us.  The next HPET interrupt has been programmed
1448 	 * to fire before our expire time.
1449 	 */
1450 	for (id = 0; id < max_ncpus; ++id) {
1451 		if ((hpet_proxy_users[id] <= expire) && (id != cpu_id)) {
1452 			mutex_exit(&hpet_proxy_lock);
1453 			return (B_TRUE);
1454 		}
1455 	}
1456 
1457 	/*
1458 	 * We are the next lAPIC to expire.
1459 	 * Program the HPET with our expire time.
1460 	 */
1461 	rslt = hpet_guaranteed_schedule(expire);
1462 	mutex_exit(&hpet_proxy_lock);
1463 
1464 	if (rslt == B_FALSE) {
1465 		apic_timer_restart_fn(expire);
1466 		*lapic_expire = (hrtime_t)HPET_INFINITY;
1467 	}
1468 
1469 	return (rslt);
1470 }
1471 
1472 /*
1473  * Called by the idle thread when waking up from Deep C-state before enabling
1474  * interrupts.  With an array data structure it is faster to always remove
1475  * ourself from the array without checking if the HPET ISR already removed.
1476  *
1477  * We use a lazy algorithm for removing CPUs from the HPET's schedule.
1478  * We do not reprogram the HPET here because this CPU has real work to do.
1479  * On a idle system the CPU was probably woken up by the HPET's ISR.
1480  * On a heavily loaded system CPUs are not going into Deep C-state.
1481  * On a moderately loaded system another CPU will usually enter Deep C-state
1482  * and reprogram the HPET before the HPET fires with our wakeup.
1483  */
1484 static void
1485 hpet_use_lapic_timer(hrtime_t expire)
1486 {
1487 	processorid_t	cpu_id = CPU->cpu_id;
1488 
1489 	ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
1490 	ASSERT(!interrupts_enabled());
1491 
1492 	hpet_proxy_users[cpu_id] = HPET_INFINITY;
1493 
1494 	/*
1495 	 * Do not enable a LAPIC Timer that was initially disabled.
1496 	 */
1497 	if (expire != HPET_INFINITY)
1498 		apic_timer_restart_fn(expire);
1499 }
1500 
1501 /*
1502  * Initialize data structure to keep track of CPUs using HPET as a proxy for
1503  * their stalled local APIC timer.  For now this is just an array.
1504  */
1505 static void
1506 hpet_init_proxy_data(void)
1507 {
1508 	processorid_t	id;
1509 
1510 	/*
1511 	 * Use max_ncpus for hot plug compliance.
1512 	 */
1513 	hpet_proxy_users = kmem_zalloc(max_ncpus * sizeof (*hpet_proxy_users),
1514 	    KM_SLEEP);
1515 
1516 	/*
1517 	 * Unused entries always contain HPET_INFINITY.
1518 	 */
1519 	for (id = 0; id < max_ncpus; ++id)
1520 		hpet_proxy_users[id] = HPET_INFINITY;
1521 }
1522