xref: /illumos-gate/usr/src/uts/i86pc/io/hpet_acpi.c (revision 2cbc828d66d6088914f685ae0e77169ecc9fc7f9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2020 Oxide Computer Company
24  */
25 
26 #include <sys/hpet_acpi.h>
27 #include <sys/hpet.h>
28 #include <sys/bitmap.h>
29 #include <sys/inttypes.h>
30 #include <sys/time.h>
31 #include <sys/sunddi.h>
32 #include <sys/ksynch.h>
33 #include <sys/apic.h>
34 #include <sys/callb.h>
35 #include <sys/clock.h>
36 #include <sys/archsystm.h>
37 #include <sys/cpupart.h>
38 #include <sys/x86_archext.h>
39 #include <sys/prom_debug.h>
40 
/*
 * Forward declarations for the file-scope (static) functions of this file,
 * so that they can be referenced before their definitions.
 */
static int hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags);
static boolean_t hpet_install_proxy(void);
static boolean_t hpet_callback(int code);
static boolean_t hpet_cpr(int code);
static boolean_t hpet_resume(void);
static void hpet_cst_callback(uint32_t code);
static boolean_t hpet_deep_idle_config(int code);
static int hpet_validate_table(ACPI_TABLE_HPET *hpet_table);
static boolean_t hpet_checksum_table(unsigned char *table, unsigned int len);
static void *hpet_memory_map(ACPI_TABLE_HPET *hpet_table);
static int hpet_start_main_counter(hpet_info_t *hip);
static int hpet_stop_main_counter(hpet_info_t *hip);
static uint64_t hpet_read_main_counter_value(hpet_info_t *hip);
static uint64_t hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value);
static uint64_t hpet_read_gen_cap(hpet_info_t *hip);
static uint64_t hpet_read_gen_config(hpet_info_t *hip);
static uint64_t hpet_read_gen_intrpt_stat(hpet_info_t *hip);
static uint64_t hpet_read_timer_N_config(hpet_info_t *hip, uint_t n);
static hpet_TN_conf_cap_t hpet_convert_timer_N_config(uint64_t conf);
static void hpet_write_gen_config(hpet_info_t *hip, uint64_t l);
static void hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l);
static void hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l);
static void hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l);
static void hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n);
static void hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n);
static int hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip);
static int hpet_timer_available(uint32_t allocated_timers, uint32_t n);
static void hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n);
static void hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n,
    uint32_t interrupt);
static uint_t hpet_isr(caddr_t, caddr_t);
static uint32_t hpet_install_interrupt_handler(avfunc func, int vector);
static void hpet_uninstall_interrupt_handler(void);
static void hpet_expire_all(void);
static boolean_t hpet_guaranteed_schedule(hrtime_t required_wakeup_time);
static boolean_t hpet_use_hpet_timer(hrtime_t *expire);
static void hpet_use_lapic_timer(hrtime_t expire);
static void hpet_init_proxy_data(void);
79 
/*
 * hpet_state_lock is used to synchronize disabling/enabling deep c-states
 * and to synchronize suspend/resume.
 *
 * hpet_state holds the driver's software state flags.  The positional
 * initializer below sets them, in field order, to: proxy not installed,
 * not in CPR, deep idle enabled, and uni_cstate set.
 */
static kmutex_t		hpet_state_lock;
static struct hpet_state {
	boolean_t	proxy_installed;	/* CBE proxy interrupt setup */
	boolean_t	cpr;			/* currently in CPR */
	boolean_t	cpu_deep_idle;		/* user enable/disable */
	boolean_t	uni_cstate;		/* disable if only one cstate */
} hpet_state = { B_FALSE, B_FALSE, B_TRUE, B_TRUE};
91 
/*
 * Tunables for the lock-acquisition spin loops in this file.
 *
 * hpet_spin_check is the number of failed mutex_tryenter() iterations a
 * spin loop performs between checks of its deadline.
 * hpet_spin_timeout is added to gethrtime() to form that deadline.
 * NOTE(review): hpet_idle_spin_timeout and hpet_isr_spin_timeout are not
 * referenced in the visible part of this file; presumably they bound the
 * idle-path and ISR spin loops respectively -- confirm against their users.
 */
uint64_t hpet_spin_check = HPET_SPIN_CHECK;
uint64_t hpet_spin_timeout = HPET_SPIN_TIMEOUT;
uint64_t hpet_idle_spin_timeout = HPET_SPIN_TIMEOUT;
uint64_t hpet_isr_spin_timeout = HPET_SPIN_TIMEOUT;
96 
static kmutex_t		hpet_proxy_lock;	/* lock for lAPIC proxy data */
/*
 * hpet_proxy_users is a per-cpu array.
 */
static hpet_proxy_t	*hpet_proxy_users;	/* one per CPU */


/*
 * hpet_table is filled in by AcpiGetTable() in hpet_acpi_init().
 * hpet_info is zeroed at the start of hpet_acpi_init() and then populated
 * with the mapped register address and the decoded register contents.
 */
ACPI_TABLE_HPET		*hpet_table;		/* ACPI HPET table */
hpet_info_t		hpet_info;		/* Human readable Information */
106 
107 /*
108  * Provide HPET access from unix.so.
109  * Set up pointers to access symbols in pcplusmp.
110  */
111 static void
112 hpet_establish_hooks(void)
113 {
114 	hpet.install_proxy = &hpet_install_proxy;
115 	hpet.callback = &hpet_callback;
116 	hpet.use_hpet_timer = &hpet_use_hpet_timer;
117 	hpet.use_lapic_timer = &hpet_use_lapic_timer;
118 }
119 
120 /*
121  * Get the ACPI "HPET" table.
122  * acpi_probe() calls this function from mp_startup before drivers are loaded.
123  * acpi_probe() verified the system is using ACPI before calling this.
124  *
125  * There may be more than one ACPI HPET table (Itanium only?).
126  * Intel's HPET spec defines each timer block to have up to 32 counters and
127  * be 1024 bytes long.  There can be more than one timer block of 32 counters.
128  * Each timer block would have an additional ACPI HPET table.
129  * Typical x86 systems today only have 1 HPET with 3 counters.
130  * On x86 we only consume HPET table "1" for now.
131  */
132 int
133 hpet_acpi_init(int *hpet_vect, iflag_t *hpet_flags)
134 {
135 	extern hrtime_t tsc_read(void);
136 	extern int	idle_cpu_no_deep_c;
137 	extern int	cpuid_deep_cstates_supported(void);
138 	void		*la;
139 	uint64_t	ret;
140 	uint_t		num_timers;
141 	uint_t		ti;
142 
143 	(void) memset(&hpet_info, 0, sizeof (hpet_info));
144 	hpet.supported = HPET_NO_SUPPORT;
145 
146 	if ((get_hwenv() & HW_XEN_HVM) != 0) {
147 		/*
148 		 * In some AWS EC2 guests, though the HPET is advertised via
149 		 * ACPI, programming the interrupt on the non-legacy timer can
150 		 * result in an immediate reset of the instance.  It is not
151 		 * currently possible to tell whether this is an instance with
152 		 * broken HPET emulation or not, so we simply disable it across
153 		 * the board.
154 		 */
155 		PRM_POINT("will not program HPET in Xen HVM");
156 		return (DDI_FAILURE);
157 	}
158 
159 	if (idle_cpu_no_deep_c ||
160 	    !cpuid_deep_cstates_supported()) {
161 		/*
162 		 * If Deep C-States are disabled or not supported, then we do
163 		 * not need to program the HPET at all as it will not
164 		 * subsequently be used.
165 		 */
166 		PRM_POINT("no need to program the HPET");
167 		return (DDI_FAILURE);
168 	}
169 
170 	hpet_establish_hooks();
171 
172 	/*
173 	 * Get HPET ACPI table 1.
174 	 */
175 	PRM_POINT("AcpiGetTable() HPET #1");
176 	if (ACPI_FAILURE(AcpiGetTable(ACPI_SIG_HPET, HPET_TABLE_1,
177 	    (ACPI_TABLE_HEADER **)&hpet_table))) {
178 		cmn_err(CE_NOTE, "!hpet_acpi: unable to get ACPI HPET table");
179 		return (DDI_FAILURE);
180 	}
181 
182 	if (hpet_validate_table(hpet_table) != AE_OK) {
183 		cmn_err(CE_NOTE, "!hpet_acpi: invalid HPET table");
184 		return (DDI_FAILURE);
185 	}
186 
187 	PRM_POINT("hpet_memory_map()");
188 	la = hpet_memory_map(hpet_table);
189 	PRM_DEBUG(la);
190 	if (la == NULL) {
191 		cmn_err(CE_NOTE, "!hpet_acpi: memory map HPET failed");
192 		return (DDI_FAILURE);
193 	}
194 	hpet_info.logical_address = la;
195 
196 	PRM_POINT("hpet_read_gen_cap()");
197 	ret = hpet_read_gen_cap(&hpet_info);
198 	PRM_DEBUG(ret);
199 	hpet_info.gen_cap.counter_clk_period = HPET_GCAP_CNTR_CLK_PERIOD(ret);
200 	hpet_info.gen_cap.vendor_id = HPET_GCAP_VENDOR_ID(ret);
201 	hpet_info.gen_cap.leg_route_cap = HPET_GCAP_LEG_ROUTE_CAP(ret);
202 	hpet_info.gen_cap.count_size_cap = HPET_GCAP_CNT_SIZE_CAP(ret);
203 	/*
204 	 * Hardware contains the last timer's number.
205 	 * Add 1 to get the number of timers.
206 	 */
207 	hpet_info.gen_cap.num_tim_cap = HPET_GCAP_NUM_TIM_CAP(ret) + 1;
208 	hpet_info.gen_cap.rev_id = HPET_GCAP_REV_ID(ret);
209 
210 	if (hpet_info.gen_cap.counter_clk_period > HPET_MAX_CLK_PERIOD) {
211 		cmn_err(CE_NOTE, "!hpet_acpi: COUNTER_CLK_PERIOD 0x%lx > 0x%lx",
212 		    (long)hpet_info.gen_cap.counter_clk_period,
213 		    (long)HPET_MAX_CLK_PERIOD);
214 		return (DDI_FAILURE);
215 	}
216 
217 	num_timers = (uint_t)hpet_info.gen_cap.num_tim_cap;
218 	PRM_DEBUG(num_timers);
219 	if ((num_timers < 3) || (num_timers > 32)) {
220 		cmn_err(CE_NOTE, "!hpet_acpi: invalid number of HPET timers "
221 		    "%lx", (long)num_timers);
222 		return (DDI_FAILURE);
223 	}
224 	hpet_info.timer_n_config = (hpet_TN_conf_cap_t *)kmem_zalloc(
225 	    num_timers * sizeof (uint64_t), KM_SLEEP);
226 
227 	PRM_POINT("hpet_read_gen_config()");
228 	ret = hpet_read_gen_config(&hpet_info);
229 	hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
230 	hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
231 
232 	/*
233 	 * illumos does not use the HPET Legacy Replacement Route capabilities.
234 	 * This feature has been off by default on test systems.
235 	 * The HPET spec does not specify if Legacy Replacement Route is
236 	 * on or off by default, so we explicitly set it off here.
237 	 * It should not matter which mode the HPET is in since we use
238 	 * the first available non-legacy replacement timer: timer 2.
239 	 */
240 	PRM_POINT("hpet_read_gen_config()");
241 	(void) hpet_set_leg_rt_cnf(&hpet_info, 0);
242 
243 	PRM_POINT("hpet_read_gen_config() again");
244 	ret = hpet_read_gen_config(&hpet_info);
245 	hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
246 	hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
247 
248 	hpet_info.gen_intrpt_stat = hpet_read_gen_intrpt_stat(&hpet_info);
249 	hpet_info.main_counter_value = hpet_read_main_counter_value(&hpet_info);
250 
251 	PRM_POINT("disable timer loop...");
252 	for (ti = 0; ti < num_timers; ++ti) {
253 		ret = hpet_read_timer_N_config(&hpet_info, ti);
254 		/*
255 		 * Make sure no timers are enabled (think fast reboot or
256 		 * virtual hardware).
257 		 */
258 		if (ret & HPET_TIMER_N_INT_ENB_CNF_BIT) {
259 			hpet_disable_timer(&hpet_info, ti);
260 			ret &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
261 		}
262 
263 		hpet_info.timer_n_config[ti] = hpet_convert_timer_N_config(ret);
264 	}
265 	PRM_POINT("disable timer loop complete");
266 
267 	/*
268 	 * Be aware the Main Counter may need to be initialized in the future
269 	 * if it is used for more than just Deep C-State support.
270 	 * The HPET's Main Counter does not need to be initialize to a specific
271 	 * value before starting it for use to wake up CPUs from Deep C-States.
272 	 */
273 	PRM_POINT("hpet_start_main_counter()");
274 	if (hpet_start_main_counter(&hpet_info) != AE_OK) {
275 		cmn_err(CE_NOTE, "!hpet_acpi: hpet_start_main_counter failed");
276 		return (DDI_FAILURE);
277 	}
278 
279 	hpet_info.period = hpet_info.gen_cap.counter_clk_period;
280 	/*
281 	 * Read main counter twice to record HPET latency for debugging.
282 	 */
283 	PRM_POINT("TSC and HPET reads:");
284 	hpet_info.tsc[0] = tsc_read();
285 	hpet_info.hpet_main_counter_reads[0] =
286 	    hpet_read_main_counter_value(&hpet_info);
287 	hpet_info.tsc[1] = tsc_read();
288 	hpet_info.hpet_main_counter_reads[1] =
289 	    hpet_read_main_counter_value(&hpet_info);
290 	hpet_info.tsc[2] = tsc_read();
291 
292 	PRM_DEBUG(hpet_info.hpet_main_counter_reads[0]);
293 	PRM_DEBUG(hpet_info.hpet_main_counter_reads[1]);
294 	PRM_DEBUG(hpet_info.tsc[0]);
295 	PRM_DEBUG(hpet_info.tsc[1]);
296 	PRM_DEBUG(hpet_info.tsc[2]);
297 
298 	ret = hpet_read_gen_config(&hpet_info);
299 	hpet_info.gen_config.leg_rt_cnf = HPET_GCFR_LEG_RT_CNF_BITX(ret);
300 	hpet_info.gen_config.enable_cnf = HPET_GCFR_ENABLE_CNF_BITX(ret);
301 
302 	/*
303 	 * HPET main counter reads are supported now.
304 	 */
305 	hpet.supported = HPET_TIMER_SUPPORT;
306 
307 	return (hpet_init_proxy(hpet_vect, hpet_flags));
308 }
309 
310 void
311 hpet_acpi_fini(void)
312 {
313 	if (hpet.supported == HPET_NO_SUPPORT)
314 		return;
315 	if (hpet.supported >= HPET_TIMER_SUPPORT)
316 		(void) hpet_stop_main_counter(&hpet_info);
317 	if (hpet.supported > HPET_TIMER_SUPPORT)
318 		hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
319 }
320 
321 /*
322  * Do initial setup to use a HPET timer as a proxy for Deep C-state stalled
323  * LAPIC Timers.  Get a free HPET timer that supports I/O APIC routed interrupt.
324  * Setup data to handle the timer's ISR, and add the timer's interrupt.
325  *
326  * The ddi cannot be used to allocate the HPET timer's interrupt.
327  * ioapic_init_intr() in mp_platform_common() later sets up the I/O APIC
328  * to handle the HPET timer's interrupt.
329  *
330  * Note: FSB (MSI) interrupts are not currently supported by Intel HPETs as of
331  * ICH9.  The HPET spec allows for MSI.  In the future MSI may be preferred.
332  */
333 static int
334 hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags)
335 {
336 	PRM_POINT("hpet_get_IOAPIC_intr_capable_timer()");
337 	if (hpet_get_IOAPIC_intr_capable_timer(&hpet_info) == -1) {
338 		cmn_err(CE_WARN, "!hpet_acpi: get ioapic intr failed.");
339 		return (DDI_FAILURE);
340 	}
341 
342 	hpet_init_proxy_data();
343 
344 	PRM_POINT("hpet_install_interrupt_handler()");
345 	if (hpet_install_interrupt_handler(&hpet_isr,
346 	    hpet_info.cstate_timer.intr) != AE_OK) {
347 		cmn_err(CE_WARN, "!hpet_acpi: install interrupt failed.");
348 		return (DDI_FAILURE);
349 	}
350 	*hpet_vect = hpet_info.cstate_timer.intr;
351 	hpet_flags->intr_el = INTR_EL_LEVEL;
352 	hpet_flags->intr_po = INTR_PO_ACTIVE_HIGH;
353 	hpet_flags->bustype = BUS_PCI;		/*  we *do* conform to PCI */
354 
355 	/*
356 	 * Avoid a possibly stuck interrupt by programing the HPET's timer here
357 	 * before the I/O APIC is programmed to handle this interrupt.
358 	 */
359 	PRM_POINT("hpet_timer_set_up()");
360 	hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
361 	    hpet_info.cstate_timer.intr);
362 	PRM_POINT("back from hpet_timer_set_up()");
363 
364 	/*
365 	 * All HPET functionality is supported.
366 	 */
367 	hpet.supported = HPET_FULL_SUPPORT;
368 	PRM_POINT("HPET full support");
369 	return (DDI_SUCCESS);
370 }
371 
372 /*
373  * Called by kernel if it can support Deep C-States.
374  */
375 static boolean_t
376 hpet_install_proxy(void)
377 {
378 	if (hpet_state.proxy_installed == B_TRUE)
379 		return (B_TRUE);
380 
381 	if (hpet.supported != HPET_FULL_SUPPORT)
382 		return (B_FALSE);
383 
384 	hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
385 	hpet_state.proxy_installed = B_TRUE;
386 
387 	return (B_TRUE);
388 }
389 
390 /*
391  * Remove the interrupt that was added with add_avintr() in
392  * hpet_install_interrupt_handler().
393  */
394 static void
395 hpet_uninstall_interrupt_handler(void)
396 {
397 	rem_avintr(NULL, CBE_HIGH_PIL, &hpet_isr, hpet_info.cstate_timer.intr);
398 }
399 
400 static int
401 hpet_validate_table(ACPI_TABLE_HPET *hpet_table)
402 {
403 	ACPI_TABLE_HEADER	*table_header = (ACPI_TABLE_HEADER *)hpet_table;
404 
405 	if (table_header->Length != sizeof (ACPI_TABLE_HPET)) {
406 		cmn_err(CE_WARN, "!hpet_validate_table: Length %lx != sizeof ("
407 		    "ACPI_TABLE_HPET) %lx.",
408 		    (unsigned long)((ACPI_TABLE_HEADER *)hpet_table)->Length,
409 		    (unsigned long)sizeof (ACPI_TABLE_HPET));
410 		return (AE_ERROR);
411 	}
412 
413 	if (!ACPI_COMPARE_NAME(table_header->Signature, ACPI_SIG_HPET)) {
414 		cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET table "
415 		    "signature");
416 		return (AE_ERROR);
417 	}
418 
419 	if (!hpet_checksum_table((unsigned char *)hpet_table,
420 	    (unsigned int)table_header->Length)) {
421 		cmn_err(CE_WARN, "!hpet_validate_table: Invalid HPET checksum");
422 		return (AE_ERROR);
423 	}
424 
425 	/*
426 	 * Sequence should be table number - 1.  We are using table 1.
427 	 */
428 	if (hpet_table->Sequence != HPET_TABLE_1 - 1) {
429 		cmn_err(CE_WARN, "!hpet_validate_table: Invalid Sequence %lx",
430 		    (long)hpet_table->Sequence);
431 		return (AE_ERROR);
432 	}
433 
434 	return (AE_OK);
435 }
436 
437 static boolean_t
438 hpet_checksum_table(unsigned char *table, unsigned int length)
439 {
440 	unsigned char	checksum = 0;
441 	int		i;
442 
443 	for (i = 0; i < length; ++i, ++table)
444 		checksum += *table;
445 
446 	return (checksum == 0);
447 }
448 
449 static void *
450 hpet_memory_map(ACPI_TABLE_HPET *hpet_table)
451 {
452 	return (AcpiOsMapMemory(hpet_table->Address.Address, HPET_SIZE));
453 }
454 
455 static int
456 hpet_start_main_counter(hpet_info_t *hip)
457 {
458 	uint64_t	*gcr_ptr;
459 	uint64_t	gcr;
460 
461 	gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
462 	gcr = *gcr_ptr;
463 
464 	gcr |= HPET_GCFR_ENABLE_CNF;
465 	*gcr_ptr = gcr;
466 	gcr = *gcr_ptr;
467 
468 	return (gcr & HPET_GCFR_ENABLE_CNF ? AE_OK : ~AE_OK);
469 }
470 
471 static int
472 hpet_stop_main_counter(hpet_info_t *hip)
473 {
474 	uint64_t	*gcr_ptr;
475 	uint64_t	gcr;
476 
477 	gcr_ptr = (uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address);
478 	gcr = *gcr_ptr;
479 
480 	gcr &= ~HPET_GCFR_ENABLE_CNF;
481 	*gcr_ptr = gcr;
482 	gcr = *gcr_ptr;
483 
484 	return (gcr & HPET_GCFR_ENABLE_CNF ? ~AE_OK : AE_OK);
485 }
486 
487 /*
488  * Set the Legacy Replacement Route bit.
489  * This should be called before setting up timers.
490  * The HPET specification is silent regarding setting this after timers are
491  * programmed.
492  */
493 static uint64_t
494 hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value)
495 {
496 	uint64_t gen_conf = hpet_read_gen_config(hip);
497 
498 	switch (new_value) {
499 	case 0:
500 		gen_conf &= ~HPET_GCFR_LEG_RT_CNF;
501 		break;
502 
503 	case HPET_GCFR_LEG_RT_CNF:
504 		gen_conf |= HPET_GCFR_LEG_RT_CNF;
505 		break;
506 
507 	default:
508 		ASSERT(new_value == 0 || new_value == HPET_GCFR_LEG_RT_CNF);
509 		break;
510 	}
511 	hpet_write_gen_config(hip, gen_conf);
512 	return (gen_conf);
513 }
514 
515 static uint64_t
516 hpet_read_gen_cap(hpet_info_t *hip)
517 {
518 	return (*(uint64_t *)HPET_GEN_CAP_ADDRESS(hip->logical_address));
519 }
520 
521 static uint64_t
522 hpet_read_gen_config(hpet_info_t *hip)
523 {
524 	return (*(uint64_t *)
525 	    HPET_GEN_CONFIG_ADDRESS(hip->logical_address));
526 }
527 
528 static uint64_t
529 hpet_read_gen_intrpt_stat(hpet_info_t *hip)
530 {
531 	hip->gen_intrpt_stat = *(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(
532 	    hip->logical_address);
533 	return (hip->gen_intrpt_stat);
534 }
535 
536 static uint64_t
537 hpet_read_timer_N_config(hpet_info_t *hip, uint_t n)
538 {
539 	uint64_t conf = *(uint64_t *)HPET_TIMER_N_CONF_ADDRESS(
540 	    hip->logical_address, n);
541 	hip->timer_n_config[n] = hpet_convert_timer_N_config(conf);
542 	return (conf);
543 }
544 
545 static hpet_TN_conf_cap_t
546 hpet_convert_timer_N_config(uint64_t conf)
547 {
548 	hpet_TN_conf_cap_t cc = { 0 };
549 
550 	cc.int_route_cap = HPET_TIMER_N_INT_ROUTE_CAP(conf);
551 	cc.fsb_int_del_cap = HPET_TIMER_N_FSB_INT_DEL_CAP(conf);
552 	cc.fsb_int_en_cnf = HPET_TIMER_N_FSB_EN_CNF(conf);
553 	cc.int_route_cnf = HPET_TIMER_N_INT_ROUTE_CNF(conf);
554 	cc.mode32_cnf = HPET_TIMER_N_MODE32_CNF(conf);
555 	cc.val_set_cnf = HPET_TIMER_N_VAL_SET_CNF(conf);
556 	cc.size_cap = HPET_TIMER_N_SIZE_CAP(conf);
557 	cc.per_int_cap = HPET_TIMER_N_PER_INT_CAP(conf);
558 	cc.type_cnf = HPET_TIMER_N_TYPE_CNF(conf);
559 	cc.int_enb_cnf = HPET_TIMER_N_INT_ENB_CNF(conf);
560 	cc.int_type_cnf = HPET_TIMER_N_INT_TYPE_CNF(conf);
561 
562 	return (cc);
563 }
564 
565 static uint64_t
566 hpet_read_main_counter_value(hpet_info_t *hip)
567 {
568 	uint64_t	value;
569 	uint32_t	*counter;
570 	uint32_t	high1, high2, low;
571 
572 	counter = (uint32_t *)HPET_MAIN_COUNTER_ADDRESS(hip->logical_address);
573 
574 	/*
575 	 * 32-bit main counters
576 	 */
577 	if (hip->gen_cap.count_size_cap == 0) {
578 		value = (uint64_t)*counter;
579 		hip->main_counter_value = value;
580 		return (value);
581 	}
582 
583 	/*
584 	 * HPET spec claims a 64-bit read can be split into two 32-bit reads
585 	 * by the hardware connection to the HPET.
586 	 */
587 	high2 = counter[1];
588 	do {
589 		high1 = high2;
590 		low = counter[0];
591 		high2 = counter[1];
592 	} while (high2 != high1);
593 
594 	value = ((uint64_t)high1 << 32) | low;
595 	hip->main_counter_value = value;
596 	return (value);
597 }
598 
599 static void
600 hpet_write_gen_config(hpet_info_t *hip, uint64_t l)
601 {
602 	*(uint64_t *)HPET_GEN_CONFIG_ADDRESS(hip->logical_address) = l;
603 }
604 
605 static void
606 hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l)
607 {
608 	*(uint64_t *)HPET_GEN_INTR_STAT_ADDRESS(hip->logical_address) = l;
609 }
610 
611 static void
612 hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t conf)
613 {
614 	/*
615 	 * The configuration register size is not affected by the size
616 	 * capability; it is always a 64-bit value.  The top 32-bit half of
617 	 * this register is always read-only so we constrain our write to the
618 	 * bottom half.
619 	 */
620 	uint32_t *confaddr = (uint32_t *)HPET_TIMER_N_CONF_ADDRESS(
621 	    hip->logical_address, n);
622 	uint32_t conf32 = 0xFFFFFFFF & conf;
623 
624 	PRM_DEBUG(n);
625 	PRM_DEBUG(conf);
626 	PRM_DEBUG(conf32);
627 
628 	*confaddr = conf32;
629 
630 	PRM_POINT("write done");
631 }
632 
633 static void
634 hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l)
635 {
636 	*(uint64_t *)HPET_TIMER_N_COMP_ADDRESS(hip->logical_address, n) = l;
637 }
638 
639 static void
640 hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n)
641 {
642 	uint64_t l;
643 
644 	l = hpet_read_timer_N_config(hip, timer_n);
645 	l &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;
646 	hpet_write_timer_N_config(hip, timer_n, l);
647 }
648 
649 static void
650 hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n)
651 {
652 	uint64_t l;
653 
654 	l = hpet_read_timer_N_config(hip, timer_n);
655 	l |= HPET_TIMER_N_INT_ENB_CNF_BIT;
656 	hpet_write_timer_N_config(hip, timer_n, l);
657 }
658 
659 /*
660  * Add the interrupt handler for I/O APIC interrupt number (interrupt line).
661  *
662  * The I/O APIC line (vector) is programmed in ioapic_init_intr() called
663  * from apic_picinit() psm_ops apic_ops entry point after we return from
664  * apic_init() psm_ops entry point.
665  */
666 static uint32_t
667 hpet_install_interrupt_handler(avfunc func, int vector)
668 {
669 	uint32_t retval;
670 
671 	retval = add_avintr(NULL, CBE_HIGH_PIL, func, "HPET Timer",
672 	    vector, NULL, NULL, NULL, NULL);
673 	if (retval == 0) {
674 		cmn_err(CE_WARN, "!hpet_acpi: add_avintr() failed");
675 		return (AE_BAD_PARAMETER);
676 	}
677 	return (AE_OK);
678 }
679 
680 /*
681  * The HPET timers specify which I/O APIC interrupts they can be routed to.
682  * Find the first available non-legacy-replacement timer and its I/O APIC irq.
683  * Supported I/O APIC IRQs are specified in the int_route_cap bitmap in each
684  * timer's timer_n_config register.
685  */
686 static int
687 hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip)
688 {
689 	int timer;
690 	int intr;
691 
692 	for (timer = HPET_FIRST_NON_LEGACY_TIMER;
693 	    timer < hip->gen_cap.num_tim_cap; ++timer) {
694 		if (!hpet_timer_available(hip->allocated_timers, timer))
695 			continue;
696 
697 		intr = lowbit(hip->timer_n_config[timer].int_route_cap) - 1;
698 
699 		PRM_DEBUG(timer);
700 		PRM_DEBUG(intr);
701 
702 		if (intr >= 0) {
703 			hpet_timer_alloc(&hip->allocated_timers, timer);
704 			hip->cstate_timer.timer = timer;
705 			hip->cstate_timer.intr = intr;
706 			return (timer);
707 		}
708 	}
709 
710 	return (-1);
711 }
712 
/*
 * Mark timer n as used by setting bit n of *allocated_timers.
 *
 * The shift is done on an unsigned 32-bit value so that n == 31 is well
 * defined.  A timer number that does not fit the 32-bit bitmap is ignored.
 */
static void
hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n)
{
	if (n < 32)
		*allocated_timers |= (uint32_t)1 << n;
}
721 
/*
 * Check if timer n is available, i.e. bit n of allocated_timers is clear.
 * Returns 1 if available and 0 if not.
 * No mutual exclusion because only one thread uses this.
 *
 * The shift is done on an unsigned 32-bit value so that n == 31 is well
 * defined.  A timer number that does not fit the 32-bit bitmap is reported
 * as not available.
 */
static int
hpet_timer_available(uint32_t allocated_timers, uint32_t n)
{
	if (n >= 32)
		return (0);

	return ((allocated_timers & ((uint32_t)1 << n)) == 0);
}
731 
732 /*
733  * Setup timer N to route its interrupt to I/O APIC.
734  */
735 static void
736 hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, uint32_t interrupt)
737 {
738 	uint64_t conf;
739 
740 	PRM_DEBUG(timer_n);
741 	PRM_DEBUG(interrupt);
742 
743 	PRM_POINT("hpet_read_timer_N_config()");
744 	conf = hpet_read_timer_N_config(hip, timer_n);
745 	PRM_DEBUG(conf);
746 
747 	/*
748 	 * Caller is required to verify this interrupt route is supported.
749 	 */
750 	ASSERT(HPET_TIMER_N_INT_ROUTE_CAP(conf) & (1 << interrupt));
751 
752 	conf &= ~HPET_TIMER_N_FSB_EN_CNF_BIT;	/* use IOAPIC */
753 	conf |= HPET_TIMER_N_INT_ROUTE_SHIFT(interrupt);
754 	conf &= ~HPET_TIMER_N_TYPE_CNF_BIT;	/* non periodic */
755 	conf &= ~HPET_TIMER_N_INT_ENB_CNF_BIT;	/* disabled */
756 	conf |= HPET_TIMER_N_INT_TYPE_CNF_BIT;	/* Level Triggered */
757 
758 	PRM_POINT("hpet_write_timer_N_config()");
759 	PRM_DEBUG(conf);
760 	hpet_write_timer_N_config(hip, timer_n, conf);
761 	PRM_POINT("back from hpet_write_timer_N_config()");
762 }
763 
764 /*
765  * The HPET's Main Counter is not stopped before programming an HPET timer.
766  * This will allow the HPET to be used as a time source.
767  * The programmed timer interrupt may occur before this function returns.
768  * Callers must block interrupts before calling this function if they must
769  * guarantee the interrupt is handled after this function returns.
770  *
771  * Return 0 if main counter is less than timer after enabling timer.
772  * The interrupt was programmed, but it may fire before this returns.
773  * Return !0 if main counter is greater than timer after enabling timer.
774  * In other words: the timer will not fire, and we do not know if it did fire.
775  *
776  * delta is in HPET ticks.
777  *
778  * Writing a 64-bit value to a 32-bit register will "wrap around".
779  * A 32-bit HPET timer will wrap around in a little over 5 minutes.
780  */
781 int
782 hpet_timer_program(hpet_info_t *hip, uint32_t timer, uint64_t delta)
783 {
784 	uint64_t time, program;
785 
786 	program = hpet_read_main_counter_value(hip);
787 	program += delta;
788 	hpet_write_timer_N_comp(hip, timer, program);
789 
790 	time = hpet_read_main_counter_value(hip);
791 	if (time < program)
792 		return (AE_OK);
793 
794 	return (AE_TIME);
795 }
796 
797 /*
798  * CPR and power policy-change callback entry point.
799  */
800 boolean_t
801 hpet_callback(int code)
802 {
803 	switch (code) {
804 	case PM_DEFAULT_CPU_DEEP_IDLE:
805 		/*FALLTHROUGH*/
806 	case PM_ENABLE_CPU_DEEP_IDLE:
807 		/*FALLTHROUGH*/
808 	case PM_DISABLE_CPU_DEEP_IDLE:
809 		return (hpet_deep_idle_config(code));
810 
811 	case CB_CODE_CPR_RESUME:
812 		/*FALLTHROUGH*/
813 	case CB_CODE_CPR_CHKPT:
814 		return (hpet_cpr(code));
815 
816 	case CST_EVENT_MULTIPLE_CSTATES:
817 		hpet_cst_callback(CST_EVENT_MULTIPLE_CSTATES);
818 		return (B_TRUE);
819 
820 	case CST_EVENT_ONE_CSTATE:
821 		hpet_cst_callback(CST_EVENT_ONE_CSTATE);
822 		return (B_TRUE);
823 
824 	default:
825 		cmn_err(CE_NOTE, "!hpet_callback: invalid code %d\n", code);
826 		return (B_FALSE);
827 	}
828 }
829 
830 /*
831  * According to the HPET spec 1.0a: the Operating System must save and restore
832  * HPET event timer hardware context through ACPI sleep state transitions.
833  * Timer registers (including the main counter) may not be preserved through
834  * ACPI S3, S4, or S5 sleep states.  This code does not support S1 nor S2.
835  *
836  * Current HPET state is already in hpet.supported and
837  * hpet_state.proxy_installed.  hpet_info contains the proxy interrupt HPET
838  * Timer state.
839  *
840  * Future projects beware: the HPET Main Counter is undefined after ACPI S3 or
841  * S4, and it is not saved/restored here.  Future projects cannot expect the
842  * Main Counter to be monotonically (or accurately) increasing across CPR.
843  *
844  * Note: the CPR Checkpoint path later calls pause_cpus() which ensures all
845  * CPUs are awake and in a spin loop before the system suspends.  The HPET is
846  * not needed for Deep C-state wakeup when CPUs are in cpu_pause().
847  * It is safe to leave the HPET running as the system suspends; we just
848  * disable the timer from generating interrupts here.
849  */
/*
 * code is CB_CODE_CPR_CHKPT or CB_CODE_CPR_RESUME; any other value is
 * logged and yields B_FALSE.  The whole operation runs under
 * hpet_state_lock.
 *
 * Returns B_FALSE if the checkpoint path could not get hpet_proxy_lock
 * before the deadman deadline, or if code is invalid; B_TRUE otherwise.
 */
static boolean_t
hpet_cpr(int code)
{
	ulong_t		intr, dead_count = 0;
	/* Deadline is fixed at entry, before hpet_state_lock is taken. */
	hrtime_t	dead = gethrtime() + hpet_spin_timeout;
	boolean_t	ret = B_TRUE;

	mutex_enter(&hpet_state_lock);
	switch (code) {
	case CB_CODE_CPR_CHKPT:
		/* Nothing to quiesce if the proxy timer was never set up. */
		if (hpet_state.proxy_installed == B_FALSE)
			break;

		hpet_state.cpr = B_TRUE;

		/*
		 * Take hpet_proxy_lock with the state returned by
		 * intr_clear() held; on each failed attempt that state is
		 * restored before retrying.
		 * NOTE(review): presumably intr_clear() disables interrupts
		 * so the lock is only ever held with them off -- confirm.
		 */
		intr = intr_clear();
		while (!mutex_tryenter(&hpet_proxy_lock)) {
			/*
			 * spin
			 */
			intr_restore(intr);
			/*
			 * Only look at the clock once every hpet_spin_check
			 * iterations.
			 */
			if (dead_count++ > hpet_spin_check) {
				dead_count = 0;
				if (gethrtime() > dead) {
					/* Give up: undo the cpr flag. */
					hpet_state.cpr = B_FALSE;
					mutex_exit(&hpet_state_lock);
					cmn_err(CE_NOTE, "!hpet_cpr: deadman");
					return (B_FALSE);
				}
			}
			intr = intr_clear();
		}
		hpet_expire_all();
		mutex_exit(&hpet_proxy_lock);
		intr_restore(intr);

		/* Stop the proxy timer from generating interrupts. */
		hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
		break;

	case CB_CODE_CPR_RESUME:
		/*
		 * NOTE(review): a failed hpet_resume() is logged and leaves
		 * hpet_state.cpr set, but ret stays B_TRUE.
		 */
		if (hpet_resume() == B_TRUE)
			hpet_state.cpr = B_FALSE;
		else
			cmn_err(CE_NOTE, "!hpet_resume failed.");
		break;

	default:
		cmn_err(CE_NOTE, "!hpet_cpr: invalid code %d\n", code);
		ret = B_FALSE;
		break;
	}
	mutex_exit(&hpet_state_lock);
	return (ret);
}
904 
905 /*
906  * Assume the HPET stopped in Suspend state and timer state was lost.
907  */
908 static boolean_t
909 hpet_resume(void)
910 {
911 	if (hpet.supported != HPET_TIMER_SUPPORT)
912 		return (B_TRUE);
913 
914 	/*
915 	 * The HPET spec does not specify if Legacy Replacement Route is
916 	 * on or off by default, so we set it off here.
917 	 */
918 	(void) hpet_set_leg_rt_cnf(&hpet_info, 0);
919 
920 	if (hpet_start_main_counter(&hpet_info) != AE_OK) {
921 		cmn_err(CE_NOTE, "!hpet_resume: start main counter failed");
922 		hpet.supported = HPET_NO_SUPPORT;
923 		if (hpet_state.proxy_installed == B_TRUE) {
924 			hpet_state.proxy_installed = B_FALSE;
925 			hpet_uninstall_interrupt_handler();
926 		}
927 		return (B_FALSE);
928 	}
929 
930 	if (hpet_state.proxy_installed == B_FALSE)
931 		return (B_TRUE);
932 
933 	hpet_timer_set_up(&hpet_info, hpet_info.cstate_timer.timer,
934 	    hpet_info.cstate_timer.intr);
935 	if (hpet_state.cpu_deep_idle == B_TRUE)
936 		hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
937 
938 	return (B_TRUE);
939 }
940 
/*
 * Callback to enable/disable Deep C-States based on power.conf setting.
 *
 * code is one of PM_DEFAULT_CPU_DEEP_IDLE, PM_ENABLE_CPU_DEEP_IDLE or
 * PM_DISABLE_CPU_DEEP_IDLE; any other value is logged and yields B_FALSE.
 * The whole operation runs under hpet_state_lock.
 *
 * Returns B_FALSE if enabling is requested while the proxy is not
 * installed, if the disable path cannot get hpet_proxy_lock before the
 * deadman deadline, or if code is invalid; B_TRUE otherwise (including
 * when the requested state is already in effect).
 */
static boolean_t
hpet_deep_idle_config(int code)
{
	ulong_t		intr, dead_count = 0;
	/* Deadline is fixed at entry, before hpet_state_lock is taken. */
	hrtime_t	dead = gethrtime() + hpet_spin_timeout;
	boolean_t	ret = B_TRUE;

	mutex_enter(&hpet_state_lock);
	switch (code) {
	case PM_DEFAULT_CPU_DEEP_IDLE:
		/*FALLTHROUGH*/
	case PM_ENABLE_CPU_DEEP_IDLE:

		/* Already enabled: nothing to do. */
		if (hpet_state.cpu_deep_idle == B_TRUE)
			break;

		if (hpet_state.proxy_installed == B_FALSE) {
			ret = B_FALSE;  /* Deep C-States not supported */
			break;
		}

		hpet_enable_timer(&hpet_info, hpet_info.cstate_timer.timer);
		hpet_state.cpu_deep_idle = B_TRUE;
		break;

	case PM_DISABLE_CPU_DEEP_IDLE:

		/* Already disabled, or no proxy to disable. */
		if ((hpet_state.cpu_deep_idle == B_FALSE) ||
		    (hpet_state.proxy_installed == B_FALSE))
			break;

		/*
		 * The order of these operations is important to avoid
		 * lost wakeups: Set a flag to refuse all future LAPIC Timer
		 * proxy requests, then wake up all CPUs from deep C-state,
		 * and finally disable the HPET interrupt-generating timer.
		 */
		hpet_state.cpu_deep_idle = B_FALSE;

		/*
		 * Take hpet_proxy_lock with the state returned by
		 * intr_clear() held; on each failed attempt that state is
		 * restored before retrying.
		 */
		intr = intr_clear();
		while (!mutex_tryenter(&hpet_proxy_lock)) {
			/*
			 * spin
			 */
			intr_restore(intr);
			/*
			 * Only look at the clock once every hpet_spin_check
			 * iterations.
			 */
			if (dead_count++ > hpet_spin_check) {
				dead_count = 0;
				if (gethrtime() > dead) {
					/* Give up: restore the flag. */
					hpet_state.cpu_deep_idle = B_TRUE;
					mutex_exit(&hpet_state_lock);
					cmn_err(CE_NOTE,
					    "!hpet_deep_idle_config: deadman");
					return (B_FALSE);
				}
			}
			intr = intr_clear();
		}
		hpet_expire_all();
		mutex_exit(&hpet_proxy_lock);
		intr_restore(intr);

		hpet_disable_timer(&hpet_info, hpet_info.cstate_timer.timer);
		break;

	default:
		cmn_err(CE_NOTE, "!hpet_deep_idle_config: invalid code %d\n",
		    code);
		ret = B_FALSE;
		break;
	}
	mutex_exit(&hpet_state_lock);

	return (ret);
}
1018 
1019 /*
1020  * Callback for _CST c-state change notifications.
1021  */
1022 static void
1023 hpet_cst_callback(uint32_t code)
1024 {
1025 	ulong_t		intr, dead_count = 0;
1026 	hrtime_t	dead = gethrtime() + hpet_spin_timeout;
1027 
1028 	switch (code) {
1029 	case CST_EVENT_ONE_CSTATE:
1030 		hpet_state.uni_cstate = B_TRUE;
1031 		intr = intr_clear();
1032 		while (!mutex_tryenter(&hpet_proxy_lock)) {
1033 			/*
1034 			 * spin
1035 			 */
1036 			intr_restore(intr);
1037 			if (dead_count++ > hpet_spin_check) {
1038 				dead_count = 0;
1039 				if (gethrtime() > dead) {
1040 					hpet_expire_all();
1041 					cmn_err(CE_NOTE,
1042 					    "!hpet_cst_callback: deadman");
1043 					return;
1044 				}
1045 			}
1046 			intr = intr_clear();
1047 		}
1048 		hpet_expire_all();
1049 		mutex_exit(&hpet_proxy_lock);
1050 		intr_restore(intr);
1051 		break;
1052 
1053 	case CST_EVENT_MULTIPLE_CSTATES:
1054 		hpet_state.uni_cstate = B_FALSE;
1055 		break;
1056 
1057 	default:
1058 		cmn_err(CE_NOTE, "!hpet_cst_callback: invalid code %d\n", code);
1059 		break;
1060 	}
1061 }
1062 
1063 /*
1064  * Interrupt Service Routine for HPET I/O-APIC-generated interrupts.
1065  * Used to wakeup CPUs from Deep C-state when their Local APIC Timer stops.
1066  * This ISR runs on one CPU which pokes other CPUs out of Deep C-state as
1067  * needed.
1068  */
1069 static uint_t
1070 hpet_isr(caddr_t arg __unused, caddr_t arg1 __unused)
1071 {
1072 	uint64_t	timer_status;
1073 	uint64_t	timer_mask;
1074 	ulong_t		intr, dead_count = 0;
1075 	hrtime_t	dead = gethrtime() + hpet_isr_spin_timeout;
1076 
1077 	timer_mask = HPET_INTR_STATUS_MASK(hpet_info.cstate_timer.timer);
1078 
1079 	/*
1080 	 * We are using a level-triggered interrupt.
1081 	 * HPET sets timer's General Interrupt Status Register bit N.
1082 	 * ISR checks this bit to see if it needs servicing.
1083 	 * ISR then clears this bit by writing 1 to that bit.
1084 	 */
1085 	timer_status = hpet_read_gen_intrpt_stat(&hpet_info);
1086 	if (!(timer_status & timer_mask))
1087 		return (DDI_INTR_UNCLAIMED);
1088 	hpet_write_gen_intrpt_stat(&hpet_info, timer_mask);
1089 
1090 	/*
1091 	 * Do not touch ISR data structures before checking the HPET's General
1092 	 * Interrupt Status register.  The General Interrupt Status register
1093 	 * will not be set by hardware until after timer interrupt generation
1094 	 * is enabled by software.  Software allocates necessary data
1095 	 * structures before enabling timer interrupts.  ASSERT the software
1096 	 * data structures required to handle this interrupt are initialized.
1097 	 */
1098 	ASSERT(hpet_proxy_users != NULL);
1099 
1100 	/*
1101 	 * CPUs in deep c-states do not enable interrupts until after
1102 	 * performing idle cleanup which includes descheduling themselves from
1103 	 * the HPET.  The CPU running this ISR will NEVER find itself in the
1104 	 * proxy list.  A lost wakeup may occur if this is false.
1105 	 */
1106 	ASSERT(hpet_proxy_users[CPU->cpu_id] == HPET_INFINITY);
1107 
1108 	/*
1109 	 * Higher level interrupts may deadlock with CPUs going idle if this
1110 	 * ISR is prempted while holding hpet_proxy_lock.
1111 	 */
1112 	intr = intr_clear();
1113 	while (!mutex_tryenter(&hpet_proxy_lock)) {
1114 		/*
1115 		 * spin
1116 		 */
1117 		intr_restore(intr);
1118 		if (dead_count++ > hpet_spin_check) {
1119 			dead_count = 0;
1120 			if (gethrtime() > dead) {
1121 				hpet_expire_all();
1122 				return (DDI_INTR_CLAIMED);
1123 			}
1124 		}
1125 		intr = intr_clear();
1126 	}
1127 	(void) hpet_guaranteed_schedule(HPET_INFINITY);
1128 	mutex_exit(&hpet_proxy_lock);
1129 	intr_restore(intr);
1130 
1131 	return (DDI_INTR_CLAIMED);
1132 }
1133 
1134 /*
1135  * Used when disabling the HPET Timer interrupt.  CPUs in Deep C-state must be
1136  * woken up because they can no longer rely on the HPET's Timer to wake them.
1137  * We do not need to wait for CPUs to wakeup.
1138  */
1139 static void
1140 hpet_expire_all(void)
1141 {
1142 	processorid_t	id;
1143 
1144 	for (id = 0; id < max_ncpus; ++id) {
1145 		if (hpet_proxy_users[id] != HPET_INFINITY) {
1146 			hpet_proxy_users[id] = HPET_INFINITY;
1147 			if (id != CPU->cpu_id)
1148 				poke_cpu(id);
1149 		}
1150 	}
1151 }
1152 
1153 /*
1154  * To avoid missed wakeups this function must guarantee either the HPET timer
1155  * was successfully programmed to the next expire time or there are no waiting
1156  * CPUs.
1157  *
1158  * Callers cannot enter C2 or deeper if the HPET could not be programmed to
1159  * generate its next interrupt to happen at required_wakeup_time or sooner.
1160  * Returns B_TRUE if the HPET was programmed to interrupt by
1161  * required_wakeup_time, B_FALSE if not.
1162  */
1163 static boolean_t
1164 hpet_guaranteed_schedule(hrtime_t required_wakeup_time)
1165 {
1166 	hrtime_t	now, next_proxy_time;
1167 	processorid_t	id, next_proxy_id;
1168 	int		proxy_timer = hpet_info.cstate_timer.timer;
1169 	boolean_t	done = B_FALSE;
1170 
1171 	ASSERT(mutex_owned(&hpet_proxy_lock));
1172 
1173 	/*
1174 	 * Loop until we successfully program the HPET,
1175 	 * or no CPUs are scheduled to use the HPET as a proxy.
1176 	 */
1177 	do {
1178 		/*
1179 		 * Wake all CPUs that expired before now.
1180 		 * Find the next CPU to wake up and next HPET program time.
1181 		 */
1182 		now = gethrtime();
1183 		next_proxy_time = HPET_INFINITY;
1184 		next_proxy_id = CPU->cpu_id;
1185 		for (id = 0; id < max_ncpus; ++id) {
1186 			if (hpet_proxy_users[id] < now) {
1187 				hpet_proxy_users[id] = HPET_INFINITY;
1188 				if (id != CPU->cpu_id)
1189 					poke_cpu(id);
1190 			} else if (hpet_proxy_users[id] < next_proxy_time) {
1191 				next_proxy_time = hpet_proxy_users[id];
1192 				next_proxy_id = id;
1193 			}
1194 		}
1195 
1196 		if (next_proxy_time == HPET_INFINITY) {
1197 			done = B_TRUE;
1198 			/*
1199 			 * There are currently no CPUs using the HPET's Timer
1200 			 * as a proxy for their LAPIC Timer.  The HPET's Timer
1201 			 * does not need to be programmed.
1202 			 *
1203 			 * Letting the HPET timer wrap around to the current
1204 			 * time is the longest possible timeout.
1205 			 * A 64-bit timer will wrap around in ~ 2^44 seconds.
1206 			 * A 32-bit timer will wrap around in ~ 2^12 seconds.
1207 			 *
1208 			 * Disabling the HPET's timer interrupt requires a
1209 			 * (relatively expensive) write to the HPET.
1210 			 * Instead we do nothing.
1211 			 *
1212 			 * We are gambling some CPU will attempt to enter a
1213 			 * deep c-state before the timer wraps around.
1214 			 * We assume one spurious interrupt in a little over an
1215 			 * hour has less performance impact than writing to the
1216 			 * HPET's timer disable bit every time all CPUs wakeup
1217 			 * from deep c-state.
1218 			 */
1219 
1220 		} else {
1221 			/*
1222 			 * Idle CPUs disable interrupts before programming the
1223 			 * HPET to prevent a lost wakeup if the HPET
1224 			 * interrupts the idle cpu before it can enter a
1225 			 * Deep C-State.
1226 			 */
1227 			if (hpet_timer_program(&hpet_info, proxy_timer,
1228 			    HRTIME_TO_HPET_TICKS(next_proxy_time - gethrtime()))
1229 			    != AE_OK) {
1230 				/*
1231 				 * We could not program the HPET to wakeup the
1232 				 * next CPU.  We must wake the CPU ourself to
1233 				 * avoid a lost wakeup.
1234 				 */
1235 				hpet_proxy_users[next_proxy_id] = HPET_INFINITY;
1236 				if (next_proxy_id != CPU->cpu_id)
1237 					poke_cpu(next_proxy_id);
1238 			} else {
1239 				done = B_TRUE;
1240 			}
1241 		}
1242 
1243 	} while (!done);
1244 
1245 	return (next_proxy_time <= required_wakeup_time);
1246 }
1247 
1248 /*
1249  * Use an HPET timer to act as this CPU's proxy local APIC timer.
1250  * Used in deep c-states C2 and above while the CPU's local APIC timer stalls.
1251  * Called by the idle thread with interrupts enabled.
1252  * Always returns with interrupts disabled.
1253  *
1254  * There are 3 possible outcomes from this function:
1255  * 1. The Local APIC Timer was already disabled before this function was called.
1256  *	LAPIC TIMER	: disabled
1257  *	HPET		: not scheduled to wake this CPU
1258  *	*lapic_expire	: (hrtime_t)HPET_INFINITY
1259  *	Returns		: B_TRUE
1260  * 2. Successfully programmed the HPET to act as a LAPIC Timer proxy.
1261  *	LAPIC TIMER	: disabled
1262  *	HPET		: scheduled to wake this CPU
1263  *	*lapic_expire	: hrtime_t when LAPIC timer would have expired
1264  *	Returns		: B_TRUE
1265  * 3. Failed to programmed the HPET to act as a LAPIC Timer proxy.
1266  *	LAPIC TIMER	: enabled
1267  *	HPET		: not scheduled to wake this CPU
1268  *	*lapic_expire	: (hrtime_t)HPET_INFINITY
1269  *	Returns		: B_FALSE
1270  *
1271  * The idle thread cannot enter Deep C-State in case 3.
1272  * The idle thread must re-enable & re-program the LAPIC_TIMER in case 2.
1273  */
1274 static boolean_t
1275 hpet_use_hpet_timer(hrtime_t *lapic_expire)
1276 {
1277 	extern hrtime_t	apic_timer_stop_count(void);
1278 	extern void	apic_timer_restart(hrtime_t);
1279 	hrtime_t	now, expire, dead;
1280 	uint64_t	lapic_count, dead_count;
1281 	cpupart_t	*cpu_part;
1282 	processorid_t	cpu_sid;
1283 	processorid_t	cpu_id = CPU->cpu_id;
1284 	processorid_t	id;
1285 	boolean_t	rslt;
1286 	boolean_t	hset_update;
1287 
1288 	cpu_part = CPU->cpu_part;
1289 	cpu_sid = CPU->cpu_seqid;
1290 
1291 	ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
1292 
1293 	/*
1294 	 * A critical section exists between when the HPET is programmed
1295 	 * to interrupt the CPU and when this CPU enters an idle state.
1296 	 * Interrupts must be blocked during that time to prevent lost
1297 	 * CBE wakeup interrupts from either LAPIC or HPET.
1298 	 *
1299 	 * Must block interrupts before acquiring hpet_proxy_lock to prevent
1300 	 * a deadlock with the ISR if the ISR runs on this CPU after the
1301 	 * idle thread acquires the mutex but before it clears interrupts.
1302 	 */
1303 	ASSERT(!interrupts_enabled());
1304 	lapic_count = apic_timer_stop_count();
1305 	now = gethrtime();
1306 	dead = now + hpet_idle_spin_timeout;
1307 	*lapic_expire = expire = now + lapic_count;
1308 	if (lapic_count == (hrtime_t)-1) {
1309 		/*
1310 		 * LAPIC timer is currently disabled.
1311 		 * Will not use the HPET as a LAPIC Timer proxy.
1312 		 */
1313 		*lapic_expire = (hrtime_t)HPET_INFINITY;
1314 		return (B_TRUE);
1315 	}
1316 
1317 	/*
1318 	 * Serialize hpet_proxy data structure manipulation.
1319 	 */
1320 	dead_count = 0;
1321 	while (!mutex_tryenter(&hpet_proxy_lock)) {
1322 		/*
1323 		 * spin
1324 		 */
1325 		apic_timer_restart(expire);
1326 		sti();
1327 		cli();
1328 
1329 		if (dead_count++ > hpet_spin_check) {
1330 			dead_count = 0;
1331 			hset_update = (((CPU->cpu_flags & CPU_OFFLINE) == 0) &&
1332 			    (ncpus > 1));
1333 			if (hset_update &&
1334 			    !bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
1335 				*lapic_expire = (hrtime_t)HPET_INFINITY;
1336 				return (B_FALSE);
1337 			}
1338 		}
1339 
1340 		lapic_count = apic_timer_stop_count();
1341 		now = gethrtime();
1342 		*lapic_expire = expire = now + lapic_count;
1343 		if (lapic_count == (hrtime_t)-1) {
1344 			/*
1345 			 * LAPIC timer is currently disabled.
1346 			 * Will not use the HPET as a LAPIC Timer proxy.
1347 			 */
1348 			*lapic_expire = (hrtime_t)HPET_INFINITY;
1349 			return (B_TRUE);
1350 		}
1351 		if (now > dead) {
1352 			apic_timer_restart(expire);
1353 			*lapic_expire = (hrtime_t)HPET_INFINITY;
1354 			return (B_FALSE);
1355 		}
1356 	}
1357 
1358 	if ((hpet_state.cpr == B_TRUE) ||
1359 	    (hpet_state.cpu_deep_idle == B_FALSE) ||
1360 	    (hpet_state.proxy_installed == B_FALSE) ||
1361 	    (hpet_state.uni_cstate == B_TRUE)) {
1362 		mutex_exit(&hpet_proxy_lock);
1363 		apic_timer_restart(expire);
1364 		*lapic_expire = (hrtime_t)HPET_INFINITY;
1365 		return (B_FALSE);
1366 	}
1367 
1368 	hpet_proxy_users[cpu_id] = expire;
1369 
1370 	/*
1371 	 * We are done if another cpu is scheduled on the HPET with an
1372 	 * expire time before us.  The next HPET interrupt has been programmed
1373 	 * to fire before our expire time.
1374 	 */
1375 	for (id = 0; id < max_ncpus; ++id) {
1376 		if ((hpet_proxy_users[id] <= expire) && (id != cpu_id)) {
1377 			mutex_exit(&hpet_proxy_lock);
1378 			return (B_TRUE);
1379 		}
1380 	}
1381 
1382 	/*
1383 	 * We are the next lAPIC to expire.
1384 	 * Program the HPET with our expire time.
1385 	 */
1386 	rslt = hpet_guaranteed_schedule(expire);
1387 	mutex_exit(&hpet_proxy_lock);
1388 
1389 	if (rslt == B_FALSE) {
1390 		apic_timer_restart(expire);
1391 		*lapic_expire = (hrtime_t)HPET_INFINITY;
1392 	}
1393 
1394 	return (rslt);
1395 }
1396 
1397 /*
1398  * Called by the idle thread when waking up from Deep C-state before enabling
1399  * interrupts.  With an array data structure it is faster to always remove
1400  * ourself from the array without checking if the HPET ISR already removed.
1401  *
1402  * We use a lazy algorithm for removing CPUs from the HPET's schedule.
1403  * We do not reprogram the HPET here because this CPU has real work to do.
1404  * On a idle system the CPU was probably woken up by the HPET's ISR.
1405  * On a heavily loaded system CPUs are not going into Deep C-state.
1406  * On a moderately loaded system another CPU will usually enter Deep C-state
1407  * and reprogram the HPET before the HPET fires with our wakeup.
1408  */
1409 static void
1410 hpet_use_lapic_timer(hrtime_t expire)
1411 {
1412 	extern void	apic_timer_restart(hrtime_t);
1413 	processorid_t	cpu_id = CPU->cpu_id;
1414 
1415 	ASSERT(CPU->cpu_thread == CPU->cpu_idle_thread);
1416 	ASSERT(!interrupts_enabled());
1417 
1418 	hpet_proxy_users[cpu_id] = HPET_INFINITY;
1419 
1420 	/*
1421 	 * Do not enable a LAPIC Timer that was initially disabled.
1422 	 */
1423 	if (expire != HPET_INFINITY)
1424 		apic_timer_restart(expire);
1425 }
1426 
1427 /*
1428  * Initialize data structure to keep track of CPUs using HPET as a proxy for
1429  * their stalled local APIC timer.  For now this is just an array.
1430  */
1431 static void
1432 hpet_init_proxy_data(void)
1433 {
1434 	processorid_t	id;
1435 
1436 	/*
1437 	 * Use max_ncpus for hot plug compliance.
1438 	 */
1439 	hpet_proxy_users = kmem_zalloc(max_ncpus * sizeof (*hpet_proxy_users),
1440 	    KM_SLEEP);
1441 
1442 	/*
1443 	 * Unused entries always contain HPET_INFINITY.
1444 	 */
1445 	for (id = 0; id < max_ncpus; ++id)
1446 		hpet_proxy_users[id] = HPET_INFINITY;
1447 }
1448