xref: /illumos-gate/usr/src/uts/common/os/cpu_event.c (revision 129b3e6c5b0ac55b5021a4c38db6387b6acdaaf1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, Intel Corporation.
23  * All rights reserved.
24  */
25 
26 /*
27  * Introduction
28  * This file implements a CPU event notification mechanism to signal clients
29  * which are interested in CPU related events.
30  * Currently it only supports CPU idle state change events which will be
31  * triggered just before CPU entering hardware idle state and just after CPU
32  * wakes up from hardware idle state.
33  * Please refer to PSARC/2009/115 for detailed information.
34  *
35  * Lock Strategy
36  * 1) cpu_idle_prop_busy/free are protected by cpu_idle_prop_lock.
37  * 2) No protection for cpu_idle_cb_state because it's per-CPU data.
38  * 3) cpu_idle_cb_busy is protected by cpu_idle_cb_lock.
39  * 4) cpu_idle_cb_array is protected by pause_cpus/start_cpus logic.
40  * 5) cpu_idle_cb_max/curr are protected by both cpu_idle_cb_lock and
41  *    pause_cpus/start_cpus logic.
42  * We have optimized the algorithm for hot path on read side access.
43  * In the current algorithm, it's lock free on read side access.
44  * On write side, we use pause_cpus() to keep other CPUs in the pause thread,
45  * which will guarantee that no other threads will access
46  * cpu_idle_cb_max/curr/array data structure.
47  */
48 
49 #include <sys/types.h>
50 #include <sys/cmn_err.h>
51 #include <sys/cpuvar.h>
52 #include <sys/cpu.h>
53 #include <sys/kmem.h>
54 #include <sys/machcpuvar.h>
55 #include <sys/sdt.h>
56 #include <sys/sysmacros.h>
57 #include <sys/synch.h>
58 #include <sys/systm.h>
59 #include <sys/sunddi.h>
60 #if defined(__sparc)
61 #include <sys/machsystm.h>
62 #elif defined(__x86)
63 #include <sys/archsystm.h>
64 #endif
65 #include <sys/cpu_event.h>
66 
/*
 * CPU_IDLE_STATE_NORMAL is the value stored in the idle-state property when
 * a CPU leaves idle (see cpu_idle_exit_state()).  Its definition is platform
 * specific.
 */
#if defined(__x86)
#define	CPU_IDLE_STATE_NORMAL		IDLE_STATE_C0
#elif defined(__sparc)
/*
 * When this file was written IDLE_STATE_NORMAL was defined only in
 * mach_startup.c and not in any header.  Supply the same value here when no
 * header provides it; should a header definition appear, it is used instead.
 */
#if !defined(IDLE_STATE_NORMAL)
#define	IDLE_STATE_NORMAL	0
#endif
#define	CPU_IDLE_STATE_NORMAL	IDLE_STATE_NORMAL
#endif
82 
/*
 * Property values are laid out so that all values belonging to one CPU share
 * a cache line, which avoids false sharing between CPUs:
 * |     CPU0      |     CPU1      |.........|     CPUn      |
 * | cache line 0  | cache line 1  |.........| cache line n  |
 * | v0 | ... | vm | v0 | ... | vm |.........| v0 | ... | vm |
 * The value of property m for CPU n is found at:
 *    index = seq_id_of_CPUn * CPU_IDLE_VALUE_GROUP_SIZE + m.
 */
#define	CPU_IDLE_VALUE_GROUP_SIZE	\
	(CPU_CACHE_COHERENCE_SIZE / sizeof (cpu_idle_prop_value_t))

/* Build the callback context handle for a CPU from its sequential id. */
#define	CPU_IDLE_GET_CTX(cp)		\
	((cpu_idle_callback_context_t)(intptr_t)((cp)->cpu_seqid))

/* Recover the CPU sequential id encoded in a context handle. */
#define	CPU_IDLE_CTX2CPUID(ctx)		((processorid_t)(intptr_t)(ctx))

/* Index of the first value slot of the CPU identified by ctx. */
#define	CPU_IDLE_CTX2IDX(ctx)		\
	(((int)(intptr_t)(ctx)) * CPU_IDLE_VALUE_GROUP_SIZE)

/* Address of value slot idx of the property referred to by handle hdl. */
#define	CPU_IDLE_HDL2VALP(hdl, idx)	\
	(&((cpu_idle_prop_impl_t *)(hdl))->value[(idx)])
108 
/*
 * Number of entries added to cpu_idle_cb_array each time it is found to be
 * empty or full.  Growth is deliberately linear rather than exponential.
 */
#define	CPU_IDLE_ARRAY_CAPACITY_INC	0x10
114 
115 typedef struct cpu_idle_prop_impl {
116 	cpu_idle_prop_value_t		*value;
117 	struct cpu_idle_prop_impl	*next;
118 	char				*name;
119 	cpu_idle_prop_update_t		update;
120 	void				*private;
121 	cpu_idle_prop_type_t		type;
122 	uint32_t			refcnt;
123 } cpu_idle_prop_impl_t;
124 
125 typedef struct cpu_idle_prop_item {
126 	cpu_idle_prop_type_t		type;
127 	char				*name;
128 	cpu_idle_prop_update_t		update;
129 	void				*arg;
130 	cpu_idle_prop_handle_t		handle;
131 } cpu_idle_prop_item_t;
132 
133 /* Structure to maintain registered callbacks in list. */
134 typedef struct cpu_idle_cb_impl {
135 	struct cpu_idle_cb_impl		*next;
136 	cpu_idle_callback_t		*callback;
137 	void				*argument;
138 	int				priority;
139 } cpu_idle_cb_impl_t;
140 
141 /*
142  * Structure to maintain registered callbacks in priority order and also
143  * optimized for cache efficiency for reading access.
144  */
145 typedef struct cpu_idle_cb_item {
146 	cpu_idle_enter_cbfn_t		enter;
147 	cpu_idle_exit_cbfn_t		exit;
148 	void				*arg;
149 	cpu_idle_cb_impl_t		*impl;
150 } cpu_idle_cb_item_t;
151 
152 /* Per-CPU state aligned to CPU_CACHE_COHERENCE_SIZE to avoid false sharing. */
153 typedef union cpu_idle_cb_state {
154 	struct {
155 		int			index;
156 		boolean_t		ready;
157 		cpu_idle_prop_value_t	*idle_state;
158 		cpu_idle_prop_value_t	*enter_ts;
159 		cpu_idle_prop_value_t	*exit_ts;
160 		cpu_idle_prop_value_t	*last_idle;
161 		cpu_idle_prop_value_t	*last_busy;
162 		cpu_idle_prop_value_t	*total_idle;
163 		cpu_idle_prop_value_t	*total_busy;
164 		cpu_idle_prop_value_t	*intr_cnt;
165 	} v;
166 #ifdef _LP64
167 	char				align[2 * CPU_CACHE_COHERENCE_SIZE];
168 #else
169 	char				align[CPU_CACHE_COHERENCE_SIZE];
170 #endif
171 } cpu_idle_cb_state_t;
172 
173 static kmutex_t				cpu_idle_prop_lock;
174 static cpu_idle_prop_impl_t		*cpu_idle_prop_busy = NULL;
175 static cpu_idle_prop_impl_t		*cpu_idle_prop_free = NULL;
176 
177 static kmutex_t				cpu_idle_cb_lock;
178 static cpu_idle_cb_impl_t		*cpu_idle_cb_busy = NULL;
179 static cpu_idle_cb_item_t		*cpu_idle_cb_array = NULL;
180 static int				cpu_idle_cb_curr = 0;
181 static int				cpu_idle_cb_max = 0;
182 
183 static cpu_idle_cb_state_t		*cpu_idle_cb_state;
184 
185 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
186     cpu_idle_prop_value_t *valp);
187 
188 static cpu_idle_prop_item_t cpu_idle_prop_array[] = {
189 	{
190 	    CPU_IDLE_PROP_TYPE_INTPTR, CPU_IDLE_PROP_IDLE_STATE,
191 	    NULL, NULL, NULL
192 	},
193 	{
194 	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_ENTER_TIMESTAMP,
195 	    NULL, NULL, NULL
196 	},
197 	{
198 	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_EXIT_TIMESTAMP,
199 	    NULL, NULL, NULL
200 	},
201 	{
202 	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_IDLE_TIME,
203 	    NULL, NULL, NULL
204 	},
205 	{
206 	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_BUSY_TIME,
207 	    NULL, NULL, NULL
208 	},
209 	{
210 	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_IDLE_TIME,
211 	    NULL, NULL, NULL
212 	},
213 	{
214 	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_BUSY_TIME,
215 	    NULL, NULL, NULL
216 	},
217 	{
218 	    CPU_IDLE_PROP_TYPE_UINT64, CPU_IDLE_PROP_INTERRUPT_COUNT,
219 	    cpu_idle_prop_update_intr_cnt, NULL, NULL
220 	},
221 };
222 
223 #define	CPU_IDLE_PROP_IDX_IDLE_STATE	0
224 #define	CPU_IDLE_PROP_IDX_ENTER_TS	1
225 #define	CPU_IDLE_PROP_IDX_EXIT_TS	2
226 #define	CPU_IDLE_PROP_IDX_LAST_IDLE	3
227 #define	CPU_IDLE_PROP_IDX_LAST_BUSY	4
228 #define	CPU_IDLE_PROP_IDX_TOTAL_IDLE	5
229 #define	CPU_IDLE_PROP_IDX_TOTAL_BUSY	6
230 #define	CPU_IDLE_PROP_IDX_INTR_CNT	7
231 
232 /*ARGSUSED*/
233 static void
234 cpu_idle_dtrace_enter(void *arg, cpu_idle_callback_context_t ctx,
235     cpu_idle_check_wakeup_t check_func, void *check_arg)
236 {
237 	int state;
238 
239 	state = cpu_idle_prop_get_intptr(
240 	    cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle, ctx);
241 	DTRACE_PROBE1(idle__state__transition, uint_t, state);
242 }
243 
244 /*ARGSUSED*/
245 static void
246 cpu_idle_dtrace_exit(void *arg, cpu_idle_callback_context_t ctx, int flag)
247 {
248 	DTRACE_PROBE1(idle__state__transition, uint_t, CPU_IDLE_STATE_NORMAL);
249 }
250 
251 static cpu_idle_callback_handle_t cpu_idle_cb_handle_dtrace;
252 static cpu_idle_callback_t cpu_idle_callback_dtrace = {
253 	CPU_IDLE_CALLBACK_VERS,
254 	cpu_idle_dtrace_enter,
255 	cpu_idle_dtrace_exit,
256 };
257 
#if defined(__x86) && !defined(__xpv)
/*
 * Built-in TLB callback pair for x86 (not built for __xpv).
 * NOTE(review): both functions are declared here as taking no arguments and
 * are cast to the callback signatures, so they are invoked with arguments
 * they do not declare; confirm this is acceptable for the target ABI.
 */
extern void tlb_going_idle(void);
extern void tlb_service(void);

static cpu_idle_callback_handle_t cpu_idle_cb_handle_tlb;
static cpu_idle_callback_t cpu_idle_callback_tlb = {
	CPU_IDLE_CALLBACK_VERS,			/* version */
	(cpu_idle_enter_cbfn_t)tlb_going_idle,	/* idle_enter */
	(cpu_idle_exit_cbfn_t)tlb_service,	/* idle_exit */
};
#endif
269 
270 void
271 cpu_event_init(void)
272 {
273 	int i, idx;
274 	size_t sz;
275 	intptr_t buf;
276 	cpu_idle_cb_state_t *sp;
277 	cpu_idle_prop_item_t *ip;
278 
279 	mutex_init(&cpu_idle_cb_lock, NULL, MUTEX_DRIVER, NULL);
280 	mutex_init(&cpu_idle_prop_lock, NULL, MUTEX_DRIVER, NULL);
281 
282 	/* Create internal properties. */
283 	for (i = 0, ip = cpu_idle_prop_array;
284 	    i < sizeof (cpu_idle_prop_array) / sizeof (cpu_idle_prop_array[0]);
285 	    i++, ip++) {
286 		(void) cpu_idle_prop_create_property(ip->name, ip->type,
287 		    ip->update, ip->arg, &ip->handle);
288 		ASSERT(ip->handle != NULL);
289 	}
290 
291 	/* Allocate buffer and align to CPU_CACHE_COHERENCE_SIZE. */
292 	sz = sizeof (cpu_idle_cb_state_t) * max_ncpus;
293 	sz += CPU_CACHE_COHERENCE_SIZE;
294 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
295 	cpu_idle_cb_state = (cpu_idle_cb_state_t *)P2ROUNDUP(buf,
296 	    CPU_CACHE_COHERENCE_SIZE);
297 
298 	/* Cache frequently used property value pointers. */
299 	for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
300 		idx = CPU_IDLE_CTX2IDX(i);
301 #define	___INIT_P(f, i)	\
302 	sp->v.f = CPU_IDLE_HDL2VALP(cpu_idle_prop_array[(i)].handle, idx)
303 		___INIT_P(idle_state, CPU_IDLE_PROP_IDX_IDLE_STATE);
304 		___INIT_P(enter_ts, CPU_IDLE_PROP_IDX_ENTER_TS);
305 		___INIT_P(exit_ts, CPU_IDLE_PROP_IDX_EXIT_TS);
306 		___INIT_P(last_idle, CPU_IDLE_PROP_IDX_LAST_IDLE);
307 		___INIT_P(last_busy, CPU_IDLE_PROP_IDX_LAST_BUSY);
308 		___INIT_P(total_idle, CPU_IDLE_PROP_IDX_TOTAL_IDLE);
309 		___INIT_P(total_busy, CPU_IDLE_PROP_IDX_TOTAL_BUSY);
310 		___INIT_P(last_idle, CPU_IDLE_PROP_IDX_INTR_CNT);
311 #undef	___INIT_P
312 	}
313 
314 	/* Register built-in callbacks. */
315 	if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_DTRACE,
316 	    &cpu_idle_callback_dtrace, NULL, &cpu_idle_cb_handle_dtrace) != 0) {
317 		cmn_err(CE_PANIC,
318 		    "cpu_idle: failed to register callback for dtrace.");
319 	}
320 #if defined(__x86) && !defined(__xpv)
321 	if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_TLB,
322 	    &cpu_idle_callback_tlb, NULL, &cpu_idle_cb_handle_tlb) != 0) {
323 		cmn_err(CE_PANIC,
324 		    "cpu_idle: failed to register callback for tlb_flush.");
325 	}
326 #endif
327 }
328 
329 void
330 cpu_event_init_cpu(cpu_t *cp)
331 {
332 	ASSERT(cp->cpu_seqid < max_ncpus);
333 	cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
334 }
335 
336 void
337 cpu_event_fini_cpu(cpu_t *cp)
338 {
339 	ASSERT(cp->cpu_seqid < max_ncpus);
340 	cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
341 }
342 
343 static void
344 cpu_idle_insert_callback(cpu_idle_cb_impl_t *cip)
345 {
346 	int unlock = 0, unpause = 0;
347 	int i, cnt_new = 0, cnt_old = 0;
348 	char *buf_new = NULL, *buf_old = NULL;
349 
350 	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
351 
352 	/*
353 	 * Expand array if it's full.
354 	 * Memory must be allocated out of pause/start_cpus() scope because
355 	 * kmem_zalloc() can't be called with KM_SLEEP flag within that scope.
356 	 */
357 	if (cpu_idle_cb_curr == cpu_idle_cb_max) {
358 		cnt_new = cpu_idle_cb_max + CPU_IDLE_ARRAY_CAPACITY_INC;
359 		buf_new = (char *)kmem_zalloc(cnt_new *
360 		    sizeof (cpu_idle_cb_item_t), KM_SLEEP);
361 	}
362 
363 	/* Try to acquire cpu_lock if not held yet. */
364 	if (!MUTEX_HELD(&cpu_lock)) {
365 		mutex_enter(&cpu_lock);
366 		unlock = 1;
367 	}
368 	/*
369 	 * Pause all other CPUs (and let them run pause thread).
370 	 * It's guaranteed that no other threads will access cpu_idle_cb_array
371 	 * after pause_cpus().
372 	 */
373 	if (!cpus_paused()) {
374 		pause_cpus(NULL);
375 		unpause = 1;
376 	}
377 
378 	/* Copy content to new buffer if needed. */
379 	if (buf_new != NULL) {
380 		buf_old = (char *)cpu_idle_cb_array;
381 		cnt_old = cpu_idle_cb_max;
382 		if (buf_old != NULL) {
383 			ASSERT(cnt_old != 0);
384 			bcopy(cpu_idle_cb_array, buf_new,
385 			    sizeof (cpu_idle_cb_item_t) * cnt_old);
386 		}
387 		cpu_idle_cb_array = (cpu_idle_cb_item_t *)buf_new;
388 		cpu_idle_cb_max = cnt_new;
389 	}
390 
391 	/* Insert into array according to priority. */
392 	ASSERT(cpu_idle_cb_curr < cpu_idle_cb_max);
393 	for (i = cpu_idle_cb_curr; i > 0; i--) {
394 		if (cpu_idle_cb_array[i - 1].impl->priority >= cip->priority) {
395 			break;
396 		}
397 		cpu_idle_cb_array[i] = cpu_idle_cb_array[i - 1];
398 	}
399 	cpu_idle_cb_array[i].arg = cip->argument;
400 	cpu_idle_cb_array[i].enter = cip->callback->idle_enter;
401 	cpu_idle_cb_array[i].exit = cip->callback->idle_exit;
402 	cpu_idle_cb_array[i].impl = cip;
403 	cpu_idle_cb_curr++;
404 
405 	/* Resume other CPUs from paused state if needed. */
406 	if (unpause) {
407 		start_cpus();
408 	}
409 	if (unlock) {
410 		mutex_exit(&cpu_lock);
411 	}
412 
413 	/* Free old resource if needed. */
414 	if (buf_old != NULL) {
415 		ASSERT(cnt_old != 0);
416 		kmem_free(buf_old, cnt_old * sizeof (cpu_idle_cb_item_t));
417 	}
418 }
419 
420 static void
421 cpu_idle_remove_callback(cpu_idle_cb_impl_t *cip)
422 {
423 	int i, found = 0;
424 	int unlock = 0, unpause = 0;
425 	cpu_idle_cb_state_t *sp;
426 
427 	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
428 
429 	/* Try to acquire cpu_lock if not held yet. */
430 	if (!MUTEX_HELD(&cpu_lock)) {
431 		mutex_enter(&cpu_lock);
432 		unlock = 1;
433 	}
434 	/*
435 	 * Pause all other CPUs.
436 	 * It's guaranteed that no other threads will access cpu_idle_cb_array
437 	 * after pause_cpus().
438 	 */
439 	if (!cpus_paused()) {
440 		pause_cpus(NULL);
441 		unpause = 1;
442 	}
443 
444 	/* Remove cip from array. */
445 	for (i = 0; i < cpu_idle_cb_curr; i++) {
446 		if (found == 0) {
447 			if (cpu_idle_cb_array[i].impl == cip) {
448 				found = 1;
449 			}
450 		} else {
451 			cpu_idle_cb_array[i - 1] = cpu_idle_cb_array[i];
452 		}
453 	}
454 	ASSERT(found != 0);
455 	cpu_idle_cb_curr--;
456 
457 	/*
458 	 * Reset property ready flag for all CPUs if no registered callback
459 	 * left because cpu_idle_enter/exit will stop updating property if
460 	 * there's no callback registered.
461 	 */
462 	if (cpu_idle_cb_curr == 0) {
463 		for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
464 			sp->v.ready = B_FALSE;
465 		}
466 	}
467 
468 	/* Resume other CPUs from paused state if needed. */
469 	if (unpause) {
470 		start_cpus();
471 	}
472 	if (unlock) {
473 		mutex_exit(&cpu_lock);
474 	}
475 }
476 
477 int
478 cpu_idle_register_callback(uint_t prio, cpu_idle_callback_t *cbp,
479     void *arg, cpu_idle_callback_handle_t *hdlp)
480 {
481 	cpu_idle_cb_state_t *sp;
482 	cpu_idle_cb_impl_t *cip = NULL;
483 
484 	/* First validate parameters. */
485 	ASSERT(!CPU_ON_INTR(CPU));
486 	ASSERT(CPU->cpu_seqid < max_ncpus);
487 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
488 	if (sp->v.index != 0) {
489 		cmn_err(CE_NOTE,
490 		    "!cpu_event: register_callback called from callback.");
491 		return (EBUSY);
492 	} else if (cbp == NULL || hdlp == NULL) {
493 		cmn_err(CE_NOTE,
494 		    "!cpu_event: NULL parameters in register_callback.");
495 		return (EINVAL);
496 	} else if (prio < CPU_IDLE_CB_PRIO_LOW_BASE ||
497 	    prio >= CPU_IDLE_CB_PRIO_RESV_BASE) {
498 		cmn_err(CE_NOTE,
499 		    "!cpu_event: priority 0x%x out of range.", prio);
500 		return (EINVAL);
501 	} else if (cbp->version != CPU_IDLE_CALLBACK_VERS) {
502 		cmn_err(CE_NOTE,
503 		    "!cpu_event: callback version %d is not supported.",
504 		    cbp->version);
505 		return (EINVAL);
506 	}
507 
508 	mutex_enter(&cpu_idle_cb_lock);
509 	/* Check whether callback with priority exists if not dynamic. */
510 	if (prio != CPU_IDLE_CB_PRIO_DYNAMIC) {
511 		for (cip = cpu_idle_cb_busy; cip != NULL;
512 		    cip = cip->next) {
513 			if (cip->priority == prio) {
514 				mutex_exit(&cpu_idle_cb_lock);
515 				cmn_err(CE_NOTE, "!cpu_event: callback with "
516 				    "priority 0x%x already exists.", prio);
517 				return (EEXIST);
518 			}
519 		}
520 	}
521 
522 	cip = kmem_zalloc(sizeof (*cip), KM_SLEEP);
523 	cip->callback = cbp;
524 	cip->argument = arg;
525 	cip->priority = prio;
526 	cip->next = cpu_idle_cb_busy;
527 	cpu_idle_cb_busy = cip;
528 	cpu_idle_insert_callback(cip);
529 	mutex_exit(&cpu_idle_cb_lock);
530 
531 	*hdlp = (cpu_idle_callback_handle_t)cip;
532 
533 	return (0);
534 }
535 
536 int
537 cpu_idle_unregister_callback(cpu_idle_callback_handle_t hdl)
538 {
539 	int rc = ENODEV;
540 	cpu_idle_cb_state_t *sp;
541 	cpu_idle_cb_impl_t *ip, **ipp;
542 
543 	ASSERT(!CPU_ON_INTR(CPU));
544 	ASSERT(CPU->cpu_seqid < max_ncpus);
545 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
546 	if (sp->v.index != 0) {
547 		cmn_err(CE_NOTE,
548 		    "!cpu_event: unregister_callback called from callback.");
549 		return (EBUSY);
550 	} else if (hdl == NULL) {
551 		cmn_err(CE_NOTE,
552 		    "!cpu_event: hdl is NULL in unregister_callback.");
553 		return (EINVAL);
554 	}
555 
556 	ip = (cpu_idle_cb_impl_t *)hdl;
557 	mutex_enter(&cpu_idle_cb_lock);
558 	for (ipp = &cpu_idle_cb_busy; *ipp != NULL; ipp = &(*ipp)->next) {
559 		if (*ipp == ip) {
560 			*ipp = ip->next;
561 			cpu_idle_remove_callback(ip);
562 			rc = 0;
563 			break;
564 		}
565 	}
566 	mutex_exit(&cpu_idle_cb_lock);
567 
568 	if (rc == 0) {
569 		kmem_free(ip, sizeof (*ip));
570 	} else {
571 		cmn_err(CE_NOTE,
572 		    "!cpu_event: callback handle %p not found.", (void *)hdl);
573 	}
574 
575 	return (rc);
576 }
577 
578 static int
579 cpu_idle_enter_state(cpu_idle_cb_state_t *sp, intptr_t state)
580 {
581 	sp->v.idle_state->cipv_intptr = state;
582 	sp->v.enter_ts->cipv_hrtime = gethrtime_unscaled();
583 	sp->v.last_busy->cipv_hrtime = sp->v.enter_ts->cipv_hrtime -
584 	    sp->v.exit_ts->cipv_hrtime;
585 	sp->v.total_busy->cipv_hrtime += sp->v.last_busy->cipv_hrtime;
586 	if (sp->v.ready == B_FALSE) {
587 		sp->v.ready = B_TRUE;
588 		return (0);
589 	}
590 
591 	return (1);
592 }
593 
594 static void
595 cpu_idle_exit_state(cpu_idle_cb_state_t *sp)
596 {
597 	sp->v.idle_state->cipv_intptr = CPU_IDLE_STATE_NORMAL;
598 	sp->v.exit_ts->cipv_hrtime = gethrtime_unscaled();
599 	sp->v.last_idle->cipv_hrtime = sp->v.exit_ts->cipv_hrtime -
600 	    sp->v.enter_ts->cipv_hrtime;
601 	sp->v.total_idle->cipv_hrtime += sp->v.last_idle->cipv_hrtime;
602 }
603 
604 /*ARGSUSED*/
605 int
606 cpu_idle_enter(int state, int flag,
607     cpu_idle_check_wakeup_t check_func, void *check_arg)
608 {
609 	int i;
610 	cpu_idle_cb_item_t *cip;
611 	cpu_idle_cb_state_t *sp;
612 	cpu_idle_callback_context_t ctx;
613 #if defined(__x86)
614 	ulong_t iflags;
615 #endif
616 
617 	ctx = CPU_IDLE_GET_CTX(CPU);
618 	ASSERT(CPU->cpu_seqid < max_ncpus);
619 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
620 	ASSERT(sp->v.index == 0);
621 
622 	/*
623 	 * On x86, cpu_idle_enter can be called from idle thread with either
624 	 * interrupts enabled or disabled, so we need to make sure interrupts
625 	 * are disabled here.
626 	 * On SPARC, cpu_idle_enter will be called from idle thread with
627 	 * interrupt disabled, so no special handling necessary.
628 	 */
629 #if defined(__x86)
630 	iflags = intr_clear();
631 #endif
632 
633 	/* Skip calling callback if state is not ready for current CPU. */
634 	if (cpu_idle_enter_state(sp, state) == 0) {
635 #if defined(__x86)
636 		intr_restore(iflags);
637 #endif
638 		return (0);
639 	}
640 
641 	for (i = 0, cip = cpu_idle_cb_array; i < cpu_idle_cb_curr; i++, cip++) {
642 		/*
643 		 * Increase index so corresponding idle_exit callback
644 		 * will be invoked should interrupt happen during
645 		 * idle_enter callback.
646 		 */
647 		sp->v.index++;
648 
649 		/* Call idle_enter callback function if it's not NULL. */
650 		if (cip->enter != NULL) {
651 			cip->enter(cip->arg, ctx, check_func, check_arg);
652 
653 			/*
654 			 * cpu_idle_enter runs with interrupts
655 			 * disabled, so the idle_enter callbacks will
656 			 * also be called with interrupts disabled.
657 			 * It is permissible for the callbacks to
658 			 * enable the interrupts, if they can also
659 			 * handle the condition if the interrupt
660 			 * occurs.
661 			 *
662 			 * However, if an interrupt occurs and we
663 			 * return here without dealing with it, we
664 			 * return to the cpu_idle_enter() caller
665 			 * with an EBUSY, and the caller will not
666 			 * enter the idle state.
667 			 *
668 			 * We detect the interrupt, by checking the
669 			 * index value of the state pointer.  If it
670 			 * is not the index we incremented above,
671 			 * then it was cleared while processing
672 			 * the interrupt.
673 			 *
674 			 * Also note, that at this point of the code
675 			 * the normal index value will be one greater
676 			 * than the variable 'i' in the loop, as it
677 			 * hasn't yet been incremented.
678 			 */
679 			if (sp->v.index != i + 1) {
680 #if defined(__x86)
681 				intr_restore(iflags);
682 #endif
683 				return (EBUSY);
684 			}
685 		}
686 	}
687 #if defined(__x86)
688 	intr_restore(iflags);
689 #endif
690 
691 	return (0);
692 }
693 
694 void
695 cpu_idle_exit(int flag)
696 {
697 	int i;
698 	cpu_idle_cb_item_t *cip;
699 	cpu_idle_cb_state_t *sp;
700 	cpu_idle_callback_context_t ctx;
701 #if defined(__x86)
702 	ulong_t iflags;
703 #endif
704 
705 	ASSERT(CPU->cpu_seqid < max_ncpus);
706 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
707 
708 #if defined(__sparc)
709 	/*
710 	 * On SPARC, cpu_idle_exit will only be called from idle thread
711 	 * with interrupt disabled.
712 	 */
713 
714 	if (sp->v.index != 0) {
715 		ctx = CPU_IDLE_GET_CTX(CPU);
716 		cpu_idle_exit_state(sp);
717 		for (i = sp->v.index - 1; i >= 0; i--) {
718 			cip = &cpu_idle_cb_array[i];
719 			if (cip->exit != NULL) {
720 				cip->exit(cip->arg, ctx, flag);
721 			}
722 		}
723 		sp->v.index = 0;
724 	}
725 #elif defined(__x86)
726 	/*
727 	 * On x86, cpu_idle_exit will be called from idle thread or interrupt
728 	 * handler. When called from interrupt handler, interrupts will be
729 	 * disabled. When called from idle thread, interrupts may be disabled
730 	 * or enabled.
731 	 */
732 
733 	/* Called from interrupt, interrupts are already disabled. */
734 	if (flag & CPU_IDLE_CB_FLAG_INTR) {
735 		/*
736 		 * return if cpu_idle_exit already called or
737 		 * there is no registered callback.
738 		 */
739 		if (sp->v.index == 0) {
740 			return;
741 		}
742 		ctx = CPU_IDLE_GET_CTX(CPU);
743 		cpu_idle_exit_state(sp);
744 		for (i = sp->v.index - 1; i >= 0; i--) {
745 			cip = &cpu_idle_cb_array[i];
746 			if (cip->exit != NULL) {
747 				cip->exit(cip->arg, ctx, flag);
748 			}
749 		}
750 		sp->v.index = 0;
751 
752 	/* Called from idle thread, need to disable interrupt. */
753 	} else {
754 		iflags = intr_clear();
755 		if (sp->v.index != 0) {
756 			ctx = CPU_IDLE_GET_CTX(CPU);
757 			cpu_idle_exit_state(sp);
758 			for (i = sp->v.index - 1; i >= 0; i--) {
759 				cip = &cpu_idle_cb_array[i];
760 				if (cip->exit != NULL) {
761 					cip->exit(cip->arg, ctx, flag);
762 				}
763 			}
764 			sp->v.index = 0;
765 		}
766 		intr_restore(iflags);
767 	}
768 #endif
769 }
770 
771 cpu_idle_callback_context_t
772 cpu_idle_get_context(void)
773 {
774 	return (CPU_IDLE_GET_CTX(CPU));
775 }
776 
777 /*
778  * Allocate property structure in group of CPU_IDLE_VALUE_GROUP_SIZE to improve
779  * cache efficiency. To simplify implementation, allocated memory for property
780  * structure won't be freed.
781  */
782 static void
783 cpu_idle_prop_allocate_impl(void)
784 {
785 	int i;
786 	size_t sz;
787 	intptr_t buf;
788 	cpu_idle_prop_impl_t *prop;
789 	cpu_idle_prop_value_t *valp;
790 
791 	ASSERT(!CPU_ON_INTR(CPU));
792 	prop = kmem_zalloc(sizeof (*prop) * CPU_IDLE_VALUE_GROUP_SIZE,
793 	    KM_SLEEP);
794 	sz = sizeof (*valp) * CPU_IDLE_VALUE_GROUP_SIZE * max_ncpus;
795 	sz += CPU_CACHE_COHERENCE_SIZE;
796 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
797 	valp = (cpu_idle_prop_value_t *)P2ROUNDUP(buf,
798 	    CPU_CACHE_COHERENCE_SIZE);
799 
800 	for (i = 0; i < CPU_IDLE_VALUE_GROUP_SIZE; i++, prop++, valp++) {
801 		prop->value = valp;
802 		prop->next = cpu_idle_prop_free;
803 		cpu_idle_prop_free = prop;
804 	}
805 }
806 
807 int
808 cpu_idle_prop_create_property(const char *name, cpu_idle_prop_type_t type,
809     cpu_idle_prop_update_t update, void *arg, cpu_idle_prop_handle_t *hdlp)
810 {
811 	int rc = EEXIST;
812 	cpu_idle_prop_impl_t *prop;
813 
814 	ASSERT(!CPU_ON_INTR(CPU));
815 	if (name == NULL || hdlp == NULL) {
816 		cmn_err(CE_WARN,
817 		    "!cpu_event: NULL parameters in create_property.");
818 		return (EINVAL);
819 	}
820 
821 	mutex_enter(&cpu_idle_prop_lock);
822 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
823 		if (strcmp(prop->name, name) == 0) {
824 			cmn_err(CE_NOTE,
825 			    "!cpu_event: property %s already exists.", name);
826 			break;
827 		}
828 	}
829 	if (prop == NULL) {
830 		if (cpu_idle_prop_free == NULL) {
831 			cpu_idle_prop_allocate_impl();
832 		}
833 		ASSERT(cpu_idle_prop_free != NULL);
834 		prop = cpu_idle_prop_free;
835 		cpu_idle_prop_free = prop->next;
836 		prop->next = cpu_idle_prop_busy;
837 		cpu_idle_prop_busy = prop;
838 
839 		ASSERT(prop->value != NULL);
840 		prop->name = strdup(name);
841 		prop->type = type;
842 		prop->update = update;
843 		prop->private = arg;
844 		prop->refcnt = 1;
845 		*hdlp = prop;
846 		rc = 0;
847 	}
848 	mutex_exit(&cpu_idle_prop_lock);
849 
850 	return (rc);
851 }
852 
853 int
854 cpu_idle_prop_destroy_property(cpu_idle_prop_handle_t hdl)
855 {
856 	int rc = ENODEV;
857 	cpu_idle_prop_impl_t *prop, **propp;
858 	cpu_idle_prop_value_t *valp;
859 
860 	ASSERT(!CPU_ON_INTR(CPU));
861 	if (hdl == NULL) {
862 		cmn_err(CE_WARN,
863 		    "!cpu_event: hdl is NULL in destroy_property.");
864 		return (EINVAL);
865 	}
866 
867 	prop = (cpu_idle_prop_impl_t *)hdl;
868 	mutex_enter(&cpu_idle_prop_lock);
869 	for (propp = &cpu_idle_prop_busy; *propp != NULL;
870 	    propp = &(*propp)->next) {
871 		if (*propp == prop) {
872 			ASSERT(prop->refcnt > 0);
873 			if (atomic_cas_32(&prop->refcnt, 1, 0) == 1) {
874 				*propp = prop->next;
875 				strfree(prop->name);
876 				valp = prop->value;
877 				bzero(prop, sizeof (*prop));
878 				prop->value = valp;
879 				prop->next = cpu_idle_prop_free;
880 				cpu_idle_prop_free = prop;
881 				rc = 0;
882 			} else {
883 				rc = EBUSY;
884 			}
885 			break;
886 		}
887 	}
888 	mutex_exit(&cpu_idle_prop_lock);
889 
890 	return (rc);
891 }
892 
893 int
894 cpu_idle_prop_create_handle(const char *name, cpu_idle_prop_handle_t *hdlp)
895 {
896 	int rc = ENODEV;
897 	cpu_idle_prop_impl_t *prop;
898 
899 	ASSERT(!CPU_ON_INTR(CPU));
900 	if (name == NULL || hdlp == NULL) {
901 		cmn_err(CE_WARN,
902 		    "!cpu_event: NULL parameters in create_handle.");
903 		return (EINVAL);
904 	}
905 
906 	mutex_enter(&cpu_idle_prop_lock);
907 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
908 		if (strcmp(prop->name, name) == 0) {
909 			/* Hold one refcount on object. */
910 			ASSERT(prop->refcnt > 0);
911 			atomic_inc_32(&prop->refcnt);
912 			*hdlp = (cpu_idle_prop_handle_t)prop;
913 			rc = 0;
914 			break;
915 		}
916 	}
917 	mutex_exit(&cpu_idle_prop_lock);
918 
919 	return (rc);
920 }
921 
922 int
923 cpu_idle_prop_destroy_handle(cpu_idle_prop_handle_t hdl)
924 {
925 	int rc = ENODEV;
926 	cpu_idle_prop_impl_t *prop;
927 
928 	ASSERT(!CPU_ON_INTR(CPU));
929 	if (hdl == NULL) {
930 		cmn_err(CE_WARN,
931 		    "!cpu_event: hdl is NULL in destroy_handle.");
932 		return (EINVAL);
933 	}
934 
935 	mutex_enter(&cpu_idle_prop_lock);
936 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
937 		if (prop == hdl) {
938 			/* Release refcnt held in create_handle. */
939 			ASSERT(prop->refcnt > 1);
940 			atomic_dec_32(&prop->refcnt);
941 			rc = 0;
942 			break;
943 		}
944 	}
945 	mutex_exit(&cpu_idle_prop_lock);
946 
947 	return (rc);
948 }
949 
950 cpu_idle_prop_type_t
951 cpu_idle_prop_get_type(cpu_idle_prop_handle_t hdl)
952 {
953 	ASSERT(hdl != NULL);
954 	return (((cpu_idle_prop_impl_t *)hdl)->type);
955 }
956 
957 const char *
958 cpu_idle_prop_get_name(cpu_idle_prop_handle_t hdl)
959 {
960 	ASSERT(hdl != NULL);
961 	return (((cpu_idle_prop_impl_t *)hdl)->name);
962 }
963 
964 int
965 cpu_idle_prop_get_value(cpu_idle_prop_handle_t hdl,
966     cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t *valp)
967 {
968 	int idx, rc = 0;
969 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
970 
971 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
972 	if (hdl == NULL || valp == NULL) {
973 		cmn_err(CE_NOTE, "!cpu_event: NULL parameters in prop_get.");
974 		return (EINVAL);
975 	}
976 	idx = CPU_IDLE_CTX2IDX(ctx);
977 	if (prop->update != NULL) {
978 		cpu_idle_cb_state_t *sp;
979 
980 		ASSERT(CPU->cpu_seqid < max_ncpus);
981 		sp = &cpu_idle_cb_state[CPU->cpu_seqid];
982 		/* CPU's idle enter timestamp as sequence number. */
983 		rc = prop->update(prop->private,
984 		    (uint64_t)sp->v.enter_ts->cipv_hrtime, &prop->value[idx]);
985 	}
986 	if (rc == 0) {
987 		*valp = prop->value[idx];
988 	}
989 
990 	return (rc);
991 }
992 
993 uint32_t
994 cpu_idle_prop_get_uint32(cpu_idle_prop_handle_t hdl,
995     cpu_idle_callback_context_t ctx)
996 {
997 	int idx;
998 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
999 
1000 	ASSERT(hdl != NULL);
1001 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1002 	idx = CPU_IDLE_CTX2IDX(ctx);
1003 	return (prop->value[idx].cipv_uint32);
1004 }
1005 
1006 uint64_t
1007 cpu_idle_prop_get_uint64(cpu_idle_prop_handle_t hdl,
1008     cpu_idle_callback_context_t ctx)
1009 {
1010 	int idx;
1011 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1012 
1013 	ASSERT(hdl != NULL);
1014 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1015 	idx = CPU_IDLE_CTX2IDX(ctx);
1016 	return (prop->value[idx].cipv_uint64);
1017 }
1018 
1019 intptr_t
1020 cpu_idle_prop_get_intptr(cpu_idle_prop_handle_t hdl,
1021     cpu_idle_callback_context_t ctx)
1022 {
1023 	int idx;
1024 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1025 
1026 	ASSERT(hdl != NULL);
1027 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1028 	idx = CPU_IDLE_CTX2IDX(ctx);
1029 	return (prop->value[idx].cipv_intptr);
1030 }
1031 
1032 hrtime_t
1033 cpu_idle_prop_get_hrtime(cpu_idle_prop_handle_t hdl,
1034     cpu_idle_callback_context_t ctx)
1035 {
1036 	int idx;
1037 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1038 
1039 	ASSERT(hdl != NULL);
1040 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1041 	idx = CPU_IDLE_CTX2IDX(ctx);
1042 	return (prop->value[idx].cipv_hrtime);
1043 }
1044 
1045 void
1046 cpu_idle_prop_set_value(cpu_idle_prop_handle_t hdl,
1047     cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t val)
1048 {
1049 	int idx;
1050 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1051 
1052 	ASSERT(hdl != NULL);
1053 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1054 	idx = CPU_IDLE_CTX2IDX(ctx);
1055 	prop->value[idx] = val;
1056 }
1057 
1058 void
1059 cpu_idle_prop_set_all(cpu_idle_prop_handle_t hdl, cpu_idle_prop_value_t val)
1060 {
1061 	int i, idx;
1062 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1063 
1064 	ASSERT(hdl != NULL);
1065 	for (i = 0; i < max_ncpus; i++) {
1066 		idx = CPU_IDLE_CTX2IDX(i);
1067 		prop->value[idx] = val;
1068 	}
1069 }
1070 
1071 /*ARGSUSED*/
1072 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
1073     cpu_idle_prop_value_t *valp)
1074 {
1075 	int i;
1076 	uint64_t val;
1077 
1078 	for (val = 0, i = 0; i < PIL_MAX; i++) {
1079 		val += CPU->cpu_stats.sys.intr[i];
1080 	}
1081 	valp->cipv_uint64 = val;
1082 
1083 	return (0);
1084 }
1085 
1086 uint_t
1087 cpu_idle_get_cpu_state(cpu_t *cp)
1088 {
1089 	ASSERT(cp != NULL && cp->cpu_seqid < max_ncpus);
1090 	return ((uint_t)cpu_idle_prop_get_uint32(
1091 	    cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle,
1092 	    CPU_IDLE_GET_CTX(cp)));
1093 }
1094