xref: /titanic_41/usr/src/uts/common/os/cpu_event.c (revision 744947dc83c634d985ed3ad79ac9c5e28d1865fd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009-2010, Intel Corporation.
23  * All rights reserved.
24  */
25 
26 /*
27  * Introduction
28  * This file implements a CPU event notification mechanism to signal clients
29  * which are interested in CPU related events.
 * Currently it only supports CPU idle state change events, which are
 * triggered just before a CPU enters a hardware idle state and just after
 * it wakes up from a hardware idle state.
 * Please refer to PSARC/2009/115 for detailed information.
34  *
35  * Lock Strategy
36  * 1) cpu_idle_prop_busy/free are protected by cpu_idle_prop_lock.
37  * 2) No protection for cpu_idle_cb_state because it's per-CPU data.
38  * 3) cpu_idle_cb_busy is protected by cpu_idle_cb_lock.
39  * 4) cpu_idle_cb_array is protected by pause_cpus/start_cpus logic.
40  * 5) cpu_idle_cb_max/curr are protected by both cpu_idle_cb_lock and
41  *    pause_cpus/start_cpus logic.
 * We have optimized the algorithm for the hot path, which is read side access.
 * In the current algorithm, read side access is lock free.
 * On the write side, we use pause_cpus() to keep other CPUs in the pause
 * thread, which guarantees that no other threads will access the
 * cpu_idle_cb_max/curr/array data structures.
47  */
48 
49 #include <sys/types.h>
50 #include <sys/cmn_err.h>
51 #include <sys/cpuvar.h>
52 #include <sys/cpu.h>
53 #include <sys/kmem.h>
54 #include <sys/machcpuvar.h>
55 #include <sys/sdt.h>
56 #include <sys/sysmacros.h>
57 #include <sys/synch.h>
58 #include <sys/systm.h>
59 #include <sys/sunddi.h>
60 #if defined(__sparc)
61 #include <sys/machsystm.h>
62 #elif defined(__x86)
63 #include <sys/archsystm.h>
64 #endif
65 #include <sys/cpu_event.h>
66 
/*
 * Define the normal (non-idle) state for a CPU on different platforms.
 * This is the value stored into the idle state property when a CPU leaves
 * idle, and the value reported by the DTrace probe on idle exit.
 */
#if defined(__x86)
#define	CPU_IDLE_STATE_NORMAL		IDLE_STATE_C0
#elif defined(__sparc)
/*
 * At the time of this implementation IDLE_STATE_NORMAL is defined
 * in mach_startup.c, and not in a header file.  So if we find it is
 * undefined, then we set it to the value as defined in mach_startup.c
 * Should it eventually be defined, we will pick it up.
 */
#ifndef	IDLE_STATE_NORMAL
#define	IDLE_STATE_NORMAL	0
#endif
#define	CPU_IDLE_STATE_NORMAL	IDLE_STATE_NORMAL
#endif
82 
/*
 * To improve cache efficiency and avoid cache false sharing, CPU idle
 * properties are grouped into cache lines as below:
 * |     CPU0      |     CPU1      |.........|     CPUn      |
 * | cache line 0  | cache line 1  |.........| cache line n  |
 * | v0 | ... | vm | v0 | ... | vm |.........| v0 | ... | vm |
 * To access value of property m for CPU n, using following value as index:
 *    index = seq_id_of_CPUn * CPU_IDLE_VALUE_GROUP_SIZE + m.
 *
 * CPU_IDLE_VALUE_GROUP_SIZE is the number of property values that fit in
 * one CPU_CACHE_COHERENCE_SIZE sized line.
 */
#define	CPU_IDLE_VALUE_GROUP_SIZE	\
	(CPU_CACHE_COHERENCE_SIZE / sizeof (cpu_idle_prop_value_t))

/*
 * Build the callback context handle for CPU cp.  The handle is simply the
 * CPU's sequential id (cpu_seqid) cast to the opaque context type.
 */
#define	CPU_IDLE_GET_CTX(cp)		\
	((cpu_idle_callback_context_t)(intptr_t)((cp)->cpu_seqid))

/* Get CPU sequential id from ctx. */
#define	CPU_IDLE_CTX2CPUID(ctx)		((processorid_t)(intptr_t)(ctx))

/*
 * Compute index from callback context handle.  The result is the offset of
 * the CPU's value group; it is applied to a property's own value pointer,
 * which already accounts for the property's position within the group.
 */
#define	CPU_IDLE_CTX2IDX(ctx)		\
	(((int)(intptr_t)(ctx)) * CPU_IDLE_VALUE_GROUP_SIZE)

/*
 * Get a pointer to the value at index idx of the property identified by
 * handle hdl.  The handle is treated as a cpu_idle_prop_impl_t pointer.
 */
#define	CPU_IDLE_HDL2VALP(hdl, idx)	\
	(&((cpu_idle_prop_impl_t *)(hdl))->value[(idx)])

/*
 * When cpu_idle_cb_array is NULL or full, grow it by
 * CPU_IDLE_ARRAY_CAPACITY_INC entries each time. Here we prefer linear
 * growth instead of exponential.
 */
#define	CPU_IDLE_ARRAY_CAPACITY_INC	0x10
114 
/*
 * Implementation of a CPU idle property.  A property handle handed out to
 * clients is a pointer to one of these structures.
 */
typedef struct cpu_idle_prop_impl {
	/* Value for the first CPU; index with CPU_IDLE_CTX2IDX() for others. */
	cpu_idle_prop_value_t		*value;
	/* Link on either the cpu_idle_prop_busy or cpu_idle_prop_free list. */
	struct cpu_idle_prop_impl	*next;
	/* Private copy of the property name (strdup'ed on creation). */
	char				*name;
	/* Optional hook run by cpu_idle_prop_get_value() before reading. */
	cpu_idle_prop_update_t		update;
	/* Opaque argument passed as first parameter to update. */
	void				*private;
	/* Value type declared by the creator of the property. */
	cpu_idle_prop_type_t		type;
	/* One reference for the creator plus one per created handle. */
	uint32_t			refcnt;
} cpu_idle_prop_impl_t;
124 
/*
 * Description of a built-in property.  Entries of this type are used by
 * cpu_event_init() to create the internal properties.
 */
typedef struct cpu_idle_prop_item {
	/* Value type of the property. */
	cpu_idle_prop_type_t		type;
	/* Name under which the property is created. */
	char				*name;
	/* Optional update hook, NULL if the value is maintained directly. */
	cpu_idle_prop_update_t		update;
	/* Argument passed to the update hook. */
	void				*arg;
	/* Handle filled in by cpu_event_init() when the property is created. */
	cpu_idle_prop_handle_t		handle;
} cpu_idle_prop_item_t;
132 
/*
 * Structure to maintain registered callbacks in list.
 * A callback handle handed out to clients is a pointer to one of these.
 */
typedef struct cpu_idle_cb_impl {
	/* Link on the cpu_idle_cb_busy list. */
	struct cpu_idle_cb_impl		*next;
	/* Callback descriptor supplied at registration time. */
	cpu_idle_callback_t		*callback;
	/* Opaque argument supplied at registration time. */
	void				*argument;
	/* Priority supplied at registration time; orders cpu_idle_cb_array. */
	int				priority;
} cpu_idle_cb_impl_t;
140 
/*
 * Structure to maintain registered callbacks in priority order and also
 * optimized for cache efficiency for reading access.
 * The enter/exit/arg fields are copies of what is reachable through impl, so
 * that the idle path does not have to follow the impl pointer.
 */
typedef struct cpu_idle_cb_item {
	/* Function called from cpu_idle_enter(), may be NULL. */
	cpu_idle_enter_cbfn_t		enter;
	/* Function called from cpu_idle_exit(), may be NULL. */
	cpu_idle_exit_cbfn_t		exit;
	/* Argument passed to both enter and exit. */
	void				*arg;
	/* Back pointer to the registration record. */
	cpu_idle_cb_impl_t		*impl;
} cpu_idle_cb_item_t;
151 
/*
 * Per-CPU state aligned to CPU_CACHE_COHERENCE_SIZE to avoid false sharing.
 * The align member only exists to pad the union to a multiple of
 * CPU_CACHE_COHERENCE_SIZE; all real data lives in v.
 */
typedef union cpu_idle_cb_state {
	struct {
		/*
		 * Index of already invoked callbacks.  Non-zero while the CPU
		 * is between cpu_idle_enter() and cpu_idle_exit().
		 */
		int			index;
		/* Invoke registered callbacks if true. */
		boolean_t		enabled;
		/* Property values are valid if true. */
		boolean_t		ready;
		/*
		 * Pointers to per-CPU properties.  They are set up once by
		 * cpu_event_init() and point at this CPU's value of the
		 * corresponding built-in property.
		 */
		cpu_idle_prop_value_t	*idle_state;
		cpu_idle_prop_value_t	*enter_ts;
		cpu_idle_prop_value_t	*exit_ts;
		cpu_idle_prop_value_t	*last_idle;
		cpu_idle_prop_value_t	*last_busy;
		cpu_idle_prop_value_t	*total_idle;
		cpu_idle_prop_value_t	*total_busy;
		cpu_idle_prop_value_t	*intr_cnt;
	} v;
#ifdef _LP64
	char				align[2 * CPU_CACHE_COHERENCE_SIZE];
#else
	char				align[CPU_CACHE_COHERENCE_SIZE];
#endif
} cpu_idle_cb_state_t;
177 
/*
 * Property bookkeeping.  cpu_idle_prop_busy lists properties in use,
 * cpu_idle_prop_free lists preallocated but unused property structures.
 * Both lists are protected by cpu_idle_prop_lock.
 */
static kmutex_t				cpu_idle_prop_lock;
static cpu_idle_prop_impl_t		*cpu_idle_prop_busy = NULL;
static cpu_idle_prop_impl_t		*cpu_idle_prop_free = NULL;

/*
 * Callback bookkeeping.  cpu_idle_cb_busy lists all registrations.
 * cpu_idle_cb_array holds the same registrations ordered by priority, with
 * cpu_idle_cb_curr entries in use out of cpu_idle_cb_max allocated.
 */
static kmutex_t				cpu_idle_cb_lock;
static cpu_idle_cb_impl_t		*cpu_idle_cb_busy = NULL;
static cpu_idle_cb_item_t		*cpu_idle_cb_array = NULL;
static int				cpu_idle_cb_curr = 0;
static int				cpu_idle_cb_max = 0;

/* Per-CPU state array indexed by cpu_seqid, allocated in cpu_event_init(). */
static cpu_idle_cb_state_t		*cpu_idle_cb_state;

#ifdef	__x86
/*
 * cpuset used to intercept CPUs before powering them off.
 * The control CPU sets the bit corresponding to the target CPU and waits
 * until the bit is cleared.
 * The target CPU disables interrupts before clearing corresponding bit and
 * then loops for ever.
 */
static cpuset_t				cpu_idle_intercept_set;
#endif
200 
/* Update hook for the interrupt count property, defined later in the file. */
static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
    cpu_idle_prop_value_t *valp);

/*
 * Table of built-in properties created by cpu_event_init().
 * Each entry is { type, name, update, arg, handle }.  The order of the
 * entries must match the CPU_IDLE_PROP_IDX_* definitions that follow.
 */
static cpu_idle_prop_item_t cpu_idle_prop_array[] = {
	{
	    CPU_IDLE_PROP_TYPE_INTPTR, CPU_IDLE_PROP_IDLE_STATE,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_ENTER_TIMESTAMP,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_EXIT_TIMESTAMP,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_IDLE_TIME,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_BUSY_TIME,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_IDLE_TIME,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_BUSY_TIME,
	    NULL, NULL, NULL
	},
	{
	    /* The only built-in property computed on demand by a hook. */
	    CPU_IDLE_PROP_TYPE_UINT64, CPU_IDLE_PROP_INTERRUPT_COUNT,
	    cpu_idle_prop_update_intr_cnt, NULL, NULL
	},
};

/* Indices into cpu_idle_prop_array; keep in sync with the table above. */
#define	CPU_IDLE_PROP_IDX_IDLE_STATE	0
#define	CPU_IDLE_PROP_IDX_ENTER_TS	1
#define	CPU_IDLE_PROP_IDX_EXIT_TS	2
#define	CPU_IDLE_PROP_IDX_LAST_IDLE	3
#define	CPU_IDLE_PROP_IDX_LAST_BUSY	4
#define	CPU_IDLE_PROP_IDX_TOTAL_IDLE	5
#define	CPU_IDLE_PROP_IDX_TOTAL_BUSY	6
#define	CPU_IDLE_PROP_IDX_INTR_CNT	7
247 
248 /*ARGSUSED*/
249 static void
250 cpu_idle_dtrace_enter(void *arg, cpu_idle_callback_context_t ctx,
251     cpu_idle_check_wakeup_t check_func, void *check_arg)
252 {
253 	int state;
254 
255 	state = cpu_idle_prop_get_intptr(
256 	    cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle, ctx);
257 	DTRACE_PROBE1(idle__state__transition, uint_t, state);
258 }
259 
/*
 * Built-in idle_exit callback for DTrace.
 * Fires the idle-state-transition probe with CPU_IDLE_STATE_NORMAL.
 * None of the arguments are used.
 */
/*ARGSUSED*/
static void
cpu_idle_dtrace_exit(void *arg, cpu_idle_callback_context_t ctx, int flag)
{
	DTRACE_PROBE1(idle__state__transition, uint_t, CPU_IDLE_STATE_NORMAL);
}
266 
/* Handle and descriptor of the built-in DTrace callback. */
static cpu_idle_callback_handle_t cpu_idle_cb_handle_dtrace;
static cpu_idle_callback_t cpu_idle_callback_dtrace = {
	CPU_IDLE_CALLBACK_VERS,
	cpu_idle_dtrace_enter,
	cpu_idle_dtrace_exit,
};

#if defined(__x86) && !defined(__xpv)
extern void tlb_going_idle(void);
extern void tlb_service(void);

/*
 * Handle and descriptor of the built-in TLB callback.
 * NOTE(review): tlb_going_idle() and tlb_service() are declared above as
 * taking no arguments but are cast to the callback function types, so they
 * will be called with arguments they ignore.
 */
static cpu_idle_callback_handle_t cpu_idle_cb_handle_tlb;
static cpu_idle_callback_t cpu_idle_callback_tlb = {
	CPU_IDLE_CALLBACK_VERS,
	(cpu_idle_enter_cbfn_t)tlb_going_idle,
	(cpu_idle_exit_cbfn_t)tlb_service,
};
#endif
285 
286 void
287 cpu_event_init(void)
288 {
289 	int i, idx;
290 	size_t sz;
291 	intptr_t buf;
292 	cpu_idle_cb_state_t *sp;
293 	cpu_idle_prop_item_t *ip;
294 
295 	mutex_init(&cpu_idle_cb_lock, NULL, MUTEX_DRIVER, NULL);
296 	mutex_init(&cpu_idle_prop_lock, NULL, MUTEX_DRIVER, NULL);
297 
298 	/* Create internal properties. */
299 	for (i = 0, ip = cpu_idle_prop_array;
300 	    i < sizeof (cpu_idle_prop_array) / sizeof (cpu_idle_prop_array[0]);
301 	    i++, ip++) {
302 		(void) cpu_idle_prop_create_property(ip->name, ip->type,
303 		    ip->update, ip->arg, &ip->handle);
304 		ASSERT(ip->handle != NULL);
305 	}
306 
307 	/* Allocate buffer and align to CPU_CACHE_COHERENCE_SIZE. */
308 	sz = sizeof (cpu_idle_cb_state_t) * max_ncpus;
309 	sz += CPU_CACHE_COHERENCE_SIZE;
310 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
311 	cpu_idle_cb_state = (cpu_idle_cb_state_t *)P2ROUNDUP(buf,
312 	    CPU_CACHE_COHERENCE_SIZE);
313 
314 	/* Cache frequently used property value pointers. */
315 	for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
316 		idx = CPU_IDLE_CTX2IDX(i);
317 #define	___INIT_P(f, i)	\
318 	sp->v.f = CPU_IDLE_HDL2VALP(cpu_idle_prop_array[(i)].handle, idx)
319 		___INIT_P(idle_state, CPU_IDLE_PROP_IDX_IDLE_STATE);
320 		___INIT_P(enter_ts, CPU_IDLE_PROP_IDX_ENTER_TS);
321 		___INIT_P(exit_ts, CPU_IDLE_PROP_IDX_EXIT_TS);
322 		___INIT_P(last_idle, CPU_IDLE_PROP_IDX_LAST_IDLE);
323 		___INIT_P(last_busy, CPU_IDLE_PROP_IDX_LAST_BUSY);
324 		___INIT_P(total_idle, CPU_IDLE_PROP_IDX_TOTAL_IDLE);
325 		___INIT_P(total_busy, CPU_IDLE_PROP_IDX_TOTAL_BUSY);
326 		___INIT_P(last_idle, CPU_IDLE_PROP_IDX_INTR_CNT);
327 #undef	___INIT_P
328 	}
329 
330 	/* Register built-in callbacks. */
331 	if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_DTRACE,
332 	    &cpu_idle_callback_dtrace, NULL, &cpu_idle_cb_handle_dtrace) != 0) {
333 		cmn_err(CE_PANIC,
334 		    "cpu_idle: failed to register callback for dtrace.");
335 	}
336 #if defined(__x86) && !defined(__xpv)
337 	if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_TLB,
338 	    &cpu_idle_callback_tlb, NULL, &cpu_idle_cb_handle_tlb) != 0) {
339 		cmn_err(CE_PANIC,
340 		    "cpu_idle: failed to register callback for tlb_flush.");
341 	}
342 #endif
343 }
344 
345 /*
346  * This function is called to initialize per CPU state when starting CPUs.
347  */
348 void
349 cpu_event_init_cpu(cpu_t *cp)
350 {
351 	ASSERT(cp->cpu_seqid < max_ncpus);
352 	cpu_idle_cb_state[cp->cpu_seqid].v.index = 0;
353 	cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
354 	cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_TRUE;
355 }
356 
357 /*
358  * This function is called to clean up per CPU state when stopping CPUs.
359  */
360 void
361 cpu_event_fini_cpu(cpu_t *cp)
362 {
363 	ASSERT(cp->cpu_seqid < max_ncpus);
364 	cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_FALSE;
365 	cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
366 }
367 
368 static void
369 cpu_idle_insert_callback(cpu_idle_cb_impl_t *cip)
370 {
371 	int unlock = 0, unpause = 0;
372 	int i, cnt_new = 0, cnt_old = 0;
373 	char *buf_new = NULL, *buf_old = NULL;
374 
375 	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
376 
377 	/*
378 	 * Expand array if it's full.
379 	 * Memory must be allocated out of pause/start_cpus() scope because
380 	 * kmem_zalloc() can't be called with KM_SLEEP flag within that scope.
381 	 */
382 	if (cpu_idle_cb_curr == cpu_idle_cb_max) {
383 		cnt_new = cpu_idle_cb_max + CPU_IDLE_ARRAY_CAPACITY_INC;
384 		buf_new = (char *)kmem_zalloc(cnt_new *
385 		    sizeof (cpu_idle_cb_item_t), KM_SLEEP);
386 	}
387 
388 	/* Try to acquire cpu_lock if not held yet. */
389 	if (!MUTEX_HELD(&cpu_lock)) {
390 		mutex_enter(&cpu_lock);
391 		unlock = 1;
392 	}
393 	/*
394 	 * Pause all other CPUs (and let them run pause thread).
395 	 * It's guaranteed that no other threads will access cpu_idle_cb_array
396 	 * after pause_cpus().
397 	 */
398 	if (!cpus_paused()) {
399 		pause_cpus(NULL);
400 		unpause = 1;
401 	}
402 
403 	/* Copy content to new buffer if needed. */
404 	if (buf_new != NULL) {
405 		buf_old = (char *)cpu_idle_cb_array;
406 		cnt_old = cpu_idle_cb_max;
407 		if (buf_old != NULL) {
408 			ASSERT(cnt_old != 0);
409 			bcopy(cpu_idle_cb_array, buf_new,
410 			    sizeof (cpu_idle_cb_item_t) * cnt_old);
411 		}
412 		cpu_idle_cb_array = (cpu_idle_cb_item_t *)buf_new;
413 		cpu_idle_cb_max = cnt_new;
414 	}
415 
416 	/* Insert into array according to priority. */
417 	ASSERT(cpu_idle_cb_curr < cpu_idle_cb_max);
418 	for (i = cpu_idle_cb_curr; i > 0; i--) {
419 		if (cpu_idle_cb_array[i - 1].impl->priority >= cip->priority) {
420 			break;
421 		}
422 		cpu_idle_cb_array[i] = cpu_idle_cb_array[i - 1];
423 	}
424 	cpu_idle_cb_array[i].arg = cip->argument;
425 	cpu_idle_cb_array[i].enter = cip->callback->idle_enter;
426 	cpu_idle_cb_array[i].exit = cip->callback->idle_exit;
427 	cpu_idle_cb_array[i].impl = cip;
428 	cpu_idle_cb_curr++;
429 
430 	/* Resume other CPUs from paused state if needed. */
431 	if (unpause) {
432 		start_cpus();
433 	}
434 	if (unlock) {
435 		mutex_exit(&cpu_lock);
436 	}
437 
438 	/* Free old resource if needed. */
439 	if (buf_old != NULL) {
440 		ASSERT(cnt_old != 0);
441 		kmem_free(buf_old, cnt_old * sizeof (cpu_idle_cb_item_t));
442 	}
443 }
444 
445 static void
446 cpu_idle_remove_callback(cpu_idle_cb_impl_t *cip)
447 {
448 	int i, found = 0;
449 	int unlock = 0, unpause = 0;
450 	cpu_idle_cb_state_t *sp;
451 
452 	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
453 
454 	/* Try to acquire cpu_lock if not held yet. */
455 	if (!MUTEX_HELD(&cpu_lock)) {
456 		mutex_enter(&cpu_lock);
457 		unlock = 1;
458 	}
459 	/*
460 	 * Pause all other CPUs.
461 	 * It's guaranteed that no other threads will access cpu_idle_cb_array
462 	 * after pause_cpus().
463 	 */
464 	if (!cpus_paused()) {
465 		pause_cpus(NULL);
466 		unpause = 1;
467 	}
468 
469 	/* Remove cip from array. */
470 	for (i = 0; i < cpu_idle_cb_curr; i++) {
471 		if (found == 0) {
472 			if (cpu_idle_cb_array[i].impl == cip) {
473 				found = 1;
474 			}
475 		} else {
476 			cpu_idle_cb_array[i - 1] = cpu_idle_cb_array[i];
477 		}
478 	}
479 	ASSERT(found != 0);
480 	cpu_idle_cb_curr--;
481 
482 	/*
483 	 * Reset property ready flag for all CPUs if no registered callback
484 	 * left because cpu_idle_enter/exit will stop updating property if
485 	 * there's no callback registered.
486 	 */
487 	if (cpu_idle_cb_curr == 0) {
488 		for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
489 			sp->v.ready = B_FALSE;
490 		}
491 	}
492 
493 	/* Resume other CPUs from paused state if needed. */
494 	if (unpause) {
495 		start_cpus();
496 	}
497 	if (unlock) {
498 		mutex_exit(&cpu_lock);
499 	}
500 }
501 
502 int
503 cpu_idle_register_callback(uint_t prio, cpu_idle_callback_t *cbp,
504     void *arg, cpu_idle_callback_handle_t *hdlp)
505 {
506 	cpu_idle_cb_state_t *sp;
507 	cpu_idle_cb_impl_t *cip = NULL;
508 
509 	/* First validate parameters. */
510 	ASSERT(!CPU_ON_INTR(CPU));
511 	ASSERT(CPU->cpu_seqid < max_ncpus);
512 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
513 	if (sp->v.index != 0) {
514 		cmn_err(CE_NOTE,
515 		    "!cpu_event: register_callback called from callback.");
516 		return (EBUSY);
517 	} else if (cbp == NULL || hdlp == NULL) {
518 		cmn_err(CE_NOTE,
519 		    "!cpu_event: NULL parameters in register_callback.");
520 		return (EINVAL);
521 	} else if (prio < CPU_IDLE_CB_PRIO_LOW_BASE ||
522 	    prio >= CPU_IDLE_CB_PRIO_RESV_BASE) {
523 		cmn_err(CE_NOTE,
524 		    "!cpu_event: priority 0x%x out of range.", prio);
525 		return (EINVAL);
526 	} else if (cbp->version != CPU_IDLE_CALLBACK_VERS) {
527 		cmn_err(CE_NOTE,
528 		    "!cpu_event: callback version %d is not supported.",
529 		    cbp->version);
530 		return (EINVAL);
531 	}
532 
533 	mutex_enter(&cpu_idle_cb_lock);
534 	/* Check whether callback with priority exists if not dynamic. */
535 	if (prio != CPU_IDLE_CB_PRIO_DYNAMIC) {
536 		for (cip = cpu_idle_cb_busy; cip != NULL;
537 		    cip = cip->next) {
538 			if (cip->priority == prio) {
539 				mutex_exit(&cpu_idle_cb_lock);
540 				cmn_err(CE_NOTE, "!cpu_event: callback with "
541 				    "priority 0x%x already exists.", prio);
542 				return (EEXIST);
543 			}
544 		}
545 	}
546 
547 	cip = kmem_zalloc(sizeof (*cip), KM_SLEEP);
548 	cip->callback = cbp;
549 	cip->argument = arg;
550 	cip->priority = prio;
551 	cip->next = cpu_idle_cb_busy;
552 	cpu_idle_cb_busy = cip;
553 	cpu_idle_insert_callback(cip);
554 	mutex_exit(&cpu_idle_cb_lock);
555 
556 	*hdlp = (cpu_idle_callback_handle_t)cip;
557 
558 	return (0);
559 }
560 
561 int
562 cpu_idle_unregister_callback(cpu_idle_callback_handle_t hdl)
563 {
564 	int rc = ENODEV;
565 	cpu_idle_cb_state_t *sp;
566 	cpu_idle_cb_impl_t *ip, **ipp;
567 
568 	ASSERT(!CPU_ON_INTR(CPU));
569 	ASSERT(CPU->cpu_seqid < max_ncpus);
570 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
571 	if (sp->v.index != 0) {
572 		cmn_err(CE_NOTE,
573 		    "!cpu_event: unregister_callback called from callback.");
574 		return (EBUSY);
575 	} else if (hdl == NULL) {
576 		cmn_err(CE_NOTE,
577 		    "!cpu_event: hdl is NULL in unregister_callback.");
578 		return (EINVAL);
579 	}
580 
581 	ip = (cpu_idle_cb_impl_t *)hdl;
582 	mutex_enter(&cpu_idle_cb_lock);
583 	for (ipp = &cpu_idle_cb_busy; *ipp != NULL; ipp = &(*ipp)->next) {
584 		if (*ipp == ip) {
585 			*ipp = ip->next;
586 			cpu_idle_remove_callback(ip);
587 			rc = 0;
588 			break;
589 		}
590 	}
591 	mutex_exit(&cpu_idle_cb_lock);
592 
593 	if (rc == 0) {
594 		kmem_free(ip, sizeof (*ip));
595 	} else {
596 		cmn_err(CE_NOTE,
597 		    "!cpu_event: callback handle %p not found.", (void *)hdl);
598 	}
599 
600 	return (rc);
601 }
602 
603 static int
604 cpu_idle_enter_state(cpu_idle_cb_state_t *sp, intptr_t state)
605 {
606 	sp->v.idle_state->cipv_intptr = state;
607 	sp->v.enter_ts->cipv_hrtime = gethrtime_unscaled();
608 	sp->v.last_busy->cipv_hrtime = sp->v.enter_ts->cipv_hrtime -
609 	    sp->v.exit_ts->cipv_hrtime;
610 	sp->v.total_busy->cipv_hrtime += sp->v.last_busy->cipv_hrtime;
611 	if (sp->v.ready == B_FALSE) {
612 		sp->v.ready = B_TRUE;
613 		return (0);
614 	}
615 
616 	return (1);
617 }
618 
619 static void
620 cpu_idle_exit_state(cpu_idle_cb_state_t *sp)
621 {
622 	sp->v.idle_state->cipv_intptr = CPU_IDLE_STATE_NORMAL;
623 	sp->v.exit_ts->cipv_hrtime = gethrtime_unscaled();
624 	sp->v.last_idle->cipv_hrtime = sp->v.exit_ts->cipv_hrtime -
625 	    sp->v.enter_ts->cipv_hrtime;
626 	sp->v.total_idle->cipv_hrtime += sp->v.last_idle->cipv_hrtime;
627 }
628 
629 /*ARGSUSED*/
630 int
631 cpu_idle_enter(int state, int flag,
632     cpu_idle_check_wakeup_t check_func, void *check_arg)
633 {
634 	int i;
635 	cpu_idle_cb_item_t *cip;
636 	cpu_idle_cb_state_t *sp;
637 	cpu_idle_callback_context_t ctx;
638 #if defined(__x86)
639 	ulong_t iflags;
640 #endif
641 
642 	ctx = CPU_IDLE_GET_CTX(CPU);
643 	ASSERT(CPU->cpu_seqid < max_ncpus);
644 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
645 	ASSERT(sp->v.index == 0);
646 	if (sp->v.enabled == B_FALSE) {
647 #if defined(__x86)
648 		/* Intercept CPU at a safe point before powering off it. */
649 		if (CPU_IN_SET(cpu_idle_intercept_set, CPU->cpu_id)) {
650 			iflags = intr_clear();
651 			CPUSET_ATOMIC_DEL(cpu_idle_intercept_set, CPU->cpu_id);
652 			/*CONSTCOND*/
653 			while (1) {
654 				SMT_PAUSE();
655 			}
656 		}
657 #endif
658 
659 		return (0);
660 	}
661 
662 	/*
663 	 * On x86, cpu_idle_enter can be called from idle thread with either
664 	 * interrupts enabled or disabled, so we need to make sure interrupts
665 	 * are disabled here.
666 	 * On SPARC, cpu_idle_enter will be called from idle thread with
667 	 * interrupt disabled, so no special handling necessary.
668 	 */
669 #if defined(__x86)
670 	iflags = intr_clear();
671 #endif
672 
673 	/* Skip calling callback if state is not ready for current CPU. */
674 	if (cpu_idle_enter_state(sp, state) == 0) {
675 #if defined(__x86)
676 		intr_restore(iflags);
677 #endif
678 		return (0);
679 	}
680 
681 	for (i = 0, cip = cpu_idle_cb_array; i < cpu_idle_cb_curr; i++, cip++) {
682 		/*
683 		 * Increase index so corresponding idle_exit callback
684 		 * will be invoked should interrupt happen during
685 		 * idle_enter callback.
686 		 */
687 		sp->v.index++;
688 
689 		/* Call idle_enter callback function if it's not NULL. */
690 		if (cip->enter != NULL) {
691 			cip->enter(cip->arg, ctx, check_func, check_arg);
692 
693 			/*
694 			 * cpu_idle_enter runs with interrupts
695 			 * disabled, so the idle_enter callbacks will
696 			 * also be called with interrupts disabled.
697 			 * It is permissible for the callbacks to
698 			 * enable the interrupts, if they can also
699 			 * handle the condition if the interrupt
700 			 * occurs.
701 			 *
702 			 * However, if an interrupt occurs and we
703 			 * return here without dealing with it, we
704 			 * return to the cpu_idle_enter() caller
705 			 * with an EBUSY, and the caller will not
706 			 * enter the idle state.
707 			 *
708 			 * We detect the interrupt, by checking the
709 			 * index value of the state pointer.  If it
710 			 * is not the index we incremented above,
711 			 * then it was cleared while processing
712 			 * the interrupt.
713 			 *
714 			 * Also note, that at this point of the code
715 			 * the normal index value will be one greater
716 			 * than the variable 'i' in the loop, as it
717 			 * hasn't yet been incremented.
718 			 */
719 			if (sp->v.index != i + 1) {
720 #if defined(__x86)
721 				intr_restore(iflags);
722 #endif
723 				return (EBUSY);
724 			}
725 		}
726 	}
727 #if defined(__x86)
728 	intr_restore(iflags);
729 #endif
730 
731 	return (0);
732 }
733 
734 void
735 cpu_idle_exit(int flag)
736 {
737 	int i;
738 	cpu_idle_cb_item_t *cip;
739 	cpu_idle_cb_state_t *sp;
740 	cpu_idle_callback_context_t ctx;
741 #if defined(__x86)
742 	ulong_t iflags;
743 #endif
744 
745 	ASSERT(CPU->cpu_seqid < max_ncpus);
746 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
747 
748 #if defined(__sparc)
749 	/*
750 	 * On SPARC, cpu_idle_exit will only be called from idle thread
751 	 * with interrupt disabled.
752 	 */
753 
754 	if (sp->v.index != 0) {
755 		ctx = CPU_IDLE_GET_CTX(CPU);
756 		cpu_idle_exit_state(sp);
757 		for (i = sp->v.index - 1; i >= 0; i--) {
758 			cip = &cpu_idle_cb_array[i];
759 			if (cip->exit != NULL) {
760 				cip->exit(cip->arg, ctx, flag);
761 			}
762 		}
763 		sp->v.index = 0;
764 	}
765 #elif defined(__x86)
766 	/*
767 	 * On x86, cpu_idle_exit will be called from idle thread or interrupt
768 	 * handler. When called from interrupt handler, interrupts will be
769 	 * disabled. When called from idle thread, interrupts may be disabled
770 	 * or enabled.
771 	 */
772 
773 	/* Called from interrupt, interrupts are already disabled. */
774 	if (flag & CPU_IDLE_CB_FLAG_INTR) {
775 		/*
776 		 * return if cpu_idle_exit already called or
777 		 * there is no registered callback.
778 		 */
779 		if (sp->v.index == 0) {
780 			return;
781 		}
782 		ctx = CPU_IDLE_GET_CTX(CPU);
783 		cpu_idle_exit_state(sp);
784 		for (i = sp->v.index - 1; i >= 0; i--) {
785 			cip = &cpu_idle_cb_array[i];
786 			if (cip->exit != NULL) {
787 				cip->exit(cip->arg, ctx, flag);
788 			}
789 		}
790 		sp->v.index = 0;
791 
792 	/* Called from idle thread, need to disable interrupt. */
793 	} else {
794 		iflags = intr_clear();
795 		if (sp->v.index != 0) {
796 			ctx = CPU_IDLE_GET_CTX(CPU);
797 			cpu_idle_exit_state(sp);
798 			for (i = sp->v.index - 1; i >= 0; i--) {
799 				cip = &cpu_idle_cb_array[i];
800 				if (cip->exit != NULL) {
801 					cip->exit(cip->arg, ctx, flag);
802 				}
803 			}
804 			sp->v.index = 0;
805 		}
806 		intr_restore(iflags);
807 	}
808 #endif
809 }
810 
/*
 * Return the callback context handle of the CPU this code is running on.
 * The handle encodes the CPU's sequential id, see CPU_IDLE_GET_CTX().
 */
cpu_idle_callback_context_t
cpu_idle_get_context(void)
{
	return (CPU_IDLE_GET_CTX(CPU));
}
816 
817 /*
818  * Allocate property structure in group of CPU_IDLE_VALUE_GROUP_SIZE to improve
819  * cache efficiency. To simplify implementation, allocated memory for property
820  * structure won't be freed.
821  */
822 static void
823 cpu_idle_prop_allocate_impl(void)
824 {
825 	int i;
826 	size_t sz;
827 	intptr_t buf;
828 	cpu_idle_prop_impl_t *prop;
829 	cpu_idle_prop_value_t *valp;
830 
831 	ASSERT(!CPU_ON_INTR(CPU));
832 	prop = kmem_zalloc(sizeof (*prop) * CPU_IDLE_VALUE_GROUP_SIZE,
833 	    KM_SLEEP);
834 	sz = sizeof (*valp) * CPU_IDLE_VALUE_GROUP_SIZE * max_ncpus;
835 	sz += CPU_CACHE_COHERENCE_SIZE;
836 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
837 	valp = (cpu_idle_prop_value_t *)P2ROUNDUP(buf,
838 	    CPU_CACHE_COHERENCE_SIZE);
839 
840 	for (i = 0; i < CPU_IDLE_VALUE_GROUP_SIZE; i++, prop++, valp++) {
841 		prop->value = valp;
842 		prop->next = cpu_idle_prop_free;
843 		cpu_idle_prop_free = prop;
844 	}
845 }
846 
847 int
848 cpu_idle_prop_create_property(const char *name, cpu_idle_prop_type_t type,
849     cpu_idle_prop_update_t update, void *arg, cpu_idle_prop_handle_t *hdlp)
850 {
851 	int rc = EEXIST;
852 	cpu_idle_prop_impl_t *prop;
853 
854 	ASSERT(!CPU_ON_INTR(CPU));
855 	if (name == NULL || hdlp == NULL) {
856 		cmn_err(CE_WARN,
857 		    "!cpu_event: NULL parameters in create_property.");
858 		return (EINVAL);
859 	}
860 
861 	mutex_enter(&cpu_idle_prop_lock);
862 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
863 		if (strcmp(prop->name, name) == 0) {
864 			cmn_err(CE_NOTE,
865 			    "!cpu_event: property %s already exists.", name);
866 			break;
867 		}
868 	}
869 	if (prop == NULL) {
870 		if (cpu_idle_prop_free == NULL) {
871 			cpu_idle_prop_allocate_impl();
872 		}
873 		ASSERT(cpu_idle_prop_free != NULL);
874 		prop = cpu_idle_prop_free;
875 		cpu_idle_prop_free = prop->next;
876 		prop->next = cpu_idle_prop_busy;
877 		cpu_idle_prop_busy = prop;
878 
879 		ASSERT(prop->value != NULL);
880 		prop->name = strdup(name);
881 		prop->type = type;
882 		prop->update = update;
883 		prop->private = arg;
884 		prop->refcnt = 1;
885 		*hdlp = prop;
886 		rc = 0;
887 	}
888 	mutex_exit(&cpu_idle_prop_lock);
889 
890 	return (rc);
891 }
892 
893 int
894 cpu_idle_prop_destroy_property(cpu_idle_prop_handle_t hdl)
895 {
896 	int rc = ENODEV;
897 	cpu_idle_prop_impl_t *prop, **propp;
898 	cpu_idle_prop_value_t *valp;
899 
900 	ASSERT(!CPU_ON_INTR(CPU));
901 	if (hdl == NULL) {
902 		cmn_err(CE_WARN,
903 		    "!cpu_event: hdl is NULL in destroy_property.");
904 		return (EINVAL);
905 	}
906 
907 	prop = (cpu_idle_prop_impl_t *)hdl;
908 	mutex_enter(&cpu_idle_prop_lock);
909 	for (propp = &cpu_idle_prop_busy; *propp != NULL;
910 	    propp = &(*propp)->next) {
911 		if (*propp == prop) {
912 			ASSERT(prop->refcnt > 0);
913 			if (atomic_cas_32(&prop->refcnt, 1, 0) == 1) {
914 				*propp = prop->next;
915 				strfree(prop->name);
916 				valp = prop->value;
917 				bzero(prop, sizeof (*prop));
918 				prop->value = valp;
919 				prop->next = cpu_idle_prop_free;
920 				cpu_idle_prop_free = prop;
921 				rc = 0;
922 			} else {
923 				rc = EBUSY;
924 			}
925 			break;
926 		}
927 	}
928 	mutex_exit(&cpu_idle_prop_lock);
929 
930 	return (rc);
931 }
932 
933 int
934 cpu_idle_prop_create_handle(const char *name, cpu_idle_prop_handle_t *hdlp)
935 {
936 	int rc = ENODEV;
937 	cpu_idle_prop_impl_t *prop;
938 
939 	ASSERT(!CPU_ON_INTR(CPU));
940 	if (name == NULL || hdlp == NULL) {
941 		cmn_err(CE_WARN,
942 		    "!cpu_event: NULL parameters in create_handle.");
943 		return (EINVAL);
944 	}
945 
946 	mutex_enter(&cpu_idle_prop_lock);
947 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
948 		if (strcmp(prop->name, name) == 0) {
949 			/* Hold one refcount on object. */
950 			ASSERT(prop->refcnt > 0);
951 			atomic_inc_32(&prop->refcnt);
952 			*hdlp = (cpu_idle_prop_handle_t)prop;
953 			rc = 0;
954 			break;
955 		}
956 	}
957 	mutex_exit(&cpu_idle_prop_lock);
958 
959 	return (rc);
960 }
961 
962 int
963 cpu_idle_prop_destroy_handle(cpu_idle_prop_handle_t hdl)
964 {
965 	int rc = ENODEV;
966 	cpu_idle_prop_impl_t *prop;
967 
968 	ASSERT(!CPU_ON_INTR(CPU));
969 	if (hdl == NULL) {
970 		cmn_err(CE_WARN,
971 		    "!cpu_event: hdl is NULL in destroy_handle.");
972 		return (EINVAL);
973 	}
974 
975 	mutex_enter(&cpu_idle_prop_lock);
976 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
977 		if (prop == hdl) {
978 			/* Release refcnt held in create_handle. */
979 			ASSERT(prop->refcnt > 1);
980 			atomic_dec_32(&prop->refcnt);
981 			rc = 0;
982 			break;
983 		}
984 	}
985 	mutex_exit(&cpu_idle_prop_lock);
986 
987 	return (rc);
988 }
989 
990 cpu_idle_prop_type_t
991 cpu_idle_prop_get_type(cpu_idle_prop_handle_t hdl)
992 {
993 	ASSERT(hdl != NULL);
994 	return (((cpu_idle_prop_impl_t *)hdl)->type);
995 }
996 
997 const char *
998 cpu_idle_prop_get_name(cpu_idle_prop_handle_t hdl)
999 {
1000 	ASSERT(hdl != NULL);
1001 	return (((cpu_idle_prop_impl_t *)hdl)->name);
1002 }
1003 
1004 int
1005 cpu_idle_prop_get_value(cpu_idle_prop_handle_t hdl,
1006     cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t *valp)
1007 {
1008 	int idx, rc = 0;
1009 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1010 
1011 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1012 	if (hdl == NULL || valp == NULL) {
1013 		cmn_err(CE_NOTE, "!cpu_event: NULL parameters in prop_get.");
1014 		return (EINVAL);
1015 	}
1016 	idx = CPU_IDLE_CTX2IDX(ctx);
1017 	if (prop->update != NULL) {
1018 		cpu_idle_cb_state_t *sp;
1019 
1020 		ASSERT(CPU->cpu_seqid < max_ncpus);
1021 		sp = &cpu_idle_cb_state[CPU->cpu_seqid];
1022 		/* CPU's idle enter timestamp as sequence number. */
1023 		rc = prop->update(prop->private,
1024 		    (uint64_t)sp->v.enter_ts->cipv_hrtime, &prop->value[idx]);
1025 	}
1026 	if (rc == 0) {
1027 		*valp = prop->value[idx];
1028 	}
1029 
1030 	return (rc);
1031 }
1032 
1033 uint32_t
1034 cpu_idle_prop_get_uint32(cpu_idle_prop_handle_t hdl,
1035     cpu_idle_callback_context_t ctx)
1036 {
1037 	int idx;
1038 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1039 
1040 	ASSERT(hdl != NULL);
1041 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1042 	idx = CPU_IDLE_CTX2IDX(ctx);
1043 	return (prop->value[idx].cipv_uint32);
1044 }
1045 
1046 uint64_t
1047 cpu_idle_prop_get_uint64(cpu_idle_prop_handle_t hdl,
1048     cpu_idle_callback_context_t ctx)
1049 {
1050 	int idx;
1051 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1052 
1053 	ASSERT(hdl != NULL);
1054 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1055 	idx = CPU_IDLE_CTX2IDX(ctx);
1056 	return (prop->value[idx].cipv_uint64);
1057 }
1058 
1059 intptr_t
1060 cpu_idle_prop_get_intptr(cpu_idle_prop_handle_t hdl,
1061     cpu_idle_callback_context_t ctx)
1062 {
1063 	int idx;
1064 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1065 
1066 	ASSERT(hdl != NULL);
1067 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1068 	idx = CPU_IDLE_CTX2IDX(ctx);
1069 	return (prop->value[idx].cipv_intptr);
1070 }
1071 
1072 hrtime_t
1073 cpu_idle_prop_get_hrtime(cpu_idle_prop_handle_t hdl,
1074     cpu_idle_callback_context_t ctx)
1075 {
1076 	int idx;
1077 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1078 
1079 	ASSERT(hdl != NULL);
1080 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1081 	idx = CPU_IDLE_CTX2IDX(ctx);
1082 	return (prop->value[idx].cipv_hrtime);
1083 }
1084 
1085 void
1086 cpu_idle_prop_set_value(cpu_idle_prop_handle_t hdl,
1087     cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t val)
1088 {
1089 	int idx;
1090 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1091 
1092 	ASSERT(hdl != NULL);
1093 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1094 	idx = CPU_IDLE_CTX2IDX(ctx);
1095 	prop->value[idx] = val;
1096 }
1097 
1098 void
1099 cpu_idle_prop_set_all(cpu_idle_prop_handle_t hdl, cpu_idle_prop_value_t val)
1100 {
1101 	int i, idx;
1102 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1103 
1104 	ASSERT(hdl != NULL);
1105 	for (i = 0; i < max_ncpus; i++) {
1106 		idx = CPU_IDLE_CTX2IDX(i);
1107 		prop->value[idx] = val;
1108 	}
1109 }
1110 
1111 /*ARGSUSED*/
1112 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
1113     cpu_idle_prop_value_t *valp)
1114 {
1115 	int i;
1116 	uint64_t val;
1117 
1118 	for (val = 0, i = 0; i < PIL_MAX; i++) {
1119 		val += CPU->cpu_stats.sys.intr[i];
1120 	}
1121 	valp->cipv_uint64 = val;
1122 
1123 	return (0);
1124 }
1125 
1126 uint_t
1127 cpu_idle_get_cpu_state(cpu_t *cp)
1128 {
1129 	ASSERT(cp != NULL && cp->cpu_seqid < max_ncpus);
1130 	return ((uint_t)cpu_idle_prop_get_uint32(
1131 	    cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle,
1132 	    CPU_IDLE_GET_CTX(cp)));
1133 }
1134 
#if defined(__x86)
/*
 * Catch CPU cp at a safe point in idle() before it is powered off.
 *
 * Callbacks must have been disabled for cp already.  The CPU is marked in
 * cpu_idle_intercept_set and poked; this function then busy waits until the
 * mark is gone again, which cpu_idle_enter() clears on the target CPU after
 * disabling interrupts and just before spinning forever.
 */
void
cpu_idle_intercept_cpu(cpu_t *cp)
{
	processorid_t target = cp->cpu_id;

	ASSERT(cp->cpu_seqid < max_ncpus);
	ASSERT(cpu_idle_cb_state[cp->cpu_seqid].v.enabled == B_FALSE);

	/* Ask for the CPU to be intercepted ... */
	CPUSET_ATOMIC_ADD(cpu_idle_intercept_set, target);
	/* ... get it out of whatever sleep state it may be in ... */
	poke_cpu(target);
	/* ... and wait for it to acknowledge. */
	while (CPU_IN_SET(cpu_idle_intercept_set, target)) {
		DELAY(1);
	}
	/*
	 * The target CPU is now spinning in a pause loop with interrupts
	 * disabled.
	 */
}
#endif
1156 #endif
1157