1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2009-2010, Intel Corporation.
23 * All rights reserved.
24 */
25
26 /*
27 * Introduction
28 * This file implements a CPU event notification mechanism to signal clients
29 * which are interested in CPU related events.
30 * Currently it only supports CPU idle state change events, which are
31 * triggered just before a CPU enters a hardware idle state and just after
32 * it wakes up from a hardware idle state.
33 * Please refer to PSARC/2009/115 for detailed information.
34 *
35 * Lock Strategy
36 * 1) cpu_idle_prop_busy/free are protected by cpu_idle_prop_lock.
37 * 2) No protection for cpu_idle_cb_state because it's per-CPU data.
38 * 3) cpu_idle_cb_busy is protected by cpu_idle_cb_lock.
39 * 4) cpu_idle_cb_array is protected by pause_cpus/start_cpus logic.
40 * 5) cpu_idle_cb_max/curr are protected by both cpu_idle_cb_lock and
41 * pause_cpus/start_cpus logic.
42 * We have optimized the algorithm for hot path on read side access.
43 * In the current algorithm, it's lock free on read side access.
44 * On write side, we use pause_cpus() to keep other CPUs in the pause thread,
45 * which will guarantee that no other threads will access
46 * cpu_idle_cb_max/curr/array data structure.
47 */
48
49 #include <sys/types.h>
50 #include <sys/cmn_err.h>
51 #include <sys/cpuvar.h>
52 #include <sys/cpu.h>
53 #include <sys/kmem.h>
54 #include <sys/machcpuvar.h>
55 #include <sys/sdt.h>
56 #include <sys/sysmacros.h>
57 #include <sys/synch.h>
58 #include <sys/systm.h>
59 #include <sys/sunddi.h>
60 #if defined(__sparc)
61 #include <sys/machsystm.h>
62 #elif defined(__x86)
63 #include <sys/archsystm.h>
64 #endif
65 #include <sys/cpu_event.h>
66
/*
 * CPU_IDLE_STATE_NORMAL is the idle-state value reported for a CPU that is
 * not idle.
 *
 * x86 uses IDLE_STATE_C0.  SPARC uses IDLE_STATE_NORMAL; at the time this
 * was written that symbol lived in mach_startup.c rather than in a header,
 * so when it is not visible here we define it to 0, the value used in
 * mach_startup.c.  If a header starts providing it, that definition wins.
 */
#if defined(__x86)
#define	CPU_IDLE_STATE_NORMAL	IDLE_STATE_C0
#elif defined(__sparc)
#if !defined(IDLE_STATE_NORMAL)
#define	IDLE_STATE_NORMAL	0
#endif
#define	CPU_IDLE_STATE_NORMAL	IDLE_STATE_NORMAL
#endif
82
/*
 * Property values are laid out so that all values belonging to one CPU share
 * a cache line, which keeps reads local and avoids false sharing:
 *
 *	|     CPU0      |     CPU1      |.........|     CPUn      |
 *	| cache line 0  | cache line 1  |.........| cache line n  |
 *	| v0 | ... | vm | v0 | ... | vm |.........| v0 | ... | vm |
 *
 * The value of property m for CPU n is therefore found at
 *	index = seq_id_of_CPUn * CPU_IDLE_VALUE_GROUP_SIZE + m
 * where CPU_IDLE_VALUE_GROUP_SIZE is the number of values per cache line.
 */
#define	CPU_IDLE_VALUE_GROUP_SIZE	\
	(CPU_CACHE_COHERENCE_SIZE / sizeof (cpu_idle_prop_value_t))
94
/*
 * A callback context handle is simply the CPU's sequential id (cpu_seqid)
 * carried in a cpu_idle_callback_context_t.
 */

/* Build the context handle for the CPU described by cp. */
#define	CPU_IDLE_GET_CTX(cp)	\
	((cpu_idle_callback_context_t)(intptr_t)((cp)->cpu_seqid))

/* Recover the CPU sequential id stored in a context handle. */
#define	CPU_IDLE_CTX2CPUID(ctx)	((processorid_t)(intptr_t)(ctx))

/* Offset of the first value of the CPU's value group, given its context. */
#define	CPU_IDLE_CTX2IDX(ctx)	\
	(((int)(intptr_t)(ctx)) * CPU_IDLE_VALUE_GROUP_SIZE)

/* Address of element idx in the value array of property handle hdl. */
#define	CPU_IDLE_HDL2VALP(hdl, idx)	\
	(((cpu_idle_prop_impl_t *)(hdl))->value + (idx))
108
/*
 * Number of entries added to cpu_idle_cb_array each time it is found to be
 * NULL or full.  Growth is deliberately linear rather than exponential.
 */
#define	CPU_IDLE_ARRAY_CAPACITY_INC	16
114
115 typedef struct cpu_idle_prop_impl {
116 cpu_idle_prop_value_t *value;
117 struct cpu_idle_prop_impl *next;
118 char *name;
119 cpu_idle_prop_update_t update;
120 void *private;
121 cpu_idle_prop_type_t type;
122 uint32_t refcnt;
123 } cpu_idle_prop_impl_t;
124
125 typedef struct cpu_idle_prop_item {
126 cpu_idle_prop_type_t type;
127 char *name;
128 cpu_idle_prop_update_t update;
129 void *arg;
130 cpu_idle_prop_handle_t handle;
131 } cpu_idle_prop_item_t;
132
133 /* Structure to maintain registered callbacks in list. */
134 typedef struct cpu_idle_cb_impl {
135 struct cpu_idle_cb_impl *next;
136 cpu_idle_callback_t *callback;
137 void *argument;
138 int priority;
139 } cpu_idle_cb_impl_t;
140
141 /*
142 * Structure to maintain registered callbacks in priority order and also
143 * optimized for cache efficiency for reading access.
144 */
145 typedef struct cpu_idle_cb_item {
146 cpu_idle_enter_cbfn_t enter;
147 cpu_idle_exit_cbfn_t exit;
148 void *arg;
149 cpu_idle_cb_impl_t *impl;
150 } cpu_idle_cb_item_t;
151
152 /* Per-CPU state aligned to CPU_CACHE_COHERENCE_SIZE to avoid false sharing. */
153 typedef union cpu_idle_cb_state {
154 struct {
155 /* Index of already invoked callbacks. */
156 int index;
157 /* Invoke registered callbacks if true. */
158 boolean_t enabled;
159 /* Property values are valid if true. */
160 boolean_t ready;
161 /* Pointers to per-CPU properties. */
162 cpu_idle_prop_value_t *idle_state;
163 cpu_idle_prop_value_t *enter_ts;
164 cpu_idle_prop_value_t *exit_ts;
165 cpu_idle_prop_value_t *last_idle;
166 cpu_idle_prop_value_t *last_busy;
167 cpu_idle_prop_value_t *total_idle;
168 cpu_idle_prop_value_t *total_busy;
169 cpu_idle_prop_value_t *intr_cnt;
170 } v;
171 #ifdef _LP64
172 char align[2 * CPU_CACHE_COHERENCE_SIZE];
173 #else
174 char align[CPU_CACHE_COHERENCE_SIZE];
175 #endif
176 } cpu_idle_cb_state_t;
177
178 static kmutex_t cpu_idle_prop_lock;
179 static cpu_idle_prop_impl_t *cpu_idle_prop_busy = NULL;
180 static cpu_idle_prop_impl_t *cpu_idle_prop_free = NULL;
181
182 static kmutex_t cpu_idle_cb_lock;
183 static cpu_idle_cb_impl_t *cpu_idle_cb_busy = NULL;
184 static cpu_idle_cb_item_t *cpu_idle_cb_array = NULL;
185 static int cpu_idle_cb_curr = 0;
186 static int cpu_idle_cb_max = 0;
187
188 static cpu_idle_cb_state_t *cpu_idle_cb_state;
189
#if defined(__x86)
/*
 * Set of CPUs to be intercepted before they are powered off.
 * The controlling CPU sets the target CPU's bit and waits for it to clear.
 * The target CPU disables interrupts, clears its own bit and then spins
 * forever.
 */
static cpuset_t cpu_idle_intercept_set;
#endif
200
201 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
202 cpu_idle_prop_value_t *valp);
203
204 static cpu_idle_prop_item_t cpu_idle_prop_array[] = {
205 {
206 CPU_IDLE_PROP_TYPE_INTPTR, CPU_IDLE_PROP_IDLE_STATE,
207 NULL, NULL, NULL
208 },
209 {
210 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_ENTER_TIMESTAMP,
211 NULL, NULL, NULL
212 },
213 {
214 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_EXIT_TIMESTAMP,
215 NULL, NULL, NULL
216 },
217 {
218 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_IDLE_TIME,
219 NULL, NULL, NULL
220 },
221 {
222 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_BUSY_TIME,
223 NULL, NULL, NULL
224 },
225 {
226 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_IDLE_TIME,
227 NULL, NULL, NULL
228 },
229 {
230 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_BUSY_TIME,
231 NULL, NULL, NULL
232 },
233 {
234 CPU_IDLE_PROP_TYPE_UINT64, CPU_IDLE_PROP_INTERRUPT_COUNT,
235 cpu_idle_prop_update_intr_cnt, NULL, NULL
236 },
237 };
238
239 #define CPU_IDLE_PROP_IDX_IDLE_STATE 0
240 #define CPU_IDLE_PROP_IDX_ENTER_TS 1
241 #define CPU_IDLE_PROP_IDX_EXIT_TS 2
242 #define CPU_IDLE_PROP_IDX_LAST_IDLE 3
243 #define CPU_IDLE_PROP_IDX_LAST_BUSY 4
244 #define CPU_IDLE_PROP_IDX_TOTAL_IDLE 5
245 #define CPU_IDLE_PROP_IDX_TOTAL_BUSY 6
246 #define CPU_IDLE_PROP_IDX_INTR_CNT 7
247
248 /*ARGSUSED*/
249 static void
cpu_idle_dtrace_enter(void * arg,cpu_idle_callback_context_t ctx,cpu_idle_check_wakeup_t check_func,void * check_arg)250 cpu_idle_dtrace_enter(void *arg, cpu_idle_callback_context_t ctx,
251 cpu_idle_check_wakeup_t check_func, void *check_arg)
252 {
253 int state;
254
255 state = cpu_idle_prop_get_intptr(
256 cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle, ctx);
257 DTRACE_PROBE1(idle__state__transition, uint_t, state);
258 }
259
260 /*ARGSUSED*/
261 static void
cpu_idle_dtrace_exit(void * arg,cpu_idle_callback_context_t ctx,int flag)262 cpu_idle_dtrace_exit(void *arg, cpu_idle_callback_context_t ctx, int flag)
263 {
264 DTRACE_PROBE1(idle__state__transition, uint_t, CPU_IDLE_STATE_NORMAL);
265 }
266
267 static cpu_idle_callback_handle_t cpu_idle_cb_handle_dtrace;
268 static cpu_idle_callback_t cpu_idle_callback_dtrace = {
269 CPU_IDLE_CALLBACK_VERS,
270 cpu_idle_dtrace_enter,
271 cpu_idle_dtrace_exit,
272 };
273
#if defined(__x86) && !defined(__xpv)
/*
 * Built-in TLB callback table (x86 only, not under xpv), and the handle
 * obtained when cpu_event_init() registers it.  tlb_going_idle() and
 * tlb_service() are declared without parameters, hence the casts to the
 * callback function types.
 */
extern void tlb_going_idle(void);
extern void tlb_service(void);

static cpu_idle_callback_handle_t cpu_idle_cb_handle_tlb;
static cpu_idle_callback_t cpu_idle_callback_tlb = {
	CPU_IDLE_CALLBACK_VERS,				/* version */
	(cpu_idle_enter_cbfn_t)tlb_going_idle,		/* idle_enter */
	(cpu_idle_exit_cbfn_t)tlb_service,		/* idle_exit */
};
#endif
285
286 void
cpu_event_init(void)287 cpu_event_init(void)
288 {
289 int i, idx;
290 size_t sz;
291 intptr_t buf;
292 cpu_idle_cb_state_t *sp;
293 cpu_idle_prop_item_t *ip;
294
295 mutex_init(&cpu_idle_cb_lock, NULL, MUTEX_DRIVER, NULL);
296 mutex_init(&cpu_idle_prop_lock, NULL, MUTEX_DRIVER, NULL);
297
298 /* Create internal properties. */
299 for (i = 0, ip = cpu_idle_prop_array;
300 i < sizeof (cpu_idle_prop_array) / sizeof (cpu_idle_prop_array[0]);
301 i++, ip++) {
302 (void) cpu_idle_prop_create_property(ip->name, ip->type,
303 ip->update, ip->arg, &ip->handle);
304 ASSERT(ip->handle != NULL);
305 }
306
307 /* Allocate buffer and align to CPU_CACHE_COHERENCE_SIZE. */
308 sz = sizeof (cpu_idle_cb_state_t) * max_ncpus;
309 sz += CPU_CACHE_COHERENCE_SIZE;
310 buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
311 cpu_idle_cb_state = (cpu_idle_cb_state_t *)P2ROUNDUP(buf,
312 CPU_CACHE_COHERENCE_SIZE);
313
314 /* Cache frequently used property value pointers. */
315 for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
316 idx = CPU_IDLE_CTX2IDX(i);
317 #define ___INIT_P(f, i) \
318 sp->v.f = CPU_IDLE_HDL2VALP(cpu_idle_prop_array[(i)].handle, idx)
319 ___INIT_P(idle_state, CPU_IDLE_PROP_IDX_IDLE_STATE);
320 ___INIT_P(enter_ts, CPU_IDLE_PROP_IDX_ENTER_TS);
321 ___INIT_P(exit_ts, CPU_IDLE_PROP_IDX_EXIT_TS);
322 ___INIT_P(last_idle, CPU_IDLE_PROP_IDX_LAST_IDLE);
323 ___INIT_P(last_busy, CPU_IDLE_PROP_IDX_LAST_BUSY);
324 ___INIT_P(total_idle, CPU_IDLE_PROP_IDX_TOTAL_IDLE);
325 ___INIT_P(total_busy, CPU_IDLE_PROP_IDX_TOTAL_BUSY);
326 ___INIT_P(last_idle, CPU_IDLE_PROP_IDX_INTR_CNT);
327 #undef ___INIT_P
328 }
329
330 /* Register built-in callbacks. */
331 if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_DTRACE,
332 &cpu_idle_callback_dtrace, NULL, &cpu_idle_cb_handle_dtrace) != 0) {
333 cmn_err(CE_PANIC,
334 "cpu_idle: failed to register callback for dtrace.");
335 }
336 #if defined(__x86) && !defined(__xpv)
337 if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_TLB,
338 &cpu_idle_callback_tlb, NULL, &cpu_idle_cb_handle_tlb) != 0) {
339 cmn_err(CE_PANIC,
340 "cpu_idle: failed to register callback for tlb_flush.");
341 }
342 #endif
343 }
344
345 /*
346 * This function is called to initialize per CPU state when starting CPUs.
347 */
348 void
cpu_event_init_cpu(cpu_t * cp)349 cpu_event_init_cpu(cpu_t *cp)
350 {
351 ASSERT(cp->cpu_seqid < max_ncpus);
352 cpu_idle_cb_state[cp->cpu_seqid].v.index = 0;
353 cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
354 cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_TRUE;
355 }
356
357 /*
358 * This function is called to clean up per CPU state when stopping CPUs.
359 */
360 void
cpu_event_fini_cpu(cpu_t * cp)361 cpu_event_fini_cpu(cpu_t *cp)
362 {
363 ASSERT(cp->cpu_seqid < max_ncpus);
364 cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_FALSE;
365 cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
366 }
367
368 static void
cpu_idle_insert_callback(cpu_idle_cb_impl_t * cip)369 cpu_idle_insert_callback(cpu_idle_cb_impl_t *cip)
370 {
371 int unlock = 0, unpause = 0;
372 int i, cnt_new = 0, cnt_old = 0;
373 char *buf_new = NULL, *buf_old = NULL;
374
375 ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
376
377 /*
378 * Expand array if it's full.
379 * Memory must be allocated out of pause/start_cpus() scope because
380 * kmem_zalloc() can't be called with KM_SLEEP flag within that scope.
381 */
382 if (cpu_idle_cb_curr == cpu_idle_cb_max) {
383 cnt_new = cpu_idle_cb_max + CPU_IDLE_ARRAY_CAPACITY_INC;
384 buf_new = (char *)kmem_zalloc(cnt_new *
385 sizeof (cpu_idle_cb_item_t), KM_SLEEP);
386 }
387
388 /* Try to acquire cpu_lock if not held yet. */
389 if (!MUTEX_HELD(&cpu_lock)) {
390 mutex_enter(&cpu_lock);
391 unlock = 1;
392 }
393 /*
394 * Pause all other CPUs (and let them run pause thread).
395 * It's guaranteed that no other threads will access cpu_idle_cb_array
396 * after pause_cpus().
397 */
398 if (!cpus_paused()) {
399 pause_cpus(NULL, NULL);
400 unpause = 1;
401 }
402
403 /* Copy content to new buffer if needed. */
404 if (buf_new != NULL) {
405 buf_old = (char *)cpu_idle_cb_array;
406 cnt_old = cpu_idle_cb_max;
407 if (buf_old != NULL) {
408 ASSERT(cnt_old != 0);
409 bcopy(cpu_idle_cb_array, buf_new,
410 sizeof (cpu_idle_cb_item_t) * cnt_old);
411 }
412 cpu_idle_cb_array = (cpu_idle_cb_item_t *)buf_new;
413 cpu_idle_cb_max = cnt_new;
414 }
415
416 /* Insert into array according to priority. */
417 ASSERT(cpu_idle_cb_curr < cpu_idle_cb_max);
418 for (i = cpu_idle_cb_curr; i > 0; i--) {
419 if (cpu_idle_cb_array[i - 1].impl->priority >= cip->priority) {
420 break;
421 }
422 cpu_idle_cb_array[i] = cpu_idle_cb_array[i - 1];
423 }
424 cpu_idle_cb_array[i].arg = cip->argument;
425 cpu_idle_cb_array[i].enter = cip->callback->idle_enter;
426 cpu_idle_cb_array[i].exit = cip->callback->idle_exit;
427 cpu_idle_cb_array[i].impl = cip;
428 cpu_idle_cb_curr++;
429
430 /* Resume other CPUs from paused state if needed. */
431 if (unpause) {
432 start_cpus();
433 }
434 if (unlock) {
435 mutex_exit(&cpu_lock);
436 }
437
438 /* Free old resource if needed. */
439 if (buf_old != NULL) {
440 ASSERT(cnt_old != 0);
441 kmem_free(buf_old, cnt_old * sizeof (cpu_idle_cb_item_t));
442 }
443 }
444
445 static void
cpu_idle_remove_callback(cpu_idle_cb_impl_t * cip)446 cpu_idle_remove_callback(cpu_idle_cb_impl_t *cip)
447 {
448 int i, found = 0;
449 int unlock = 0, unpause = 0;
450 cpu_idle_cb_state_t *sp;
451
452 ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
453
454 /* Try to acquire cpu_lock if not held yet. */
455 if (!MUTEX_HELD(&cpu_lock)) {
456 mutex_enter(&cpu_lock);
457 unlock = 1;
458 }
459 /*
460 * Pause all other CPUs.
461 * It's guaranteed that no other threads will access cpu_idle_cb_array
462 * after pause_cpus().
463 */
464 if (!cpus_paused()) {
465 pause_cpus(NULL, NULL);
466 unpause = 1;
467 }
468
469 /* Remove cip from array. */
470 for (i = 0; i < cpu_idle_cb_curr; i++) {
471 if (found == 0) {
472 if (cpu_idle_cb_array[i].impl == cip) {
473 found = 1;
474 }
475 } else {
476 cpu_idle_cb_array[i - 1] = cpu_idle_cb_array[i];
477 }
478 }
479 ASSERT(found != 0);
480 cpu_idle_cb_curr--;
481
482 /*
483 * Reset property ready flag for all CPUs if no registered callback
484 * left because cpu_idle_enter/exit will stop updating property if
485 * there's no callback registered.
486 */
487 if (cpu_idle_cb_curr == 0) {
488 for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
489 sp->v.ready = B_FALSE;
490 }
491 }
492
493 /* Resume other CPUs from paused state if needed. */
494 if (unpause) {
495 start_cpus();
496 }
497 if (unlock) {
498 mutex_exit(&cpu_lock);
499 }
500 }
501
502 int
cpu_idle_register_callback(uint_t prio,cpu_idle_callback_t * cbp,void * arg,cpu_idle_callback_handle_t * hdlp)503 cpu_idle_register_callback(uint_t prio, cpu_idle_callback_t *cbp,
504 void *arg, cpu_idle_callback_handle_t *hdlp)
505 {
506 cpu_idle_cb_state_t *sp;
507 cpu_idle_cb_impl_t *cip = NULL;
508
509 /* First validate parameters. */
510 ASSERT(!CPU_ON_INTR(CPU));
511 ASSERT(CPU->cpu_seqid < max_ncpus);
512 sp = &cpu_idle_cb_state[CPU->cpu_seqid];
513 if (sp->v.index != 0) {
514 cmn_err(CE_NOTE,
515 "!cpu_event: register_callback called from callback.");
516 return (EBUSY);
517 } else if (cbp == NULL || hdlp == NULL) {
518 cmn_err(CE_NOTE,
519 "!cpu_event: NULL parameters in register_callback.");
520 return (EINVAL);
521 } else if (prio < CPU_IDLE_CB_PRIO_LOW_BASE ||
522 prio >= CPU_IDLE_CB_PRIO_RESV_BASE) {
523 cmn_err(CE_NOTE,
524 "!cpu_event: priority 0x%x out of range.", prio);
525 return (EINVAL);
526 } else if (cbp->version != CPU_IDLE_CALLBACK_VERS) {
527 cmn_err(CE_NOTE,
528 "!cpu_event: callback version %d is not supported.",
529 cbp->version);
530 return (EINVAL);
531 }
532
533 mutex_enter(&cpu_idle_cb_lock);
534 /* Check whether callback with priority exists if not dynamic. */
535 if (prio != CPU_IDLE_CB_PRIO_DYNAMIC) {
536 for (cip = cpu_idle_cb_busy; cip != NULL;
537 cip = cip->next) {
538 if (cip->priority == prio) {
539 mutex_exit(&cpu_idle_cb_lock);
540 cmn_err(CE_NOTE, "!cpu_event: callback with "
541 "priority 0x%x already exists.", prio);
542 return (EEXIST);
543 }
544 }
545 }
546
547 cip = kmem_zalloc(sizeof (*cip), KM_SLEEP);
548 cip->callback = cbp;
549 cip->argument = arg;
550 cip->priority = prio;
551 cip->next = cpu_idle_cb_busy;
552 cpu_idle_cb_busy = cip;
553 cpu_idle_insert_callback(cip);
554 mutex_exit(&cpu_idle_cb_lock);
555
556 *hdlp = (cpu_idle_callback_handle_t)cip;
557
558 return (0);
559 }
560
561 int
cpu_idle_unregister_callback(cpu_idle_callback_handle_t hdl)562 cpu_idle_unregister_callback(cpu_idle_callback_handle_t hdl)
563 {
564 int rc = ENODEV;
565 cpu_idle_cb_state_t *sp;
566 cpu_idle_cb_impl_t *ip, **ipp;
567
568 ASSERT(!CPU_ON_INTR(CPU));
569 ASSERT(CPU->cpu_seqid < max_ncpus);
570 sp = &cpu_idle_cb_state[CPU->cpu_seqid];
571 if (sp->v.index != 0) {
572 cmn_err(CE_NOTE,
573 "!cpu_event: unregister_callback called from callback.");
574 return (EBUSY);
575 } else if (hdl == NULL) {
576 cmn_err(CE_NOTE,
577 "!cpu_event: hdl is NULL in unregister_callback.");
578 return (EINVAL);
579 }
580
581 ip = (cpu_idle_cb_impl_t *)hdl;
582 mutex_enter(&cpu_idle_cb_lock);
583 for (ipp = &cpu_idle_cb_busy; *ipp != NULL; ipp = &(*ipp)->next) {
584 if (*ipp == ip) {
585 *ipp = ip->next;
586 cpu_idle_remove_callback(ip);
587 rc = 0;
588 break;
589 }
590 }
591 mutex_exit(&cpu_idle_cb_lock);
592
593 if (rc == 0) {
594 kmem_free(ip, sizeof (*ip));
595 } else {
596 cmn_err(CE_NOTE,
597 "!cpu_event: callback handle %p not found.", (void *)hdl);
598 }
599
600 return (rc);
601 }
602
603 static int
cpu_idle_enter_state(cpu_idle_cb_state_t * sp,intptr_t state)604 cpu_idle_enter_state(cpu_idle_cb_state_t *sp, intptr_t state)
605 {
606 sp->v.idle_state->cipv_intptr = state;
607 sp->v.enter_ts->cipv_hrtime = gethrtime_unscaled();
608 sp->v.last_busy->cipv_hrtime = sp->v.enter_ts->cipv_hrtime -
609 sp->v.exit_ts->cipv_hrtime;
610 sp->v.total_busy->cipv_hrtime += sp->v.last_busy->cipv_hrtime;
611 if (sp->v.ready == B_FALSE) {
612 sp->v.ready = B_TRUE;
613 return (0);
614 }
615
616 return (1);
617 }
618
619 static void
cpu_idle_exit_state(cpu_idle_cb_state_t * sp)620 cpu_idle_exit_state(cpu_idle_cb_state_t *sp)
621 {
622 sp->v.idle_state->cipv_intptr = CPU_IDLE_STATE_NORMAL;
623 sp->v.exit_ts->cipv_hrtime = gethrtime_unscaled();
624 sp->v.last_idle->cipv_hrtime = sp->v.exit_ts->cipv_hrtime -
625 sp->v.enter_ts->cipv_hrtime;
626 sp->v.total_idle->cipv_hrtime += sp->v.last_idle->cipv_hrtime;
627 }
628
629 /*ARGSUSED*/
630 int
cpu_idle_enter(int state,int flag,cpu_idle_check_wakeup_t check_func,void * check_arg)631 cpu_idle_enter(int state, int flag,
632 cpu_idle_check_wakeup_t check_func, void *check_arg)
633 {
634 int i;
635 cpu_idle_cb_item_t *cip;
636 cpu_idle_cb_state_t *sp;
637 cpu_idle_callback_context_t ctx;
638 #if defined(__x86)
639 ulong_t iflags;
640 #endif
641
642 ctx = CPU_IDLE_GET_CTX(CPU);
643 ASSERT(CPU->cpu_seqid < max_ncpus);
644 sp = &cpu_idle_cb_state[CPU->cpu_seqid];
645 ASSERT(sp->v.index == 0);
646 if (sp->v.enabled == B_FALSE) {
647 #if defined(__x86)
648 /* Intercept CPU at a safe point before powering off it. */
649 if (CPU_IN_SET(cpu_idle_intercept_set, CPU->cpu_id)) {
650 iflags = intr_clear();
651 CPUSET_ATOMIC_DEL(cpu_idle_intercept_set, CPU->cpu_id);
652 /*CONSTCOND*/
653 while (1) {
654 SMT_PAUSE();
655 }
656 }
657 #endif
658
659 return (0);
660 }
661
662 /*
663 * On x86, cpu_idle_enter can be called from idle thread with either
664 * interrupts enabled or disabled, so we need to make sure interrupts
665 * are disabled here.
666 * On SPARC, cpu_idle_enter will be called from idle thread with
667 * interrupt disabled, so no special handling necessary.
668 */
669 #if defined(__x86)
670 iflags = intr_clear();
671 #endif
672
673 /* Skip calling callback if state is not ready for current CPU. */
674 if (cpu_idle_enter_state(sp, state) == 0) {
675 #if defined(__x86)
676 intr_restore(iflags);
677 #endif
678 return (0);
679 }
680
681 for (i = 0, cip = cpu_idle_cb_array; i < cpu_idle_cb_curr; i++, cip++) {
682 /*
683 * Increase index so corresponding idle_exit callback
684 * will be invoked should interrupt happen during
685 * idle_enter callback.
686 */
687 sp->v.index++;
688
689 /* Call idle_enter callback function if it's not NULL. */
690 if (cip->enter != NULL) {
691 cip->enter(cip->arg, ctx, check_func, check_arg);
692
693 /*
694 * cpu_idle_enter runs with interrupts
695 * disabled, so the idle_enter callbacks will
696 * also be called with interrupts disabled.
697 * It is permissible for the callbacks to
698 * enable the interrupts, if they can also
699 * handle the condition if the interrupt
700 * occurs.
701 *
702 * However, if an interrupt occurs and we
703 * return here without dealing with it, we
704 * return to the cpu_idle_enter() caller
705 * with an EBUSY, and the caller will not
706 * enter the idle state.
707 *
708 * We detect the interrupt, by checking the
709 * index value of the state pointer. If it
710 * is not the index we incremented above,
711 * then it was cleared while processing
712 * the interrupt.
713 *
714 * Also note, that at this point of the code
715 * the normal index value will be one greater
716 * than the variable 'i' in the loop, as it
717 * hasn't yet been incremented.
718 */
719 if (sp->v.index != i + 1) {
720 #if defined(__x86)
721 intr_restore(iflags);
722 #endif
723 return (EBUSY);
724 }
725 }
726 }
727 #if defined(__x86)
728 intr_restore(iflags);
729 #endif
730
731 return (0);
732 }
733
734 void
cpu_idle_exit(int flag)735 cpu_idle_exit(int flag)
736 {
737 int i;
738 cpu_idle_cb_item_t *cip;
739 cpu_idle_cb_state_t *sp;
740 cpu_idle_callback_context_t ctx;
741 #if defined(__x86)
742 ulong_t iflags;
743 #endif
744
745 ASSERT(CPU->cpu_seqid < max_ncpus);
746 sp = &cpu_idle_cb_state[CPU->cpu_seqid];
747
748 #if defined(__sparc)
749 /*
750 * On SPARC, cpu_idle_exit will only be called from idle thread
751 * with interrupt disabled.
752 */
753
754 if (sp->v.index != 0) {
755 ctx = CPU_IDLE_GET_CTX(CPU);
756 cpu_idle_exit_state(sp);
757 for (i = sp->v.index - 1; i >= 0; i--) {
758 cip = &cpu_idle_cb_array[i];
759 if (cip->exit != NULL) {
760 cip->exit(cip->arg, ctx, flag);
761 }
762 }
763 sp->v.index = 0;
764 }
765 #elif defined(__x86)
766 /*
767 * On x86, cpu_idle_exit will be called from idle thread or interrupt
768 * handler. When called from interrupt handler, interrupts will be
769 * disabled. When called from idle thread, interrupts may be disabled
770 * or enabled.
771 */
772
773 /* Called from interrupt, interrupts are already disabled. */
774 if (flag & CPU_IDLE_CB_FLAG_INTR) {
775 /*
776 * return if cpu_idle_exit already called or
777 * there is no registered callback.
778 */
779 if (sp->v.index == 0) {
780 return;
781 }
782 ctx = CPU_IDLE_GET_CTX(CPU);
783 cpu_idle_exit_state(sp);
784 for (i = sp->v.index - 1; i >= 0; i--) {
785 cip = &cpu_idle_cb_array[i];
786 if (cip->exit != NULL) {
787 cip->exit(cip->arg, ctx, flag);
788 }
789 }
790 sp->v.index = 0;
791
792 /* Called from idle thread, need to disable interrupt. */
793 } else {
794 iflags = intr_clear();
795 if (sp->v.index != 0) {
796 ctx = CPU_IDLE_GET_CTX(CPU);
797 cpu_idle_exit_state(sp);
798 for (i = sp->v.index - 1; i >= 0; i--) {
799 cip = &cpu_idle_cb_array[i];
800 if (cip->exit != NULL) {
801 cip->exit(cip->arg, ctx, flag);
802 }
803 }
804 sp->v.index = 0;
805 }
806 intr_restore(iflags);
807 }
808 #endif
809 }
810
811 cpu_idle_callback_context_t
cpu_idle_get_context(void)812 cpu_idle_get_context(void)
813 {
814 return (CPU_IDLE_GET_CTX(CPU));
815 }
816
817 /*
818 * Allocate property structure in group of CPU_IDLE_VALUE_GROUP_SIZE to improve
819 * cache efficiency. To simplify implementation, allocated memory for property
820 * structure won't be freed.
821 */
822 static void
cpu_idle_prop_allocate_impl(void)823 cpu_idle_prop_allocate_impl(void)
824 {
825 int i;
826 size_t sz;
827 intptr_t buf;
828 cpu_idle_prop_impl_t *prop;
829 cpu_idle_prop_value_t *valp;
830
831 ASSERT(!CPU_ON_INTR(CPU));
832 prop = kmem_zalloc(sizeof (*prop) * CPU_IDLE_VALUE_GROUP_SIZE,
833 KM_SLEEP);
834 sz = sizeof (*valp) * CPU_IDLE_VALUE_GROUP_SIZE * max_ncpus;
835 sz += CPU_CACHE_COHERENCE_SIZE;
836 buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
837 valp = (cpu_idle_prop_value_t *)P2ROUNDUP(buf,
838 CPU_CACHE_COHERENCE_SIZE);
839
840 for (i = 0; i < CPU_IDLE_VALUE_GROUP_SIZE; i++, prop++, valp++) {
841 prop->value = valp;
842 prop->next = cpu_idle_prop_free;
843 cpu_idle_prop_free = prop;
844 }
845 }
846
847 int
cpu_idle_prop_create_property(const char * name,cpu_idle_prop_type_t type,cpu_idle_prop_update_t update,void * arg,cpu_idle_prop_handle_t * hdlp)848 cpu_idle_prop_create_property(const char *name, cpu_idle_prop_type_t type,
849 cpu_idle_prop_update_t update, void *arg, cpu_idle_prop_handle_t *hdlp)
850 {
851 int rc = EEXIST;
852 cpu_idle_prop_impl_t *prop;
853
854 ASSERT(!CPU_ON_INTR(CPU));
855 if (name == NULL || hdlp == NULL) {
856 cmn_err(CE_WARN,
857 "!cpu_event: NULL parameters in create_property.");
858 return (EINVAL);
859 }
860
861 mutex_enter(&cpu_idle_prop_lock);
862 for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
863 if (strcmp(prop->name, name) == 0) {
864 cmn_err(CE_NOTE,
865 "!cpu_event: property %s already exists.", name);
866 break;
867 }
868 }
869 if (prop == NULL) {
870 if (cpu_idle_prop_free == NULL) {
871 cpu_idle_prop_allocate_impl();
872 }
873 ASSERT(cpu_idle_prop_free != NULL);
874 prop = cpu_idle_prop_free;
875 cpu_idle_prop_free = prop->next;
876 prop->next = cpu_idle_prop_busy;
877 cpu_idle_prop_busy = prop;
878
879 ASSERT(prop->value != NULL);
880 prop->name = strdup(name);
881 prop->type = type;
882 prop->update = update;
883 prop->private = arg;
884 prop->refcnt = 1;
885 *hdlp = prop;
886 rc = 0;
887 }
888 mutex_exit(&cpu_idle_prop_lock);
889
890 return (rc);
891 }
892
893 int
cpu_idle_prop_destroy_property(cpu_idle_prop_handle_t hdl)894 cpu_idle_prop_destroy_property(cpu_idle_prop_handle_t hdl)
895 {
896 int rc = ENODEV;
897 cpu_idle_prop_impl_t *prop, **propp;
898 cpu_idle_prop_value_t *valp;
899
900 ASSERT(!CPU_ON_INTR(CPU));
901 if (hdl == NULL) {
902 cmn_err(CE_WARN,
903 "!cpu_event: hdl is NULL in destroy_property.");
904 return (EINVAL);
905 }
906
907 prop = (cpu_idle_prop_impl_t *)hdl;
908 mutex_enter(&cpu_idle_prop_lock);
909 for (propp = &cpu_idle_prop_busy; *propp != NULL;
910 propp = &(*propp)->next) {
911 if (*propp == prop) {
912 ASSERT(prop->refcnt > 0);
913 if (atomic_cas_32(&prop->refcnt, 1, 0) == 1) {
914 *propp = prop->next;
915 strfree(prop->name);
916 valp = prop->value;
917 bzero(prop, sizeof (*prop));
918 prop->value = valp;
919 prop->next = cpu_idle_prop_free;
920 cpu_idle_prop_free = prop;
921 rc = 0;
922 } else {
923 rc = EBUSY;
924 }
925 break;
926 }
927 }
928 mutex_exit(&cpu_idle_prop_lock);
929
930 return (rc);
931 }
932
933 int
cpu_idle_prop_create_handle(const char * name,cpu_idle_prop_handle_t * hdlp)934 cpu_idle_prop_create_handle(const char *name, cpu_idle_prop_handle_t *hdlp)
935 {
936 int rc = ENODEV;
937 cpu_idle_prop_impl_t *prop;
938
939 ASSERT(!CPU_ON_INTR(CPU));
940 if (name == NULL || hdlp == NULL) {
941 cmn_err(CE_WARN,
942 "!cpu_event: NULL parameters in create_handle.");
943 return (EINVAL);
944 }
945
946 mutex_enter(&cpu_idle_prop_lock);
947 for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
948 if (strcmp(prop->name, name) == 0) {
949 /* Hold one refcount on object. */
950 ASSERT(prop->refcnt > 0);
951 atomic_inc_32(&prop->refcnt);
952 *hdlp = (cpu_idle_prop_handle_t)prop;
953 rc = 0;
954 break;
955 }
956 }
957 mutex_exit(&cpu_idle_prop_lock);
958
959 return (rc);
960 }
961
962 int
cpu_idle_prop_destroy_handle(cpu_idle_prop_handle_t hdl)963 cpu_idle_prop_destroy_handle(cpu_idle_prop_handle_t hdl)
964 {
965 int rc = ENODEV;
966 cpu_idle_prop_impl_t *prop;
967
968 ASSERT(!CPU_ON_INTR(CPU));
969 if (hdl == NULL) {
970 cmn_err(CE_WARN,
971 "!cpu_event: hdl is NULL in destroy_handle.");
972 return (EINVAL);
973 }
974
975 mutex_enter(&cpu_idle_prop_lock);
976 for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
977 if (prop == hdl) {
978 /* Release refcnt held in create_handle. */
979 ASSERT(prop->refcnt > 1);
980 atomic_dec_32(&prop->refcnt);
981 rc = 0;
982 break;
983 }
984 }
985 mutex_exit(&cpu_idle_prop_lock);
986
987 return (rc);
988 }
989
990 cpu_idle_prop_type_t
cpu_idle_prop_get_type(cpu_idle_prop_handle_t hdl)991 cpu_idle_prop_get_type(cpu_idle_prop_handle_t hdl)
992 {
993 ASSERT(hdl != NULL);
994 return (((cpu_idle_prop_impl_t *)hdl)->type);
995 }
996
997 const char *
cpu_idle_prop_get_name(cpu_idle_prop_handle_t hdl)998 cpu_idle_prop_get_name(cpu_idle_prop_handle_t hdl)
999 {
1000 ASSERT(hdl != NULL);
1001 return (((cpu_idle_prop_impl_t *)hdl)->name);
1002 }
1003
1004 int
cpu_idle_prop_get_value(cpu_idle_prop_handle_t hdl,cpu_idle_callback_context_t ctx,cpu_idle_prop_value_t * valp)1005 cpu_idle_prop_get_value(cpu_idle_prop_handle_t hdl,
1006 cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t *valp)
1007 {
1008 int idx, rc = 0;
1009 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1010
1011 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1012 if (hdl == NULL || valp == NULL) {
1013 cmn_err(CE_NOTE, "!cpu_event: NULL parameters in prop_get.");
1014 return (EINVAL);
1015 }
1016 idx = CPU_IDLE_CTX2IDX(ctx);
1017 if (prop->update != NULL) {
1018 cpu_idle_cb_state_t *sp;
1019
1020 ASSERT(CPU->cpu_seqid < max_ncpus);
1021 sp = &cpu_idle_cb_state[CPU->cpu_seqid];
1022 /* CPU's idle enter timestamp as sequence number. */
1023 rc = prop->update(prop->private,
1024 (uint64_t)sp->v.enter_ts->cipv_hrtime, &prop->value[idx]);
1025 }
1026 if (rc == 0) {
1027 *valp = prop->value[idx];
1028 }
1029
1030 return (rc);
1031 }
1032
1033 uint32_t
cpu_idle_prop_get_uint32(cpu_idle_prop_handle_t hdl,cpu_idle_callback_context_t ctx)1034 cpu_idle_prop_get_uint32(cpu_idle_prop_handle_t hdl,
1035 cpu_idle_callback_context_t ctx)
1036 {
1037 int idx;
1038 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1039
1040 ASSERT(hdl != NULL);
1041 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1042 idx = CPU_IDLE_CTX2IDX(ctx);
1043 return (prop->value[idx].cipv_uint32);
1044 }
1045
1046 uint64_t
cpu_idle_prop_get_uint64(cpu_idle_prop_handle_t hdl,cpu_idle_callback_context_t ctx)1047 cpu_idle_prop_get_uint64(cpu_idle_prop_handle_t hdl,
1048 cpu_idle_callback_context_t ctx)
1049 {
1050 int idx;
1051 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1052
1053 ASSERT(hdl != NULL);
1054 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1055 idx = CPU_IDLE_CTX2IDX(ctx);
1056 return (prop->value[idx].cipv_uint64);
1057 }
1058
1059 intptr_t
cpu_idle_prop_get_intptr(cpu_idle_prop_handle_t hdl,cpu_idle_callback_context_t ctx)1060 cpu_idle_prop_get_intptr(cpu_idle_prop_handle_t hdl,
1061 cpu_idle_callback_context_t ctx)
1062 {
1063 int idx;
1064 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1065
1066 ASSERT(hdl != NULL);
1067 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1068 idx = CPU_IDLE_CTX2IDX(ctx);
1069 return (prop->value[idx].cipv_intptr);
1070 }
1071
1072 hrtime_t
cpu_idle_prop_get_hrtime(cpu_idle_prop_handle_t hdl,cpu_idle_callback_context_t ctx)1073 cpu_idle_prop_get_hrtime(cpu_idle_prop_handle_t hdl,
1074 cpu_idle_callback_context_t ctx)
1075 {
1076 int idx;
1077 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1078
1079 ASSERT(hdl != NULL);
1080 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1081 idx = CPU_IDLE_CTX2IDX(ctx);
1082 return (prop->value[idx].cipv_hrtime);
1083 }
1084
1085 void
cpu_idle_prop_set_value(cpu_idle_prop_handle_t hdl,cpu_idle_callback_context_t ctx,cpu_idle_prop_value_t val)1086 cpu_idle_prop_set_value(cpu_idle_prop_handle_t hdl,
1087 cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t val)
1088 {
1089 int idx;
1090 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1091
1092 ASSERT(hdl != NULL);
1093 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
1094 idx = CPU_IDLE_CTX2IDX(ctx);
1095 prop->value[idx] = val;
1096 }
1097
1098 void
cpu_idle_prop_set_all(cpu_idle_prop_handle_t hdl,cpu_idle_prop_value_t val)1099 cpu_idle_prop_set_all(cpu_idle_prop_handle_t hdl, cpu_idle_prop_value_t val)
1100 {
1101 int i, idx;
1102 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
1103
1104 ASSERT(hdl != NULL);
1105 for (i = 0; i < max_ncpus; i++) {
1106 idx = CPU_IDLE_CTX2IDX(i);
1107 prop->value[idx] = val;
1108 }
1109 }
1110
1111 /*ARGSUSED*/
cpu_idle_prop_update_intr_cnt(void * arg,uint64_t seqnum,cpu_idle_prop_value_t * valp)1112 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
1113 cpu_idle_prop_value_t *valp)
1114 {
1115 int i;
1116 uint64_t val;
1117
1118 for (val = 0, i = 0; i < PIL_MAX; i++) {
1119 val += CPU->cpu_stats.sys.intr[i];
1120 }
1121 valp->cipv_uint64 = val;
1122
1123 return (0);
1124 }
1125
1126 uint_t
cpu_idle_get_cpu_state(cpu_t * cp)1127 cpu_idle_get_cpu_state(cpu_t *cp)
1128 {
1129 ASSERT(cp != NULL && cp->cpu_seqid < max_ncpus);
1130 return ((uint_t)cpu_idle_prop_get_uint32(
1131 cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle,
1132 CPU_IDLE_GET_CTX(cp)));
1133 }
1134
#if defined(__x86)
/*
 * Intercept the CPU cp at a safe point in idle() before it is powered off.
 *
 * Callbacks must already be disabled for cp.  The CPU is added to
 * cpu_idle_intercept_set and poked; this function then waits until the
 * target has removed itself from the set, which it does in cpu_idle_enter()
 * after disabling interrupts and just before spinning forever.
 */
void
cpu_idle_intercept_cpu(cpu_t *cp)
{
	ASSERT(cp->cpu_seqid < max_ncpus);
	ASSERT(cpu_idle_cb_state[cp->cpu_seqid].v.enabled == B_FALSE);

	/* Ask for the CPU to be intercepted. */
	CPUSET_ATOMIC_ADD(cpu_idle_intercept_set, cp->cpu_id);

	/* Kick the CPU in case it is sitting in a sleep state. */
	poke_cpu(cp->cpu_id);

	/* Wait for the target to acknowledge by clearing its bit. */
	for (;;) {
		if (!CPU_IN_SET(cpu_idle_intercept_set, cp->cpu_id)) {
			break;
		}
		DELAY(1);
	}

	/*
	 * From here on the target CPU is spinning in a pause loop with
	 * interrupts disabled.
	 */
}
#endif
1157