1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2009-2010, Intel Corporation. 23 * All rights reserved. 24 */ 25 26 /* 27 * Introduction 28 * This file implements a CPU event notification mechanism to signal clients 29 * which are interested in CPU related events. 30 * Currently it only supports CPU idle state change events which will be 31 * triggered just before CPU entering hardware idle state and just after CPU 32 * wakes up from hardware idle state. 33 * Please refer to PSARC/2009/115 for detail information. 34 * 35 * Lock Strategy 36 * 1) cpu_idle_prop_busy/free are protected by cpu_idle_prop_lock. 37 * 2) No protection for cpu_idle_cb_state because it's per-CPU data. 38 * 3) cpu_idle_cb_busy is protected by cpu_idle_cb_lock. 39 * 4) cpu_idle_cb_array is protected by pause_cpus/start_cpus logic. 40 * 5) cpu_idle_cb_max/curr are protected by both cpu_idle_cb_lock and 41 * pause_cpus/start_cpus logic. 42 * We have optimized the algorithm for hot path on read side access. 43 * In the current algorithm, it's lock free on read side access. 
44 * On write side, we use pause_cpus() to keep other CPUs in the pause thread, 45 * which will guarantee that no other threads will access 46 * cpu_idle_cb_max/curr/array data structure. 47 */ 48 49 #include <sys/types.h> 50 #include <sys/cmn_err.h> 51 #include <sys/cpuvar.h> 52 #include <sys/cpu.h> 53 #include <sys/kmem.h> 54 #include <sys/machcpuvar.h> 55 #include <sys/sdt.h> 56 #include <sys/sysmacros.h> 57 #include <sys/synch.h> 58 #include <sys/systm.h> 59 #include <sys/sunddi.h> 60 #if defined(__sparc) 61 #include <sys/machsystm.h> 62 #elif defined(__x86) 63 #include <sys/archsystm.h> 64 #endif 65 #include <sys/cpu_event.h> 66 67 /* Define normal state for CPU on different platforms. */ 68 #if defined(__x86) 69 #define CPU_IDLE_STATE_NORMAL IDLE_STATE_C0 70 #elif defined(__sparc) 71 /* 72 * At the time of this implementation IDLE_STATE_NORMAL is defined 73 * in mach_startup.c, and not in a header file. So if we find it is 74 * undefined, then we set it to the value as defined in mach_startup.c 75 * Should it eventually be defined, we will pick it up. 76 */ 77 #ifndef IDLE_STATE_NORMAL 78 #define IDLE_STATE_NORMAL 0 79 #endif 80 #define CPU_IDLE_STATE_NORMAL IDLE_STATE_NORMAL 81 #endif 82 83 /* 84 * To improve cache efficiency and avoid cache false sharing, CPU idle 85 * properties are grouped into cache lines as below: 86 * | CPU0 | CPU1 |.........| CPUn | 87 * | cache line 0 | cache line 1 |.........| cache line n | 88 * | v0 | ... | vm | v0 | ... | vm |.........| v0 | ... | vm | 89 * To access value of property m for CPU n, using following value as index: 90 * index = seq_id_of_CPUn * CPU_IDLE_VALUE_GROUP_SIZE + m. 91 */ 92 #define CPU_IDLE_VALUE_GROUP_SIZE \ 93 (CPU_CACHE_COHERENCE_SIZE / sizeof (cpu_idle_prop_value_t)) 94 95 /* Get callback context handle for current CPU. */ 96 #define CPU_IDLE_GET_CTX(cp) \ 97 ((cpu_idle_callback_context_t)(intptr_t)((cp)->cpu_seqid)) 98 99 /* Get CPU sequential id from ctx. 
 */
#define	CPU_IDLE_CTX2CPUID(ctx)	((processorid_t)(intptr_t)(ctx))

/* Compute index from callback context handle. */
#define	CPU_IDLE_CTX2IDX(ctx)	\
	(((int)(intptr_t)(ctx)) * CPU_IDLE_VALUE_GROUP_SIZE)

/* Resolve the per-CPU value slot at 'idx' for property handle 'hdl'. */
#define	CPU_IDLE_HDL2VALP(hdl, idx)	\
	(&((cpu_idle_prop_impl_t *)(hdl))->value[(idx)])

/*
 * When cpu_idle_cb_array is NULL or full, increase CPU_IDLE_ARRAY_CAPACITY_INC
 * entries every time. Here we prefer linear growth instead of exponential.
 */
#define	CPU_IDLE_ARRAY_CAPACITY_INC	0x10

/* Internal representation of one idle property (name, type, values). */
typedef struct cpu_idle_prop_impl {
	cpu_idle_prop_value_t	*value;		/* per-CPU value array */
	struct cpu_idle_prop_impl *next;	/* busy/free list linkage */
	char			*name;		/* strdup'ed property name */
	cpu_idle_prop_update_t	update;		/* optional refresh hook */
	void			*private;	/* argument passed to update */
	cpu_idle_prop_type_t	type;
	uint32_t		refcnt;		/* creator + open handles */
} cpu_idle_prop_impl_t;

/* Static descriptor used to create the built-in properties at init time. */
typedef struct cpu_idle_prop_item {
	cpu_idle_prop_type_t	type;
	char			*name;
	cpu_idle_prop_update_t	update;
	void			*arg;
	cpu_idle_prop_handle_t	handle;		/* filled by create_property */
} cpu_idle_prop_item_t;

/* Structure to maintain registered callbacks in list. */
typedef struct cpu_idle_cb_impl {
	struct cpu_idle_cb_impl	*next;
	cpu_idle_callback_t	*callback;
	void			*argument;
	int			priority;
} cpu_idle_cb_impl_t;

/*
 * Structure to maintain registered callbacks in priority order and also
 * optimized for cache efficiency for reading access.
 */
typedef struct cpu_idle_cb_item {
	cpu_idle_enter_cbfn_t	enter;
	cpu_idle_exit_cbfn_t	exit;
	void			*arg;
	cpu_idle_cb_impl_t	*impl;
} cpu_idle_cb_item_t;

/* Per-CPU state aligned to CPU_CACHE_COHERENCE_SIZE to avoid false sharing. */
typedef union cpu_idle_cb_state {
	struct {
		/* Index of already invoked callbacks. */
		int		index;
		/* Invoke registered callbacks if true. */
		boolean_t	enabled;
		/* Property values are valid if true. */
		boolean_t	ready;
		/* Pointers to per-CPU property value slots (cached). */
		cpu_idle_prop_value_t	*idle_state;
		cpu_idle_prop_value_t	*enter_ts;
		cpu_idle_prop_value_t	*exit_ts;
		cpu_idle_prop_value_t	*last_idle;
		cpu_idle_prop_value_t	*last_busy;
		cpu_idle_prop_value_t	*total_idle;
		cpu_idle_prop_value_t	*total_busy;
		cpu_idle_prop_value_t	*intr_cnt;
	} v;
#ifdef	_LP64
	char			align[2 * CPU_CACHE_COHERENCE_SIZE];
#else
	char			align[CPU_CACHE_COHERENCE_SIZE];
#endif
} cpu_idle_cb_state_t;

/* Protects cpu_idle_prop_busy/free lists. */
static kmutex_t				cpu_idle_prop_lock;
static cpu_idle_prop_impl_t		*cpu_idle_prop_busy = NULL;
static cpu_idle_prop_impl_t		*cpu_idle_prop_free = NULL;

/* Protects cpu_idle_cb_busy; array/max/curr also need CPUs paused. */
static kmutex_t				cpu_idle_cb_lock;
static cpu_idle_cb_impl_t		*cpu_idle_cb_busy = NULL;
static cpu_idle_cb_item_t		*cpu_idle_cb_array = NULL;
static int				cpu_idle_cb_curr = 0;
static int				cpu_idle_cb_max = 0;

/* Cache-line-aligned per-CPU state array, indexed by cpu_seqid. */
static cpu_idle_cb_state_t		*cpu_idle_cb_state;

#ifdef	__x86
/*
 * cpuset used to intercept CPUs before powering them off.
 * The control CPU sets the bit corresponding to the target CPU and waits
 * until the bit is cleared.
 * The target CPU disables interrupts before clearing corresponding bit and
 * then loops for ever.
197 */ 198 static cpuset_t cpu_idle_intercept_set; 199 #endif 200 201 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum, 202 cpu_idle_prop_value_t *valp); 203 204 static cpu_idle_prop_item_t cpu_idle_prop_array[] = { 205 { 206 CPU_IDLE_PROP_TYPE_INTPTR, CPU_IDLE_PROP_IDLE_STATE, 207 NULL, NULL, NULL 208 }, 209 { 210 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_ENTER_TIMESTAMP, 211 NULL, NULL, NULL 212 }, 213 { 214 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_EXIT_TIMESTAMP, 215 NULL, NULL, NULL 216 }, 217 { 218 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_IDLE_TIME, 219 NULL, NULL, NULL 220 }, 221 { 222 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_BUSY_TIME, 223 NULL, NULL, NULL 224 }, 225 { 226 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_IDLE_TIME, 227 NULL, NULL, NULL 228 }, 229 { 230 CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_BUSY_TIME, 231 NULL, NULL, NULL 232 }, 233 { 234 CPU_IDLE_PROP_TYPE_UINT64, CPU_IDLE_PROP_INTERRUPT_COUNT, 235 cpu_idle_prop_update_intr_cnt, NULL, NULL 236 }, 237 }; 238 239 #define CPU_IDLE_PROP_IDX_IDLE_STATE 0 240 #define CPU_IDLE_PROP_IDX_ENTER_TS 1 241 #define CPU_IDLE_PROP_IDX_EXIT_TS 2 242 #define CPU_IDLE_PROP_IDX_LAST_IDLE 3 243 #define CPU_IDLE_PROP_IDX_LAST_BUSY 4 244 #define CPU_IDLE_PROP_IDX_TOTAL_IDLE 5 245 #define CPU_IDLE_PROP_IDX_TOTAL_BUSY 6 246 #define CPU_IDLE_PROP_IDX_INTR_CNT 7 247 248 /*ARGSUSED*/ 249 static void 250 cpu_idle_dtrace_enter(void *arg, cpu_idle_callback_context_t ctx, 251 cpu_idle_check_wakeup_t check_func, void *check_arg) 252 { 253 int state; 254 255 state = cpu_idle_prop_get_intptr( 256 cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle, ctx); 257 DTRACE_PROBE1(idle__state__transition, uint_t, state); 258 } 259 260 /*ARGSUSED*/ 261 static void 262 cpu_idle_dtrace_exit(void *arg, cpu_idle_callback_context_t ctx, int flag) 263 { 264 DTRACE_PROBE1(idle__state__transition, uint_t, CPU_IDLE_STATE_NORMAL); 265 } 266 267 static cpu_idle_callback_handle_t cpu_idle_cb_handle_dtrace; 
268 static cpu_idle_callback_t cpu_idle_callback_dtrace = { 269 CPU_IDLE_CALLBACK_VERS, 270 cpu_idle_dtrace_enter, 271 cpu_idle_dtrace_exit, 272 }; 273 274 #if defined(__x86) && !defined(__xpv) 275 extern void tlb_going_idle(void); 276 extern void tlb_service(void); 277 278 static cpu_idle_callback_handle_t cpu_idle_cb_handle_tlb; 279 static cpu_idle_callback_t cpu_idle_callback_tlb = { 280 CPU_IDLE_CALLBACK_VERS, 281 (cpu_idle_enter_cbfn_t)tlb_going_idle, 282 (cpu_idle_exit_cbfn_t)tlb_service, 283 }; 284 #endif 285 286 void 287 cpu_event_init(void) 288 { 289 int i, idx; 290 size_t sz; 291 intptr_t buf; 292 cpu_idle_cb_state_t *sp; 293 cpu_idle_prop_item_t *ip; 294 295 mutex_init(&cpu_idle_cb_lock, NULL, MUTEX_DRIVER, NULL); 296 mutex_init(&cpu_idle_prop_lock, NULL, MUTEX_DRIVER, NULL); 297 298 /* Create internal properties. */ 299 for (i = 0, ip = cpu_idle_prop_array; 300 i < sizeof (cpu_idle_prop_array) / sizeof (cpu_idle_prop_array[0]); 301 i++, ip++) { 302 (void) cpu_idle_prop_create_property(ip->name, ip->type, 303 ip->update, ip->arg, &ip->handle); 304 ASSERT(ip->handle != NULL); 305 } 306 307 /* Allocate buffer and align to CPU_CACHE_COHERENCE_SIZE. */ 308 sz = sizeof (cpu_idle_cb_state_t) * max_ncpus; 309 sz += CPU_CACHE_COHERENCE_SIZE; 310 buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP); 311 cpu_idle_cb_state = (cpu_idle_cb_state_t *)P2ROUNDUP(buf, 312 CPU_CACHE_COHERENCE_SIZE); 313 314 /* Cache frequently used property value pointers. 
*/ 315 for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) { 316 idx = CPU_IDLE_CTX2IDX(i); 317 #define ___INIT_P(f, i) \ 318 sp->v.f = CPU_IDLE_HDL2VALP(cpu_idle_prop_array[(i)].handle, idx) 319 ___INIT_P(idle_state, CPU_IDLE_PROP_IDX_IDLE_STATE); 320 ___INIT_P(enter_ts, CPU_IDLE_PROP_IDX_ENTER_TS); 321 ___INIT_P(exit_ts, CPU_IDLE_PROP_IDX_EXIT_TS); 322 ___INIT_P(last_idle, CPU_IDLE_PROP_IDX_LAST_IDLE); 323 ___INIT_P(last_busy, CPU_IDLE_PROP_IDX_LAST_BUSY); 324 ___INIT_P(total_idle, CPU_IDLE_PROP_IDX_TOTAL_IDLE); 325 ___INIT_P(total_busy, CPU_IDLE_PROP_IDX_TOTAL_BUSY); 326 ___INIT_P(last_idle, CPU_IDLE_PROP_IDX_INTR_CNT); 327 #undef ___INIT_P 328 } 329 330 /* Register built-in callbacks. */ 331 if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_DTRACE, 332 &cpu_idle_callback_dtrace, NULL, &cpu_idle_cb_handle_dtrace) != 0) { 333 cmn_err(CE_PANIC, 334 "cpu_idle: failed to register callback for dtrace."); 335 } 336 #if defined(__x86) && !defined(__xpv) 337 if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_TLB, 338 &cpu_idle_callback_tlb, NULL, &cpu_idle_cb_handle_tlb) != 0) { 339 cmn_err(CE_PANIC, 340 "cpu_idle: failed to register callback for tlb_flush."); 341 } 342 #endif 343 } 344 345 /* 346 * This function is called to initialize per CPU state when starting CPUs. 347 */ 348 void 349 cpu_event_init_cpu(cpu_t *cp) 350 { 351 ASSERT(cp->cpu_seqid < max_ncpus); 352 cpu_idle_cb_state[cp->cpu_seqid].v.index = 0; 353 cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE; 354 cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_TRUE; 355 } 356 357 /* 358 * This function is called to clean up per CPU state when stopping CPUs. 
 */
void
cpu_event_fini_cpu(cpu_t *cp)
{
	ASSERT(cp->cpu_seqid < max_ncpus);
	cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_FALSE;
	cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
}

/*
 * Insert cip into cpu_idle_cb_array in descending priority order.
 * The array may be grown (linearly) first; all other CPUs are paused
 * while the array is modified so idle-path readers never observe a
 * partially updated array.
 */
static void
cpu_idle_insert_callback(cpu_idle_cb_impl_t *cip)
{
	int unlock = 0, unpause = 0;
	int i, cnt_new = 0, cnt_old = 0;
	char *buf_new = NULL, *buf_old = NULL;

	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));

	/*
	 * Expand array if it's full.
	 * Memory must be allocated out of pause/start_cpus() scope because
	 * kmem_zalloc() can't be called with KM_SLEEP flag within that scope.
	 */
	if (cpu_idle_cb_curr == cpu_idle_cb_max) {
		cnt_new = cpu_idle_cb_max + CPU_IDLE_ARRAY_CAPACITY_INC;
		buf_new = (char *)kmem_zalloc(cnt_new *
		    sizeof (cpu_idle_cb_item_t), KM_SLEEP);
	}

	/* Try to acquire cpu_lock if not held yet. */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		unlock = 1;
	}
	/*
	 * Pause all other CPUs (and let them run pause thread).
	 * It's guaranteed that no other threads will access cpu_idle_cb_array
	 * after pause_cpus().
	 */
	if (!cpus_paused()) {
		pause_cpus(NULL);
		unpause = 1;
	}

	/* Copy content to new buffer if needed. */
	if (buf_new != NULL) {
		buf_old = (char *)cpu_idle_cb_array;
		cnt_old = cpu_idle_cb_max;
		if (buf_old != NULL) {
			ASSERT(cnt_old != 0);
			bcopy(cpu_idle_cb_array, buf_new,
			    sizeof (cpu_idle_cb_item_t) * cnt_old);
		}
		cpu_idle_cb_array = (cpu_idle_cb_item_t *)buf_new;
		cpu_idle_cb_max = cnt_new;
	}

	/* Insert into array according to priority. */
	ASSERT(cpu_idle_cb_curr < cpu_idle_cb_max);
	for (i = cpu_idle_cb_curr; i > 0; i--) {
		if (cpu_idle_cb_array[i - 1].impl->priority >= cip->priority) {
			break;
		}
		cpu_idle_cb_array[i] = cpu_idle_cb_array[i - 1];
	}
	cpu_idle_cb_array[i].arg = cip->argument;
	cpu_idle_cb_array[i].enter = cip->callback->idle_enter;
	cpu_idle_cb_array[i].exit = cip->callback->idle_exit;
	cpu_idle_cb_array[i].impl = cip;
	cpu_idle_cb_curr++;

	/* Resume other CPUs from paused state if needed. */
	if (unpause) {
		start_cpus();
	}
	if (unlock) {
		mutex_exit(&cpu_lock);
	}

	/* Free old resource if needed. */
	if (buf_old != NULL) {
		ASSERT(cnt_old != 0);
		kmem_free(buf_old, cnt_old * sizeof (cpu_idle_cb_item_t));
	}
}

/*
 * Remove cip from cpu_idle_cb_array, compacting the array in place.
 * All other CPUs are paused during the update, as in insert_callback.
 */
static void
cpu_idle_remove_callback(cpu_idle_cb_impl_t *cip)
{
	int i, found = 0;
	int unlock = 0, unpause = 0;
	cpu_idle_cb_state_t *sp;

	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));

	/* Try to acquire cpu_lock if not held yet. */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		unlock = 1;
	}
	/*
	 * Pause all other CPUs.
	 * It's guaranteed that no other threads will access cpu_idle_cb_array
	 * after pause_cpus().
	 */
	if (!cpus_paused()) {
		pause_cpus(NULL);
		unpause = 1;
	}

	/* Remove cip from array. */
	for (i = 0; i < cpu_idle_cb_curr; i++) {
		if (found == 0) {
			if (cpu_idle_cb_array[i].impl == cip) {
				found = 1;
			}
		} else {
			cpu_idle_cb_array[i - 1] = cpu_idle_cb_array[i];
		}
	}
	ASSERT(found != 0);
	cpu_idle_cb_curr--;

	/*
	 * Reset property ready flag for all CPUs if no registered callback
	 * left because cpu_idle_enter/exit will stop updating property if
	 * there's no callback registered.
	 */
	if (cpu_idle_cb_curr == 0) {
		for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
			sp->v.ready = B_FALSE;
		}
	}

	/* Resume other CPUs from paused state if needed. */
	if (unpause) {
		start_cpus();
	}
	if (unlock) {
		mutex_exit(&cpu_lock);
	}
}

/*
 * Register an idle-state callback at the given priority.
 * Returns 0 on success; EBUSY when called from within a callback,
 * EINVAL on bad parameters/priority/version, EEXIST when a non-dynamic
 * priority is already taken.  On success *hdlp receives the handle to
 * pass to cpu_idle_unregister_callback().
 */
int
cpu_idle_register_callback(uint_t prio, cpu_idle_callback_t *cbp,
    void *arg, cpu_idle_callback_handle_t *hdlp)
{
	cpu_idle_cb_state_t *sp;
	cpu_idle_cb_impl_t *cip = NULL;

	/* First validate parameters. */
	ASSERT(!CPU_ON_INTR(CPU));
	ASSERT(CPU->cpu_seqid < max_ncpus);
	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
	/* A non-zero index means we are inside a callback on this CPU. */
	if (sp->v.index != 0) {
		cmn_err(CE_NOTE,
		    "!cpu_event: register_callback called from callback.");
		return (EBUSY);
	} else if (cbp == NULL || hdlp == NULL) {
		cmn_err(CE_NOTE,
		    "!cpu_event: NULL parameters in register_callback.");
		return (EINVAL);
	} else if (prio < CPU_IDLE_CB_PRIO_LOW_BASE ||
	    prio >= CPU_IDLE_CB_PRIO_RESV_BASE) {
		cmn_err(CE_NOTE,
		    "!cpu_event: priority 0x%x out of range.", prio);
		return (EINVAL);
	} else if (cbp->version != CPU_IDLE_CALLBACK_VERS) {
		cmn_err(CE_NOTE,
		    "!cpu_event: callback version %d is not supported.",
		    cbp->version);
		return (EINVAL);
	}

	mutex_enter(&cpu_idle_cb_lock);
	/* Check whether callback with priority exists if not dynamic. */
	if (prio != CPU_IDLE_CB_PRIO_DYNAMIC) {
		for (cip = cpu_idle_cb_busy; cip != NULL;
		    cip = cip->next) {
			if (cip->priority == prio) {
				mutex_exit(&cpu_idle_cb_lock);
				cmn_err(CE_NOTE, "!cpu_event: callback with "
				    "priority 0x%x already exists.", prio);
				return (EEXIST);
			}
		}
	}

	cip = kmem_zalloc(sizeof (*cip), KM_SLEEP);
	cip->callback = cbp;
	cip->argument = arg;
	cip->priority = prio;
	cip->next = cpu_idle_cb_busy;
	cpu_idle_cb_busy = cip;
	cpu_idle_insert_callback(cip);
	mutex_exit(&cpu_idle_cb_lock);

	*hdlp = (cpu_idle_callback_handle_t)cip;

	return (0);
}

/*
 * Unregister a callback previously registered with
 * cpu_idle_register_callback().  Returns 0 on success, EBUSY when
 * called from within a callback, EINVAL for a NULL handle, ENODEV
 * when the handle is not on the busy list.
 */
int
cpu_idle_unregister_callback(cpu_idle_callback_handle_t hdl)
{
	int rc = ENODEV;
	cpu_idle_cb_state_t *sp;
	cpu_idle_cb_impl_t *ip, **ipp;

	ASSERT(!CPU_ON_INTR(CPU));
	ASSERT(CPU->cpu_seqid < max_ncpus);
	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
	if (sp->v.index != 0) {
		cmn_err(CE_NOTE,
		    "!cpu_event: unregister_callback called from callback.");
		return (EBUSY);
	} else if (hdl == NULL) {
		cmn_err(CE_NOTE,
		    "!cpu_event: hdl is NULL in unregister_callback.");
		return (EINVAL);
	}

	ip = (cpu_idle_cb_impl_t *)hdl;
	mutex_enter(&cpu_idle_cb_lock);
	for (ipp = &cpu_idle_cb_busy; *ipp != NULL; ipp = &(*ipp)->next) {
		if (*ipp == ip) {
			*ipp = ip->next;
			cpu_idle_remove_callback(ip);
			rc = 0;
			break;
		}
	}
	mutex_exit(&cpu_idle_cb_lock);

	if (rc == 0) {
		kmem_free(ip, sizeof (*ip));
	} else {
		cmn_err(CE_NOTE,
		    "!cpu_event: callback handle %p not found.", (void *)hdl);
	}

	return (rc);
}

/*
 * Account for entering an idle state: record the state, stamp the entry
 * time and update last/total busy statistics.  Returns 0 when the
 * statistics were not yet primed (callbacks should be skipped this
 * pass), 1 when properties are valid.
 */
static int
cpu_idle_enter_state(cpu_idle_cb_state_t *sp, intptr_t state)
{
	sp->v.idle_state->cipv_intptr = state;
	sp->v.enter_ts->cipv_hrtime = gethrtime_unscaled();
	sp->v.last_busy->cipv_hrtime = sp->v.enter_ts->cipv_hrtime -
	    sp->v.exit_ts->cipv_hrtime;
	sp->v.total_busy->cipv_hrtime += sp->v.last_busy->cipv_hrtime;
	/* First transition after enabling only primes the statistics. */
	if (sp->v.ready == B_FALSE) {
		sp->v.ready = B_TRUE;
		return (0);
	}

	return (1);
}

/*
 * Account for leaving an idle state: restore the normal state, stamp
 * the exit time and update last/total idle statistics.
 */
static void
cpu_idle_exit_state(cpu_idle_cb_state_t *sp)
{
	sp->v.idle_state->cipv_intptr = CPU_IDLE_STATE_NORMAL;
	sp->v.exit_ts->cipv_hrtime = gethrtime_unscaled();
	sp->v.last_idle->cipv_hrtime = sp->v.exit_ts->cipv_hrtime -
	    sp->v.enter_ts->cipv_hrtime;
	sp->v.total_idle->cipv_hrtime += sp->v.last_idle->cipv_hrtime;
}

/*
 * Notify registered callbacks that the current CPU is about to enter
 * hardware idle state 'state'.  Returns 0 when the caller may proceed
 * into the idle state, EBUSY when an interrupt was serviced during
 * callback processing and the caller must NOT enter the idle state.
 * Called from the idle thread; on x86 interrupts may be enabled or
 * disabled on entry.
 */
/*ARGSUSED*/
int
cpu_idle_enter(int state, int flag,
    cpu_idle_check_wakeup_t check_func, void *check_arg)
{
	int i;
	cpu_idle_cb_item_t *cip;
	cpu_idle_cb_state_t *sp;
	cpu_idle_callback_context_t ctx;
#if defined(__x86)
	ulong_t iflags;
#endif

	ctx = CPU_IDLE_GET_CTX(CPU);
	ASSERT(CPU->cpu_seqid < max_ncpus);
	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
	ASSERT(sp->v.index == 0);
	if (sp->v.enabled == B_FALSE) {
#if defined(__x86)
		/* Intercept CPU at a safe point before powering off it. */
		if (CPU_IN_SET(cpu_idle_intercept_set, CPU->cpu_id)) {
			iflags = intr_clear();
			CPUSET_ATOMIC_DEL(cpu_idle_intercept_set, CPU->cpu_id);
			/*CONSTCOND*/
			while (1) {
				SMT_PAUSE();
			}
		}
#endif

		return (0);
	}

	/*
	 * On x86, cpu_idle_enter can be called from idle thread with either
	 * interrupts enabled or disabled, so we need to make sure interrupts
	 * are disabled here.
	 * On SPARC, cpu_idle_enter will be called from idle thread with
	 * interrupt disabled, so no special handling necessary.
	 */
#if defined(__x86)
	iflags = intr_clear();
#endif

	/* Skip calling callback if state is not ready for current CPU. */
	if (cpu_idle_enter_state(sp, state) == 0) {
#if defined(__x86)
		intr_restore(iflags);
#endif
		return (0);
	}

	for (i = 0, cip = cpu_idle_cb_array; i < cpu_idle_cb_curr; i++, cip++) {
		/*
		 * Increase index so corresponding idle_exit callback
		 * will be invoked should interrupt happen during
		 * idle_enter callback.
		 */
		sp->v.index++;

		/* Call idle_enter callback function if it's not NULL. */
		if (cip->enter != NULL) {
			cip->enter(cip->arg, ctx, check_func, check_arg);

			/*
			 * cpu_idle_enter runs with interrupts
			 * disabled, so the idle_enter callbacks will
			 * also be called with interrupts disabled.
			 * It is permissible for the callbacks to
			 * enable the interrupts, if they can also
			 * handle the condition if the interrupt
			 * occurs.
			 *
			 * However, if an interrupt occurs and we
			 * return here without dealing with it, we
			 * return to the cpu_idle_enter() caller
			 * with an EBUSY, and the caller will not
			 * enter the idle state.
			 *
			 * We detect the interrupt, by checking the
			 * index value of the state pointer.  If it
			 * is not the index we incremented above,
			 * then it was cleared while processing
			 * the interrupt.
			 *
			 * Also note, that at this point of the code
			 * the normal index value will be one greater
			 * than the variable 'i' in the loop, as it
			 * hasn't yet been incremented.
			 */
			if (sp->v.index != i + 1) {
#if defined(__x86)
				intr_restore(iflags);
#endif
				return (EBUSY);
			}
		}
	}
#if defined(__x86)
	intr_restore(iflags);
#endif

	return (0);
}

/*
 * Notify registered callbacks that the current CPU has left its idle
 * state.  Invokes the idle_exit callbacks in reverse order of the
 * idle_enter calls and resets the per-CPU callback index.
 */
void
cpu_idle_exit(int flag)
{
	int i;
	cpu_idle_cb_item_t *cip;
	cpu_idle_cb_state_t *sp;
	cpu_idle_callback_context_t ctx;
#if defined(__x86)
	ulong_t iflags;
#endif

	ASSERT(CPU->cpu_seqid < max_ncpus);
	sp = &cpu_idle_cb_state[CPU->cpu_seqid];

#if defined(__sparc)
	/*
	 * On SPARC, cpu_idle_exit will only be called from idle thread
	 * with interrupt disabled.
	 */

	if (sp->v.index != 0) {
		ctx = CPU_IDLE_GET_CTX(CPU);
		cpu_idle_exit_state(sp);
		for (i = sp->v.index - 1; i >= 0; i--) {
			cip = &cpu_idle_cb_array[i];
			if (cip->exit != NULL) {
				cip->exit(cip->arg, ctx, flag);
			}
		}
		sp->v.index = 0;
	}
#elif defined(__x86)
	/*
	 * On x86, cpu_idle_exit will be called from idle thread or interrupt
	 * handler. When called from interrupt handler, interrupts will be
	 * disabled. When called from idle thread, interrupts may be disabled
	 * or enabled.
	 */

	/* Called from interrupt, interrupts are already disabled. */
	if (flag & CPU_IDLE_CB_FLAG_INTR) {
		/*
		 * return if cpu_idle_exit already called or
		 * there is no registered callback.
		 */
		if (sp->v.index == 0) {
			return;
		}
		ctx = CPU_IDLE_GET_CTX(CPU);
		cpu_idle_exit_state(sp);
		for (i = sp->v.index - 1; i >= 0; i--) {
			cip = &cpu_idle_cb_array[i];
			if (cip->exit != NULL) {
				cip->exit(cip->arg, ctx, flag);
			}
		}
		sp->v.index = 0;

	/* Called from idle thread, need to disable interrupt.
	 */
	} else {
		iflags = intr_clear();
		if (sp->v.index != 0) {
			ctx = CPU_IDLE_GET_CTX(CPU);
			cpu_idle_exit_state(sp);
			for (i = sp->v.index - 1; i >= 0; i--) {
				cip = &cpu_idle_cb_array[i];
				if (cip->exit != NULL) {
					cip->exit(cip->arg, ctx, flag);
				}
			}
			sp->v.index = 0;
		}
		intr_restore(iflags);
	}
#endif
}

/* Return the callback context handle for the current CPU. */
cpu_idle_callback_context_t
cpu_idle_get_context(void)
{
	return (CPU_IDLE_GET_CTX(CPU));
}

/*
 * Allocate property structure in group of CPU_IDLE_VALUE_GROUP_SIZE to improve
 * cache efficiency. To simplify implementation, allocated memory for property
 * structure won't be freed.
 */
static void
cpu_idle_prop_allocate_impl(void)
{
	int i;
	size_t sz;
	intptr_t buf;
	cpu_idle_prop_impl_t *prop;
	cpu_idle_prop_value_t *valp;

	ASSERT(!CPU_ON_INTR(CPU));
	prop = kmem_zalloc(sizeof (*prop) * CPU_IDLE_VALUE_GROUP_SIZE,
	    KM_SLEEP);
	sz = sizeof (*valp) * CPU_IDLE_VALUE_GROUP_SIZE * max_ncpus;
	sz += CPU_CACHE_COHERENCE_SIZE;
	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
	valp = (cpu_idle_prop_value_t *)P2ROUNDUP(buf,
	    CPU_CACHE_COHERENCE_SIZE);

	/* Thread the new property structures onto the free list. */
	for (i = 0; i < CPU_IDLE_VALUE_GROUP_SIZE; i++, prop++, valp++) {
		prop->value = valp;
		prop->next = cpu_idle_prop_free;
		cpu_idle_prop_free = prop;
	}
}

/*
 * Create a named idle property.  Returns 0 and fills *hdlp on success,
 * EINVAL on NULL parameters, EEXIST if the name is already in use.
 * 'update' (may be NULL) is invoked with 'arg' to refresh the value on
 * each cpu_idle_prop_get_value() call.
 */
int
cpu_idle_prop_create_property(const char *name, cpu_idle_prop_type_t type,
    cpu_idle_prop_update_t update, void *arg, cpu_idle_prop_handle_t *hdlp)
{
	int rc = EEXIST;
	cpu_idle_prop_impl_t *prop;

	ASSERT(!CPU_ON_INTR(CPU));
	if (name == NULL || hdlp == NULL) {
		cmn_err(CE_WARN,
		    "!cpu_event: NULL parameters in create_property.");
		return (EINVAL);
	}

	mutex_enter(&cpu_idle_prop_lock);
	/* Loop exits with prop != NULL on a name collision. */
	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
		if (strcmp(prop->name, name) == 0) {
			cmn_err(CE_NOTE,
			    "!cpu_event: property %s already exists.", name);
			break;
		}
	}
	if (prop == NULL) {
		if (cpu_idle_prop_free == NULL) {
			cpu_idle_prop_allocate_impl();
		}
		ASSERT(cpu_idle_prop_free != NULL);
		prop = cpu_idle_prop_free;
		cpu_idle_prop_free = prop->next;
		prop->next = cpu_idle_prop_busy;
		cpu_idle_prop_busy = prop;

		ASSERT(prop->value != NULL);
		prop->name = strdup(name);
		prop->type = type;
		prop->update = update;
		prop->private = arg;
		prop->refcnt = 1;
		*hdlp = prop;
		rc = 0;
	}
	mutex_exit(&cpu_idle_prop_lock);

	return (rc);
}

/*
 * Destroy a property created by cpu_idle_prop_create_property().
 * Returns 0 on success, EINVAL for a NULL handle, EBUSY if handles are
 * still open on it, ENODEV if the handle is unknown.
 */
int
cpu_idle_prop_destroy_property(cpu_idle_prop_handle_t hdl)
{
	int rc = ENODEV;
	cpu_idle_prop_impl_t *prop, **propp;
	cpu_idle_prop_value_t *valp;

	ASSERT(!CPU_ON_INTR(CPU));
	if (hdl == NULL) {
		cmn_err(CE_WARN,
		    "!cpu_event: hdl is NULL in destroy_property.");
		return (EINVAL);
	}

	prop = (cpu_idle_prop_impl_t *)hdl;
	mutex_enter(&cpu_idle_prop_lock);
	for (propp = &cpu_idle_prop_busy; *propp != NULL;
	    propp = &(*propp)->next) {
		if (*propp == prop) {
			ASSERT(prop->refcnt > 0);
			/* Succeed only if the creator's ref is the last one. */
			if (atomic_cas_32(&prop->refcnt, 1, 0) == 1) {
				*propp = prop->next;
				strfree(prop->name);
				/* Recycle the impl, keeping its value array. */
				valp = prop->value;
				bzero(prop, sizeof (*prop));
				prop->value = valp;
				prop->next = cpu_idle_prop_free;
				cpu_idle_prop_free = prop;
				rc = 0;
			} else {
				rc = EBUSY;
			}
			break;
		}
	}
	mutex_exit(&cpu_idle_prop_lock);

	return (rc);
}

/*
 * Look up a property by name and return a reference-counted handle in
 * *hdlp.  Returns 0 on success, EINVAL on NULL parameters, ENODEV when
 * no property with that name exists.
 */
int
cpu_idle_prop_create_handle(const char *name, cpu_idle_prop_handle_t *hdlp)
{
	int rc = ENODEV;
	cpu_idle_prop_impl_t *prop;

	ASSERT(!CPU_ON_INTR(CPU));
	if (name == NULL || hdlp == NULL) {
		cmn_err(CE_WARN,
		    "!cpu_event: NULL parameters in create_handle.");
		return (EINVAL);
	}

	mutex_enter(&cpu_idle_prop_lock);
	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
		if (strcmp(prop->name, name) == 0) {
			/* Hold one refcount on object. */
			ASSERT(prop->refcnt > 0);
			atomic_inc_32(&prop->refcnt);
			*hdlp = (cpu_idle_prop_handle_t)prop;
			rc = 0;
			break;
		}
	}
	mutex_exit(&cpu_idle_prop_lock);

	return (rc);
}

/*
 * Release a handle obtained from cpu_idle_prop_create_handle().
 * Returns 0 on success, EINVAL for a NULL handle, ENODEV if unknown.
 */
int
cpu_idle_prop_destroy_handle(cpu_idle_prop_handle_t hdl)
{
	int rc = ENODEV;
	cpu_idle_prop_impl_t *prop;

	ASSERT(!CPU_ON_INTR(CPU));
	if (hdl == NULL) {
		cmn_err(CE_WARN,
		    "!cpu_event: hdl is NULL in destroy_handle.");
		return (EINVAL);
	}

	mutex_enter(&cpu_idle_prop_lock);
	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
		if (prop == hdl) {
			/* Release refcnt held in create_handle. */
			ASSERT(prop->refcnt > 1);
			atomic_dec_32(&prop->refcnt);
			rc = 0;
			break;
		}
	}
	mutex_exit(&cpu_idle_prop_lock);

	return (rc);
}

/* Return the declared type of the property behind hdl. */
cpu_idle_prop_type_t
cpu_idle_prop_get_type(cpu_idle_prop_handle_t hdl)
{
	ASSERT(hdl != NULL);
	return (((cpu_idle_prop_impl_t *)hdl)->type);
}

/* Return the name of the property behind hdl. */
const char *
cpu_idle_prop_get_name(cpu_idle_prop_handle_t hdl)
{
	ASSERT(hdl != NULL);
	return (((cpu_idle_prop_impl_t *)hdl)->name);
}

/*
 * Fetch the property value for the CPU identified by ctx into *valp,
 * first invoking the property's update hook (if any).  Returns 0 on
 * success, EINVAL on NULL parameters, or the update hook's error.
 */
int
cpu_idle_prop_get_value(cpu_idle_prop_handle_t hdl,
    cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t *valp)
{
	int idx, rc = 0;
	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;

	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
	if (hdl == NULL || valp == NULL) {
		cmn_err(CE_NOTE, "!cpu_event: NULL parameters in prop_get.");
		return (EINVAL);
	}
	idx = CPU_IDLE_CTX2IDX(ctx);
	if (prop->update != NULL) {
		cpu_idle_cb_state_t *sp;

		ASSERT(CPU->cpu_seqid < max_ncpus);
		sp = &cpu_idle_cb_state[CPU->cpu_seqid];
		/* CPU's idle enter timestamp as sequence number.
*/ 1023 rc = prop->update(prop->private, 1024 (uint64_t)sp->v.enter_ts->cipv_hrtime, &prop->value[idx]); 1025 } 1026 if (rc == 0) { 1027 *valp = prop->value[idx]; 1028 } 1029 1030 return (rc); 1031 } 1032 1033 uint32_t 1034 cpu_idle_prop_get_uint32(cpu_idle_prop_handle_t hdl, 1035 cpu_idle_callback_context_t ctx) 1036 { 1037 int idx; 1038 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl; 1039 1040 ASSERT(hdl != NULL); 1041 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus); 1042 idx = CPU_IDLE_CTX2IDX(ctx); 1043 return (prop->value[idx].cipv_uint32); 1044 } 1045 1046 uint64_t 1047 cpu_idle_prop_get_uint64(cpu_idle_prop_handle_t hdl, 1048 cpu_idle_callback_context_t ctx) 1049 { 1050 int idx; 1051 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl; 1052 1053 ASSERT(hdl != NULL); 1054 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus); 1055 idx = CPU_IDLE_CTX2IDX(ctx); 1056 return (prop->value[idx].cipv_uint64); 1057 } 1058 1059 intptr_t 1060 cpu_idle_prop_get_intptr(cpu_idle_prop_handle_t hdl, 1061 cpu_idle_callback_context_t ctx) 1062 { 1063 int idx; 1064 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl; 1065 1066 ASSERT(hdl != NULL); 1067 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus); 1068 idx = CPU_IDLE_CTX2IDX(ctx); 1069 return (prop->value[idx].cipv_intptr); 1070 } 1071 1072 hrtime_t 1073 cpu_idle_prop_get_hrtime(cpu_idle_prop_handle_t hdl, 1074 cpu_idle_callback_context_t ctx) 1075 { 1076 int idx; 1077 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl; 1078 1079 ASSERT(hdl != NULL); 1080 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus); 1081 idx = CPU_IDLE_CTX2IDX(ctx); 1082 return (prop->value[idx].cipv_hrtime); 1083 } 1084 1085 void 1086 cpu_idle_prop_set_value(cpu_idle_prop_handle_t hdl, 1087 cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t val) 1088 { 1089 int idx; 1090 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl; 1091 1092 ASSERT(hdl != NULL); 1093 ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus); 1094 idx = CPU_IDLE_CTX2IDX(ctx); 
1095 prop->value[idx] = val; 1096 } 1097 1098 void 1099 cpu_idle_prop_set_all(cpu_idle_prop_handle_t hdl, cpu_idle_prop_value_t val) 1100 { 1101 int i, idx; 1102 cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl; 1103 1104 ASSERT(hdl != NULL); 1105 for (i = 0; i < max_ncpus; i++) { 1106 idx = CPU_IDLE_CTX2IDX(i); 1107 prop->value[idx] = val; 1108 } 1109 } 1110 1111 /*ARGSUSED*/ 1112 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum, 1113 cpu_idle_prop_value_t *valp) 1114 { 1115 int i; 1116 uint64_t val; 1117 1118 for (val = 0, i = 0; i < PIL_MAX; i++) { 1119 val += CPU->cpu_stats.sys.intr[i]; 1120 } 1121 valp->cipv_uint64 = val; 1122 1123 return (0); 1124 } 1125 1126 uint_t 1127 cpu_idle_get_cpu_state(cpu_t *cp) 1128 { 1129 ASSERT(cp != NULL && cp->cpu_seqid < max_ncpus); 1130 return ((uint_t)cpu_idle_prop_get_uint32( 1131 cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle, 1132 CPU_IDLE_GET_CTX(cp))); 1133 } 1134 1135 #if defined(__x86) 1136 /* 1137 * Intercept CPU at a safe point in idle() before powering it off. 1138 */ 1139 void 1140 cpu_idle_intercept_cpu(cpu_t *cp) 1141 { 1142 ASSERT(cp->cpu_seqid < max_ncpus); 1143 ASSERT(cpu_idle_cb_state[cp->cpu_seqid].v.enabled == B_FALSE); 1144 1145 /* Set flag to intercept CPU. */ 1146 CPUSET_ATOMIC_ADD(cpu_idle_intercept_set, cp->cpu_id); 1147 /* Wake up CPU from possible sleep state. */ 1148 poke_cpu(cp->cpu_id); 1149 while (CPU_IN_SET(cpu_idle_intercept_set, cp->cpu_id)) { 1150 DELAY(1); 1151 } 1152 /* 1153 * Now target CPU is spinning in a pause loop with interrupts disabled. 1154 */ 1155 } 1156 #endif 1157