/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Support for determining the capacity and utilization of performance-relevant
 * hardware components in a computer
 *
 * THEORY
 * ------
 * The capacity and utilization of the performance-relevant hardware components
 * are needed to be able to optimize performance while minimizing the amount of
 * power used on a system.  The idea is to use hardware performance counters
 * and potentially other means to determine the capacity and utilization of
 * performance-relevant hardware components (e.g. execution pipeline, cache,
 * memory, etc.) and attribute the utilization to the responsible CPU and the
 * thread running there.
 *
 * This will help characterize the utilization of performance-relevant
 * components and how much is used by each CPU and each thread.  With that
 * data, the utilization can be aggregated across all the CPUs sharing each
 * performance-relevant hardware component to calculate the total utilization
 * of each component and compare that with the component's capacity to
 * essentially determine the actual hardware load of the component.  The
 * hardware utilization attributed to each running thread can also be
 * aggregated to determine the total hardware utilization of each component
 * attributable to a workload.
 *
 * Once that is done, one can determine how much of each performance-relevant
 * hardware component is needed by a given thread or set of threads (e.g. a
 * workload) and size up exactly what hardware is needed by the threads and how
 * much.  With this info, we can better place threads among CPUs to match their
 * exact hardware resource needs and potentially lower or raise the power based
 * on their utilization, or pack threads onto the fewest hardware components
 * needed and power off any remaining unused components to minimize power
 * without sacrificing performance.
 *
 * IMPLEMENTATION
 * --------------
 * The code has been designed and implemented to make (un)programming and
 * reading the counters for a given CPU as lightweight and fast as possible.
 * This is very important because we need to read and potentially (un)program
 * the counters very often and in performance-sensitive code.  Specifically,
 * the counters may need to be (un)programmed during context switch and/or a
 * cyclic handler when there are more counter events to count than existing
 * counters.
 *
 * Consequently, the code has been split up to allow allocating and
 * initializing everything needed to program and read the counters on a given
 * CPU once and make (un)programming and reading the counters for a given CPU
 * not have to allocate/free memory or grab any locks.  To do this, all the
 * state needed to (un)program and read the counters on a CPU is kept per CPU
 * and is made lock free by forcing any code that reads or manipulates the
 * counters or the state needed to (un)program or read the counters to run on
 * the target CPU and disable preemption while running on the target CPU to
 * protect any critical sections.  All counter manipulation on the target CPU
 * happens either from a cross-call to the target CPU or at the same PIL as
 * used by the cross-call subsystem.  This guarantees that counter manipulation
 * is not interrupted by cross-calls from other CPUs.
 *
 * The synchronization has been made lock free or as simple as possible for
 * performance and to avoid getting the locking all tangled up when we
 * interpose on the CPC routines that (un)program the counters to manage the
 * counters between the kernel and user on each CPU.  When the user starts
 * using the counters on a given CPU, the kernel will unprogram the counters
 * that it is using on that CPU just before they are programmed for the user.
 * Then the kernel will program the counters on a given CPU for its own use
 * when the user stops using them.
 *
 * There is a special interaction with the DTrace cpc provider (dcpc).  Before
 * dcpc enables any probe, it requests that all counters used for capacity and
 * utilization be disabled and unprogrammed.  These counters are not
 * re-programmed until dcpc completes.  When all DTrace cpc probes are removed,
 * dcpc notifies the CU framework, which re-programs the counters.
 *
 * When a CPU is going offline, its CU counters are unprogrammed and disabled,
 * so that they will not be re-programmed again by some other activity on the
 * CPU that is going offline.
 *
 * The counters are programmed during boot.  However, a flag is available to
 * disable this if necessary (see cu_flags below).  A handler is provided to
 * (un)program the counters during CPU on/offline.  Basic routines are provided
 * to initialize and tear down this module, initialize and tear down any state
 * needed for a given CPU, and (un)program the counters for a given CPU.
 * Lastly, a handler is provided to read the counters and attribute the
 * utilization to the responsible CPU.
 */
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/pghw.h>
#include <sys/cmt.h>
#include <sys/x_call.h>
#include <sys/cap_util.h>

#include <sys/archsystm.h>
#include <sys/promif.h>

#if defined(__x86)
#include <sys/xc_levels.h>
#endif


/*
 * Default CPU hardware performance counter flags to use for measuring capacity
 * and utilization
 */
#define	CU_CPC_FLAGS_DEFAULT	\
	(CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)

/*
 * Possible flags for controlling this module.
140 */ 141 #define CU_FLAG_ENABLE 1 /* Enable module */ 142 #define CU_FLAG_READY 2 /* Ready to setup module */ 143 #define CU_FLAG_ON 4 /* Module is on */ 144 145 /* 146 * pg_cpu kstats calculate utilization rate and maximum utilization rate for 147 * some CPUs. The rate is calculated based on data from two subsequent 148 * snapshots. When the time between such two snapshots is too small, the 149 * resulting rate may have low accuracy, so we only consider snapshots which 150 * are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not 151 * update the rate if the interval is smaller than that. 152 * 153 * Use one tenth of a second as the minimum interval for utilization rate 154 * calculation. 155 * 156 * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in 157 * the CU_RATE() macro below to guarantee that we never divide by zero. 158 * 159 * Rate is the number of events per second. The rate is the number of events 160 * divided by time and multiplied by the number of nanoseconds in a second. We 161 * do not want time to be too small since it will cause large errors in 162 * division. 163 * 164 * We do not want to multiply two large numbers (the instruction count and 165 * NANOSEC) either since it may cause integer overflow. So we divide both the 166 * numerator and the denominator by the same value. 167 * 168 * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN 169 * above to guarantee that time divided by this value is always non-zero. 170 */ 171 #define CU_RATE(val, time) \ 172 (((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE)) 173 174 #define CU_SAMPLE_INTERVAL_MIN (NANOSEC / 10) 175 176 #define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000) 177 178 /* 179 * When the time between two kstat reads for the same CPU is less than 180 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values 181 * for the CPU. This helps reduce cross-calls when kstat consumers read data 182 * very often or when they read PG utilization data and then CPU utilization 183 * data quickly after that. 184 */ 185 #define CU_UPDATE_THRESHOLD (NANOSEC / 10) 186 187 /* 188 * The IS_HIPIL() macro verifies that the code is executed either from a 189 * cross-call or from high-PIL interrupt 190 */ 191 #ifdef DEBUG 192 #define IS_HIPIL() (getpil() >= XCALL_PIL) 193 #else 194 #define IS_HIPIL() 195 #endif /* DEBUG */ 196 197 198 typedef void (*cu_cpu_func_t)(uintptr_t, int *); 199 200 201 /* 202 * Flags to use for programming CPU hardware performance counters to measure 203 * capacity and utilization 204 */ 205 int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT; 206 207 /* 208 * Initial value used for programming hardware counters 209 */ 210 uint64_t cu_cpc_preset_value = 0; 211 212 /* 213 * List of CPC event requests for capacity and utilization. 214 */ 215 static kcpc_request_list_t *cu_cpc_reqs = NULL; 216 217 /* 218 * When a CPU is a member of PG with a sharing relationship that is supported 219 * by the capacity/utilization framework, a kstat is created for that CPU and 220 * sharing relationship. 221 * 222 * These kstats are updated one at a time, so we can have a single scratch 223 * space to fill the data. 224 * 225 * CPU counter kstats fields: 226 * 227 * cu_cpu_id CPU ID for this kstat 228 * 229 * cu_generation Generation value that increases whenever any CPU goes 230 * offline or online. Two kstat snapshots for the same 231 * CPU may only be compared if they have the same 232 * generation. 
233 * 234 * cu_pg_id PG ID for the relationship described by this kstat 235 * 236 * cu_cpu_util Running value of CPU utilization for the sharing 237 * relationship 238 * 239 * cu_cpu_time_running Total time spent collecting CU data. The time may be 240 * less than wall time if CU counters were stopped for 241 * some time. 242 * 243 * cu_cpu_time_stopped Total time the CU counters were stopped. 244 * 245 * cu_cpu_rate Utilization rate, expressed in operations per second. 246 * 247 * cu_cpu_rate_max Maximum observed value of utilization rate. 248 */ 249 struct cu_cpu_kstat { 250 kstat_named_t cu_cpu_id; 251 kstat_named_t cu_generation; 252 kstat_named_t cu_pg_id; 253 kstat_named_t cu_cpu_util; 254 kstat_named_t cu_cpu_time_running; 255 kstat_named_t cu_cpu_time_stopped; 256 kstat_named_t cu_cpu_rate; 257 kstat_named_t cu_cpu_rate_max; 258 } cu_cpu_kstat = { 259 { "id", KSTAT_DATA_UINT32 }, 260 { "generation", KSTAT_DATA_UINT32 }, 261 { "pg_id", KSTAT_DATA_LONG }, 262 { "hw_util", KSTAT_DATA_UINT64 }, 263 { "hw_util_time_running", KSTAT_DATA_UINT64 }, 264 { "hw_util_time_stopped", KSTAT_DATA_UINT64 }, 265 { "hw_util_rate", KSTAT_DATA_UINT64 }, 266 { "hw_util_rate_max", KSTAT_DATA_UINT64 }, 267 }; 268 269 /* 270 * Flags for controlling this module 271 */ 272 uint_t cu_flags = CU_FLAG_ENABLE; 273 274 /* 275 * Error return value for cu_init() since it can't return anything to be called 276 * from mp_init_tbl[] (:-( 277 */ 278 static int cu_init_error = 0; 279 280 hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN; 281 282 hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD; 283 284 static kmutex_t pg_cpu_kstat_lock; 285 286 287 /* 288 * Forward declaration of interface routines 289 */ 290 void cu_disable(void); 291 void cu_enable(void); 292 void cu_init(void); 293 void cu_cpc_program(cpu_t *cp, int *err); 294 void cu_cpc_unprogram(cpu_t *cp, int *err); 295 int cu_cpu_update(struct cpu *cp, boolean_t move_to); 296 void cu_pg_update(pghw_t *pg); 297 298 299 /* 300 * Forward declaration of private routines 301 */ 302 static int cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs); 303 static void cu_cpc_program_xcall(uintptr_t arg, int *err); 304 static int cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, 305 int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents); 306 static int cu_cpu_callback(cpu_setup_t what, int id, void *arg); 307 static void cu_cpu_disable(cpu_t *cp); 308 static void cu_cpu_enable(cpu_t *cp); 309 static int cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs); 310 static int cu_cpu_fini(cpu_t *cp); 311 static void cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info); 312 static int cu_cpu_kstat_update(kstat_t *ksp, int rw); 313 static int cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg); 314 static int cu_cpu_update_stats(cu_cntr_stats_t *stats, 315 uint64_t cntr_value); 316 static void cu_cpu_info_detach_xcall(void); 317 318 /* 319 * Disable or enable Capacity Utilization counters on all CPUs. 
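 *
 * These walk the list of active CPUs and call cu_cpu_disable() or
 * cu_cpu_enable() for each CPU that is not flagged offline, with cpu_lock
 * held.  The per-CPU disable is reference counted (see cu_disabled below),
 * so each cu_disable() is expected to be balanced by a later cu_enable().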
320 */ 321 void 322 cu_disable(void) 323 { 324 cpu_t *cp; 325 326 ASSERT(MUTEX_HELD(&cpu_lock)); 327 328 cp = cpu_active; 329 do { 330 if (!(cp->cpu_flags & CPU_OFFLINE)) 331 cu_cpu_disable(cp); 332 } while ((cp = cp->cpu_next_onln) != cpu_active); 333 } 334 335 336 void 337 cu_enable(void) 338 { 339 cpu_t *cp; 340 341 ASSERT(MUTEX_HELD(&cpu_lock)); 342 343 cp = cpu_active; 344 do { 345 if (!(cp->cpu_flags & CPU_OFFLINE)) 346 cu_cpu_enable(cp); 347 } while ((cp = cp->cpu_next_onln) != cpu_active); 348 } 349 350 351 /* 352 * Setup capacity and utilization support 353 */ 354 void 355 cu_init(void) 356 { 357 cpu_t *cp; 358 359 cu_init_error = 0; 360 if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) { 361 cu_init_error = -1; 362 return; 363 } 364 365 if (kcpc_init() != 0) { 366 cu_init_error = -2; 367 return; 368 } 369 370 /* 371 * Can't measure hardware capacity and utilization without CPU 372 * hardware performance counters 373 */ 374 if (cpc_ncounters <= 0) { 375 cu_init_error = -3; 376 return; 377 } 378 379 /* 380 * Setup CPC event request queue 381 */ 382 cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP); 383 384 mutex_enter(&cpu_lock); 385 386 /* 387 * Mark flags to say that module is ready to be setup 388 */ 389 cu_flags |= CU_FLAG_READY; 390 391 cp = cpu_active; 392 do { 393 /* 394 * Allocate and setup state needed to measure capacity and 395 * utilization 396 */ 397 if (cu_cpu_init(cp, cu_cpc_reqs) != 0) 398 cu_init_error = -5; 399 400 /* 401 * Reset list of counter event requests so its space can be 402 * reused for a different set of requests for next CPU 403 */ 404 (void) kcpc_reqs_reset(cu_cpc_reqs); 405 406 cp = cp->cpu_next_onln; 407 } while (cp != cpu_active); 408 409 /* 410 * Mark flags to say that module is on now and counters are ready to be 411 * programmed on all active CPUs 412 */ 413 cu_flags |= CU_FLAG_ON; 414 415 /* 416 * Program counters on currently active CPUs 417 */ 418 cp = cpu_active; 419 do { 420 if (cu_cpu_run(cp, cu_cpc_program_xcall, 421 (uintptr_t)B_FALSE) != 0) 422 cu_init_error = -6; 423 424 cp = cp->cpu_next_onln; 425 } while (cp != cpu_active); 426 427 /* 428 * Register callback for CPU state changes to enable and disable 429 * CPC counters as CPUs come on and offline 430 */ 431 register_cpu_setup_func(cu_cpu_callback, NULL); 432 433 mutex_exit(&cpu_lock); 434 } 435 436 437 /* 438 * Return number of counter events needed to measure capacity and utilization 439 * for specified CPU and fill in list of CPC requests with each counter event 440 * needed if list where to add CPC requests is given 441 * 442 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free 443 * everything that has been successfully allocated if any memory 444 * allocation fails 445 */ 446 static int 447 cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs) 448 { 449 group_t *cmt_pgs; 450 cu_cntr_info_t **cntr_info_array; 451 cpu_pg_t *cpu_pgs; 452 cu_cpu_info_t *cu_cpu_info; 453 pg_cmt_t *pg_cmt; 454 pghw_t *pg_hw; 455 cu_cntr_stats_t *stats; 456 int nevents; 457 pghw_type_t pg_hw_type; 458 group_iter_t iter; 459 460 ASSERT(MUTEX_HELD(&cpu_lock)); 461 462 /* 463 * There has to be a target CPU for this 464 */ 465 if (cp == NULL) 466 return (-1); 467 468 /* 469 * Return 0 when CPU doesn't belong to any group 470 */ 471 cpu_pgs = cp->cpu_pg; 472 if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1) 473 return (0); 474 475 cmt_pgs = &cpu_pgs->cmt_pgs; 476 cu_cpu_info = cp->cpu_cu_info; 477 478 /* 479 * Grab counter statistics and info 480 */ 481 
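	/*
	 * A NULL request list means this is the sizing pass, where we only
	 * count how many counter events are needed.  When a request list is
	 * given, the counter stats and info must already have been allocated
	 * by cu_cpu_init().
	 */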
if (reqs == NULL) { 482 stats = NULL; 483 cntr_info_array = NULL; 484 } else { 485 if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL) 486 return (-2); 487 488 stats = cu_cpu_info->cu_cntr_stats; 489 cntr_info_array = cu_cpu_info->cu_cntr_info; 490 } 491 492 /* 493 * See whether platform (or processor) specific code knows which CPC 494 * events to request, etc. are needed to measure hardware capacity and 495 * utilization on this machine 496 */ 497 nevents = cu_plat_cpc_init(cp, reqs, nreqs); 498 if (nevents >= 0) 499 return (nevents); 500 501 /* 502 * Let common code decide which CPC events to request, etc. to measure 503 * capacity and utilization since platform (or processor) specific does 504 * not know.... 505 * 506 * Walk CPU's PG lineage and do following: 507 * 508 * - Setup CPC request, counter info, and stats needed for each counter 509 * event to measure capacity and and utilization for each of CPU's PG 510 * hardware sharing relationships 511 * 512 * - Create PG CPU kstats to export capacity and utilization for each PG 513 */ 514 nevents = 0; 515 group_iter_init(&iter); 516 while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) { 517 cu_cntr_info_t *cntr_info; 518 int nevents_save; 519 int nstats; 520 521 pg_hw = (pghw_t *)pg_cmt; 522 pg_hw_type = pg_hw->pghw_hw; 523 nevents_save = nevents; 524 nstats = 0; 525 526 switch (pg_hw_type) { 527 case PGHW_IPIPE: 528 if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats, 529 KM_NOSLEEP, &nevents) != 0) 530 continue; 531 nstats = 1; 532 break; 533 534 case PGHW_FPU: 535 if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats, 536 KM_NOSLEEP, &nevents) != 0) 537 continue; 538 nstats = 1; 539 break; 540 541 default: 542 /* 543 * Don't measure capacity and utilization for this kind 544 * of PG hardware relationship so skip to next PG in 545 * CPU's PG lineage 546 */ 547 continue; 548 } 549 550 cntr_info = cntr_info_array[pg_hw_type]; 551 552 /* 553 * Nothing to measure for this hardware sharing relationship 554 */ 555 if (nevents - nevents_save == 0) { 556 if (cntr_info != NULL) 557 kmem_free(cntr_info, sizeof (cu_cntr_info_t)); 558 cntr_info_array[pg_hw_type] = NULL; 559 continue; 560 } 561 562 /* 563 * Fill in counter info for this PG hardware relationship 564 */ 565 if (cntr_info == NULL) { 566 cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t), 567 KM_NOSLEEP); 568 if (cntr_info == NULL) 569 continue; 570 cntr_info_array[pg_hw_type] = cntr_info; 571 } 572 cntr_info->ci_cpu = cp; 573 cntr_info->ci_pg = pg_hw; 574 cntr_info->ci_stats = &stats[nevents_save]; 575 cntr_info->ci_nstats = nstats; 576 577 /* 578 * Create PG CPU kstats for this hardware relationship 579 */ 580 cu_cpu_kstat_create(pg_hw, cntr_info); 581 } 582 583 return (nevents); 584 } 585 586 587 /* 588 * Program counters for capacity and utilization on given CPU 589 * 590 * If any of the following conditions is true, the counters are not programmed: 591 * 592 * - CU framework is disabled 593 * - The cpu_cu_info field of the cpu structure is NULL 594 * - DTrace is active 595 * - Counters are programmed already 596 * - Counters are disabled (by calls to cu_cpu_disable()) 597 */ 598 void 599 cu_cpc_program(cpu_t *cp, int *err) 600 { 601 cu_cpc_ctx_t *cpu_ctx; 602 kcpc_ctx_t *ctx; 603 cu_cpu_info_t *cu_cpu_info; 604 605 ASSERT(IS_HIPIL()); 606 /* 607 * Should be running on given CPU. 
We disable preemption to keep CPU 608 * from disappearing and make sure flags and CPC context don't change 609 * from underneath us 610 */ 611 kpreempt_disable(); 612 ASSERT(cp == CPU); 613 614 /* 615 * Module not ready to program counters 616 */ 617 if (!(cu_flags & CU_FLAG_ON)) { 618 *err = -1; 619 kpreempt_enable(); 620 return; 621 } 622 623 if (cp == NULL) { 624 *err = -2; 625 kpreempt_enable(); 626 return; 627 } 628 629 cu_cpu_info = cp->cpu_cu_info; 630 if (cu_cpu_info == NULL) { 631 *err = -3; 632 kpreempt_enable(); 633 return; 634 } 635 636 /* 637 * If DTrace CPC is active or counters turned on already or are 638 * disabled, just return. 639 */ 640 if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) || 641 cu_cpu_info->cu_disabled) { 642 *err = 1; 643 kpreempt_enable(); 644 return; 645 } 646 647 if ((CPU->cpu_cpc_ctx != NULL) && 648 !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { 649 *err = -4; 650 kpreempt_enable(); 651 return; 652 } 653 654 /* 655 * Get CPU's CPC context needed for capacity and utilization 656 */ 657 cpu_ctx = &cu_cpu_info->cu_cpc_ctx; 658 ASSERT(cpu_ctx != NULL); 659 ASSERT(cpu_ctx->nctx >= 0); 660 661 ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0); 662 ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz); 663 if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || 664 cpu_ctx->ctx_ptr_array_sz <= 0) { 665 *err = -5; 666 kpreempt_enable(); 667 return; 668 } 669 670 /* 671 * Increment index in CPU's CPC context info to point at next context 672 * to program 673 * 674 * NOTE: Do this now instead of after programming counters to ensure 675 * that index will always point at *current* context so we will 676 * always be able to unprogram *current* context if necessary 677 */ 678 cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx; 679 680 ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; 681 682 /* 683 * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC 684 * context before programming counters 685 * 686 * Context is marked with KCPC_CTX_INVALID_STOPPED when context is 687 * unprogrammed and may be marked with KCPC_CTX_INVALID when 688 * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to 689 * invalidate all CPC contexts before they take over all the counters. 690 * 691 * This isn't necessary since these flags are only used for thread bound 692 * CPC contexts not CPU bound CPC contexts like ones used for capacity 693 * and utilization. 694 * 695 * There is no need to protect the flag update since no one is using 696 * this context now. 697 */ 698 ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED); 699 700 /* 701 * Program counters on this CPU 702 */ 703 kcpc_program(ctx, B_FALSE, B_FALSE); 704 705 cp->cpu_cpc_ctx = ctx; 706 707 /* 708 * Set state in CPU structure to say that CPU's counters are programmed 709 * for capacity and utilization now and that they are transitioning from 710 * off to on state. This will cause cu_cpu_update to update stop times 711 * for all programmed counters. 
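	 *
	 * While CU_CPU_CNTRS_OFF_ON is set, cu_cpu_update_stats() treats the
	 * value read from the counter as a new baseline (cs_value_start) and
	 * charges the elapsed time to cs_time_stopped instead of
	 * cs_time_running.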
712 */ 713 cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON; 714 715 /* 716 * Update counter statistics 717 */ 718 (void) cu_cpu_update(cp, B_FALSE); 719 720 cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON; 721 722 *err = 0; 723 kpreempt_enable(); 724 } 725 726 727 /* 728 * Cross call wrapper routine for cu_cpc_program() 729 * 730 * Checks to make sure that counters on CPU aren't being used by someone else 731 * before calling cu_cpc_program() since cu_cpc_program() needs to assert that 732 * nobody else is using the counters to catch and prevent any broken code. 733 * Also, this check needs to happen on the target CPU since the CPU's CPC 734 * context can only be changed while running on the CPU. 735 * 736 * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is 737 * no valid thread bound cpc context. This is important to check to prevent 738 * re-programming thread counters with CU counters when CPU is coming on-line. 739 */ 740 static void 741 cu_cpc_program_xcall(uintptr_t arg, int *err) 742 { 743 boolean_t avoid_thread_context = (boolean_t)arg; 744 745 kpreempt_disable(); 746 747 if (CPU->cpu_cpc_ctx != NULL && 748 !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { 749 *err = -100; 750 kpreempt_enable(); 751 return; 752 } 753 754 if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) && 755 !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { 756 *err = -200; 757 kpreempt_enable(); 758 return; 759 } 760 761 cu_cpc_program(CPU, err); 762 kpreempt_enable(); 763 } 764 765 766 /* 767 * Unprogram counters for capacity and utilization on given CPU 768 * This function should be always executed on the target CPU at high PIL 769 */ 770 void 771 cu_cpc_unprogram(cpu_t *cp, int *err) 772 { 773 cu_cpc_ctx_t *cpu_ctx; 774 kcpc_ctx_t *ctx; 775 cu_cpu_info_t *cu_cpu_info; 776 777 ASSERT(IS_HIPIL()); 778 /* 779 * Should be running on given CPU with preemption disabled to keep CPU 780 * from disappearing and make sure flags and CPC context don't change 781 * from underneath us 782 */ 783 kpreempt_disable(); 784 ASSERT(cp == CPU); 785 786 /* 787 * Module not on 788 */ 789 if (!(cu_flags & CU_FLAG_ON)) { 790 *err = -1; 791 kpreempt_enable(); 792 return; 793 } 794 795 cu_cpu_info = cp->cpu_cu_info; 796 if (cu_cpu_info == NULL) { 797 *err = -3; 798 kpreempt_enable(); 799 return; 800 } 801 802 /* 803 * Counters turned off already 804 */ 805 if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) { 806 *err = 1; 807 kpreempt_enable(); 808 return; 809 } 810 811 /* 812 * Update counter statistics 813 */ 814 (void) cu_cpu_update(cp, B_FALSE); 815 816 /* 817 * Get CPU's CPC context needed for capacity and utilization 818 */ 819 cpu_ctx = &cu_cpu_info->cu_cpc_ctx; 820 if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || 821 cpu_ctx->ctx_ptr_array_sz <= 0) { 822 *err = -5; 823 kpreempt_enable(); 824 return; 825 } 826 ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; 827 828 /* 829 * CPU's CPC context should be current capacity and utilization CPC 830 * context 831 */ 832 ASSERT(cp->cpu_cpc_ctx == ctx); 833 if (cp->cpu_cpc_ctx != ctx) { 834 *err = -6; 835 kpreempt_enable(); 836 return; 837 } 838 839 /* 840 * Unprogram counters on CPU. 
841 */ 842 kcpc_unprogram(ctx, B_FALSE); 843 844 ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); 845 846 /* 847 * Unset state in CPU structure saying that CPU's counters are 848 * programmed 849 */ 850 cp->cpu_cpc_ctx = NULL; 851 cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON; 852 853 *err = 0; 854 kpreempt_enable(); 855 } 856 857 858 /* 859 * Add given counter event to list of CPC requests 860 */ 861 static int 862 cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs, 863 cu_cntr_stats_t *stats, int kmem_flags, int *nevents) 864 { 865 int n; 866 int retval; 867 uint_t flags; 868 869 /* 870 * Return error when no counter event specified, counter event not 871 * supported by CPC's PCBE, or number of events not given 872 */ 873 if (event == NULL || kcpc_event_supported(event) == B_FALSE || 874 nevents == NULL) 875 return (-1); 876 877 n = *nevents; 878 879 /* 880 * Only count number of counter events needed if list 881 * where to add CPC requests not given 882 */ 883 if (reqs == NULL) { 884 n++; 885 *nevents = n; 886 return (-3); 887 } 888 889 /* 890 * Return error when stats not given or not enough room on list of CPC 891 * requests for more counter events 892 */ 893 if (stats == NULL || (nreqs <= 0 && n >= nreqs)) 894 return (-4); 895 896 /* 897 * Use flags in cu_cpc_flags to program counters and enable overflow 898 * interrupts/traps (unless PCBE can't handle overflow interrupts) so 899 * PCBE can catch counters before they wrap to hopefully give us an 900 * accurate (64-bit) virtualized counter 901 */ 902 flags = cu_cpc_flags; 903 if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0) 904 flags &= ~CPC_OVF_NOTIFY_EMT; 905 906 /* 907 * Add CPC request to list 908 */ 909 retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value, 910 flags, 0, NULL, &stats[n], kmem_flags); 911 912 if (retval != 0) 913 return (-5); 914 915 n++; 916 *nevents = n; 917 return (0); 918 } 919 920 static void 921 cu_cpu_info_detach_xcall(void) 922 { 923 ASSERT(IS_HIPIL()); 924 925 CPU->cpu_cu_info = NULL; 926 } 927 928 929 /* 930 * Enable or disable collection of capacity/utilization data for a current CPU. 931 * Counters are enabled if 'on' argument is True and disabled if it is False. 932 * This function should be always executed at high PIL 933 */ 934 static void 935 cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2) 936 { 937 cpu_t *cp = (cpu_t *)arg1; 938 boolean_t on = (boolean_t)arg2; 939 int error; 940 cu_cpu_info_t *cu_cpu_info; 941 942 ASSERT(IS_HIPIL()); 943 kpreempt_disable(); 944 ASSERT(cp == CPU); 945 946 if (!(cu_flags & CU_FLAG_ON)) { 947 kpreempt_enable(); 948 return; 949 } 950 951 cu_cpu_info = cp->cpu_cu_info; 952 if (cu_cpu_info == NULL) { 953 kpreempt_enable(); 954 return; 955 } 956 957 ASSERT(!cu_cpu_info->cu_disabled || 958 !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); 959 960 if (on) { 961 /* 962 * Decrement the cu_disabled counter. 963 * Once it drops to zero, call cu_cpc_program. 
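		 *
		 * cu_disabled is a nesting count: the dcpc provider and CPU
		 * offline processing may each have disabled the counters, so
		 * the counters are only re-programmed once every disabler has
		 * enabled them again.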
964 */ 965 if (cu_cpu_info->cu_disabled > 0) 966 cu_cpu_info->cu_disabled--; 967 if (cu_cpu_info->cu_disabled == 0) 968 cu_cpc_program(CPU, &error); 969 } else if (cu_cpu_info->cu_disabled++ == 0) { 970 /* 971 * This is the first attempt to disable CU, so turn it off 972 */ 973 cu_cpc_unprogram(cp, &error); 974 ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); 975 } 976 977 kpreempt_enable(); 978 } 979 980 981 /* 982 * Callback for changes in CPU states 983 * Used to enable or disable hardware performance counters on CPUs that are 984 * turned on or off 985 * 986 * NOTE: cpc should be programmed/unprogrammed while running on the target CPU. 987 * We have to use thread_affinity_set to hop to the right CPU because these 988 * routines expect cpu_lock held, so we can't cross-call other CPUs while 989 * holding CPU lock. 990 */ 991 static int 992 /* LINTED E_FUNC_ARG_UNUSED */ 993 cu_cpu_callback(cpu_setup_t what, int id, void *arg) 994 { 995 cpu_t *cp; 996 int retval = 0; 997 998 ASSERT(MUTEX_HELD(&cpu_lock)); 999 1000 if (!(cu_flags & CU_FLAG_ON)) 1001 return (-1); 1002 1003 cp = cpu_get(id); 1004 if (cp == NULL) 1005 return (-2); 1006 1007 switch (what) { 1008 case CPU_ON: 1009 /* 1010 * Setup counters on CPU being turned on 1011 */ 1012 retval = cu_cpu_init(cp, cu_cpc_reqs); 1013 1014 /* 1015 * Reset list of counter event requests so its space can be 1016 * reused for a different set of requests for next CPU 1017 */ 1018 (void) kcpc_reqs_reset(cu_cpc_reqs); 1019 break; 1020 case CPU_INTR_ON: 1021 /* 1022 * Setup counters on CPU being turned on. 1023 */ 1024 retval = cu_cpu_run(cp, cu_cpc_program_xcall, 1025 (uintptr_t)B_TRUE); 1026 break; 1027 case CPU_OFF: 1028 /* 1029 * Disable counters on CPU being turned off. Counters will not 1030 * be re-enabled on this CPU until it comes back online. 1031 */ 1032 cu_cpu_disable(cp); 1033 ASSERT(!CU_CPC_ON(cp)); 1034 retval = cu_cpu_fini(cp); 1035 break; 1036 default: 1037 break; 1038 } 1039 return (retval); 1040 } 1041 1042 1043 /* 1044 * Disable or enable Capacity Utilization counters on a given CPU. This function 1045 * can be called from any CPU to disable counters on the given CPU. 
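 *
 * cpu_call() runs cu_cpc_trigger() on the target CPU, which is expected to
 * execute at high PIL (see the IS_HIPIL() assertion there), satisfying the
 * rule that the counters are only manipulated on the CPU that owns them.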
1046 */ 1047 static void 1048 cu_cpu_disable(cpu_t *cp) 1049 { 1050 cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE); 1051 } 1052 1053 1054 static void 1055 cu_cpu_enable(cpu_t *cp) 1056 { 1057 cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE); 1058 } 1059 1060 1061 /* 1062 * Setup capacity and utilization support for given CPU 1063 * 1064 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free 1065 * everything that has been successfully allocated including cpu_cu_info 1066 * if any memory allocation fails 1067 */ 1068 static int 1069 cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs) 1070 { 1071 kcpc_ctx_t **ctx_ptr_array; 1072 size_t ctx_ptr_array_sz; 1073 cu_cpc_ctx_t *cpu_ctx; 1074 cu_cpu_info_t *cu_cpu_info; 1075 int n; 1076 1077 /* 1078 * cpu_lock should be held and protect against CPU going away and races 1079 * with cu_{init,fini,cpu_fini}() 1080 */ 1081 ASSERT(MUTEX_HELD(&cpu_lock)); 1082 1083 /* 1084 * Return if not ready to setup counters yet 1085 */ 1086 if (!(cu_flags & CU_FLAG_READY)) 1087 return (-1); 1088 1089 if (cp->cpu_cu_info == NULL) { 1090 cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t), 1091 KM_NOSLEEP); 1092 if (cp->cpu_cu_info == NULL) 1093 return (-2); 1094 } 1095 1096 /* 1097 * Get capacity and utilization CPC context for CPU and check to see 1098 * whether it has been setup already 1099 */ 1100 cu_cpu_info = cp->cpu_cu_info; 1101 cu_cpu_info->cu_cpu = cp; 1102 cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0; 1103 1104 cpu_ctx = &cu_cpu_info->cu_cpc_ctx; 1105 if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL && 1106 cpu_ctx->ctx_ptr_array_sz > 0) { 1107 return (1); 1108 } 1109 1110 /* 1111 * Should have no contexts since it hasn't been setup already 1112 */ 1113 ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL && 1114 cpu_ctx->ctx_ptr_array_sz == 0); 1115 1116 /* 1117 * Determine how many CPC events needed to measure capacity and 1118 * utilization for this CPU, allocate space for counter statistics for 1119 * each event, and fill in list of CPC event requests with corresponding 1120 * counter stats for each request to make attributing counter data 1121 * easier later.... 
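	 *
	 * This is done in two passes: the first cu_cpc_init() call below is
	 * given a NULL request list so it only counts the events needed; once
	 * the stats array has been allocated, the second call fills in the
	 * actual CPC requests.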
1122 */ 1123 n = cu_cpc_init(cp, NULL, 0); 1124 if (n <= 0) { 1125 (void) cu_cpu_fini(cp); 1126 return (-3); 1127 } 1128 1129 cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t), 1130 KM_NOSLEEP); 1131 if (cu_cpu_info->cu_cntr_stats == NULL) { 1132 (void) cu_cpu_fini(cp); 1133 return (-4); 1134 } 1135 1136 cu_cpu_info->cu_ncntr_stats = n; 1137 1138 n = cu_cpc_init(cp, reqs, n); 1139 if (n <= 0) { 1140 (void) cu_cpu_fini(cp); 1141 return (-5); 1142 } 1143 1144 /* 1145 * Create CPC context with given requests 1146 */ 1147 ctx_ptr_array = NULL; 1148 ctx_ptr_array_sz = 0; 1149 n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array, 1150 &ctx_ptr_array_sz); 1151 if (n <= 0) { 1152 (void) cu_cpu_fini(cp); 1153 return (-6); 1154 } 1155 1156 /* 1157 * Should have contexts 1158 */ 1159 ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0); 1160 if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) { 1161 (void) cu_cpu_fini(cp); 1162 return (-7); 1163 } 1164 1165 /* 1166 * Fill in CPC context info for CPU needed for capacity and utilization 1167 */ 1168 cpu_ctx->cur_index = 0; 1169 cpu_ctx->nctx = n; 1170 cpu_ctx->ctx_ptr_array = ctx_ptr_array; 1171 cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz; 1172 return (0); 1173 } 1174 1175 /* 1176 * Tear down capacity and utilization support for given CPU 1177 */ 1178 static int 1179 cu_cpu_fini(cpu_t *cp) 1180 { 1181 kcpc_ctx_t *ctx; 1182 cu_cpc_ctx_t *cpu_ctx; 1183 cu_cpu_info_t *cu_cpu_info; 1184 int i; 1185 pghw_type_t pg_hw_type; 1186 1187 /* 1188 * cpu_lock should be held and protect against CPU going away and races 1189 * with cu_{init,fini,cpu_init}() 1190 */ 1191 ASSERT(MUTEX_HELD(&cpu_lock)); 1192 1193 /* 1194 * Have to at least be ready to setup counters to have allocated 1195 * anything that needs to be deallocated now 1196 */ 1197 if (!(cu_flags & CU_FLAG_READY)) 1198 return (-1); 1199 1200 /* 1201 * Nothing to do if CPU's capacity and utilization info doesn't exist 1202 */ 1203 cu_cpu_info = cp->cpu_cu_info; 1204 if (cu_cpu_info == NULL) 1205 return (1); 1206 1207 /* 1208 * Tear down any existing kstats and counter info for each hardware 1209 * sharing relationship 1210 */ 1211 for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS; 1212 pg_hw_type++) { 1213 cu_cntr_info_t *cntr_info; 1214 1215 cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type]; 1216 if (cntr_info == NULL) 1217 continue; 1218 1219 if (cntr_info->ci_kstat != NULL) { 1220 kstat_delete(cntr_info->ci_kstat); 1221 cntr_info->ci_kstat = NULL; 1222 } 1223 kmem_free(cntr_info, sizeof (cu_cntr_info_t)); 1224 } 1225 1226 /* 1227 * Free counter statistics for CPU 1228 */ 1229 ASSERT(cu_cpu_info->cu_cntr_stats == NULL || 1230 cu_cpu_info->cu_ncntr_stats > 0); 1231 if (cu_cpu_info->cu_cntr_stats != NULL && 1232 cu_cpu_info->cu_ncntr_stats > 0) { 1233 kmem_free(cu_cpu_info->cu_cntr_stats, 1234 cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t)); 1235 cu_cpu_info->cu_cntr_stats = NULL; 1236 cu_cpu_info->cu_ncntr_stats = 0; 1237 } 1238 1239 /* 1240 * Get capacity and utilization CPC contexts for given CPU and check to 1241 * see whether they have been freed already 1242 */ 1243 cpu_ctx = &cu_cpu_info->cu_cpc_ctx; 1244 if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL && 1245 cpu_ctx->ctx_ptr_array_sz > 0) { 1246 /* 1247 * Free CPC contexts for given CPU 1248 */ 1249 for (i = 0; i < cpu_ctx->nctx; i++) { 1250 ctx = cpu_ctx->ctx_ptr_array[i]; 1251 if (ctx == NULL) 1252 continue; 1253 kcpc_free(ctx, 0); 1254 } 1255 1256 /* 1257 * Free CPC context pointer 
array 1258 */ 1259 kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz); 1260 1261 /* 1262 * Zero CPC info for CPU 1263 */ 1264 bzero(cpu_ctx, sizeof (cu_cpc_ctx_t)); 1265 } 1266 1267 /* 1268 * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure 1269 * that no one is going to access the cpu_cu_info whicch we are going to 1270 * free. 1271 */ 1272 if (cpu_is_online(cp)) 1273 cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0); 1274 else 1275 cp->cpu_cu_info = NULL; 1276 1277 /* 1278 * Free CPU's capacity and utilization info 1279 */ 1280 kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t)); 1281 1282 return (0); 1283 } 1284 1285 /* 1286 * Create capacity & utilization kstats for given PG CPU hardware sharing 1287 * relationship 1288 */ 1289 static void 1290 cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info) 1291 { 1292 char *class, *sh_name; 1293 kstat_t *ks; 1294 1295 /* 1296 * Just return when no counter info or CPU 1297 */ 1298 if (cntr_info == NULL || cntr_info->ci_cpu == NULL) 1299 return; 1300 1301 /* 1302 * Get the class name from the leaf PG that this CPU belongs to. 1303 * If there are no PGs, just use the default class "cpu". 1304 */ 1305 class = pg ? pghw_type_string(pg->pghw_hw) : "cpu"; 1306 sh_name = pg ? pghw_type_shortstring(pg->pghw_hw) : "cpu"; 1307 1308 if ((ks = kstat_create_zone("pg_cpu", cntr_info->ci_cpu->cpu_id, 1309 sh_name, class, KSTAT_TYPE_NAMED, 1310 sizeof (cu_cpu_kstat) / sizeof (kstat_named_t), 1311 KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL) 1312 return; 1313 1314 ks->ks_lock = &pg_cpu_kstat_lock; 1315 ks->ks_data = &cu_cpu_kstat; 1316 ks->ks_update = cu_cpu_kstat_update; 1317 1318 ks->ks_private = cntr_info; 1319 cntr_info->ci_kstat = ks; 1320 kstat_install(cntr_info->ci_kstat); 1321 } 1322 1323 1324 /* 1325 * Propagate values from CPU capacity & utilization stats to kstats 1326 */ 1327 static int 1328 cu_cpu_kstat_update(kstat_t *ksp, int rw) 1329 { 1330 cpu_t *cp; 1331 cu_cntr_info_t *cntr_info = ksp->ks_private; 1332 struct cu_cpu_kstat *kstat = &cu_cpu_kstat; 1333 pghw_t *pg; 1334 cu_cntr_stats_t *stats; 1335 1336 if (rw == KSTAT_WRITE) 1337 return (EACCES); 1338 1339 kpreempt_disable(); 1340 1341 /* 1342 * Update capacity and utilization statistics needed for CPU's PG (CPU) 1343 * kstats 1344 */ 1345 cp = cntr_info->ci_cpu; 1346 (void) cu_cpu_update(cp, B_TRUE); 1347 1348 pg = cntr_info->ci_pg; 1349 stats = cntr_info->ci_stats; 1350 kstat->cu_cpu_id.value.ui32 = cp->cpu_id; 1351 kstat->cu_generation.value.ui32 = cp->cpu_generation; 1352 if (pg == NULL) 1353 kstat->cu_pg_id.value.l = -1; 1354 else 1355 kstat->cu_pg_id.value.l = pg->pghw_pg.pg_id; 1356 1357 kstat->cu_cpu_util.value.ui64 = stats->cs_value_total; 1358 kstat->cu_cpu_rate.value.ui64 = stats->cs_rate; 1359 kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max; 1360 kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running; 1361 kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped; 1362 /* 1363 * Counters are stopped now, so the cs_time_stopped was last 1364 * updated at cs_time_start time. Add the time passed since then 1365 * to the stopped time. 
1366 */ 1367 if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) 1368 kstat->cu_cpu_time_stopped.value.ui64 += 1369 gethrtime() - stats->cs_time_start; 1370 1371 kpreempt_enable(); 1372 1373 return (0); 1374 } 1375 1376 /* 1377 * Run specified function with specified argument on a given CPU and return 1378 * whatever the function returns 1379 */ 1380 static int 1381 cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg) 1382 { 1383 int error = 0; 1384 1385 /* 1386 * cpu_call() will call func on the CPU specified with given argument 1387 * and return func's return value in last argument 1388 */ 1389 cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error); 1390 return (error); 1391 } 1392 1393 1394 /* 1395 * Update counter statistics on a given CPU. 1396 * 1397 * If move_to argument is True, execute the function on the CPU specified 1398 * Otherwise, assume that it is already runninng on the right CPU 1399 * 1400 * If move_to is specified, the caller should hold cpu_lock or have preemption 1401 * disabled. Otherwise it is up to the caller to guarantee that things do not 1402 * change in the process. 1403 */ 1404 int 1405 cu_cpu_update(struct cpu *cp, boolean_t move_to) 1406 { 1407 int retval; 1408 cu_cpu_info_t *cu_cpu_info = cp->cpu_cu_info; 1409 hrtime_t time_snap; 1410 1411 ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0); 1412 1413 /* 1414 * Nothing to do if counters are not programmed 1415 */ 1416 if (!(cu_flags & CU_FLAG_ON) || 1417 (cu_cpu_info == NULL) || 1418 !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) 1419 return (0); 1420 1421 /* 1422 * Don't update CPU statistics if it was updated recently 1423 * and provide old results instead 1424 */ 1425 time_snap = gethrtime(); 1426 if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) { 1427 DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp); 1428 return (0); 1429 } 1430 1431 cu_cpu_info->cu_sample_time = time_snap; 1432 1433 /* 1434 * CPC counter should be read on the CPU that is running the counter. We 1435 * either have to move ourselves to the target CPU or insure that we 1436 * already run there. 1437 * 1438 * We use cross-call to the target CPU to execute kcpc_read() and 1439 * cu_cpu_update_stats() there. 1440 */ 1441 retval = 0; 1442 if (move_to) 1443 (void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read, 1444 (uintptr_t)cu_cpu_update_stats); 1445 else { 1446 retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats); 1447 /* 1448 * Offset negative return value by -10 so we can distinguish it 1449 * from error return values of this routine vs kcpc_read() 1450 */ 1451 if (retval < 0) 1452 retval -= 10; 1453 } 1454 1455 return (retval); 1456 } 1457 1458 1459 /* 1460 * Update CPU counter statistics for current CPU. 1461 * This function may be called from a cross-call 1462 */ 1463 static int 1464 cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value) 1465 { 1466 cu_cpu_info_t *cu_cpu_info = CPU->cpu_cu_info; 1467 uint_t flags; 1468 uint64_t delta; 1469 hrtime_t time_delta; 1470 hrtime_t time_snap; 1471 1472 if (stats == NULL) 1473 return (-1); 1474 1475 /* 1476 * Nothing to do if counters are not programmed. This should not happen, 1477 * but we check just in case. 
1478 */ 1479 ASSERT(cu_flags & CU_FLAG_ON); 1480 ASSERT(cu_cpu_info != NULL); 1481 if (!(cu_flags & CU_FLAG_ON) || 1482 (cu_cpu_info == NULL)) 1483 return (-2); 1484 1485 flags = cu_cpu_info->cu_flag; 1486 ASSERT(flags & CU_CPU_CNTRS_ON); 1487 if (!(flags & CU_CPU_CNTRS_ON)) 1488 return (-2); 1489 1490 /* 1491 * Take snapshot of high resolution timer 1492 */ 1493 time_snap = gethrtime(); 1494 1495 /* 1496 * CU counters have just been programmed. We cannot assume that the new 1497 * cntr_value continues from where we left off, so use the cntr_value as 1498 * the new initial value. 1499 */ 1500 if (flags & CU_CPU_CNTRS_OFF_ON) 1501 stats->cs_value_start = cntr_value; 1502 1503 /* 1504 * Calculate delta in counter values between start of sampling period 1505 * and now 1506 */ 1507 delta = cntr_value - stats->cs_value_start; 1508 1509 /* 1510 * Calculate time between start of sampling period and now 1511 */ 1512 time_delta = stats->cs_time_start ? 1513 time_snap - stats->cs_time_start : 1514 0; 1515 stats->cs_time_start = time_snap; 1516 stats->cs_value_start = cntr_value; 1517 1518 if (time_delta > 0) { /* wrap shouldn't happen */ 1519 /* 1520 * Update either running or stopped time based on the transition 1521 * state 1522 */ 1523 if (flags & CU_CPU_CNTRS_OFF_ON) 1524 stats->cs_time_stopped += time_delta; 1525 else 1526 stats->cs_time_running += time_delta; 1527 } 1528 1529 /* 1530 * Update rest of counter statistics if counter value didn't wrap 1531 */ 1532 if (delta > 0) { 1533 /* 1534 * Update utilization rate if the interval between samples is 1535 * sufficient. 1536 */ 1537 ASSERT(cu_sample_interval_min > CU_SCALE); 1538 if (time_delta > cu_sample_interval_min) 1539 stats->cs_rate = CU_RATE(delta, time_delta); 1540 if (stats->cs_rate_max < stats->cs_rate) 1541 stats->cs_rate_max = stats->cs_rate; 1542 1543 stats->cs_value_last = delta; 1544 stats->cs_value_total += delta; 1545 } 1546 1547 return (0); 1548 } 1549 1550 /* 1551 * Update CMT PG utilization data. 1552 * 1553 * This routine computes the running total utilization and times for the 1554 * specified PG by adding up the total utilization and counter running and 1555 * stopped times of all CPUs in the PG and calculates the utilization rate and 1556 * maximum rate for all CPUs in the PG. 1557 */ 1558 void 1559 cu_pg_update(pghw_t *pg) 1560 { 1561 pg_cpu_itr_t cpu_iter; 1562 pghw_type_t pg_hwtype; 1563 cpu_t *cpu; 1564 pghw_util_t *hw_util = &pg->pghw_stats; 1565 uint64_t old_utilization = hw_util->pghw_util; 1566 hrtime_t now; 1567 hrtime_t time_delta; 1568 uint64_t utilization_delta; 1569 1570 ASSERT(MUTEX_HELD(&cpu_lock)); 1571 1572 now = gethrtime(); 1573 1574 pg_hwtype = pg->pghw_hw; 1575 1576 /* 1577 * Initialize running total utilization and times for PG to 0 1578 */ 1579 hw_util->pghw_util = 0; 1580 hw_util->pghw_time_running = 0; 1581 hw_util->pghw_time_stopped = 0; 1582 1583 /* 1584 * Iterate over all CPUs in the PG and aggregate utilization, running 1585 * time and stopped time. 
1586 */ 1587 PG_CPU_ITR_INIT(pg, cpu_iter); 1588 while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { 1589 cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info; 1590 cu_cntr_info_t *cntr_info; 1591 cu_cntr_stats_t *stats; 1592 1593 if (cu_cpu_info == NULL) 1594 continue; 1595 1596 /* 1597 * Update utilization data for the CPU and then 1598 * aggregate per CPU running totals for PG 1599 */ 1600 (void) cu_cpu_update(cpu, B_TRUE); 1601 cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype]; 1602 1603 if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL) 1604 continue; 1605 1606 hw_util->pghw_util += stats->cs_value_total; 1607 hw_util->pghw_time_running += stats->cs_time_running; 1608 hw_util->pghw_time_stopped += stats->cs_time_stopped; 1609 1610 /* 1611 * If counters are stopped now, the pg_time_stopped was last 1612 * updated at cs_time_start time. Add the time passed since then 1613 * to the stopped time. 1614 */ 1615 if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) 1616 hw_util->pghw_time_stopped += 1617 now - stats->cs_time_start; 1618 } 1619 1620 /* 1621 * Compute per PG instruction rate and maximum rate 1622 */ 1623 time_delta = now - hw_util->pghw_time_stamp; 1624 hw_util->pghw_time_stamp = now; 1625 1626 if (old_utilization == 0) 1627 return; 1628 1629 /* 1630 * Calculate change in utilization over sampling period and set this to 1631 * 0 if the delta would be 0 or negative which may happen if any CPUs go 1632 * offline during the sampling period 1633 */ 1634 if (hw_util->pghw_util > old_utilization) 1635 utilization_delta = hw_util->pghw_util - old_utilization; 1636 else 1637 utilization_delta = 0; 1638 1639 /* 1640 * Update utilization rate if the interval between samples is 1641 * sufficient. 1642 */ 1643 ASSERT(cu_sample_interval_min > CU_SCALE); 1644 if (time_delta > CU_SAMPLE_INTERVAL_MIN) 1645 hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta); 1646 1647 /* 1648 * Update the maximum observed rate 1649 */ 1650 if (hw_util->pghw_rate_max < hw_util->pghw_rate) 1651 hw_util->pghw_rate_max = hw_util->pghw_rate; 1652 } 1653
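
/*
 * Example (illustrative only, not part of this module): the pg_cpu kstats
 * created by cu_cpu_kstat_create() above can be read from userland with
 * libkstat.  The module:instance:name tuple is "pg_cpu":<cpu id>:<sharing
 * relationship short name>; the short name used below ("ipipe") is only an
 * assumed example of what pghw_type_shortstring() might return for the
 * integer pipeline relationship.
 *
 *	#include <kstat.h>
 *	#include <stdio.h>
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	if (kc != NULL) {
 *		kstat_t *ksp = kstat_lookup(kc, "pg_cpu", 0, "ipipe");
 *		if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *			kstat_named_t *kn;
 *
 *			kn = kstat_data_lookup(ksp, "hw_util_rate");
 *			if (kn != NULL)
 *				(void) printf("%llu ops/sec\n",
 *				    (u_longlong_t)kn->value.ui64);
 *		}
 *		(void) kstat_close(kc);
 *	}
 */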