1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * Support for determining capacity and utilization of performance relevant 28 * hardware components in a computer 29 * 30 * THEORY 31 * ------ 32 * The capacity and utilization of the performance relevant hardware components 33 * is needed to be able to optimize performance while minimizing the amount of 34 * power used on a system. The idea is to use hardware performance counters 35 * and potentially other means to determine the capacity and utilization of 36 * performance relevant hardware components (eg. execution pipeline, cache, 37 * memory, etc.) and attribute the utilization to the responsible CPU and the 38 * thread running there. 39 * 40 * This will help characterize the utilization of performance relevant 41 * components and how much is used by each CPU and each thread. 
With 42 * that data, the utilization can be aggregated to all the CPUs sharing each 43 * performance relevant hardware component to calculate the total utilization 44 * of each component and compare that with the component's capacity to 45 * essentially determine the actual hardware load of the component. The 46 * hardware utilization attributed to each running thread can also be 47 * aggregated to determine the total hardware utilization of each component to 48 * a workload. 49 * 50 * Once that is done, one can determine how much of each performance relevant 51 * hardware component is needed by a given thread or set of threads (eg. a 52 * workload) and size up exactly what hardware is needed by the threads and how 53 * much. With this info, we can better place threads among CPUs to match their 54 * exact hardware resource needs and potentially lower or raise the power based 55 * on their utilization or pack threads onto the fewest hardware components 56 * needed and power off any remaining unused components to minimize power 57 * without sacrificing performance. 58 * 59 * IMPLEMENTATION 60 * -------------- 61 * The code has been designed and implemented to make (un)programming and 62 * reading the counters for a given CPU as lightweight and fast as possible. 63 * This is very important because we need to read and potentially (un)program 64 * the counters very often and in performance sensitive code. Specifically, 65 * the counters may need to be (un)programmed during context switch and/or a 66 * cyclic handler when there are more counter events to count than existing 67 * counters. 68 * 69 * Consequently, the code has been split up to allow allocating and 70 * initializing everything needed to program and read the counters on a given 71 * CPU once and make (un)programming and reading the counters for a given CPU 72 * not have to allocate/free memory or grab any locks. 
To do this, all the 73 * state needed to (un)program and read the counters on a CPU is kept per CPU 74 * and is made lock free by forcing any code that reads or manipulates the 75 * counters or the state needed to (un)program or read the counters to run on 76 * the target CPU and disable preemption while running on the target CPU to 77 * protect any critical sections. All counter manipulation on the target CPU is 78 * happening either from a cross-call to the target CPU or at the same PIL as 79 * used by the cross-call subsystem. This guarantees that counter manipulation 80 * is not interrupted by cross-calls from other CPUs. 81 * 82 * The synchronization has been made lock free or as simple as possible for 83 * performance and to avoid getting the locking all tangled up when we interpose 84 * on the CPC routines that (un)program the counters to manage the counters 85 * between the kernel and user on each CPU. When the user starts using the 86 * counters on a given CPU, the kernel will unprogram the counters that it is 87 * using on that CPU just before they are programmed for the user. Then the 88 * kernel will program the counters on a given CPU for its own use when the user 89 * stops using them. 90 * 91 * There is a special interaction with DTrace cpc provider (dcpc). Before dcpc 92 * enables any probe, it requests to disable and unprogram all counters used for 93 * capacity and utilizations. These counters are never re-programmed back until 94 * dcpc completes. When all DTrace cpc probes are removed, dcpc notifies CU 95 * framework and it re-programs the counters. 96 * 97 * When a CPU is going offline, its CU counters are unprogrammed and disabled, 98 * so that they would not be re-programmed again by some other activity on the 99 * CPU that is going offline. 100 * 101 * The counters are programmed during boot. However, a flag is available to 102 * disable this if necessary (see cu_flag below). 
A handler is provided to
 * (un)program the counters during CPU on/offline. Basic routines are provided
 * to initialize and tear down this module, initialize and tear down any state
 * needed for a given CPU, and (un)program the counters for a given CPU.
 * Lastly, a handler is provided to read the counters and attribute the
 * utilization to the responsible CPU.
 */
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/ddi.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/pghw.h>
#include <sys/cmt.h>
#include <sys/policy.h>
#include <sys/x_call.h>
#include <sys/cap_util.h>

#include <sys/archsystm.h>
#include <sys/promif.h>

#if defined(__x86)
#include <sys/xc_levels.h>
#endif


/*
 * Default CPU hardware performance counter flags to use for measuring capacity
 * and utilization: count in both user and system mode, and ask the PCBE for
 * overflow notification so 64-bit virtualized counts stay accurate.
 */
#define	CU_CPC_FLAGS_DEFAULT \
	(CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)

/*
 * Possible Flags for controlling this module (kept in cu_flags below).
 */
#define	CU_FLAG_ENABLE		1	/* Enable module */
#define	CU_FLAG_READY		2	/* Ready to setup module */
#define	CU_FLAG_ON		4	/* Module is on */

/*
 * pg_cpu kstats calculate utilization rate and maximum utilization rate for
 * some CPUs. The rate is calculated based on data from two subsequent
 * snapshots. When the time between such two snapshots is too small, the
 * resulting rate may have low accuracy, so we only consider snapshots which
 * are separated by SAMPLE_INTERVAL nanoseconds from one another. We do not
 * update the rate if the interval is smaller than that.
 *
 * Use one tenth of a second as the minimum interval for utilization rate
 * calculation.
 *
 * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in
 * the CU_RATE() macro below to guarantee that we never divide by zero.
 *
 * Rate is the number of events per second. The rate is the number of events
 * divided by time and multiplied by the number of nanoseconds in a second. We
 * do not want time to be too small since it will cause large errors in
 * division.
 *
 * We do not want to multiply two large numbers (the instruction count and
 * NANOSEC) either since it may cause integer overflow. So we divide both the
 * numerator and the denominator by the same value (CU_SCALE).
 *
 * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
 * above to guarantee that time divided by this value is always non-zero.
 */
#define	CU_RATE(val, time) \
	(((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))

#define	CU_SAMPLE_INTERVAL_MIN	(NANOSEC / 10)

#define	CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000)

/*
 * When the time between two kstat reads for the same CPU is less than
 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values
 * for the CPU. This helps reduce cross-calls when kstat consumers read data
 * very often or when they read PG utilization data and then CPU utilization
 * data quickly after that.
 */
#define	CU_UPDATE_THRESHOLD (NANOSEC / 10)

/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt.
 *
 * In non-DEBUG builds it expands to nothing; in this file it is only ever used
 * inside ASSERT(), which compiles away in non-DEBUG builds as well.
 */
#ifdef DEBUG
#define	IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define	IS_HIPIL()
#endif	/* DEBUG */


/* Signature of per-CPU functions dispatched via cu_cpu_run()/cross-calls */
typedef void (*cu_cpu_func_t)(uintptr_t, int *);


/*
 * Flags to use for programming CPU hardware performance counters to measure
 * capacity and utilization
 */
int				cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;

/*
 * Initial value used for programming hardware counters
 */
uint64_t			cu_cpc_preset_value = 0;

/*
 * List of CPC event requests for capacity and utilization.
 */
static kcpc_request_list_t	*cu_cpc_reqs = NULL;

/*
 * When a CPU is a member of PG with a sharing relationship that is supported
 * by the capacity/utilization framework, a kstat is created for that CPU and
 * sharing relationship.
 *
 * These kstats are updated one at a time, so we can have a single scratch
 * space to fill the data.
 *
 * CPU counter kstats fields:
 *
 *  cu_cpu_id		CPU ID for this kstat
 *
 *  cu_pg_id		PG ID for the sharing relationship described by this
 *			  kstat
 *
 *  cu_generation	Generation value that increases whenever any CPU goes
 *			  offline or online. Two kstat snapshots for the same
 *			  CPU may only be compared if they have the same
 *			  generation.
 *
 *  cu_cpu_util		Running value of CPU utilization for the sharing
 *			  relationship
 *
 *  cu_cpu_time_running	Total time spent collecting CU data. The time may be
 *			  less than wall time if CU counters were stopped for
 *			  some time.
 *
 *  cu_cpu_time_stopped	Total time the CU counters were stopped.
 *
 *  cu_cpu_rate		Utilization rate, expressed in operations per second.
 *
 *  cu_cpu_rate_max	Maximum observed value of utilization rate.
 *
 *  cu_cpu_relationship	Name of sharing relationship for the PG in this kstat
 */
struct cu_cpu_kstat {
	kstat_named_t	cu_cpu_id;
	kstat_named_t	cu_pg_id;
	kstat_named_t	cu_generation;
	kstat_named_t	cu_cpu_util;
	kstat_named_t	cu_cpu_time_running;
	kstat_named_t	cu_cpu_time_stopped;
	kstat_named_t	cu_cpu_rate;
	kstat_named_t	cu_cpu_rate_max;
	kstat_named_t	cu_cpu_relationship;
} cu_cpu_kstat = {
	{ "cpu_id", KSTAT_DATA_UINT32 },
	{ "pg_id", KSTAT_DATA_INT32 },
	{ "generation", KSTAT_DATA_UINT32 },
	{ "hw_util", KSTAT_DATA_UINT64 },
	{ "hw_util_time_running", KSTAT_DATA_UINT64 },
	{ "hw_util_time_stopped", KSTAT_DATA_UINT64 },
	{ "hw_util_rate", KSTAT_DATA_UINT64 },
	{ "hw_util_rate_max", KSTAT_DATA_UINT64 },
	{ "relationship", KSTAT_DATA_STRING },
};

/*
 * Flags for controlling this module (see CU_FLAG_* above)
 */
uint_t				cu_flags = CU_FLAG_ENABLE;

/*
 * Error return value for cu_init() since it can't return anything to be called
 * from mp_init_tbl[] (:-(
 */
static int			cu_init_error = 0;

/*
 * Runtime (tunable) copies of the compile-time interval defaults above.
 */
hrtime_t			cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;

hrtime_t			cu_update_threshold = CU_UPDATE_THRESHOLD;

/*
 * NOTE(review): presumably serializes use of the shared cu_cpu_kstat scratch
 * space from the kstat update path (not visible in this chunk) — confirm.
 */
static kmutex_t			pg_cpu_kstat_lock;


/*
 * Forward declaration of interface routines
 */
void		cu_disable(void);
void		cu_enable(void);
void		cu_init(void);
void		cu_cpc_program(cpu_t *cp, int *err);
void		cu_cpc_unprogram(cpu_t *cp, int *err);
int		cu_cpu_update(struct cpu *cp, boolean_t move_to);
void		cu_pg_update(pghw_t *pg);


/*
 * Forward declaration of private routines
 */
static int	cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void	cu_cpc_program_xcall(uintptr_t arg, int *err);
static int	cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
		    int nreqs, cu_cntr_stats_t *stats, int kmem_flags,
		    int *nevents);
static int	cu_cpu_callback(cpu_setup_t what,
    int id, void *arg);
static void	cu_cpu_disable(cpu_t *cp);
static void	cu_cpu_enable(cpu_t *cp);
static int	cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int	cu_cpu_fini(cpu_t *cp);
static void	cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int	cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int	cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int	cu_cpu_update_stats(cu_cntr_stats_t *stats,
		    uint64_t cntr_value);
static void	cu_cpu_info_detach_xcall(void);

/*
 * Disable Capacity Utilization counters on all online CPUs.
 *
 * Caller must hold cpu_lock, which also keeps the online CPU list stable
 * while we walk it.
 */
void
cu_disable(void)
{
	cpu_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpu_active;
	do {
		if (!(cp->cpu_flags & CPU_OFFLINE))
			cu_cpu_disable(cp);
	} while ((cp = cp->cpu_next_onln) != cpu_active);
}


/*
 * Enable Capacity Utilization counters on all online CPUs.
 *
 * Caller must hold cpu_lock (see cu_disable() above).
 */
void
cu_enable(void)
{
	cpu_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpu_active;
	do {
		if (!(cp->cpu_flags & CPU_OFFLINE))
			cu_cpu_enable(cp);
	} while ((cp = cp->cpu_next_onln) != cpu_active);
}


/*
 * Setup capacity and utilization support
 *
 * Since this is called from mp_init_tbl[] it cannot return a value; any
 * failure is recorded in the static cu_init_error instead (negative value
 * identifies which step failed, 0 means success).
 */
void
cu_init(void)
{
	cpu_t	*cp;

	cu_init_error = 0;
	if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
		cu_init_error = -1;
		return;
	}

	if (kcpc_init() != 0) {
		cu_init_error = -2;
		return;
	}

	/*
	 * Can't measure hardware capacity and utilization without CPU
	 * hardware performance counters
	 */
	if (cpc_ncounters <= 0) {
		cu_init_error = -3;
		return;
	}

	/*
	 * Setup CPC event request queue
	 */
	cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);

	mutex_enter(&cpu_lock);

	/*
	 * Mark flags to say that module is ready to be setup
	 */
	cu_flags |= CU_FLAG_READY;

	cp = cpu_active;
	do {
		/*
		 * Allocate and setup state needed to measure capacity and
		 * utilization
		 */
		if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
			cu_init_error = -5;

		/*
		 * Reset list of counter event requests so its space can be
		 * reused for a different set of requests for next CPU
		 */
		(void) kcpc_reqs_reset(cu_cpc_reqs);

		cp = cp->cpu_next_onln;
	} while (cp != cpu_active);

	/*
	 * Mark flags to say that module is on now and counters are ready to be
	 * programmed on all active CPUs
	 */
	cu_flags |= CU_FLAG_ON;

	/*
	 * Program counters on currently active CPUs
	 */
	cp = cpu_active;
	do {
		if (cu_cpu_run(cp, cu_cpc_program_xcall,
		    (uintptr_t)B_FALSE) != 0)
			cu_init_error = -6;

		cp = cp->cpu_next_onln;
	} while (cp != cpu_active);

	/*
	 * Register callback for CPU state changes to enable and disable
	 * CPC counters as CPUs come on and offline
	 */
	register_cpu_setup_func(cu_cpu_callback, NULL);

	mutex_exit(&cpu_lock);
}


/*
 * Return number of counter events needed to measure capacity and utilization
 * for specified CPU and fill in list of CPC requests with each counter event
 * needed if list where to add CPC requests is given
 *
 * Called twice per CPU by cu_cpu_init(): once with reqs == NULL to count the
 * events needed, then again with the request list (and stats/cntr_info state
 * allocated based on the first pass) to actually fill it in.
 *
 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
 *	 everything that has been successfully allocated if any memory
 *	 allocation fails
 */
static int
cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
{
	group_t		*cmt_pgs;
	cu_cntr_info_t	**cntr_info_array;
	cpu_pg_t	*cpu_pgs;
	cu_cpu_info_t	*cu_cpu_info;
	pg_cmt_t	*pg_cmt;
	pghw_t		*pg_hw;
	cu_cntr_stats_t	*stats;
	int		nevents;
	pghw_type_t	pg_hw_type;
	group_iter_t	iter;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * There has to be a target CPU for this
	 */
	if (cp == NULL)
		return (-1);

	/*
	 * Return 0 when CPU doesn't belong to any group
	 */
	cpu_pgs = cp->cpu_pg;
	if (cpu_pgs == NULL
	    || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
		return (0);

	cmt_pgs = &cpu_pgs->cmt_pgs;
	cu_cpu_info = cp->cpu_cu_info;

	/*
	 * Grab counter statistics and info.  On the counting pass
	 * (reqs == NULL) both stay NULL; on the fill pass they must have been
	 * allocated by cu_cpu_init() already.
	 */
	if (reqs == NULL) {
		stats = NULL;
		cntr_info_array = NULL;
	} else {
		if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
			return (-2);

		stats = cu_cpu_info->cu_cntr_stats;
		cntr_info_array = cu_cpu_info->cu_cntr_info;
	}

	/*
	 * See whether platform (or processor) specific code knows which CPC
	 * events to request, etc. are needed to measure hardware capacity and
	 * utilization on this machine
	 */
	nevents = cu_plat_cpc_init(cp, reqs, nreqs);
	if (nevents >= 0)
		return (nevents);

	/*
	 * Let common code decide which CPC events to request, etc. to measure
	 * capacity and utilization since platform (or processor) specific does
	 * not know....
	 *
	 * Walk CPU's PG lineage and do following:
	 *
	 * - Setup CPC request, counter info, and stats needed for each counter
	 *   event to measure capacity and utilization for each of CPU's PG
	 *   hardware sharing relationships
	 *
	 * - Create PG CPU kstats to export capacity and utilization for each PG
	 */
	nevents = 0;
	group_iter_init(&iter);
	while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
		cu_cntr_info_t	*cntr_info;
		int		nevents_save;
		int		nstats;

		pg_hw = (pghw_t *)pg_cmt;
		pg_hw_type = pg_hw->pghw_hw;
		nevents_save = nevents;
		nstats = 0;

		/*
		 * NOTE: on the counting pass (reqs == NULL), cu_cpc_req_add()
		 * bumps *nevents and returns non-zero, so the `continue`
		 * branches below are taken and cntr_info_array (NULL on that
		 * pass) is never dereferenced.
		 */
		switch (pg_hw_type) {
		case PGHW_IPIPE:
			if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
			    KM_NOSLEEP, &nevents) != 0)
				continue;
			nstats = 1;
			break;

		case PGHW_FPU:
			if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
			    KM_NOSLEEP, &nevents) != 0)
				continue;
			nstats = 1;
			break;

		default:
			/*
			 * Don't measure capacity and utilization for this kind
			 * of PG hardware relationship so skip to next PG in
			 * CPU's PG lineage
			 */
			continue;
		}

		cntr_info = cntr_info_array[pg_hw_type];

		/*
		 * Nothing to measure for this hardware sharing relationship
		 */
		if (nevents - nevents_save == 0) {
			if (cntr_info != NULL) {
				kmem_free(cntr_info, sizeof (cu_cntr_info_t));
				cntr_info_array[pg_hw_type] = NULL;
			}
			continue;
		}

		/*
		 * Fill in counter info for this PG hardware relationship
		 */
		if (cntr_info == NULL) {
			cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
			    KM_NOSLEEP);
			if (cntr_info == NULL)
				continue;
			cntr_info_array[pg_hw_type] = cntr_info;
		}
		cntr_info->ci_cpu = cp;
		cntr_info->ci_pg = pg_hw;
		cntr_info->ci_stats = &stats[nevents_save];
		cntr_info->ci_nstats = nstats;

		/*
		 * Create PG CPU kstats for this hardware relationship
		 */
		cu_cpu_kstat_create(pg_hw, cntr_info);
	}

	return (nevents);
}


/*
 * Program counters for capacity and utilization on given CPU
 *
 * On return, *err is 0 on success, 1 when programming was benignly skipped,
 * and negative on error.
 *
 * If any of the following conditions is true, the counters are not programmed:
 *
 * - CU framework is disabled
 * - The cpu_cu_info field of the cpu structure is NULL
 * - DTrace is active
 * - Counters are programmed already
 * - Counters are disabled (by calls to cu_cpu_disable())
 */
void
cu_cpc_program(cpu_t *cp, int *err)
{
	cu_cpc_ctx_t	*cpu_ctx;
	kcpc_ctx_t	*ctx;
	cu_cpu_info_t	*cu_cpu_info;

	ASSERT(IS_HIPIL());
	/*
	 * Should be running on given CPU.
 * We disable preemption to keep CPU
	 * from disappearing and make sure flags and CPC context don't change
	 * from underneath us
	 */
	kpreempt_disable();
	ASSERT(cp == CPU);

	/*
	 * Module not ready to program counters
	 */
	if (!(cu_flags & CU_FLAG_ON)) {
		*err = -1;
		kpreempt_enable();
		return;
	}

	if (cp == NULL) {
		*err = -2;
		kpreempt_enable();
		return;
	}

	cu_cpu_info = cp->cpu_cu_info;
	if (cu_cpu_info == NULL) {
		*err = -3;
		kpreempt_enable();
		return;
	}

	/*
	 * If DTrace CPC is active or counters turned on already or are
	 * disabled, just return.
	 */
	if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
	    cu_cpu_info->cu_disabled) {
		*err = 1;
		kpreempt_enable();
		return;
	}

	/*
	 * Counters are in use by someone else (a live, thread-bound CPC
	 * context is installed on this CPU), so don't take them over.
	 */
	if ((CPU->cpu_cpc_ctx != NULL) &&
	    !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
		*err = -4;
		kpreempt_enable();
		return;
	}

	/*
	 * Get CPU's CPC context needed for capacity and utilization
	 */
	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
	ASSERT(cpu_ctx != NULL);
	ASSERT(cpu_ctx->nctx >= 0);

	ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0);
	ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
	if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
	    cpu_ctx->ctx_ptr_array_sz <= 0) {
		*err = -5;
		kpreempt_enable();
		return;
	}

	/*
	 * Increment index in CPU's CPC context info to point at next context
	 * to program
	 *
	 * NOTE: Do this now instead of after programming counters to ensure
	 *	 that index will always point at *current* context so we will
	 *	 always be able to unprogram *current* context if necessary
	 */
	cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;

	ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];

	/*
	 * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
	 *
 * context before programming counters
	 *
	 * Context is marked with KCPC_CTX_INVALID_STOPPED when context is
	 * unprogrammed and may be marked with KCPC_CTX_INVALID when
	 * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to
	 * invalidate all CPC contexts before they take over all the counters.
	 *
	 * This isn't necessary since these flags are only used for thread bound
	 * CPC contexts not CPU bound CPC contexts like ones used for capacity
	 * and utilization.
	 *
	 * There is no need to protect the flag update since no one is using
	 * this context now.
	 */
	ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);

	/*
	 * Program counters on this CPU
	 */
	kcpc_program(ctx, B_FALSE, B_FALSE);

	cp->cpu_cpc_ctx = ctx;

	/*
	 * Set state in CPU structure to say that CPU's counters are programmed
	 * for capacity and utilization now and that they are transitioning from
	 * off to on state. This will cause cu_cpu_update to update stop times
	 * for all programmed counters.
	 */
	cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;

	/*
	 * Update counter statistics
	 */
	(void) cu_cpu_update(cp, B_FALSE);

	cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;

	*err = 0;
	kpreempt_enable();
}


/*
 * Cross call wrapper routine for cu_cpc_program()
 *
 * Checks to make sure that counters on CPU aren't being used by someone else
 * before calling cu_cpc_program() since cu_cpc_program() needs to assert that
 * nobody else is using the counters to catch and prevent any broken code.
 * Also, this check needs to happen on the target CPU since the CPU's CPC
 * context can only be changed while running on the CPU.
 *
 * If the first argument is TRUE, cu_cpc_program_xcall also checks that there is
 * no valid thread bound cpc context.
This is important to check to prevent 746 * re-programming thread counters with CU counters when CPU is coming on-line. 747 */ 748 static void 749 cu_cpc_program_xcall(uintptr_t arg, int *err) 750 { 751 boolean_t avoid_thread_context = (boolean_t)arg; 752 753 kpreempt_disable(); 754 755 if (CPU->cpu_cpc_ctx != NULL && 756 !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { 757 *err = -100; 758 kpreempt_enable(); 759 return; 760 } 761 762 if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) && 763 !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) { 764 *err = -200; 765 kpreempt_enable(); 766 return; 767 } 768 769 cu_cpc_program(CPU, err); 770 kpreempt_enable(); 771 } 772 773 774 /* 775 * Unprogram counters for capacity and utilization on given CPU 776 * This function should be always executed on the target CPU at high PIL 777 */ 778 void 779 cu_cpc_unprogram(cpu_t *cp, int *err) 780 { 781 cu_cpc_ctx_t *cpu_ctx; 782 kcpc_ctx_t *ctx; 783 cu_cpu_info_t *cu_cpu_info; 784 785 ASSERT(IS_HIPIL()); 786 /* 787 * Should be running on given CPU with preemption disabled to keep CPU 788 * from disappearing and make sure flags and CPC context don't change 789 * from underneath us 790 */ 791 kpreempt_disable(); 792 ASSERT(cp == CPU); 793 794 /* 795 * Module not on 796 */ 797 if (!(cu_flags & CU_FLAG_ON)) { 798 *err = -1; 799 kpreempt_enable(); 800 return; 801 } 802 803 cu_cpu_info = cp->cpu_cu_info; 804 if (cu_cpu_info == NULL) { 805 *err = -3; 806 kpreempt_enable(); 807 return; 808 } 809 810 /* 811 * Counters turned off already 812 */ 813 if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) { 814 *err = 1; 815 kpreempt_enable(); 816 return; 817 } 818 819 /* 820 * Update counter statistics 821 */ 822 (void) cu_cpu_update(cp, B_FALSE); 823 824 /* 825 * Get CPU's CPC context needed for capacity and utilization 826 */ 827 cpu_ctx = &cu_cpu_info->cu_cpc_ctx; 828 if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL || 829 cpu_ctx->ctx_ptr_array_sz <= 0) { 830 
*err = -5; 831 kpreempt_enable(); 832 return; 833 } 834 ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index]; 835 836 /* 837 * CPU's CPC context should be current capacity and utilization CPC 838 * context 839 */ 840 ASSERT(cp->cpu_cpc_ctx == ctx); 841 if (cp->cpu_cpc_ctx != ctx) { 842 *err = -6; 843 kpreempt_enable(); 844 return; 845 } 846 847 /* 848 * Unprogram counters on CPU. 849 */ 850 kcpc_unprogram(ctx, B_FALSE); 851 852 ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED); 853 854 /* 855 * Unset state in CPU structure saying that CPU's counters are 856 * programmed 857 */ 858 cp->cpu_cpc_ctx = NULL; 859 cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON; 860 861 *err = 0; 862 kpreempt_enable(); 863 } 864 865 866 /* 867 * Add given counter event to list of CPC requests 868 */ 869 static int 870 cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs, 871 cu_cntr_stats_t *stats, int kmem_flags, int *nevents) 872 { 873 int n; 874 int retval; 875 uint_t flags; 876 877 /* 878 * Return error when no counter event specified, counter event not 879 * supported by CPC's PCBE, or number of events not given 880 */ 881 if (event == NULL || kcpc_event_supported(event) == B_FALSE || 882 nevents == NULL) 883 return (-1); 884 885 n = *nevents; 886 887 /* 888 * Only count number of counter events needed if list 889 * where to add CPC requests not given 890 */ 891 if (reqs == NULL) { 892 n++; 893 *nevents = n; 894 return (-3); 895 } 896 897 /* 898 * Return error when stats not given or not enough room on list of CPC 899 * requests for more counter events 900 */ 901 if (stats == NULL || (nreqs <= 0 && n >= nreqs)) 902 return (-4); 903 904 /* 905 * Use flags in cu_cpc_flags to program counters and enable overflow 906 * interrupts/traps (unless PCBE can't handle overflow interrupts) so 907 * PCBE can catch counters before they wrap to hopefully give us an 908 * accurate (64-bit) virtualized counter 909 */ 910 flags = cu_cpc_flags; 911 if ((kcpc_pcbe_capabilities() & 
CPC_CAP_OVERFLOW_INTERRUPT) == 0) 912 flags &= ~CPC_OVF_NOTIFY_EMT; 913 914 /* 915 * Add CPC request to list 916 */ 917 retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value, 918 flags, 0, NULL, &stats[n], kmem_flags); 919 920 if (retval != 0) 921 return (-5); 922 923 n++; 924 *nevents = n; 925 return (0); 926 } 927 928 static void 929 cu_cpu_info_detach_xcall(void) 930 { 931 ASSERT(IS_HIPIL()); 932 933 CPU->cpu_cu_info = NULL; 934 } 935 936 937 /* 938 * Enable or disable collection of capacity/utilization data for a current CPU. 939 * Counters are enabled if 'on' argument is True and disabled if it is False. 940 * This function should be always executed at high PIL 941 */ 942 static void 943 cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2) 944 { 945 cpu_t *cp = (cpu_t *)arg1; 946 boolean_t on = (boolean_t)arg2; 947 int error; 948 cu_cpu_info_t *cu_cpu_info; 949 950 ASSERT(IS_HIPIL()); 951 kpreempt_disable(); 952 ASSERT(cp == CPU); 953 954 if (!(cu_flags & CU_FLAG_ON)) { 955 kpreempt_enable(); 956 return; 957 } 958 959 cu_cpu_info = cp->cpu_cu_info; 960 if (cu_cpu_info == NULL) { 961 kpreempt_enable(); 962 return; 963 } 964 965 ASSERT(!cu_cpu_info->cu_disabled || 966 !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); 967 968 if (on) { 969 /* 970 * Decrement the cu_disabled counter. 971 * Once it drops to zero, call cu_cpc_program. 972 */ 973 if (cu_cpu_info->cu_disabled > 0) 974 cu_cpu_info->cu_disabled--; 975 if (cu_cpu_info->cu_disabled == 0) 976 cu_cpc_program(CPU, &error); 977 } else if (cu_cpu_info->cu_disabled++ == 0) { 978 /* 979 * This is the first attempt to disable CU, so turn it off 980 */ 981 cu_cpc_unprogram(cp, &error); 982 ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)); 983 } 984 985 kpreempt_enable(); 986 } 987 988 989 /* 990 * Callback for changes in CPU states 991 * Used to enable or disable hardware performance counters on CPUs that are 992 * turned on or off 993 * 994 * NOTE: cpc should be programmed/unprogrammed while running on the target CPU. 
 * We have to use thread_affinity_set to hop to the right CPU because these
 * routines expect cpu_lock held, so we can't cross-call other CPUs while
 * holding CPU lock.
 */
static int
/* LINTED E_FUNC_ARG_UNUSED */
cu_cpu_callback(cpu_setup_t what, int id, void *arg)
{
	cpu_t	*cp;
	int	retval = 0;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!(cu_flags & CU_FLAG_ON))
		return (-1);

	cp = cpu_get(id);
	if (cp == NULL)
		return (-2);

	switch (what) {
	case CPU_ON:
		/*
		 * Setup counters on CPU being turned on
		 */
		retval = cu_cpu_init(cp, cu_cpc_reqs);

		/*
		 * Reset list of counter event requests so its space can be
		 * reused for a different set of requests for next CPU
		 */
		(void) kcpc_reqs_reset(cu_cpc_reqs);
		break;
	case CPU_INTR_ON:
		/*
		 * Setup counters on CPU being turned on.
		 * B_TRUE makes cu_cpc_program_xcall refuse to clobber a live
		 * thread-bound CPC context on the CPU coming online.
		 */
		retval = cu_cpu_run(cp, cu_cpc_program_xcall,
		    (uintptr_t)B_TRUE);
		break;
	case CPU_OFF:
		/*
		 * Disable counters on CPU being turned off. Counters will not
		 * be re-enabled on this CPU until it comes back online.
		 */
		cu_cpu_disable(cp);
		ASSERT(!CU_CPC_ON(cp));
		retval = cu_cpu_fini(cp);
		break;
	default:
		/* Other CPU state transitions are of no interest here */
		break;
	}
	return (retval);
}


/*
 * Disable or enable Capacity Utilization counters on a given CPU. This function
 * can be called from any CPU to disable counters on the given CPU.
 */
static void
cu_cpu_disable(cpu_t *cp)
{
	/* cpu_call() runs cu_cpc_trigger on the target CPU at high PIL */
	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
}


/*
 * Enable Capacity Utilization counters on a given CPU (counterpart of
 * cu_cpu_disable() above; may be called from any CPU).
 */
static void
cu_cpu_enable(cpu_t *cp)
{
	cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
}


/*
 * Setup capacity and utilization support for given CPU
 *
 * Returns 0 on success, 1 when the CPU was set up already, and a negative
 * value on failure.  On failure, cu_cpu_fini() is used to free everything
 * allocated so far, including cp->cpu_cu_info itself.
 *
 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
 *	 everything that has been successfully allocated including cpu_cu_info
 *	 if any memory allocation fails
 */
static int
cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
{
	kcpc_ctx_t	**ctx_ptr_array;
	size_t		ctx_ptr_array_sz;
	cu_cpc_ctx_t	*cpu_ctx;
	cu_cpu_info_t	*cu_cpu_info;
	int		n;

	/*
	 * cpu_lock should be held and protect against CPU going away and races
	 * with cu_{init,fini,cpu_fini}()
	 */
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Return if not ready to setup counters yet
	 */
	if (!(cu_flags & CU_FLAG_READY))
		return (-1);

	if (cp->cpu_cu_info == NULL) {
		cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
		    KM_NOSLEEP);
		if (cp->cpu_cu_info == NULL)
			return (-2);
	}

	/*
	 * Get capacity and utilization CPC context for CPU and check to see
	 * whether it has been setup already.
	 *
	 * Start out disabled if DTrace CPC already owns the counters; the
	 * disable count is dropped again when dcpc finishes (see the block
	 * comment at the top of this file).
	 */
	cu_cpu_info = cp->cpu_cu_info;
	cu_cpu_info->cu_cpu = cp;
	cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;

	cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
	if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
	    cpu_ctx->ctx_ptr_array_sz > 0) {
		return (1);
	}

	/*
	 * Should have no contexts since it hasn't been setup already
	 */
	ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
	    cpu_ctx->ctx_ptr_array_sz == 0);

	/*
	 * Determine how many CPC events needed to measure capacity and
	 * utilization for this CPU, allocate space for counter statistics for
	 * each event, and fill in list of CPC event requests with corresponding
	 * counter stats for each request to make attributing counter data
	 * easier later....
	 */
	n = cu_cpc_init(cp, NULL, 0);
	if (n <= 0) {
		(void) cu_cpu_fini(cp);
		return (-3);
	}

	cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
	    KM_NOSLEEP);
	if (cu_cpu_info->cu_cntr_stats == NULL) {
		(void) cu_cpu_fini(cp);
		return (-4);
	}

	cu_cpu_info->cu_ncntr_stats = n;

	/* Second (fill) pass: capacity is the count from the first pass */
	n = cu_cpc_init(cp, reqs, n);
	if (n <= 0) {
		(void) cu_cpu_fini(cp);
		return (-5);
	}

	/*
	 * Create CPC context with given requests
	 */
	ctx_ptr_array = NULL;
	ctx_ptr_array_sz = 0;
	n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
	    &ctx_ptr_array_sz);
	if (n <= 0) {
		(void) cu_cpu_fini(cp);
		return (-6);
	}

	/*
	 * Should have contexts
	 */
	ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
	if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
		(void) cu_cpu_fini(cp);
		return (-7);
	}

	/*
	 * Fill in CPC context info for CPU needed for capacity and utilization
	 */
	cpu_ctx->cur_index = 0;
	cpu_ctx->nctx = n;
	cpu_ctx->ctx_ptr_array = ctx_ptr_array;
	cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
	return (0);
}

/*
 * Tear down capacity
and utilization support for given CPU 1185 */ 1186 static int 1187 cu_cpu_fini(cpu_t *cp) 1188 { 1189 kcpc_ctx_t *ctx; 1190 cu_cpc_ctx_t *cpu_ctx; 1191 cu_cpu_info_t *cu_cpu_info; 1192 int i; 1193 pghw_type_t pg_hw_type; 1194 1195 /* 1196 * cpu_lock should be held and protect against CPU going away and races 1197 * with cu_{init,fini,cpu_init}() 1198 */ 1199 ASSERT(MUTEX_HELD(&cpu_lock)); 1200 1201 /* 1202 * Have to at least be ready to setup counters to have allocated 1203 * anything that needs to be deallocated now 1204 */ 1205 if (!(cu_flags & CU_FLAG_READY)) 1206 return (-1); 1207 1208 /* 1209 * Nothing to do if CPU's capacity and utilization info doesn't exist 1210 */ 1211 cu_cpu_info = cp->cpu_cu_info; 1212 if (cu_cpu_info == NULL) 1213 return (1); 1214 1215 /* 1216 * Tear down any existing kstats and counter info for each hardware 1217 * sharing relationship 1218 */ 1219 for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS; 1220 pg_hw_type++) { 1221 cu_cntr_info_t *cntr_info; 1222 1223 cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type]; 1224 if (cntr_info == NULL) 1225 continue; 1226 1227 if (cntr_info->ci_kstat != NULL) { 1228 kstat_delete(cntr_info->ci_kstat); 1229 cntr_info->ci_kstat = NULL; 1230 } 1231 kmem_free(cntr_info, sizeof (cu_cntr_info_t)); 1232 } 1233 1234 /* 1235 * Free counter statistics for CPU 1236 */ 1237 ASSERT(cu_cpu_info->cu_cntr_stats == NULL || 1238 cu_cpu_info->cu_ncntr_stats > 0); 1239 if (cu_cpu_info->cu_cntr_stats != NULL && 1240 cu_cpu_info->cu_ncntr_stats > 0) { 1241 kmem_free(cu_cpu_info->cu_cntr_stats, 1242 cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t)); 1243 cu_cpu_info->cu_cntr_stats = NULL; 1244 cu_cpu_info->cu_ncntr_stats = 0; 1245 } 1246 1247 /* 1248 * Get capacity and utilization CPC contexts for given CPU and check to 1249 * see whether they have been freed already 1250 */ 1251 cpu_ctx = &cu_cpu_info->cu_cpc_ctx; 1252 if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL && 1253 
cpu_ctx->ctx_ptr_array_sz > 0) { 1254 /* 1255 * Free CPC contexts for given CPU 1256 */ 1257 for (i = 0; i < cpu_ctx->nctx; i++) { 1258 ctx = cpu_ctx->ctx_ptr_array[i]; 1259 if (ctx == NULL) 1260 continue; 1261 kcpc_free(ctx, 0); 1262 } 1263 1264 /* 1265 * Free CPC context pointer array 1266 */ 1267 kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz); 1268 1269 /* 1270 * Zero CPC info for CPU 1271 */ 1272 bzero(cpu_ctx, sizeof (cu_cpc_ctx_t)); 1273 } 1274 1275 /* 1276 * Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure 1277 * that no one is going to access the cpu_cu_info whicch we are going to 1278 * free. 1279 */ 1280 if (cpu_is_online(cp)) 1281 cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0); 1282 else 1283 cp->cpu_cu_info = NULL; 1284 1285 /* 1286 * Free CPU's capacity and utilization info 1287 */ 1288 kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t)); 1289 1290 return (0); 1291 } 1292 1293 /* 1294 * Create capacity & utilization kstats for given PG CPU hardware sharing 1295 * relationship 1296 */ 1297 static void 1298 cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info) 1299 { 1300 kstat_t *ks; 1301 char *sharing = pghw_type_string(pg->pghw_hw); 1302 char name[KSTAT_STRLEN + 1]; 1303 1304 /* 1305 * Just return when no counter info or CPU 1306 */ 1307 if (cntr_info == NULL || cntr_info->ci_cpu == NULL) 1308 return; 1309 1310 /* 1311 * Canonify PG name to conform to kstat name rules 1312 */ 1313 (void) strncpy(name, pghw_type_string(pg->pghw_hw), KSTAT_STRLEN + 1); 1314 strident_canon(name, TASKQ_NAMELEN + 1); 1315 1316 if ((ks = kstat_create_zone("pg_hw_perf_cpu", 1317 cntr_info->ci_cpu->cpu_id, 1318 name, "processor_group", KSTAT_TYPE_NAMED, 1319 sizeof (cu_cpu_kstat) / sizeof (kstat_named_t), 1320 KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL) 1321 return; 1322 1323 ks->ks_lock = &pg_cpu_kstat_lock; 1324 ks->ks_data = &cu_cpu_kstat; 1325 ks->ks_update = cu_cpu_kstat_update; 1326 ks->ks_data_size += strlen(sharing) + 
1; 1327 1328 ks->ks_private = cntr_info; 1329 cntr_info->ci_kstat = ks; 1330 kstat_install(cntr_info->ci_kstat); 1331 } 1332 1333 1334 /* 1335 * Propagate values from CPU capacity & utilization stats to kstats 1336 */ 1337 static int 1338 cu_cpu_kstat_update(kstat_t *ksp, int rw) 1339 { 1340 cpu_t *cp; 1341 cu_cntr_info_t *cntr_info = ksp->ks_private; 1342 struct cu_cpu_kstat *kstat = &cu_cpu_kstat; 1343 pghw_t *pg; 1344 cu_cntr_stats_t *stats; 1345 1346 if (rw == KSTAT_WRITE) 1347 return (EACCES); 1348 1349 cp = cntr_info->ci_cpu; 1350 pg = cntr_info->ci_pg; 1351 kstat->cu_cpu_id.value.ui32 = cp->cpu_id; 1352 kstat->cu_pg_id.value.i32 = ((pg_t *)pg)->pg_id; 1353 1354 /* 1355 * The caller should have priv_cpc_cpu privilege to get utilization 1356 * data. Callers who do not have the privilege will see zeroes as the 1357 * values. 1358 */ 1359 if (secpolicy_cpc_cpu(crgetcred()) != 0) { 1360 kstat->cu_generation.value.ui32 = cp->cpu_generation; 1361 kstat_named_setstr(&kstat->cu_cpu_relationship, 1362 pghw_type_string(pg->pghw_hw)); 1363 1364 kstat->cu_cpu_util.value.ui64 = 0; 1365 kstat->cu_cpu_rate.value.ui64 = 0; 1366 kstat->cu_cpu_rate_max.value.ui64 = 0; 1367 kstat->cu_cpu_time_running.value.ui64 = 0; 1368 kstat->cu_cpu_time_stopped.value.ui64 = 0; 1369 1370 return (0); 1371 } 1372 1373 kpreempt_disable(); 1374 1375 /* 1376 * Update capacity and utilization statistics needed for CPU's PG (CPU) 1377 * kstats 1378 */ 1379 1380 (void) cu_cpu_update(cp, B_TRUE); 1381 1382 stats = cntr_info->ci_stats; 1383 kstat->cu_generation.value.ui32 = cp->cpu_generation; 1384 kstat_named_setstr(&kstat->cu_cpu_relationship, 1385 pghw_type_string(pg->pghw_hw)); 1386 1387 kstat->cu_cpu_util.value.ui64 = stats->cs_value_total; 1388 kstat->cu_cpu_rate.value.ui64 = stats->cs_rate; 1389 kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max; 1390 kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running; 1391 kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped; 1392 1393 
/* 1394 * Counters are stopped now, so the cs_time_stopped was last 1395 * updated at cs_time_start time. Add the time passed since then 1396 * to the stopped time. 1397 */ 1398 if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON)) 1399 kstat->cu_cpu_time_stopped.value.ui64 += 1400 gethrtime() - stats->cs_time_start; 1401 1402 kpreempt_enable(); 1403 1404 return (0); 1405 } 1406 1407 /* 1408 * Run specified function with specified argument on a given CPU and return 1409 * whatever the function returns 1410 */ 1411 static int 1412 cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg) 1413 { 1414 int error = 0; 1415 1416 /* 1417 * cpu_call() will call func on the CPU specified with given argument 1418 * and return func's return value in last argument 1419 */ 1420 cpu_call(cp, (cpu_call_func_t)func, arg, (uintptr_t)&error); 1421 return (error); 1422 } 1423 1424 1425 /* 1426 * Update counter statistics on a given CPU. 1427 * 1428 * If move_to argument is True, execute the function on the CPU specified 1429 * Otherwise, assume that it is already runninng on the right CPU 1430 * 1431 * If move_to is specified, the caller should hold cpu_lock or have preemption 1432 * disabled. Otherwise it is up to the caller to guarantee that things do not 1433 * change in the process. 
 */
int
cu_cpu_update(struct cpu *cp, boolean_t move_to)
{
	int		retval;
	cu_cpu_info_t	*cu_cpu_info = cp->cpu_cu_info;
	hrtime_t	time_snap;

	ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);

	/*
	 * Nothing to do if counters are not programmed
	 */
	if (!(cu_flags & CU_FLAG_ON) ||
	    (cu_cpu_info == NULL) ||
	    !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
		return (0);

	/*
	 * Don't update CPU statistics if it was updated recently
	 * and provide old results instead (throttled by cu_update_threshold)
	 */
	time_snap = gethrtime();
	if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
		DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
		return (0);
	}

	cu_cpu_info->cu_sample_time = time_snap;

	/*
	 * CPC counter should be read on the CPU that is running the counter. We
	 * either have to move ourselves to the target CPU or insure that we
	 * already run there.
	 *
	 * We use cross-call to the target CPU to execute kcpc_read() and
	 * cu_cpu_update_stats() there.
	 */
	retval = 0;
	if (move_to)
		/*
		 * Cross-call result is deliberately discarded: 0 is returned
		 * for the move_to case
		 */
		(void) cu_cpu_run(cp, (cu_cpu_func_t)kcpc_read,
		    (uintptr_t)cu_cpu_update_stats);
	else {
		retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
		/*
		 * Offset negative return value by -10 so we can distinguish it
		 * from error return values of this routine vs kcpc_read()
		 */
		if (retval < 0)
			retval -= 10;
	}

	return (retval);
}


/*
 * Update CPU counter statistics for current CPU.
 * This function may be called from a cross-call
 */
static int
cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
{
	cu_cpu_info_t	*cu_cpu_info = CPU->cpu_cu_info;
	uint_t		flags;
	uint64_t	delta;
	hrtime_t	time_delta;
	hrtime_t	time_snap;

	if (stats == NULL)
		return (-1);

	/*
	 * Nothing to do if counters are not programmed. This should not happen,
	 * but we check just in case.
	 */
	ASSERT(cu_flags & CU_FLAG_ON);
	ASSERT(cu_cpu_info != NULL);
	if (!(cu_flags & CU_FLAG_ON) ||
	    (cu_cpu_info == NULL))
		return (-2);

	flags = cu_cpu_info->cu_flag;
	ASSERT(flags & CU_CPU_CNTRS_ON);
	if (!(flags & CU_CPU_CNTRS_ON))
		return (-2);

	/*
	 * Take snapshot of high resolution timer
	 */
	time_snap = gethrtime();

	/*
	 * CU counters have just been programmed. We cannot assume that the new
	 * cntr_value continues from where we left off, so use the cntr_value as
	 * the new initial value.  (This forces delta to 0 below for the first
	 * sample after the counters are turned back on.)
	 */
	if (flags & CU_CPU_CNTRS_OFF_ON)
		stats->cs_value_start = cntr_value;

	/*
	 * Calculate delta in counter values between start of sampling period
	 * and now
	 */
	delta = cntr_value - stats->cs_value_start;

	/*
	 * Calculate time between start of sampling period and now
	 * (zero on the very first sample, while cs_time_start is still 0)
	 */
	time_delta = stats->cs_time_start ?
	    time_snap - stats->cs_time_start :
	    0;
	stats->cs_time_start = time_snap;
	stats->cs_value_start = cntr_value;

	if (time_delta > 0) { /* wrap shouldn't happen */
		/*
		 * Update either running or stopped time based on the transition
		 * state
		 */
		if (flags & CU_CPU_CNTRS_OFF_ON)
			stats->cs_time_stopped += time_delta;
		else
			stats->cs_time_running += time_delta;
	}

	/*
	 * Update rest of counter statistics if counter value didn't wrap
	 *
	 * NOTE(review): delta is unsigned, so a counter wrap (cntr_value <
	 * cs_value_start) yields a huge positive delta rather than a
	 * non-positive one — confirm wrap is prevented or handled upstream.
	 */
	if (delta > 0) {
		/*
		 * Update utilization rate if the interval between samples is
		 * sufficient.
		 */
		ASSERT(cu_sample_interval_min > CU_SCALE);
		if (time_delta > cu_sample_interval_min)
			stats->cs_rate = CU_RATE(delta, time_delta);
		if (stats->cs_rate_max < stats->cs_rate)
			stats->cs_rate_max = stats->cs_rate;

		stats->cs_value_last = delta;
		stats->cs_value_total += delta;
	}

	return (0);
}

/*
 * Update CMT PG utilization data.
 *
 * This routine computes the running total utilization and times for the
 * specified PG by adding up the total utilization and counter running and
 * stopped times of all CPUs in the PG and calculates the utilization rate and
 * maximum rate for all CPUs in the PG.
1588 */ 1589 void 1590 cu_pg_update(pghw_t *pg) 1591 { 1592 pg_cpu_itr_t cpu_iter; 1593 pghw_type_t pg_hwtype; 1594 cpu_t *cpu; 1595 pghw_util_t *hw_util = &pg->pghw_stats; 1596 uint64_t old_utilization = hw_util->pghw_util; 1597 hrtime_t now; 1598 hrtime_t time_delta; 1599 uint64_t utilization_delta; 1600 1601 ASSERT(MUTEX_HELD(&cpu_lock)); 1602 1603 now = gethrtime(); 1604 1605 pg_hwtype = pg->pghw_hw; 1606 1607 /* 1608 * Initialize running total utilization and times for PG to 0 1609 */ 1610 hw_util->pghw_util = 0; 1611 hw_util->pghw_time_running = 0; 1612 hw_util->pghw_time_stopped = 0; 1613 1614 /* 1615 * Iterate over all CPUs in the PG and aggregate utilization, running 1616 * time and stopped time. 1617 */ 1618 PG_CPU_ITR_INIT(pg, cpu_iter); 1619 while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { 1620 cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info; 1621 cu_cntr_info_t *cntr_info; 1622 cu_cntr_stats_t *stats; 1623 1624 if (cu_cpu_info == NULL) 1625 continue; 1626 1627 /* 1628 * Update utilization data for the CPU and then 1629 * aggregate per CPU running totals for PG 1630 */ 1631 (void) cu_cpu_update(cpu, B_TRUE); 1632 cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype]; 1633 1634 if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL) 1635 continue; 1636 1637 hw_util->pghw_util += stats->cs_value_total; 1638 hw_util->pghw_time_running += stats->cs_time_running; 1639 hw_util->pghw_time_stopped += stats->cs_time_stopped; 1640 1641 /* 1642 * If counters are stopped now, the pg_time_stopped was last 1643 * updated at cs_time_start time. Add the time passed since then 1644 * to the stopped time. 
1645 */ 1646 if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) 1647 hw_util->pghw_time_stopped += 1648 now - stats->cs_time_start; 1649 } 1650 1651 /* 1652 * Compute per PG instruction rate and maximum rate 1653 */ 1654 time_delta = now - hw_util->pghw_time_stamp; 1655 hw_util->pghw_time_stamp = now; 1656 1657 if (old_utilization == 0) 1658 return; 1659 1660 /* 1661 * Calculate change in utilization over sampling period and set this to 1662 * 0 if the delta would be 0 or negative which may happen if any CPUs go 1663 * offline during the sampling period 1664 */ 1665 if (hw_util->pghw_util > old_utilization) 1666 utilization_delta = hw_util->pghw_util - old_utilization; 1667 else 1668 utilization_delta = 0; 1669 1670 /* 1671 * Update utilization rate if the interval between samples is 1672 * sufficient. 1673 */ 1674 ASSERT(cu_sample_interval_min > CU_SCALE); 1675 if (time_delta > CU_SAMPLE_INTERVAL_MIN) 1676 hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta); 1677 1678 /* 1679 * Update the maximum observed rate 1680 */ 1681 if (hw_util->pghw_rate_max < hw_util->pghw_rate) 1682 hw_util->pghw_rate_max = hw_util->pghw_rate; 1683 } 1684