/*
 * Copyright 2009, Intel Corporation
 * Copyright 2009, Sun Microsystems, Inc
 *
 * This file is part of PowerTOP
 *
 * This program file is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program in a file named COPYING; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301 USA
 *
 * Authors:
 * Arjan van de Ven <arjan@linux.intel.com>
 * Eric C Saxe <eric.saxe@sun.com>
 * Aubrey Li <aubrey.li@intel.com>
 */

/*
 * GPL Disclaimer
 *
 * For the avoidance of doubt, except that if any license choice other
 * than GPL or LGPL is available it will apply instead, Sun elects to
 * use only the General Public License version 2 (GPLv2) at this time
 * for any software where a choice of GPL license versions is made
 * available with the language indicating that GPLv2 or any later
 * version may be used, or where a choice of which version of the GPL
 * is applied is otherwise unspecified.
 */

#include <stdlib.h>
#include <string.h>
#include <dtrace.h>
#include <kstat.h>
#include <errno.h>
#include "powertop.h"

#define HZ2MHZ(speed) ((speed) / MICROSEC)
#define DTP_ARG_COUNT 2
#define DTP_ARG_LENGTH 5

static uint64_t max_cpufreq = 0;
static dtrace_hdl_t *dtp;
static char **dtp_argv;

/*
 * Enabling PM through /etc/power.conf
 * See pt_cpufreq_suggest()
 */
static char default_conf[] = "/etc/power.conf";
static char default_pmconf[] = "/usr/sbin/pmconfig";
static char cpupm_enable[] = "echo cpupm enable >> /etc/power.conf";
static char cpupm_threshold[] = "echo cpu-threshold 1s >> /etc/power.conf";

/*
 * Buffer containing DTrace program to track CPU frequency transitions
 */
static const char *dtp_cpufreq =
"hrtime_t last[$0];"
""
"BEGIN"
"{"
" begin = timestamp;"
"}"
""
":::cpu-change-speed"
"/last[(processorid_t)arg0] != 0/"
"{"
" this->cpu = (processorid_t)arg0;"
" this->oldspeed = (uint64_t)arg1;"
" @times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);"
" last[this->cpu] = timestamp;"
"}"
":::cpu-change-speed"
"/last[(processorid_t)arg0] == 0/"
"{"
" this->cpu = (processorid_t)arg0;"
" this->oldspeed = (uint64_t)arg1;"
" @times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
" last[this->cpu] = timestamp;"
"}";

/*
 * Same as above, but only for a specific CPU
 */
static const char *dtp_cpufreq_c =
"hrtime_t last;"
""
"BEGIN"
"{"
" begin = timestamp;"
"}"
""
":::cpu-change-speed"
"/(processorid_t)arg0 == $1 &&"
" last != 0/"
"{"
" this->cpu = (processorid_t)arg0;"
" this->oldspeed = (uint64_t)arg1;"
" @times[this->cpu, this->oldspeed] = sum(timestamp - last);"
" last = timestamp;"
"}"
":::cpu-change-speed"
"/(processorid_t)arg0 == $1 &&"
" last == 0/"
"{"
" this->cpu = (processorid_t)arg0;"
" this->oldspeed = (uint64_t)arg1;"
" @times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
" last = timestamp;"
"}";
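/*
 * A short note on the macro arguments used in the D programs above,
 * describing only what this file itself sets up: $0 sizes the last[]
 * array with the number of observed CPUs, and $1 selects a single CPU id
 * when only one CPU is being observed. Both values come from the
 * dtp_argv vector built by pt_cpufreq_setup() and handed to
 * dtrace_program_strcompile() in pt_cpufreq_stat_prepare(), e.g.
 * (illustrative values):
 *
 * dtp_argv[0] = "8" (g_ncpus_observed) -> $0
 * dtp_argv[1] = "3" (g_observed_cpu)   -> $1, only when PT_ON_CPU
 */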
static int pt_cpufreq_setup(void);
static int pt_cpufreq_snapshot(void);
static int pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *, void *);
static void pt_cpufreq_stat_account(double, uint_t);
static int pt_cpufreq_snapshot_cpu(kstat_ctl_t *, uint_t);
static int pt_cpufreq_check_pm(void);
static void pt_cpufreq_enable(void);

static int
pt_cpufreq_setup(void)
{
	if ((dtp_argv = malloc(sizeof (char *) * DTP_ARG_COUNT)) == NULL)
		return (EXIT_FAILURE);

	if ((dtp_argv[0] = malloc(sizeof (char) * DTP_ARG_LENGTH)) == NULL) {
		free(dtp_argv);
		return (EXIT_FAILURE);
	}

	(void) snprintf(dtp_argv[0], DTP_ARG_LENGTH, "%d", g_ncpus_observed);

	if (PT_ON_CPU) {
		if ((dtp_argv[1] = malloc(sizeof (char) * DTP_ARG_LENGTH))
		    == NULL) {
			free(dtp_argv[0]);
			free(dtp_argv);
			return (EXIT_FAILURE);
		}
		(void) snprintf(dtp_argv[1], DTP_ARG_LENGTH, "%d",
		    g_observed_cpu);
	}

	return (0);
}

/*
 * Perform setup necessary to enumerate and track CPU speed changes
 */
int
pt_cpufreq_stat_prepare(void)
{
	dtrace_prog_t *prog;
	dtrace_proginfo_t info;
	dtrace_optval_t statustime;
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *knp;
	freq_state_info_t *state;
	char *s, *token, *prog_ptr;
	int err;

	if ((err = pt_cpufreq_setup()) != 0) {
		pt_error("%s : failed to setup", __FILE__);
		return (errno);
	}

	state = g_pstate_info;
	if ((g_cpu_power_states = calloc((size_t)g_ncpus,
	    sizeof (cpu_power_info_t))) == NULL)
		return (-1);

	/*
	 * Enumerate the CPU frequencies
	 */
	if ((kc = kstat_open()) == NULL)
		return (errno);

	ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[g_observed_cpu], NULL);

	if (ksp == NULL) {
		err = errno;
		(void) kstat_close(kc);
		return (err);
	}

	(void) kstat_read(kc, ksp, NULL);

	knp = kstat_data_lookup(ksp, "supported_frequencies_Hz");
	if (knp == NULL) {
		err = errno;
		(void) kstat_close(kc);
		return (err);
	}
	s = knp->value.str.addr.ptr;

	g_npstates = 0;

	for (token = strtok(s, ":"), s = NULL;
	    NULL != token && g_npstates < NSTATES;
	    token = strtok(NULL, ":")) {

		state->speed = HZ2MHZ(atoll(token));

		if (state->speed > max_cpufreq)
			max_cpufreq = state->speed;

		state->total_time = (uint64_t)0;

		g_npstates++;
		state++;
	}

	if (token != NULL)
		pt_error("%s : exceeded NSTATES\n", __FILE__);

	(void) kstat_close(kc);
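	/*
	 * The supported_frequencies_Hz kstat is a ':' separated string of
	 * Hz values, e.g. "1498500000:2998000000" (illustrative numbers),
	 * so the loop above records one P-state per token, converted to
	 * MHz by HZ2MHZ(), and remembers the fastest speed in max_cpufreq.
	 */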
	/*
	 * Return if speed transition is not supported
	 */
	if (g_npstates < 2)
		return (-1);

	/*
	 * Setup DTrace to look for CPU frequency changes
	 */
	if ((dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
		pt_error("%s : cannot open dtrace library: %s\n", __FILE__,
		    dtrace_errmsg(NULL, err));
		return (-2);
	}

	/*
	 * Execute different scripts (defined above) depending on
	 * user specified options. Default mode uses dtp_cpufreq.
	 */
	if (PT_ON_CPU)
		prog_ptr = (char *)dtp_cpufreq_c;
	else
		prog_ptr = (char *)dtp_cpufreq;

	if ((prog = dtrace_program_strcompile(dtp, prog_ptr,
	    DTRACE_PROBESPEC_NAME, 0, (1 + g_argc), dtp_argv)) == NULL) {
		pt_error("%s : cpu-change-speed probe unavailable\n", __FILE__);
		return (dtrace_errno(dtp));
	}

	if (dtrace_program_exec(dtp, prog, &info) == -1) {
		pt_error("%s : failed to enable speed probe\n", __FILE__);
		return (dtrace_errno(dtp));
	}

	if (dtrace_setopt(dtp, "aggsize", "128k") == -1) {
		pt_error("%s : failed to set speed 'aggsize'\n", __FILE__);
	}

	if (dtrace_setopt(dtp, "aggrate", "0") == -1) {
		pt_error("%s : failed to set speed 'aggrate'\n", __FILE__);
	}

	if (dtrace_setopt(dtp, "aggpercpu", 0) == -1) {
		pt_error("%s : failed to set speed 'aggpercpu'\n", __FILE__);
	}

	if (dtrace_go(dtp) != 0) {
		pt_error("%s : failed to start speed observation", __FILE__);
		return (dtrace_errno(dtp));
	}

	if (dtrace_getopt(dtp, "statusrate", &statustime) == -1) {
		pt_error("%s : failed to get speed 'statusrate'\n", __FILE__);
		return (dtrace_errno(dtp));
	}

	return (0);
}

/*
 * The DTrace probes have already been enabled, and are tracking
 * CPU speed transitions. Take a snapshot of the aggregations, and
 * look for any CPUs that have made a speed transition over the last
 * sampling interval. Note that the aggregations may be empty if no
 * speed transitions took place over the last interval. In that case,
 * note that we have already accounted for the time, so that when
 * we do encounter a speed transition in a future sampling interval
 * we can subtract that time back out.
 */
int
pt_cpufreq_stat_collect(double interval)
{
	int i, ret;

	/*
	 * Zero out the interval time reported by DTrace for
	 * this interval
	 */
	for (i = 0; i < g_npstates; i++)
		g_pstate_info[i].total_time = 0;

	for (i = 0; i < g_ncpus; i++)
		g_cpu_power_states[i].dtrace_time = 0;

	if (dtrace_status(dtp) == -1)
		return (-1);

	if (dtrace_aggregate_snap(dtp) != 0)
		pt_error("%s : failed to add to stats aggregation", __FILE__);

	if (dtrace_aggregate_walk_keyvarsorted(dtp, pt_cpufreq_dtrace_walk,
	    NULL) != 0)
		pt_error("%s : failed to sort stats aggregation", __FILE__);

	dtrace_aggregate_clear(dtp);

	if ((ret = pt_cpufreq_snapshot()) != 0) {
		pt_error("%s : failed to snapshot CPU frequencies", __FILE__);
		return (ret);
	}

	switch (g_op_mode) {
	case PT_MODE_CPU:
		pt_cpufreq_stat_account(interval, g_observed_cpu);
		break;
	case PT_MODE_DEFAULT:
	default:
		for (i = 0; i < g_ncpus_observed; i++)
			pt_cpufreq_stat_account(interval, i);
		break;
	}

	return (0);
}

static void
pt_cpufreq_stat_account(double interval, uint_t cpu)
{
	cpu_power_info_t *cpu_pow;
	uint64_t speed;
	hrtime_t duration;
	int i;

	cpu_pow = &g_cpu_power_states[cpu];
	speed = cpu_pow->current_pstate;

	duration = (hrtime_t)(interval * NANOSEC) - cpu_pow->dtrace_time;

	/*
	 * 'duration' may be a negative value when we're using or forcing a
	 * small interval, and the amount of time already accounted ends up
	 * being larger than the former.
	 */
	if (duration < 0)
		return;

	for (i = 0; i < g_npstates; i++) {
		if (g_pstate_info[i].speed == speed) {
			g_pstate_info[i].total_time += duration;
			cpu_pow->time_accounted += duration;
			cpu_pow->speed_accounted = speed;
		}
	}
}
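/*
 * Illustrative accounting for pt_cpufreq_stat_account() (the numbers are
 * examples only): with a 2 second sampling interval, interval * NANOSEC
 * is 2,000,000,000 ns. If the DTrace walker already attributed
 * 1,500,000,000 ns of that interval to earlier speeds (dtrace_time), the
 * remaining 500,000,000 ns are charged to the P-state the CPU is
 * currently running at.
 */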
/*
 * Take a snapshot of each CPU's speed by looking through the cpu_info kstats.
 */
static int
pt_cpufreq_snapshot(void)
{
	kstat_ctl_t *kc;
	int ret;
	uint_t i;

	if ((kc = kstat_open()) == NULL)
		return (errno);

	switch (g_op_mode) {
	case PT_MODE_CPU:
		ret = pt_cpufreq_snapshot_cpu(kc, g_observed_cpu);
		break;
	case PT_MODE_DEFAULT:
	default:
		for (i = 0; i < g_ncpus_observed; i++)
			if ((ret = pt_cpufreq_snapshot_cpu(kc, i)) != 0)
				break;
		break;
	}

	if (kstat_close(kc) != 0)
		pt_error("%s : couldn't close kstat\n", __FILE__);

	return (ret);
}

static int
pt_cpufreq_snapshot_cpu(kstat_ctl_t *kc, uint_t cpu)
{
	kstat_t *ksp;
	kstat_named_t *knp;

	ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[cpu], NULL);
	if (ksp == NULL) {
		pt_error("%s : couldn't find cpu_info kstat for CPU "
		    "%d\n", __FILE__, cpu);
		return (1);
	}

	if (kstat_read(kc, ksp, NULL) == -1) {
		pt_error("%s : couldn't read cpu_info kstat for "
		    "CPU %d\n", __FILE__, cpu);
		return (2);
	}

	knp = kstat_data_lookup(ksp, "current_clock_Hz");
	if (knp == NULL) {
		pt_error("%s : couldn't find current_clock_Hz "
		    "kstat for CPU %d\n", __FILE__, cpu);
		return (3);
	}

	g_cpu_power_states[cpu].current_pstate = HZ2MHZ(knp->value.ui64);

	return (0);
}

/*
 * DTrace aggregation walker that sorts through a snapshot of the
 * aggregation data collected during firings of the cpu-change-speed
 * probe.
 */
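/*
 * A note on the record layout consumed below, derived from how @times is
 * declared above: the aggregation is keyed by (cpu id, old speed), so
 * dtagd_rec[1] and dtagd_rec[2] carry the two key components, while the
 * per-CPU running total is read from dtada_percpu[cpu] because the
 * "aggpercpu" option is set in pt_cpufreq_stat_prepare().
 */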
/*ARGSUSED*/
static int
pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *data, void *arg)
{
	dtrace_aggdesc_t *aggdesc = data->dtada_desc;
	dtrace_recdesc_t *cpu_rec, *speed_rec;
	cpu_power_info_t *cp;
	int32_t cpu;
	uint64_t speed;
	hrtime_t res;
	int i;

	if (strcmp(aggdesc->dtagd_name, "times") == 0) {
		cpu_rec = &aggdesc->dtagd_rec[1];
		speed_rec = &aggdesc->dtagd_rec[2];

		/* LINTED - alignment */
		cpu = *(int32_t *)(data->dtada_data + cpu_rec->dtrd_offset);

		/* LINTED - alignment */
		res = *((hrtime_t *)(data->dtada_percpu[cpu]));

		/* LINTED - alignment */
		speed = *(uint64_t *)(data->dtada_data +
		    speed_rec->dtrd_offset);

		if (speed == 0)
			speed = max_cpufreq;
		else
			speed = HZ2MHZ(speed);

		/*
		 * We have an aggregation record for "cpu" being at "speed"
		 * for an interval of "n" nanoseconds. The reported interval
		 * may exceed the powertop sampling interval, since we only
		 * notice during potentially infrequent firings of the
		 * "speed change" DTrace probe. In this case powertop would
		 * have already accounted for the portions of the interval
		 * that happened during prior powertop samplings, so subtract
		 * out time already accounted.
		 */
		cp = &g_cpu_power_states[cpu];

		for (i = 0; i < g_npstates; i++) {
			if (g_pstate_info[i].speed == speed) {

				if (cp->time_accounted > 0 &&
				    cp->speed_accounted == speed) {
					if (res > cp->time_accounted) {
						res -= cp->time_accounted;
						cp->time_accounted = 0;
						cp->speed_accounted = 0;
					} else {
						return (DTRACE_AGGWALK_NEXT);
					}
				}

				g_pstate_info[i].total_time += res;
				cp->dtrace_time += res;
			}
		}
	}

	return (DTRACE_AGGWALK_NEXT);
}

/*
 * Checks whether CPU power management is enabled in /etc/power.conf and,
 * if it is not, adds a suggestion to enable it.
 */
void
pt_cpufreq_suggest(void)
{
	int ret = pt_cpufreq_check_pm();

	switch (ret) {
	case 0:
		pt_sugg_add("Suggestion: enable CPU power management by "
		    "pressing the P key", 40, 'P', (char *)g_msg_freq_enable,
		    pt_cpufreq_enable);
		break;
	}
}

/*
 * Checks /etc/power.conf and returns:
 *
 * 0 if CPUPM is not enabled
 * 1 if there's nothing for us to do because:
 *   (a) the system does not support frequency scaling
 *   (b) there's no power.conf.
 * 2 if CPUPM is enabled
 * 3 if the system is running in poll-mode, as opposed to event-mode
 *
 * Note the ordering of the return values: callers switch on them in
 * ascending order.
 */
static int
pt_cpufreq_check_pm(void)
{
	char line[1024];
	FILE *file;
	int ret = 0;

	if (g_npstates < 2 || (file = fopen(default_conf, "r")) == NULL)
		return (1);

	(void) memset(line, 0, 1024);

	while (fgets(line, 1024, file)) {
		if (strstr(line, "cpupm")) {
			if (strstr(line, "enable")) {
				(void) fclose(file);
				return (2);
			}
		}
		if (strstr(line, "poll"))
			ret = 3;
	}

	(void) fclose(file);

	return (ret);
}

/*
 * Used as a suggestion callback: enables CPU PM in /etc/power.conf with
 * a 1 second threshold, then calls /usr/sbin/pmconfig to apply it.
 */
static void
pt_cpufreq_enable(void)
{
	(void) system(cpupm_enable);
	(void) system(cpupm_threshold);
	(void) system(default_pmconf);

	if (pt_sugg_remove(pt_cpufreq_enable) == 0)
		pt_error("%s : failed to remove suggestion\n", __FILE__);
}