/*
 * Copyright 2009, Intel Corporation
 * Copyright 2009, Sun Microsystems, Inc
 *
 * This file is part of PowerTOP
 *
 * This program file is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program in a file named COPYING; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301 USA
 *
 * Authors:
 *	Arjan van de Ven <arjan@linux.intel.com>
 *	Eric C Saxe <eric.saxe@sun.com>
 *	Aubrey Li <aubrey.li@intel.com>
 */

/*
 * GPL Disclaimer
 *
 * For the avoidance of doubt, except that if any license choice other
 * than GPL or LGPL is available it will apply instead, Sun elects to
 * use only the General Public License version 2 (GPLv2) at this time
 * for any software where a choice of GPL license versions is made
 * available with the language indicating that GPLv2 or any later
 * version may be used, or where a choice of which version of the GPL
 * is applied is otherwise unspecified.
38 */ 39 40 #include <stdlib.h> 41 #include <string.h> 42 #include <dtrace.h> 43 #include <kstat.h> 44 #include <errno.h> 45 #include "powertop.h" 46 47 #define HZ2MHZ(speed) ((speed) / MICROSEC) 48 #define DTP_ARG_COUNT 2 49 #define DTP_ARG_LENGTH 5 50 51 static uint64_t max_cpufreq = 0; 52 static dtrace_hdl_t *dtp; 53 static char **dtp_argv; 54 55 /* 56 * Enabling PM through /etc/power.conf 57 * See pt_cpufreq_suggest() 58 */ 59 static char default_conf[] = "/etc/power.conf"; 60 static char default_pmconf[] = "/usr/sbin/pmconfig"; 61 static char cpupm_enable[] = "echo cpupm enable >> /etc/power.conf"; 62 static char cpupm_treshold[] = "echo cpu-threshold 1s >> /etc/power.conf"; 63 64 /* 65 * Buffer containing DTrace program to track CPU frequency transitions 66 */ 67 static const char *dtp_cpufreq = 68 "hrtime_t last[$0];" 69 "" 70 "BEGIN" 71 "{" 72 " begin = timestamp;" 73 "}" 74 "" 75 ":::cpu-change-speed" 76 "/last[(processorid_t)arg0] != 0/" 77 "{" 78 " this->cpu = (processorid_t)arg0;" 79 " this->oldspeed = (uint64_t)arg1;" 80 " @times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);" 81 " last[this->cpu] = timestamp;" 82 "}" 83 ":::cpu-change-speed" 84 "/last[(processorid_t)arg0] == 0/" 85 "{" 86 " this->cpu = (processorid_t)arg0;" 87 " this->oldspeed = (uint64_t)arg1;" 88 " @times[this->cpu, this->oldspeed] = sum(timestamp - begin);" 89 " last[this->cpu] = timestamp;" 90 "}"; 91 92 /* 93 * Same as above, but only for a specific CPU 94 */ 95 static const char *dtp_cpufreq_c = 96 "hrtime_t last;" 97 "" 98 "BEGIN" 99 "{" 100 " begin = timestamp;" 101 "}" 102 "" 103 ":::cpu-change-speed" 104 "/(processorid_t)arg0 == $1 &&" 105 " last != 0/" 106 "{" 107 " this->cpu = (processorid_t)arg0;" 108 " this->oldspeed = (uint64_t)arg1;" 109 " @times[this->cpu, this->oldspeed] = sum(timestamp - last);" 110 " last = timestamp;" 111 "}" 112 ":::cpu-change-speed" 113 "/(processorid_t)arg0 == $1 &&" 114 " last == 0/" 115 "{" 116 " this->cpu = 
(processorid_t)arg0;" 117 " this->oldspeed = (uint64_t)arg1;" 118 " @times[this->cpu, this->oldspeed] = sum(timestamp - begin);" 119 " last = timestamp;" 120 "}"; 121 122 static int pt_cpufreq_setup(void); 123 static int pt_cpufreq_snapshot(void); 124 static int pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *, void *); 125 static void pt_cpufreq_stat_account(double, uint_t); 126 static int pt_cpufreq_snapshot_cpu(kstat_ctl_t *, uint_t); 127 static int pt_cpufreq_check_pm(void); 128 static void pt_cpufreq_enable(void); 129 130 static int 131 pt_cpufreq_setup(void) 132 { 133 if ((dtp_argv = malloc(sizeof (char *) * DTP_ARG_COUNT)) == NULL) 134 return (1); 135 136 if ((dtp_argv[0] = malloc(sizeof (char) * DTP_ARG_LENGTH)) == NULL) { 137 free(dtp_argv); 138 return (1); 139 } 140 141 (void) snprintf(dtp_argv[0], 5, "%d\0", g_ncpus_observed); 142 143 if (PT_ON_CPU) { 144 if ((dtp_argv[1] = malloc(sizeof (char) * DTP_ARG_LENGTH)) 145 == NULL) { 146 free(dtp_argv[0]); 147 free(dtp_argv); 148 return (1); 149 } 150 (void) snprintf(dtp_argv[1], 5, "%d\0", g_observed_cpu); 151 } 152 153 return (0); 154 } 155 156 /* 157 * Perform setup necessary to enumerate and track CPU speed changes 158 */ 159 int 160 pt_cpufreq_stat_prepare(void) 161 { 162 dtrace_prog_t *prog; 163 dtrace_proginfo_t info; 164 dtrace_optval_t statustime; 165 kstat_ctl_t *kc; 166 kstat_t *ksp; 167 kstat_named_t *knp; 168 freq_state_info_t *state; 169 char *s, *token, *prog_ptr; 170 int err; 171 172 if ((err = pt_cpufreq_setup()) != 0) { 173 pt_error("failed to setup %s report (couldn't allocate " 174 "memory)\n", g_msg_freq_state); 175 return (errno); 176 } 177 178 state = g_pstate_info; 179 if ((g_cpu_power_states = calloc((size_t)g_ncpus, 180 sizeof (cpu_power_info_t))) == NULL) 181 return (-1); 182 183 /* 184 * Enumerate the CPU frequencies 185 */ 186 if ((kc = kstat_open()) == NULL) 187 return (errno); 188 189 ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[g_observed_cpu], NULL); 190 191 if (ksp == NULL) { 
192 err = errno; 193 (void) kstat_close(kc); 194 return (err); 195 } 196 197 (void) kstat_read(kc, ksp, NULL); 198 199 knp = kstat_data_lookup(ksp, "supported_frequencies_Hz"); 200 s = knp->value.str.addr.ptr; 201 202 g_npstates = 0; 203 204 for (token = strtok(s, ":"), s = NULL; 205 token != NULL && g_npstates < NSTATES; 206 token = strtok(NULL, ":")) { 207 208 state->speed = HZ2MHZ(atoll(token)); 209 210 if (state->speed > max_cpufreq) 211 max_cpufreq = state->speed; 212 213 state->total_time = (uint64_t)0; 214 215 g_npstates++; 216 state++; 217 } 218 219 if (token != NULL) 220 pt_error("CPU exceeds the supported number of %s\n", 221 g_msg_freq_state); 222 223 (void) kstat_close(kc); 224 225 /* 226 * Return if speed transition is not supported 227 */ 228 if (g_npstates < 2) 229 return (-1); 230 231 /* 232 * Setup DTrace to look for CPU frequency changes 233 */ 234 if ((dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 235 pt_error("cannot open dtrace library for the %s report: %s\n", 236 g_msg_freq_state, dtrace_errmsg(NULL, err)); 237 return (-2); 238 } 239 240 /* 241 * Execute different scripts (defined above) depending on 242 * user specified options. Default mode uses dtp_cpufreq. 
243 */ 244 if (PT_ON_CPU) 245 prog_ptr = (char *)dtp_cpufreq_c; 246 else 247 prog_ptr = (char *)dtp_cpufreq; 248 249 if ((prog = dtrace_program_strcompile(dtp, prog_ptr, 250 DTRACE_PROBESPEC_NAME, 0, (1 + g_argc), dtp_argv)) == NULL) { 251 pt_error("failed to compile %s program\n", g_msg_freq_state); 252 return (dtrace_errno(dtp)); 253 } 254 255 if (dtrace_program_exec(dtp, prog, &info) == -1) { 256 pt_error("failed to enable %s probes\n", g_msg_freq_state); 257 return (dtrace_errno(dtp)); 258 } 259 260 if (dtrace_setopt(dtp, "aggsize", "128k") == -1) 261 pt_error("failed to set %s 'aggsize'\n", g_msg_freq_state); 262 263 if (dtrace_setopt(dtp, "aggrate", "0") == -1) 264 pt_error("failed to set %s 'aggrate'\n", g_msg_freq_state); 265 266 if (dtrace_setopt(dtp, "aggpercpu", 0) == -1) 267 pt_error("failed to set %s 'aggpercpu'\n", g_msg_freq_state); 268 269 if (dtrace_go(dtp) != 0) { 270 pt_error("failed to start %s observation\n", g_msg_freq_state); 271 return (dtrace_errno(dtp)); 272 } 273 274 if (dtrace_getopt(dtp, "statusrate", &statustime) == -1) { 275 pt_error("failed to get %s 'statusrate'\n", g_msg_freq_state); 276 return (dtrace_errno(dtp)); 277 } 278 279 return (0); 280 } 281 282 /* 283 * The DTrace probes have already been enabled, and are tracking 284 * CPU speed transitions. Take a snapshot of the aggregations, and 285 * look for any CPUs that have made a speed transition over the last 286 * sampling interval. Note that the aggregations may be empty if no 287 * speed transitions took place over the last interval. In that case, 288 * notate that we have already accounted for the time, so that when 289 * we do encounter a speed transition in a future sampling interval 290 * we can subtract that time back out. 
291 */ 292 int 293 pt_cpufreq_stat_collect(double interval) 294 { 295 int i, ret; 296 297 /* 298 * Zero out the interval time reported by DTrace for 299 * this interval 300 */ 301 for (i = 0; i < g_npstates; i++) 302 g_pstate_info[i].total_time = 0; 303 304 for (i = 0; i < g_ncpus; i++) 305 g_cpu_power_states[i].dtrace_time = 0; 306 307 if (dtrace_status(dtp) == -1) 308 return (-1); 309 310 if (dtrace_aggregate_snap(dtp) != 0) 311 pt_error("failed to collect data for %s\n", g_msg_freq_state); 312 313 if (dtrace_aggregate_walk_keyvarsorted(dtp, pt_cpufreq_dtrace_walk, 314 NULL) != 0) 315 pt_error("failed to sort data for %s\n", g_msg_freq_state); 316 317 dtrace_aggregate_clear(dtp); 318 319 if ((ret = pt_cpufreq_snapshot()) != 0) { 320 pt_error("failed to snapshot %s state\n", g_msg_freq_state); 321 return (ret); 322 } 323 324 switch (g_op_mode) { 325 case PT_MODE_CPU: 326 pt_cpufreq_stat_account(interval, g_observed_cpu); 327 break; 328 case PT_MODE_DEFAULT: 329 default: 330 for (i = 0; i < g_ncpus_observed; i++) 331 pt_cpufreq_stat_account(interval, i); 332 break; 333 } 334 335 return (0); 336 } 337 338 static void 339 pt_cpufreq_stat_account(double interval, uint_t cpu) 340 { 341 cpu_power_info_t *cpu_pow; 342 uint64_t speed; 343 hrtime_t duration; 344 int i; 345 346 cpu_pow = &g_cpu_power_states[cpu]; 347 speed = cpu_pow->current_pstate; 348 349 duration = (hrtime_t)(interval * NANOSEC) - cpu_pow->dtrace_time; 350 351 /* 352 * 'duration' may be a negative value when we're using or forcing a 353 * small interval, and the amount of time already accounted ends up 354 * being larger than the the former. 355 */ 356 if (duration < 0) 357 return; 358 359 for (i = 0; i < g_npstates; i++) { 360 if (g_pstate_info[i].speed == speed) { 361 g_pstate_info[i].total_time += duration; 362 cpu_pow->time_accounted += duration; 363 cpu_pow->speed_accounted = speed; 364 } 365 } 366 } 367 368 /* 369 * Take a snapshot of each CPU's speed by looking through the cpu_info kstats. 
370 */ 371 static int 372 pt_cpufreq_snapshot(void) 373 { 374 kstat_ctl_t *kc; 375 int ret; 376 uint_t i; 377 378 if ((kc = kstat_open()) == NULL) 379 return (errno); 380 381 switch (g_op_mode) { 382 case PT_MODE_CPU: 383 ret = pt_cpufreq_snapshot_cpu(kc, g_observed_cpu); 384 break; 385 case PT_MODE_DEFAULT: 386 default: 387 for (i = 0; i < g_ncpus_observed; i++) 388 if ((ret = pt_cpufreq_snapshot_cpu(kc, i)) != 0) 389 break; 390 break; 391 } 392 393 if (kstat_close(kc) != 0) 394 pt_error("couldn't close %s kstat\n", g_msg_freq_state); 395 396 return (ret); 397 } 398 399 static int 400 pt_cpufreq_snapshot_cpu(kstat_ctl_t *kc, uint_t cpu) 401 { 402 kstat_t *ksp; 403 kstat_named_t *knp; 404 405 ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[cpu], NULL); 406 if (ksp == NULL) { 407 pt_error("couldn't find 'cpu_info' kstat for CPU %d\n while " 408 "taking a snapshot of %s\n", cpu, g_msg_freq_state); 409 return (1); 410 } 411 412 if (kstat_read(kc, ksp, NULL) == -1) { 413 pt_error("couldn't read 'cpu_info' kstat for CPU %d\n while " 414 "taking a snapshot of %s\n", cpu, g_msg_freq_state); 415 return (2); 416 } 417 418 knp = kstat_data_lookup(ksp, "current_clock_Hz"); 419 if (knp == NULL) { 420 pt_error("couldn't find 'current_clock_Hz' kstat for CPU %d " 421 "while taking a snapshot of %s\n", cpu, g_msg_freq_state); 422 return (3); 423 } 424 425 g_cpu_power_states[cpu].current_pstate = HZ2MHZ(knp->value.ui64); 426 427 return (0); 428 } 429 430 /* 431 * DTrace aggregation walker that sorts through a snapshot of the 432 * aggregation data collected during firings of the cpu-change-speed 433 * probe. 
434 */ 435 /*ARGSUSED*/ 436 static int 437 pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *data, void *arg) 438 { 439 dtrace_aggdesc_t *aggdesc = data->dtada_desc; 440 dtrace_recdesc_t *cpu_rec, *speed_rec; 441 cpu_power_info_t *cp; 442 int32_t cpu; 443 uint64_t speed; 444 hrtime_t res; 445 int i; 446 447 if (strcmp(aggdesc->dtagd_name, "times") == 0) { 448 cpu_rec = &aggdesc->dtagd_rec[1]; 449 speed_rec = &aggdesc->dtagd_rec[2]; 450 451 /* LINTED - alignment */ 452 cpu = *(int32_t *)(data->dtada_data + cpu_rec->dtrd_offset); 453 454 /* LINTED - alignment */ 455 res = *((hrtime_t *)(data->dtada_percpu[cpu])); 456 457 /* LINTED - alignment */ 458 speed = *(uint64_t *)(data->dtada_data + 459 speed_rec->dtrd_offset); 460 461 if (speed == 0) 462 speed = max_cpufreq; 463 else 464 speed = HZ2MHZ(speed); 465 466 /* 467 * We have an aggregation record for "cpu" being at "speed" 468 * for an interval of "n" nanoseconds. The reported interval 469 * may exceed the powertop sampling interval, since we only 470 * notice during potentially infrequent firings of the 471 * "speed change" DTrace probe. In this case powertop would 472 * have already accounted for the portions of the interval 473 * that happened during prior powertop samplings, so subtract 474 * out time already accounted. 
475 */ 476 cp = &g_cpu_power_states[cpu]; 477 478 for (i = 0; i < g_npstates; i++) { 479 if (g_pstate_info[i].speed == speed) { 480 481 if (cp->time_accounted > 0 && 482 cp->speed_accounted == speed) { 483 if (res > cp->time_accounted) { 484 res -= cp->time_accounted; 485 cp->time_accounted = 0; 486 cp->speed_accounted = 0; 487 } else { 488 return (DTRACE_AGGWALK_NEXT); 489 } 490 } 491 492 g_pstate_info[i].total_time += res; 493 cp->dtrace_time += res; 494 } 495 } 496 } 497 498 return (DTRACE_AGGWALK_NEXT); 499 } 500 501 /* 502 * Checks if PM is enabled in /etc/power.conf, enabling if not 503 */ 504 void 505 pt_cpufreq_suggest(void) 506 { 507 int ret = pt_cpufreq_check_pm(); 508 509 switch (ret) { 510 case 0: 511 pt_sugg_add("Suggestion: enable CPU power management by " 512 "pressing the P key", 40, 'P', (char *)g_msg_freq_enable, 513 pt_cpufreq_enable); 514 break; 515 } 516 } 517 518 /* 519 * Checks /etc/power.conf and returns: 520 * 521 * 0 if CPUPM is not enabled 522 * 1 if there's nothing for us to do because: 523 * (a) the system does not support frequency scaling 524 * (b) there's no power.conf. 525 * 2 if CPUPM is enabled 526 * 3 if the system is running in poll-mode, as opposed to event-mode 527 * 528 * Notice the ordering of the return values, they will be picked up and 529 * switched upon ascendingly. 
530 */ 531 static int 532 pt_cpufreq_check_pm(void) 533 { 534 char line[1024]; 535 FILE *file; 536 int ret = 0; 537 538 if (g_npstates < 2 || (file = fopen(default_conf, "r")) == NULL) 539 return (1); 540 541 (void) memset(line, 0, 1024); 542 543 while (fgets(line, 1024, file)) { 544 if (strstr(line, "cpupm")) { 545 if (strstr(line, "enable")) { 546 (void) fclose(file); 547 return (2); 548 } 549 } 550 if (strstr(line, "poll")) 551 ret = 3; 552 } 553 554 (void) fclose(file); 555 556 return (ret); 557 } 558 559 /* 560 * Used as a suggestion, sets PM in /etc/power.conf and 561 * a 1sec threshold, then calls /usr/sbin/pmconfig 562 */ 563 static void 564 pt_cpufreq_enable(void) 565 { 566 (void) system(cpupm_enable); 567 (void) system(cpupm_treshold); 568 (void) system(default_pmconf); 569 570 if (pt_sugg_remove(pt_cpufreq_enable) == 0) 571 pt_error("failed to remove a %s suggestion\n", 572 g_msg_freq_state); 573 } 574