xref: /titanic_51/usr/src/cmd/powertop/common/cpufreq.c (revision 65488c97aeb108aeffd7b61db3b2b3bcb4fc9d72)
1 /*
2  * Copyright 2009, Intel Corporation
3  * Copyright 2009, Sun Microsystems, Inc
4  *
5  * This file is part of PowerTOP
6  *
7  * This program file is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU General Public License as published by the
9  * Free Software Foundation; version 2 of the License.
10  *
11  * This program is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program in a file named COPYING; if not, write to the
18  * Free Software Foundation, Inc.,
19  * 51 Franklin Street, Fifth Floor,
20  * Boston, MA 02110-1301 USA
21  *
22  * Authors:
23  *	Arjan van de Ven <arjan@linux.intel.com>
24  *	Eric C Saxe <eric.saxe@sun.com>
25  *	Aubrey Li <aubrey.li@intel.com>
26  */
27 
28 /*
29  * GPL Disclaimer
30  *
31  * For the avoidance of doubt, except that if any license choice other
32  * than GPL or LGPL is available it will apply instead, Sun elects to
33  * use only the General Public License version 2 (GPLv2) at this time
34  * for any software where a choice of GPL license versions is made
35  * available with the language indicating that GPLv2 or any later
36  * version may be used, or where a choice of which version of the GPL
37  * is applied is otherwise unspecified.
38  */
39 
40 #include <stdlib.h>
41 #include <string.h>
42 #include <dtrace.h>
43 #include <kstat.h>
44 #include <errno.h>
45 #include "powertop.h"
46 
/*
 * Convert a frequency expressed in Hz to MHz by dividing by MICROSEC
 * (assumes MICROSEC is 1,000,000, as provided by the system headers --
 * TODO confirm).
 */
#define	HZ2MHZ(speed)	((speed) / MICROSEC)
/* Number of slots allocated for the DTrace macro-argument vector dtp_argv */
#define	DTP_ARG_COUNT	2
/* Size in bytes of each string buffer stored in dtp_argv */
#define	DTP_ARG_LENGTH	5

/*
 * Highest speed seen while enumerating the supported frequencies; used by
 * the aggregation walker as the speed to assume when a record reports 0.
 */
static uint64_t		max_cpufreq = 0;
/* libdtrace handle obtained in pt_cpufreq_stat_prepare() */
static dtrace_hdl_t	*dtp;
/* Macro arguments handed to dtrace_program_strcompile() */
static char		**dtp_argv;
54 
/*
 * Strings used to check for, and enable, CPU power management through
 * /etc/power.conf:
 *
 *   default_conf    file scanned by pt_cpufreq_check_pm()
 *   default_pmconf  command run by pt_cpufreq_enable() after editing the file
 *   cpupm_enable    shell command appending "cpupm enable" to the file
 *   cpupm_treshold  shell command appending "cpu-threshold 1s" to the file
 *
 * See pt_cpufreq_suggest() and pt_cpufreq_enable().
 */
static char default_conf[]	= "/etc/power.conf";
static char default_pmconf[]	= "/usr/sbin/pmconfig";
static char cpupm_enable[]	= "echo cpupm enable >> /etc/power.conf";
static char cpupm_treshold[]	= "echo cpu-threshold 1s >> /etc/power.conf";
63 
/*
 * Buffer containing the DTrace program used to track CPU frequency
 * transitions on all observed CPUs (default mode).
 *
 * The "last" array is sized by macro argument $0, which is expected to be
 * dtp_argv[0] (the string form of g_ncpus_observed, see pt_cpufreq_setup())
 * -- NOTE(review): relies on libdtrace substituting argv[0] for $0.
 *
 * On every cpu-change-speed firing, arg0 is treated as the CPU id and arg1
 * as the speed the CPU is leaving. The time spent at that old speed is
 * summed into @times[cpu, oldspeed]:
 *   - if a previous transition was seen on this CPU (last[cpu] != 0), the
 *     elapsed time is measured from that transition;
 *   - otherwise it is measured from the BEGIN timestamp.
 * last[cpu] is then updated to the current timestamp.
 */
static const char *dtp_cpufreq =
"hrtime_t last[$0];"
""
"BEGIN"
"{"
"	begin = timestamp;"
"}"
""
":::cpu-change-speed"
"/last[(processorid_t)arg0] != 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint64_t)arg1;"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);"
"	last[this->cpu] = timestamp;"
"}"
":::cpu-change-speed"
"/last[(processorid_t)arg0] == 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint64_t)arg1;"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
"	last[this->cpu] = timestamp;"
"}";
91 
/*
 * Same as dtp_cpufreq, but only for a specific CPU (selected when
 * PT_ON_CPU is true in pt_cpufreq_stat_prepare()).
 *
 * Both clauses are restricted to firings whose arg0 equals macro argument
 * $1, which pt_cpufreq_setup() fills with g_observed_cpu. Because only one
 * CPU is tracked, "last" is a scalar rather than a per-CPU array. The
 * accounting into @times[cpu, oldspeed] is otherwise identical: elapsed
 * time since the previous transition, or since BEGIN for the first one.
 */
static const char *dtp_cpufreq_c =
"hrtime_t last;"
""
"BEGIN"
"{"
"	begin = timestamp;"
"}"
""
":::cpu-change-speed"
"/(processorid_t)arg0 == $1 &&"
" last != 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint64_t)arg1;"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - last);"
"	last = timestamp;"
"}"
":::cpu-change-speed"
"/(processorid_t)arg0 == $1 &&"
" last == 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint64_t)arg1;"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
"	last = timestamp;"
"}";
121 
/*
 * Forward declarations of the file-local helpers defined below.
 */
static int	pt_cpufreq_setup(void);
static int	pt_cpufreq_snapshot(void);
static int	pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *, void *);
static void	pt_cpufreq_stat_account(double, uint_t);
static int	pt_cpufreq_snapshot_cpu(kstat_ctl_t *, uint_t);
static int	pt_cpufreq_check_pm(void);
static void	pt_cpufreq_enable(void);
129 
130 static int
131 pt_cpufreq_setup(void)
132 {
133 	if ((dtp_argv = malloc(sizeof (char *) * DTP_ARG_COUNT)) == NULL)
134 		return (EXIT_FAILURE);
135 
136 	if ((dtp_argv[0] = malloc(sizeof (char) * DTP_ARG_LENGTH)) == NULL) {
137 		free(dtp_argv);
138 		return (EXIT_FAILURE);
139 	}
140 
141 	(void) snprintf(dtp_argv[0], 5, "%d\0", g_ncpus_observed);
142 
143 	if (PT_ON_CPU) {
144 		if ((dtp_argv[1] = malloc(sizeof (char) * DTP_ARG_LENGTH))
145 		    == NULL) {
146 			free(dtp_argv[0]);
147 			free(dtp_argv);
148 			return (EXIT_FAILURE);
149 		}
150 		(void) snprintf(dtp_argv[1], 5, "%d\0", g_observed_cpu);
151 	}
152 
153 	return (0);
154 }
155 
156 /*
157  * Perform setup necessary to enumerate and track CPU speed changes
158  */
159 int
160 pt_cpufreq_stat_prepare(void)
161 {
162 	dtrace_prog_t 		*prog;
163 	dtrace_proginfo_t 	info;
164 	dtrace_optval_t 	statustime;
165 	kstat_ctl_t 		*kc;
166 	kstat_t 		*ksp;
167 	kstat_named_t 		*knp;
168 	freq_state_info_t 	*state;
169 	char 			*s, *token, *prog_ptr;
170 	int 			err;
171 
172 	if ((err = pt_cpufreq_setup()) != 0) {
173 		pt_error("%s : failed to setup", __FILE__);
174 		return (errno);
175 	}
176 
177 	state = g_pstate_info;
178 	if ((g_cpu_power_states = calloc((size_t)g_ncpus,
179 	    sizeof (cpu_power_info_t))) == NULL)
180 		return (-1);
181 
182 	/*
183 	 * Enumerate the CPU frequencies
184 	 */
185 	if ((kc = kstat_open()) == NULL)
186 		return (errno);
187 
188 	ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[g_observed_cpu], NULL);
189 
190 	if (ksp == NULL) {
191 		err = errno;
192 		(void) kstat_close(kc);
193 		return (err);
194 	}
195 
196 	(void) kstat_read(kc, ksp, NULL);
197 
198 	knp = kstat_data_lookup(ksp, "supported_frequencies_Hz");
199 	s = knp->value.str.addr.ptr;
200 
201 	g_npstates = 0;
202 
203 	for (token = strtok(s, ":"), s = NULL;
204 	    NULL != token && g_npstates < NSTATES;
205 	    token = strtok(NULL, ":")) {
206 
207 		state->speed = HZ2MHZ(atoll(token));
208 
209 		if (state->speed > max_cpufreq)
210 			max_cpufreq = state->speed;
211 
212 		state->total_time = (uint64_t)0;
213 
214 		g_npstates++;
215 		state++;
216 	}
217 
218 	if (token != NULL)
219 		pt_error("%s : exceeded NSTATES\n", __FILE__);
220 
221 	(void) kstat_close(kc);
222 
223 	/*
224 	 * Return if speed transition is not supported
225 	 */
226 	if (g_npstates < 2)
227 		return (-1);
228 
229 	/*
230 	 * Setup DTrace to look for CPU frequency changes
231 	 */
232 	if ((dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
233 		pt_error("%s : cannot open dtrace library: %s\n", __FILE__,
234 		    dtrace_errmsg(NULL, err));
235 		return (-2);
236 	}
237 
238 	/*
239 	 * Execute different scripts (defined above) depending on
240 	 * user specified options. Default mode uses dtp_cpufreq.
241 	 */
242 	if (PT_ON_CPU)
243 		prog_ptr = (char *)dtp_cpufreq_c;
244 	else
245 		prog_ptr = (char *)dtp_cpufreq;
246 
247 	if ((prog = dtrace_program_strcompile(dtp, prog_ptr,
248 	    DTRACE_PROBESPEC_NAME, 0, (1 + g_argc), dtp_argv)) == NULL) {
249 		pt_error("%s : cpu-change-speed probe unavailable\n", __FILE__);
250 		return (dtrace_errno(dtp));
251 	}
252 
253 	if (dtrace_program_exec(dtp, prog, &info) == -1) {
254 		pt_error("%s : failed to enable speed probe\n", __FILE__);
255 		return (dtrace_errno(dtp));
256 	}
257 
258 	if (dtrace_setopt(dtp, "aggsize", "128k") == -1) {
259 		pt_error("%s : failed to set speed 'aggsize'\n", __FILE__);
260 	}
261 
262 	if (dtrace_setopt(dtp, "aggrate", "0") == -1) {
263 		pt_error("%s : failed to set speed 'aggrate'\n", __FILE__);
264 	}
265 
266 	if (dtrace_setopt(dtp, "aggpercpu", 0) == -1) {
267 		pt_error("%s : failed to set speed 'aggpercpu'\n", __FILE__);
268 	}
269 
270 	if (dtrace_go(dtp) != 0) {
271 		pt_error("%s : failed to start speed observation", __FILE__);
272 		return (dtrace_errno(dtp));
273 	}
274 
275 	if (dtrace_getopt(dtp, "statusrate", &statustime) == -1) {
276 		pt_error("%s : failed to get speed 'statusrate'\n", __FILE__);
277 		return (dtrace_errno(dtp));
278 	}
279 
280 	return (0);
281 }
282 
283 /*
284  * The DTrace probes have already been enabled, and are tracking
285  * CPU speed transitions. Take a snapshot of the aggregations, and
286  * look for any CPUs that have made a speed transition over the last
287  * sampling interval. Note that the aggregations may be empty if no
288  * speed transitions took place over the last interval. In that case,
289  * notate that we have already accounted for the time, so that when
290  * we do encounter a speed transition in a future sampling interval
291  * we can subtract that time back out.
292  */
293 int
294 pt_cpufreq_stat_collect(double interval)
295 {
296 	int i, ret;
297 
298 	/*
299 	 * Zero out the interval time reported by DTrace for
300 	 * this interval
301 	 */
302 	for (i = 0; i < g_npstates; i++)
303 		g_pstate_info[i].total_time = 0;
304 
305 	for (i = 0; i < g_ncpus; i++)
306 		g_cpu_power_states[i].dtrace_time = 0;
307 
308 	if (dtrace_status(dtp) == -1)
309 		return (-1);
310 
311 	if (dtrace_aggregate_snap(dtp) != 0)
312 		pt_error("%s : failed to add to stats aggregation", __FILE__);
313 
314 	if (dtrace_aggregate_walk_keyvarsorted(dtp, pt_cpufreq_dtrace_walk,
315 	    NULL) != 0)
316 		pt_error("%s : failed to sort stats aggregation", __FILE__);
317 
318 	dtrace_aggregate_clear(dtp);
319 
320 	if ((ret = pt_cpufreq_snapshot()) != 0) {
321 		pt_error("%s : failed to add to stats aggregation", __FILE__);
322 		return (ret);
323 	}
324 
325 	switch (g_op_mode) {
326 	case PT_MODE_CPU:
327 		pt_cpufreq_stat_account(interval, g_observed_cpu);
328 		break;
329 	case PT_MODE_DEFAULT:
330 	default:
331 		for (i = 0; i < g_ncpus_observed; i++)
332 			pt_cpufreq_stat_account(interval, i);
333 		break;
334 	}
335 
336 	return (0);
337 }
338 
339 static void
340 pt_cpufreq_stat_account(double interval, uint_t cpu)
341 {
342 	cpu_power_info_t 	*cpu_pow;
343 	uint64_t 		speed;
344 	hrtime_t 		duration;
345 	int			i;
346 
347 	cpu_pow = &g_cpu_power_states[cpu];
348 	speed = cpu_pow->current_pstate;
349 
350 	duration = (hrtime_t)(interval * NANOSEC) - cpu_pow->dtrace_time;
351 
352 	/*
353 	 * 'duration' may be a negative value when we're using or forcing a
354 	 * small interval, and the amount of time already accounted ends up
355 	 * being larger than the the former.
356 	 */
357 	if (duration < 0)
358 		return;
359 
360 	for (i = 0; i < g_npstates; i++) {
361 		if (g_pstate_info[i].speed == speed) {
362 			g_pstate_info[i].total_time += duration;
363 			cpu_pow->time_accounted += duration;
364 			cpu_pow->speed_accounted = speed;
365 		}
366 	}
367 }
368 
369 /*
370  * Take a snapshot of each CPU's speed by looking through the cpu_info kstats.
371  */
372 static int
373 pt_cpufreq_snapshot(void)
374 {
375 	kstat_ctl_t 	*kc;
376 	int 		ret;
377 	uint_t		i;
378 
379 	if ((kc = kstat_open()) == NULL)
380 		return (errno);
381 
382 	switch (g_op_mode) {
383 	case PT_MODE_CPU:
384 		ret = pt_cpufreq_snapshot_cpu(kc, g_observed_cpu);
385 		break;
386 	case PT_MODE_DEFAULT:
387 	default:
388 		for (i = 0; i < g_ncpus_observed; i++)
389 			if ((ret = pt_cpufreq_snapshot_cpu(kc, i)) != 0)
390 				break;
391 		break;
392 	}
393 
394 	if (kstat_close(kc) != 0)
395 		pt_error("%s : couldn't close kstat\n", __FILE__);
396 
397 	return (ret);
398 }
399 
400 static int
401 pt_cpufreq_snapshot_cpu(kstat_ctl_t *kc, uint_t cpu)
402 {
403 	kstat_t 		*ksp;
404 	kstat_named_t 		*knp;
405 
406 	ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[cpu], NULL);
407 	if (ksp == NULL) {
408 		pt_error("%s : couldn't find cpu_info kstat for CPU "
409 		"%d\n", __FILE__, cpu);
410 		return (1);
411 	}
412 
413 	if (kstat_read(kc, ksp, NULL) == -1) {
414 		pt_error("%s : couldn't read cpu_info kstat for "
415 		    "CPU %d\n", __FILE__, cpu);
416 		return (2);
417 	}
418 
419 	knp = kstat_data_lookup(ksp, "current_clock_Hz");
420 	if (knp == NULL) {
421 		pt_error("%s : couldn't find current_clock_Hz "
422 		    "kstat for CPU %d\n", __FILE__, cpu);
423 		return (3);
424 	}
425 
426 	g_cpu_power_states[cpu].current_pstate = HZ2MHZ(knp->value.ui64);
427 
428 	return (0);
429 }
430 
431 /*
432  * DTrace aggregation walker that sorts through a snapshot of the
433  * aggregation data collected during firings of the cpu-change-speed
434  * probe.
435  */
436 /*ARGSUSED*/
437 static int
438 pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *data, void *arg)
439 {
440 	dtrace_aggdesc_t 	*aggdesc = data->dtada_desc;
441 	dtrace_recdesc_t 	*cpu_rec, *speed_rec;
442 	cpu_power_info_t 	*cp;
443 	int32_t 		cpu;
444 	uint64_t 		speed;
445 	hrtime_t 		res;
446 	int 			i;
447 
448 	if (strcmp(aggdesc->dtagd_name, "times") == 0) {
449 		cpu_rec = &aggdesc->dtagd_rec[1];
450 		speed_rec = &aggdesc->dtagd_rec[2];
451 
452 		/* LINTED - alignment */
453 		cpu = *(int32_t *)(data->dtada_data + cpu_rec->dtrd_offset);
454 
455 		/* LINTED - alignment */
456 		res = *((hrtime_t *)(data->dtada_percpu[cpu]));
457 
458 		/* LINTED - alignment */
459 		speed = *(uint64_t *)(data->dtada_data +
460 		    speed_rec->dtrd_offset);
461 
462 		if (speed == 0)
463 			speed = max_cpufreq;
464 		else
465 			speed = HZ2MHZ(speed);
466 
467 		/*
468 		 * We have an aggregation record for "cpu" being at "speed"
469 		 * for an interval of "n" nanoseconds. The reported interval
470 		 * may exceed the powertop sampling interval, since we only
471 		 * notice during potentially infrequent firings of the
472 		 * "speed change" DTrace probe. In this case powertop would
473 		 * have already accounted for the portions of the interval
474 		 * that happened during prior powertop samplings, so subtract
475 		 * out time already accounted.
476 		 */
477 		cp = &g_cpu_power_states[cpu];
478 
479 		for (i = 0; i < g_npstates; i++) {
480 			if (g_pstate_info[i].speed == speed) {
481 
482 				if (cp->time_accounted > 0 &&
483 				    cp->speed_accounted == speed) {
484 					if (res > cp->time_accounted) {
485 						res -= cp->time_accounted;
486 						cp->time_accounted = 0;
487 						cp->speed_accounted = 0;
488 					} else {
489 						return (DTRACE_AGGWALK_NEXT);
490 					}
491 				}
492 
493 				g_pstate_info[i].total_time += res;
494 				cp->dtrace_time += res;
495 			}
496 		}
497 	}
498 
499 	return (DTRACE_AGGWALK_NEXT);
500 }
501 
502 /*
503  * Checks if PM is enabled in /etc/power.conf, enabling if not
504  */
505 void
506 pt_cpufreq_suggest(void)
507 {
508 	int ret = pt_cpufreq_check_pm();
509 
510 	switch (ret) {
511 	case 0:
512 		pt_sugg_add("Suggestion: enable CPU power management by "
513 		    "pressing the P key", 40, 'P', (char *)g_msg_freq_enable,
514 		    pt_cpufreq_enable);
515 		break;
516 	}
517 }
518 
519 /*
520  * Checks /etc/power.conf and returns:
521  *
522  *     0 if CPUPM is not enabled
523  *     1 if there's nothing for us to do because:
524  *         (a) the system does not support frequency scaling
525  *         (b) there's no power.conf.
526  *     2 if CPUPM is enabled
527  *     3 if the system is running in poll-mode, as opposed to event-mode
528  *
529  * Notice the ordering of the return values, they will be picked up and
530  * switched upon ascendingly.
531  */
532 static int
533 pt_cpufreq_check_pm(void)
534 {
535 	char line[1024];
536 	FILE *file;
537 	int ret = 0;
538 
539 	if (g_npstates < 2 || (file = fopen(default_conf, "r")) == NULL)
540 		return (1);
541 
542 	(void) memset(line, 0, 1024);
543 
544 	while (fgets(line, 1024, file)) {
545 		if (strstr(line, "cpupm")) {
546 			if (strstr(line, "enable")) {
547 				(void) fclose(file);
548 				return (2);
549 			}
550 		}
551 		if (strstr(line, "poll"))
552 			ret = 3;
553 	}
554 
555 	(void) fclose(file);
556 
557 	return (ret);
558 }
559 
560 /*
561  * Used as a suggestion, sets PM in /etc/power.conf and
562  * a 1sec threshold, then calls /usr/sbin/pmconfig
563  */
564 static void
565 pt_cpufreq_enable(void)
566 {
567 	(void) system(cpupm_enable);
568 	(void) system(cpupm_treshold);
569 	(void) system(default_pmconf);
570 
571 	if (pt_sugg_remove(pt_cpufreq_enable) == 0)
572 		pt_error("%s : failed to remove a sugg.\n", __FILE__);
573 }
574