xref: /titanic_50/usr/src/cmd/powertop/common/cpufreq.c (revision 47ab0c7c6702159d8bb84e3b1533d9f9843dd568)
1 /*
2  * Copyright 2009, Intel Corporation
3  * Copyright 2009, Sun Microsystems, Inc
4  *
5  * This file is part of PowerTOP
6  *
7  * This program file is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU General Public License as published by the
9  * Free Software Foundation; version 2 of the License.
10  *
11  * This program is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program in a file named COPYING; if not, write to the
18  * Free Software Foundation, Inc.,
19  * 51 Franklin Street, Fifth Floor,
20  * Boston, MA 02110-1301 USA
21  *
22  * Authors:
23  *	Arjan van de Ven <arjan@linux.intel.com>
24  *	Eric C Saxe <eric.saxe@sun.com>
25  *	Aubrey Li <aubrey.li@intel.com>
26  */
27 
28 /*
29  * GPL Disclaimer
30  *
31  * For the avoidance of doubt, except that if any license choice other
32  * than GPL or LGPL is available it will apply instead, Sun elects to
33  * use only the General Public License version 2 (GPLv2) at this time
34  * for any software where a choice of GPL license versions is made
35  * available with the language indicating that GPLv2 or any later
36  * version may be used, or where a choice of which version of the GPL
37  * is applied is otherwise unspecified.
38  */
39 
40 #include <stdlib.h>
41 #include <string.h>
42 #include <dtrace.h>
43 #include <kstat.h>
44 #include <errno.h>
45 #include "powertop.h"
46 
/* Convert a frequency expressed in Hz to MHz (integer division). */
#define	HZ2MHZ(speed)	((speed) / 1000000)
/* Number of pointer slots allocated for dtp_argv in pt_cpufreq_setup(). */
#define	DTP_ARG_COUNT	2
/* Size in bytes of each string buffer allocated for a dtp_argv entry. */
#define	DTP_ARG_LENGTH	5

/*
 * Largest speed (in MHz) found while enumerating the supported
 * frequencies; substituted for a reported speed of 0 in the DTrace walker.
 */
static uint64_t		max_cpufreq = 0;
/* DTrace handle opened by pt_cpufreq_stat_prepare(). */
static dtrace_hdl_t	*dtp;
/* Argument vector passed to dtrace_program_strcompile(). */
static char		**dtp_argv;
54 
/*
 * Strings used to enable CPU power management through /etc/power.conf.
 *
 * suggest_p_state() opens default_conf and looks for a "cpupm ... enable"
 * line; enable_p_state() runs cpupm_enable, cpupm_treshold and
 * default_pmconf, in that order, through system().
 *
 * NOTE(review): "cpupm_treshold" is a misspelling of "threshold"; the name
 * is kept because enable_p_state() refers to it.
 */
static char default_conf[]	= "/etc/power.conf";
static char default_pmconf[]	= "/usr/sbin/pmconfig";
static char cpupm_enable[]	= " echo cpupm enable >> /etc/power.conf";
static char cpupm_treshold[]	= " echo cpu-threshold 1s >> /etc/power.conf";
63 
/*
 * Buffer containing the DTrace program that tracks CPU frequency
 * transitions on every CPU.
 *
 * "last" is an array sized by macro argument $0 (pt_cpufreq_setup() stores
 * g_ncpus_observed in dtp_argv[0]) holding, per CPU id, the timestamp of
 * the previous cpu-change-speed firing. "begin" records the timestamp at
 * which tracing started.
 *
 * On every cpu-change-speed firing the program adds to
 * @times[cpu, speed-in-MHz] the time elapsed since the previous firing on
 * that CPU, or since "begin" if this is the first firing seen for it, and
 * then updates last[cpu]. arg0 is used as the CPU id and arg1 / 1000000 as
 * the speed (presumably the speed being left, given the "oldspeed" name --
 * confirm against the probe definition).
 */
static const char *dtp_cpufreq =
"hrtime_t last[$0];"
""
"BEGIN"
"{"
"	begin = timestamp;"
"}"
""
":::cpu-change-speed"
"/last[(processorid_t)arg0] != 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint32_t)(arg1/1000000);"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);"
"	last[this->cpu] = timestamp;"
"}"
":::cpu-change-speed"
"/last[(processorid_t)arg0] == 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint32_t)(arg1/1000000);"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
"	last[this->cpu] = timestamp;"
"}";
91 
/*
 * Same accounting as dtp_cpufreq, but restricted to a single CPU.
 *
 * Both clauses only fire when arg0 equals macro argument $1
 * (pt_cpufreq_setup() stores g_observed_cpu in dtp_argv[1] when
 * PTOP_ON_CPU is set), so "last" is a single scalar timestamp instead of a
 * per-CPU array. The first firing is measured from "begin"; later firings
 * are measured from "last".
 */
static const char *dtp_cpufreq_c =
"hrtime_t last;"
""
"BEGIN"
"{"
"	begin = timestamp;"
"}"
""
":::cpu-change-speed"
"/(processorid_t)arg0 == $1 &&"
" last != 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint32_t)(arg1/1000000);"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - last);"
"	last = timestamp;"
"}"
":::cpu-change-speed"
"/(processorid_t)arg0 == $1 &&"
" last == 0/"
"{"
"	this->cpu = (processorid_t)arg0;"
"	this->oldspeed = (uint32_t)(arg1/1000000);"
"	@times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
"	last = timestamp;"
"}";
121 
122 static int	pt_cpufreq_setup(void);
123 static int	pt_cpufreq_snapshot(void);
124 static int	pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *, void *);
125 static void	pt_cpufreq_stat_account(double, uint_t);
126 static int	pt_cpufreq_snapshot_cpu(kstat_ctl_t *,
127     uint_t);
128 
129 static int
130 pt_cpufreq_setup(void)
131 {
132 	if ((dtp_argv = malloc(sizeof (char *) * DTP_ARG_COUNT)) == NULL)
133 		return (EXIT_FAILURE);
134 
135 	if ((dtp_argv[0] = malloc(sizeof (char) * DTP_ARG_LENGTH)) == NULL) {
136 		free(dtp_argv);
137 		return (EXIT_FAILURE);
138 	}
139 
140 	(void) snprintf(dtp_argv[0], 5, "%d\0", g_ncpus_observed);
141 
142 	if (PTOP_ON_CPU) {
143 		if ((dtp_argv[1] = malloc(sizeof (char) * DTP_ARG_LENGTH))
144 		    == NULL) {
145 			free(dtp_argv[0]);
146 			free(dtp_argv);
147 			return (EXIT_FAILURE);
148 		}
149 		(void) snprintf(dtp_argv[1], 5, "%d\0", g_observed_cpu);
150 	}
151 
152 	return (0);
153 }
154 
155 /*
156  * Perform setup necessary to enumerate and track CPU speed changes
157  */
158 int
159 pt_cpufreq_stat_prepare(void)
160 {
161 	dtrace_prog_t 		*prog;
162 	dtrace_proginfo_t 	info;
163 	dtrace_optval_t 	statustime;
164 	kstat_ctl_t 		*kc;
165 	kstat_t 		*ksp;
166 	kstat_named_t 		*knp;
167 	freq_state_info_t 	*state;
168 	char 			*s, *token, *prog_ptr;
169 	int 			err;
170 
171 	if ((err = pt_cpufreq_setup()) != 0) {
172 		pt_error("%s : failed to setup", __FILE__);
173 		return (errno);
174 	}
175 
176 	state = g_pstate_info;
177 	if ((g_cpu_power_states = calloc((size_t)g_ncpus,
178 	    sizeof (cpu_power_info_t))) == NULL)
179 		return (-1);
180 
181 	/*
182 	 * Enumerate the CPU frequencies
183 	 */
184 	if ((kc = kstat_open()) == NULL)
185 		return (errno);
186 
187 	ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[g_observed_cpu], NULL);
188 
189 	if (ksp == NULL) {
190 		err = errno;
191 		(void) kstat_close(kc);
192 		return (err);
193 	}
194 
195 	(void) kstat_read(kc, ksp, NULL);
196 
197 	knp = kstat_data_lookup(ksp, "supported_frequencies_Hz");
198 	s = knp->value.str.addr.ptr;
199 
200 	g_npstates = 0;
201 
202 	for (token = strtok(s, ":"), s = NULL;
203 	    NULL != token && g_npstates < NSTATES;
204 	    token = strtok(NULL, ":")) {
205 
206 		state->speed = HZ2MHZ(atoll(token));
207 
208 		if (state->speed > max_cpufreq)
209 			max_cpufreq = state->speed;
210 
211 		state->total_time = (uint64_t)0;
212 
213 		g_npstates++;
214 		state++;
215 	}
216 
217 	if (token != NULL)
218 		pt_error("%s : exceeded NSTATES\n", __FILE__);
219 
220 	(void) kstat_close(kc);
221 
222 	/*
223 	 * Return if speed transition is not supported
224 	 */
225 	if (g_npstates < 2)
226 		return (-1);
227 
228 	/*
229 	 * Setup DTrace to look for CPU frequency changes
230 	 */
231 	if ((dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
232 		pt_error("%s : cannot open dtrace library: %s\n", __FILE__,
233 		    dtrace_errmsg(NULL, err));
234 		return (-2);
235 	}
236 
237 	/*
238 	 * Execute different scripts (defined above) depending on
239 	 * user specified options. Default mode uses dtp_cpufreq.
240 	 */
241 	if (PTOP_ON_CPU)
242 		prog_ptr = (char *)dtp_cpufreq_c;
243 	else
244 		prog_ptr = (char *)dtp_cpufreq;
245 
246 	if ((prog = dtrace_program_strcompile(dtp, prog_ptr,
247 	    DTRACE_PROBESPEC_NAME, 0, (1 + g_argc), dtp_argv)) == NULL) {
248 		pt_error("%s : cpu-change-speed probe unavailable\n", __FILE__);
249 		return (dtrace_errno(dtp));
250 	}
251 
252 	if (dtrace_program_exec(dtp, prog, &info) == -1) {
253 		pt_error("%s : failed to enable speed probe\n", __FILE__);
254 		return (dtrace_errno(dtp));
255 	}
256 
257 	if (dtrace_setopt(dtp, "aggsize", "128k") == -1) {
258 		pt_error("%s : failed to set speed 'aggsize'\n", __FILE__);
259 	}
260 
261 	if (dtrace_setopt(dtp, "aggrate", "0") == -1) {
262 		pt_error("%s : failed to set speed 'aggrate'\n", __FILE__);
263 	}
264 
265 	if (dtrace_setopt(dtp, "aggpercpu", 0) == -1) {
266 		pt_error("%s : failed to set speed 'aggpercpu'\n", __FILE__);
267 	}
268 
269 	if (dtrace_go(dtp) != 0) {
270 		pt_error("%s : failed to start speed observation", __FILE__);
271 		return (dtrace_errno(dtp));
272 	}
273 
274 	if (dtrace_getopt(dtp, "statusrate", &statustime) == -1) {
275 		pt_error("%s : failed to get speed 'statusrate'\n", __FILE__);
276 		return (dtrace_errno(dtp));
277 	}
278 
279 	return (0);
280 }
281 
282 /*
283  * The DTrace probes have already been enabled, and are tracking
284  * CPU speed transitions. Take a snapshot of the aggregations, and
285  * look for any CPUs that have made a speed transition over the last
286  * sampling interval. Note that the aggregations may be empty if no
287  * speed transitions took place over the last interval. In that case,
288  * notate that we have already accounted for the time, so that when
289  * we do encounter a speed transition in a future sampling interval
290  * we can subtract that time back out.
291  */
292 int
293 pt_cpufreq_stat_collect(double interval)
294 {
295 	int	i, ret;
296 
297 	/*
298 	 * Zero out the interval time reported by DTrace for
299 	 * this interval
300 	 */
301 	for (i = 0; i < g_npstates; i++)
302 		g_pstate_info[i].total_time = 0;
303 
304 	for (i = 0; i < g_ncpus; i++)
305 		g_cpu_power_states[i].dtrace_time = 0;
306 
307 	if (dtrace_status(dtp) == -1)
308 		return (-1);
309 
310 	if (dtrace_aggregate_snap(dtp) != 0)
311 		pt_error("%s : failed to add to stats aggregation", __FILE__);
312 
313 	if (dtrace_aggregate_walk_keyvarsorted(dtp, pt_cpufreq_dtrace_walk,
314 	    NULL) != 0)
315 		pt_error("%s : failed to sort stats aggregation", __FILE__);
316 
317 	dtrace_aggregate_clear(dtp);
318 
319 	if ((ret = pt_cpufreq_snapshot()) != 0) {
320 		pt_error("%s : failed to add to stats aggregation", __FILE__);
321 		return (ret);
322 	}
323 
324 	switch (g_op_mode) {
325 	case PTOP_MODE_CPU:
326 		pt_cpufreq_stat_account(interval, g_observed_cpu);
327 		break;
328 	case PTOP_MODE_DEFAULT:
329 	default:
330 		for (i = 0; i < g_ncpus_observed; i++)
331 			pt_cpufreq_stat_account(interval, i);
332 		break;
333 	}
334 
335 	return (0);
336 }
337 
338 static void
339 pt_cpufreq_stat_account(double interval, uint_t cpu)
340 {
341 	uint64_t 		speed;
342 	hrtime_t 		duration;
343 	cpu_power_info_t 	*cpu_pow;
344 	int			i;
345 
346 	cpu_pow = &g_cpu_power_states[cpu];
347 	speed = cpu_pow->current_pstate;
348 
349 	duration = (hrtime_t)((interval * NANOSEC)) - cpu_pow->dtrace_time;
350 
351 	for (i = 0; i < g_npstates; i++) {
352 		if (g_pstate_info[i].speed == speed) {
353 			g_pstate_info[i].total_time += duration;
354 			cpu_pow->time_accounted += duration;
355 		}
356 	}
357 }
358 
359 /*
360  * Take a snapshot of each CPU's speed by looking through the cpu_info kstats.
361  */
362 static int
363 pt_cpufreq_snapshot(void)
364 {
365 	kstat_ctl_t 		*kc;
366 	int 			ret;
367 	uint_t			i;
368 
369 	if ((kc = kstat_open()) == NULL)
370 		return (errno);
371 
372 	switch (g_op_mode) {
373 	case PTOP_MODE_CPU:
374 		ret = pt_cpufreq_snapshot_cpu(kc, g_observed_cpu);
375 		break;
376 	case PTOP_MODE_DEFAULT:
377 	default:
378 		for (i = 0; i < g_ncpus_observed; i++)
379 			if ((ret = pt_cpufreq_snapshot_cpu(kc, i)) != 0)
380 				break;
381 		break;
382 	}
383 
384 	if (kstat_close(kc) != 0)
385 		pt_error("%s : couldn't close kstat\n", __FILE__);
386 
387 	return (ret);
388 }
389 
390 static int
391 pt_cpufreq_snapshot_cpu(kstat_ctl_t *kc, uint_t cpu)
392 {
393 	kstat_t 		*ksp;
394 	kstat_named_t 		*knp;
395 
396 	ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[cpu], NULL);
397 	if (ksp == NULL) {
398 		pt_error("%s : couldn't find cpu_info kstat for CPU "
399 		"%d\n", __FILE__, cpu);
400 		return (1);
401 	}
402 
403 	if (kstat_read(kc, ksp, NULL) == -1) {
404 		pt_error("%s : couldn't read cpu_info kstat for "
405 		    "CPU %d\n", __FILE__, cpu);
406 		return (2);
407 	}
408 
409 	knp = kstat_data_lookup(ksp, "current_clock_Hz");
410 	if (knp == NULL) {
411 		pt_error("%s : couldn't find current_clock_Hz "
412 		    "kstat for CPU %d\n", __FILE__, cpu);
413 		return (3);
414 	}
415 
416 	g_cpu_power_states[cpu].current_pstate = HZ2MHZ(knp->value.ui64);
417 
418 	return (0);
419 }
420 
421 /*
422  * DTrace aggregation walker that sorts through a snapshot of the
423  * aggregation data collected during firings of the cpu-change-speed
424  * probe.
425  */
426 /*ARGSUSED*/
427 static int
428 pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *data, void *arg)
429 {
430 	dtrace_aggdesc_t 	*aggdesc = data->dtada_desc;
431 	dtrace_recdesc_t 	*cpu_rec, *speed_rec;
432 	cpu_power_info_t 	*cpu_pow;
433 	int32_t 		cpu;
434 	uint64_t 		speed;
435 	hrtime_t 		dt_state_time = 0;
436 	int 			i;
437 
438 	if (strcmp(aggdesc->dtagd_name, "times") == 0) {
439 		cpu_rec = &aggdesc->dtagd_rec[1];
440 		speed_rec = &aggdesc->dtagd_rec[2];
441 
442 		for (i = 0; i < g_ncpus; i++) {
443 			/* LINTED - alignment */
444 			dt_state_time += *((hrtime_t *)(data->dtada_percpu[i]));
445 		}
446 
447 		/* LINTED - alignment */
448 		cpu = *(int32_t *)(data->dtada_data + cpu_rec->dtrd_offset);
449 		/* LINTED - alignment */
450 		speed = *(uint64_t *)(data->dtada_data +
451 		    speed_rec->dtrd_offset);
452 
453 		if (speed == 0) {
454 			speed = max_cpufreq;
455 		}
456 
457 		/*
458 		 * We have an aggregation record for "cpu" being at "speed"
459 		 * for an interval of "n" nanoseconds. The reported interval
460 		 * may exceed the powertop sampling interval, since we only
461 		 * notice during potentially infrequent firings of the
462 		 * "speed change" DTrace probe. In this case powertop would
463 		 * have already accounted for the portions of the interval
464 		 * that happened during prior powertop samplings, so subtract
465 		 * out time already accounted.
466 		 */
467 		cpu_pow = &g_cpu_power_states[cpu];
468 
469 		for (i = 0; i < g_npstates; i++) {
470 			if (g_pstate_info[i].speed == speed) {
471 				if (cpu_pow->time_accounted > 0) {
472 					if (dt_state_time == 0)
473 						continue;
474 					if (dt_state_time >
475 					    cpu_pow->time_accounted) {
476 						dt_state_time -=
477 						    cpu_pow->time_accounted;
478 						cpu_pow->time_accounted = 0;
479 					}
480 				}
481 				g_pstate_info[i].total_time += dt_state_time;
482 				cpu_pow->dtrace_time += dt_state_time;
483 			}
484 		}
485 	}
486 	return (DTRACE_AGGWALK_NEXT);
487 }
488 
489 /*
490  * Used as a suggestion, sets PM in /etc/power.conf and
491  * a 1sec threshold, then calls /usr/sbin/pmconfig
492  */
493 void
494 enable_p_state(void)
495 {
496 	(void) system(cpupm_enable);
497 	(void) system(cpupm_treshold);
498 	(void) system(default_pmconf);
499 }
500 
501 /*
502  * Checks if PM is enabled in /etc/power.conf, enabling if not
503  */
504 void
505 suggest_p_state(void)
506 {
507 	char 	line[1024];
508 	FILE 	*file;
509 
510 	/*
511 	 * Return if speed transition is not supported
512 	 */
513 	if (g_npstates < 2)
514 		return;
515 
516 	file = fopen(default_conf, "r");
517 
518 	if (!file)
519 		return;
520 
521 	(void) memset(line, 0, 1024);
522 
523 	while (fgets(line, 1023, file)) {
524 		if (strstr(line, "cpupm")) {
525 			if (strstr(line, "enable")) {
526 				(void) fclose(file);
527 				return;
528 			}
529 		}
530 	}
531 
532 	add_suggestion("Suggestion: enable CPU power management by "
533 	    "pressing the P key",  40, 'P', "P - Enable p-state",
534 	    enable_p_state);
535 
536 	(void) fclose(file);
537 }
538