1 /* 2 * drivers/cpufreq/cpufreq_conservative.c 3 * 4 * Copyright (C) 2001 Russell King 5 * (C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>. 6 * Jun Nakajima <jun.nakajima@intel.com> 7 * (C) 2009 Alexander Clouter <alex@digriz.org.uk> 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License version 2 as 11 * published by the Free Software Foundation. 12 */ 13 14 #include <linux/kernel.h> 15 #include <linux/module.h> 16 #include <linux/init.h> 17 #include <linux/cpufreq.h> 18 #include <linux/cpu.h> 19 #include <linux/jiffies.h> 20 #include <linux/kernel_stat.h> 21 #include <linux/mutex.h> 22 #include <linux/hrtimer.h> 23 #include <linux/tick.h> 24 #include <linux/ktime.h> 25 #include <linux/sched.h> 26 27 /* 28 * dbs is used in this file as a shortform for demandbased switching 29 * It helps to keep variable names smaller, simpler 30 */ 31 32 #define DEF_FREQUENCY_UP_THRESHOLD (80) 33 #define DEF_FREQUENCY_DOWN_THRESHOLD (20) 34 35 /* 36 * The polling frequency of this governor depends on the capability of 37 * the processor. Default polling frequency is 1000 times the transition 38 * latency of the processor. The governor will work on any processor with 39 * transition latency <= 10mS, using appropriate sampling 40 * rate. 41 * For CPUs with transition latency > 10mS (mostly drivers with CPUFREQ_ETERNAL) 42 * this governor will not work. 43 * All times here are in uS. 44 */ 45 #define MIN_SAMPLING_RATE_RATIO (2) 46 47 static unsigned int min_sampling_rate; 48 49 #define LATENCY_MULTIPLIER (1000) 50 #define MIN_LATENCY_MULTIPLIER (100) 51 #define DEF_SAMPLING_DOWN_FACTOR (1) 52 #define MAX_SAMPLING_DOWN_FACTOR (10) 53 #define TRANSITION_LATENCY_LIMIT (10 * 1000 * 1000) 54 55 static void do_dbs_timer(struct work_struct *work); 56 57 struct cpu_dbs_info_s { 58 cputime64_t prev_cpu_idle; 59 cputime64_t prev_cpu_wall; 60 cputime64_t prev_cpu_nice; 61 struct cpufreq_policy *cur_policy; 62 struct delayed_work work; 63 unsigned int down_skip; 64 unsigned int requested_freq; 65 int cpu; 66 unsigned int enable:1; 67 /* 68 * percpu mutex that serializes governor limit change with 69 * do_dbs_timer invocation. We do not want do_dbs_timer to run 70 * when user is changing the governor or limits. 71 */ 72 struct mutex timer_mutex; 73 }; 74 static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); 75 76 static unsigned int dbs_enable; /* number of CPUs using this policy */ 77 78 /* 79 * dbs_mutex protects dbs_enable in governor start/stop. 80 */ 81 static DEFINE_MUTEX(dbs_mutex); 82 83 static struct dbs_tuners { 84 unsigned int sampling_rate; 85 unsigned int sampling_down_factor; 86 unsigned int up_threshold; 87 unsigned int down_threshold; 88 unsigned int ignore_nice; 89 unsigned int freq_step; 90 } dbs_tuners_ins = { 91 .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, 92 .down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD, 93 .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, 94 .ignore_nice = 0, 95 .freq_step = 5, 96 }; 97 98 static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) 99 { 100 u64 idle_time; 101 u64 cur_wall_time; 102 u64 busy_time; 103 104 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); 105 106 busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; 107 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; 108 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; 109 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; 110 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; 111 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; 112 113 idle_time = cur_wall_time - busy_time; 114 if (wall) 115 *wall = jiffies_to_usecs(cur_wall_time); 116 117 return jiffies_to_usecs(idle_time); 118 } 119 120 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) 121 { 122 u64 idle_time = get_cpu_idle_time_us(cpu, NULL); 123 124 if (idle_time == -1ULL) 125 return get_cpu_idle_time_jiffy(cpu, wall); 126 else 127 idle_time += get_cpu_iowait_time_us(cpu, wall); 128 129 return idle_time; 130 } 131 132 /* keep track of frequency transitions */ 133 static int 134 dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 135 void *data) 136 { 137 struct cpufreq_freqs *freq = data; 138 struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, 139 freq->cpu); 140 141 struct cpufreq_policy *policy; 142 143 if (!this_dbs_info->enable) 144 return 0; 145 146 policy = this_dbs_info->cur_policy; 147 148 /* 149 * we only care if our internally tracked freq moves outside 150 * the 'valid' ranges of freqency available to us otherwise 151 * we do not change it 152 */ 153 if (this_dbs_info->requested_freq > policy->max 154 || this_dbs_info->requested_freq < policy->min) 155 this_dbs_info->requested_freq = freq->new; 156 157 return 0; 158 } 159 160 static struct notifier_block dbs_cpufreq_notifier_block = { 161 .notifier_call = dbs_cpufreq_notifier 162 }; 163 164 /************************** sysfs interface ************************/ 165 static ssize_t show_sampling_rate_min(struct kobject *kobj, 166 struct attribute *attr, char *buf) 167 { 168 return sprintf(buf, "%u\n", min_sampling_rate); 169 } 170 171 define_one_global_ro(sampling_rate_min); 172 173 /* cpufreq_conservative Governor Tunables */ 174 #define show_one(file_name, object) \ 175 static ssize_t show_##file_name \ 176 (struct kobject *kobj, struct attribute *attr, char *buf) \ 177 { \ 178 return sprintf(buf, "%u\n", dbs_tuners_ins.object); \ 179 } 180 show_one(sampling_rate, sampling_rate); 181 show_one(sampling_down_factor, sampling_down_factor); 182 show_one(up_threshold, up_threshold); 183 show_one(down_threshold, down_threshold); 184 show_one(ignore_nice_load, ignore_nice); 185 show_one(freq_step, freq_step); 186 187 static ssize_t store_sampling_down_factor(struct kobject *a, 188 struct attribute *b, 189 const char *buf, size_t count) 190 { 191 unsigned int input; 192 int ret; 193 ret = sscanf(buf, "%u", &input); 194 195 if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) 196 return -EINVAL; 197 198 dbs_tuners_ins.sampling_down_factor = input; 199 return count; 200 } 201 202 static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, 203 const char *buf, size_t count) 204 { 205 unsigned int input; 206 int ret; 207 ret = sscanf(buf, "%u", &input); 208 209 if (ret != 1) 210 return -EINVAL; 211 212 dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate); 213 return count; 214 } 215 216 static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, 217 const char *buf, size_t count) 218 { 219 unsigned int input; 220 int ret; 221 ret = sscanf(buf, "%u", &input); 222 223 if (ret != 1 || input > 100 || 224 input <= dbs_tuners_ins.down_threshold) 225 return -EINVAL; 226 227 dbs_tuners_ins.up_threshold = input; 228 return count; 229 } 230 231 static ssize_t store_down_threshold(struct kobject *a, struct attribute *b, 232 const char *buf, size_t count) 233 { 234 unsigned int input; 235 int ret; 236 ret = sscanf(buf, "%u", &input); 237 238 /* cannot be lower than 11 otherwise freq will not fall */ 239 if (ret != 1 || input < 11 || input > 100 || 240 input >= dbs_tuners_ins.up_threshold) 241 return -EINVAL; 242 243 dbs_tuners_ins.down_threshold = input; 244 return count; 245 } 246 247 static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, 248 const char *buf, size_t count) 249 { 250 unsigned int input; 251 int ret; 252 253 unsigned int j; 254 255 ret = sscanf(buf, "%u", &input); 256 if (ret != 1) 257 return -EINVAL; 258 259 if (input > 1) 260 input = 1; 261 262 if (input == dbs_tuners_ins.ignore_nice) /* nothing to do */ 263 return count; 264 265 dbs_tuners_ins.ignore_nice = input; 266 267 /* we need to re-evaluate prev_cpu_idle */ 268 for_each_online_cpu(j) { 269 struct cpu_dbs_info_s *dbs_info; 270 dbs_info = &per_cpu(cs_cpu_dbs_info, j); 271 dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 272 &dbs_info->prev_cpu_wall); 273 if (dbs_tuners_ins.ignore_nice) 274 dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; 275 } 276 return count; 277 } 278 279 static ssize_t store_freq_step(struct kobject *a, struct attribute *b, 280 const char *buf, size_t count) 281 { 282 unsigned int input; 283 int ret; 284 ret = sscanf(buf, "%u", &input); 285 286 if (ret != 1) 287 return -EINVAL; 288 289 if (input > 100) 290 input = 100; 291 292 /* no need to test here if freq_step is zero as the user might actually 293 * want this, they would be crazy though :) */ 294 dbs_tuners_ins.freq_step = input; 295 return count; 296 } 297 298 define_one_global_rw(sampling_rate); 299 define_one_global_rw(sampling_down_factor); 300 define_one_global_rw(up_threshold); 301 define_one_global_rw(down_threshold); 302 define_one_global_rw(ignore_nice_load); 303 define_one_global_rw(freq_step); 304 305 static struct attribute *dbs_attributes[] = { 306 &sampling_rate_min.attr, 307 &sampling_rate.attr, 308 &sampling_down_factor.attr, 309 &up_threshold.attr, 310 &down_threshold.attr, 311 &ignore_nice_load.attr, 312 &freq_step.attr, 313 NULL 314 }; 315 316 static struct attribute_group dbs_attr_group = { 317 .attrs = dbs_attributes, 318 .name = "conservative", 319 }; 320 321 /************************** sysfs end ************************/ 322 323 static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) 324 { 325 unsigned int load = 0; 326 unsigned int max_load = 0; 327 unsigned int freq_target; 328 329 struct cpufreq_policy *policy; 330 unsigned int j; 331 332 policy = this_dbs_info->cur_policy; 333 334 /* 335 * Every sampling_rate, we check, if current idle time is less 336 * than 20% (default), then we try to increase frequency 337 * Every sampling_rate*sampling_down_factor, we check, if current 338 * idle time is more than 80%, then we try to decrease frequency 339 * 340 * Any frequency increase takes it to the maximum frequency. 341 * Frequency reduction happens at minimum steps of 342 * 5% (default) of maximum frequency 343 */ 344 345 /* Get Absolute Load */ 346 for_each_cpu(j, policy->cpus) { 347 struct cpu_dbs_info_s *j_dbs_info; 348 cputime64_t cur_wall_time, cur_idle_time; 349 unsigned int idle_time, wall_time; 350 351 j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); 352 353 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); 354 355 wall_time = (unsigned int) 356 (cur_wall_time - j_dbs_info->prev_cpu_wall); 357 j_dbs_info->prev_cpu_wall = cur_wall_time; 358 359 idle_time = (unsigned int) 360 (cur_idle_time - j_dbs_info->prev_cpu_idle); 361 j_dbs_info->prev_cpu_idle = cur_idle_time; 362 363 if (dbs_tuners_ins.ignore_nice) { 364 u64 cur_nice; 365 unsigned long cur_nice_jiffies; 366 367 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - 368 j_dbs_info->prev_cpu_nice; 369 /* 370 * Assumption: nice time between sampling periods will 371 * be less than 2^32 jiffies for 32 bit sys 372 */ 373 cur_nice_jiffies = (unsigned long) 374 cputime64_to_jiffies64(cur_nice); 375 376 j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; 377 idle_time += jiffies_to_usecs(cur_nice_jiffies); 378 } 379 380 if (unlikely(!wall_time || wall_time < idle_time)) 381 continue; 382 383 load = 100 * (wall_time - idle_time) / wall_time; 384 385 if (load > max_load) 386 max_load = load; 387 } 388 389 /* 390 * break out if we 'cannot' reduce the speed as the user might 391 * want freq_step to be zero 392 */ 393 if (dbs_tuners_ins.freq_step == 0) 394 return; 395 396 /* Check for frequency increase */ 397 if (max_load > dbs_tuners_ins.up_threshold) { 398 this_dbs_info->down_skip = 0; 399 400 /* if we are already at full speed then break out early */ 401 if (this_dbs_info->requested_freq == policy->max) 402 return; 403 404 freq_target = (dbs_tuners_ins.freq_step * policy->max) / 100; 405 406 /* max freq cannot be less than 100. But who knows.... */ 407 if (unlikely(freq_target == 0)) 408 freq_target = 5; 409 410 this_dbs_info->requested_freq += freq_target; 411 if (this_dbs_info->requested_freq > policy->max) 412 this_dbs_info->requested_freq = policy->max; 413 414 __cpufreq_driver_target(policy, this_dbs_info->requested_freq, 415 CPUFREQ_RELATION_H); 416 return; 417 } 418 419 /* 420 * The optimal frequency is the frequency that is the lowest that 421 * can support the current CPU usage without triggering the up 422 * policy. To be safe, we focus 10 points under the threshold. 423 */ 424 if (max_load < (dbs_tuners_ins.down_threshold - 10)) { 425 freq_target = (dbs_tuners_ins.freq_step * policy->max) / 100; 426 427 this_dbs_info->requested_freq -= freq_target; 428 if (this_dbs_info->requested_freq < policy->min) 429 this_dbs_info->requested_freq = policy->min; 430 431 /* 432 * if we cannot reduce the frequency anymore, break out early 433 */ 434 if (policy->cur == policy->min) 435 return; 436 437 __cpufreq_driver_target(policy, this_dbs_info->requested_freq, 438 CPUFREQ_RELATION_H); 439 return; 440 } 441 } 442 443 static void do_dbs_timer(struct work_struct *work) 444 { 445 struct cpu_dbs_info_s *dbs_info = 446 container_of(work, struct cpu_dbs_info_s, work.work); 447 unsigned int cpu = dbs_info->cpu; 448 449 /* We want all CPUs to do sampling nearly on same jiffy */ 450 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); 451 452 delay -= jiffies % delay; 453 454 mutex_lock(&dbs_info->timer_mutex); 455 456 dbs_check_cpu(dbs_info); 457 458 schedule_delayed_work_on(cpu, &dbs_info->work, delay); 459 mutex_unlock(&dbs_info->timer_mutex); 460 } 461 462 static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) 463 { 464 /* We want all CPUs to do sampling nearly on same jiffy */ 465 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); 466 delay -= jiffies % delay; 467 468 dbs_info->enable = 1; 469 INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); 470 schedule_delayed_work_on(dbs_info->cpu, &dbs_info->work, delay); 471 } 472 473 static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) 474 { 475 dbs_info->enable = 0; 476 cancel_delayed_work_sync(&dbs_info->work); 477 } 478 479 static int cpufreq_governor_dbs(struct cpufreq_policy *policy, 480 unsigned int event) 481 { 482 unsigned int cpu = policy->cpu; 483 struct cpu_dbs_info_s *this_dbs_info; 484 unsigned int j; 485 int rc; 486 487 this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); 488 489 switch (event) { 490 case CPUFREQ_GOV_START: 491 if ((!cpu_online(cpu)) || (!policy->cur)) 492 return -EINVAL; 493 494 mutex_lock(&dbs_mutex); 495 496 for_each_cpu(j, policy->cpus) { 497 struct cpu_dbs_info_s *j_dbs_info; 498 j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); 499 j_dbs_info->cur_policy = policy; 500 501 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 502 &j_dbs_info->prev_cpu_wall); 503 if (dbs_tuners_ins.ignore_nice) 504 j_dbs_info->prev_cpu_nice = 505 kcpustat_cpu(j).cpustat[CPUTIME_NICE]; 506 } 507 this_dbs_info->down_skip = 0; 508 this_dbs_info->requested_freq = policy->cur; 509 510 mutex_init(&this_dbs_info->timer_mutex); 511 dbs_enable++; 512 /* 513 * Start the timerschedule work, when this governor 514 * is used for first time 515 */ 516 if (dbs_enable == 1) { 517 unsigned int latency; 518 /* policy latency is in nS. Convert it to uS first */ 519 latency = policy->cpuinfo.transition_latency / 1000; 520 if (latency == 0) 521 latency = 1; 522 523 rc = sysfs_create_group(cpufreq_global_kobject, 524 &dbs_attr_group); 525 if (rc) { 526 mutex_unlock(&dbs_mutex); 527 return rc; 528 } 529 530 /* 531 * conservative does not implement micro like ondemand 532 * governor, thus we are bound to jiffes/HZ 533 */ 534 min_sampling_rate = 535 MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10); 536 /* Bring kernel and HW constraints together */ 537 min_sampling_rate = max(min_sampling_rate, 538 MIN_LATENCY_MULTIPLIER * latency); 539 dbs_tuners_ins.sampling_rate = 540 max(min_sampling_rate, 541 latency * LATENCY_MULTIPLIER); 542 543 cpufreq_register_notifier( 544 &dbs_cpufreq_notifier_block, 545 CPUFREQ_TRANSITION_NOTIFIER); 546 } 547 mutex_unlock(&dbs_mutex); 548 549 dbs_timer_init(this_dbs_info); 550 551 break; 552 553 case CPUFREQ_GOV_STOP: 554 dbs_timer_exit(this_dbs_info); 555 556 mutex_lock(&dbs_mutex); 557 dbs_enable--; 558 mutex_destroy(&this_dbs_info->timer_mutex); 559 560 /* 561 * Stop the timerschedule work, when this governor 562 * is used for first time 563 */ 564 if (dbs_enable == 0) 565 cpufreq_unregister_notifier( 566 &dbs_cpufreq_notifier_block, 567 CPUFREQ_TRANSITION_NOTIFIER); 568 569 mutex_unlock(&dbs_mutex); 570 if (!dbs_enable) 571 sysfs_remove_group(cpufreq_global_kobject, 572 &dbs_attr_group); 573 574 break; 575 576 case CPUFREQ_GOV_LIMITS: 577 mutex_lock(&this_dbs_info->timer_mutex); 578 if (policy->max < this_dbs_info->cur_policy->cur) 579 __cpufreq_driver_target( 580 this_dbs_info->cur_policy, 581 policy->max, CPUFREQ_RELATION_H); 582 else if (policy->min > this_dbs_info->cur_policy->cur) 583 __cpufreq_driver_target( 584 this_dbs_info->cur_policy, 585 policy->min, CPUFREQ_RELATION_L); 586 mutex_unlock(&this_dbs_info->timer_mutex); 587 588 break; 589 } 590 return 0; 591 } 592 593 #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE 594 static 595 #endif 596 struct cpufreq_governor cpufreq_gov_conservative = { 597 .name = "conservative", 598 .governor = cpufreq_governor_dbs, 599 .max_transition_latency = TRANSITION_LATENCY_LIMIT, 600 .owner = THIS_MODULE, 601 }; 602 603 static int __init cpufreq_gov_dbs_init(void) 604 { 605 return cpufreq_register_governor(&cpufreq_gov_conservative); 606 } 607 608 static void __exit cpufreq_gov_dbs_exit(void) 609 { 610 cpufreq_unregister_governor(&cpufreq_gov_conservative); 611 } 612 613 614 MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>"); 615 MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for " 616 "Low Latency Frequency Transition capable processors " 617 "optimised for use in a battery environment"); 618 MODULE_LICENSE("GPL"); 619 620 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE 621 fs_initcall(cpufreq_gov_dbs_init); 622 #else 623 module_init(cpufreq_gov_dbs_init); 624 #endif 625 module_exit(cpufreq_gov_dbs_exit); 626