/*
 * drivers/cpufreq/cpufreq_conservative.c
 *
 * Copyright (C) 2001 Russell King
 *           (C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                    Jun Nakajima <jun.nakajima@intel.com>
 *           (C) 2009 Alexander Clouter <alex@digriz.org.uk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/*
 * dbs is used in this file as a short form for demand-based switching.
 * It helps to keep variable names smaller and simpler.
 */

#define DEF_FREQUENCY_UP_THRESHOLD		(80)
#define DEF_FREQUENCY_DOWN_THRESHOLD		(20)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. Default polling frequency is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with
 * CPUFREQ_ETERNAL) this governor will not work.
 * All times here are in us.
 */
#define MIN_SAMPLING_RATE_RATIO			(2)

static unsigned int min_sampling_rate;

#define LATENCY_MULTIPLIER			(1000)
#define MIN_LATENCY_MULTIPLIER			(100)
#define DEF_SAMPLING_DOWN_FACTOR		(1)
#define MAX_SAMPLING_DOWN_FACTOR		(10)
#define TRANSITION_LATENCY_LIMIT		(10 * 1000 * 1000)

static void do_dbs_timer(struct work_struct *work);

struct cpu_dbs_info_s {
	cputime64_t prev_cpu_idle;
	cputime64_t prev_cpu_wall;
	cputime64_t prev_cpu_nice;
	struct cpufreq_policy *cur_policy;
	struct delayed_work work;
	unsigned int down_skip;
	unsigned int requested_freq;
	int cpu;
	unsigned int enable:1;
	/*
	 * percpu mutex that serializes governor limit change with
	 * do_dbs_timer invocation. We do not want do_dbs_timer to run
	 * when user is changing the governor or limits.
	 */
	struct mutex timer_mutex;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);

static unsigned int dbs_enable;	/* number of CPUs using this policy */

/*
 * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
 * different CPUs. It protects dbs_enable in governor start/stop.
 */
static DEFINE_MUTEX(dbs_mutex);

static struct workqueue_struct *kconservative_wq;

static struct dbs_tuners {
	unsigned int sampling_rate;
	unsigned int sampling_down_factor;
	unsigned int up_threshold;
	unsigned int down_threshold;
	unsigned int ignore_nice;
	unsigned int freq_step;
} dbs_tuners_ins = {
	.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
	.down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD,
	.sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR,
	.ignore_nice = 0,
	.freq_step = 5,
};

static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
						  cputime64_t *wall)
{
	cputime64_t idle_time;
	cputime64_t cur_wall_time;
	cputime64_t busy_time;

	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
			kstat_cpu(cpu).cpustat.system);

	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);

	idle_time = cputime64_sub(cur_wall_time, busy_time);
	if (wall)
		*wall = cur_wall_time;

	return idle_time;
}

static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
	u64 idle_time = get_cpu_idle_time_us(cpu, wall);

	if (idle_time == -1ULL)
		return get_cpu_idle_time_jiffy(cpu, wall);

	return idle_time;
}

/* keep track of frequency transitions */
static int
dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
		     void *data)
{
	struct cpufreq_freqs *freq = data;
	struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
							freq->cpu);

	struct cpufreq_policy *policy;

	if (!this_dbs_info->enable)
		return 0;

	policy = this_dbs_info->cur_policy;

	/*
	 * we only care if our internally tracked freq moves outside the
	 * 'valid' range of frequencies available to us, otherwise we do not
	 * change it
	 */
	if (this_dbs_info->requested_freq > policy->max
			|| this_dbs_info->requested_freq < policy->min)
		this_dbs_info->requested_freq = freq->new;

	return 0;
}

static struct notifier_block dbs_cpufreq_notifier_block = {
	.notifier_call = dbs_cpufreq_notifier
};

/************************** sysfs interface ************************/
static ssize_t show_sampling_rate_max(struct cpufreq_policy *policy, char *buf)
{
	printk_once(KERN_INFO "CPUFREQ: conservative sampling_rate_max "
		    "sysfs file is deprecated - used by: %s\n", current->comm);
	return sprintf(buf, "%u\n", -1U);
}

static ssize_t show_sampling_rate_min(struct cpufreq_policy *policy, char *buf)
{
	return sprintf(buf, "%u\n", min_sampling_rate);
}

#define define_one_ro(_name)		\
static struct freq_attr _name =		\
__ATTR(_name, 0444, show_##_name, NULL)

define_one_ro(sampling_rate_max);
define_one_ro(sampling_rate_min);

/* cpufreq_conservative Governor Tunables */
#define show_one(file_name, object)					\
static ssize_t show_##file_name						\
(struct cpufreq_policy *unused, char *buf)				\
{									\
	return sprintf(buf, "%u\n", dbs_tuners_ins.object);		\
}
show_one(sampling_rate, sampling_rate);
show_one(sampling_down_factor, sampling_down_factor);
show_one(up_threshold, up_threshold);
show_one(down_threshold, down_threshold);
show_one(ignore_nice_load, ignore_nice);
show_one(freq_step, freq_step);

static ssize_t store_sampling_down_factor(struct cpufreq_policy *unused,
					  const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
		return -EINVAL;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.sampling_down_factor = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
				   const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1)
		return -EINVAL;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_up_threshold(struct cpufreq_policy *unused,
				  const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	mutex_lock(&dbs_mutex);
	if (ret != 1 || input > 100 ||
	    input <= dbs_tuners_ins.down_threshold) {
		mutex_unlock(&dbs_mutex);
		return -EINVAL;
	}

	dbs_tuners_ins.up_threshold = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_down_threshold(struct cpufreq_policy *unused,
				    const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	mutex_lock(&dbs_mutex);
	/* cannot be lower than 11 otherwise freq will not fall */
	if (ret != 1 || input < 11 || input > 100 ||
	    input >= dbs_tuners_ins.up_threshold) {
		mutex_unlock(&dbs_mutex);
		return -EINVAL;
	}

	dbs_tuners_ins.down_threshold = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
				      const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	unsigned int j;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	if (input > 1)
		input = 1;

	mutex_lock(&dbs_mutex);
	if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */
		mutex_unlock(&dbs_mutex);
		return count;
	}
	dbs_tuners_ins.ignore_nice = input;

	/* we need to re-evaluate prev_cpu_idle */
	for_each_online_cpu(j) {
		struct cpu_dbs_info_s *dbs_info;
		dbs_info = &per_cpu(cs_cpu_dbs_info, j);
		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&dbs_info->prev_cpu_wall);
		if (dbs_tuners_ins.ignore_nice)
			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
	}
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_freq_step(struct cpufreq_policy *policy,
			       const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1)
		return -EINVAL;

	if (input > 100)
		input = 100;

	/* no need to test here if freq_step is zero as the user might actually
	 * want this, they would be crazy though :) */
	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.freq_step = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

#define define_one_rw(_name)		\
static struct freq_attr _name =		\
__ATTR(_name, 0644, show_##_name, store_##_name)

define_one_rw(sampling_rate);
define_one_rw(sampling_down_factor);
define_one_rw(up_threshold);
define_one_rw(down_threshold);
define_one_rw(ignore_nice_load);
define_one_rw(freq_step);

static struct attribute *dbs_attributes[] = {
	&sampling_rate_max.attr,
	&sampling_rate_min.attr,
	&sampling_rate.attr,
	&sampling_down_factor.attr,
	&up_threshold.attr,
	&down_threshold.attr,
	&ignore_nice_load.attr,
	&freq_step.attr,
	NULL
};

static struct attribute_group dbs_attr_group = {
	.attrs = dbs_attributes,
	.name = "conservative",
};

/************************** sysfs end ************************/

static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
{
	unsigned int load = 0;
	unsigned int freq_target;

	struct cpufreq_policy *policy;
	unsigned int j;

	policy = this_dbs_info->cur_policy;

	/*
	 * Every sampling_rate we check whether the current idle time is less
	 * than 20% (default); if it is, we try to increase the frequency.
	 * Every sampling_rate * sampling_down_factor we check whether the
	 * current idle time is more than 80%; if it is, we try to decrease
	 * the frequency.
	 *
	 * Frequency increases and decreases happen in steps of freq_step
	 * (default 5%) of the maximum frequency.
	 */

	/* Get Absolute Load */
	for_each_cpu(j, policy->cpus) {
		struct cpu_dbs_info_s *j_dbs_info;
		cputime64_t cur_wall_time, cur_idle_time;
		unsigned int idle_time, wall_time;

		j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);

		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);

		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
				j_dbs_info->prev_cpu_wall);
		j_dbs_info->prev_cpu_wall = cur_wall_time;

		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
				j_dbs_info->prev_cpu_idle);
		j_dbs_info->prev_cpu_idle = cur_idle_time;

		if (dbs_tuners_ins.ignore_nice) {
			cputime64_t cur_nice;
			unsigned long cur_nice_jiffies;

			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
					j_dbs_info->prev_cpu_nice);
			/*
			 * Assumption: nice time between sampling periods will
			 * be less than 2^32 jiffies for 32 bit sys
			 */
			cur_nice_jiffies = (unsigned long)
					cputime64_to_jiffies64(cur_nice);

			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
			idle_time += jiffies_to_usecs(cur_nice_jiffies);
		}

		if (unlikely(!wall_time || wall_time < idle_time))
			continue;

		load = 100 * (wall_time - idle_time) / wall_time;
	}

	/*
	 * break out if we 'cannot' reduce the speed as the user might
	 * want freq_step to be zero
	 */
	if (dbs_tuners_ins.freq_step == 0)
		return;

	/* Check for frequency increase */
	if (load > dbs_tuners_ins.up_threshold) {
		this_dbs_info->down_skip = 0;

		/* if we are already at full speed then break out early */
		if (this_dbs_info->requested_freq == policy->max)
			return;

		freq_target = (dbs_tuners_ins.freq_step * policy->max) / 100;

		/* max freq cannot be less than 100. But who knows.... */
		if (unlikely(freq_target == 0))
			freq_target = 5;

		this_dbs_info->requested_freq += freq_target;
		if (this_dbs_info->requested_freq > policy->max)
			this_dbs_info->requested_freq = policy->max;

		__cpufreq_driver_target(policy, this_dbs_info->requested_freq,
			CPUFREQ_RELATION_H);
		return;
	}

	/*
	 * The optimal frequency is the lowest frequency that can support the
	 * current CPU usage without triggering the up policy. To be safe, we
	 * stay 10 percentage points under the threshold.
	 */
	if (load < (dbs_tuners_ins.down_threshold - 10)) {
		freq_target = (dbs_tuners_ins.freq_step * policy->max) / 100;

		this_dbs_info->requested_freq -= freq_target;
		if (this_dbs_info->requested_freq < policy->min)
			this_dbs_info->requested_freq = policy->min;

		/*
		 * if we cannot reduce the frequency anymore, break out early
		 */
		if (policy->cur == policy->min)
			return;

		__cpufreq_driver_target(policy, this_dbs_info->requested_freq,
				CPUFREQ_RELATION_H);
		return;
	}
}

static void do_dbs_timer(struct work_struct *work)
{
	struct cpu_dbs_info_s *dbs_info =
		container_of(work, struct cpu_dbs_info_s, work.work);
	unsigned int cpu = dbs_info->cpu;

	/* We want all CPUs to do sampling nearly on the same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

	delay -= jiffies % delay;

	mutex_lock(&dbs_info->timer_mutex);

	dbs_check_cpu(dbs_info);

	queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay);
	mutex_unlock(&dbs_info->timer_mutex);
}

static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
{
	/* We want all CPUs to do sampling nearly on the same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	delay -= jiffies % delay;

	dbs_info->enable = 1;
	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
	queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work,
			      delay);
}

static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
	dbs_info->enable = 0;
	cancel_delayed_work_sync(&dbs_info->work);
}

static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
				unsigned int event)
{
	unsigned int cpu = policy->cpu;
	struct cpu_dbs_info_s *this_dbs_info;
	unsigned int j;
	int rc;

	this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);

	switch (event) {
	case CPUFREQ_GOV_START:
		if ((!cpu_online(cpu)) || (!policy->cur))
			return -EINVAL;

		mutex_lock(&dbs_mutex);

		rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
		if (rc) {
			mutex_unlock(&dbs_mutex);
			return rc;
		}

		for_each_cpu(j, policy->cpus) {
			struct cpu_dbs_info_s *j_dbs_info;
			j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
			j_dbs_info->cur_policy = policy;

			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&j_dbs_info->prev_cpu_wall);
			if (dbs_tuners_ins.ignore_nice) {
				j_dbs_info->prev_cpu_nice =
						kstat_cpu(j).cpustat.nice;
			}
		}
		this_dbs_info->down_skip = 0;
		this_dbs_info->requested_freq = policy->cur;

		mutex_init(&this_dbs_info->timer_mutex);
		dbs_enable++;
		/*
		 * Start the timer schedule work when this governor is used
		 * for the first time
		 */
		if (dbs_enable == 1) {
			unsigned int latency;
			/* policy latency is in ns. Convert it to us first */
			latency = policy->cpuinfo.transition_latency / 1000;
			if (latency == 0)
				latency = 1;

			/*
			 * conservative does not implement micro accounting
			 * like the ondemand governor, thus we are bound to
			 * jiffies/HZ
			 */
			min_sampling_rate =
				MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10);
			/* Bring kernel and HW constraints together */
			min_sampling_rate = max(min_sampling_rate,
					MIN_LATENCY_MULTIPLIER * latency);
			dbs_tuners_ins.sampling_rate =
				max(min_sampling_rate,
				    latency * LATENCY_MULTIPLIER);

			cpufreq_register_notifier(
					&dbs_cpufreq_notifier_block,
					CPUFREQ_TRANSITION_NOTIFIER);
		}
		mutex_unlock(&dbs_mutex);

		dbs_timer_init(this_dbs_info);

		break;

	case CPUFREQ_GOV_STOP:
		dbs_timer_exit(this_dbs_info);

		mutex_lock(&dbs_mutex);
		sysfs_remove_group(&policy->kobj, &dbs_attr_group);
		dbs_enable--;
		mutex_destroy(&this_dbs_info->timer_mutex);

		/*
		 * Stop the timer schedule work when this governor is used
		 * for the last time
		 */
		if (dbs_enable == 0)
			cpufreq_unregister_notifier(
					&dbs_cpufreq_notifier_block,
					CPUFREQ_TRANSITION_NOTIFIER);

		mutex_unlock(&dbs_mutex);

		break;

	case CPUFREQ_GOV_LIMITS:
		mutex_lock(&this_dbs_info->timer_mutex);
		if (policy->max < this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(
					this_dbs_info->cur_policy,
					policy->max, CPUFREQ_RELATION_H);
		else if (policy->min > this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(
					this_dbs_info->cur_policy,
					policy->min, CPUFREQ_RELATION_L);
		mutex_unlock(&this_dbs_info->timer_mutex);

		break;
	}
	return 0;
}

#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
static
#endif
struct cpufreq_governor cpufreq_gov_conservative = {
	.name			= "conservative",
	.governor		= cpufreq_governor_dbs,
	.max_transition_latency	= TRANSITION_LATENCY_LIMIT,
	.owner			= THIS_MODULE,
};

static int __init cpufreq_gov_dbs_init(void)
{
	int err;

	kconservative_wq = create_workqueue("kconservative");
	if (!kconservative_wq) {
		printk(KERN_ERR "Creation of kconservative failed\n");
		return -EFAULT;
	}

	err = cpufreq_register_governor(&cpufreq_gov_conservative);
	if (err)
		destroy_workqueue(kconservative_wq);

	return err;
}

static void __exit cpufreq_gov_dbs_exit(void)
{
	cpufreq_unregister_governor(&cpufreq_gov_conservative);
	destroy_workqueue(kconservative_wq);
}


MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>");
MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for "
		"Low Latency Frequency Transition capable processors "
		"optimised for use in a battery environment");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
#endif
module_exit(cpufreq_gov_dbs_exit);
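
/*
 * Illustrative usage sketch: once this governor is selected for a policy,
 * the tunables registered through dbs_attr_group above appear in a
 * "conservative" directory under that policy's sysfs node. The shell
 * commands below are an example only; the paths assume the usual sysfs
 * layout and a policy owned by cpu0.
 *
 *   # switch cpu0 to the conservative governor
 *   echo conservative > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
 *
 *   # step 10% of the maximum frequency per sample, and only ramp up
 *   # once load exceeds 95%
 *   echo 10 > /sys/devices/system/cpu/cpu0/cpufreq/conservative/freq_step
 *   echo 95 > /sys/devices/system/cpu/cpu0/cpufreq/conservative/up_threshold
 */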