// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */
#include <uapi/linux/sched/types.h>
#include "sched.h"

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
	struct gov_attr_set	attr_set;
	unsigned int		rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy	*policy;

	struct sugov_tunables	*tunables;
	struct list_head	tunables_hook;

	raw_spinlock_t		update_lock;
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;
	unsigned int		cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work		irq_work;
	struct kthread_work	work;
	struct mutex		work_lock;
	struct kthread_worker	worker;
	struct task_struct	*thread;
	bool			work_in_progress;

	bool			limits_changed;
	bool			need_freq_update;
};

struct sugov_cpu {
	struct update_util_data	update_util;
	struct sugov_policy	*sg_policy;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	u64			last_update;

	unsigned long		util;
	unsigned long		bw_min;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long		saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_update_commit() call may not for the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * This is needed on the slow switching platforms too to prevent CPUs
	 * going offline from leaving stale IRQ work items behind.
	 */
	if (!cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
		WRITE_ONCE(sg_policy->limits_changed, false);
		sg_policy->need_freq_update = true;

		/*
		 * The above limits_changed update must occur before the reads
		 * of policy limits in cpufreq_driver_resolve_freq() or a policy
		 * limits update might be missed, so use a memory barrier to
		 * ensure it.
		 *
		 * This pairs with the write memory barrier in sugov_limits().
		 */
		smp_mb();

		return true;
	} else if (sg_policy->need_freq_update) {
		/* ignore_dl_rate_limit() wants a new frequency to be found. */
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}
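
/*
 * Illustrative example (the numbers are hypothetical, not taken from any
 * particular driver): with rate_limit_us set to 2000 via sysfs,
 * freq_update_delay_ns is 2,000,000 ns, so an update arriving less than
 * 2 ms after the last frequency change is dropped above unless the policy
 * limits have changed or ignore_dl_rate_limit() has asked for a new
 * frequency to be found.
 */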

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->need_freq_update) {
		sg_policy->need_freq_update = false;
		/*
		 * The policy limits have changed, but if the return value of
		 * cpufreq_driver_resolve_freq() after applying the new limits
		 * is still equal to the previously selected frequency, the
		 * driver callback need not be invoked unless the driver
		 * specifically wants that to happen on every update of the
		 * policy limits.
		 */
		if (sg_policy->next_freq == next_freq &&
		    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
			return false;
	} else if (sg_policy->next_freq == next_freq) {
		return false;
	}

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_capacity_ref_freq - get the reference frequency that has been used to
 * correlate frequency and compute capacity for a given cpufreq policy. We use
 * the CPU managing it for the arch_scale_freq_ref() call in the function.
 * @policy: the cpufreq policy of the CPU in question.
 *
 * Return: the reference CPU frequency to compute a capacity.
 */
static __always_inline
unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
{
	unsigned int freq = arch_scale_freq_ref(policy->cpu);

	if (freq)
		return freq;

	if (arch_scale_freq_invariant())
		return policy->cpuinfo.max_freq;

	/*
	 * Apply a 25% margin so that we select a higher frequency than
	 * the current one before the CPU is fully busy:
	 */
	return policy->cur + (policy->cur >> 2);
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq;

	freq = get_capacity_ref_freq(policy);
	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
		return sg_policy->next_freq;

	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
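
/*
 * Worked example of the formula above (illustrative numbers only): with
 * C = 1.25, a reference frequency of 2 GHz (2000000 kHz) and util / max = 0.5,
 * the raw next_freq is 1.25 * 2000000 * 0.5 = 1250000 kHz;
 * cpufreq_driver_resolve_freq() then picks the lowest driver-supported
 * frequency at or above that value, within the policy limits. At
 * util / max = 0.8 the request already reaches the reference frequency.
 */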

unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
				       unsigned long min,
				       unsigned long max)
{
	/* Add dvfs headroom to actual utilization */
	actual = map_util_perf(actual);
	/* Actually we don't need to target the max performance */
	if (actual < max)
		max = actual;

	/*
	 * Ensure at least minimum performance while providing more compute
	 * capacity when possible.
	 */
	return max(min, max);
}
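
/*
 * Illustrative example (hypothetical values): with actual = 512, min = 128 and
 * max = 1024, map_util_perf() adds the DVFS headroom (512 -> 640); that is
 * below the 1024 limit, so the function returns max(128, 640) = 640. If the
 * boosted value had exceeded the limit, the limit itself would be returned,
 * and if min were larger than the boosted value, min would win instead.
 */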

static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
	unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);

	if (!scx_switched_all())
		util += cpu_util_cfs_boost(sg_cpu->cpu);
	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
	util = max(util, boost);
	sg_cpu->bw_min = min;
	sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick has elapsed since the
 * last update of a CPU. If a new IO wait boost is requested after more than a
 * tick, then we enable the boost starting from IOWAIT_BOOST_MIN, which improves
 * energy efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @max_cap: the max CPU capacity
 *
 * A CPU running a task which woke up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it's instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which appears to have been idle for at least one tick also has its
 * IO boost utilization reset.
 *
 * This mechanism is designed to boost tasks which frequently wait on IO,
 * while being more conservative on tasks which do only sporadic IO
 * operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long max_cap)
{
	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return 0;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return 0;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
			sg_cpu->iowait_boost = 0;
			return 0;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * sg_cpu->util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}
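
/*
 * Illustrative example of the boost lifecycle (SCHED_CAPACITY_SCALE = 1024,
 * so IOWAIT_BOOST_MIN = 128): while SCHED_CPUFREQ_IOWAIT wakeups keep arriving
 * at least once per tick, the boost doubles 128 -> 256 -> 512 -> 1024; once
 * they stop, sugov_iowait_apply() halves it on each update,
 * 1024 -> 512 -> 256 -> 128 -> 0. The returned value is scaled by the CPU
 * capacity, e.g. a boost of 512 on a CPU with max_cap = 512 contributes 256.
 */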

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls;
	bool ret;

	/*
	 * The heuristic in this function is for the fair class. For SCX, the
	 * performance target comes directly from the BPF scheduler. Let's just
	 * follow it.
	 */
	if (scx_switched_all())
		return false;

	/* if capped by uclamp_max, always update to be in compliance */
	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
		return false;

	/*
	 * Maintain the frequency if the CPU has not been idle recently, as
	 * reduction is likely to be premature.
	 */
	idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* !CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
		sg_cpu->sg_policy->need_freq_update = true;
}

static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
					      u64 time, unsigned long max_cap,
					      unsigned int flags)
{
	unsigned long boost;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
		return false;

	boost = sugov_iowait_apply(sg_cpu, time, max_cap);
	sugov_get_util(sg_cpu, boost);

	return true;
}

static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int cached_freq = sg_policy->cached_raw_freq;
	unsigned long max_cap;
	unsigned int next_f;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);

	if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
	    !sg_policy->need_freq_update) {
		next_f = sg_policy->next_freq;

		/* Restore cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = cached_freq;
	}

	if (!sugov_update_next_freq(sg_policy, time, next_f))
		return;

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		cpufreq_driver_fast_switch(sg_policy->policy, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	unsigned long prev_util = sg_cpu->util;
	unsigned long max_cap;

	/*
	 * Fall back to the "frequency" path if frequency invariance is not
	 * supported, because the direct mapping between the utilization and
	 * the performance levels depends on the frequency invariance.
	 */
	if (!arch_scale_freq_invariant()) {
		sugov_update_single_freq(hook, time, flags);
		return;
	}

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
		sg_cpu->util = prev_util;

	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
				   sg_cpu->util, max_cap);

	sg_cpu->sg_policy->last_freq_update_time = time;
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max_cap;
	unsigned int j;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long boost;

		boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
		sugov_get_util(j_sg_cpu, boost);

		util = max(j_sg_cpu->util, util);
	}

	return get_next_freq(sg_policy, util, max_cap);
}
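
/*
 * Illustrative example (hypothetical utilization values): on a shared policy
 * spanning four CPUs with effective utilizations of 200, 650, 100 and 300,
 * the loop above picks 650, so the frequency request for the whole policy is
 * driven by the busiest CPU.
 */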

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (!sugov_update_next_freq(sg_policy, time, next_f))
			goto unlock;

		if (sg_policy->policy->fast_switch_enabled)
			cpufreq_driver_fast_switch(sg_policy->policy, next_f);
		else
			sugov_deferred_update(sg_policy);
	}
unlock:
	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock briefly to handle the case where
	 * sg_policy->next_freq is read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to false
	 * here, in which case we could miss queueing the new update.
	 *
	 * Note: If a work was queued after the update_lock is released,
	 * sugov_work() will just be called again by kthread_work code; and the
	 * request will be processed before the sugov thread sleeps.
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
	&rate_limit_us.attr,
	NULL
};
ATTRIBUTE_GROUPS(sugov);

static void sugov_tunables_free(struct kobject *kobj)
{
	struct gov_attr_set *attr_set = to_gov_attr_set(kobj);

	kfree(to_sugov_tunables(attr_set));
}

static const struct kobj_type sugov_tunables_ktype = {
	.default_groups = sugov_groups,
	.sysfs_ops = &governor_sysfs_ops,
	.release = &sugov_tunables_free,
};
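
/*
 * Usage example (path shown for the common per-policy sysfs layout; the exact
 * location depends on have_governor_per_policy()):
 *
 *   # cat /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *   # echo 2000 > /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *
 * The store callback above converts the new value to nanoseconds and updates
 * freq_update_delay_ns for every policy attached to this tunables set.
 */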

/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size		= sizeof(struct sched_attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_flags	= SCHED_FLAG_SUGOV,
		.sched_nice	= 0,
		.sched_priority	= 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
		 */
		.sched_runtime	= NSEC_PER_MSEC,
		.sched_deadline = 10 * NSEC_PER_MSEC,
		.sched_period	= 10 * NSEC_PER_MSEC,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	if (policy->dvfs_possible_from_any_cpu)
		set_cpus_allowed_ptr(thread, policy->related_cpus);
	else
		kthread_bind_mask(thread, policy->related_cpus);

	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_clear_global_tunables(void)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	/*
	 * Schedutil is the preferred governor for EAS, so rebuild sched domains
	 * on governor changes to make sure the scheduler knows about them.
	 */
	em_rebuild_sched_domains();
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_clear_global_tunables();

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_clear_global_tunables();

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);

	em_rebuild_sched_domains();
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = 0;
	sg_policy->work_in_progress = false;
	sg_policy->limits_changed = false;
	sg_policy->cached_raw_freq = 0;

	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);

	if (policy_is_shared(policy))
		uu = sugov_update_shared;
	else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
		uu = sugov_update_single_perf;
	else
		uu = sugov_update_single_freq;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	/*
	 * The limits_changed update below must take place before the updates
	 * of policy limits in cpufreq_set_policy() or a policy limits update
	 * might be missed, so use a memory barrier to ensure it.
	 *
	 * This pairs with the memory barrier in sugov_should_update_freq().
	 */
	smp_wmb();

	WRITE_ONCE(sg_policy->limits_changed, true);
}
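
/*
 * Usage note: at runtime a policy can be switched to this governor with, for
 * example,
 *
 *   # echo schedutil > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
 *
 * and CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL (handled below) makes it the
 * default governor at boot.
 */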

static struct cpufreq_governor schedutil_gov = {
	.name			= "schedutil",
	.owner			= THIS_MODULE,
	.flags			= CPUFREQ_GOV_DYNAMIC_SWITCHING,
	.init			= sugov_init,
	.exit			= sugov_exit,
	.start			= sugov_start,
	.stop			= sugov_stop,
	.limits			= sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

bool sugov_is_governor(struct cpufreq_policy *policy)
{
	return policy->governor == &schedutil_gov;
}

cpufreq_governor_init(schedutil_gov);