// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */
#include <uapi/linux/sched/types.h>
#include "sched.h"

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
	struct gov_attr_set attr_set;
	unsigned int rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy *policy;

	struct sugov_tunables *tunables;
	struct list_head tunables_hook;

	raw_spinlock_t update_lock;
	u64 last_freq_update_time;
	s64 freq_update_delay_ns;
	unsigned int next_freq;
	unsigned int cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work irq_work;
	struct kthread_work work;
	struct mutex work_lock;
	struct kthread_worker worker;
	struct task_struct *thread;
	bool work_in_progress;

	bool limits_changed;
	bool need_freq_update;
};

struct sugov_cpu {
	struct update_util_data update_util;
	struct sugov_policy *sg_policy;
	unsigned int cpu;

	bool iowait_boost_pending;
	unsigned int iowait_boost;
	u64 last_update;

	unsigned long util;
	unsigned long bw_min;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_update_commit() call may not for the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * This is needed on the slow switching platforms too to prevent CPUs
	 * going offline from leaving stale IRQ work items behind.
	 */
	if (!cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
		WRITE_ONCE(sg_policy->limits_changed, false);
		sg_policy->need_freq_update = true;

		/*
		 * The above limits_changed update must occur before the reads
		 * of policy limits in cpufreq_driver_resolve_freq() or a policy
		 * limits update might be missed, so use a memory barrier to
		 * ensure it.
		 *
		 * This pairs with the write memory barrier in sugov_limits().
		 */
		smp_mb();

		return true;
	} else if (sg_policy->need_freq_update) {
		/* ignore_dl_rate_limit() wants a new frequency to be found. */
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->need_freq_update) {
		sg_policy->need_freq_update = false;
		/*
		 * The policy limits have changed, but if the return value of
		 * cpufreq_driver_resolve_freq() after applying the new limits
		 * is still equal to the previously selected frequency, the
		 * driver callback need not be invoked unless the driver
		 * specifically wants that to happen on every update of the
		 * policy limits.
		 */
		if (sg_policy->next_freq == next_freq &&
		    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
			return false;
	} else if (sg_policy->next_freq == next_freq) {
		return false;
	}

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_capacity_ref_freq - get the reference frequency that has been used to
 * correlate frequency and compute capacity for a given cpufreq policy. We use
 * the CPU managing it for the arch_scale_freq_ref() call in the function.
 * @policy: the cpufreq policy of the CPU in question.
 *
 * Return: the reference CPU frequency to compute a capacity.
 */
static __always_inline
unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
{
	unsigned int freq = arch_scale_freq_ref(policy->cpu);

	if (freq)
		return freq;

	if (arch_scale_freq_invariant())
		return policy->cpuinfo.max_freq;

	/*
	 * Apply a 25% margin so that we select a higher frequency than
	 * the current one before the CPU is fully busy:
	 */
	return policy->cur + (policy->cur >> 2);
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than the
 * raw next_freq (as calculated above) is returned, subject to policy min/max
 * and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq;

	freq = get_capacity_ref_freq(policy);
	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
		return sg_policy->next_freq;

	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
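
/*
 * Illustrative walk-through (not part of the upstream file; numbers are made
 * up): with SCHED_CAPACITY_SCALE = 1024, util = 512 and a reference
 * frequency of 2000000 kHz, map_util_freq() yields roughly
 *
 *	1.25 * 2000000 * 512 / 1024 = 1250000 kHz
 *
 * and cpufreq_driver_resolve_freq() then picks the lowest supported OPP at
 * or above that value, clamped to the policy min/max limits.
 */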

unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
				       unsigned long min,
				       unsigned long max)
{
	/* Add dvfs headroom to actual utilization */
	actual = map_util_perf(actual);
	/* Actually we don't need to target the max performance */
	if (actual < max)
		max = actual;

	/*
	 * Ensure at least minimum performance while providing more compute
	 * capacity when possible.
	 */
	return max(min, max);
}

static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
	unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);

	if (!scx_switched_all())
		util += cpu_util_cfs_boost(sg_cpu->cpu);
	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
	util = max(util, boost);
	sg_cpu->bw_min = min;
	sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}
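
/*
 * Example (illustrative, not part of the upstream file): with
 * SCHED_CAPACITY_SCALE = 1024, IOWAIT_BOOST_MIN is 128, so successive
 * in-tick IO wakeups ramp the boost as 128 -> 256 -> 512 -> 1024 and then
 * saturate at SCHED_CAPACITY_SCALE. A gap of more than one tick without a
 * request makes sugov_iowait_reset() restart the sequence from 128, or
 * clear the boost entirely if no new boost was requested.
 */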

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @max_cap: the max CPU capacity
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is decreased by this function each time an
 * increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which appears to have been idle for at least one tick also has its
 * IO boost utilization reset.
 *
 * This mechanism is designed to boost tasks doing frequent IO waits, while
 * being more conservative on tasks which do only sporadic IO operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long max_cap)
{
	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return 0;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return 0;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
			sg_cpu->iowait_boost = 0;
			return 0;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * sg_cpu->util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls;
	bool ret;

	/*
	 * The heuristic in this function is for the fair class. For SCX, the
	 * performance target comes directly from the BPF scheduler. Let's just
	 * follow it.
	 */
	if (scx_switched_all())
		return false;

	/* if capped by uclamp_max, always update to be in compliance */
	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
		return false;

	/*
	 * Maintain the frequency if the CPU has not been idle recently, as
	 * reduction is likely to be premature.
	 */
	idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* !CONFIG_NO_HZ_COMMON */
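
/*
 * Illustrative note (not part of the upstream file): once requests stop,
 * each subsequent call to sugov_iowait_apply() halves the boost
 * (1024 -> 512 -> 256 -> 128 -> 0), and the returned value is scaled to the
 * CPU's capacity, e.g. a boost of 512 on a CPU with max_cap = 512
 * contributes (512 * 512) >> SCHED_CAPACITY_SHIFT = 256 in capacity units.
 */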

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
		sg_cpu->sg_policy->need_freq_update = true;
}

static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
					      u64 time, unsigned long max_cap,
					      unsigned int flags)
{
	unsigned long boost;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
		return false;

	boost = sugov_iowait_apply(sg_cpu, time, max_cap);
	sugov_get_util(sg_cpu, boost);

	return true;
}

static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int cached_freq = sg_policy->cached_raw_freq;
	unsigned long max_cap;
	unsigned int next_f;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);

	if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
	    !sg_policy->need_freq_update) {
		next_f = sg_policy->next_freq;

		/* Restore cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = cached_freq;
	}

	if (!sugov_update_next_freq(sg_policy, time, next_f))
		return;

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		cpufreq_driver_fast_switch(sg_policy->policy, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long prev_util = sg_cpu->util;
	unsigned long max_cap;

	/*
	 * Fall back to the "frequency" path if frequency invariance is not
	 * supported, because the direct mapping between the utilization and
	 * the performance levels depends on the frequency invariance.
	 */
	if (!arch_scale_freq_invariant()) {
		sugov_update_single_freq(hook, time, flags);
		return;
	}

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
		sg_cpu->util = prev_util;

	cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min,
				   sg_cpu->util, max_cap);

	sg_policy->last_freq_update_time = time;
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max_cap;
	unsigned int j;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long boost;

		boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
		sugov_get_util(j_sg_cpu, boost);

		util = max(j_sg_cpu->util, util);
	}

	return get_next_freq(sg_policy, util, max_cap);
}

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (!sugov_update_next_freq(sg_policy, time, next_f))
			goto unlock;

		if (sg_policy->policy->fast_switch_enabled)
			cpufreq_driver_fast_switch(sg_policy->policy, next_f);
		else
			sugov_deferred_update(sg_policy);
	}
unlock:
	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock briefly to handle the case where
	 * sg_policy->next_freq is read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to
	 * false here; without the lock we could miss queueing the new update.
	 *
	 * Note: If a work was queued after the update_lock is released,
	 * sugov_work() will just be called again by kthread_work code; and the
	 * request will be processed before the sugov thread sleeps.
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
	&rate_limit_us.attr,
	NULL
};
ATTRIBUTE_GROUPS(sugov);

static void sugov_tunables_free(struct kobject *kobj)
{
	struct gov_attr_set *attr_set = to_gov_attr_set(kobj);

	kfree(to_sugov_tunables(attr_set));
}

static const struct kobj_type sugov_tunables_ktype = {
	.default_groups = sugov_groups,
	.sysfs_ops = &governor_sysfs_ops,
	.release = &sugov_tunables_free,
};
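
/*
 * Usage note (illustrative, not part of the upstream file): with per-policy
 * tunables, rate_limit_us is exposed as
 * /sys/devices/system/cpu/cpufreq/policy<N>/schedutil/rate_limit_us
 * (or under cpufreq/schedutil/ when the tunables are global). Writing a
 * value in microseconds sets the minimum delay between two consecutive
 * frequency updates for every policy attached to the tunables.
 */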

/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size = sizeof(struct sched_attr),
		.sched_policy = SCHED_DEADLINE,
		.sched_flags = SCHED_FLAG_SUGOV,
		.sched_nice = 0,
		.sched_priority = 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
		 */
		.sched_runtime = NSEC_PER_MSEC,
		.sched_deadline = 10 * NSEC_PER_MSEC,
		.sched_period = 10 * NSEC_PER_MSEC,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %pe\n", thread);
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	if (policy->dvfs_possible_from_any_cpu)
		set_cpus_allowed_ptr(thread, policy->related_cpus);
	else
		kthread_bind_mask(thread, policy->related_cpus);

	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_clear_global_tunables(void)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	/*
	 * Schedutil is the preferred governor for EAS, so rebuild sched domains
	 * on governor changes to make sure the scheduler knows about them.
	 */
	em_rebuild_sched_domains();
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_clear_global_tunables();

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_clear_global_tunables();

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);

	em_rebuild_sched_domains();
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = 0;
	sg_policy->work_in_progress = false;
	sg_policy->limits_changed = false;
	sg_policy->cached_raw_freq = 0;

	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);

	if (policy_is_shared(policy))
		uu = sugov_update_shared;
	else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
		uu = sugov_update_single_perf;
	else
		uu = sugov_update_single_freq;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	/*
	 * The limits_changed update below must take place before the updates
	 * of policy limits in cpufreq_set_policy() or a policy limits update
	 * might be missed, so use a memory barrier to ensure it.
	 *
	 * This pairs with the memory barrier in sugov_should_update_freq().
	 */
	smp_wmb();

	WRITE_ONCE(sg_policy->limits_changed, true);
}

static struct cpufreq_governor schedutil_gov = {
	.name = "schedutil",
	.owner = THIS_MODULE,
	.flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
	.init = sugov_init,
	.exit = sugov_exit,
	.start = sugov_start,
	.stop = sugov_stop,
	.limits = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

bool sugov_is_governor(struct cpufreq_policy *policy)
{
	return policy->governor == &schedutil_gov;
}

cpufreq_governor_init(schedutil_gov);