// SPDX-License-Identifier: GPL-2.0
/*
 * Energy Model of devices
 *
 * Copyright (c) 2018-2021, Arm ltd.
 * Written by: Quentin Perret, Arm ltd.
 * Improvements provided by: Lukasz Luba, Arm ltd.
 */

#define pr_fmt(fmt) "energy_model: " fmt

#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>

/*
 * Mutex serializing the registrations of performance domains and letting
 * callbacks defined by drivers sleep.
 */
static DEFINE_MUTEX(em_pd_mutex);

static void em_cpufreq_update_efficiencies(struct device *dev,
                                           struct em_perf_state *table);
static void em_check_capacity_update(void);
static void em_update_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);

static bool _is_cpu_device(struct device *dev)
{
        return (dev->bus == &cpu_subsys);
}

#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;

struct em_dbg_info {
        struct em_perf_domain *pd;
        int ps_id;
};

#define DEFINE_EM_DBG_SHOW(name, fname)                                        \
static int em_debug_##fname##_show(struct seq_file *s, void *unused)          \
{                                                                              \
        struct em_dbg_info *em_dbg = s->private;                               \
        struct em_perf_state *table;                                           \
        unsigned long val;                                                     \
                                                                               \
        rcu_read_lock();                                                       \
        table = em_perf_state_from_pd(em_dbg->pd);                             \
        val = table[em_dbg->ps_id].name;                                       \
        rcu_read_unlock();                                                     \
                                                                               \
        seq_printf(s, "%lu\n", val);                                           \
        return 0;                                                              \
}                                                                              \
DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)

DEFINE_EM_DBG_SHOW(frequency, frequency);
DEFINE_EM_DBG_SHOW(power, power);
DEFINE_EM_DBG_SHOW(cost, cost);
DEFINE_EM_DBG_SHOW(performance, performance);
DEFINE_EM_DBG_SHOW(flags, inefficiency);

static void em_debug_create_ps(struct em_perf_domain *em_pd,
                               struct em_dbg_info *em_dbg, int i,
                               struct dentry *pd)
{
        struct em_perf_state *table;
        unsigned long freq;
        struct dentry *d;
        char name[24];

        em_dbg[i].pd = em_pd;
        em_dbg[i].ps_id = i;

        rcu_read_lock();
        table = em_perf_state_from_pd(em_pd);
        freq = table[i].frequency;
        rcu_read_unlock();

        snprintf(name, sizeof(name), "ps:%lu", freq);

        /* Create per-ps directory */
        d = debugfs_create_dir(name, pd);
        debugfs_create_file("frequency", 0444, d, &em_dbg[i],
                            &em_debug_frequency_fops);
        debugfs_create_file("power", 0444, d, &em_dbg[i],
                            &em_debug_power_fops);
        debugfs_create_file("cost", 0444, d, &em_dbg[i],
                            &em_debug_cost_fops);
        debugfs_create_file("performance", 0444, d, &em_dbg[i],
                            &em_debug_performance_fops);
        debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
                            &em_debug_inefficiency_fops);
}

static int em_debug_cpus_show(struct seq_file *s, void *unused)
{
        seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);

static int em_debug_flags_show(struct seq_file *s, void *unused)
{
        struct em_perf_domain *pd = s->private;

        seq_printf(s, "%#lx\n", pd->flags);

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
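
/*
 * Illustrative sketch of the layout built by em_debug_create_pd() below,
 * assuming debugfs is mounted at /sys/kernel/debug:
 *
 *      /sys/kernel/debug/energy_model/
 *              <dev_name>/                     - one directory per perf domain
 *                      cpus                    - CPU devices only
 *                      flags
 *                      ps:<frequency>/         - one directory per perf state
 *                              frequency
 *                              power
 *                              cost
 *                              performance
 *                              inefficient
 */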

static void em_debug_create_pd(struct device *dev)
{
        struct em_dbg_info *em_dbg;
        struct dentry *d;
        int i;

        /* Create the directory of the performance domain */
        d = debugfs_create_dir(dev_name(dev), rootdir);

        if (_is_cpu_device(dev))
                debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
                                    &em_debug_cpus_fops);

        debugfs_create_file("flags", 0444, d, dev->em_pd,
                            &em_debug_flags_fops);

        em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
                              sizeof(*em_dbg), GFP_KERNEL);
        if (!em_dbg)
                return;

        /* Create a sub-directory for each performance state */
        for (i = 0; i < dev->em_pd->nr_perf_states; i++)
                em_debug_create_ps(dev->em_pd, em_dbg, i, d);
}

static void em_debug_remove_pd(struct device *dev)
{
        debugfs_lookup_and_remove(dev_name(dev), rootdir);
}

static int __init em_debug_init(void)
{
        /* Create /sys/kernel/debug/energy_model directory */
        rootdir = debugfs_create_dir("energy_model", NULL);

        return 0;
}
fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif

static void em_release_table_kref(struct kref *kref)
{
        /* It was the last owner of this table so we can free */
        kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
}

/**
 * em_table_free() - Handles safe free of the EM table when needed
 * @table : EM table which is going to be freed
 *
 * No return values.
 */
void em_table_free(struct em_perf_table *table)
{
        kref_put(&table->kref, em_release_table_kref);
}

/**
 * em_table_alloc() - Allocate a new EM table
 * @pd : EM performance domain for which this must be done
 *
 * Allocate a new EM table and initialize its kref to indicate that it
 * has a user.
 * Returns allocated table or NULL.
 */
struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
{
        struct em_perf_table *table;
        int table_size;

        table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;

        table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
        if (!table)
                return NULL;

        kref_init(&table->kref);

        return table;
}

static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
                                struct em_perf_state *table, int nr_states)
{
        u64 fmax, max_cap;
        int i, cpu;

        /* This is needed only for CPUs; EAS skips other devices */
        if (!_is_cpu_device(dev))
                return;

        cpu = cpumask_first(em_span_cpus(pd));

        /*
         * Calculate the performance value for each frequency with a linear
         * relationship. The final CPU capacity might not be ready at boot
         * time, but the EM will be updated a bit later with the correct one.
         */
        fmax = (u64) table[nr_states - 1].frequency;
        max_cap = (u64) arch_scale_cpu_capacity(cpu);
        for (i = 0; i < nr_states; i++)
                table[i].performance = div64_u64(max_cap * table[i].frequency,
                                                 fmax);
}
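
/*
 * Illustrative example of the scaling above and of the cost computed in
 * em_compute_costs() below (the numbers are made up): with
 * arch_scale_cpu_capacity() == 1024 and fmax == 2000000, an OPP at
 * frequency 1000000 gets performance = 1024 * 1000000 / 2000000 = 512.
 * If that OPP consumes 300000 uW, its cost is 300000 * 10 / 512 = 5859,
 * i.e. cost tracks the power-to-performance ratio of each state.
 */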

static int em_compute_costs(struct device *dev, struct em_perf_state *table,
                            const struct em_data_callback *cb, int nr_states,
                            unsigned long flags)
{
        unsigned long prev_cost = ULONG_MAX;
        int i, ret;

        /* This is needed only for CPUs; EAS skips other devices */
        if (!_is_cpu_device(dev))
                return 0;

        /* Compute the cost of each performance state. */
        for (i = nr_states - 1; i >= 0; i--) {
                unsigned long power_res, cost;

                if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
                        ret = cb->get_cost(dev, table[i].frequency, &cost);
                        if (ret || !cost || cost > EM_MAX_POWER) {
                                dev_err(dev, "EM: invalid cost %lu %d\n",
                                        cost, ret);
                                return -EINVAL;
                        }
                } else {
                        /* increase the resolution of 'cost' */
                        power_res = table[i].power * 10;
                        cost = power_res / table[i].performance;
                }

                table[i].cost = cost;

                if (table[i].cost >= prev_cost) {
                        table[i].flags = EM_PERF_STATE_INEFFICIENT;
                        dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
                                table[i].frequency);
                } else {
                        prev_cost = table[i].cost;
                }
        }

        return 0;
}

/**
 * em_dev_compute_costs() - Calculate cost values for a new runtime EM table
 * @dev : Device for which the EM table is to be updated
 * @table : The new EM table that is going to get the costs calculated
 * @nr_states : Number of performance states
 *
 * Calculate the em_perf_state::cost values for a new runtime EM table. The
 * values are used by EAS during task placement. It also calculates and sets
 * the efficiency flag for each performance state. When the function finishes
 * successfully the EM table is ready to be updated and used by EAS.
 *
 * Return 0 on success or a proper error in case of failure.
 */
int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
                         int nr_states)
{
        return em_compute_costs(dev, table, NULL, nr_states, 0);
}

/**
 * em_dev_update_perf_domain() - Update runtime EM table for a device
 * @dev : Device for which the EM is to be updated
 * @new_table : The new EM table that is going to be used from now
 *
 * Update the runtime modifiable EM table of @dev using the provided
 * @new_table.
 *
 * This function uses a mutex to serialize writers, so it must not be called
 * from a context that cannot sleep.
 *
 * Return 0 on success or an error code on failure.
 */
int em_dev_update_perf_domain(struct device *dev,
                              struct em_perf_table *new_table)
{
        struct em_perf_table *old_table;
        struct em_perf_domain *pd;

        if (!dev)
                return -EINVAL;

        /* Serialize update/unregister or concurrent updates */
        mutex_lock(&em_pd_mutex);

        if (!dev->em_pd) {
                mutex_unlock(&em_pd_mutex);
                return -EINVAL;
        }
        pd = dev->em_pd;

        kref_get(&new_table->kref);

        old_table = rcu_dereference_protected(pd->em_table,
                                              lockdep_is_held(&em_pd_mutex));
        rcu_assign_pointer(pd->em_table, new_table);

        em_cpufreq_update_efficiencies(dev, new_table->state);

        em_table_free(old_table);

        mutex_unlock(&em_pd_mutex);
        return 0;
}
EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
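
/*
 * Illustrative sketch only (it mirrors em_table_dup() and
 * em_recalc_and_update() further down, and is not an API defined here):
 * a driver modifying the EM at runtime would typically copy the current
 * table, adjust the power values, recompute the costs and publish the
 * result (error handling elided):
 *
 *      struct em_perf_domain *pd = em_pd_get(dev);
 *      struct em_perf_table *new_table = em_table_alloc(pd);
 *
 *      rcu_read_lock();
 *      memcpy(new_table->state, em_perf_state_from_pd(pd),
 *             sizeof(struct em_perf_state) * pd->nr_perf_states);
 *      rcu_read_unlock();
 *
 *      ... update new_table->state[i].power from driver data ...
 *
 *      em_dev_compute_costs(dev, new_table->state, pd->nr_perf_states);
 *      em_dev_update_perf_domain(dev, new_table);
 *      em_table_free(new_table);       (drop the updater's reference)
 */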

static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
                                struct em_perf_state *table,
                                const struct em_data_callback *cb,
                                unsigned long flags)
{
        unsigned long power, freq, prev_freq = 0;
        int nr_states = pd->nr_perf_states;
        int i, ret;

        /* Build the list of performance states for this performance domain */
        for (i = 0, freq = 0; i < nr_states; i++, freq++) {
                /*
                 * active_power() is a driver callback which ceils 'freq' to
                 * the lowest performance state of 'dev' above 'freq' and
                 * updates 'power' and 'freq' accordingly.
                 */
                ret = cb->active_power(dev, &power, &freq);
                if (ret) {
                        dev_err(dev, "EM: invalid perf. state: %d\n",
                                ret);
                        return -EINVAL;
                }

                /*
                 * We expect the driver callback to increase the frequency for
                 * higher performance states.
                 */
                if (freq <= prev_freq) {
                        dev_err(dev, "EM: non-increasing freq: %lu\n",
                                freq);
                        return -EINVAL;
                }

                /*
                 * The power returned by active_power() is expected to be
                 * positive and within range.
                 */
                if (!power || power > EM_MAX_POWER) {
                        dev_err(dev, "EM: invalid power: %lu\n",
                                power);
                        return -EINVAL;
                }

                table[i].power = power;
                table[i].frequency = prev_freq = freq;
        }

        em_init_performance(dev, pd, table, nr_states);

        ret = em_compute_costs(dev, table, cb, nr_states, flags);
        if (ret)
                return -EINVAL;

        return 0;
}

static int em_create_pd(struct device *dev, int nr_states,
                        const struct em_data_callback *cb,
                        const cpumask_t *cpus,
                        unsigned long flags)
{
        struct em_perf_table *em_table;
        struct em_perf_domain *pd;
        struct device *cpu_dev;
        int cpu, ret, num_cpus;

        if (_is_cpu_device(dev)) {
                num_cpus = cpumask_weight(cpus);

                /* Prevent the max possible energy calculation from overflowing */
                if (num_cpus > EM_MAX_NUM_CPUS) {
                        dev_err(dev, "EM: too many CPUs, overflow possible\n");
                        return -EINVAL;
                }

                pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
                if (!pd)
                        return -ENOMEM;

                cpumask_copy(em_span_cpus(pd), cpus);
        } else {
                pd = kzalloc(sizeof(*pd), GFP_KERNEL);
                if (!pd)
                        return -ENOMEM;
        }

        pd->nr_perf_states = nr_states;

        em_table = em_table_alloc(pd);
        if (!em_table)
                goto free_pd;

        ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
        if (ret)
                goto free_pd_table;

        rcu_assign_pointer(pd->em_table, em_table);

        if (_is_cpu_device(dev))
                for_each_cpu(cpu, cpus) {
                        cpu_dev = get_cpu_device(cpu);
                        cpu_dev->em_pd = pd;
                }

        dev->em_pd = pd;

        return 0;

free_pd_table:
        kfree(em_table);
free_pd:
        kfree(pd);
        return -EINVAL;
}

static void
em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
{
        struct em_perf_domain *pd = dev->em_pd;
        struct cpufreq_policy *policy;
        int found = 0;
        int i, cpu;

        if (!_is_cpu_device(dev))
                return;

        /* Try to get a CPU which is active and in this PD */
        cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
        if (cpu >= nr_cpu_ids) {
                dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
                return;
        }

        policy = cpufreq_cpu_get(cpu);
        if (!policy) {
                dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
                return;
        }

        for (i = 0; i < pd->nr_perf_states; i++) {
                if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
                        continue;

                if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
                        found++;
        }

        cpufreq_cpu_put(policy);

        if (!found)
                return;

        /*
         * Efficiencies have been installed in CPUFreq, inefficient frequencies
         * will be skipped. The EM can do the same.
         */
        pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
}

/**
 * em_pd_get() - Return the performance domain for a device
 * @dev : Device to find the performance domain for
 *
 * Returns the performance domain to which @dev belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_pd_get(struct device *dev)
{
        if (IS_ERR_OR_NULL(dev))
                return NULL;

        return dev->em_pd;
}
EXPORT_SYMBOL_GPL(em_pd_get);

/**
 * em_cpu_get() - Return the performance domain for a CPU
 * @cpu : CPU to find the performance domain for
 *
 * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
        struct device *cpu_dev;

        cpu_dev = get_cpu_device(cpu);
        if (!cpu_dev)
                return NULL;

        return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);
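
/*
 * Illustrative sketch only: a reader that wants to walk the performance
 * states of a CPU combines em_cpu_get() with em_perf_state_from_pd() under
 * RCU, e.g.:
 *
 *      struct em_perf_domain *pd = em_cpu_get(cpu);
 *      struct em_perf_state *table;
 *      int i;
 *
 *      if (!pd)
 *              return;
 *
 *      rcu_read_lock();
 *      table = em_perf_state_from_pd(pd);
 *      for (i = 0; i < pd->nr_perf_states; i++)
 *              pr_debug("freq=%lu power=%lu\n", table[i].frequency,
 *                       table[i].power);
 *      rcu_read_unlock();
 */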

/**
 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
 * @dev : Device for which the EM is to register
 * @nr_states : Number of performance states to register
 * @cb : Callback functions providing the data of the Energy Model
 * @cpus : Pointer to cpumask_t, which in case of a CPU device is
 *		obligatory. It can be taken from e.g. 'policy->cpus'. For other
 *		types of devices this should be set to NULL.
 * @microwatts : Flag indicating whether the power values are in micro-Watts
 *		or in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
 * It is important to set @microwatts correctly. Some kernel sub-systems might
 * rely on this flag and check whether all devices in the EM use the same
 * scale.
 *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success.
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
                                const struct em_data_callback *cb,
                                const cpumask_t *cpus, bool microwatts)
{
        int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);

        if (_is_cpu_device(dev))
                em_check_capacity_update();

        return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
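
/*
 * Illustrative sketch only (the names below are hypothetical): a cpufreq
 * driver would typically register its EM from the policy init path with an
 * active_power() callback that reports the power of each OPP, e.g.:
 *
 *      static int my_active_power(struct device *dev, unsigned long *power,
 *                                 unsigned long *freq)
 *      {
 *              ... round *freq up to the next supported OPP, then set
 *                  *power to that OPP's power in micro-Watts ...
 *              return 0;
 *      }
 *
 *      static const struct em_data_callback em_cb = {
 *              .active_power = my_active_power,
 *      };
 *
 *      em_dev_register_perf_domain(cpu_dev, nr_opps, &em_cb, policy->cpus,
 *                                  true);
 */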

/**
 * em_dev_register_pd_no_update() - Register a perf domain for a device
 * @dev : Device to register the PD for
 * @nr_states : Number of performance states in the new PD
 * @cb : Callback functions for populating the energy model
 * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
 * @microwatts : Whether or not the power values in the EM will be in uW
 *
 * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
 * update after registering the PD, even if @dev is a CPU device.
 */
int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
                                 const struct em_data_callback *cb,
                                 const cpumask_t *cpus, bool microwatts)
{
        struct em_perf_table *em_table;
        unsigned long cap, prev_cap = 0;
        unsigned long flags = 0;
        int cpu, ret;

        if (!dev || !nr_states || !cb)
                return -EINVAL;

        /*
         * Use a mutex to serialize the registration of performance domains
         * and let the driver-defined callback functions sleep.
         */
        mutex_lock(&em_pd_mutex);

        if (dev->em_pd) {
                ret = -EEXIST;
                goto unlock;
        }

        if (_is_cpu_device(dev)) {
                if (!cpus) {
                        dev_err(dev, "EM: invalid CPU mask\n");
                        ret = -EINVAL;
                        goto unlock;
                }

                for_each_cpu(cpu, cpus) {
                        if (em_cpu_get(cpu)) {
                                dev_err(dev, "EM: exists for CPU%d\n", cpu);
                                ret = -EEXIST;
                                goto unlock;
                        }
                        /*
                         * All CPUs of a domain must have the same
                         * micro-architecture since they all share the same
                         * table.
                         */
                        cap = arch_scale_cpu_capacity(cpu);
                        if (prev_cap && prev_cap != cap) {
                                dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
                                        cpumask_pr_args(cpus));

                                ret = -EINVAL;
                                goto unlock;
                        }
                        prev_cap = cap;
                }
        }

        if (microwatts)
                flags |= EM_PERF_DOMAIN_MICROWATTS;
        else if (cb->get_cost)
                flags |= EM_PERF_DOMAIN_ARTIFICIAL;

        /*
         * The EM only supports uW (the exception is an artificial EM).
         * Therefore, check and force the drivers to provide power in uW.
         */
        if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
                dev_err(dev, "EM: only supports uW power values\n");
                ret = -EINVAL;
                goto unlock;
        }

        ret = em_create_pd(dev, nr_states, cb, cpus, flags);
        if (ret)
                goto unlock;

        dev->em_pd->flags |= flags;
        dev->em_pd->min_perf_state = 0;
        dev->em_pd->max_perf_state = nr_states - 1;

        em_table = rcu_dereference_protected(dev->em_pd->em_table,
                                             lockdep_is_held(&em_pd_mutex));
        em_cpufreq_update_efficiencies(dev, em_table->state);

        em_debug_create_pd(dev);
        dev_info(dev, "EM: created perf domain\n");

unlock:
        mutex_unlock(&em_pd_mutex);

        return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);

/**
 * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
 * @dev : Device for which the EM is registered
 *
 * Unregister the EM for the specified @dev (but not a CPU device).
 */
void em_dev_unregister_perf_domain(struct device *dev)
{
        if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
                return;

        if (_is_cpu_device(dev))
                return;

        /*
         * The mutex separates all register/unregister requests and protects
         * from potential clean-up/setup issues in the debugfs directories.
         * The debugfs directory name is the same as the device's name.
         */
        mutex_lock(&em_pd_mutex);
        em_debug_remove_pd(dev);

        em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
                                                lockdep_is_held(&em_pd_mutex)));

        kfree(dev->em_pd);
        dev->em_pd = NULL;
        mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
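
/*
 * Illustrative sketch only: for a non-CPU device (CPU perf domains are never
 * unregistered, see the early return in em_dev_unregister_perf_domain()
 * above), the expected pairing is e.g.:
 *
 *      probe():  em_dev_register_perf_domain(dev, nr_states, &em_cb, NULL,
 *                                            true);
 *      remove(): em_dev_unregister_perf_domain(dev);
 */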

static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
{
        struct em_perf_table *em_table;
        struct em_perf_state *ps, *new_ps;
        int ps_size;

        em_table = em_table_alloc(pd);
        if (!em_table)
                return NULL;

        new_ps = em_table->state;

        rcu_read_lock();
        ps = em_perf_state_from_pd(pd);
        /* Initialize data based on old table */
        ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
        memcpy(new_ps, ps, ps_size);

        rcu_read_unlock();

        return em_table;
}

static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
                                struct em_perf_table *em_table)
{
        int ret;

        if (!em_is_artificial(pd)) {
                ret = em_compute_costs(dev, em_table->state, NULL,
                                       pd->nr_perf_states, pd->flags);
                if (ret)
                        goto free_em_table;
        }

        ret = em_dev_update_perf_domain(dev, em_table);
        if (ret)
                goto free_em_table;

        /*
         * This is a one-time update, so give up ownership in this updater.
         * The EM framework has incremented the usage counter and from now on
         * will keep the reference (and free the memory when needed).
         */
free_em_table:
        em_table_free(em_table);
        return ret;
}

/*
 * Adjustment of CPU performance values after boot, when all CPU capacities
 * are correctly calculated.
 */
static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
                                   struct em_perf_domain *pd)
{
        unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
        struct em_perf_table *em_table;
        struct em_perf_state *table;
        unsigned long em_max_perf;

        rcu_read_lock();
        table = em_perf_state_from_pd(pd);
        em_max_perf = table[pd->nr_perf_states - 1].performance;
        rcu_read_unlock();

        if (em_max_perf == cpu_capacity)
                return;

        pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
                 cpu_capacity, em_max_perf);

        em_table = em_table_dup(pd);
        if (!em_table) {
                dev_warn(dev, "EM: allocation failed\n");
                return;
        }

        em_init_performance(dev, pd, em_table->state, pd->nr_perf_states);

        em_recalc_and_update(dev, pd, em_table);
}

/**
 * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
 * @cpu: Target CPU.
 *
 * Adjust the existing EM for @cpu after a capacity update under the assumption
 * that the capacity has been updated in the same way for all of the CPUs in
 * the same perf domain.
 */
void em_adjust_cpu_capacity(unsigned int cpu)
{
        struct device *dev = get_cpu_device(cpu);
        struct em_perf_domain *pd;

        pd = em_pd_get(dev);
        if (pd)
                em_adjust_new_capacity(cpu, dev, pd);
}

static void em_check_capacity_update(void)
{
        cpumask_var_t cpu_done_mask;
        int cpu;

        if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
                pr_warn("no free memory\n");
                return;
        }

        /* Check if the CPU capacity has changed and, if so, update the EM */
        for_each_possible_cpu(cpu) {
                struct cpufreq_policy *policy;
                struct em_perf_domain *pd;
                struct device *dev;

                if (cpumask_test_cpu(cpu, cpu_done_mask))
                        continue;

                policy = cpufreq_cpu_get(cpu);
                if (!policy) {
                        pr_debug("Accessing cpu%d policy failed\n", cpu);
                        schedule_delayed_work(&em_update_work,
                                              msecs_to_jiffies(1000));
                        break;
                }
                cpufreq_cpu_put(policy);

                dev = get_cpu_device(cpu);
                pd = em_pd_get(dev);
                if (!pd || em_is_artificial(pd))
                        continue;

                cpumask_or(cpu_done_mask, cpu_done_mask,
                           em_span_cpus(pd));

                em_adjust_new_capacity(cpu, dev, pd);
        }

        free_cpumask_var(cpu_done_mask);
}

static void em_update_workfn(struct work_struct *work)
{
        em_check_capacity_update();
}

/**
 * em_dev_update_chip_binning() - Update Energy Model after the new voltage
 *				information is present in the OPPs.
 * @dev : Device for which the Energy Model has to be updated.
 *
 * This function allows the EM to be easily updated with new values available
 * in the OPP framework and DT. It can be used after the chip has been properly
 * verified by device drivers and the voltages adjusted for the 'chip binning'.
 *
 * Return 0 on success or a negative error code on failure.
 */
int em_dev_update_chip_binning(struct device *dev)
{
        struct em_perf_table *em_table;
        struct em_perf_domain *pd;
        int i, ret;

        if (IS_ERR_OR_NULL(dev))
                return -EINVAL;

        pd = em_pd_get(dev);
        if (!pd) {
                dev_warn(dev, "Couldn't find Energy Model\n");
                return -EINVAL;
        }

        em_table = em_table_dup(pd);
        if (!em_table) {
                dev_warn(dev, "EM: allocation failed\n");
                return -ENOMEM;
        }

        /* Update power values which might change due to new voltage in OPPs */
        for (i = 0; i < pd->nr_perf_states; i++) {
                unsigned long freq = em_table->state[i].frequency;
                unsigned long power;

                ret = dev_pm_opp_calc_power(dev, &power, &freq);
                if (ret) {
                        em_table_free(em_table);
                        return ret;
                }

                em_table->state[i].power = power;
        }

        return em_recalc_and_update(dev, pd, em_table);
}
EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
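
/*
 * Illustrative sketch only: a SoC driver that adjusts OPP voltages once the
 * chip binning/speed grade is known would then refresh the EM power values:
 *
 *      ... adjust the OPP voltages through the OPP framework ...
 *      ret = em_dev_update_chip_binning(cpu_dev);
 *      if (ret)
 *              dev_warn(cpu_dev, "failed to update EM: %d\n", ret);
 */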

/**
 * em_update_performance_limits() - Update Energy Model with performance
 *				limits information.
 * @pd : Performance Domain with EM that has to be updated.
 * @freq_min_khz : New minimum allowed frequency for this device.
 * @freq_max_khz : New maximum allowed frequency for this device.
 *
 * This function allows the EM to be updated with information about available
 * performance levels. It takes the minimum and maximum frequency in kHz
 * and does an internal translation to performance levels.
 * Returns 0 on success or -EINVAL when failed.
 */
int em_update_performance_limits(struct em_perf_domain *pd,
                                 unsigned long freq_min_khz, unsigned long freq_max_khz)
{
        struct em_perf_state *table;
        int min_ps = -1;
        int max_ps = -1;
        int i;

        if (!pd)
                return -EINVAL;

        rcu_read_lock();
        table = em_perf_state_from_pd(pd);

        for (i = 0; i < pd->nr_perf_states; i++) {
                if (freq_min_khz == table[i].frequency)
                        min_ps = i;
                if (freq_max_khz == table[i].frequency)
                        max_ps = i;
        }
        rcu_read_unlock();

        /* Only update when both are found and sane */
        if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
                return -EINVAL;

        /* Guard simultaneous updates and make them atomic */
        mutex_lock(&em_pd_mutex);
        pd->min_perf_state = min_ps;
        pd->max_perf_state = max_ps;
        mutex_unlock(&em_pd_mutex);

        return 0;
}
EXPORT_SYMBOL_GPL(em_update_performance_limits);
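
/*
 * Illustrative sketch only (the frequencies are made up and must match EM
 * table entries exactly): a driver that knows only a subset of the
 * registered OPPs is usable can narrow the EM's performance-state range:
 *
 *      struct em_perf_domain *pd = em_cpu_get(cpu);
 *
 *      if (pd && em_update_performance_limits(pd, 500000, 2000000))
 *              pr_debug("EM limits not updated\n");
 */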

static void rebuild_sd_workfn(struct work_struct *work)
{
        rebuild_sched_domains_energy();
}

void em_rebuild_sched_domains(void)
{
        static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

        /*
         * When called from the cpufreq_register_driver() path, the
         * cpu_hotplug_lock is already held, so use a work item to
         * avoid nested locking in rebuild_sched_domains().
         */
        schedule_work(&rebuild_sd_work);
}