// SPDX-License-Identifier: GPL-2.0
/*
 * Energy Model of devices
 *
 * Copyright (c) 2018-2021, Arm ltd.
 * Written by: Quentin Perret, Arm ltd.
 * Improvements provided by: Lukasz Luba, Arm ltd.
 */

#define pr_fmt(fmt) "energy_model: " fmt

#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>

#include "em_netlink.h"

/*
 * Mutex serializing the registrations of performance domains and letting
 * callbacks defined by drivers sleep.
 */
static DEFINE_MUTEX(em_pd_mutex);

/*
 * Manage performance domains with IDs. The performance domains can be
 * iterated through the list, and a specific one can be looked up by its ID.
 * The mutex serializes accesses to the list. When holding em_pd_list_mutex,
 * em_pd_mutex must not be taken, to avoid a potential deadlock.
 */
static DEFINE_IDA(em_pd_ida);
static LIST_HEAD(em_pd_list);
static DEFINE_MUTEX(em_pd_list_mutex);

static void em_cpufreq_update_efficiencies(struct device *dev,
					   struct em_perf_state *table);
static void em_check_capacity_update(void);
static void em_update_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);

static bool _is_cpu_device(struct device *dev)
{
	return (dev->bus == &cpu_subsys);
}

#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;

struct em_dbg_info {
	struct em_perf_domain *pd;
	int ps_id;
};

#define DEFINE_EM_DBG_SHOW(name, fname)					\
static int em_debug_##fname##_show(struct seq_file *s, void *unused)	\
{									\
	struct em_dbg_info *em_dbg = s->private;			\
	struct em_perf_state *table;					\
	unsigned long val;						\
									\
	rcu_read_lock();						\
	table = em_perf_state_from_pd(em_dbg->pd);			\
	val = table[em_dbg->ps_id].name;				\
	rcu_read_unlock();						\
									\
	seq_printf(s, "%lu\n", val);					\
	return 0;							\
}									\
DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)

DEFINE_EM_DBG_SHOW(frequency, frequency);
DEFINE_EM_DBG_SHOW(power, power);
DEFINE_EM_DBG_SHOW(cost, cost);
DEFINE_EM_DBG_SHOW(performance, performance);
DEFINE_EM_DBG_SHOW(flags, inefficiency);

static void em_debug_create_ps(struct em_perf_domain *em_pd,
			       struct em_dbg_info *em_dbg, int i,
			       struct dentry *pd)
{
	struct em_perf_state *table;
	unsigned long freq;
	struct dentry *d;
	char name[24];

	em_dbg[i].pd = em_pd;
	em_dbg[i].ps_id = i;

	rcu_read_lock();
	table = em_perf_state_from_pd(em_pd);
	freq = table[i].frequency;
	rcu_read_unlock();

	snprintf(name, sizeof(name), "ps:%lu", freq);

	/* Create per-ps directory */
	d = debugfs_create_dir(name, pd);
	debugfs_create_file("frequency", 0444, d, &em_dbg[i],
			    &em_debug_frequency_fops);
	debugfs_create_file("power", 0444, d, &em_dbg[i],
			    &em_debug_power_fops);
	debugfs_create_file("cost", 0444, d, &em_dbg[i],
			    &em_debug_cost_fops);
	debugfs_create_file("performance", 0444, d, &em_dbg[i],
			    &em_debug_performance_fops);
	debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
			    &em_debug_inefficiency_fops);
}

static int em_debug_cpus_show(struct seq_file *s, void *unused)
{
	seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);

static int em_debug_flags_show(struct seq_file *s, void *unused)
{
	struct em_perf_domain *pd = s->private;

	seq_printf(s, "%#lx\n", pd->flags);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_flags);

static int em_debug_id_show(struct seq_file *s, void *unused)
{
	struct em_perf_domain *pd = s->private;

	seq_printf(s, "%d\n", pd->id);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_id);

static void em_debug_create_pd(struct device *dev)
{
	struct em_dbg_info *em_dbg;
	struct dentry *d;
	int i;

	/* Create the directory of the performance domain */
	d = debugfs_create_dir(dev_name(dev), rootdir);

	if (_is_cpu_device(dev))
		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
				    &em_debug_cpus_fops);

	debugfs_create_file("flags", 0444, d, dev->em_pd,
			    &em_debug_flags_fops);

	debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops);

	em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
			      sizeof(*em_dbg), GFP_KERNEL);
	if (!em_dbg)
		return;

	/* Create a sub-directory for each performance state */
	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
		em_debug_create_ps(dev->em_pd, em_dbg, i, d);

}

static void em_debug_remove_pd(struct device *dev)
{
	debugfs_lookup_and_remove(dev_name(dev), rootdir);
}

static int __init em_debug_init(void)
{
	/* Create /sys/kernel/debug/energy_model directory */
	rootdir = debugfs_create_dir("energy_model", NULL);

	return 0;
}
fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif

static void em_release_table_kref(struct kref *kref)
{
	/* It was the last owner of this table so we can free */
	kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
}

/**
 * em_table_free() - Handles safe free of the EM table when needed
 * @table : EM table which is going to be freed
 *
 * No return values.
 */
void em_table_free(struct em_perf_table *table)
{
	kref_put(&table->kref, em_release_table_kref);
}

/**
 * em_table_alloc() - Allocate a new EM table
 * @pd : EM performance domain for which this must be done
 *
 * Allocate a new EM table and initialize its kref to indicate that it
 * has a user.
 * Returns allocated table or NULL.
 */
struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
{
	struct em_perf_table *table;
	int table_size;

	table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;

	table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
	if (!table)
		return NULL;

	kref_init(&table->kref);

	return table;
}

static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
				struct em_perf_state *table, int nr_states)
{
	u64 fmax, max_cap;
	int i, cpu;

	/* This is needed only for CPUs and EAS; skip other devices. */
	if (!_is_cpu_device(dev))
		return;

	cpu = cpumask_first(em_span_cpus(pd));

	/*
	 * Calculate the performance value for each frequency assuming a
	 * linear relationship. The final CPU capacity might not be ready at
	 * boot time, but the EM will be updated a bit later with the correct
	 * value.
	 */
	fmax = (u64) table[nr_states - 1].frequency;
	max_cap = (u64) arch_scale_cpu_capacity(cpu);
	for (i = 0; i < nr_states; i++)
		table[i].performance = div64_u64(max_cap * table[i].frequency,
						 fmax);
}

static int em_compute_costs(struct device *dev, struct em_perf_state *table,
			    const struct em_data_callback *cb, int nr_states,
			    unsigned long flags)
{
	unsigned long prev_cost = ULONG_MAX;
	int i, ret;

	/* This is needed only for CPUs and EAS; skip other devices. */
	if (!_is_cpu_device(dev))
		return 0;

	/* Compute the cost of each performance state. */
	for (i = nr_states - 1; i >= 0; i--) {
		unsigned long power_res, cost;

		if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
			ret = cb->get_cost(dev, table[i].frequency, &cost);
			if (ret || !cost || cost > EM_MAX_POWER) {
				dev_err(dev, "EM: invalid cost %lu %d\n",
					cost, ret);
				return -EINVAL;
			}
		} else {
			/* increase the resolution of the 'cost' calculation */
			power_res = table[i].power * 10;
			cost = power_res / table[i].performance;
		}

		table[i].cost = cost;

		if (table[i].cost >= prev_cost) {
			table[i].flags = EM_PERF_STATE_INEFFICIENT;
			dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
				table[i].frequency);
		} else {
			prev_cost = table[i].cost;
		}
	}

	return 0;
}
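
/*
 * Illustrative example (hypothetical numbers, not from any real platform):
 * with max_cap = 1024 and fmax = 1000000 kHz, em_init_performance() and the
 * cost = power * 10 / performance computation above would yield:
 *
 *	freq [kHz]	power [uW]	performance	cost
 *	 500000		120000		 512		2343
 *	 800000		250000		 819		3052	<- inefficient
 *	1000000		300000		1024		2929
 *
 * Walking the states from the highest frequency down, the 800 MHz state costs
 * more energy per unit of work than the 1 GHz one, so it gets the
 * EM_PERF_STATE_INEFFICIENT flag and may later be skipped by EAS and CPUFreq.
 */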

/**
 * em_dev_compute_costs() - Calculate cost values for new runtime EM table
 * @dev : Device for which the EM table is to be updated
 * @table : The new EM table that is going to get the costs calculated
 * @nr_states : Number of performance states
 *
 * Calculate the em_perf_state::cost values for a new runtime EM table. The
 * values are used by EAS during task placement. This function also calculates
 * and sets the efficiency flag for each performance state. When the function
 * finishes successfully, the EM table is ready to be updated and used by EAS.
 *
 * Return 0 on success or a proper error in case of failure.
 */
int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
			 int nr_states)
{
	return em_compute_costs(dev, table, NULL, nr_states, 0);
}

/**
 * em_dev_update_perf_domain() - Update runtime EM table for a device
 * @dev : Device for which the EM is to be updated
 * @new_table : The new EM table that is going to be used from now
 *
 * Update EM runtime modifiable table for the @dev using the provided @table.
 *
 * This function uses a mutex to serialize writers, so it must not be called
 * from a non-sleeping context.
 *
 * Return 0 on success or an error code on failure.
 */
int em_dev_update_perf_domain(struct device *dev,
			      struct em_perf_table *new_table)
{
	struct em_perf_table *old_table;
	struct em_perf_domain *pd;

	if (!dev)
		return -EINVAL;

	/* Serialize update/unregister or concurrent updates */
	mutex_lock(&em_pd_mutex);

	if (!dev->em_pd) {
		mutex_unlock(&em_pd_mutex);
		return -EINVAL;
	}
	pd = dev->em_pd;

	kref_get(&new_table->kref);

	old_table = rcu_dereference_protected(pd->em_table,
					      lockdep_is_held(&em_pd_mutex));
	rcu_assign_pointer(pd->em_table, new_table);

	em_cpufreq_update_efficiencies(dev, new_table->state);

	em_table_free(old_table);

	mutex_unlock(&em_pd_mutex);

	em_notify_pd_updated(pd);
	return 0;
}
EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
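
/*
 * A minimal sketch (not code from this file) of the runtime-update flow a
 * driver could follow with the helpers above; new_power_uw[] stands for
 * whatever updated per-state power data the driver has:
 *
 *	struct em_perf_domain *pd = em_pd_get(dev);
 *	struct em_perf_table *new_table;
 *	struct em_perf_state *table;
 *	int i, ret;
 *
 *	new_table = em_table_alloc(pd);
 *	if (!new_table)
 *		return -ENOMEM;
 *	table = new_table->state;
 *
 *	rcu_read_lock();
 *	memcpy(table, em_perf_state_from_pd(pd),
 *	       sizeof(*table) * pd->nr_perf_states);
 *	rcu_read_unlock();
 *
 *	for (i = 0; i < pd->nr_perf_states; i++)
 *		table[i].power = new_power_uw[i];
 *
 *	ret = em_dev_compute_costs(dev, table, pd->nr_perf_states);
 *	if (!ret)
 *		ret = em_dev_update_perf_domain(dev, new_table);
 *
 *	em_table_free(new_table);
 *	return ret;
 */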

static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
				struct em_perf_state *table,
				const struct em_data_callback *cb,
				unsigned long flags)
{
	unsigned long power, freq, prev_freq = 0;
	int nr_states = pd->nr_perf_states;
	int i, ret;

	/* Build the list of performance states for this performance domain */
	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
		/*
		 * active_power() is a driver callback which ceils 'freq' to
		 * the lowest performance state of 'dev' above 'freq' and
		 * updates 'power' and 'freq' accordingly.
		 */
		ret = cb->active_power(dev, &power, &freq);
		if (ret) {
			dev_err(dev, "EM: invalid perf. state: %d\n",
				ret);
			return -EINVAL;
		}

		/*
		 * We expect the driver callback to increase the frequency for
		 * higher performance states.
		 */
		if (freq <= prev_freq) {
			dev_err(dev, "EM: non-increasing freq: %lu\n",
				freq);
			return -EINVAL;
		}

		/*
		 * The power returned by active_power() is expected to be
		 * positive and to be within range.
		 */
		if (!power || power > EM_MAX_POWER) {
			dev_err(dev, "EM: invalid power: %lu\n",
				power);
			return -EINVAL;
		}

		table[i].power = power;
		table[i].frequency = prev_freq = freq;
	}

	em_init_performance(dev, pd, table, nr_states);

	ret = em_compute_costs(dev, table, cb, nr_states, flags);
	if (ret)
		return -EINVAL;

	return 0;
}

static int em_create_pd(struct device *dev, int nr_states,
			const struct em_data_callback *cb,
			const cpumask_t *cpus,
			unsigned long flags)
{
	struct em_perf_table *em_table;
	struct em_perf_domain *pd;
	struct device *cpu_dev;
	int cpu, ret, num_cpus, id;

	if (_is_cpu_device(dev)) {
		num_cpus = cpumask_weight(cpus);

		/* Prevent the max possible energy calculation from overflowing */
		if (num_cpus > EM_MAX_NUM_CPUS) {
			dev_err(dev, "EM: too many CPUs, overflow possible\n");
			return -EINVAL;
		}

		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
		if (!pd)
			return -ENOMEM;

		cpumask_copy(em_span_cpus(pd), cpus);
	} else {
		pd = kzalloc(sizeof(*pd), GFP_KERNEL);
		if (!pd)
			return -ENOMEM;
	}

	pd->nr_perf_states = nr_states;

	INIT_LIST_HEAD(&pd->node);

	id = ida_alloc(&em_pd_ida, GFP_KERNEL);
	if (id < 0) {
		kfree(pd);
		return id;
	}
	pd->id = id;

	em_table = em_table_alloc(pd);
	if (!em_table)
		goto free_pd;

	ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
	if (ret)
		goto free_pd_table;

	rcu_assign_pointer(pd->em_table, em_table);

	if (_is_cpu_device(dev))
		for_each_cpu(cpu, cpus) {
			cpu_dev = get_cpu_device(cpu);
			cpu_dev->em_pd = pd;
		}

	dev->em_pd = pd;

	return 0;

free_pd_table:
	kfree(em_table);
free_pd:
	kfree(pd);
	ida_free(&em_pd_ida, id);
	return -EINVAL;
}

static void
em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
{
	struct em_perf_domain *pd = dev->em_pd;
	struct cpufreq_policy *policy;
	int found = 0;
	int i, cpu;

	if (!_is_cpu_device(dev))
		return;

	/* Try to get a CPU which is active and in this PD */
	cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
	if (cpu >= nr_cpu_ids) {
		dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
		return;
	}

	policy = cpufreq_cpu_get(cpu);
	if (!policy) {
		dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
		return;
	}

	for (i = 0; i < pd->nr_perf_states; i++) {
		if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
			continue;

		if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
			found++;
	}

	cpufreq_cpu_put(policy);

	if (!found)
		return;

	/*
	 * Efficiencies have been installed in CPUFreq, inefficient frequencies
	 * will be skipped. The EM can do the same.
	 */
	pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
}

/**
 * em_pd_get() - Return the performance domain for a device
 * @dev : Device to find the performance domain for
 *
 * Returns the performance domain to which @dev belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_pd_get(struct device *dev)
{
	if (IS_ERR_OR_NULL(dev))
		return NULL;

	return dev->em_pd;
}
EXPORT_SYMBOL_GPL(em_pd_get);

/**
 * em_cpu_get() - Return the performance domain for a CPU
 * @cpu : CPU to find the performance domain for
 *
 * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
	struct device *cpu_dev;

	cpu_dev = get_cpu_device(cpu);
	if (!cpu_dev)
		return NULL;

	return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);
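
/*
 * A short illustrative sketch of how an EM consumer might read the table of a
 * CPU's perf domain; the RCU read-side lock must be held while the pointer
 * returned by em_perf_state_from_pd() is in use:
 *
 *	struct em_perf_domain *pd = em_cpu_get(cpu);
 *	struct em_perf_state *table;
 *	unsigned long max_power;
 *
 *	if (!pd)
 *		return -ENODEV;
 *
 *	rcu_read_lock();
 *	table = em_perf_state_from_pd(pd);
 *	max_power = table[pd->nr_perf_states - 1].power;
 *	rcu_read_unlock();
 */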

/**
 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
 * @dev : Device for which the EM is to register
 * @nr_states : Number of performance states to register
 * @cb : Callback functions providing the data of the Energy Model
 * @cpus : Pointer to cpumask_t, which in case of a CPU device is
 *		obligatory. It can be taken from e.g. 'policy->cpus'. For other
 *		types of devices it should be set to NULL.
 * @microwatts : Flag indicating whether the power values are in micro-Watts or
 *		in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
 * It is important to set @microwatts to the correct value. Some kernel
 * sub-systems might rely on this flag and check if all devices in the EM are
 * using the same scale.
 *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
				const struct em_data_callback *cb,
				const cpumask_t *cpus, bool microwatts)
{
	int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);

	if (_is_cpu_device(dev))
		em_check_capacity_update();

	return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
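
/*
 * A minimal registration sketch for a CPUFreq driver; my_active_power() is a
 * hypothetical callback that behaves as described in em_create_perf_table(),
 * i.e. it ceils *freq to a supported OPP and fills *power in uW:
 *
 *	static int my_active_power(struct device *dev, unsigned long *power,
 *				   unsigned long *freq)
 *	{
 *		... ceil *freq, set *power in uW ...
 *		return 0;
 *	}
 *
 *	static const struct em_data_callback em_cb = {
 *		.active_power = my_active_power,
 *	};
 *
 *	ret = em_dev_register_perf_domain(cpu_dev, nr_opps, &em_cb,
 *					  policy->cpus, true);
 */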

/**
 * em_dev_register_pd_no_update() - Register a perf domain for a device
 * @dev : Device to register the PD for
 * @nr_states : Number of performance states in the new PD
 * @cb : Callback functions for populating the energy model
 * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
 * @microwatts : Whether or not the power values in the EM will be in uW
 *
 * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
 * update after registering the PD, even if @dev is a CPU device.
 */
int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
				 const struct em_data_callback *cb,
				 const cpumask_t *cpus, bool microwatts)
{
	struct em_perf_table *em_table;
	unsigned long cap, prev_cap = 0;
	unsigned long flags = 0;
	int cpu, ret;

	if (!dev || !nr_states || !cb)
		return -EINVAL;

	/*
	 * Use a mutex to serialize the registration of performance domains and
	 * let the driver-defined callback functions sleep.
	 */
	mutex_lock(&em_pd_mutex);

	if (dev->em_pd) {
		ret = -EEXIST;
		goto unlock;
	}

	if (_is_cpu_device(dev)) {
		if (!cpus) {
			dev_err(dev, "EM: invalid CPU mask\n");
			ret = -EINVAL;
			goto unlock;
		}

		for_each_cpu(cpu, cpus) {
			if (em_cpu_get(cpu)) {
				dev_err(dev, "EM: exists for CPU%d\n", cpu);
				ret = -EEXIST;
				goto unlock;
			}
			/*
			 * All CPUs of a domain must have the same
			 * micro-architecture since they all share the same
			 * table.
			 */
			cap = arch_scale_cpu_capacity(cpu);
			if (prev_cap && prev_cap != cap) {
				dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
					cpumask_pr_args(cpus));

				ret = -EINVAL;
				goto unlock;
			}
			prev_cap = cap;
		}
	}

	if (microwatts)
		flags |= EM_PERF_DOMAIN_MICROWATTS;
	else if (cb->get_cost)
		flags |= EM_PERF_DOMAIN_ARTIFICIAL;

	/*
	 * The EM only supports power values in uW (the exception is an
	 * artificial EM). Therefore, check for and force drivers to provide
	 * power in uW.
	 */
	if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
		dev_err(dev, "EM: only supports uW power values\n");
		ret = -EINVAL;
		goto unlock;
	}

	ret = em_create_pd(dev, nr_states, cb, cpus, flags);
	if (ret)
		goto unlock;

	dev->em_pd->flags |= flags;
	dev->em_pd->min_perf_state = 0;
	dev->em_pd->max_perf_state = nr_states - 1;

	em_table = rcu_dereference_protected(dev->em_pd->em_table,
					     lockdep_is_held(&em_pd_mutex));
	em_cpufreq_update_efficiencies(dev, em_table->state);

	em_debug_create_pd(dev);
	dev_info(dev, "EM: created perf domain\n");

unlock:
	mutex_unlock(&em_pd_mutex);
	if (ret)
		return ret;

	mutex_lock(&em_pd_list_mutex);
	list_add_tail(&dev->em_pd->node, &em_pd_list);
	mutex_unlock(&em_pd_list_mutex);

	em_notify_pd_created(dev->em_pd);

	return 0;
}
EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);

/**
 * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
 * @dev : Device for which the EM is registered
 *
 * Unregister the EM for the specified @dev (but not a CPU device).
 */
void em_dev_unregister_perf_domain(struct device *dev)
{
	if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
		return;

	if (_is_cpu_device(dev))
		return;

	mutex_lock(&em_pd_list_mutex);
	list_del_init(&dev->em_pd->node);
	mutex_unlock(&em_pd_list_mutex);

	em_notify_pd_deleted(dev->em_pd);

	/*
	 * The mutex separates all register/unregister requests and protects
	 * from potential clean-up/setup issues in the debugfs directories.
	 * The debugfs directory name is the same as device's name.
	 */
	mutex_lock(&em_pd_mutex);
	em_debug_remove_pd(dev);

	em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
						lockdep_is_held(&em_pd_mutex)));

	ida_free(&em_pd_ida, dev->em_pd->id);

	kfree(dev->em_pd);
	dev->em_pd = NULL;
	mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);

static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
{
	struct em_perf_table *em_table;
	struct em_perf_state *ps, *new_ps;
	int ps_size;

	em_table = em_table_alloc(pd);
	if (!em_table)
		return NULL;

	new_ps = em_table->state;

	rcu_read_lock();
	ps = em_perf_state_from_pd(pd);
	/* Initialize data based on old table */
	ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
	memcpy(new_ps, ps, ps_size);

	rcu_read_unlock();

	return em_table;
}

static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
				struct em_perf_table *em_table)
{
	int ret;

	if (!em_is_artificial(pd)) {
		ret = em_compute_costs(dev, em_table->state, NULL,
				       pd->nr_perf_states, pd->flags);
		if (ret)
			goto free_em_table;
	}

	ret = em_dev_update_perf_domain(dev, em_table);
	if (ret)
		goto free_em_table;

	/*
	 * This is a one-time update, so give up the ownership in this updater.
	 * The EM framework has incremented the usage counter and from now on
	 * will keep the reference (and free the memory when needed).
	 */
free_em_table:
	em_table_free(em_table);
	return ret;
}


/*
 * Adjustment of CPU performance values after boot, when all CPU capacities
 * are correctly calculated.
 */
static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
				   struct em_perf_domain *pd)
{
	unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
	struct em_perf_table *em_table;
	struct em_perf_state *table;
	unsigned long em_max_perf;

	rcu_read_lock();
	table = em_perf_state_from_pd(pd);
	em_max_perf = table[pd->nr_perf_states - 1].performance;
	rcu_read_unlock();

	if (em_max_perf == cpu_capacity)
		return;

	pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
		 cpu_capacity, em_max_perf);

	em_table = em_table_dup(pd);
	if (!em_table) {
		dev_warn(dev, "EM: allocation failed\n");
		return;
	}

	em_init_performance(dev, pd, em_table->state, pd->nr_perf_states);

	em_recalc_and_update(dev, pd, em_table);
}

/**
 * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
 * @cpu: Target CPU.
 *
 * Adjust the existing EM for @cpu after a capacity update under the assumption
 * that the capacity has been updated in the same way for all of the CPUs in
 * the same perf domain.
 */
void em_adjust_cpu_capacity(unsigned int cpu)
{
	struct device *dev = get_cpu_device(cpu);
	struct em_perf_domain *pd;

	pd = em_pd_get(dev);
	if (pd)
		em_adjust_new_capacity(cpu, dev, pd);
}
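
/*
 * Illustrative note: architecture or topology code that has just rescaled a
 * CPU's capacity can simply call:
 *
 *	em_adjust_cpu_capacity(cpu);
 *
 * which re-derives the performance and cost columns of the whole perf domain
 * that the CPU belongs to. The call is cheap when nothing changed, since
 * em_adjust_new_capacity() bails out early if the highest performance value
 * already matches the CPU capacity.
 */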

static void em_check_capacity_update(void)
{
	cpumask_var_t cpu_done_mask;
	int cpu, failed_cpus = 0;

	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
		pr_warn("no free memory\n");
		return;
	}

	/* Check if the CPU capacity has changed, then update the EM */
	for_each_possible_cpu(cpu) {
		struct cpufreq_policy *policy;
		struct em_perf_domain *pd;
		struct device *dev;

		if (cpumask_test_cpu(cpu, cpu_done_mask))
			continue;

		policy = cpufreq_cpu_get(cpu);
		if (!policy) {
			failed_cpus++;
			continue;
		}
		cpufreq_cpu_put(policy);

		dev = get_cpu_device(cpu);
		pd = em_pd_get(dev);
		if (!pd || em_is_artificial(pd))
			continue;

		cpumask_or(cpu_done_mask, cpu_done_mask,
			   em_span_cpus(pd));

		em_adjust_new_capacity(cpu, dev, pd);
	}

	if (failed_cpus)
		schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000));

	free_cpumask_var(cpu_done_mask);
}

static void em_update_workfn(struct work_struct *work)
{
	em_check_capacity_update();
}

/**
 * em_dev_update_chip_binning() - Update the Energy Model after new voltage
 *				information is present in the OPPs.
 * @dev : Device for which the Energy Model has to be updated.
 *
 * This function makes it easy to update the EM with new values available in
 * the OPP framework and DT. It can be used after the chip has been properly
 * verified by device drivers and the voltages adjusted for the 'chip binning'.
 */
int em_dev_update_chip_binning(struct device *dev)
{
	struct em_perf_table *em_table;
	struct em_perf_domain *pd;
	int i, ret;

	if (IS_ERR_OR_NULL(dev))
		return -EINVAL;

	pd = em_pd_get(dev);
	if (!pd) {
		dev_warn(dev, "Couldn't find Energy Model\n");
		return -EINVAL;
	}

	em_table = em_table_dup(pd);
	if (!em_table) {
		dev_warn(dev, "EM: allocation failed\n");
		return -ENOMEM;
	}

	/* Update power values which might change due to new voltage in OPPs */
	for (i = 0; i < pd->nr_perf_states; i++) {
		unsigned long freq = em_table->state[i].frequency;
		unsigned long power;

		ret = dev_pm_opp_calc_power(dev, &power, &freq);
		if (ret) {
			em_table_free(em_table);
			return ret;
		}

		em_table->state[i].power = power;
	}

	return em_recalc_and_update(dev, pd, em_table);
}
EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);

/**
 * em_update_performance_limits() - Update the Energy Model with performance
 *				limits information.
 * @pd : Performance Domain with the EM that has to be updated.
 * @freq_min_khz : New minimum allowed frequency for this device.
 * @freq_max_khz : New maximum allowed frequency for this device.
 *
 * This function allows updating the EM with information about available
 * performance levels. It takes the minimum and maximum frequency in kHz
 * and does the internal translation to performance levels.
 * Returns 0 on success or -EINVAL on failure.
 */
int em_update_performance_limits(struct em_perf_domain *pd,
		unsigned long freq_min_khz, unsigned long freq_max_khz)
{
	struct em_perf_state *table;
	int min_ps = -1;
	int max_ps = -1;
	int i;

	if (!pd)
		return -EINVAL;

	rcu_read_lock();
	table = em_perf_state_from_pd(pd);

	for (i = 0; i < pd->nr_perf_states; i++) {
		if (freq_min_khz == table[i].frequency)
			min_ps = i;
		if (freq_max_khz == table[i].frequency)
			max_ps = i;
	}
	rcu_read_unlock();

	/* Only update when both are found and sane */
	if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
		return -EINVAL;

	/* Guard simultaneous updates and make them atomic */
	mutex_lock(&em_pd_mutex);
	pd->min_perf_state = min_ps;
	pd->max_perf_state = max_ps;
	mutex_unlock(&em_pd_mutex);

	return 0;
}
EXPORT_SYMBOL_GPL(em_update_performance_limits);
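
/*
 * A brief illustrative example: a platform driver that wants EAS to consider
 * only a sub-range of the OPPs could pass two frequencies (in kHz) that match
 * existing em_perf_state::frequency entries, e.g.:
 *
 *	ret = em_update_performance_limits(pd, 500000, 1800000);
 *
 * If either frequency is not found in the table, or the range is inverted,
 * -EINVAL is returned and the previous limits are kept.
 */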

static void rebuild_sd_workfn(struct work_struct *work)
{
	rebuild_sched_domains_energy();
}

void em_rebuild_sched_domains(void)
{
	static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

	/*
	 * When called from the cpufreq_register_driver() path, the
	 * cpu_hotplug_lock is already held, so use a work item to
	 * avoid nested locking in rebuild_sched_domains().
	 */
	schedule_work(&rebuild_sd_work);
}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
int for_each_em_perf_domain(int (*cb)(struct em_perf_domain *, void *),
			    void *data)
{
	struct em_perf_domain *pd;

	lockdep_assert_not_held(&em_pd_mutex);
	guard(mutex)(&em_pd_list_mutex);

	list_for_each_entry(pd, &em_pd_list, node) {
		int ret;

		ret = cb(pd, data);
		if (ret)
			return ret;
	}

	return 0;
}

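/*
 * A minimal sketch of a caller (count_pd() is hypothetical): the callback runs
 * with em_pd_list_mutex held, so it must not take em_pd_mutex, as noted above:
 *
 *	static int count_pd(struct em_perf_domain *pd, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int nr_pds = 0;
 *
 *	for_each_em_perf_domain(count_pd, &nr_pds);
 */
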
struct em_perf_domain *em_perf_domain_get_by_id(int id)
{
	struct em_perf_domain *pd;

	lockdep_assert_not_held(&em_pd_mutex);
	guard(mutex)(&em_pd_list_mutex);

	list_for_each_entry(pd, &em_pd_list, node) {
		if (pd->id == id)
			return pd;
	}

	return NULL;
}
#endif