xref: /linux/kernel/power/energy_model.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Energy Model of devices
4  *
5  * Copyright (c) 2018-2021, Arm ltd.
6  * Written by: Quentin Perret, Arm ltd.
7  * Improvements provided by: Lukasz Luba, Arm ltd.
8  */
9 
10 #define pr_fmt(fmt) "energy_model: " fmt
11 
12 #include <linux/cpu.h>
13 #include <linux/cpufreq.h>
14 #include <linux/cpumask.h>
15 #include <linux/debugfs.h>
16 #include <linux/energy_model.h>
17 #include <linux/sched/topology.h>
18 #include <linux/slab.h>
19 
20 #include "em_netlink.h"
21 
/*
 * Mutex serializing the registrations of performance domains and letting
 * callbacks defined by drivers sleep. It is also taken when a performance
 * domain is unregistered, when its EM table is replaced and when its
 * performance limits are updated.
 */
static DEFINE_MUTEX(em_pd_mutex);
27 
/*
 * Manage performance domains with IDs. One can iterate the performance domains
 * through the list and pick one with their associated ID. The mutex serializes
 * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be
 * taken to avoid potential deadlock.
 *
 * em_pd_ida hands out the IDs stored in em_perf_domain::id, and em_pd_list
 * links the registered domains through em_perf_domain::node.
 */
static DEFINE_IDA(em_pd_ida);
static LIST_HEAD(em_pd_list);
static DEFINE_MUTEX(em_pd_list_mutex);
37 
/*
 * Forward declarations of helpers defined further down in this file, and the
 * delayed work item whose handler is em_update_workfn().
 */
static void em_cpufreq_update_efficiencies(struct device *dev,
					   struct em_perf_state *table);
static void em_check_capacity_update(void);
static void em_update_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
43 
44 static bool _is_cpu_device(struct device *dev)
45 {
46 	return (dev->bus == &cpu_subsys);
47 }
48 
49 #ifdef CONFIG_DEBUG_FS
/* Top-level debugfs directory, set up by em_debug_init() */
static struct dentry *rootdir;

/*
 * Private data attached to the debugfs files of one performance state:
 * the performance domain it belongs to and its index in the EM table.
 */
struct em_dbg_info {
	struct em_perf_domain *pd;
	int ps_id;
};
56 
57 #define DEFINE_EM_DBG_SHOW(name, fname)					\
58 static int em_debug_##fname##_show(struct seq_file *s, void *unused)	\
59 {									\
60 	struct em_dbg_info *em_dbg = s->private;			\
61 	struct em_perf_state *table;					\
62 	unsigned long val;						\
63 									\
64 	rcu_read_lock();						\
65 	table = em_perf_state_from_pd(em_dbg->pd);			\
66 	val = table[em_dbg->ps_id].name;				\
67 	rcu_read_unlock();						\
68 									\
69 	seq_printf(s, "%lu\n", val);					\
70 	return 0;							\
71 }									\
72 DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)
73 
74 DEFINE_EM_DBG_SHOW(frequency, frequency);
75 DEFINE_EM_DBG_SHOW(power, power);
76 DEFINE_EM_DBG_SHOW(cost, cost);
77 DEFINE_EM_DBG_SHOW(performance, performance);
78 DEFINE_EM_DBG_SHOW(flags, inefficiency);
79 
80 static void em_debug_create_ps(struct em_perf_domain *em_pd,
81 			       struct em_dbg_info *em_dbg, int i,
82 			       struct dentry *pd)
83 {
84 	struct em_perf_state *table;
85 	unsigned long freq;
86 	struct dentry *d;
87 	char name[24];
88 
89 	em_dbg[i].pd = em_pd;
90 	em_dbg[i].ps_id = i;
91 
92 	rcu_read_lock();
93 	table = em_perf_state_from_pd(em_pd);
94 	freq = table[i].frequency;
95 	rcu_read_unlock();
96 
97 	snprintf(name, sizeof(name), "ps:%lu", freq);
98 
99 	/* Create per-ps directory */
100 	d = debugfs_create_dir(name, pd);
101 	debugfs_create_file("frequency", 0444, d, &em_dbg[i],
102 			    &em_debug_frequency_fops);
103 	debugfs_create_file("power", 0444, d, &em_dbg[i],
104 			    &em_debug_power_fops);
105 	debugfs_create_file("cost", 0444, d, &em_dbg[i],
106 			    &em_debug_cost_fops);
107 	debugfs_create_file("performance", 0444, d, &em_dbg[i],
108 			    &em_debug_performance_fops);
109 	debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
110 			    &em_debug_inefficiency_fops);
111 }
112 
113 static int em_debug_cpus_show(struct seq_file *s, void *unused)
114 {
115 	seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
116 
117 	return 0;
118 }
119 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
120 
121 static int em_debug_flags_show(struct seq_file *s, void *unused)
122 {
123 	struct em_perf_domain *pd = s->private;
124 
125 	seq_printf(s, "%#lx\n", pd->flags);
126 
127 	return 0;
128 }
129 DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
130 
131 static int em_debug_id_show(struct seq_file *s, void *unused)
132 {
133 	struct em_perf_domain *pd = s->private;
134 
135 	seq_printf(s, "%d\n", pd->id);
136 
137 	return 0;
138 }
139 DEFINE_SHOW_ATTRIBUTE(em_debug_id);
140 
141 static void em_debug_create_pd(struct device *dev)
142 {
143 	struct em_dbg_info *em_dbg;
144 	struct dentry *d;
145 	int i;
146 
147 	/* Create the directory of the performance domain */
148 	d = debugfs_create_dir(dev_name(dev), rootdir);
149 
150 	if (_is_cpu_device(dev))
151 		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
152 				    &em_debug_cpus_fops);
153 
154 	debugfs_create_file("flags", 0444, d, dev->em_pd,
155 			    &em_debug_flags_fops);
156 
157 	debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops);
158 
159 	em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
160 			      sizeof(*em_dbg), GFP_KERNEL);
161 	if (!em_dbg)
162 		return;
163 
164 	/* Create a sub-directory for each performance state */
165 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
166 		em_debug_create_ps(dev->em_pd, em_dbg, i, d);
167 
168 }
169 
170 static void em_debug_remove_pd(struct device *dev)
171 {
172 	debugfs_lookup_and_remove(dev_name(dev), rootdir);
173 }
174 
175 static int __init em_debug_init(void)
176 {
177 	/* Create /sys/kernel/debug/energy_model directory */
178 	rootdir = debugfs_create_dir("energy_model", NULL);
179 
180 	return 0;
181 }
182 fs_initcall(em_debug_init);
183 #else /* CONFIG_DEBUG_FS */
/* No-op variants used when debugfs support is not built in */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
186 #endif
187 
188 static void em_release_table_kref(struct kref *kref)
189 {
190 	/* It was the last owner of this table so we can free */
191 	kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
192 }
193 
194 /**
195  * em_table_free() - Handles safe free of the EM table when needed
196  * @table : EM table which is going to be freed
197  *
198  * No return values.
199  */
200 void em_table_free(struct em_perf_table *table)
201 {
202 	kref_put(&table->kref, em_release_table_kref);
203 }
204 
205 /**
206  * em_table_alloc() - Allocate a new EM table
207  * @pd		: EM performance domain for which this must be done
208  *
209  * Allocate a new EM table and initialize its kref to indicate that it
210  * has a user.
211  * Returns allocated table or NULL.
212  */
213 struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
214 {
215 	struct em_perf_table *table;
216 	int table_size;
217 
218 	table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
219 
220 	table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
221 	if (!table)
222 		return NULL;
223 
224 	kref_init(&table->kref);
225 
226 	return table;
227 }
228 
229 static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
230 				struct em_perf_state *table, int nr_states)
231 {
232 	u64 fmax, max_cap;
233 	int i, cpu;
234 
235 	/* This is needed only for CPUs and EAS skip other devices */
236 	if (!_is_cpu_device(dev))
237 		return;
238 
239 	cpu = cpumask_first(em_span_cpus(pd));
240 
241 	/*
242 	 * Calculate the performance value for each frequency with
243 	 * linear relationship. The final CPU capacity might not be ready at
244 	 * boot time, but the EM will be updated a bit later with correct one.
245 	 */
246 	fmax = (u64) table[nr_states - 1].frequency;
247 	max_cap = (u64) arch_scale_cpu_capacity(cpu);
248 	for (i = 0; i < nr_states; i++)
249 		table[i].performance = div64_u64(max_cap * table[i].frequency,
250 						 fmax);
251 }
252 
253 static int em_compute_costs(struct device *dev, struct em_perf_state *table,
254 			    const struct em_data_callback *cb, int nr_states,
255 			    unsigned long flags)
256 {
257 	unsigned long prev_cost = ULONG_MAX;
258 	int i, ret;
259 
260 	/* This is needed only for CPUs and EAS skip other devices */
261 	if (!_is_cpu_device(dev))
262 		return 0;
263 
264 	/* Compute the cost of each performance state. */
265 	for (i = nr_states - 1; i >= 0; i--) {
266 		unsigned long power_res, cost;
267 
268 		if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
269 			ret = cb->get_cost(dev, table[i].frequency, &cost);
270 			if (ret || !cost || cost > EM_MAX_POWER) {
271 				dev_err(dev, "EM: invalid cost %lu %d\n",
272 					cost, ret);
273 				return -EINVAL;
274 			}
275 		} else {
276 			/* increase resolution of 'cost' precision */
277 			power_res = table[i].power * 10;
278 			cost = power_res / table[i].performance;
279 		}
280 
281 		table[i].cost = cost;
282 
283 		if (table[i].cost >= prev_cost) {
284 			table[i].flags = EM_PERF_STATE_INEFFICIENT;
285 			dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
286 				table[i].frequency);
287 		} else {
288 			prev_cost = table[i].cost;
289 		}
290 	}
291 
292 	return 0;
293 }
294 
295 /**
296  * em_dev_compute_costs() - Calculate cost values for new runtime EM table
297  * @dev		: Device for which the EM table is to be updated
298  * @table	: The new EM table that is going to get the costs calculated
299  * @nr_states	: Number of performance states
300  *
301  * Calculate the em_perf_state::cost values for new runtime EM table. The
302  * values are used for EAS during task placement. It also calculates and sets
303  * the efficiency flag for each performance state. When the function finish
304  * successfully the EM table is ready to be updated and used by EAS.
305  *
306  * Return 0 on success or a proper error in case of failure.
307  */
308 int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
309 			 int nr_states)
310 {
311 	return em_compute_costs(dev, table, NULL, nr_states, 0);
312 }
313 
314 /**
315  * em_dev_update_perf_domain() - Update runtime EM table for a device
316  * @dev		: Device for which the EM is to be updated
317  * @new_table	: The new EM table that is going to be used from now
318  *
319  * Update EM runtime modifiable table for the @dev using the provided @table.
320  *
321  * This function uses a mutex to serialize writers, so it must not be called
322  * from a non-sleeping context.
323  *
324  * Return 0 on success or an error code on failure.
325  */
326 int em_dev_update_perf_domain(struct device *dev,
327 			      struct em_perf_table *new_table)
328 {
329 	struct em_perf_table *old_table;
330 	struct em_perf_domain *pd;
331 
332 	if (!dev)
333 		return -EINVAL;
334 
335 	/* Serialize update/unregister or concurrent updates */
336 	mutex_lock(&em_pd_mutex);
337 
338 	if (!dev->em_pd) {
339 		mutex_unlock(&em_pd_mutex);
340 		return -EINVAL;
341 	}
342 	pd = dev->em_pd;
343 
344 	kref_get(&new_table->kref);
345 
346 	old_table = rcu_dereference_protected(pd->em_table,
347 					      lockdep_is_held(&em_pd_mutex));
348 	rcu_assign_pointer(pd->em_table, new_table);
349 
350 	em_cpufreq_update_efficiencies(dev, new_table->state);
351 
352 	em_table_free(old_table);
353 
354 	mutex_unlock(&em_pd_mutex);
355 
356 	em_notify_pd_updated(pd);
357 	return 0;
358 }
359 EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
360 
361 static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
362 				struct em_perf_state *table,
363 				const struct em_data_callback *cb,
364 				unsigned long flags)
365 {
366 	unsigned long power, freq, prev_freq = 0;
367 	int nr_states = pd->nr_perf_states;
368 	int i, ret;
369 
370 	/* Build the list of performance states for this performance domain */
371 	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
372 		/*
373 		 * active_power() is a driver callback which ceils 'freq' to
374 		 * lowest performance state of 'dev' above 'freq' and updates
375 		 * 'power' and 'freq' accordingly.
376 		 */
377 		ret = cb->active_power(dev, &power, &freq);
378 		if (ret) {
379 			dev_err(dev, "EM: invalid perf. state: %d\n",
380 				ret);
381 			return -EINVAL;
382 		}
383 
384 		/*
385 		 * We expect the driver callback to increase the frequency for
386 		 * higher performance states.
387 		 */
388 		if (freq <= prev_freq) {
389 			dev_err(dev, "EM: non-increasing freq: %lu\n",
390 				freq);
391 			return -EINVAL;
392 		}
393 
394 		/*
395 		 * The power returned by active_state() is expected to be
396 		 * positive and be in range.
397 		 */
398 		if (!power || power > EM_MAX_POWER) {
399 			dev_err(dev, "EM: invalid power: %lu\n",
400 				power);
401 			return -EINVAL;
402 		}
403 
404 		table[i].power = power;
405 		table[i].frequency = prev_freq = freq;
406 	}
407 
408 	em_init_performance(dev, pd, table, nr_states);
409 
410 	ret = em_compute_costs(dev, table, cb, nr_states, flags);
411 	if (ret)
412 		return -EINVAL;
413 
414 	return 0;
415 }
416 
417 static int em_create_pd(struct device *dev, int nr_states,
418 			const struct em_data_callback *cb,
419 			const cpumask_t *cpus,
420 			unsigned long flags)
421 {
422 	struct em_perf_table *em_table;
423 	struct em_perf_domain *pd;
424 	struct device *cpu_dev;
425 	int cpu, ret, num_cpus, id;
426 
427 	if (_is_cpu_device(dev)) {
428 		num_cpus = cpumask_weight(cpus);
429 
430 		/* Prevent max possible energy calculation to not overflow */
431 		if (num_cpus > EM_MAX_NUM_CPUS) {
432 			dev_err(dev, "EM: too many CPUs, overflow possible\n");
433 			return -EINVAL;
434 		}
435 
436 		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
437 		if (!pd)
438 			return -ENOMEM;
439 
440 		cpumask_copy(em_span_cpus(pd), cpus);
441 	} else {
442 		pd = kzalloc(sizeof(*pd), GFP_KERNEL);
443 		if (!pd)
444 			return -ENOMEM;
445 	}
446 
447 	pd->nr_perf_states = nr_states;
448 
449 	INIT_LIST_HEAD(&pd->node);
450 
451 	id = ida_alloc(&em_pd_ida, GFP_KERNEL);
452 	if (id < 0)
453 		return -ENOMEM;
454 	pd->id = id;
455 
456 	em_table = em_table_alloc(pd);
457 	if (!em_table)
458 		goto free_pd;
459 
460 	ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
461 	if (ret)
462 		goto free_pd_table;
463 
464 	rcu_assign_pointer(pd->em_table, em_table);
465 
466 	if (_is_cpu_device(dev))
467 		for_each_cpu(cpu, cpus) {
468 			cpu_dev = get_cpu_device(cpu);
469 			cpu_dev->em_pd = pd;
470 		}
471 
472 	dev->em_pd = pd;
473 
474 	return 0;
475 
476 free_pd_table:
477 	kfree(em_table);
478 free_pd:
479 	kfree(pd);
480 	ida_free(&em_pd_ida, id);
481 	return -EINVAL;
482 }
483 
484 static void
485 em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
486 {
487 	struct em_perf_domain *pd = dev->em_pd;
488 	struct cpufreq_policy *policy;
489 	int found = 0;
490 	int i, cpu;
491 
492 	if (!_is_cpu_device(dev))
493 		return;
494 
495 	/* Try to get a CPU which is active and in this PD */
496 	cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
497 	if (cpu >= nr_cpu_ids) {
498 		dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
499 		return;
500 	}
501 
502 	policy = cpufreq_cpu_get(cpu);
503 	if (!policy) {
504 		dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
505 		return;
506 	}
507 
508 	for (i = 0; i < pd->nr_perf_states; i++) {
509 		if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
510 			continue;
511 
512 		if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
513 			found++;
514 	}
515 
516 	cpufreq_cpu_put(policy);
517 
518 	if (!found)
519 		return;
520 
521 	/*
522 	 * Efficiencies have been installed in CPUFreq, inefficient frequencies
523 	 * will be skipped. The EM can do the same.
524 	 */
525 	pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
526 }
527 
528 /**
529  * em_pd_get() - Return the performance domain for a device
530  * @dev : Device to find the performance domain for
531  *
532  * Returns the performance domain to which @dev belongs, or NULL if it doesn't
533  * exist.
534  */
535 struct em_perf_domain *em_pd_get(struct device *dev)
536 {
537 	if (IS_ERR_OR_NULL(dev))
538 		return NULL;
539 
540 	return dev->em_pd;
541 }
542 EXPORT_SYMBOL_GPL(em_pd_get);
543 
544 /**
545  * em_cpu_get() - Return the performance domain for a CPU
546  * @cpu : CPU to find the performance domain for
547  *
548  * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
549  * exist.
550  */
551 struct em_perf_domain *em_cpu_get(int cpu)
552 {
553 	struct device *cpu_dev;
554 
555 	cpu_dev = get_cpu_device(cpu);
556 	if (!cpu_dev)
557 		return NULL;
558 
559 	return em_pd_get(cpu_dev);
560 }
561 EXPORT_SYMBOL_GPL(em_cpu_get);
562 
563 /**
564  * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
565  * @dev		: Device for which the EM is to register
566  * @nr_states	: Number of performance states to register
567  * @cb		: Callback functions providing the data of the Energy Model
568  * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
569  *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
570  *		type of devices this should be set to NULL.
571  * @microwatts	: Flag indicating that the power values are in micro-Watts or
572  *		in some other scale. It must be set properly.
573  *
574  * Create Energy Model tables for a performance domain using the callbacks
575  * defined in cb.
576  *
577  * The @microwatts is important to set with correct value. Some kernel
578  * sub-systems might rely on this flag and check if all devices in the EM are
579  * using the same scale.
580  *
581  * If multiple clients register the same performance domain, all but the first
582  * registration will be ignored.
583  *
584  * Return 0 on success
585  */
586 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
587 				const struct em_data_callback *cb,
588 				const cpumask_t *cpus, bool microwatts)
589 {
590 	int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);
591 
592 	if (_is_cpu_device(dev))
593 		em_check_capacity_update();
594 
595 	return ret;
596 }
597 EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
598 
/**
 * em_dev_register_pd_no_update() - Register a perf domain for a device
 * @dev : Device to register the PD for
 * @nr_states : Number of performance states in the new PD
 * @cb : Callback functions for populating the energy model
 * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
 * @microwatts : Whether or not the power values in the EM will be in uW
 *
 * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
 * update after registering the PD, even if @dev is a CPU device.
 *
 * Return: 0 on success, -EINVAL on invalid arguments, -EEXIST when @dev or
 * one of the CPUs in @cpus already has a performance domain, or the error
 * returned by em_create_pd().
 */
int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
				 const struct em_data_callback *cb,
				 const cpumask_t *cpus, bool microwatts)
{
	struct em_perf_table *em_table;
	unsigned long cap, prev_cap = 0;
	unsigned long flags = 0;
	int cpu, ret;

	if (!dev || !nr_states || !cb)
		return -EINVAL;

	/*
	 * Use a mutex to serialize the registration of performance domains and
	 * let the driver-defined callback functions sleep.
	 */
	mutex_lock(&em_pd_mutex);

	if (dev->em_pd) {
		ret = -EEXIST;
		goto unlock;
	}

	if (_is_cpu_device(dev)) {
		if (!cpus) {
			dev_err(dev, "EM: invalid CPU mask\n");
			ret = -EINVAL;
			goto unlock;
		}

		for_each_cpu(cpu, cpus) {
			if (em_cpu_get(cpu)) {
				dev_err(dev, "EM: exists for CPU%d\n", cpu);
				ret = -EEXIST;
				goto unlock;
			}
			/*
			 * All CPUs of a domain must have the same
			 * micro-architecture since they all share the same
			 * table.
			 */
			cap = arch_scale_cpu_capacity(cpu);
			if (prev_cap && prev_cap != cap) {
				dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
					cpumask_pr_args(cpus));

				ret = -EINVAL;
				goto unlock;
			}
			prev_cap = cap;
		}
	}

	/* An EM is artificial only when it is not in uW and provides get_cost() */
	if (microwatts)
		flags |= EM_PERF_DOMAIN_MICROWATTS;
	else if (cb->get_cost)
		flags |= EM_PERF_DOMAIN_ARTIFICIAL;

	/*
	 * EM only supports uW (exception is artificial EM).
	 * Therefore, check and force the drivers to provide
	 * power in uW.
	 */
	if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
		dev_err(dev, "EM: only supports uW power values\n");
		ret = -EINVAL;
		goto unlock;
	}

	ret = em_create_pd(dev, nr_states, cb, cpus, flags);
	if (ret)
		goto unlock;

	/* The whole range of performance states is allowed by default */
	dev->em_pd->flags |= flags;
	dev->em_pd->min_perf_state = 0;
	dev->em_pd->max_perf_state = nr_states - 1;

	em_table = rcu_dereference_protected(dev->em_pd->em_table,
					     lockdep_is_held(&em_pd_mutex));
	em_cpufreq_update_efficiencies(dev, em_table->state);

	em_debug_create_pd(dev);
	dev_info(dev, "EM: created perf domain\n");

unlock:
	mutex_unlock(&em_pd_mutex);
	if (ret)
		return ret;

	/*
	 * em_pd_mutex has been released above: it must not be held while
	 * taking em_pd_list_mutex.
	 */
	mutex_lock(&em_pd_list_mutex);
	list_add_tail(&dev->em_pd->node, &em_pd_list);
	mutex_unlock(&em_pd_list_mutex);

	em_notify_pd_created(dev->em_pd);

	return 0;
}
EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
708 
/**
 * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
 * @dev		: Device for which the EM is registered
 *
 * Unregister the EM for the specified @dev (but not a CPU device).
 *
 * Nothing is done when @dev is NULL, an error pointer, has no performance
 * domain or is a CPU device.
 */
void em_dev_unregister_perf_domain(struct device *dev)
{
	if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
		return;

	if (_is_cpu_device(dev))
		return;

	/* Unlink the domain from the list first, under the list mutex only */
	mutex_lock(&em_pd_list_mutex);
	list_del_init(&dev->em_pd->node);
	mutex_unlock(&em_pd_list_mutex);

	em_notify_pd_deleted(dev->em_pd);

	/*
	 * The mutex separates all register/unregister requests and protects
	 * from potential clean-up/setup issues in the debugfs directories.
	 * The debugfs directory name is the same as device's name.
	 */
	mutex_lock(&em_pd_mutex);
	em_debug_remove_pd(dev);

	/* Drop the reference the domain holds on its current table */
	em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
						lockdep_is_held(&em_pd_mutex)));

	ida_free(&em_pd_ida, dev->em_pd->id);

	kfree(dev->em_pd);
	dev->em_pd = NULL;
	mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
747 
748 static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
749 {
750 	struct em_perf_table *em_table;
751 	struct em_perf_state *ps, *new_ps;
752 	int ps_size;
753 
754 	em_table = em_table_alloc(pd);
755 	if (!em_table)
756 		return NULL;
757 
758 	new_ps = em_table->state;
759 
760 	rcu_read_lock();
761 	ps = em_perf_state_from_pd(pd);
762 	/* Initialize data based on old table */
763 	ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
764 	memcpy(new_ps, ps, ps_size);
765 
766 	rcu_read_unlock();
767 
768 	return em_table;
769 }
770 
771 static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
772 				struct em_perf_table *em_table)
773 {
774 	int ret;
775 
776 	if (!em_is_artificial(pd)) {
777 		ret = em_compute_costs(dev, em_table->state, NULL,
778 				       pd->nr_perf_states, pd->flags);
779 		if (ret)
780 			goto free_em_table;
781 	}
782 
783 	ret = em_dev_update_perf_domain(dev, em_table);
784 	if (ret)
785 		goto free_em_table;
786 
787 	/*
788 	 * This is one-time-update, so give up the ownership in this updater.
789 	 * The EM framework has incremented the usage counter and from now
790 	 * will keep the reference (then free the memory when needed).
791 	 */
792 free_em_table:
793 	em_table_free(em_table);
794 	return ret;
795 }
796 
797 /*
798  * Adjustment of CPU performance values after boot, when all CPUs capacites
799  * are correctly calculated.
800  */
801 static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
802 				   struct em_perf_domain *pd)
803 {
804 	unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
805 	struct em_perf_table *em_table;
806 	struct em_perf_state *table;
807 	unsigned long em_max_perf;
808 
809 	rcu_read_lock();
810 	table = em_perf_state_from_pd(pd);
811 	em_max_perf = table[pd->nr_perf_states - 1].performance;
812 	rcu_read_unlock();
813 
814 	if (em_max_perf == cpu_capacity)
815 		return;
816 
817 	pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
818 		 cpu_capacity, em_max_perf);
819 
820 	em_table = em_table_dup(pd);
821 	if (!em_table) {
822 		dev_warn(dev, "EM: allocation failed\n");
823 		return;
824 	}
825 
826 	em_init_performance(dev, pd, em_table->state, pd->nr_perf_states);
827 
828 	em_recalc_and_update(dev, pd, em_table);
829 }
830 
/**
 * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
 * @cpu: Target CPU.
 *
 * Adjust the existing EM for @cpu after a capacity update under the assumption
 * that the capacity has been updated in the same way for all of the CPUs in
 * the same perf domain. Nothing is done when @cpu has no performance domain.
 */
void em_adjust_cpu_capacity(unsigned int cpu)
{
	struct device *dev = get_cpu_device(cpu);
	struct em_perf_domain *pd = em_pd_get(dev);

	if (!pd)
		return;

	em_adjust_new_capacity(cpu, dev, pd);
}
848 
849 static void em_check_capacity_update(void)
850 {
851 	cpumask_var_t cpu_done_mask;
852 	int cpu, failed_cpus = 0;
853 
854 	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
855 		pr_warn("no free memory\n");
856 		return;
857 	}
858 
859 	/* Check if CPUs capacity has changed than update EM */
860 	for_each_possible_cpu(cpu) {
861 		struct cpufreq_policy *policy;
862 		struct em_perf_domain *pd;
863 		struct device *dev;
864 
865 		if (cpumask_test_cpu(cpu, cpu_done_mask))
866 			continue;
867 
868 		policy = cpufreq_cpu_get(cpu);
869 		if (!policy) {
870 			failed_cpus++;
871 			continue;
872 		}
873 		cpufreq_cpu_put(policy);
874 
875 		dev = get_cpu_device(cpu);
876 		pd = em_pd_get(dev);
877 		if (!pd || em_is_artificial(pd))
878 			continue;
879 
880 		cpumask_or(cpu_done_mask, cpu_done_mask,
881 			   em_span_cpus(pd));
882 
883 		em_adjust_new_capacity(cpu, dev, pd);
884 	}
885 
886 	if (failed_cpus)
887 		schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000));
888 
889 	free_cpumask_var(cpu_done_mask);
890 }
891 
/* Handler of em_update_work: run the CPU capacity check again. */
static void em_update_workfn(struct work_struct *work)
{
	em_check_capacity_update();
}
896 
897 /**
898  * em_dev_update_chip_binning() - Update Energy Model after the new voltage
899  *				information is present in the OPPs.
900  * @dev		: Device for which the Energy Model has to be updated.
901  *
902  * This function allows to update easily the EM with new values available in
903  * the OPP framework and DT. It can be used after the chip has been properly
904  * verified by device drivers and the voltages adjusted for the 'chip binning'.
905  */
906 int em_dev_update_chip_binning(struct device *dev)
907 {
908 	struct em_perf_table *em_table;
909 	struct em_perf_domain *pd;
910 	int i, ret;
911 
912 	if (IS_ERR_OR_NULL(dev))
913 		return -EINVAL;
914 
915 	pd = em_pd_get(dev);
916 	if (!pd) {
917 		dev_warn(dev, "Couldn't find Energy Model\n");
918 		return -EINVAL;
919 	}
920 
921 	em_table = em_table_dup(pd);
922 	if (!em_table) {
923 		dev_warn(dev, "EM: allocation failed\n");
924 		return -ENOMEM;
925 	}
926 
927 	/* Update power values which might change due to new voltage in OPPs */
928 	for (i = 0; i < pd->nr_perf_states; i++) {
929 		unsigned long freq = em_table->state[i].frequency;
930 		unsigned long power;
931 
932 		ret = dev_pm_opp_calc_power(dev, &power, &freq);
933 		if (ret) {
934 			em_table_free(em_table);
935 			return ret;
936 		}
937 
938 		em_table->state[i].power = power;
939 	}
940 
941 	return em_recalc_and_update(dev, pd, em_table);
942 }
943 EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
944 
945 
946 /**
947  * em_update_performance_limits() - Update Energy Model with performance
948  *				limits information.
949  * @pd			: Performance Domain with EM that has to be updated.
950  * @freq_min_khz	: New minimum allowed frequency for this device.
951  * @freq_max_khz	: New maximum allowed frequency for this device.
952  *
953  * This function allows to update the EM with information about available
954  * performance levels. It takes the minimum and maximum frequency in kHz
955  * and does internal translation to performance levels.
956  * Returns 0 on success or -EINVAL when failed.
957  */
958 int em_update_performance_limits(struct em_perf_domain *pd,
959 		unsigned long freq_min_khz, unsigned long freq_max_khz)
960 {
961 	struct em_perf_state *table;
962 	int min_ps = -1;
963 	int max_ps = -1;
964 	int i;
965 
966 	if (!pd)
967 		return -EINVAL;
968 
969 	rcu_read_lock();
970 	table = em_perf_state_from_pd(pd);
971 
972 	for (i = 0; i < pd->nr_perf_states; i++) {
973 		if (freq_min_khz == table[i].frequency)
974 			min_ps = i;
975 		if (freq_max_khz == table[i].frequency)
976 			max_ps = i;
977 	}
978 	rcu_read_unlock();
979 
980 	/* Only update when both are found and sane */
981 	if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
982 		return -EINVAL;
983 
984 
985 	/* Guard simultaneous updates and make them atomic */
986 	mutex_lock(&em_pd_mutex);
987 	pd->min_perf_state = min_ps;
988 	pd->max_perf_state = max_ps;
989 	mutex_unlock(&em_pd_mutex);
990 
991 	return 0;
992 }
993 EXPORT_SYMBOL_GPL(em_update_performance_limits);
994 
/* Work handler calling rebuild_sched_domains_energy(). */
static void rebuild_sd_workfn(struct work_struct *work)
{
	rebuild_sched_domains_energy();
}
999 
/*
 * Request a rebuild of the sched domains. The rebuild itself is done by
 * rebuild_sd_workfn(), which is run from a work item.
 */
void em_rebuild_sched_domains(void)
{
	static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

	/*
	 * When called from the cpufreq_register_driver() path, the
	 * cpu_hotplug_lock is already held, so use a work item to
	 * avoid nested locking in rebuild_sched_domains().
	 */
	schedule_work(&rebuild_sd_work);
}
1011 
1012 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
1013 int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
1014 			    void *data)
1015 {
1016 	struct em_perf_domain *pd;
1017 
1018 	lockdep_assert_not_held(&em_pd_mutex);
1019 	guard(mutex)(&em_pd_list_mutex);
1020 
1021 	list_for_each_entry(pd, &em_pd_list, node) {
1022 		int ret;
1023 
1024 		ret = cb(pd, data);
1025 		if (ret)
1026 			return ret;
1027 	}
1028 
1029 	return 0;
1030 }
1031 
1032 struct em_perf_domain *em_perf_domain_get_by_id(int id)
1033 {
1034 	struct em_perf_domain *pd;
1035 
1036 	lockdep_assert_not_held(&em_pd_mutex);
1037 	guard(mutex)(&em_pd_list_mutex);
1038 
1039 	list_for_each_entry(pd, &em_pd_list, node) {
1040 		if (pd->id == id)
1041 			return pd;
1042 	}
1043 
1044 	return NULL;
1045 }
1046 #endif
1047