// SPDX-License-Identifier: GPL-2.0
/*
 * Energy Model of devices
 *
 * Copyright (c) 2018-2021, Arm ltd.
 * Written by: Quentin Perret, Arm ltd.
 * Improvements provided by: Lukasz Luba, Arm ltd.
 */

#define pr_fmt(fmt) "energy_model: " fmt

#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>

#include "em_netlink.h"

/*
 * Mutex serializing the registrations of performance domains and letting
 * callbacks defined by drivers sleep.
 */
static DEFINE_MUTEX(em_pd_mutex);

/*
 * Manage performance domains with IDs. One can iterate over the performance
 * domains through the list and pick one via its associated ID. The mutex
 * serializes the list access. When holding em_pd_list_mutex, em_pd_mutex
 * must not be taken, to avoid a potential deadlock.
 */
static DEFINE_IDA(em_pd_ida);
static LIST_HEAD(em_pd_list);
static DEFINE_MUTEX(em_pd_list_mutex);

static void em_cpufreq_update_efficiencies(struct device *dev,
					   struct em_perf_state *table);
static void em_check_capacity_update(void);
static void em_update_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);

static bool _is_cpu_device(struct device *dev)
{
	return (dev->bus == &cpu_subsys);
}

#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;

struct em_dbg_info {
	struct em_perf_domain *pd;
	int ps_id;
};

#define DEFINE_EM_DBG_SHOW(name, fname)					\
static int em_debug_##fname##_show(struct seq_file *s, void *unused)	\
{									\
	struct em_dbg_info *em_dbg = s->private;			\
	struct em_perf_state *table;					\
	unsigned long val;						\
									\
	rcu_read_lock();						\
	table = em_perf_state_from_pd(em_dbg->pd);			\
	val = table[em_dbg->ps_id].name;				\
	rcu_read_unlock();						\
									\
	seq_printf(s, "%lu\n", val);					\
	return 0;							\
}									\
DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)

DEFINE_EM_DBG_SHOW(frequency, frequency);
DEFINE_EM_DBG_SHOW(power, power);
DEFINE_EM_DBG_SHOW(cost, cost);
DEFINE_EM_DBG_SHOW(performance, performance);
DEFINE_EM_DBG_SHOW(flags, inefficiency);

static void em_debug_create_ps(struct em_perf_domain *em_pd,
			       struct em_dbg_info *em_dbg, int i,
			       struct dentry *pd)
{
	struct em_perf_state *table;
	unsigned long freq;
	struct dentry *d;
	char name[24];

	em_dbg[i].pd = em_pd;
	em_dbg[i].ps_id = i;

	rcu_read_lock();
	table = em_perf_state_from_pd(em_pd);
	freq = table[i].frequency;
	rcu_read_unlock();

	snprintf(name, sizeof(name), "ps:%lu", freq);

	/* Create per-ps directory */
	d = debugfs_create_dir(name, pd);
	debugfs_create_file("frequency", 0444, d, &em_dbg[i],
			    &em_debug_frequency_fops);
	debugfs_create_file("power", 0444, d, &em_dbg[i],
			    &em_debug_power_fops);
	debugfs_create_file("cost", 0444, d, &em_dbg[i],
			    &em_debug_cost_fops);
	debugfs_create_file("performance", 0444, d, &em_dbg[i],
			    &em_debug_performance_fops);
	debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
			    &em_debug_inefficiency_fops);
}

static int em_debug_cpus_show(struct seq_file *s, void *unused)
{
	seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);

static int em_debug_flags_show(struct seq_file *s, void *unused)
{
	struct em_perf_domain *pd = s->private;

	seq_printf(s, "%#lx\n", pd->flags);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_flags);

static int em_debug_id_show(struct seq_file *s, void *unused)
{
	struct em_perf_domain *pd = s->private;

	seq_printf(s, "%d\n", pd->id);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_id);

static void em_debug_create_pd(struct device *dev)
{
	struct em_dbg_info *em_dbg;
	struct dentry *d;
	int i;

	/* Create the directory of the performance domain */
	d = debugfs_create_dir(dev_name(dev), rootdir);

	if (_is_cpu_device(dev))
		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
				    &em_debug_cpus_fops);

	debugfs_create_file("flags", 0444, d, dev->em_pd,
			    &em_debug_flags_fops);

	debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops);

	em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
			      sizeof(*em_dbg), GFP_KERNEL);
	if (!em_dbg)
		return;

	/* Create a sub-directory for each performance state */
	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
		em_debug_create_ps(dev->em_pd, em_dbg, i, d);
}

static void em_debug_remove_pd(struct device *dev)
{
	debugfs_lookup_and_remove(dev_name(dev), rootdir);
}

static int __init em_debug_init(void)
{
	/* Create /sys/kernel/debug/energy_model directory */
	rootdir = debugfs_create_dir("energy_model", NULL);

	return 0;
}
fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif

static void em_release_table_kref(struct kref *kref)
{
	/* This was the last owner of the table, so it can be freed now */
	kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
}

/**
 * em_table_free() - Handle the safe freeing of the EM table when needed
 * @table : EM table which is going to be freed
 *
 * No return values.
 */
void em_table_free(struct em_perf_table *table)
{
	kref_put(&table->kref, em_release_table_kref);
}

/**
 * em_table_alloc() - Allocate a new EM table
 * @pd		: EM performance domain for which this must be done
 *
 * Allocate a new EM table and initialize its kref to indicate that it
 * has a user.
 * Returns the allocated table or NULL.
 */
struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
{
	struct em_perf_table *table;
	int table_size;

	table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;

	table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
	if (!table)
		return NULL;

	kref_init(&table->kref);

	return table;
}
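
/*
 * Illustrative sketch (not used in this file): the ownership rules of an
 * EM table. em_table_alloc() hands back a table whose kref already counts
 * the caller as a user, so whoever allocates the table must eventually
 * drop that reference with em_table_free(). The population step is elided.
 *
 *	struct em_perf_table *table;
 *
 *	table = em_table_alloc(pd);
 *	if (!table)
 *		return -ENOMEM;
 *
 *	... fill table->state[0..pd->nr_perf_states - 1] ...
 *
 *	em_table_free(table);	drops this owner's reference
 */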

static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
				struct em_perf_state *table, int nr_states)
{
	u64 fmax, max_cap;
	int i, cpu;

	/* This is needed only for CPUs and EAS; skip other devices */
	if (!_is_cpu_device(dev))
		return;

	cpu = cpumask_first(em_span_cpus(pd));

	/*
	 * Calculate the performance value for each frequency assuming a
	 * linear relationship. The final CPU capacity might not be ready at
	 * boot time, but the EM will be updated a bit later with the
	 * correct one.
	 */
	fmax = (u64) table[nr_states - 1].frequency;
	max_cap = (u64) arch_scale_cpu_capacity(cpu);
	for (i = 0; i < nr_states; i++)
		table[i].performance = div64_u64(max_cap * table[i].frequency,
						 fmax);
}

static int em_compute_costs(struct device *dev, struct em_perf_state *table,
			    const struct em_data_callback *cb, int nr_states,
			    unsigned long flags)
{
	unsigned long prev_cost = ULONG_MAX;
	int i, ret;

	/* This is needed only for CPUs and EAS; skip other devices */
	if (!_is_cpu_device(dev))
		return 0;

	/* Compute the cost of each performance state. */
	for (i = nr_states - 1; i >= 0; i--) {
		unsigned long power_res, cost;

		if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
			ret = cb->get_cost(dev, table[i].frequency, &cost);
			if (ret || !cost || cost > EM_MAX_POWER) {
				dev_err(dev, "EM: invalid cost %lu %d\n",
					cost, ret);
				return -EINVAL;
			}
		} else {
			/* increase the resolution of 'cost' */
			power_res = table[i].power * 10;
			cost = power_res / table[i].performance;
		}

		table[i].cost = cost;

		if (table[i].cost >= prev_cost) {
			table[i].flags = EM_PERF_STATE_INEFFICIENT;
			dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
				table[i].frequency);
		} else {
			prev_cost = table[i].cost;
		}
	}

	return 0;
}

/**
 * em_dev_compute_costs() - Calculate cost values for a new runtime EM table
 * @dev		: Device for which the EM table is to be updated
 * @table	: The new EM table that is going to get the costs calculated
 * @nr_states	: Number of performance states
 *
 * Calculate the em_perf_state::cost values for a new runtime EM table. The
 * values are used by EAS during task placement. This also calculates and sets
 * the efficiency flag for each performance state. When the function finishes
 * successfully, the EM table is ready to be updated and used by EAS.
 *
 * Return 0 on success or an error code on failure.
 */
int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
			 int nr_states)
{
	return em_compute_costs(dev, table, NULL, nr_states, 0);
}

/**
 * em_dev_update_perf_domain() - Update the runtime EM table for a device
 * @dev		: Device for which the EM is to be updated
 * @new_table	: The new EM table that is going to be used from now on
 *
 * Update the EM runtime modifiable table for the @dev using the provided
 * @new_table.
 *
 * This function uses a mutex to serialize writers, so it must not be called
 * from a non-sleeping context.
 *
 * Return 0 on success or an error code on failure.
 */
int em_dev_update_perf_domain(struct device *dev,
			      struct em_perf_table *new_table)
{
	struct em_perf_table *old_table;
	struct em_perf_domain *pd;

	if (!dev)
		return -EINVAL;

	/* Serialize update/unregister or concurrent updates */
	mutex_lock(&em_pd_mutex);

	if (!dev->em_pd) {
		mutex_unlock(&em_pd_mutex);
		return -EINVAL;
	}
	pd = dev->em_pd;

	kref_get(&new_table->kref);

	old_table = rcu_dereference_protected(pd->em_table,
					      lockdep_is_held(&em_pd_mutex));
	rcu_assign_pointer(pd->em_table, new_table);

	em_cpufreq_update_efficiencies(dev, new_table->state);

	em_table_free(old_table);

	mutex_unlock(&em_pd_mutex);

	em_notify_pd_updated(pd);
	return 0;
}
EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
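
/*
 * Illustrative sketch (not used in this file): a full runtime table update,
 * mirroring the em_recalc_and_update() flow further below. The
 * example_scale_power() wrapper and the 10% power bump are made up for the
 * example; only the EM calls are real. Note that em_table_dup() is static
 * to this file, so an external modifier would open-code the copy via
 * em_table_alloc() and em_perf_state_from_pd() under RCU.
 *
 *	static int example_scale_power(struct device *dev,
 *				       struct em_perf_domain *pd)
 *	{
 *		struct em_perf_table *table;
 *		int i, ret;
 *
 *		table = em_table_dup(pd);
 *		if (!table)
 *			return -ENOMEM;
 *
 *		for (i = 0; i < pd->nr_perf_states; i++)
 *			table->state[i].power += table->state[i].power / 10;
 *
 *		ret = em_dev_compute_costs(dev, table->state,
 *					   pd->nr_perf_states);
 *		if (!ret)
 *			ret = em_dev_update_perf_domain(dev, table);
 *
 *		em_table_free(table);	the EM core holds its own reference
 *		return ret;
 *	}
 */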

static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
				struct em_perf_state *table,
				const struct em_data_callback *cb,
				unsigned long flags)
{
	unsigned long power, freq, prev_freq = 0;
	int nr_states = pd->nr_perf_states;
	int i, ret;

	/* Build the list of performance states for this performance domain */
	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
		/*
		 * active_power() is a driver callback which ceils 'freq' to
		 * the lowest performance state of 'dev' above 'freq' and
		 * updates 'power' and 'freq' accordingly.
		 */
		ret = cb->active_power(dev, &power, &freq);
		if (ret) {
			dev_err(dev, "EM: invalid perf. state: %d\n",
				ret);
			return -EINVAL;
		}

		/*
		 * We expect the driver callback to increase the frequency for
		 * higher performance states.
		 */
		if (freq <= prev_freq) {
			dev_err(dev, "EM: non-increasing freq: %lu\n",
				freq);
			return -EINVAL;
		}

		/*
		 * The power returned by active_power() is expected to be
		 * positive and within range.
		 */
		if (!power || power > EM_MAX_POWER) {
			dev_err(dev, "EM: invalid power: %lu\n",
				power);
			return -EINVAL;
		}

		table[i].power = power;
		table[i].frequency = prev_freq = freq;
	}

	em_init_performance(dev, pd, table, nr_states);

	ret = em_compute_costs(dev, table, cb, nr_states, flags);
	if (ret)
		return -EINVAL;

	return 0;
}

static int em_create_pd(struct device *dev, int nr_states,
			const struct em_data_callback *cb,
			const cpumask_t *cpus,
			unsigned long flags)
{
	struct em_perf_table *em_table;
	struct em_perf_domain *pd;
	struct device *cpu_dev;
	int cpu, ret, num_cpus, id;

	if (_is_cpu_device(dev)) {
		num_cpus = cpumask_weight(cpus);

		/* Prevent the max possible energy calculation from overflowing */
		if (num_cpus > EM_MAX_NUM_CPUS) {
			dev_err(dev, "EM: too many CPUs, overflow possible\n");
			return -EINVAL;
		}

		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
		if (!pd)
			return -ENOMEM;

		cpumask_copy(em_span_cpus(pd), cpus);
	} else {
		pd = kzalloc(sizeof(*pd), GFP_KERNEL);
		if (!pd)
			return -ENOMEM;
	}

	pd->nr_perf_states = nr_states;

	INIT_LIST_HEAD(&pd->node);

	id = ida_alloc(&em_pd_ida, GFP_KERNEL);
	if (id < 0) {
		kfree(pd);
		return id;
	}
	pd->id = id;

	em_table = em_table_alloc(pd);
	if (!em_table)
		goto free_pd;

	ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
	if (ret)
		goto free_pd_table;

	rcu_assign_pointer(pd->em_table, em_table);

	if (_is_cpu_device(dev))
		for_each_cpu(cpu, cpus) {
			cpu_dev = get_cpu_device(cpu);
			cpu_dev->em_pd = pd;
		}

	dev->em_pd = pd;

	return 0;

free_pd_table:
	kfree(em_table);
free_pd:
	kfree(pd);
	ida_free(&em_pd_ida, id);
	return -EINVAL;
}

static void
em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
{
	struct em_perf_domain *pd = dev->em_pd;
	struct cpufreq_policy *policy;
	int found = 0;
	int i, cpu;

	if (!_is_cpu_device(dev))
		return;

	/* Try to get a CPU which is active and in this PD */
	cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
	if (cpu >= nr_cpu_ids) {
		dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
		return;
	}

	policy = cpufreq_cpu_get(cpu);
	if (!policy) {
		dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
		return;
	}

	for (i = 0; i < pd->nr_perf_states; i++) {
		if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
			continue;

		if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
			found++;
	}

	cpufreq_cpu_put(policy);

	if (!found)
		return;

	/*
	 * Efficiencies have been installed in CPUFreq, inefficient frequencies
	 * will be skipped. The EM can do the same.
	 */
	pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
}

/**
 * em_pd_get() - Return the performance domain for a device
 * @dev : Device to find the performance domain for
 *
 * Returns the performance domain to which @dev belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_pd_get(struct device *dev)
{
	if (IS_ERR_OR_NULL(dev))
		return NULL;

	return dev->em_pd;
}
EXPORT_SYMBOL_GPL(em_pd_get);

/**
 * em_cpu_get() - Return the performance domain for a CPU
 * @cpu : CPU to find the performance domain for
 *
 * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
	struct device *cpu_dev;

	cpu_dev = get_cpu_device(cpu);
	if (!cpu_dev)
		return NULL;

	return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);
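
/*
 * Illustrative sketch (not used in this file): reading the perf states of a
 * CPU's domain. The table must be dereferenced via em_perf_state_from_pd()
 * under rcu_read_lock(), as em_adjust_new_capacity() below does. The
 * example_cpu_max_perf() wrapper is made up for the example.
 *
 *	static unsigned long example_cpu_max_perf(int cpu)
 *	{
 *		struct em_perf_domain *pd = em_cpu_get(cpu);
 *		struct em_perf_state *table;
 *		unsigned long max_perf;
 *
 *		if (!pd)
 *			return 0;
 *
 *		rcu_read_lock();
 *		table = em_perf_state_from_pd(pd);
 *		max_perf = table[pd->nr_perf_states - 1].performance;
 *		rcu_read_unlock();
 *
 *		return max_perf;
 *	}
 */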

/**
 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
 * @dev		: Device for which the EM is to be registered
 * @nr_states	: Number of performance states to register
 * @cb		: Callback functions providing the data of the Energy Model
 * @cpus	: Pointer to a cpumask_t, which is mandatory in the case of a
 *		CPU device. It can be taken from e.g. 'policy->cpus'. For
 *		other types of devices it should be set to NULL.
 * @microwatts	: Flag indicating whether the power values are in micro-Watts
 *		or in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in @cb.
 *
 * It is important to set @microwatts correctly. Some kernel sub-systems
 * might rely on this flag and check that all devices in the EM are using
 * the same scale.
 *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
				const struct em_data_callback *cb,
				const cpumask_t *cpus, bool microwatts)
{
	int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);

	if (_is_cpu_device(dev))
		em_check_capacity_update();

	return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
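
/*
 * Illustrative sketch (not used in this file): a driver registering an EM
 * for a CPUfreq policy. The callback must ceil *freq to a supported
 * frequency and report the active power in uW at that frequency.
 * example_active_power(), 'cpu_dev', 'nr_opps' and 'policy' are assumptions
 * made up for the example, and dev_pm_opp_calc_power() is just one possible
 * data source.
 *
 *	static int example_active_power(struct device *dev,
 *					unsigned long *power,
 *					unsigned long *freq)
 *	{
 *		return dev_pm_opp_calc_power(dev, power, freq);
 *	}
 *
 *	static const struct em_data_callback em_cb = {
 *		.active_power = example_active_power,
 *	};
 *
 *	ret = em_dev_register_perf_domain(cpu_dev, nr_opps, &em_cb,
 *					  policy->cpus, true);
 */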

/**
 * em_dev_register_pd_no_update() - Register a perf domain for a device
 * @dev : Device to register the PD for
 * @nr_states : Number of performance states in the new PD
 * @cb : Callback functions for populating the energy model
 * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
 * @microwatts : Whether or not the power values in the EM will be in uW
 *
 * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
 * update after registering the PD, even if @dev is a CPU device.
 */
int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
				 const struct em_data_callback *cb,
				 const cpumask_t *cpus, bool microwatts)
{
	struct em_perf_table *em_table;
	unsigned long cap, prev_cap = 0;
	unsigned long flags = 0;
	int cpu, ret;

	if (!dev || !nr_states || !cb)
		return -EINVAL;

	/*
	 * Use a mutex to serialize the registration of performance domains and
	 * let the driver-defined callback functions sleep.
	 */
	mutex_lock(&em_pd_mutex);

	if (dev->em_pd) {
		ret = -EEXIST;
		goto unlock;
	}

	if (_is_cpu_device(dev)) {
		if (!cpus) {
			dev_err(dev, "EM: invalid CPU mask\n");
			ret = -EINVAL;
			goto unlock;
		}

		for_each_cpu(cpu, cpus) {
			if (em_cpu_get(cpu)) {
				dev_err(dev, "EM: exists for CPU%d\n", cpu);
				ret = -EEXIST;
				goto unlock;
			}
			/*
			 * All CPUs of a domain must have the same
			 * micro-architecture since they all share the same
			 * table.
			 */
			cap = arch_scale_cpu_capacity(cpu);
			if (prev_cap && prev_cap != cap) {
				dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
					cpumask_pr_args(cpus));

				ret = -EINVAL;
				goto unlock;
			}
			prev_cap = cap;
		}
	}

	if (microwatts)
		flags |= EM_PERF_DOMAIN_MICROWATTS;
	else if (cb->get_cost)
		flags |= EM_PERF_DOMAIN_ARTIFICIAL;

	/*
	 * The EM only supports uW (the artificial EM is an exception).
	 * Therefore, check and force the drivers to provide power in uW.
	 */
	if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
		dev_err(dev, "EM: only supports uW power values\n");
		ret = -EINVAL;
		goto unlock;
	}

	ret = em_create_pd(dev, nr_states, cb, cpus, flags);
	if (ret)
		goto unlock;

	dev->em_pd->flags |= flags;
	dev->em_pd->min_perf_state = 0;
	dev->em_pd->max_perf_state = nr_states - 1;

	em_table = rcu_dereference_protected(dev->em_pd->em_table,
					     lockdep_is_held(&em_pd_mutex));
	em_cpufreq_update_efficiencies(dev, em_table->state);

	em_debug_create_pd(dev);
	dev_info(dev, "EM: created perf domain\n");

unlock:
	mutex_unlock(&em_pd_mutex);
	if (ret)
		return ret;

	mutex_lock(&em_pd_list_mutex);
	list_add_tail(&dev->em_pd->node, &em_pd_list);
	mutex_unlock(&em_pd_list_mutex);

	em_notify_pd_created(dev->em_pd);

	return 0;
}
EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);

/**
 * em_dev_unregister_perf_domain() - Unregister the Energy Model (EM) for a device
 * @dev		: Device for which the EM is registered
 *
 * Unregister the EM for the specified @dev (but not a CPU device).
 */
void em_dev_unregister_perf_domain(struct device *dev)
{
	if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
		return;

	if (_is_cpu_device(dev))
		return;

	mutex_lock(&em_pd_list_mutex);
	list_del_init(&dev->em_pd->node);
	mutex_unlock(&em_pd_list_mutex);

	em_notify_pd_deleted(dev->em_pd);

	/*
	 * The mutex separates all register/unregister requests and protects
	 * against potential clean-up/setup issues in the debugfs directories.
	 * The debugfs directory name is the same as the device's name.
	 */
	mutex_lock(&em_pd_mutex);
	em_debug_remove_pd(dev);

	em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
						lockdep_is_held(&em_pd_mutex)));

	ida_free(&em_pd_ida, dev->em_pd->id);

	kfree(dev->em_pd);
	dev->em_pd = NULL;
	mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);

static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
{
	struct em_perf_table *em_table;
	struct em_perf_state *ps, *new_ps;
	int ps_size;

	em_table = em_table_alloc(pd);
	if (!em_table)
		return NULL;

	new_ps = em_table->state;

	rcu_read_lock();
	ps = em_perf_state_from_pd(pd);
	/* Initialize data based on the old table */
	ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
	memcpy(new_ps, ps, ps_size);

	rcu_read_unlock();

	return em_table;
}

static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
				struct em_perf_table *em_table)
{
	int ret;

	if (!em_is_artificial(pd)) {
		ret = em_compute_costs(dev, em_table->state, NULL,
				       pd->nr_perf_states, pd->flags);
		if (ret)
			goto free_em_table;
	}

	ret = em_dev_update_perf_domain(dev, em_table);
	if (ret)
		goto free_em_table;

	/*
	 * This is a one-time update, so give up the ownership in this updater.
	 * The EM framework has incremented the usage counter and from now on
	 * will keep the reference (and free the memory when needed).
	 */
free_em_table:
	em_table_free(em_table);
	return ret;
}

/*
 * Adjustment of CPU performance values after boot, when all CPUs' capacities
 * are correctly calculated.
 */
static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
				   struct em_perf_domain *pd)
{
	unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
	struct em_perf_table *em_table;
	struct em_perf_state *table;
	unsigned long em_max_perf;

	rcu_read_lock();
	table = em_perf_state_from_pd(pd);
	em_max_perf = table[pd->nr_perf_states - 1].performance;
	rcu_read_unlock();

	if (em_max_perf == cpu_capacity)
		return;

	pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
		 cpu_capacity, em_max_perf);

	em_table = em_table_dup(pd);
	if (!em_table) {
		dev_warn(dev, "EM: allocation failed\n");
		return;
	}

	em_init_performance(dev, pd, em_table->state, pd->nr_perf_states);

	em_recalc_and_update(dev, pd, em_table);
}

/**
 * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
 * @cpu: Target CPU.
 *
 * Adjust the existing EM for @cpu after a capacity update under the assumption
 * that the capacity has been updated in the same way for all of the CPUs in
 * the same perf domain.
 */
void em_adjust_cpu_capacity(unsigned int cpu)
{
	struct device *dev = get_cpu_device(cpu);
	struct em_perf_domain *pd;

	pd = em_pd_get(dev);
	if (pd)
		em_adjust_new_capacity(cpu, dev, pd);
}

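/*
 * Illustrative sketch (not used in this file): a caller that has just
 * rescaled CPU capacities, e.g. arch or topology code, nudging the EM.
 * 'updated_cpus' is made up for the example; em_adjust_new_capacity()
 * bails out early for CPUs whose EM already matches the new capacity.
 *
 *	for_each_cpu(cpu, updated_cpus)
 *		em_adjust_cpu_capacity(cpu);
 */
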
static void em_check_capacity_update(void)
{
	cpumask_var_t cpu_done_mask;
	int cpu, failed_cpus = 0;

	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
		pr_warn("no free memory\n");
		return;
	}

	/* Check if the CPU capacities have changed and, if so, update the EM */
	for_each_possible_cpu(cpu) {
		struct cpufreq_policy *policy;
		struct em_perf_domain *pd;
		struct device *dev;

		if (cpumask_test_cpu(cpu, cpu_done_mask))
			continue;

		policy = cpufreq_cpu_get(cpu);
		if (!policy) {
			failed_cpus++;
			continue;
		}
		cpufreq_cpu_put(policy);

		dev = get_cpu_device(cpu);
		pd = em_pd_get(dev);
		if (!pd || em_is_artificial(pd))
			continue;

		cpumask_or(cpu_done_mask, cpu_done_mask,
			   em_span_cpus(pd));

		em_adjust_new_capacity(cpu, dev, pd);
	}

	if (failed_cpus)
		schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000));

	free_cpumask_var(cpu_done_mask);
}

static void em_update_workfn(struct work_struct *work)
{
	em_check_capacity_update();
}

/**
 * em_dev_update_chip_binning() - Update the Energy Model after new voltage
 *				information is present in the OPPs.
 * @dev		: Device for which the Energy Model has to be updated.
 *
 * This function makes it easy to update the EM with new values available in
 * the OPP framework and DT. It can be used after the chip has been properly
 * verified by device drivers and the voltages adjusted for the 'chip binning'.
 */
int em_dev_update_chip_binning(struct device *dev)
{
	struct em_perf_table *em_table;
	struct em_perf_domain *pd;
	int i, ret;

	if (IS_ERR_OR_NULL(dev))
		return -EINVAL;

	pd = em_pd_get(dev);
	if (!pd) {
		dev_warn(dev, "Couldn't find Energy Model\n");
		return -EINVAL;
	}

	em_table = em_table_dup(pd);
	if (!em_table) {
		dev_warn(dev, "EM: allocation failed\n");
		return -ENOMEM;
	}

	/* Update power values which might change due to new voltage in OPPs */
	for (i = 0; i < pd->nr_perf_states; i++) {
		unsigned long freq = em_table->state[i].frequency;
		unsigned long power;

		ret = dev_pm_opp_calc_power(dev, &power, &freq);
		if (ret) {
			em_table_free(em_table);
			return ret;
		}

		em_table->state[i].power = power;
	}

	return em_recalc_and_update(dev, pd, em_table);
}
EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
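
/*
 * Illustrative sketch (not used in this file): a driver refreshing the EM
 * once the binning-specific voltages have been written into the OPPs,
 * assuming dev_pm_opp_adjust_voltage() as the adjustment step (the exact
 * OPP call, the frequency/voltage values and the surrounding loop are up
 * to the driver).
 *
 *	ret = dev_pm_opp_adjust_voltage(dev, freq_hz, u_volt,
 *					u_volt_min, u_volt_max);
 *	if (!ret)
 *		ret = em_dev_update_chip_binning(dev);
 */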

/**
 * em_update_performance_limits() - Update the Energy Model with performance
 *				limits information.
 * @pd			: Performance Domain with the EM that has to be updated.
 * @freq_min_khz	: New minimum allowed frequency for this device.
 * @freq_max_khz	: New maximum allowed frequency for this device.
 *
 * This function updates the EM with information about the available
 * performance levels. It takes the minimum and maximum frequency in kHz
 * and translates them internally to performance levels.
 * Returns 0 on success or -EINVAL on failure.
 */
int em_update_performance_limits(struct em_perf_domain *pd,
		unsigned long freq_min_khz, unsigned long freq_max_khz)
{
	struct em_perf_state *table;
	int min_ps = -1;
	int max_ps = -1;
	int i;

	if (!pd)
		return -EINVAL;

	rcu_read_lock();
	table = em_perf_state_from_pd(pd);

	for (i = 0; i < pd->nr_perf_states; i++) {
		if (freq_min_khz == table[i].frequency)
			min_ps = i;
		if (freq_max_khz == table[i].frequency)
			max_ps = i;
	}
	rcu_read_unlock();

	/* Only update when both are found and sane */
	if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
		return -EINVAL;

	/* Guard simultaneous updates and make them atomic */
	mutex_lock(&em_pd_mutex);
	pd->min_perf_state = min_ps;
	pd->max_perf_state = max_ps;
	mutex_unlock(&em_pd_mutex);

	return 0;
}
EXPORT_SYMBOL_GPL(em_update_performance_limits);
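
/*
 * Illustrative sketch (not used in this file): clamping a domain to a
 * frequency band. Both frequencies must exactly match entries in the EM
 * table; the 500 MHz / 2 GHz values are made up for the example.
 *
 *	struct em_perf_domain *pd = em_pd_get(dev);
 *
 *	if (pd)
 *		ret = em_update_performance_limits(pd, 500000, 2000000);
 */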

static void rebuild_sd_workfn(struct work_struct *work)
{
	rebuild_sched_domains_energy();
}

void em_rebuild_sched_domains(void)
{
	static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

	/*
	 * When called from the cpufreq_register_driver() path, the
	 * cpu_hotplug_lock is already held, so use a work item to
	 * avoid nested locking in rebuild_sched_domains().
	 */
	schedule_work(&rebuild_sd_work);
}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
int for_each_em_perf_domain(int (*cb)(struct em_perf_domain *, void *),
			    void *data)
{
	struct em_perf_domain *pd;

	lockdep_assert_not_held(&em_pd_mutex);
	guard(mutex)(&em_pd_list_mutex);

	list_for_each_entry(pd, &em_pd_list, node) {
		int ret;

		ret = cb(pd, data);
		if (ret)
			return ret;
	}

	return 0;
}
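
/*
 * Illustrative sketch (not used in this file): walking all registered
 * performance domains. example_count_pd() is made up for the example; a
 * non-zero return value from the callback stops the walk.
 *
 *	static int example_count_pd(struct em_perf_domain *pd, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int count = 0;
 *
 *	for_each_em_perf_domain(example_count_pd, &count);
 */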

struct em_perf_domain *em_perf_domain_get_by_id(int id)
{
	struct em_perf_domain *pd;

	lockdep_assert_not_held(&em_pd_mutex);
	guard(mutex)(&em_pd_list_mutex);

	list_for_each_entry(pd, &em_pd_list, node) {
		if (pd->id == id)
			return pd;
	}

	return NULL;
}
#endif