// SPDX-License-Identifier: GPL-2.0
/*
 * Energy Model of devices
 *
 * Copyright (c) 2018-2021, Arm ltd.
 * Written by: Quentin Perret, Arm ltd.
 * Improvements provided by: Lukasz Luba, Arm ltd.
 */

#define pr_fmt(fmt) "energy_model: " fmt

#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>

/*
 * Mutex serializing the registrations of performance domains and letting
 * callbacks defined by drivers sleep.
 */
static DEFINE_MUTEX(em_pd_mutex);

static bool _is_cpu_device(struct device *dev)
{
	return (dev->bus == &cpu_subsys);
}

#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;

static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
{
	struct dentry *d;
	char name[24];

	snprintf(name, sizeof(name), "ps:%lu", ps->frequency);

	/* Create per-ps directory */
	d = debugfs_create_dir(name, pd);
	debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
	debugfs_create_ulong("power", 0444, d, &ps->power);
	debugfs_create_ulong("cost", 0444, d, &ps->cost);
	debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
}

static int em_debug_cpus_show(struct seq_file *s, void *unused)
{
	seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);

static int em_debug_flags_show(struct seq_file *s, void *unused)
{
	struct em_perf_domain *pd = s->private;

	seq_printf(s, "%#lx\n", pd->flags);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_flags);

static void em_debug_create_pd(struct device *dev)
{
	struct dentry *d;
	int i;

	/* Create the directory of the performance domain */
	d = debugfs_create_dir(dev_name(dev), rootdir);

	if (_is_cpu_device(dev))
		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
				    &em_debug_cpus_fops);

	debugfs_create_file("flags", 0444, d, dev->em_pd,
			    &em_debug_flags_fops);

	/* Create a sub-directory for each performance state */
	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
		em_debug_create_ps(&dev->em_pd->table[i], d);
}

static void em_debug_remove_pd(struct device *dev)
{
	debugfs_lookup_and_remove(dev_name(dev), rootdir);
}

static int __init em_debug_init(void)
{
	/* Create /sys/kernel/debug/energy_model directory */
	rootdir = debugfs_create_dir("energy_model", NULL);

	return 0;
}
fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif

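/*
 * For reference, an illustrative sketch of the debugfs layout the helpers
 * above produce for a hypothetical CPU performance domain named "cpu0" with
 * two performance states (the "ps:" frequencies are placeholder values, in
 * whatever units the driver reports, typically kHz):
 *
 *	/sys/kernel/debug/energy_model/cpu0/cpus
 *	/sys/kernel/debug/energy_model/cpu0/flags
 *	/sys/kernel/debug/energy_model/cpu0/ps:1000000/{frequency,power,cost,inefficient}
 *	/sys/kernel/debug/energy_model/cpu0/ps:2000000/{frequency,power,cost,inefficient}
 */
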
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
				int nr_states, struct em_data_callback *cb,
				unsigned long flags)
{
	unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
	struct em_perf_state *table;
	int i, ret;
	u64 fmax;

	table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	/* Build the list of performance states for this performance domain */
	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
		/*
		 * active_power() is a driver callback which ceils 'freq' to
		 * the lowest performance state of 'dev' above 'freq' and
		 * updates 'power' and 'freq' accordingly.
		 */
		ret = cb->active_power(dev, &power, &freq);
		if (ret) {
			dev_err(dev, "EM: invalid perf. state: %d\n",
				ret);
			goto free_ps_table;
		}

		/*
		 * We expect the driver callback to increase the frequency for
		 * higher performance states.
		 */
		if (freq <= prev_freq) {
			dev_err(dev, "EM: non-increasing freq: %lu\n",
				freq);
			goto free_ps_table;
		}

		/*
		 * The power returned by active_power() is expected to be
		 * positive and in range.
		 */
		if (!power || power > EM_MAX_POWER) {
			dev_err(dev, "EM: invalid power: %lu\n",
				power);
			goto free_ps_table;
		}

		table[i].power = power;
		table[i].frequency = prev_freq = freq;
	}

	/* Compute the cost of each performance state. */
	fmax = (u64) table[nr_states - 1].frequency;
	for (i = nr_states - 1; i >= 0; i--) {
		unsigned long power_res, cost;

		if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
			ret = cb->get_cost(dev, table[i].frequency, &cost);
			if (ret || !cost || cost > EM_MAX_POWER) {
				dev_err(dev, "EM: invalid cost %lu %d\n",
					cost, ret);
				goto free_ps_table;
			}
		} else {
			power_res = table[i].power;
			cost = div64_u64(fmax * power_res, table[i].frequency);
		}

		table[i].cost = cost;

		if (table[i].cost >= prev_cost) {
			table[i].flags = EM_PERF_STATE_INEFFICIENT;
			dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
				table[i].frequency);
		} else {
			prev_cost = table[i].cost;
		}
	}

	pd->table = table;
	pd->nr_perf_states = nr_states;

	return 0;

free_ps_table:
	kfree(table);
	return -EINVAL;
}

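/*
 * Worked example (illustrative numbers only) of the cost computation in
 * em_create_perf_table() for a non-artificial EM with three performance
 * states and fmax = 2000000:
 *
 *	frequency	power	cost = fmax * power / frequency
 *	2000000		900	900
 *	1500000		450	600
 *	1000000		400	800
 *
 * Walking the table from the highest state down, prev_cost tracks the
 * lowest cost seen so far (900, then 600). The 1000000 state has
 * cost 800 >= 600 and is therefore flagged EM_PERF_STATE_INEFFICIENT:
 * it consumes more energy per unit of work than the 1500000 state.
 */
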
static int em_create_pd(struct device *dev, int nr_states,
			struct em_data_callback *cb, cpumask_t *cpus,
			unsigned long flags)
{
	struct em_perf_domain *pd;
	struct device *cpu_dev;
	int cpu, ret, num_cpus;

	if (_is_cpu_device(dev)) {
		num_cpus = cpumask_weight(cpus);

		/* Prevent the max possible energy calculation from overflowing */
		if (num_cpus > EM_MAX_NUM_CPUS) {
			dev_err(dev, "EM: too many CPUs, overflow possible\n");
			return -EINVAL;
		}

		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
		if (!pd)
			return -ENOMEM;

		cpumask_copy(em_span_cpus(pd), cpus);
	} else {
		pd = kzalloc(sizeof(*pd), GFP_KERNEL);
		if (!pd)
			return -ENOMEM;
	}

	ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
	if (ret) {
		kfree(pd);
		return ret;
	}

	if (_is_cpu_device(dev))
		for_each_cpu(cpu, cpus) {
			cpu_dev = get_cpu_device(cpu);
			cpu_dev->em_pd = pd;
		}

	dev->em_pd = pd;

	return 0;
}

static void em_cpufreq_update_efficiencies(struct device *dev)
{
	struct em_perf_domain *pd = dev->em_pd;
	struct em_perf_state *table;
	struct cpufreq_policy *policy;
	int found = 0;
	int i;

	if (!_is_cpu_device(dev) || !pd)
		return;

	policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
	if (!policy) {
		dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
		return;
	}

	table = pd->table;

	for (i = 0; i < pd->nr_perf_states; i++) {
		if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
			continue;

		if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
			found++;
	}

	cpufreq_cpu_put(policy);

	if (!found)
		return;

	/*
	 * Inefficiencies have been installed in CPUFreq, so inefficient
	 * frequencies will be skipped. The EM can do the same.
	 */
	pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
}

/**
 * em_pd_get() - Return the performance domain for a device
 * @dev : Device to find the performance domain for
 *
 * Returns the performance domain to which @dev belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_pd_get(struct device *dev)
{
	if (IS_ERR_OR_NULL(dev))
		return NULL;

	return dev->em_pd;
}
EXPORT_SYMBOL_GPL(em_pd_get);

/**
 * em_cpu_get() - Return the performance domain for a CPU
 * @cpu : CPU to find the performance domain for
 *
 * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
	struct device *cpu_dev;

	cpu_dev = get_cpu_device(cpu);
	if (!cpu_dev)
		return NULL;

	return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);

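/*
 * Illustrative use of the lookup helpers above from a hypothetical consumer
 * (not actual kernel code). Since em_create_perf_table() enforces strictly
 * increasing frequencies, the last table entry is the highest performance
 * state:
 *
 *	struct em_perf_domain *pd = em_cpu_get(cpu);
 *	struct em_perf_state *ps;
 *
 *	if (pd) {
 *		ps = &pd->table[pd->nr_perf_states - 1];
 *		pr_debug("CPU%d: fmax=%lu power=%lu\n", cpu,
 *			 ps->frequency, ps->power);
 *	}
 */
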
/**
 * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
 * @dev : Device for which the EM is to be registered
 * @nr_states : Number of performance states to register
 * @cb : Callback functions providing the data of the Energy Model
 * @cpus : Pointer to cpumask_t, which is obligatory in case of a CPU
 *		device. It can be taken from e.g. 'policy->cpus'. For other
 *		types of devices it should be set to NULL.
 * @microwatts : Flag indicating whether the power values are in micro-Watts
 *		or in some other scale. It must be set correctly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
 * It is important to set @microwatts correctly. Some kernel sub-systems might
 * rely on this flag and check whether all devices in the EM are using the
 * same scale.
 *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success, or an error code otherwise.
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
				struct em_data_callback *cb, cpumask_t *cpus,
				bool microwatts)
{
	unsigned long cap, prev_cap = 0;
	unsigned long flags = 0;
	int cpu, ret;

	if (!dev || !nr_states || !cb)
		return -EINVAL;

	/*
	 * Use a mutex to serialize the registration of performance domains and
	 * let the driver-defined callback functions sleep.
	 */
	mutex_lock(&em_pd_mutex);

	if (dev->em_pd) {
		ret = -EEXIST;
		goto unlock;
	}

	if (_is_cpu_device(dev)) {
		if (!cpus) {
			dev_err(dev, "EM: invalid CPU mask\n");
			ret = -EINVAL;
			goto unlock;
		}

		for_each_cpu(cpu, cpus) {
			if (em_cpu_get(cpu)) {
				dev_err(dev, "EM: exists for CPU%d\n", cpu);
				ret = -EEXIST;
				goto unlock;
			}
			/*
			 * All CPUs of a domain must have the same
			 * micro-architecture since they all share the same
			 * table.
			 */
			cap = arch_scale_cpu_capacity(cpu);
			if (prev_cap && prev_cap != cap) {
				dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
					cpumask_pr_args(cpus));

				ret = -EINVAL;
				goto unlock;
			}
			prev_cap = cap;
		}
	}

	if (microwatts)
		flags |= EM_PERF_DOMAIN_MICROWATTS;
	else if (cb->get_cost)
		flags |= EM_PERF_DOMAIN_ARTIFICIAL;

	ret = em_create_pd(dev, nr_states, cb, cpus, flags);
	if (ret)
		goto unlock;

	dev->em_pd->flags |= flags;

	em_cpufreq_update_efficiencies(dev);

	em_debug_create_pd(dev);
	dev_info(dev, "EM: created perf domain\n");

unlock:
	mutex_unlock(&em_pd_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);

/**
 * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
 * @dev : Device for which the EM is registered
 *
 * Unregister the EM for the specified @dev (but not a CPU device).
 */
void em_dev_unregister_perf_domain(struct device *dev)
{
	if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
		return;

	if (_is_cpu_device(dev))
		return;

	/*
	 * The mutex separates all register/unregister requests and protects
	 * from potential clean-up/setup issues in the debugfs directories.
	 * The debugfs directory name is the same as the device's name.
	 */
	mutex_lock(&em_pd_mutex);
	em_debug_remove_pd(dev);

	kfree(dev->em_pd->table);
	kfree(dev->em_pd);
	dev->em_pd = NULL;
	mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);

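/*
 * Illustrative registration sequence from a hypothetical driver; the
 * foo_*() names are placeholders, not a real API. A cpufreq driver would
 * typically call this once per policy, here passing power values in
 * micro-Watts (microwatts == true):
 *
 *	static int foo_active_power(struct device *dev, unsigned long *power,
 *				    unsigned long *freq)
 *	{
 *		// Ceil *freq to the next supported OPP of 'dev', then fill
 *		// in *power and *freq for that OPP.
 *		...
 *	}
 *
 *	static struct em_data_callback foo_em_cb = {
 *		.active_power = foo_active_power,
 *	};
 *
 *	ret = em_dev_register_perf_domain(cpu_dev, nr_opps, &foo_em_cb,
 *					  policy->cpus, true);
 */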