1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * devfreq_cooling: Thermal cooling device implementation for devices using 4 * devfreq 5 * 6 * Copyright (C) 2014-2015 ARM Limited 7 * 8 * TODO: 9 * - If OPPs are added or removed after devfreq cooling has 10 * registered, the devfreq cooling won't react to it. 11 */ 12 13 #include <linux/devfreq.h> 14 #include <linux/devfreq_cooling.h> 15 #include <linux/energy_model.h> 16 #include <linux/export.h> 17 #include <linux/slab.h> 18 #include <linux/pm_opp.h> 19 #include <linux/pm_qos.h> 20 #include <linux/thermal.h> 21 #include <linux/units.h> 22 23 #include "thermal_trace.h" 24 25 #define SCALE_ERROR_MITIGATION 100 26 27 /** 28 * struct devfreq_cooling_device - Devfreq cooling device 29 * devfreq_cooling_device registered. 30 * @cdev: Pointer to associated thermal cooling device. 31 * @cooling_ops: devfreq callbacks to thermal cooling device ops 32 * @devfreq: Pointer to associated devfreq device. 33 * @cooling_state: Current cooling state. 34 * @freq_table: Pointer to a table with the frequencies sorted in descending 35 * order. You can index the table by cooling device state 36 * @max_state: It is the last index, that is, one less than the number of the 37 * OPPs 38 * @power_ops: Pointer to devfreq_cooling_power, a more precised model. 39 * @res_util: Resource utilization scaling factor for the power. 40 * It is multiplied by 100 to minimize the error. It is used 41 * for estimation of the power budget instead of using 42 * 'utilization' (which is 'busy_time' / 'total_time'). 43 * The 'res_util' range is from 100 to power * 100 for the 44 * corresponding 'state'. 45 * @capped_state: index to cooling state with in dynamic power budget 46 * @req_max_freq: PM QoS request for limiting the maximum frequency 47 * of the devfreq device. 48 * @em_pd: Energy Model for the associated Devfreq device 49 */ 50 struct devfreq_cooling_device { 51 struct thermal_cooling_device *cdev; 52 struct thermal_cooling_device_ops cooling_ops; 53 struct devfreq *devfreq; 54 unsigned long cooling_state; 55 u32 *freq_table; 56 size_t max_state; 57 struct devfreq_cooling_power *power_ops; 58 u32 res_util; 59 int capped_state; 60 struct dev_pm_qos_request req_max_freq; 61 struct em_perf_domain *em_pd; 62 }; 63 64 static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev, 65 unsigned long *state) 66 { 67 struct devfreq_cooling_device *dfc = cdev->devdata; 68 69 *state = dfc->max_state; 70 71 return 0; 72 } 73 74 static int devfreq_cooling_get_cur_state(struct thermal_cooling_device *cdev, 75 unsigned long *state) 76 { 77 struct devfreq_cooling_device *dfc = cdev->devdata; 78 79 *state = dfc->cooling_state; 80 81 return 0; 82 } 83 84 static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev, 85 unsigned long state) 86 { 87 struct devfreq_cooling_device *dfc = cdev->devdata; 88 struct devfreq *df = dfc->devfreq; 89 struct device *dev = df->dev.parent; 90 struct em_perf_state *table; 91 unsigned long freq; 92 int perf_idx; 93 94 if (state == dfc->cooling_state) 95 return 0; 96 97 dev_dbg(dev, "Setting cooling state %lu\n", state); 98 99 if (state > dfc->max_state) 100 return -EINVAL; 101 102 if (dfc->em_pd) { 103 perf_idx = dfc->max_state - state; 104 105 rcu_read_lock(); 106 table = em_perf_state_from_pd(dfc->em_pd); 107 freq = table[perf_idx].frequency * 1000; 108 rcu_read_unlock(); 109 } else { 110 freq = dfc->freq_table[state]; 111 } 112 113 dev_pm_qos_update_request(&dfc->req_max_freq, 114 DIV_ROUND_UP(freq, HZ_PER_KHZ)); 115 116 dfc->cooling_state = state; 117 118 return 0; 119 } 120 121 /** 122 * get_perf_idx() - get the performance index corresponding to a frequency 123 * @em_pd: Pointer to device's Energy Model 124 * @freq: frequency in kHz 125 * 126 * Return: the performance index associated with the @freq, or 127 * -EINVAL if it wasn't found. 128 */ 129 static int get_perf_idx(struct em_perf_domain *em_pd, unsigned long freq) 130 { 131 struct em_perf_state *table; 132 int i, idx = -EINVAL; 133 134 rcu_read_lock(); 135 table = em_perf_state_from_pd(em_pd); 136 for (i = 0; i < em_pd->nr_perf_states; i++) { 137 if (table[i].frequency != freq) 138 continue; 139 140 idx = i; 141 break; 142 } 143 rcu_read_unlock(); 144 145 return idx; 146 } 147 148 static unsigned long get_voltage(struct devfreq *df, unsigned long freq) 149 { 150 struct device *dev = df->dev.parent; 151 unsigned long voltage; 152 struct dev_pm_opp *opp; 153 154 opp = dev_pm_opp_find_freq_exact(dev, freq, true); 155 if (PTR_ERR(opp) == -ERANGE) 156 opp = dev_pm_opp_find_freq_exact(dev, freq, false); 157 158 if (IS_ERR(opp)) { 159 dev_err_ratelimited(dev, "Failed to find OPP for frequency %lu: %ld\n", 160 freq, PTR_ERR(opp)); 161 return 0; 162 } 163 164 voltage = dev_pm_opp_get_voltage(opp) / 1000; /* mV */ 165 dev_pm_opp_put(opp); 166 167 if (voltage == 0) { 168 dev_err_ratelimited(dev, 169 "Failed to get voltage for frequency %lu\n", 170 freq); 171 } 172 173 return voltage; 174 } 175 176 static void _normalize_load(struct devfreq_dev_status *status) 177 { 178 if (status->total_time > 0xfffff) { 179 status->total_time >>= 10; 180 status->busy_time >>= 10; 181 } 182 183 status->busy_time <<= 10; 184 status->busy_time /= status->total_time ? : 1; 185 186 status->busy_time = status->busy_time ? : 1; 187 status->total_time = 1024; 188 } 189 190 static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev, 191 u32 *power) 192 { 193 struct devfreq_cooling_device *dfc = cdev->devdata; 194 struct devfreq *df = dfc->devfreq; 195 struct devfreq_dev_status status; 196 struct em_perf_state *table; 197 unsigned long state; 198 unsigned long freq; 199 unsigned long voltage; 200 int res, perf_idx; 201 202 mutex_lock(&df->lock); 203 status = df->last_status; 204 mutex_unlock(&df->lock); 205 206 freq = status.current_frequency; 207 208 if (dfc->power_ops && dfc->power_ops->get_real_power) { 209 voltage = get_voltage(df, freq); 210 if (voltage == 0) { 211 res = -EINVAL; 212 goto fail; 213 } 214 215 res = dfc->power_ops->get_real_power(df, power, freq, voltage); 216 if (!res) { 217 state = dfc->capped_state; 218 219 /* Convert EM power into milli-Watts first */ 220 rcu_read_lock(); 221 table = em_perf_state_from_pd(dfc->em_pd); 222 dfc->res_util = table[state].power; 223 rcu_read_unlock(); 224 225 dfc->res_util /= MICROWATT_PER_MILLIWATT; 226 227 dfc->res_util *= SCALE_ERROR_MITIGATION; 228 229 if (*power > 1) 230 dfc->res_util /= *power; 231 } else { 232 goto fail; 233 } 234 } else { 235 /* Energy Model frequencies are in kHz */ 236 perf_idx = get_perf_idx(dfc->em_pd, freq / 1000); 237 if (perf_idx < 0) { 238 res = -EAGAIN; 239 goto fail; 240 } 241 242 _normalize_load(&status); 243 244 /* Convert EM power into milli-Watts first */ 245 rcu_read_lock(); 246 table = em_perf_state_from_pd(dfc->em_pd); 247 *power = table[perf_idx].power; 248 rcu_read_unlock(); 249 250 *power /= MICROWATT_PER_MILLIWATT; 251 /* Scale power for utilization */ 252 *power *= status.busy_time; 253 *power >>= 10; 254 } 255 256 trace_thermal_power_devfreq_get_power(cdev, &status, freq, *power); 257 258 return 0; 259 fail: 260 /* It is safe to set max in this case */ 261 dfc->res_util = SCALE_ERROR_MITIGATION; 262 return res; 263 } 264 265 static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev, 266 unsigned long state, u32 *power) 267 { 268 struct devfreq_cooling_device *dfc = cdev->devdata; 269 struct em_perf_state *table; 270 int perf_idx; 271 272 if (state > dfc->max_state) 273 return -EINVAL; 274 275 perf_idx = dfc->max_state - state; 276 277 rcu_read_lock(); 278 table = em_perf_state_from_pd(dfc->em_pd); 279 *power = table[perf_idx].power; 280 rcu_read_unlock(); 281 282 *power /= MICROWATT_PER_MILLIWATT; 283 284 return 0; 285 } 286 287 static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev, 288 u32 power, unsigned long *state) 289 { 290 struct devfreq_cooling_device *dfc = cdev->devdata; 291 struct devfreq *df = dfc->devfreq; 292 struct devfreq_dev_status status; 293 unsigned long freq, em_power_mw; 294 struct em_perf_state *table; 295 s32 est_power; 296 int i; 297 298 mutex_lock(&df->lock); 299 status = df->last_status; 300 mutex_unlock(&df->lock); 301 302 freq = status.current_frequency; 303 304 if (dfc->power_ops && dfc->power_ops->get_real_power) { 305 /* Scale for resource utilization */ 306 est_power = power * dfc->res_util; 307 est_power /= SCALE_ERROR_MITIGATION; 308 } else { 309 /* Scale dynamic power for utilization */ 310 _normalize_load(&status); 311 est_power = power << 10; 312 est_power /= status.busy_time; 313 } 314 315 /* 316 * Find the first cooling state that is within the power 317 * budget. The EM power table is sorted ascending. 318 */ 319 rcu_read_lock(); 320 table = em_perf_state_from_pd(dfc->em_pd); 321 for (i = dfc->max_state; i > 0; i--) { 322 /* Convert EM power to milli-Watts to make safe comparison */ 323 em_power_mw = table[i].power; 324 em_power_mw /= MICROWATT_PER_MILLIWATT; 325 if (est_power >= em_power_mw) 326 break; 327 } 328 rcu_read_unlock(); 329 330 *state = dfc->max_state - i; 331 dfc->capped_state = *state; 332 333 trace_thermal_power_devfreq_limit(cdev, freq, *state, power); 334 return 0; 335 } 336 337 /** 338 * devfreq_cooling_gen_tables() - Generate frequency table. 339 * @dfc: Pointer to devfreq cooling device. 340 * @num_opps: Number of OPPs 341 * 342 * Generate frequency table which holds the frequencies in descending 343 * order. That way its indexed by cooling device state. This is for 344 * compatibility with drivers which do not register Energy Model. 345 * 346 * Return: 0 on success, negative error code on failure. 347 */ 348 static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc, 349 int num_opps) 350 { 351 struct devfreq *df = dfc->devfreq; 352 struct device *dev = df->dev.parent; 353 unsigned long freq; 354 int i; 355 356 dfc->freq_table = kcalloc(num_opps, sizeof(*dfc->freq_table), 357 GFP_KERNEL); 358 if (!dfc->freq_table) 359 return -ENOMEM; 360 361 for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) { 362 struct dev_pm_opp *opp; 363 364 opp = dev_pm_opp_find_freq_floor(dev, &freq); 365 if (IS_ERR(opp)) { 366 kfree(dfc->freq_table); 367 return PTR_ERR(opp); 368 } 369 370 dev_pm_opp_put(opp); 371 dfc->freq_table[i] = freq; 372 } 373 374 return 0; 375 } 376 377 /** 378 * of_devfreq_cooling_register_power() - Register devfreq cooling device, 379 * with OF and power information. 380 * @np: Pointer to OF device_node. 381 * @df: Pointer to devfreq device. 382 * @dfc_power: Pointer to devfreq_cooling_power. 383 * 384 * Register a devfreq cooling device. The available OPPs must be 385 * registered on the device. 386 * 387 * If @dfc_power is provided, the cooling device is registered with the 388 * power extensions. For the power extensions to work correctly, 389 * devfreq should use the simple_ondemand governor, other governors 390 * are not currently supported. 391 */ 392 struct thermal_cooling_device * 393 of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, 394 struct devfreq_cooling_power *dfc_power) 395 { 396 struct thermal_cooling_device *cdev; 397 struct device *dev = df->dev.parent; 398 struct devfreq_cooling_device *dfc; 399 struct em_perf_domain *em; 400 struct thermal_cooling_device_ops *ops; 401 char *name; 402 int err, num_opps; 403 404 405 dfc = kzalloc(sizeof(*dfc), GFP_KERNEL); 406 if (!dfc) 407 return ERR_PTR(-ENOMEM); 408 409 dfc->devfreq = df; 410 411 ops = &dfc->cooling_ops; 412 ops->get_max_state = devfreq_cooling_get_max_state; 413 ops->get_cur_state = devfreq_cooling_get_cur_state; 414 ops->set_cur_state = devfreq_cooling_set_cur_state; 415 416 em = em_pd_get(dev); 417 if (em && !em_is_artificial(em)) { 418 dfc->em_pd = em; 419 ops->get_requested_power = 420 devfreq_cooling_get_requested_power; 421 ops->state2power = devfreq_cooling_state2power; 422 ops->power2state = devfreq_cooling_power2state; 423 424 dfc->power_ops = dfc_power; 425 426 num_opps = em_pd_nr_perf_states(dfc->em_pd); 427 } else { 428 /* Backward compatibility for drivers which do not use IPA */ 429 dev_dbg(dev, "missing proper EM for cooling device\n"); 430 431 num_opps = dev_pm_opp_get_opp_count(dev); 432 433 err = devfreq_cooling_gen_tables(dfc, num_opps); 434 if (err) 435 goto free_dfc; 436 } 437 438 if (num_opps <= 0) { 439 err = -EINVAL; 440 goto free_dfc; 441 } 442 443 /* max_state is an index, not a counter */ 444 dfc->max_state = num_opps - 1; 445 446 err = dev_pm_qos_add_request(dev, &dfc->req_max_freq, 447 DEV_PM_QOS_MAX_FREQUENCY, 448 PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); 449 if (err < 0) 450 goto free_table; 451 452 err = -ENOMEM; 453 name = kasprintf(GFP_KERNEL, "devfreq-%s", dev_name(dev)); 454 if (!name) 455 goto remove_qos_req; 456 457 cdev = thermal_of_cooling_device_register(np, name, dfc, ops); 458 kfree(name); 459 460 if (IS_ERR(cdev)) { 461 err = PTR_ERR(cdev); 462 dev_err(dev, 463 "Failed to register devfreq cooling device (%d)\n", 464 err); 465 goto remove_qos_req; 466 } 467 468 dfc->cdev = cdev; 469 470 return cdev; 471 472 remove_qos_req: 473 dev_pm_qos_remove_request(&dfc->req_max_freq); 474 free_table: 475 kfree(dfc->freq_table); 476 free_dfc: 477 kfree(dfc); 478 479 return ERR_PTR(err); 480 } 481 EXPORT_SYMBOL_GPL(of_devfreq_cooling_register_power); 482 483 /** 484 * of_devfreq_cooling_register() - Register devfreq cooling device, 485 * with OF information. 486 * @np: Pointer to OF device_node. 487 * @df: Pointer to devfreq device. 488 */ 489 struct thermal_cooling_device * 490 of_devfreq_cooling_register(struct device_node *np, struct devfreq *df) 491 { 492 return of_devfreq_cooling_register_power(np, df, NULL); 493 } 494 EXPORT_SYMBOL_GPL(of_devfreq_cooling_register); 495 496 /** 497 * devfreq_cooling_register() - Register devfreq cooling device. 498 * @df: Pointer to devfreq device. 499 */ 500 struct thermal_cooling_device *devfreq_cooling_register(struct devfreq *df) 501 { 502 return of_devfreq_cooling_register(NULL, df); 503 } 504 EXPORT_SYMBOL_GPL(devfreq_cooling_register); 505 506 /** 507 * devfreq_cooling_em_register() - Register devfreq cooling device with 508 * power information and automatically register Energy Model (EM) 509 * @df: Pointer to devfreq device. 510 * @dfc_power: Pointer to devfreq_cooling_power. 511 * 512 * Register a devfreq cooling device and automatically register EM. The 513 * available OPPs must be registered for the device. 514 * 515 * If @dfc_power is provided, the cooling device is registered with the 516 * power extensions. It is using the simple Energy Model which requires 517 * "dynamic-power-coefficient" a devicetree property. To not break drivers 518 * which miss that DT property, the function won't bail out when the EM 519 * registration failed. The cooling device will be registered if everything 520 * else is OK. 521 */ 522 struct thermal_cooling_device * 523 devfreq_cooling_em_register(struct devfreq *df, 524 struct devfreq_cooling_power *dfc_power) 525 { 526 struct thermal_cooling_device *cdev; 527 struct device *dev; 528 int ret; 529 530 if (IS_ERR_OR_NULL(df)) 531 return ERR_PTR(-EINVAL); 532 533 dev = df->dev.parent; 534 535 ret = dev_pm_opp_of_register_em(dev, NULL); 536 if (ret) 537 dev_dbg(dev, "Unable to register EM for devfreq cooling device (%d)\n", 538 ret); 539 540 cdev = of_devfreq_cooling_register_power(dev->of_node, df, dfc_power); 541 542 if (IS_ERR_OR_NULL(cdev)) 543 em_dev_unregister_perf_domain(dev); 544 545 return cdev; 546 } 547 EXPORT_SYMBOL_GPL(devfreq_cooling_em_register); 548 549 /** 550 * devfreq_cooling_unregister() - Unregister devfreq cooling device. 551 * @cdev: Pointer to devfreq cooling device to unregister. 552 * 553 * Unregisters devfreq cooling device and related Energy Model if it was 554 * present. 555 */ 556 void devfreq_cooling_unregister(struct thermal_cooling_device *cdev) 557 { 558 struct devfreq_cooling_device *dfc; 559 struct device *dev; 560 561 if (IS_ERR_OR_NULL(cdev)) 562 return; 563 564 dfc = cdev->devdata; 565 dev = dfc->devfreq->dev.parent; 566 567 thermal_cooling_device_unregister(dfc->cdev); 568 dev_pm_qos_remove_request(&dfc->req_max_freq); 569 570 em_dev_unregister_perf_domain(dev); 571 572 kfree(dfc->freq_table); 573 kfree(dfc); 574 } 575 EXPORT_SYMBOL_GPL(devfreq_cooling_unregister); 576