1 /* 2 * x86_pkg_temp_thermal driver 3 * Copyright (c) 2013, Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 * 14 * You should have received a copy of the GNU General Public License along with 15 * this program; if not, write to the Free Software Foundation, Inc. 16 * 17 */ 18 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 19 20 #include <linux/module.h> 21 #include <linux/init.h> 22 #include <linux/err.h> 23 #include <linux/param.h> 24 #include <linux/device.h> 25 #include <linux/platform_device.h> 26 #include <linux/cpu.h> 27 #include <linux/smp.h> 28 #include <linux/slab.h> 29 #include <linux/pm.h> 30 #include <linux/thermal.h> 31 #include <linux/debugfs.h> 32 #include <asm/cpu_device_id.h> 33 #include <asm/mce.h> 34 35 /* 36 * Rate control delay: Idea is to introduce denounce effect 37 * This should be long enough to avoid reduce events, when 38 * threshold is set to a temperature, which is constantly 39 * violated, but at the short enough to take any action. 40 * The action can be remove threshold or change it to next 41 * interesting setting. Based on experiments, in around 42 * every 5 seconds under load will give us a significant 43 * temperature change. 44 */ 45 #define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000 46 static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY; 47 module_param(notify_delay_ms, int, 0644); 48 MODULE_PARM_DESC(notify_delay_ms, 49 "User space notification delay in milli seconds."); 50 51 /* Number of trip points in thermal zone. Currently it can't 52 * be more than 2. MSR can allow setting and getting notifications 53 * for only 2 thresholds. This define enforces this, if there 54 * is some wrong values returned by cpuid for number of thresholds. 55 */ 56 #define MAX_NUMBER_OF_TRIPS 2 57 58 struct pkg_device { 59 int cpu; 60 bool work_scheduled; 61 u32 tj_max; 62 u32 msr_pkg_therm_low; 63 u32 msr_pkg_therm_high; 64 struct delayed_work work; 65 struct thermal_zone_device *tzone; 66 struct cpumask cpumask; 67 }; 68 69 static struct thermal_zone_params pkg_temp_tz_params = { 70 .no_hwmon = true, 71 }; 72 73 /* Keep track of how many package pointers we allocated in init() */ 74 static int max_packages __read_mostly; 75 /* Array of package pointers */ 76 static struct pkg_device **packages; 77 /* Serializes interrupt notification, work and hotplug */ 78 static DEFINE_SPINLOCK(pkg_temp_lock); 79 /* Protects zone operation in the work function against hotplug removal */ 80 static DEFINE_MUTEX(thermal_zone_mutex); 81 82 /* The dynamically assigned cpu hotplug state for module_exit() */ 83 static enum cpuhp_state pkg_thermal_hp_state __read_mostly; 84 85 /* Debug counters to show using debugfs */ 86 static struct dentry *debugfs; 87 static unsigned int pkg_interrupt_cnt; 88 static unsigned int pkg_work_cnt; 89 90 static int pkg_temp_debugfs_init(void) 91 { 92 struct dentry *d; 93 94 debugfs = debugfs_create_dir("pkg_temp_thermal", NULL); 95 if (!debugfs) 96 return -ENOENT; 97 98 d = debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs, 99 &pkg_interrupt_cnt); 100 if (!d) 101 goto err_out; 102 103 d = debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs, 104 &pkg_work_cnt); 105 if (!d) 106 goto err_out; 107 108 return 0; 109 110 err_out: 111 debugfs_remove_recursive(debugfs); 112 return -ENOENT; 113 } 114 115 /* 116 * Protection: 117 * 118 * - cpu hotplug: Read serialized by cpu hotplug lock 119 * Write must hold pkg_temp_lock 120 * 121 * - Other callsites: Must hold pkg_temp_lock 122 */ 123 static struct pkg_device *pkg_temp_thermal_get_dev(unsigned int cpu) 124 { 125 int pkgid = topology_logical_package_id(cpu); 126 127 if (pkgid >= 0 && pkgid < max_packages) 128 return packages[pkgid]; 129 return NULL; 130 } 131 132 /* 133 * tj-max is is interesting because threshold is set relative to this 134 * temperature. 135 */ 136 static int get_tj_max(int cpu, u32 *tj_max) 137 { 138 u32 eax, edx, val; 139 int err; 140 141 err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx); 142 if (err) 143 return err; 144 145 val = (eax >> 16) & 0xff; 146 *tj_max = val * 1000; 147 148 return val ? 0 : -EINVAL; 149 } 150 151 static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp) 152 { 153 struct pkg_device *pkgdev = tzd->devdata; 154 u32 eax, edx; 155 156 rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_STATUS, &eax, &edx); 157 if (eax & 0x80000000) { 158 *temp = pkgdev->tj_max - ((eax >> 16) & 0x7f) * 1000; 159 pr_debug("sys_get_curr_temp %d\n", *temp); 160 return 0; 161 } 162 return -EINVAL; 163 } 164 165 static int sys_get_trip_temp(struct thermal_zone_device *tzd, 166 int trip, int *temp) 167 { 168 struct pkg_device *pkgdev = tzd->devdata; 169 unsigned long thres_reg_value; 170 u32 mask, shift, eax, edx; 171 int ret; 172 173 if (trip >= MAX_NUMBER_OF_TRIPS) 174 return -EINVAL; 175 176 if (trip) { 177 mask = THERM_MASK_THRESHOLD1; 178 shift = THERM_SHIFT_THRESHOLD1; 179 } else { 180 mask = THERM_MASK_THRESHOLD0; 181 shift = THERM_SHIFT_THRESHOLD0; 182 } 183 184 ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, 185 &eax, &edx); 186 if (ret < 0) 187 return ret; 188 189 thres_reg_value = (eax & mask) >> shift; 190 if (thres_reg_value) 191 *temp = pkgdev->tj_max - thres_reg_value * 1000; 192 else 193 *temp = 0; 194 pr_debug("sys_get_trip_temp %d\n", *temp); 195 196 return 0; 197 } 198 199 static int 200 sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp) 201 { 202 struct pkg_device *pkgdev = tzd->devdata; 203 u32 l, h, mask, shift, intr; 204 int ret; 205 206 if (trip >= MAX_NUMBER_OF_TRIPS || temp >= pkgdev->tj_max) 207 return -EINVAL; 208 209 ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, 210 &l, &h); 211 if (ret < 0) 212 return ret; 213 214 if (trip) { 215 mask = THERM_MASK_THRESHOLD1; 216 shift = THERM_SHIFT_THRESHOLD1; 217 intr = THERM_INT_THRESHOLD1_ENABLE; 218 } else { 219 mask = THERM_MASK_THRESHOLD0; 220 shift = THERM_SHIFT_THRESHOLD0; 221 intr = THERM_INT_THRESHOLD0_ENABLE; 222 } 223 l &= ~mask; 224 /* 225 * When users space sets a trip temperature == 0, which is indication 226 * that, it is no longer interested in receiving notifications. 227 */ 228 if (!temp) { 229 l &= ~intr; 230 } else { 231 l |= (pkgdev->tj_max - temp)/1000 << shift; 232 l |= intr; 233 } 234 235 return wrmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 236 } 237 238 static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip, 239 enum thermal_trip_type *type) 240 { 241 *type = THERMAL_TRIP_PASSIVE; 242 return 0; 243 } 244 245 /* Thermal zone callback registry */ 246 static struct thermal_zone_device_ops tzone_ops = { 247 .get_temp = sys_get_curr_temp, 248 .get_trip_temp = sys_get_trip_temp, 249 .get_trip_type = sys_get_trip_type, 250 .set_trip_temp = sys_set_trip_temp, 251 }; 252 253 static bool pkg_thermal_rate_control(void) 254 { 255 return true; 256 } 257 258 /* Enable threshold interrupt on local package/cpu */ 259 static inline void enable_pkg_thres_interrupt(void) 260 { 261 u8 thres_0, thres_1; 262 u32 l, h; 263 264 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 265 /* only enable/disable if it had valid threshold value */ 266 thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0; 267 thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1; 268 if (thres_0) 269 l |= THERM_INT_THRESHOLD0_ENABLE; 270 if (thres_1) 271 l |= THERM_INT_THRESHOLD1_ENABLE; 272 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 273 } 274 275 /* Disable threshold interrupt on local package/cpu */ 276 static inline void disable_pkg_thres_interrupt(void) 277 { 278 u32 l, h; 279 280 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 281 282 l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE); 283 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); 284 } 285 286 static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work) 287 { 288 struct thermal_zone_device *tzone = NULL; 289 int cpu = smp_processor_id(); 290 struct pkg_device *pkgdev; 291 u64 msr_val, wr_val; 292 293 mutex_lock(&thermal_zone_mutex); 294 spin_lock_irq(&pkg_temp_lock); 295 ++pkg_work_cnt; 296 297 pkgdev = pkg_temp_thermal_get_dev(cpu); 298 if (!pkgdev) { 299 spin_unlock_irq(&pkg_temp_lock); 300 mutex_unlock(&thermal_zone_mutex); 301 return; 302 } 303 pkgdev->work_scheduled = false; 304 305 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 306 wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1); 307 if (wr_val != msr_val) { 308 wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val); 309 tzone = pkgdev->tzone; 310 } 311 312 enable_pkg_thres_interrupt(); 313 spin_unlock_irq(&pkg_temp_lock); 314 315 /* 316 * If tzone is not NULL, then thermal_zone_mutex will prevent the 317 * concurrent removal in the cpu offline callback. 318 */ 319 if (tzone) 320 thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED); 321 322 mutex_unlock(&thermal_zone_mutex); 323 } 324 325 static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work) 326 { 327 unsigned long ms = msecs_to_jiffies(notify_delay_ms); 328 329 schedule_delayed_work_on(cpu, work, ms); 330 } 331 332 static int pkg_thermal_notify(u64 msr_val) 333 { 334 int cpu = smp_processor_id(); 335 struct pkg_device *pkgdev; 336 unsigned long flags; 337 338 spin_lock_irqsave(&pkg_temp_lock, flags); 339 ++pkg_interrupt_cnt; 340 341 disable_pkg_thres_interrupt(); 342 343 /* Work is per package, so scheduling it once is enough. */ 344 pkgdev = pkg_temp_thermal_get_dev(cpu); 345 if (pkgdev && !pkgdev->work_scheduled) { 346 pkgdev->work_scheduled = true; 347 pkg_thermal_schedule_work(pkgdev->cpu, &pkgdev->work); 348 } 349 350 spin_unlock_irqrestore(&pkg_temp_lock, flags); 351 return 0; 352 } 353 354 static int pkg_temp_thermal_device_add(unsigned int cpu) 355 { 356 int pkgid = topology_logical_package_id(cpu); 357 u32 tj_max, eax, ebx, ecx, edx; 358 struct pkg_device *pkgdev; 359 int thres_count, err; 360 361 if (pkgid >= max_packages) 362 return -ENOMEM; 363 364 cpuid(6, &eax, &ebx, &ecx, &edx); 365 thres_count = ebx & 0x07; 366 if (!thres_count) 367 return -ENODEV; 368 369 thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS); 370 371 err = get_tj_max(cpu, &tj_max); 372 if (err) 373 return err; 374 375 pkgdev = kzalloc(sizeof(*pkgdev), GFP_KERNEL); 376 if (!pkgdev) 377 return -ENOMEM; 378 379 INIT_DELAYED_WORK(&pkgdev->work, pkg_temp_thermal_threshold_work_fn); 380 pkgdev->cpu = cpu; 381 pkgdev->tj_max = tj_max; 382 pkgdev->tzone = thermal_zone_device_register("x86_pkg_temp", 383 thres_count, 384 (thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01, 385 pkgdev, &tzone_ops, &pkg_temp_tz_params, 0, 0); 386 if (IS_ERR(pkgdev->tzone)) { 387 err = PTR_ERR(pkgdev->tzone); 388 kfree(pkgdev); 389 return err; 390 } 391 /* Store MSR value for package thermal interrupt, to restore at exit */ 392 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, pkgdev->msr_pkg_therm_low, 393 pkgdev->msr_pkg_therm_high); 394 395 cpumask_set_cpu(cpu, &pkgdev->cpumask); 396 spin_lock_irq(&pkg_temp_lock); 397 packages[pkgid] = pkgdev; 398 spin_unlock_irq(&pkg_temp_lock); 399 return 0; 400 } 401 402 static int pkg_thermal_cpu_offline(unsigned int cpu) 403 { 404 struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu); 405 bool lastcpu, was_target; 406 int target; 407 408 if (!pkgdev) 409 return 0; 410 411 target = cpumask_any_but(&pkgdev->cpumask, cpu); 412 cpumask_clear_cpu(cpu, &pkgdev->cpumask); 413 lastcpu = target >= nr_cpu_ids; 414 /* 415 * Remove the sysfs files, if this is the last cpu in the package 416 * before doing further cleanups. 417 */ 418 if (lastcpu) { 419 struct thermal_zone_device *tzone = pkgdev->tzone; 420 421 /* 422 * We must protect against a work function calling 423 * thermal_zone_update, after/while unregister. We null out 424 * the pointer under the zone mutex, so the worker function 425 * won't try to call. 426 */ 427 mutex_lock(&thermal_zone_mutex); 428 pkgdev->tzone = NULL; 429 mutex_unlock(&thermal_zone_mutex); 430 431 thermal_zone_device_unregister(tzone); 432 } 433 434 /* Protect against work and interrupts */ 435 spin_lock_irq(&pkg_temp_lock); 436 437 /* 438 * Check whether this cpu was the current target and store the new 439 * one. When we drop the lock, then the interrupt notify function 440 * will see the new target. 441 */ 442 was_target = pkgdev->cpu == cpu; 443 pkgdev->cpu = target; 444 445 /* 446 * If this is the last CPU in the package remove the package 447 * reference from the array and restore the interrupt MSR. When we 448 * drop the lock neither the interrupt notify function nor the 449 * worker will see the package anymore. 450 */ 451 if (lastcpu) { 452 packages[topology_logical_package_id(cpu)] = NULL; 453 /* After this point nothing touches the MSR anymore. */ 454 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, 455 pkgdev->msr_pkg_therm_low, pkgdev->msr_pkg_therm_high); 456 } 457 458 /* 459 * Check whether there is work scheduled and whether the work is 460 * targeted at the outgoing CPU. 461 */ 462 if (pkgdev->work_scheduled && was_target) { 463 /* 464 * To cancel the work we need to drop the lock, otherwise 465 * we might deadlock if the work needs to be flushed. 466 */ 467 spin_unlock_irq(&pkg_temp_lock); 468 cancel_delayed_work_sync(&pkgdev->work); 469 spin_lock_irq(&pkg_temp_lock); 470 /* 471 * If this is not the last cpu in the package and the work 472 * did not run after we dropped the lock above, then we 473 * need to reschedule the work, otherwise the interrupt 474 * stays disabled forever. 475 */ 476 if (!lastcpu && pkgdev->work_scheduled) 477 pkg_thermal_schedule_work(target, &pkgdev->work); 478 } 479 480 spin_unlock_irq(&pkg_temp_lock); 481 482 /* Final cleanup if this is the last cpu */ 483 if (lastcpu) 484 kfree(pkgdev); 485 return 0; 486 } 487 488 static int pkg_thermal_cpu_online(unsigned int cpu) 489 { 490 struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu); 491 struct cpuinfo_x86 *c = &cpu_data(cpu); 492 493 /* Paranoia check */ 494 if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS)) 495 return -ENODEV; 496 497 /* If the package exists, nothing to do */ 498 if (pkgdev) { 499 cpumask_set_cpu(cpu, &pkgdev->cpumask); 500 return 0; 501 } 502 return pkg_temp_thermal_device_add(cpu); 503 } 504 505 static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = { 506 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS }, 507 {} 508 }; 509 MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids); 510 511 static int __init pkg_temp_thermal_init(void) 512 { 513 int ret; 514 515 if (!x86_match_cpu(pkg_temp_thermal_ids)) 516 return -ENODEV; 517 518 max_packages = topology_max_packages(); 519 packages = kcalloc(max_packages, sizeof(struct pkg_device *), 520 GFP_KERNEL); 521 if (!packages) 522 return -ENOMEM; 523 524 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online", 525 pkg_thermal_cpu_online, pkg_thermal_cpu_offline); 526 if (ret < 0) 527 goto err; 528 529 /* Store the state for module exit */ 530 pkg_thermal_hp_state = ret; 531 532 platform_thermal_package_notify = pkg_thermal_notify; 533 platform_thermal_package_rate_control = pkg_thermal_rate_control; 534 535 /* Don't care if it fails */ 536 pkg_temp_debugfs_init(); 537 return 0; 538 539 err: 540 kfree(packages); 541 return ret; 542 } 543 module_init(pkg_temp_thermal_init) 544 545 static void __exit pkg_temp_thermal_exit(void) 546 { 547 platform_thermal_package_notify = NULL; 548 platform_thermal_package_rate_control = NULL; 549 550 cpuhp_remove_state(pkg_thermal_hp_state); 551 debugfs_remove_recursive(debugfs); 552 kfree(packages); 553 } 554 module_exit(pkg_temp_thermal_exit) 555 556 MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver"); 557 MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>"); 558 MODULE_LICENSE("GPL v2"); 559