1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019, Joyent, Inc. 14 * Copyright 2024 Oxide Computer Company 15 */ 16 17 /* 18 * This implements a temperature sensor for AMD Zen family products that rely 19 * upon the SMN framework for getting temperature information. 20 * 21 * ---------- 22 * Background 23 * ---------- 24 * 25 * When we think of temperature sensors, we generally think of an external or 26 * embedded diode that measures a value in Celsius or Fahrenheit with some 27 * accuracy and resolution. The most common forms of these are called Tj and 28 * Tcase for the junction and case temperature. The junction temperature is the 29 * one that comes up most inside of devices like a CPU as it looks at the 30 * temperature of the actual transistors inside the part. On AMD, these Tj 31 * sensors are often called Tdie, because they represent the temperature of a 32 * particular die. 33 * 34 * While this is represented as a single number, there are often numerous diodes 35 * that have some amount of post-processing applied to them from different 36 * sources that are used to combine and make up this number. 37 * 38 * While AMD has various Tdie sensors (we'll get back to them later), the 39 * primary thing that the CPU exposes and is used for overall health is quite 40 * different and called Tctl, the control temperature. Unlike normal sensors 41 * Tctl is not a measure of temperature in a traditional sense and is instead 42 * used as part of the processor's control loop and is a unitless quantity that 43 * ranges between 0 and 100. 
There are two notable thresholds: 44 * 45 * 1) At a value of 95, the CPU will begin internal thermal throttling. 46 * 2) At a value of 100, after some period of time the CPU will shut down. This 47 * likely involves asserting the THERMTRIP_L signal, which is a dedicated pin on 48 * the CPU socket. 49 * 50 * It's notable that this value is calculated and has various slew rates 51 * applied. While for a few Zen 1 ThreadRipper CPUs, there was a suggestion from 52 * the Ryzen Master software that there was a straightforward relationship 53 * between Tctl and Tdie, we've found that this isn't quite true in practice and 54 * that it's not helpful to try to convert Tctl to Tdie. There is no simple way 55 * to do so. As such, we don't pretend to do so anymore, though we did in an 56 * earlier life of this driver. The addition of the various CCD-specific sensors 57 * is an aid here. 58 * 59 * ------------------------------------- 60 * System Management Network and Sensors 61 * ------------------------------------- 62 * 63 * The SMN (system management network) exists on a per-die basis. That is, there 64 * is one for each I/O die and connected devices in the system. In the context 65 * of Zen 2+, there is usually only a single SMN network per socket. In Zen 1, 66 * there was one for each Zeppelin die, which combined both the core complexes 67 * and I/O. See uts/intel/os/cpuid.c for more background here. 68 * 69 * As a result of this split there are two different groups of sensors that 70 * exist within a single die: 71 * 72 * 1) SMU::THM::THM_TCON_CUR_TMP provides Tctl for the overall I/O die and 73 * connected components. This is the unitless measurement mentioned above. The 74 * aforementioned register is a shadow of whatever the die actually maintains 75 * and is read-only for all intents and purposes for us due to its nature as a 76 * shadow, despite what the PPR says. 77 * 78 * 2) SMU::THM::THM_DIEx_TEMP provides Tdie for a single die. 
Unlike Tctl, this 79 * is a valid measurement in degrees Celsius. Notably, this is also a shadow 80 * register that is updated by the SMU, while each die has its own underlying 81 * diodes and control temperature calculations that are performed. There are 82 * generally a fixed number of these die sensors at given offsets on the CPU. 83 * These are sourced by the thermal monitor and have a valid bit. The general 84 * assumption is that there is a 1:1 mapping on CPUs and APUs to CCDs. 85 * 86 * ------------------- 87 * Sensor Organization 88 * ------------------- 89 * 90 * The driver uses DDI_NT_SENSOR_TEMP_CPU, which will put us in the 91 * /dev/sensors/cpu directory. Each DF maps to the cpuid.c procnode concept. The 92 * Tctl sensor is named 'procnode.%u'. The Tdie sensors are named 93 * 'procnode.%u:die.%u'. This allows us to have them exist on a per-DF basis. 94 * The expectation is that consumers who care will make the assumption that 95 * these are CCD-specific sensors rather than this driver itself. 96 * 97 * To represent this, the driver, which is rooted in the smntemp_t structure, 98 * the smntemp_data global, contains a number of smntemp_df_t structures. One 99 * for each df that exists. Each DF contains one smntemp_temp_t structure that 100 * represents Tctl and a variable number of Tdie sensors based on how many the 101 * SoC supports. 102 * 103 * Because of our desire not to assume that these are specifically CCD sensors 104 * here (though they realistically speaking are), we don't try iterating the 105 * CCDs as a way to scope which Tdie sensors exist and instead leverage the 106 * valid bit that they have to determine which ksensors to create. 
107 */ 108 109 #include <sys/modctl.h> 110 #include <sys/conf.h> 111 #include <sys/devops.h> 112 #include <sys/types.h> 113 #include <sys/cred.h> 114 #include <sys/ddi.h> 115 #include <sys/sunddi.h> 116 #include <sys/cmn_err.h> 117 #include <sys/stdbool.h> 118 #include <sys/x86_archext.h> 119 #include <sys/cpuvar.h> 120 #include <sys/sensors.h> 121 #include <sys/sysmacros.h> 122 #include <sys/amdzen/smn.h> 123 #include <sys/amdzen/thm.h> 124 #include <amdzen_client.h> 125 126 typedef enum { 127 SMNTEMP_F_MUTEX = 1 << 0, 128 SMNTEMP_F_VALID = 1 << 1 129 } smntemp_flags_t; 130 131 typedef enum { 132 SMNTEMP_K_TCTL = 1, 133 SMNTEMP_K_TDIE 134 } smntemp_kind_t; 135 136 typedef struct smntemp_temp smntemp_temp_t; 137 typedef struct smntemp_df smntemp_df_t; 138 typedef struct smntemp smntemp_t; 139 140 /* 141 * This represents the per-temperature data that we keep around per exposed 142 * ksensor. 143 */ 144 struct smntemp_temp { 145 smntemp_kind_t stt_kind; 146 smntemp_df_t *stt_df; 147 smn_reg_t stt_reg; 148 smntemp_flags_t stt_flags; 149 id_t stt_ksensor; 150 kmutex_t stt_mutex; 151 hrtime_t stt_last_read; 152 uint32_t stt_raw; 153 int64_t stt_temp; 154 }; 155 156 /* 157 * This represents a single DF in the system and contains all of the temperature 158 * sensors for it, both its Tctl and however many Tdie exist. 159 */ 160 struct smntemp_df { 161 uint32_t sd_dfno; 162 smntemp_temp_t sd_tctl; 163 uint32_t sd_nccd; 164 uint32_t sd_nccd_valid; 165 smntemp_temp_t *sd_tdie; 166 }; 167 168 /* 169 * Primary driver state structure. 170 */ 171 struct smntemp { 172 dev_info_t *smn_dip; 173 x86_processor_family_t smn_fam; 174 uint_t smn_ndf; 175 smntemp_df_t *smn_df; 176 }; 177 178 static smntemp_t smntemp_data; 179 180 /* 181 * Determine if the "temperature" requires adjustment in some form. Tdie is 182 * always adjusted. Tctl may in two different circumstances: 183 * 184 * (1) If the range bit, 'THM_CURTEMP_GET_RANGE' is set. 185 * (2) if the mode is set to r/w. 
While the former is made much more explicit, 186 * the latter is something that AMD has suggested, but hasn't been formally 187 * documented in the PPR. However, experimentally this has proven to hold. 188 */ 189 static int64_t 190 smntemp_temp_adjust(smntemp_temp_t *stt) 191 { 192 if (stt->stt_kind == SMNTEMP_K_TDIE) { 193 return (THM_CURTEMP_RANGE_ADJ); 194 } 195 196 if (THM_CURTEMP_GET_RANGE(stt->stt_raw) == THM_CURTEMP_RANGE_N49_206 || 197 THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) == THM_CURTEMP_TJ_SEL_RW) { 198 return (THM_CURTEMP_RANGE_ADJ); 199 } 200 201 return (0); 202 } 203 204 static int 205 smntemp_temp_update(smntemp_temp_t *stt) 206 { 207 int ret; 208 uint32_t reg; 209 int64_t raw, decimal; 210 211 ASSERT(MUTEX_HELD((&stt->stt_mutex))); 212 213 if ((ret = amdzen_c_smn_read(stt->stt_df->sd_dfno, stt->stt_reg, 214 ®)) != 0) { 215 return (ret); 216 } 217 218 stt->stt_last_read = gethrtime(); 219 stt->stt_raw = reg; 220 if (stt->stt_kind == SMNTEMP_K_TCTL) { 221 raw = THM_CURTEMP_GET_TEMP(reg); 222 } else { 223 raw = THM_DIE_GET_TEMP(reg); 224 } 225 226 decimal = raw & THM_CURTEMP_TEMP_DEC_MASK; 227 raw = raw >> THM_CURTEMP_TEMP_DEC_BITS; 228 raw += smntemp_temp_adjust(stt); 229 230 stt->stt_temp = raw << THM_CURTEMP_TEMP_DEC_BITS; 231 stt->stt_temp += decimal; 232 233 return (0); 234 } 235 236 static uint32_t 237 smntemp_temp_unit(smntemp_temp_t *stt) 238 { 239 ASSERT(MUTEX_HELD(&stt->stt_mutex)); 240 241 if (stt->stt_kind == SMNTEMP_K_TDIE) { 242 return (SENSOR_UNIT_CELSIUS); 243 } else if (THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) == 244 THM_CURTEMP_TJ_SEL_TJ) { 245 return (SENSOR_UNIT_CELSIUS); 246 } else { 247 return (SENSOR_UNIT_NONE); 248 } 249 } 250 251 static int 252 smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp) 253 { 254 int ret; 255 smntemp_temp_t *stt = arg; 256 257 mutex_enter(&stt->stt_mutex); 258 if ((ret = smntemp_temp_update(stt)) != 0) { 259 mutex_exit(&stt->stt_mutex); 260 return (ret); 261 } 262 263 temp->sis_unit = 
smntemp_temp_unit(stt); 264 temp->sis_value = stt->stt_temp; 265 /* This is the same between Tctl and Tdie */ 266 temp->sis_gran = THM_CURTEMP_TEMP_DEC_GRAN; 267 mutex_exit(&stt->stt_mutex); 268 269 return (0); 270 } 271 272 /* 273 * Because Tctl is usually a control temperature, but isn't guaranteed, we 274 * cannot use a stock ksensor function and must implement this ourselves. 275 */ 276 static int 277 smntemp_temp_kind(void *arg, sensor_ioctl_kind_t *kind) 278 { 279 smntemp_temp_t *stt = arg; 280 281 if (stt->stt_kind == SMNTEMP_K_TDIE) { 282 kind->sik_kind = SENSOR_KIND_TEMPERATURE; 283 return (0); 284 } 285 286 mutex_enter(&stt->stt_mutex); 287 if (stt->stt_raw == 0) { 288 int ret = smntemp_temp_update(stt); 289 if (ret != 0) { 290 mutex_exit(&stt->stt_mutex); 291 return (ret); 292 } 293 } 294 295 if (THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) == THM_CURTEMP_TJ_SEL_TJ) { 296 kind->sik_kind = SENSOR_KIND_TEMPERATURE; 297 } else { 298 kind->sik_kind = SENSOR_KIND_SYNTHETIC; 299 kind->sik_derive = SENSOR_KIND_TEMPERATURE; 300 } 301 302 mutex_exit(&stt->stt_mutex); 303 return (0); 304 } 305 306 static const ksensor_ops_t smntemp_temp_ops = { 307 .kso_kind = smntemp_temp_kind, 308 .kso_scalar = smntemp_temp_read 309 }; 310 311 static bool 312 smntemp_create_tdie(smntemp_t *smn, smntemp_df_t *df, smntemp_temp_t *temp, 313 uint32_t ccdno) 314 { 315 int ret; 316 uint32_t val; 317 char buf[128]; 318 319 temp->stt_kind = SMNTEMP_K_TDIE; 320 temp->stt_df = df; 321 temp->stt_reg = THM_DIE(ccdno, smn->smn_fam); 322 mutex_init(&temp->stt_mutex, NULL, MUTEX_DRIVER, NULL); 323 temp->stt_flags = SMNTEMP_F_MUTEX; 324 325 /* 326 * Tdie sensors have a valid bit that we need to check before we 327 * register with the ksensor framework. 
328 */ 329 if (snprintf(buf, sizeof (buf), "procnode.%u.die.%u", df->sd_dfno, 330 ccdno) >= sizeof (buf)) { 331 dev_err(smn->smn_dip, CE_WARN, "!unexpected buffer name " 332 "overrun assembling DF/CCD %u/%u Tdie", df->sd_dfno, 333 ccdno); 334 return (false); 335 } 336 337 if ((ret = amdzen_c_smn_read(temp->stt_df->sd_dfno, temp->stt_reg, 338 &val)) != 0) { 339 dev_err(smn->smn_dip, CE_WARN, "!unexpected SMN read failure " 340 "reading DF/CCD %u/%u Tdie: %d", df->sd_dfno, ccdno, ret); 341 return (false); 342 } 343 344 /* 345 * Tdie sensors have a valid bit in them. We more or less assume that 346 * this valid bit is set by the SMU early in life and remains valid 347 * throughout a given system boot. 348 */ 349 if (THM_DIE_GET_VALID(val) == 0) { 350 return (true); 351 } 352 353 df->sd_nccd_valid++; 354 temp->stt_flags |= SMNTEMP_F_VALID; 355 356 if ((ret = ksensor_create(smn->smn_dip, &smntemp_temp_ops, temp, buf, 357 DDI_NT_SENSOR_TEMP_CPU, &temp->stt_ksensor)) != 0) { 358 dev_err(smn->smn_dip, CE_WARN, "!failed to create sensor %s: " 359 "%d", buf, ret); 360 return (false); 361 } 362 363 return (true); 364 } 365 366 static bool 367 smntemp_create_tctl(smntemp_t *smn, smntemp_df_t *df, smntemp_temp_t *temp) 368 { 369 int ret; 370 char buf[128]; 371 372 temp->stt_kind = SMNTEMP_K_TCTL; 373 temp->stt_df = df; 374 temp->stt_reg = THM_CURTEMP; 375 mutex_init(&temp->stt_mutex, NULL, MUTEX_DRIVER, NULL); 376 temp->stt_flags = SMNTEMP_F_VALID | SMNTEMP_F_MUTEX; 377 378 if (snprintf(buf, sizeof (buf), "procnode.%u", df->sd_dfno) >= 379 sizeof (buf)) { 380 dev_err(smn->smn_dip, CE_WARN, "!unexpected buffer name " 381 "overrun assembling DF %u Tctl", df->sd_dfno); 382 return (false); 383 } 384 385 if ((ret = ksensor_create(smn->smn_dip, &smntemp_temp_ops, temp, buf, 386 DDI_NT_SENSOR_TEMP_CPU, &temp->stt_ksensor)) != 0) { 387 dev_err(smn->smn_dip, CE_WARN, "!failed to create sensor %s: " 388 "%d", buf, ret); 389 return (false); 390 } 391 392 return (true); 393 } 394 395 
static void 396 smntemp_cleanup_temp(smntemp_temp_t *temp) 397 { 398 temp->stt_flags &= ~SMNTEMP_F_VALID; 399 if ((temp->stt_flags & SMNTEMP_F_MUTEX) != 0) { 400 mutex_destroy(&temp->stt_mutex); 401 temp->stt_flags &= ~SMNTEMP_F_MUTEX; 402 } 403 ASSERT0(temp->stt_flags); 404 } 405 406 static void 407 smntemp_cleanup(smntemp_t *smn) 408 { 409 (void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS); 410 411 for (uint32_t dfno = 0; dfno < smn->smn_ndf; dfno++) { 412 smntemp_df_t *df = &smn->smn_df[dfno]; 413 smntemp_cleanup_temp(&df->sd_tctl); 414 for (uint32_t ccdno = 0; ccdno < df->sd_nccd; ccdno++) { 415 smntemp_cleanup_temp(&df->sd_tdie[ccdno]); 416 } 417 418 if (df->sd_nccd > 0) { 419 kmem_free(df->sd_tdie, df->sd_nccd * 420 sizeof (smntemp_temp_t)); 421 df->sd_nccd = 0; 422 df->sd_tdie = NULL; 423 } 424 } 425 if (smn->smn_ndf > 0) { 426 kmem_free(smn->smn_df, sizeof (smntemp_df_t) * smn->smn_ndf); 427 smn->smn_ndf = 0; 428 smn->smn_df = NULL; 429 } 430 431 if (smn->smn_dip != NULL) { 432 ddi_remove_minor_node(smn->smn_dip, NULL); 433 ddi_set_driver_private(smn->smn_dip, NULL); 434 smn->smn_dip = NULL; 435 } 436 } 437 438 static int 439 smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 440 { 441 smntemp_t *smntemp = &smntemp_data; 442 443 if (cmd == DDI_RESUME) { 444 return (DDI_SUCCESS); 445 } else if (cmd != DDI_ATTACH) { 446 return (DDI_FAILURE); 447 } 448 449 if (smntemp->smn_dip != NULL) { 450 dev_err(dip, CE_WARN, "!smntemp already attached"); 451 return (DDI_FAILURE); 452 } 453 smntemp->smn_dip = dip; 454 smntemp->smn_fam = chiprev_family(cpuid_getchiprev(CPU)); 455 456 /* 457 * First account for each actual DF instance. Then determine the number 458 * of CCD entries we need to care about per SoC. 
459 */ 460 smntemp->smn_ndf = amdzen_c_df_count(); 461 if (smntemp->smn_ndf == 0) { 462 dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp"); 463 goto err; 464 } 465 smntemp->smn_df = kmem_zalloc(sizeof (smntemp_df_t) * smntemp->smn_ndf, 466 KM_SLEEP); 467 for (uint32_t dfno = 0; dfno < smntemp->smn_ndf; dfno++) { 468 smntemp_df_t *df = &smntemp->smn_df[dfno]; 469 df->sd_dfno = dfno; 470 df->sd_nccd = THM_DIE_MAX_UNITS(smntemp->smn_fam); 471 472 if (!smntemp_create_tctl(smntemp, df, &df->sd_tctl)) { 473 goto err; 474 } 475 476 if (df->sd_nccd > 0) { 477 df->sd_tdie = kmem_zalloc(sizeof (smntemp_temp_t) * 478 df->sd_nccd, KM_SLEEP); 479 } 480 481 for (uint32_t i = 0; i < df->sd_nccd; i++) { 482 if (!smntemp_create_tdie(smntemp, df, 483 &df->sd_tdie[i], i)) { 484 goto err; 485 } 486 } 487 } 488 489 ddi_set_driver_private(dip, smntemp); 490 return (DDI_SUCCESS); 491 492 err: 493 smntemp_cleanup(smntemp); 494 return (DDI_FAILURE); 495 } 496 497 static int 498 smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 499 { 500 smntemp_t *smntemp = &smntemp_data; 501 502 if (cmd == DDI_SUSPEND) { 503 return (DDI_SUCCESS); 504 } else if (cmd != DDI_DETACH) { 505 return (DDI_FAILURE); 506 } 507 508 if (smntemp->smn_dip == NULL) { 509 dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn " 510 "instance %d that was never attached", 511 ddi_get_instance(dip)); 512 return (DDI_FAILURE); 513 } 514 515 smntemp_cleanup(smntemp); 516 return (DDI_SUCCESS); 517 } 518 519 static struct dev_ops smntemp_dev_ops = { 520 .devo_rev = DEVO_REV, 521 .devo_refcnt = 0, 522 .devo_getinfo = nodev, 523 .devo_identify = nulldev, 524 .devo_probe = nulldev, 525 .devo_attach = smntemp_attach, 526 .devo_detach = smntemp_detach, 527 .devo_reset = nodev, 528 .devo_quiesce = ddi_quiesce_not_needed, 529 }; 530 531 static struct modldrv smntemp_modldrv = { 532 .drv_modops = &mod_driverops, 533 .drv_linkinfo = "AMD SMN Temperature Driver", 534 .drv_dev_ops = &smntemp_dev_ops 535 }; 536 537 static 
struct modlinkage smntemp_modlinkage = { 538 .ml_rev = MODREV_1, 539 .ml_linkage = { &smntemp_modldrv, NULL } 540 }; 541 542 int 543 _init(void) 544 { 545 return (mod_install(&smntemp_modlinkage)); 546 } 547 548 int 549 _info(struct modinfo *modinfop) 550 { 551 return (mod_info(&smntemp_modlinkage, modinfop)); 552 } 553 554 int 555 _fini(void) 556 { 557 return (mod_remove(&smntemp_modlinkage)); 558 } 559