1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019, Joyent, Inc.
14 * Copyright 2024 Oxide Computer Company
15 */
16
17 /*
18 * This implements a temperature sensor for AMD Zen family products that rely
19 * upon the SMN framework for getting temperature information.
20 *
21 * ----------
22 * Background
23 * ----------
24 *
25 * When we think of temperature sensors, we generally think of an external or
26 * embedded diode that measures a value in Celsius or Fahrenheit with some
27 * accuracy and resolution. The most common forms of these are called Tj and
28 * Tcase for the junction and case temperature. The junction temperature is the
29 * one that comes up most inside of devices like a CPU as it looks at the
30 * temperature of the actual transistors inside the part. On AMD, these Tj
31 * sensors are often called Tdie, because they represent the temperature of a
32 * particular die.
33 *
34 * While this is represented as a single number, there are often numerous diodes
35 * that have some amount of post-processing applied to them from different
36 * sources that are used to combine and make up this number.
37 *
38 * While AMD has various Tdie sensors (we'll get back to them later), the
39 * primary thing that the CPU exposes and is used for overall health is quite
40 * different and called Tctl, the control temperature. Unlike normal sensors
41 * Tctl is not a measure of temperature in a traditional sense and is instead
42 * used as part of the processor's control loop and is a unitless quantity that
43 * ranges between 0 and 100. There are two notable thresholds:
44 *
45 * 1) At a value of 95, the CPU will begin internal thermal throttling.
46 * 2) At a value of 100, after some period of time the CPU will shutdown. This
47 * likely involves asserting the THERMTRIP_L signal, which is a dedicated pin on
48 * the CPU socket.
49 *
50 * It's notable that this value is calculated and has various slew rates
51 * applied. While for a few Zen 1 ThreadRipper CPUs, there was a suggestion from
52 * the Ryzen Master software that there was a straightforward relationship
53 * between Tctl and Tdie, we've found that this isn't quite true in practice and
54 * that it's not helpful to try to convert Tctl to Tdie. There is no simple way
55 * to do so. As such, we don't pretend to do so anymore, though we did in an
56 * earlier life of this driver. The addition of the various CCD-specific sensors
57 * is an aid here.
58 *
59 * -------------------------------------
60 * System Management Network and Sensors
61 * -------------------------------------
62 *
63 * The SMN (system management network) exists on a per-die basis. That is there
64 * is one for each I/O die and connected devices in the system. In the context
65 * of Zen 2+, there is usually only a single SMN network per socket. In Zen 1,
 * there was one for each Zeppelin die, which combined both the core complexes
67 * and I/O. See uts/intel/os/cpuid.c for more background here.
68 *
69 * As a result of this split there are two different groups of sensors that
70 * exist within a single die:
71 *
72 * 1) SMU::THM::THM_TCON_CUR_TMP provides Tctl for the overall I/O die and
73 * connected components. This is the unitless measurement mentioned above. The
74 * aforementioned register is a shadow of whatever the die actually maintains
75 * and is read-only for all intents and purposes for us due to its nature as a
76 * shadow, despite what the PPR says.
77 *
78 * 2) SMU::THM::THM_DIEx_TEMP provides Tdie for a single die. Unlike Tctl, this
79 * is a valid measurement in degrees Celsius. Notably, this is also a shadow
80 * register that is updated by the SMU, while each die has its own underlying
81 * diodes and control temperature calculations that are performed. There are
82 * generally a fixed number of these die sensors at given offsets on the CPU.
83 * These are sourced by the thermal monitor and have a valid bit. The general
84 * assumption is that there is a 1:1 mapping on CPUs and APUs to CCDs.
85 *
86 * -------------------
87 * Sensor Organization
88 * -------------------
89 *
90 * The driver uses DDI_NT_SENSOR_TEMP_CPU, which will put us in the
91 * /dev/sensors/cpu directory. Each DF maps to the cpuid.c procnode concept. The
92 * Tctl sensor is named 'procnode.%u'. The Tdie sensors are named
93 * 'procnode.%u:die.%u'. This allows us to have them exist on a per-DF basis.
94 * The expectation is that consumers who care will make the assumption that
95 * these are CCD-specific sensors rather than this driver itself.
96 *
97 * To represent this, the driver, which is rooted in the smntemp_t structure,
98 * the smntemp_data global, contains a number of smntemp_df_t structures. One
99 * for each df that exists. Each DF contains one smntemp_temp_t structure that
100 * represents Tctl and a variable number of Tdie sensors based on how many the
101 * SoC supports.
102 *
103 * Because of our desire not to assume that these are specifically CCD sensors
104 * here (though they realistically speaking are), we don't try iterating the
105 * CCDs as a way to scope which Tdie sensors exist and instead leverage the
106 * valid bit that they have to determine which ksensors to create.
107 */
108
109 #include <sys/modctl.h>
110 #include <sys/conf.h>
111 #include <sys/devops.h>
112 #include <sys/types.h>
113 #include <sys/cred.h>
114 #include <sys/ddi.h>
115 #include <sys/sunddi.h>
116 #include <sys/cmn_err.h>
117 #include <sys/stdbool.h>
118 #include <sys/x86_archext.h>
119 #include <sys/cpuvar.h>
120 #include <sys/sensors.h>
121 #include <sys/sysmacros.h>
122 #include <sys/amdzen/smn.h>
123 #include <sys/amdzen/thm.h>
124 #include <amdzen_client.h>
125
/*
 * State bits kept in a sensor's stt_flags.
 */
typedef enum {
	/* stt_mutex has been initialized and must be destroyed on cleanup. */
	SMNTEMP_F_MUTEX	= 0x01,
	/* Always set for Tctl; set for Tdie only when its valid bit is set. */
	SMNTEMP_F_VALID	= 0x02
} smntemp_flags_t;
130
/*
 * Which flavor of reading a given sensor structure describes.
 */
typedef enum {
	/* The per-DF control value, read from THM_CURTEMP. */
	SMNTEMP_K_TCTL = 1,
	/* A per-die temperature, read from one of the THM_DIE registers. */
	SMNTEMP_K_TDIE = 2
} smntemp_kind_t;
135
/*
 * Forward type names for the driver's structures so that they can refer to
 * each other before their full definitions appear.
 */
typedef struct smntemp smntemp_t;
typedef struct smntemp_df smntemp_df_t;
typedef struct smntemp_temp smntemp_temp_t;
139
140 /*
141 * This represents the per-temperature data that we keep around per exposed
142 * ksensor.
143 */
144 struct smntemp_temp {
145 smntemp_kind_t stt_kind;
146 smntemp_df_t *stt_df;
147 smn_reg_t stt_reg;
148 smntemp_flags_t stt_flags;
149 id_t stt_ksensor;
150 kmutex_t stt_mutex;
151 hrtime_t stt_last_read;
152 uint32_t stt_raw;
153 int64_t stt_temp;
154 };
155
156 /*
157 * This represents a single DF in the system and contains all of the temperature
158 * sensors for it, both its Tctl and however many Tdie exist.
159 */
160 struct smntemp_df {
161 uint32_t sd_dfno;
162 smntemp_temp_t sd_tctl;
163 uint32_t sd_nccd;
164 uint32_t sd_nccd_valid;
165 smntemp_temp_t *sd_tdie;
166 };
167
168 /*
169 * Primary driver state structure.
170 */
171 struct smntemp {
172 dev_info_t *smn_dip;
173 x86_processor_family_t smn_fam;
174 uint_t smn_ndf;
175 smntemp_df_t *smn_df;
176 };
177
178 static smntemp_t smntemp_data;
179
180 /*
181 * Determine if the "temperature" requires adjustment in some form. Tdie is
182 * always adjusted. Tctl may in two different circumstances:
183 *
184 * (1) If the range bit, 'THM_CURTEMP_GET_RANGE' is set.
185 * (2) if the mode is set to r/w. While the former is made much more explicit,
186 * the latter is something that AMD has suggested, but hasn't been formally
187 * documented in the PPR. However, experimentally this has proven to hold.
188 */
189 static int64_t
smntemp_temp_adjust(smntemp_temp_t * stt)190 smntemp_temp_adjust(smntemp_temp_t *stt)
191 {
192 if (stt->stt_kind == SMNTEMP_K_TDIE) {
193 return (THM_CURTEMP_RANGE_ADJ);
194 }
195
196 if (THM_CURTEMP_GET_RANGE(stt->stt_raw) == THM_CURTEMP_RANGE_N49_206 ||
197 THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) == THM_CURTEMP_TJ_SEL_RW) {
198 return (THM_CURTEMP_RANGE_ADJ);
199 }
200
201 return (0);
202 }
203
204 static int
smntemp_temp_update(smntemp_temp_t * stt)205 smntemp_temp_update(smntemp_temp_t *stt)
206 {
207 int ret;
208 uint32_t reg;
209 int64_t raw, decimal;
210
211 ASSERT(MUTEX_HELD((&stt->stt_mutex)));
212
213 if ((ret = amdzen_c_smn_read(stt->stt_df->sd_dfno, stt->stt_reg,
214 ®)) != 0) {
215 return (ret);
216 }
217
218 stt->stt_last_read = gethrtime();
219 stt->stt_raw = reg;
220 if (stt->stt_kind == SMNTEMP_K_TCTL) {
221 raw = THM_CURTEMP_GET_TEMP(reg);
222 } else {
223 raw = THM_DIE_GET_TEMP(reg);
224 }
225
226 decimal = raw & THM_CURTEMP_TEMP_DEC_MASK;
227 raw = raw >> THM_CURTEMP_TEMP_DEC_BITS;
228 raw += smntemp_temp_adjust(stt);
229
230 stt->stt_temp = raw << THM_CURTEMP_TEMP_DEC_BITS;
231 stt->stt_temp += decimal;
232
233 return (0);
234 }
235
236 static uint32_t
smntemp_temp_unit(smntemp_temp_t * stt)237 smntemp_temp_unit(smntemp_temp_t *stt)
238 {
239 ASSERT(MUTEX_HELD(&stt->stt_mutex));
240
241 if (stt->stt_kind == SMNTEMP_K_TDIE) {
242 return (SENSOR_UNIT_CELSIUS);
243 } else if (THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) ==
244 THM_CURTEMP_TJ_SEL_TJ) {
245 return (SENSOR_UNIT_CELSIUS);
246 } else {
247 return (SENSOR_UNIT_NONE);
248 }
249 }
250
251 static int
smntemp_temp_read(void * arg,sensor_ioctl_scalar_t * temp)252 smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp)
253 {
254 int ret;
255 smntemp_temp_t *stt = arg;
256
257 mutex_enter(&stt->stt_mutex);
258 if ((ret = smntemp_temp_update(stt)) != 0) {
259 mutex_exit(&stt->stt_mutex);
260 return (ret);
261 }
262
263 temp->sis_unit = smntemp_temp_unit(stt);
264 temp->sis_value = stt->stt_temp;
265 /* This is the same between Tctl and Tdie */
266 temp->sis_gran = THM_CURTEMP_TEMP_DEC_GRAN;
267 mutex_exit(&stt->stt_mutex);
268
269 return (0);
270 }
271
272 /*
273 * Because Tctl is usually a control temperature, but isn't guaranteed, we
274 * cannot use a stock ksensor function and must implement this ourselves.
275 */
276 static int
smntemp_temp_kind(void * arg,sensor_ioctl_kind_t * kind)277 smntemp_temp_kind(void *arg, sensor_ioctl_kind_t *kind)
278 {
279 smntemp_temp_t *stt = arg;
280
281 if (stt->stt_kind == SMNTEMP_K_TDIE) {
282 kind->sik_kind = SENSOR_KIND_TEMPERATURE;
283 return (0);
284 }
285
286 mutex_enter(&stt->stt_mutex);
287 if (stt->stt_raw == 0) {
288 int ret = smntemp_temp_update(stt);
289 if (ret != 0) {
290 mutex_exit(&stt->stt_mutex);
291 return (ret);
292 }
293 }
294
295 if (THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) == THM_CURTEMP_TJ_SEL_TJ) {
296 kind->sik_kind = SENSOR_KIND_TEMPERATURE;
297 } else {
298 kind->sik_kind = SENSOR_KIND_SYNTHETIC;
299 kind->sik_derive = SENSOR_KIND_TEMPERATURE;
300 }
301
302 mutex_exit(&stt->stt_mutex);
303 return (0);
304 }
305
306 static const ksensor_ops_t smntemp_temp_ops = {
307 .kso_kind = smntemp_temp_kind,
308 .kso_scalar = smntemp_temp_read
309 };
310
311 static bool
smntemp_create_tdie(smntemp_t * smn,smntemp_df_t * df,smntemp_temp_t * temp,uint32_t ccdno)312 smntemp_create_tdie(smntemp_t *smn, smntemp_df_t *df, smntemp_temp_t *temp,
313 uint32_t ccdno)
314 {
315 int ret;
316 uint32_t val;
317 char buf[128];
318
319 temp->stt_kind = SMNTEMP_K_TDIE;
320 temp->stt_df = df;
321 temp->stt_reg = THM_DIE(ccdno, smn->smn_fam);
322 mutex_init(&temp->stt_mutex, NULL, MUTEX_DRIVER, NULL);
323 temp->stt_flags = SMNTEMP_F_MUTEX;
324
325 /*
326 * Tdie sensors have a valid bit that we need to check before we
327 * register with the ksensor framework.
328 */
329 if (snprintf(buf, sizeof (buf), "procnode.%u.die.%u", df->sd_dfno,
330 ccdno) >= sizeof (buf)) {
331 dev_err(smn->smn_dip, CE_WARN, "!unexpected buffer name "
332 "overrun assembling DF/CCD %u/%u Tdie", df->sd_dfno,
333 ccdno);
334 return (false);
335 }
336
337 if ((ret = amdzen_c_smn_read(temp->stt_df->sd_dfno, temp->stt_reg,
338 &val)) != 0) {
339 dev_err(smn->smn_dip, CE_WARN, "!unexpected SMN read failure "
340 "reading DF/CCD %u/%u Tdie: %d", df->sd_dfno, ccdno, ret);
341 return (false);
342 }
343
344 /*
345 * Tdie sensors have a valid bit in them. We more or less assume that
346 * this valid bit is set by the SMU early in life and remains valid
347 * throughout a given system boot.
348 */
349 if (THM_DIE_GET_VALID(val) == 0) {
350 return (true);
351 }
352
353 df->sd_nccd_valid++;
354 temp->stt_flags |= SMNTEMP_F_VALID;
355
356 if ((ret = ksensor_create(smn->smn_dip, &smntemp_temp_ops, temp, buf,
357 DDI_NT_SENSOR_TEMP_CPU, &temp->stt_ksensor)) != 0) {
358 dev_err(smn->smn_dip, CE_WARN, "!failed to create sensor %s: "
359 "%d", buf, ret);
360 return (false);
361 }
362
363 return (true);
364 }
365
366 static bool
smntemp_create_tctl(smntemp_t * smn,smntemp_df_t * df,smntemp_temp_t * temp)367 smntemp_create_tctl(smntemp_t *smn, smntemp_df_t *df, smntemp_temp_t *temp)
368 {
369 int ret;
370 char buf[128];
371
372 temp->stt_kind = SMNTEMP_K_TCTL;
373 temp->stt_df = df;
374 temp->stt_reg = THM_CURTEMP;
375 mutex_init(&temp->stt_mutex, NULL, MUTEX_DRIVER, NULL);
376 temp->stt_flags = SMNTEMP_F_VALID | SMNTEMP_F_MUTEX;
377
378 if (snprintf(buf, sizeof (buf), "procnode.%u", df->sd_dfno) >=
379 sizeof (buf)) {
380 dev_err(smn->smn_dip, CE_WARN, "!unexpected buffer name "
381 "overrun assembling DF %u Tctl", df->sd_dfno);
382 return (false);
383 }
384
385 if ((ret = ksensor_create(smn->smn_dip, &smntemp_temp_ops, temp, buf,
386 DDI_NT_SENSOR_TEMP_CPU, &temp->stt_ksensor)) != 0) {
387 dev_err(smn->smn_dip, CE_WARN, "!failed to create sensor %s: "
388 "%d", buf, ret);
389 return (false);
390 }
391
392 return (true);
393 }
394
395 static void
smntemp_cleanup_temp(smntemp_temp_t * temp)396 smntemp_cleanup_temp(smntemp_temp_t *temp)
397 {
398 temp->stt_flags &= ~SMNTEMP_F_VALID;
399 if ((temp->stt_flags & SMNTEMP_F_MUTEX) != 0) {
400 mutex_destroy(&temp->stt_mutex);
401 temp->stt_flags &= ~SMNTEMP_F_MUTEX;
402 }
403 ASSERT0(temp->stt_flags);
404 }
405
406 static void
smntemp_cleanup(smntemp_t * smn)407 smntemp_cleanup(smntemp_t *smn)
408 {
409 (void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS);
410
411 for (uint32_t dfno = 0; dfno < smn->smn_ndf; dfno++) {
412 smntemp_df_t *df = &smn->smn_df[dfno];
413 smntemp_cleanup_temp(&df->sd_tctl);
414 for (uint32_t ccdno = 0; ccdno < df->sd_nccd; ccdno++) {
415 smntemp_cleanup_temp(&df->sd_tdie[ccdno]);
416 }
417
418 if (df->sd_nccd > 0) {
419 kmem_free(df->sd_tdie, df->sd_nccd *
420 sizeof (smntemp_temp_t));
421 df->sd_nccd = 0;
422 df->sd_tdie = NULL;
423 }
424 }
425 if (smn->smn_ndf > 0) {
426 kmem_free(smn->smn_df, sizeof (smntemp_df_t) * smn->smn_ndf);
427 smn->smn_ndf = 0;
428 smn->smn_df = NULL;
429 }
430
431 if (smn->smn_dip != NULL) {
432 ddi_remove_minor_node(smn->smn_dip, NULL);
433 ddi_set_driver_private(smn->smn_dip, NULL);
434 smn->smn_dip = NULL;
435 }
436 }
437
438 static int
smntemp_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)439 smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
440 {
441 smntemp_t *smntemp = &smntemp_data;
442
443 if (cmd == DDI_RESUME) {
444 return (DDI_SUCCESS);
445 } else if (cmd != DDI_ATTACH) {
446 return (DDI_FAILURE);
447 }
448
449 if (smntemp->smn_dip != NULL) {
450 dev_err(dip, CE_WARN, "!smntemp already attached");
451 return (DDI_FAILURE);
452 }
453 smntemp->smn_dip = dip;
454 smntemp->smn_fam = chiprev_family(cpuid_getchiprev(CPU));
455
456 /*
457 * First account for each actual DF instance. Then determine the number
458 * of CCD entries we need to care about per SoC.
459 */
460 smntemp->smn_ndf = amdzen_c_df_count();
461 if (smntemp->smn_ndf == 0) {
462 dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp");
463 goto err;
464 }
465 smntemp->smn_df = kmem_zalloc(sizeof (smntemp_df_t) * smntemp->smn_ndf,
466 KM_SLEEP);
467 for (uint32_t dfno = 0; dfno < smntemp->smn_ndf; dfno++) {
468 smntemp_df_t *df = &smntemp->smn_df[dfno];
469 df->sd_dfno = dfno;
470 df->sd_nccd = THM_DIE_MAX_UNITS(smntemp->smn_fam);
471
472 if (!smntemp_create_tctl(smntemp, df, &df->sd_tctl)) {
473 goto err;
474 }
475
476 if (df->sd_nccd > 0) {
477 df->sd_tdie = kmem_zalloc(sizeof (smntemp_temp_t) *
478 df->sd_nccd, KM_SLEEP);
479 }
480
481 for (uint32_t i = 0; i < df->sd_nccd; i++) {
482 if (!smntemp_create_tdie(smntemp, df,
483 &df->sd_tdie[i], i)) {
484 goto err;
485 }
486 }
487 }
488
489 ddi_set_driver_private(dip, smntemp);
490 return (DDI_SUCCESS);
491
492 err:
493 smntemp_cleanup(smntemp);
494 return (DDI_FAILURE);
495 }
496
497 static int
smntemp_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)498 smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
499 {
500 smntemp_t *smntemp = &smntemp_data;
501
502 if (cmd == DDI_SUSPEND) {
503 return (DDI_SUCCESS);
504 } else if (cmd != DDI_DETACH) {
505 return (DDI_FAILURE);
506 }
507
508 if (smntemp->smn_dip == NULL) {
509 dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn "
510 "instance %d that was never attached",
511 ddi_get_instance(dip));
512 return (DDI_FAILURE);
513 }
514
515 smntemp_cleanup(smntemp);
516 return (DDI_SUCCESS);
517 }
518
519 static struct dev_ops smntemp_dev_ops = {
520 .devo_rev = DEVO_REV,
521 .devo_refcnt = 0,
522 .devo_getinfo = nodev,
523 .devo_identify = nulldev,
524 .devo_probe = nulldev,
525 .devo_attach = smntemp_attach,
526 .devo_detach = smntemp_detach,
527 .devo_reset = nodev,
528 .devo_quiesce = ddi_quiesce_not_needed,
529 };
530
531 static struct modldrv smntemp_modldrv = {
532 .drv_modops = &mod_driverops,
533 .drv_linkinfo = "AMD SMN Temperature Driver",
534 .drv_dev_ops = &smntemp_dev_ops
535 };
536
537 static struct modlinkage smntemp_modlinkage = {
538 .ml_rev = MODREV_1,
539 .ml_linkage = { &smntemp_modldrv, NULL }
540 };
541
542 int
_init(void)543 _init(void)
544 {
545 return (mod_install(&smntemp_modlinkage));
546 }
547
548 int
_info(struct modinfo * modinfop)549 _info(struct modinfo *modinfop)
550 {
551 return (mod_info(&smntemp_modlinkage, modinfop));
552 }
553
554 int
_fini(void)555 _fini(void)
556 {
557 return (mod_remove(&smntemp_modlinkage));
558 }
559