xref: /illumos-gate/usr/src/uts/intel/io/amdzen/smntemp.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019, Joyent, Inc.
14  * Copyright 2024 Oxide Computer Company
15  */
16 
17 /*
18  * This implements a temperature sensor for AMD Zen family products that rely
19  * upon the SMN framework for getting temperature information.
20  *
21  * ----------
22  * Background
23  * ----------
24  *
25  * When we think of temperature sensors, we generally think of an external or
26  * embedded diode that measures a value in Celsius or Fahrenheit with some
27  * accuracy and resolution. The most common forms of these are called Tj and
28  * Tcase for the junction and case temperature. The junction temperature is the
29  * one that comes up most inside of devices like a CPU as it looks at the
30  * temperature of the actual transistors inside the part. On AMD, these Tj
31  * sensors are often called Tdie, because they represent the temperature of a
32  * particular die.
33  *
34  * While this is represented as a single number, there are often numerous diodes
35  * that have some amount of post-processing applied to them from different
36  * sources that are used to combine and make up this number.
37  *
38  * While AMD has various Tdie sensors (we'll get back to them later), the
39  * primary thing that the CPU exposes and is used for overall health is quite
40  * different and called Tctl, the control temperature. Unlike normal sensors
41  * Tctl is not a measure of temperature in a traditional sense and is instead
42  * used as part of the processor's control loop and is a unitless quantity that
43  * ranges between 0 and 100. There are two notable thresholds:
44  *
45  * 1) At a value of 95, the CPU will begin internal thermal throttling.
46  * 2) At a value of 100, after some period of time the CPU will shutdown. This
47  * likely involves asserting the THERMTRIP_L signal, which is a dedicated pin on
48  * the CPU socket.
49  *
50  * It's notable that this value is calculated and has various slew rates
51  * applied. While for a few Zen 1 ThreadRipper CPUs, there was a suggestion from
52  * the Ryzen Master software that there was a straightforward relationship
53  * between Tctl and Tdie, we've found that this isn't quite true in practice and
54  * that it's not helpful to try to convert Tctl to Tdie. There is no simple way
55  * to do so. As such, we don't pretend to do so anymore, though we did in an
56  * earlier life of this driver. The addition of the various CCD-specific sensors
57  * is an aid here.
58  *
59  * -------------------------------------
60  * System Management Network and Sensors
61  * -------------------------------------
62  *
63  * The SMN (system management network) exists on a per-die basis. That is there
64  * is one for each I/O die and connected devices in the system. In the context
65  * of Zen 2+, there is usually only a single SMN network per socket. In Zen 1,
66  * there was one for each Zeppelin die, which combined both the core complexes
67  * and I/O. See uts/intel/os/cpuid.c for more background here.
68  *
69  * As a result of this split there are two different groups of sensors that
70  * exist within a single die:
71  *
72  * 1) SMU::THM::THM_TCON_CUR_TMP provides Tctl for the overall I/O die and
73  * connected components. This is the unitless measurement mentioned above. The
74  * aforementioned register is a shadow of whatever the die actually maintains
75  * and is read-only for all intents and purposes for us due to its nature as a
76  * shadow, despite what the PPR says.
77  *
78  * 2) SMU::THM::THM_DIEx_TEMP provides Tdie for a single die. Unlike Tctl, this
79  * is a valid measurement in degrees Celsius. Notably, this is also a shadow
80  * register that is updated by the SMU, while each die has its own underlying
81  * diodes and control temperature calculations that are performed. There are
82  * generally a fixed number of these die sensors at given offsets on the CPU.
83  * These are sourced by the thermal monitor and have a valid bit. The general
84  * assumption is that there is a 1:1 mapping on CPUs and APUs to CCDs.
85  *
86  * -------------------
87  * Sensor Organization
88  * -------------------
89  *
90  * The driver uses DDI_NT_SENSOR_TEMP_CPU, which will put us in the
91  * /dev/sensors/cpu directory. Each DF maps to the cpuid.c procnode concept. The
92  * Tctl sensor is named 'procnode.%u'. The Tdie sensors are named
93  * 'procnode.%u:die.%u'. This allows us to have them exist on a per-DF basis.
94  * The expectation is that consumers who care will make the assumption that
95  * these are CCD-specific sensors rather than this driver itself.
96  *
97  * To represent this, the driver, which is rooted in the smntemp_t structure,
98  * the smntemp_data global, contains a number of smntemp_df_t structures. One
99  * for each df that exists. Each DF contains one smntemp_temp_t structure that
100  * represents Tctl and a variable number of Tdie sensors based on how many the
101  * SoC supports.
102  *
103  * Because of our desire not to assume that these are specifically CCD sensors
104  * here (though they realistically speaking are), we don't try iterating the
105  * CCDs as a way to scope which Tdie sensors exist and instead leverage the
106  * valid bit that they have to determine which ksensors to create.
107  */
108 
109 #include <sys/modctl.h>
110 #include <sys/conf.h>
111 #include <sys/devops.h>
112 #include <sys/types.h>
113 #include <sys/cred.h>
114 #include <sys/ddi.h>
115 #include <sys/sunddi.h>
116 #include <sys/cmn_err.h>
117 #include <sys/stdbool.h>
118 #include <sys/x86_archext.h>
119 #include <sys/cpuvar.h>
120 #include <sys/sensors.h>
121 #include <sys/sysmacros.h>
122 #include <sys/amdzen/smn.h>
123 #include <sys/amdzen/thm.h>
124 #include <amdzen_client.h>
125 
/*
 * Per-sensor state flags kept in stt_flags. They record what has been set up
 * for a given smntemp_temp_t so that teardown only undoes what was done.
 */
typedef enum {
	/* stt_mutex has been initialized and needs to be destroyed. */
	SMNTEMP_F_MUTEX	= 0x1,
	/* The sensor is in use: always for Tctl, valid-bit gated for Tdie. */
	SMNTEMP_F_VALID	= 0x2
} smntemp_flags_t;
130 
/*
 * Identifies which flavor of sensor a smntemp_temp_t describes. Zero is left
 * unused so that a zeroed structure does not look like a configured sensor.
 */
typedef enum {
	SMNTEMP_K_TCTL = 1,	/* Tctl: the control "temperature" */
	SMNTEMP_K_TDIE = 2	/* Tdie: a per-die temperature */
} smntemp_kind_t;
135 
/*
 * Forward declarations: the structures below point at one another, so name
 * all of the types up front.
 */
typedef struct smntemp smntemp_t;
typedef struct smntemp_df smntemp_df_t;
typedef struct smntemp_temp smntemp_temp_t;
139 
140 /*
141  * This represents the per-temperature data that we keep around per exposed
142  * ksensor.
143  */
144 struct smntemp_temp {
145 	smntemp_kind_t stt_kind;
146 	smntemp_df_t *stt_df;
147 	smn_reg_t stt_reg;
148 	smntemp_flags_t stt_flags;
149 	id_t stt_ksensor;
150 	kmutex_t stt_mutex;
151 	hrtime_t stt_last_read;
152 	uint32_t stt_raw;
153 	int64_t stt_temp;
154 };
155 
156 /*
157  * This represents a single DF in the system and contains all of the temperature
158  * sensors for it, both its Tctl and however many Tdie exist.
159  */
160 struct smntemp_df {
161 	uint32_t sd_dfno;
162 	smntemp_temp_t sd_tctl;
163 	uint32_t sd_nccd;
164 	uint32_t sd_nccd_valid;
165 	smntemp_temp_t *sd_tdie;
166 };
167 
168 /*
169  * Primary driver state structure.
170  */
171 struct smntemp {
172 	dev_info_t *smn_dip;
173 	x86_processor_family_t smn_fam;
174 	uint_t smn_ndf;
175 	smntemp_df_t *smn_df;
176 };
177 
178 static smntemp_t smntemp_data;
179 
180 /*
181  * Determine if the "temperature" requires adjustment in some form. Tdie is
182  * always adjusted. Tctl may in two different circumstances:
183  *
184  * (1) If the range bit, 'THM_CURTEMP_GET_RANGE' is set.
185  * (2) if the mode is set to r/w. While the former is made much more explicit,
186  * the latter is something that AMD has suggested, but hasn't been formally
187  * documented in the PPR. However, experimentally this has proven to hold.
188  */
189 static int64_t
190 smntemp_temp_adjust(smntemp_temp_t *stt)
191 {
192 	if (stt->stt_kind == SMNTEMP_K_TDIE) {
193 		return (THM_CURTEMP_RANGE_ADJ);
194 	}
195 
196 	if (THM_CURTEMP_GET_RANGE(stt->stt_raw) == THM_CURTEMP_RANGE_N49_206 ||
197 	    THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) == THM_CURTEMP_TJ_SEL_RW) {
198 		return (THM_CURTEMP_RANGE_ADJ);
199 	}
200 
201 	return (0);
202 }
203 
204 static int
205 smntemp_temp_update(smntemp_temp_t *stt)
206 {
207 	int ret;
208 	uint32_t reg;
209 	int64_t raw, decimal;
210 
211 	ASSERT(MUTEX_HELD((&stt->stt_mutex)));
212 
213 	if ((ret = amdzen_c_smn_read(stt->stt_df->sd_dfno, stt->stt_reg,
214 	    &reg)) != 0) {
215 		return (ret);
216 	}
217 
218 	stt->stt_last_read = gethrtime();
219 	stt->stt_raw = reg;
220 	if (stt->stt_kind == SMNTEMP_K_TCTL) {
221 		raw = THM_CURTEMP_GET_TEMP(reg);
222 	} else {
223 		raw = THM_DIE_GET_TEMP(reg);
224 	}
225 
226 	decimal = raw & THM_CURTEMP_TEMP_DEC_MASK;
227 	raw = raw >> THM_CURTEMP_TEMP_DEC_BITS;
228 	raw += smntemp_temp_adjust(stt);
229 
230 	stt->stt_temp = raw << THM_CURTEMP_TEMP_DEC_BITS;
231 	stt->stt_temp += decimal;
232 
233 	return (0);
234 }
235 
236 static uint32_t
237 smntemp_temp_unit(smntemp_temp_t *stt)
238 {
239 	ASSERT(MUTEX_HELD(&stt->stt_mutex));
240 
241 	if (stt->stt_kind == SMNTEMP_K_TDIE) {
242 		return (SENSOR_UNIT_CELSIUS);
243 	} else if (THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) ==
244 	    THM_CURTEMP_TJ_SEL_TJ) {
245 		return (SENSOR_UNIT_CELSIUS);
246 	} else {
247 		return (SENSOR_UNIT_NONE);
248 	}
249 }
250 
251 static int
252 smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp)
253 {
254 	int ret;
255 	smntemp_temp_t *stt = arg;
256 
257 	mutex_enter(&stt->stt_mutex);
258 	if ((ret = smntemp_temp_update(stt)) != 0) {
259 		mutex_exit(&stt->stt_mutex);
260 		return (ret);
261 	}
262 
263 	temp->sis_unit = smntemp_temp_unit(stt);
264 	temp->sis_value = stt->stt_temp;
265 	/* This is the same between Tctl and Tdie */
266 	temp->sis_gran = THM_CURTEMP_TEMP_DEC_GRAN;
267 	mutex_exit(&stt->stt_mutex);
268 
269 	return (0);
270 }
271 
272 /*
273  * Because Tctl is usually a control temperature, but isn't guaranteed, we
274  * cannot use a stock ksensor function and must implement this ourselves.
275  */
276 static int
277 smntemp_temp_kind(void *arg, sensor_ioctl_kind_t *kind)
278 {
279 	smntemp_temp_t *stt = arg;
280 
281 	if (stt->stt_kind == SMNTEMP_K_TDIE) {
282 		kind->sik_kind = SENSOR_KIND_TEMPERATURE;
283 		return (0);
284 	}
285 
286 	mutex_enter(&stt->stt_mutex);
287 	if (stt->stt_raw == 0) {
288 		int ret = smntemp_temp_update(stt);
289 		if (ret != 0) {
290 			mutex_exit(&stt->stt_mutex);
291 			return (ret);
292 		}
293 	}
294 
295 	if (THM_CURTEMP_GET_TJ_SEL(stt->stt_raw) == THM_CURTEMP_TJ_SEL_TJ) {
296 		kind->sik_kind = SENSOR_KIND_TEMPERATURE;
297 	} else {
298 		kind->sik_kind = SENSOR_KIND_SYNTHETIC;
299 		kind->sik_derive = SENSOR_KIND_TEMPERATURE;
300 	}
301 
302 	mutex_exit(&stt->stt_mutex);
303 	return (0);
304 }
305 
306 static const ksensor_ops_t smntemp_temp_ops = {
307 	.kso_kind = smntemp_temp_kind,
308 	.kso_scalar = smntemp_temp_read
309 };
310 
311 static bool
312 smntemp_create_tdie(smntemp_t *smn, smntemp_df_t *df, smntemp_temp_t *temp,
313     uint32_t ccdno)
314 {
315 	int ret;
316 	uint32_t val;
317 	char buf[128];
318 
319 	temp->stt_kind = SMNTEMP_K_TDIE;
320 	temp->stt_df = df;
321 	temp->stt_reg = THM_DIE(ccdno, smn->smn_fam);
322 	mutex_init(&temp->stt_mutex, NULL, MUTEX_DRIVER, NULL);
323 	temp->stt_flags = SMNTEMP_F_MUTEX;
324 
325 	/*
326 	 * Tdie sensors have a valid bit that we need to check before we
327 	 * register with the ksensor framework.
328 	 */
329 	if (snprintf(buf, sizeof (buf), "procnode.%u.die.%u", df->sd_dfno,
330 	    ccdno) >= sizeof (buf)) {
331 		dev_err(smn->smn_dip, CE_WARN, "!unexpected buffer name "
332 		    "overrun assembling DF/CCD %u/%u Tdie", df->sd_dfno,
333 		    ccdno);
334 		return (false);
335 	}
336 
337 	if ((ret = amdzen_c_smn_read(temp->stt_df->sd_dfno, temp->stt_reg,
338 	    &val)) != 0) {
339 		dev_err(smn->smn_dip, CE_WARN, "!unexpected SMN read failure "
340 		    "reading DF/CCD %u/%u Tdie: %d", df->sd_dfno, ccdno, ret);
341 		return (false);
342 	}
343 
344 	/*
345 	 * Tdie sensors have a valid bit in them. We more or less assume that
346 	 * this valid bit is set by the SMU early in life and remains valid
347 	 * throughout a given system boot.
348 	 */
349 	if (THM_DIE_GET_VALID(val) == 0) {
350 		return (true);
351 	}
352 
353 	df->sd_nccd_valid++;
354 	temp->stt_flags |= SMNTEMP_F_VALID;
355 
356 	if ((ret = ksensor_create(smn->smn_dip, &smntemp_temp_ops, temp, buf,
357 	    DDI_NT_SENSOR_TEMP_CPU, &temp->stt_ksensor)) != 0) {
358 		dev_err(smn->smn_dip, CE_WARN, "!failed to create sensor %s: "
359 		    "%d", buf, ret);
360 		return (false);
361 	}
362 
363 	return (true);
364 }
365 
366 static bool
367 smntemp_create_tctl(smntemp_t *smn, smntemp_df_t *df, smntemp_temp_t *temp)
368 {
369 	int ret;
370 	char buf[128];
371 
372 	temp->stt_kind = SMNTEMP_K_TCTL;
373 	temp->stt_df = df;
374 	temp->stt_reg = THM_CURTEMP;
375 	mutex_init(&temp->stt_mutex, NULL, MUTEX_DRIVER, NULL);
376 	temp->stt_flags = SMNTEMP_F_VALID | SMNTEMP_F_MUTEX;
377 
378 	if (snprintf(buf, sizeof (buf), "procnode.%u", df->sd_dfno) >=
379 	    sizeof (buf)) {
380 		dev_err(smn->smn_dip, CE_WARN, "!unexpected buffer name "
381 		    "overrun assembling DF %u Tctl", df->sd_dfno);
382 		return (false);
383 	}
384 
385 	if ((ret = ksensor_create(smn->smn_dip, &smntemp_temp_ops, temp, buf,
386 	    DDI_NT_SENSOR_TEMP_CPU, &temp->stt_ksensor)) != 0) {
387 		dev_err(smn->smn_dip, CE_WARN, "!failed to create sensor %s: "
388 		    "%d", buf, ret);
389 		return (false);
390 	}
391 
392 	return (true);
393 }
394 
395 static void
396 smntemp_cleanup_temp(smntemp_temp_t *temp)
397 {
398 	temp->stt_flags &= ~SMNTEMP_F_VALID;
399 	if ((temp->stt_flags & SMNTEMP_F_MUTEX) != 0) {
400 		mutex_destroy(&temp->stt_mutex);
401 		temp->stt_flags &= ~SMNTEMP_F_MUTEX;
402 	}
403 	ASSERT0(temp->stt_flags);
404 }
405 
406 static void
407 smntemp_cleanup(smntemp_t *smn)
408 {
409 	(void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS);
410 
411 	for (uint32_t dfno = 0; dfno < smn->smn_ndf; dfno++) {
412 		smntemp_df_t *df = &smn->smn_df[dfno];
413 		smntemp_cleanup_temp(&df->sd_tctl);
414 		for (uint32_t ccdno = 0; ccdno < df->sd_nccd; ccdno++) {
415 			smntemp_cleanup_temp(&df->sd_tdie[ccdno]);
416 		}
417 
418 		if (df->sd_nccd > 0) {
419 			kmem_free(df->sd_tdie, df->sd_nccd *
420 			    sizeof (smntemp_temp_t));
421 			df->sd_nccd = 0;
422 			df->sd_tdie = NULL;
423 		}
424 	}
425 	if (smn->smn_ndf > 0) {
426 		kmem_free(smn->smn_df, sizeof (smntemp_df_t) * smn->smn_ndf);
427 		smn->smn_ndf = 0;
428 		smn->smn_df = NULL;
429 	}
430 
431 	if (smn->smn_dip != NULL) {
432 		ddi_remove_minor_node(smn->smn_dip, NULL);
433 		ddi_set_driver_private(smn->smn_dip, NULL);
434 		smn->smn_dip = NULL;
435 	}
436 }
437 
438 static int
439 smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
440 {
441 	smntemp_t *smntemp = &smntemp_data;
442 
443 	if (cmd == DDI_RESUME) {
444 		return (DDI_SUCCESS);
445 	} else if (cmd != DDI_ATTACH) {
446 		return (DDI_FAILURE);
447 	}
448 
449 	if (smntemp->smn_dip != NULL) {
450 		dev_err(dip, CE_WARN, "!smntemp already attached");
451 		return (DDI_FAILURE);
452 	}
453 	smntemp->smn_dip = dip;
454 	smntemp->smn_fam = chiprev_family(cpuid_getchiprev(CPU));
455 
456 	/*
457 	 * First account for each actual DF instance. Then determine the number
458 	 * of CCD entries we need to care about per SoC.
459 	 */
460 	smntemp->smn_ndf = amdzen_c_df_count();
461 	if (smntemp->smn_ndf == 0) {
462 		dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp");
463 		goto err;
464 	}
465 	smntemp->smn_df = kmem_zalloc(sizeof (smntemp_df_t) * smntemp->smn_ndf,
466 	    KM_SLEEP);
467 	for (uint32_t dfno = 0; dfno < smntemp->smn_ndf; dfno++) {
468 		smntemp_df_t *df = &smntemp->smn_df[dfno];
469 		df->sd_dfno = dfno;
470 		df->sd_nccd = THM_DIE_MAX_UNITS(smntemp->smn_fam);
471 
472 		if (!smntemp_create_tctl(smntemp, df, &df->sd_tctl)) {
473 			goto err;
474 		}
475 
476 		if (df->sd_nccd > 0) {
477 			df->sd_tdie = kmem_zalloc(sizeof (smntemp_temp_t) *
478 			    df->sd_nccd, KM_SLEEP);
479 		}
480 
481 		for (uint32_t i = 0; i < df->sd_nccd; i++) {
482 			if (!smntemp_create_tdie(smntemp, df,
483 			    &df->sd_tdie[i], i)) {
484 				goto err;
485 			}
486 		}
487 	}
488 
489 	ddi_set_driver_private(dip, smntemp);
490 	return (DDI_SUCCESS);
491 
492 err:
493 	smntemp_cleanup(smntemp);
494 	return (DDI_FAILURE);
495 }
496 
497 static int
498 smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
499 {
500 	smntemp_t *smntemp = &smntemp_data;
501 
502 	if (cmd == DDI_SUSPEND) {
503 		return (DDI_SUCCESS);
504 	} else if (cmd != DDI_DETACH) {
505 		return (DDI_FAILURE);
506 	}
507 
508 	if (smntemp->smn_dip == NULL) {
509 		dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn "
510 		    "instance %d that was never attached",
511 		    ddi_get_instance(dip));
512 		return (DDI_FAILURE);
513 	}
514 
515 	smntemp_cleanup(smntemp);
516 	return (DDI_SUCCESS);
517 }
518 
519 static struct dev_ops smntemp_dev_ops = {
520 	.devo_rev = DEVO_REV,
521 	.devo_refcnt = 0,
522 	.devo_getinfo = nodev,
523 	.devo_identify = nulldev,
524 	.devo_probe = nulldev,
525 	.devo_attach = smntemp_attach,
526 	.devo_detach = smntemp_detach,
527 	.devo_reset = nodev,
528 	.devo_quiesce = ddi_quiesce_not_needed,
529 };
530 
531 static struct modldrv smntemp_modldrv = {
532 	.drv_modops = &mod_driverops,
533 	.drv_linkinfo = "AMD SMN Temperature Driver",
534 	.drv_dev_ops = &smntemp_dev_ops
535 };
536 
537 static struct modlinkage smntemp_modlinkage = {
538 	.ml_rev = MODREV_1,
539 	.ml_linkage = { &smntemp_modldrv, NULL }
540 };
541 
542 int
543 _init(void)
544 {
545 	return (mod_install(&smntemp_modlinkage));
546 }
547 
548 int
549 _info(struct modinfo *modinfop)
550 {
551 	return (mod_info(&smntemp_modlinkage, modinfop));
552 }
553 
554 int
555 _fini(void)
556 {
557 	return (mod_remove(&smntemp_modlinkage));
558 }
559