1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019, Joyent, Inc.
14 * Copyright 2021 Oxide Computer Company
15 */
16
17 /*
18 * Intel CPU Thermal sensor driver
19 *
20 * The MSRs used by this driver were introduced with the 'Core' family processors
21 * and have since spread beyond there, even to the Atom line. Currently,
22 * temperature sensors exist on a per-core basis and optionally on a per-package
23 * basis. The temperature sensor exposes a reading that's relative to the
24 * processor's maximum junction temperature, often referred to as Tj. We
25 * currently only support models where we can determine that junction
26 * temperature programmatically. For older processors, we would need to track
27 * down the datasheet. Unfortunately, the values here are often on a per-brand
28 * string basis. That is, two CPUs with the same model and stepping that have
29 * been binned differently can have different junction temperatures.
30 *
31 * The temperature is exposed through /dev and uses a semi-standard sensor
32 * framework. We expose one minor node per CPU core and one minor node per CPU
33 * package, if that is supported. Reads are rate-limited in the driver at 100ms
34 * by default per the global variable coretemp_cache_ms.
35 */
36
37 #include <sys/modctl.h>
38 #include <sys/conf.h>
39 #include <sys/devops.h>
40 #include <sys/types.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/stat.h>
44 #include <sys/cred.h>
45 #include <sys/ddi.h>
46 #include <sys/sunddi.h>
47 #include <sys/list.h>
48 #include <sys/stddef.h>
49 #include <sys/cmn_err.h>
50 #include <sys/x86_archext.h>
51 #include <sys/cpu_module.h>
52 #include <sys/ontrap.h>
53 #include <sys/cpuvar.h>
54 #include <sys/x_call.h>
55 #include <sys/sensors.h>
56
/*
 * Granularity reported with every reading (the sis_gran member of the scalar
 * we fill in). The Intel SDM says that the measurements we get are always in
 * degrees Celsius.
 */
#define	CORETEMP_GRANULARITY	1
62
/*
 * The kinds of sensor this driver creates: one per CPU core and, when the
 * package thermal feature is present, one per CPU package (socket).
 */
typedef enum coretemp_sensor_type {
	CORETEMP_S_CORE,
	CORETEMP_S_SOCKET
} coretemp_sensor_type_t;
67
68 typedef struct coretemp_sensor {
69 list_node_t cs_link;
70 struct coretemp *cs_coretemp;
71 char cs_name[128];
72 id_t cs_sensor;
73 coretemp_sensor_type_t cs_type;
74 enum cmi_hdl_class cs_class;
75 uint_t cs_chip;
76 uint_t cs_core;
77 uint_t cs_strand;
78 uint_t cs_tjmax;
79 uint_t cs_status_msr;
80 uint_t cs_intr_msr;
81 hrtime_t cs_last_read;
82 uint64_t cs_status;
83 uint64_t cs_intr;
84 /* The following fields are derived from above */
85 uint_t cs_temperature;
86 uint_t cs_resolution;
87 } coretemp_sensor_t;
88
89 typedef struct coretemp {
90 dev_info_t *coretemp_dip;
91 cpuset_t *coretemp_cpuset;
92 boolean_t coretemp_pkg;
93 kmutex_t coretemp_mutex;
94 list_t coretemp_sensors;
95 } coretemp_t;
96
97 coretemp_t *coretemp;
98
99 /*
100 * This indicates a number of milliseconds that we should wait between reads.
101 * This is somewhat arbitrary, but the goal is to reduce cross call activity
102 * and reflect that the sensor may not update all the time.
103 */
104 uint_t coretemp_cache_ms = 100;
105
106 static int
coretemp_rdmsr_xc(xc_arg_t arg1,xc_arg_t arg2,xc_arg_t arg3)107 coretemp_rdmsr_xc(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
108 {
109 uint_t msr = (uint_t)arg1;
110 uint64_t *valp = (uint64_t *)arg2;
111 cmi_errno_t *errp = (cmi_errno_t *)arg3;
112
113 on_trap_data_t otd;
114
115 if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
116 if (checked_rdmsr(msr, valp) == 0) {
117 *errp = CMI_SUCCESS;
118 } else {
119 *errp = CMIERR_NOTSUP;
120 }
121 } else {
122 *errp = CMIERR_MSRGPF;
123 }
124 no_trap();
125
126 return (0);
127 }
128
129 /*
130 * This really should just be a call to the CMI handle to provide us the MSR.
131 * However, that routine, cmi_hdl_rdmsr(), cannot be safely used until it is
132 * fixed for use outside of a panic-like context.
133 */
134 static int
coretemp_rdmsr(coretemp_t * ct,cmi_hdl_t hdl,uint_t msr,uint64_t * valp)135 coretemp_rdmsr(coretemp_t *ct, cmi_hdl_t hdl, uint_t msr, uint64_t *valp)
136 {
137 id_t cpu = cmi_hdl_logical_id(hdl);
138 int ret = CMI_SUCCESS;
139
140 ASSERT(MUTEX_HELD(&ct->coretemp_mutex));
141 kpreempt_disable();
142 if (CPU->cpu_id == cpu) {
143 (void) coretemp_rdmsr_xc((xc_arg_t)msr, (xc_arg_t)valp,
144 (xc_arg_t)&ret);
145 } else {
146 cpuset_only(ct->coretemp_cpuset, (uint_t)cpu);
147 xc_call((xc_arg_t)msr, (xc_arg_t)valp, (xc_arg_t)&ret,
148 (ulong_t *)ct->coretemp_cpuset, coretemp_rdmsr_xc);
149 }
150 kpreempt_enable();
151
152 return (ret);
153 }
154
155 static int
coretemp_cmi_errno(cmi_errno_t e)156 coretemp_cmi_errno(cmi_errno_t e)
157 {
158 switch (e) {
159 case CMIERR_NOTSUP:
160 return (ENOTSUP);
161 default:
162 return (EIO);
163 }
164 }
165
166 /*
167 * Answer the question of whether or not the driver can support the CPU in
168 * question. Right now we have the following constraints for supporting the CPU:
169 *
170 * o The CPU is made by Intel
171 * o The CPU has the Digital Thermal Sensor
172 * o The CPU family is 6, which is usually implicit from the above
173 * o We can determine its junction temperature through an MSR
174 *
175 * If we can't determine the junction temperature programmatically, then we need
176 * to set up tables of CPUs to do so. This can be fleshed out and improved.
177 */
178 static boolean_t
coretemp_supported(void)179 coretemp_supported(void)
180 {
181 uint_t model;
182
183 if (cpuid_getvendor(CPU) != X86_VENDOR_Intel) {
184 return (B_FALSE);
185 }
186
187 if (!is_x86_feature(x86_featureset, X86FSET_CORE_THERMAL)) {
188 return (B_FALSE);
189 }
190
191 if (cpuid_getfamily(CPU) != 6) {
192 return (B_FALSE);
193 }
194
195 model = cpuid_getmodel(CPU);
196 if (model <= INTC_MODEL_PENRYN || model == INTC_MODEL_SILVERTHORNE ||
197 model == INTC_MODEL_LINCROFT || model == INTC_MODEL_PENWELL ||
198 model == INTC_MODEL_CLOVERVIEW || model == INTC_MODEL_CEDARVIEW) {
199 return (B_FALSE);
200 }
201
202 return (B_TRUE);
203 }
204
205 /*
206 * We need to determine the value of Tj Max as all temperature sensors are
207 * derived from this value. The ease of this depends on how old the processor in
208 * question is. The Core family processors after Penryn have support for an MSR
209 * that tells us what to go for. In the Atom family, processors starting with
210 * Silvermont have support for an MSR that documents this value. For older
211 * processors, one needs to track down the datasheet for a specific processor.
212 * Two processors in the same family/model may have different values of Tj Max.
213 * At the moment, we only support this on processors that have that MSR.
214 */
215 static int
coretemp_calculate_tjmax(coretemp_t * ct,cmi_hdl_t hdl,uint_t * tjmax)216 coretemp_calculate_tjmax(coretemp_t *ct, cmi_hdl_t hdl, uint_t *tjmax)
217 {
218 cmi_errno_t e;
219 uint64_t val = 0;
220
221 e = coretemp_rdmsr(ct, hdl, MSR_TEMPERATURE_TARGET, &val);
222 if (e != CMI_SUCCESS) {
223 return (coretemp_cmi_errno(e));
224 } else if (val == 0) {
225 return (EINVAL);
226 }
227
228 *tjmax = MSR_TEMPERATURE_TARGET_TARGET(val);
229 return (0);
230 }
231
232 static int
coretemp_update(coretemp_t * ct,coretemp_sensor_t * sensor,cmi_hdl_t hdl)233 coretemp_update(coretemp_t *ct, coretemp_sensor_t *sensor, cmi_hdl_t hdl)
234 {
235 cmi_errno_t e;
236 int err = 0;
237 uint64_t intr, status;
238
239 if ((e = coretemp_rdmsr(ct, hdl, sensor->cs_status_msr, &status)) !=
240 CMI_SUCCESS) {
241 err = coretemp_cmi_errno(e);
242 dev_err(ct->coretemp_dip, CE_WARN, "!failed to get thermal "
243 "status on %s: %d", sensor->cs_name, err);
244 return (err);
245 }
246
247 if ((e = coretemp_rdmsr(ct, hdl, sensor->cs_intr_msr, &intr)) !=
248 CMI_SUCCESS) {
249 err = coretemp_cmi_errno(e);
250 dev_err(ct->coretemp_dip, CE_WARN, "!failed to get thermal "
251 "interrupt on %s: %d", sensor->cs_name, err);
252 return (err);
253 }
254
255 sensor->cs_status = status;
256 sensor->cs_intr = intr;
257 sensor->cs_last_read = gethrtime();
258 return (0);
259 }
260
261 static int
coretemp_read(void * arg,sensor_ioctl_scalar_t * scalar)262 coretemp_read(void *arg, sensor_ioctl_scalar_t *scalar)
263 {
264 coretemp_sensor_t *sensor = arg;
265 coretemp_t *ct = sensor->cs_coretemp;
266 hrtime_t diff;
267 uint_t reading, resolution;
268
269 mutex_enter(&ct->coretemp_mutex);
270 diff = NSEC2MSEC(gethrtime() - sensor->cs_last_read);
271 if (diff > 0 && diff > (hrtime_t)coretemp_cache_ms) {
272 int ret;
273 cmi_hdl_t hdl;
274
275 if ((hdl = cmi_hdl_lookup(sensor->cs_class, sensor->cs_chip,
276 sensor->cs_core, sensor->cs_strand)) == NULL) {
277 mutex_exit(&ct->coretemp_mutex);
278 return (ENXIO);
279 }
280 ret = coretemp_update(ct, sensor, hdl);
281 cmi_hdl_rele(hdl);
282 if (ret != 0) {
283 mutex_exit(&ct->coretemp_mutex);
284 return (ret);
285 }
286 }
287
288 switch (sensor->cs_type) {
289 case CORETEMP_S_CORE:
290 if ((sensor->cs_status & IA32_THERM_STATUS_READ_VALID) == 0) {
291 mutex_exit(&ct->coretemp_mutex);
292 return (EIO);
293 }
294 reading = IA32_THERM_STATUS_READING(sensor->cs_status);
295 resolution = IA32_THERM_STATUS_RESOLUTION(sensor->cs_status);
296 break;
297 case CORETEMP_S_SOCKET:
298 reading = IA32_PKG_THERM_STATUS_READING(sensor->cs_status);
299 resolution = 0;
300 break;
301 default:
302 mutex_exit(&ct->coretemp_mutex);
303 return (ENXIO);
304 }
305 if (reading >= sensor->cs_tjmax) {
306 dev_err(ct->coretemp_dip, CE_WARN, "!found invalid temperature "
307 "on sensor %s: readout: %u, tjmax: %u, raw: 0x%"
308 PRIx64, sensor->cs_name, reading, sensor->cs_tjmax,
309 sensor->cs_status);
310 mutex_exit(&ct->coretemp_mutex);
311 return (EIO);
312 }
313 sensor->cs_temperature = sensor->cs_tjmax - reading;
314 sensor->cs_resolution = resolution;
315
316 scalar->sis_unit = SENSOR_UNIT_CELSIUS;
317 scalar->sis_value = sensor->cs_temperature;
318 scalar->sis_gran = CORETEMP_GRANULARITY;
319 scalar->sis_prec = sensor->cs_resolution;
320 mutex_exit(&ct->coretemp_mutex);
321
322 return (0);
323 }
324
325 static const ksensor_ops_t coretemp_temp_ops = {
326 .kso_kind = ksensor_kind_temperature,
327 .kso_scalar = coretemp_read
328 };
329
330 static void
coretemp_destroy(coretemp_t * ct)331 coretemp_destroy(coretemp_t *ct)
332 {
333 coretemp_sensor_t *sensor;
334
335 (void) ksensor_remove(ct->coretemp_dip, KSENSOR_ALL_IDS);
336 while ((sensor = list_remove_head(&ct->coretemp_sensors)) != NULL) {
337 kmem_free(sensor, sizeof (coretemp_sensor_t));
338 }
339 list_destroy(&ct->coretemp_sensors);
340
341 if (ct->coretemp_cpuset != NULL) {
342 cpuset_free(ct->coretemp_cpuset);
343 }
344
345 mutex_destroy(&ct->coretemp_mutex);
346 kmem_free(ct, sizeof (coretemp_t));
347 }
348
349 static boolean_t
coretemp_create_sensor(coretemp_t * ct,cmi_hdl_t hdl,uint_t tjmax,coretemp_sensor_type_t type)350 coretemp_create_sensor(coretemp_t *ct, cmi_hdl_t hdl, uint_t tjmax,
351 coretemp_sensor_type_t type)
352 {
353 int err;
354 coretemp_sensor_t *sensor;
355
356 sensor = kmem_zalloc(sizeof (coretemp_sensor_t), KM_SLEEP);
357 sensor->cs_coretemp = ct;
358 sensor->cs_type = type;
359 sensor->cs_class = cmi_hdl_class(hdl);
360 sensor->cs_chip = cmi_hdl_chipid(hdl);
361 sensor->cs_core = cmi_hdl_coreid(hdl);
362 sensor->cs_strand = 0;
363 sensor->cs_tjmax = tjmax;
364
365 switch (sensor->cs_type) {
366 case CORETEMP_S_CORE:
367 if (snprintf(sensor->cs_name, sizeof (sensor->cs_name),
368 "chip%u.core%u", sensor->cs_chip, sensor->cs_core) >=
369 sizeof (sensor->cs_name)) {
370 goto err;
371 }
372 sensor->cs_status_msr = MSR_IA32_THERM_STATUS;
373 sensor->cs_intr_msr = MSR_IA32_THERM_INTERRUPT;
374 break;
375 case CORETEMP_S_SOCKET:
376 if (snprintf(sensor->cs_name, sizeof (sensor->cs_name),
377 "chip%u", sensor->cs_chip) >= sizeof (sensor->cs_name)) {
378 goto err;
379 }
380 sensor->cs_status_msr = MSR_IA32_PACKAGE_THERM_STATUS;
381 sensor->cs_intr_msr = MSR_IA32_PACKAGE_THERM_INTERRUPT;
382 break;
383 }
384
385 if ((err = ksensor_create(ct->coretemp_dip, &coretemp_temp_ops, sensor,
386 sensor->cs_name, DDI_NT_SENSOR_TEMP_CPU, &sensor->cs_sensor)) !=
387 0) {
388 dev_err(ct->coretemp_dip, CE_WARN, "failed to create ksensor "
389 "for %s: %d", sensor->cs_name, err);
390 }
391
392 ASSERT(MUTEX_HELD(&ct->coretemp_mutex));
393 list_insert_tail(&ct->coretemp_sensors, sensor);
394
395 return (B_TRUE);
396 err:
397 kmem_free(sensor, sizeof (coretemp_sensor_t));
398 return (B_FALSE);
399 }
400
401 static int
coretemp_walk(cmi_hdl_t hdl,void * arg1,void * arg2,void * arg3)402 coretemp_walk(cmi_hdl_t hdl, void *arg1, void *arg2, void *arg3)
403 {
404 coretemp_t *ct = arg1;
405 boolean_t *walkerr = arg2;
406 uint_t tjmax;
407 int err;
408
409 /*
410 * The temperature sensor only exists on a per-core basis. Therefore we
411 * ignore any non-zero strand.
412 */
413 if (cmi_hdl_strandid(hdl) != 0) {
414 return (CMI_HDL_WALK_NEXT);
415 }
416
417 if ((err = coretemp_calculate_tjmax(ct, hdl, &tjmax)) != 0) {
418 dev_err(ct->coretemp_dip, CE_WARN,
419 "failed to read Tj Max on %u/%u: %d", cmi_hdl_chipid(hdl),
420 cmi_hdl_coreid(hdl), err);
421 *walkerr = B_TRUE;
422 return (CMI_HDL_WALK_DONE);
423 }
424
425 if (!coretemp_create_sensor(ct, hdl, tjmax, CORETEMP_S_CORE)) {
426 *walkerr = B_TRUE;
427 return (CMI_HDL_WALK_DONE);
428 }
429
430 if (ct->coretemp_pkg && cmi_hdl_coreid(hdl) == 0 &&
431 !coretemp_create_sensor(ct, hdl, tjmax, CORETEMP_S_SOCKET)) {
432 *walkerr = B_TRUE;
433 return (CMI_HDL_WALK_DONE);
434 }
435
436 return (CMI_HDL_WALK_NEXT);
437 }
438
439 static int
coretemp_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)440 coretemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
441 {
442 boolean_t walkerr;
443 coretemp_t *ct = NULL;
444
445 if (cmd == DDI_RESUME) {
446 return (DDI_SUCCESS);
447 } else if (cmd != DDI_ATTACH) {
448 return (DDI_FAILURE);
449 }
450
451 if (coretemp != NULL) {
452 return (DDI_FAILURE);
453 }
454
455 ct = kmem_zalloc(sizeof (coretemp_t), KM_SLEEP);
456 ct->coretemp_dip = dip;
457 ct->coretemp_pkg = is_x86_feature(x86_featureset, X86FSET_PKG_THERMAL);
458 list_create(&ct->coretemp_sensors, sizeof (coretemp_sensor_t),
459 offsetof(coretemp_sensor_t, cs_link));
460 mutex_init(&ct->coretemp_mutex, NULL, MUTEX_DRIVER, NULL);
461 ct->coretemp_cpuset = cpuset_alloc(KM_SLEEP);
462
463 mutex_enter(&ct->coretemp_mutex);
464 walkerr = B_FALSE;
465 cmi_hdl_walk(coretemp_walk, ct, &walkerr, NULL);
466
467 if (walkerr) {
468 mutex_exit(&ct->coretemp_mutex);
469 goto fail;
470 }
471
472 coretemp = ct;
473 mutex_exit(&ct->coretemp_mutex);
474 return (DDI_SUCCESS);
475 fail:
476 coretemp = NULL;
477 coretemp_destroy(ct);
478 return (DDI_FAILURE);
479
480 }
481
482 static int
coretemp_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)483 coretemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
484 {
485 if (cmd == DDI_SUSPEND) {
486 return (DDI_SUCCESS);
487 } else if (cmd != DDI_DETACH) {
488 return (DDI_FAILURE);
489 }
490
491 if (coretemp == NULL) {
492 return (DDI_FAILURE);
493 }
494
495 coretemp_destroy(coretemp);
496 coretemp = NULL;
497
498 return (DDI_SUCCESS);
499 }
500
501 static struct dev_ops coretemp_dev_ops = {
502 .devo_rev = DEVO_REV,
503 .devo_refcnt = 0,
504 .devo_getinfo = nodev,
505 .devo_identify = nulldev,
506 .devo_probe = nulldev,
507 .devo_attach = coretemp_attach,
508 .devo_detach = coretemp_detach,
509 .devo_reset = nodev,
510 .devo_quiesce = ddi_quiesce_not_needed
511 };
512
513 static struct modldrv coretemp_modldrv = {
514 .drv_modops = &mod_driverops,
515 .drv_linkinfo = "Intel CPU/Package thermal sensor",
516 .drv_dev_ops = &coretemp_dev_ops
517 };
518
519 static struct modlinkage coretemp_modlinkage = {
520 .ml_rev = MODREV_1,
521 .ml_linkage = { &coretemp_modldrv, NULL }
522 };
523
524 int
_init(void)525 _init(void)
526 {
527 if (!coretemp_supported()) {
528 return (ENOTSUP);
529 }
530
531 return (mod_install(&coretemp_modlinkage));
532 }
533
534 int
_info(struct modinfo * modinfop)535 _info(struct modinfo *modinfop)
536 {
537 return (mod_info(&coretemp_modlinkage, modinfop));
538 }
539
540 int
_fini(void)541 _fini(void)
542 {
543 return (mod_remove(&coretemp_modlinkage));
544 }
545