xref: /illumos-gate/usr/src/uts/intel/io/amdzen/smntemp.c (revision a92282e44f968185a6bba094d1e5fece2da819cf)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019, Joyent, Inc.
14  * Copyright 2020 Oxide Computer Company
15  */
16 
17 /*
18  * This implements a temperature sensor for AMD Zen family products that rely
19  * upon the SMN framework for getting temperature information.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/types.h>
26 #include <sys/cred.h>
27 #include <sys/ddi.h>
28 #include <sys/sunddi.h>
29 #include <sys/cmn_err.h>
30 #include <sys/x86_archext.h>
31 #include <sys/cpuvar.h>
32 #include <sys/sensors.h>
33 #include <sys/sysmacros.h>
34 #include <amdzen_client.h>
35 
/*
 * Register offsets and bit definitions related to temperature. These addresses
 * reside in the System Management Network (SMN), which is accessed through the
 * northbridge. They are not addresses in PCI configuration space.
 *
 * The temperature field occupies bits 21 and up of the CURTEMP register. Its
 * low SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS bits hold the fractional part and
 * the remaining bits hold whole degrees. When the RANGE_SEL bit is set in the
 * register, SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ is added to the whole-degree
 * portion of the reading.
 */
#define	SMN_SMU_THERMAL_CURTEMP			0x00059800
#define	SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(x)	((x) >> 21)
#define	SMN_SMU_THERMAL_CURTEMP_RANGE_SEL		(1 << 19)

#define	SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ		(-49)
#define	SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS		3
#define	SMN_SMU_THERMAL_CURTEMP_BITS_MASK	\
	((1 << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS) - 1)

/*
 * The temperature sensor in Family 17 is measured in terms of 0.125 C steps,
 * i.e. one unit per value of the fractional bits above.
 */
#define	SMN_THERMAL_GRANULARITY	(1 << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS)
54 
/*
 * Per-sensor state flags. SMNTEMP_F_MUTEX is set once a sensor's mutex has
 * been initialized, so that cleanup knows it must be destroyed.
 */
typedef enum {
	SMNTEMP_F_MUTEX	= 0x1
} smntemp_flags_t;
58 
59 typedef struct {
60 	uint_t stt_dfno;
61 	id_t stt_ksensor;
62 	struct smntemp *stt_smn;
63 	smntemp_flags_t stt_flags;
64 	kmutex_t stt_mutex;
65 	hrtime_t stt_last_read;
66 	uint32_t stt_reg;
67 	int64_t stt_temp;
68 } smntemp_temp_t;
69 
70 typedef struct smntemp {
71 	dev_info_t *smn_dip;
72 	uint_t smn_ntemps;
73 	int smn_offset;
74 	smntemp_temp_t *smn_temps;
75 } smntemp_t;
76 
77 static smntemp_t smntemp_data;
78 
79 /*
80  * AMD processors report a control temperature (called Tctl) which may be
81  * different from the junction temperature, which is the value that is actually
82  * measured from the die (sometimes called Tdie or Tjct). This is done so that
83  * socket-based environmental monitoring can be consistent from a platform
84  * perspective, but doesn't help us. Unfortunately, these values aren't in
85  * datasheets that we can find, but have been documented partially in a series
86  * of blog posts by AMD when discussing their 'Ryzen Master' monitoring software
87  * for Windows.
88  *
89  * The brand strings below may contain partial matches such in the Threadripper
90  * cases so we can match the entire family of processors. The offset value is
91  * the quantity in degrees that we should adjust Tctl to reach Tdie.
92  */
93 typedef struct {
94 	const char	*sto_brand;
95 	uint_t		sto_family;
96 	int		sto_off;
97 } smntemp_offset_t;
98 
99 static const smntemp_offset_t smntemp_offsets[] = {
100 	{ "AMD Ryzen 5 1600X", 0x17, -20 },
101 	{ "AMD Ryzen 7 1700X", 0x17, -20 },
102 	{ "AMD Ryzen 7 1800X", 0x17, -20 },
103 	{ "AMD Ryzen 7 2700X", 0x17, -10 },
104 	{ "AMD Ryzen Threadripper 19", 0x17, -27 },
105 	{ "AMD Ryzen Threadripper 29", 0x17, -27 },
106 	{ NULL }
107 };
108 
109 static int
110 smntemp_temp_update(smntemp_t *smn, smntemp_temp_t *stt)
111 {
112 	int ret;
113 	uint32_t reg;
114 	int64_t raw, decimal;
115 
116 	ASSERT(MUTEX_HELD((&stt->stt_mutex)));
117 
118 	if ((ret = amdzen_c_smn_read32(stt->stt_dfno, SMN_SMU_THERMAL_CURTEMP,
119 	    &reg)) != 0) {
120 		return (ret);
121 	}
122 
123 	stt->stt_last_read = gethrtime();
124 	stt->stt_reg = reg;
125 	raw = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) >>
126 	    SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS;
127 	decimal = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) &
128 	    SMN_SMU_THERMAL_CURTEMP_BITS_MASK;
129 	if ((reg & SMN_SMU_THERMAL_CURTEMP_RANGE_SEL) != 0) {
130 		raw += SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ;
131 	}
132 	raw += smn->smn_offset;
133 	stt->stt_temp = raw << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS;
134 	stt->stt_temp += decimal;
135 
136 	return (0);
137 }
138 
139 static int
140 smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp)
141 {
142 	int ret;
143 	smntemp_temp_t *stt = arg;
144 	smntemp_t *smn = stt->stt_smn;
145 
146 	mutex_enter(&stt->stt_mutex);
147 	if ((ret = smntemp_temp_update(smn, stt)) != 0) {
148 		mutex_exit(&stt->stt_mutex);
149 		return (ret);
150 	}
151 
152 	temp->sis_unit = SENSOR_UNIT_CELSIUS;
153 	temp->sis_value = stt->stt_temp;
154 	temp->sis_gran = SMN_THERMAL_GRANULARITY;
155 	mutex_exit(&stt->stt_mutex);
156 
157 	return (0);
158 }
159 
160 static const ksensor_ops_t smntemp_temp_ops = {
161 	.kso_kind = ksensor_kind_temperature,
162 	.kso_scalar = smntemp_temp_read
163 };
164 
165 static void
166 smntemp_cleanup(smntemp_t *smn)
167 {
168 	if (smn->smn_temps != NULL) {
169 		uint_t i;
170 
171 		(void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS);
172 		for (i = 0; i < smn->smn_ntemps; i++) {
173 			if ((smn->smn_temps[i].stt_flags & SMNTEMP_F_MUTEX) !=
174 			    0) {
175 				mutex_destroy(&smn->smn_temps[i].stt_mutex);
176 				smn->smn_temps[i].stt_flags &= ~SMNTEMP_F_MUTEX;
177 			}
178 		}
179 		kmem_free(smn->smn_temps, sizeof (smntemp_temp_t) *
180 		    smn->smn_ntemps);
181 		smn->smn_temps = NULL;
182 		smn->smn_ntemps = 0;
183 	}
184 
185 	if (smn->smn_dip != NULL) {
186 		ddi_remove_minor_node(smn->smn_dip, NULL);
187 		ddi_set_driver_private(smn->smn_dip, NULL);
188 		smn->smn_dip = NULL;
189 	}
190 }
191 
192 static boolean_t
193 smntemp_find_offset(smntemp_t *smn)
194 {
195 	uint_t i, family;
196 	char buf[256];
197 
198 	if (cpuid_getbrandstr(CPU, buf, sizeof (buf)) >= sizeof (buf)) {
199 		dev_err(smn->smn_dip, CE_WARN, "!failed to read processor "
200 		    "brand string, brand larger than internal buffer");
201 		return (B_FALSE);
202 	}
203 
204 	family = cpuid_getfamily(CPU);
205 
206 	for (i = 0; i < ARRAY_SIZE(smntemp_offsets); i++) {
207 		if (family != smntemp_offsets[i].sto_family)
208 			continue;
209 		if (strncmp(buf, smntemp_offsets[i].sto_brand,
210 		    strlen(smntemp_offsets[i].sto_brand)) == 0) {
211 			smn->smn_offset = smntemp_offsets[i].sto_off;
212 			break;
213 		}
214 	}
215 
216 	return (B_TRUE);
217 }
218 
219 static int
220 smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
221 {
222 	uint_t i;
223 	smntemp_t *smntemp = &smntemp_data;
224 
225 	if (cmd == DDI_RESUME) {
226 		return (DDI_SUCCESS);
227 	} else if (cmd != DDI_ATTACH) {
228 		return (DDI_FAILURE);
229 	}
230 
231 	if (smntemp->smn_dip != NULL) {
232 		dev_err(dip, CE_WARN, "!smntemp already attached");
233 		return (DDI_FAILURE);
234 	}
235 	smntemp->smn_dip = dip;
236 	ddi_set_driver_private(dip, smntemp);
237 
238 	if (!smntemp_find_offset(smntemp)) {
239 		goto err;
240 	}
241 
242 	smntemp->smn_ntemps = amdzen_c_df_count();
243 	if (smntemp->smn_ntemps == 0) {
244 		dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp");
245 		goto err;
246 	}
247 	smntemp->smn_temps = kmem_zalloc(sizeof (smntemp_temp_t) *
248 	    smntemp->smn_ntemps, KM_SLEEP);
249 	for (i = 0; i < smntemp->smn_ntemps; i++) {
250 		int ret;
251 		char buf[128];
252 
253 		smntemp->smn_temps[i].stt_smn = smntemp;
254 		smntemp->smn_temps[i].stt_dfno = i;
255 		mutex_init(&smntemp->smn_temps[i].stt_mutex, NULL, MUTEX_DRIVER,
256 		    NULL);
257 		smntemp->smn_temps[i].stt_flags |= SMNTEMP_F_MUTEX;
258 
259 		if (snprintf(buf, sizeof (buf), "procnode.%u", i) >=
260 		    sizeof (buf)) {
261 			dev_err(dip, CE_WARN, "!unexpected buffer name overrun "
262 			    "assembling temperature minor %u", i);
263 			goto err;
264 		}
265 
266 		if ((ret = ksensor_create(dip, &smntemp_temp_ops,
267 		    &smntemp->smn_temps[i], buf, DDI_NT_SENSOR_TEMP_CPU,
268 		    &smntemp->smn_temps[i].stt_ksensor)) != 0) {
269 			dev_err(dip, CE_WARN, "!failed to create sensor %s: %d",
270 			    buf, ret);
271 			goto err;
272 		}
273 	}
274 
275 	return (DDI_SUCCESS);
276 
277 err:
278 	smntemp_cleanup(smntemp);
279 	return (DDI_FAILURE);
280 }
281 
282 static int
283 smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
284 {
285 	smntemp_t *smntemp = &smntemp_data;
286 
287 	if (cmd == DDI_SUSPEND) {
288 		return (DDI_SUCCESS);
289 	} else if (cmd != DDI_DETACH) {
290 		return (DDI_FAILURE);
291 	}
292 
293 	if (smntemp->smn_dip == NULL) {
294 		dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn "
295 		    "instance %d that was never attached",
296 		    ddi_get_instance(dip));
297 		return (DDI_FAILURE);
298 	}
299 
300 	smntemp_cleanup(smntemp);
301 	return (DDI_SUCCESS);
302 }
303 
304 static struct dev_ops smntemp_dev_ops = {
305 	.devo_rev = DEVO_REV,
306 	.devo_refcnt = 0,
307 	.devo_getinfo = nodev,
308 	.devo_identify = nulldev,
309 	.devo_probe = nulldev,
310 	.devo_attach = smntemp_attach,
311 	.devo_detach = smntemp_detach,
312 	.devo_reset = nodev,
313 	.devo_quiesce = ddi_quiesce_not_needed,
314 };
315 
316 static struct modldrv smntemp_modldrv = {
317 	.drv_modops = &mod_driverops,
318 	.drv_linkinfo = "AMD SMN Temperature Driver",
319 	.drv_dev_ops = &smntemp_dev_ops
320 };
321 
322 static struct modlinkage smntemp_modlinkage = {
323 	.ml_rev = MODREV_1,
324 	.ml_linkage = { &smntemp_modldrv, NULL }
325 };
326 
327 int
328 _init(void)
329 {
330 	return (mod_install(&smntemp_modlinkage));
331 }
332 
333 int
334 _info(struct modinfo *modinfop)
335 {
336 	return (mod_info(&smntemp_modlinkage, modinfop));
337 }
338 
339 int
340 _fini(void)
341 {
342 	return (mod_remove(&smntemp_modlinkage));
343 }
344