1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019, Joyent, Inc. 14 * Copyright 2022 Oxide Computer Company 15 */ 16 17 /* 18 * This implements a temperature sensor for AMD Zen family products that rely 19 * upon the SMN framework for getting temperature information. 20 */ 21 22 #include <sys/modctl.h> 23 #include <sys/conf.h> 24 #include <sys/devops.h> 25 #include <sys/types.h> 26 #include <sys/cred.h> 27 #include <sys/ddi.h> 28 #include <sys/sunddi.h> 29 #include <sys/cmn_err.h> 30 #include <sys/x86_archext.h> 31 #include <sys/cpuvar.h> 32 #include <sys/sensors.h> 33 #include <sys/sysmacros.h> 34 #include <sys/amdzen/smn.h> 35 #include <amdzen_client.h> 36 37 /* 38 * The following are register offsets and the meaning of their bits related to 39 * temperature. These addresses reside in the System Management Network which is 40 * accessed through the northbridge. They are not addresses in PCI configuration 41 * space. 42 */ 43 #define SMN_SMU_THERMAL_CURTEMP SMN_MAKE_REG(0x00059800) 44 #define SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(x) ((x) >> 21) 45 #define SMN_SMU_THERMAL_CURTEMP_RANGE_SEL (1 << 19) 46 47 #define SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ (-49) 48 #define SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS 3 49 #define SMN_SMU_THERMAL_CURTEMP_BITS_MASK 0x7 50 51 /* 52 * The temperature sensor in Family 17 is measured in terms of 0.125 C steps. 53 */ 54 #define SMN_THERMAL_GRANULARITY 8 55 56 typedef enum { 57 SMNTEMP_F_MUTEX = 1 << 0 58 } smntemp_flags_t; 59 60 typedef struct { 61 uint_t stt_dfno; 62 id_t stt_ksensor; 63 struct smntemp *stt_smn; 64 smntemp_flags_t stt_flags; 65 kmutex_t stt_mutex; 66 hrtime_t stt_last_read; 67 uint32_t stt_reg; 68 int64_t stt_temp; 69 } smntemp_temp_t; 70 71 typedef struct smntemp { 72 dev_info_t *smn_dip; 73 uint_t smn_ntemps; 74 int smn_offset; 75 smntemp_temp_t *smn_temps; 76 } smntemp_t; 77 78 static smntemp_t smntemp_data; 79 80 /* 81 * AMD processors report a control temperature (called Tctl) which may be 82 * different from the junction temperature, which is the value that is actually 83 * measured from the die (sometimes called Tdie or Tjct). This is done so that 84 * socket-based environmental monitoring can be consistent from a platform 85 * perspective, but doesn't help us. Unfortunately, these values aren't in 86 * datasheets that we can find, but have been documented partially in a series 87 * of blog posts by AMD when discussing their 'Ryzen Master' monitoring software 88 * for Windows. 89 * 90 * The brand strings below may contain partial matches such in the Threadripper 91 * cases so we can match the entire family of processors. The offset value is 92 * the quantity in degrees that we should adjust Tctl to reach Tdie. 93 */ 94 typedef struct { 95 const char *sto_brand; 96 uint_t sto_family; 97 int sto_off; 98 } smntemp_offset_t; 99 100 static const smntemp_offset_t smntemp_offsets[] = { 101 { "AMD Ryzen 5 1600X", 0x17, -20 }, 102 { "AMD Ryzen 7 1700X", 0x17, -20 }, 103 { "AMD Ryzen 7 1800X", 0x17, -20 }, 104 { "AMD Ryzen 7 2700X", 0x17, -10 }, 105 { "AMD Ryzen Threadripper 19", 0x17, -27 }, 106 { "AMD Ryzen Threadripper 29", 0x17, -27 }, 107 { NULL } 108 }; 109 110 static int 111 smntemp_temp_update(smntemp_t *smn, smntemp_temp_t *stt) 112 { 113 int ret; 114 uint32_t reg; 115 int64_t raw, decimal; 116 117 ASSERT(MUTEX_HELD((&stt->stt_mutex))); 118 119 if ((ret = amdzen_c_smn_read32(stt->stt_dfno, SMN_SMU_THERMAL_CURTEMP, 120 ®)) != 0) { 121 return (ret); 122 } 123 124 stt->stt_last_read = gethrtime(); 125 stt->stt_reg = reg; 126 raw = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) >> 127 SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS; 128 decimal = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) & 129 SMN_SMU_THERMAL_CURTEMP_BITS_MASK; 130 if ((reg & SMN_SMU_THERMAL_CURTEMP_RANGE_SEL) != 0) { 131 raw += SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ; 132 } 133 raw += smn->smn_offset; 134 stt->stt_temp = raw << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS; 135 stt->stt_temp += decimal; 136 137 return (0); 138 } 139 140 static int 141 smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp) 142 { 143 int ret; 144 smntemp_temp_t *stt = arg; 145 smntemp_t *smn = stt->stt_smn; 146 147 mutex_enter(&stt->stt_mutex); 148 if ((ret = smntemp_temp_update(smn, stt)) != 0) { 149 mutex_exit(&stt->stt_mutex); 150 return (ret); 151 } 152 153 temp->sis_unit = SENSOR_UNIT_CELSIUS; 154 temp->sis_value = stt->stt_temp; 155 temp->sis_gran = SMN_THERMAL_GRANULARITY; 156 mutex_exit(&stt->stt_mutex); 157 158 return (0); 159 } 160 161 static const ksensor_ops_t smntemp_temp_ops = { 162 .kso_kind = ksensor_kind_temperature, 163 .kso_scalar = smntemp_temp_read 164 }; 165 166 static void 167 smntemp_cleanup(smntemp_t *smn) 168 { 169 if (smn->smn_temps != NULL) { 170 uint_t i; 171 172 (void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS); 173 for (i = 0; i < smn->smn_ntemps; i++) { 174 if ((smn->smn_temps[i].stt_flags & SMNTEMP_F_MUTEX) != 175 0) { 176 mutex_destroy(&smn->smn_temps[i].stt_mutex); 177 smn->smn_temps[i].stt_flags &= ~SMNTEMP_F_MUTEX; 178 } 179 } 180 kmem_free(smn->smn_temps, sizeof (smntemp_temp_t) * 181 smn->smn_ntemps); 182 smn->smn_temps = NULL; 183 smn->smn_ntemps = 0; 184 } 185 186 if (smn->smn_dip != NULL) { 187 ddi_remove_minor_node(smn->smn_dip, NULL); 188 ddi_set_driver_private(smn->smn_dip, NULL); 189 smn->smn_dip = NULL; 190 } 191 } 192 193 static boolean_t 194 smntemp_find_offset(smntemp_t *smn) 195 { 196 uint_t i, family; 197 char buf[256]; 198 199 if (cpuid_getbrandstr(CPU, buf, sizeof (buf)) >= sizeof (buf)) { 200 dev_err(smn->smn_dip, CE_WARN, "!failed to read processor " 201 "brand string, brand larger than internal buffer"); 202 return (B_FALSE); 203 } 204 205 family = cpuid_getfamily(CPU); 206 207 for (i = 0; i < ARRAY_SIZE(smntemp_offsets); i++) { 208 if (family != smntemp_offsets[i].sto_family) 209 continue; 210 if (strncmp(buf, smntemp_offsets[i].sto_brand, 211 strlen(smntemp_offsets[i].sto_brand)) == 0) { 212 smn->smn_offset = smntemp_offsets[i].sto_off; 213 break; 214 } 215 } 216 217 return (B_TRUE); 218 } 219 220 static int 221 smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 222 { 223 uint_t i; 224 smntemp_t *smntemp = &smntemp_data; 225 226 if (cmd == DDI_RESUME) { 227 return (DDI_SUCCESS); 228 } else if (cmd != DDI_ATTACH) { 229 return (DDI_FAILURE); 230 } 231 232 if (smntemp->smn_dip != NULL) { 233 dev_err(dip, CE_WARN, "!smntemp already attached"); 234 return (DDI_FAILURE); 235 } 236 smntemp->smn_dip = dip; 237 ddi_set_driver_private(dip, smntemp); 238 239 if (!smntemp_find_offset(smntemp)) { 240 goto err; 241 } 242 243 smntemp->smn_ntemps = amdzen_c_df_count(); 244 if (smntemp->smn_ntemps == 0) { 245 dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp"); 246 goto err; 247 } 248 smntemp->smn_temps = kmem_zalloc(sizeof (smntemp_temp_t) * 249 smntemp->smn_ntemps, KM_SLEEP); 250 for (i = 0; i < smntemp->smn_ntemps; i++) { 251 int ret; 252 char buf[128]; 253 254 smntemp->smn_temps[i].stt_smn = smntemp; 255 smntemp->smn_temps[i].stt_dfno = i; 256 mutex_init(&smntemp->smn_temps[i].stt_mutex, NULL, MUTEX_DRIVER, 257 NULL); 258 smntemp->smn_temps[i].stt_flags |= SMNTEMP_F_MUTEX; 259 260 if (snprintf(buf, sizeof (buf), "procnode.%u", i) >= 261 sizeof (buf)) { 262 dev_err(dip, CE_WARN, "!unexpected buffer name overrun " 263 "assembling temperature minor %u", i); 264 goto err; 265 } 266 267 if ((ret = ksensor_create(dip, &smntemp_temp_ops, 268 &smntemp->smn_temps[i], buf, DDI_NT_SENSOR_TEMP_CPU, 269 &smntemp->smn_temps[i].stt_ksensor)) != 0) { 270 dev_err(dip, CE_WARN, "!failed to create sensor %s: %d", 271 buf, ret); 272 goto err; 273 } 274 } 275 276 return (DDI_SUCCESS); 277 278 err: 279 smntemp_cleanup(smntemp); 280 return (DDI_FAILURE); 281 } 282 283 static int 284 smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 285 { 286 smntemp_t *smntemp = &smntemp_data; 287 288 if (cmd == DDI_SUSPEND) { 289 return (DDI_SUCCESS); 290 } else if (cmd != DDI_DETACH) { 291 return (DDI_FAILURE); 292 } 293 294 if (smntemp->smn_dip == NULL) { 295 dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn " 296 "instance %d that was never attached", 297 ddi_get_instance(dip)); 298 return (DDI_FAILURE); 299 } 300 301 smntemp_cleanup(smntemp); 302 return (DDI_SUCCESS); 303 } 304 305 static struct dev_ops smntemp_dev_ops = { 306 .devo_rev = DEVO_REV, 307 .devo_refcnt = 0, 308 .devo_getinfo = nodev, 309 .devo_identify = nulldev, 310 .devo_probe = nulldev, 311 .devo_attach = smntemp_attach, 312 .devo_detach = smntemp_detach, 313 .devo_reset = nodev, 314 .devo_quiesce = ddi_quiesce_not_needed, 315 }; 316 317 static struct modldrv smntemp_modldrv = { 318 .drv_modops = &mod_driverops, 319 .drv_linkinfo = "AMD SMN Temperature Driver", 320 .drv_dev_ops = &smntemp_dev_ops 321 }; 322 323 static struct modlinkage smntemp_modlinkage = { 324 .ml_rev = MODREV_1, 325 .ml_linkage = { &smntemp_modldrv, NULL } 326 }; 327 328 int 329 _init(void) 330 { 331 return (mod_install(&smntemp_modlinkage)); 332 } 333 334 int 335 _info(struct modinfo *modinfop) 336 { 337 return (mod_info(&smntemp_modlinkage, modinfop)); 338 } 339 340 int 341 _fini(void) 342 { 343 return (mod_remove(&smntemp_modlinkage)); 344 } 345