1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019, Joyent, Inc. 14 * Copyright 2020 Oxide Computer Company 15 */ 16 17 /* 18 * This implements a temperature sensor for AMD Zen family products that rely 19 * upon the SMN framework for getting temperature information. 20 */ 21 22 #include <sys/modctl.h> 23 #include <sys/conf.h> 24 #include <sys/devops.h> 25 #include <sys/types.h> 26 #include <sys/cred.h> 27 #include <sys/ddi.h> 28 #include <sys/sunddi.h> 29 #include <sys/cmn_err.h> 30 #include <sys/x86_archext.h> 31 #include <sys/cpuvar.h> 32 #include <sys/sensors.h> 33 #include <sys/sysmacros.h> 34 #include <amdzen_client.h> 35 36 /* 37 * The following are register offsets and the meaning of their bits related to 38 * temperature. These addresses reside in the System Management Network which is 39 * accessed through the northbridge. They are not addresses in PCI configuration 40 * space. 41 */ 42 #define SMN_SMU_THERMAL_CURTEMP 0x00059800 43 #define SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(x) ((x) >> 21) 44 #define SMN_SMU_THERMAL_CURTEMP_RANGE_SEL (1 << 19) 45 46 #define SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ (-49) 47 #define SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS 3 48 #define SMN_SMU_THERMAL_CURTEMP_BITS_MASK 0x7 49 50 /* 51 * The temperature sensor in Family 17 is measured in terms of 0.125 C steps. 52 */ 53 #define SMN_THERMAL_GRANULARITY 8 54 55 typedef enum { 56 SMNTEMP_F_MUTEX = 1 << 0 57 } smntemp_flags_t; 58 59 typedef struct { 60 uint_t stt_dfno; 61 id_t stt_ksensor; 62 struct smntemp *stt_smn; 63 smntemp_flags_t stt_flags; 64 kmutex_t stt_mutex; 65 hrtime_t stt_last_read; 66 uint32_t stt_reg; 67 int64_t stt_temp; 68 } smntemp_temp_t; 69 70 typedef struct smntemp { 71 dev_info_t *smn_dip; 72 uint_t smn_ntemps; 73 int smn_offset; 74 smntemp_temp_t *smn_temps; 75 } smntemp_t; 76 77 static smntemp_t smntemp_data; 78 79 /* 80 * AMD processors report a control temperature (called Tctl) which may be 81 * different from the junction temperature, which is the value that is actually 82 * measured from the die (sometimes called Tdie or Tjct). This is done so that 83 * socket-based environmental monitoring can be consistent from a platform 84 * perspective, but doesn't help us. Unfortunately, these values aren't in 85 * datasheets that we can find, but have been documented partially in a series 86 * of blog posts by AMD when discussing their 'Ryzen Master' monitoring software 87 * for Windows. 88 * 89 * The brand strings below may contain partial matches such in the Threadripper 90 * cases so we can match the entire family of processors. The offset value is 91 * the quantity in degrees that we should adjust Tctl to reach Tdie. 92 */ 93 typedef struct { 94 const char *sto_brand; 95 uint_t sto_family; 96 int sto_off; 97 } smntemp_offset_t; 98 99 static const smntemp_offset_t smntemp_offsets[] = { 100 { "AMD Ryzen 5 1600X", 0x17, -20 }, 101 { "AMD Ryzen 7 1700X", 0x17, -20 }, 102 { "AMD Ryzen 7 1800X", 0x17, -20 }, 103 { "AMD Ryzen 7 2700X", 0x17, -10 }, 104 { "AMD Ryzen Threadripper 19", 0x17, -27 }, 105 { "AMD Ryzen Threadripper 29", 0x17, -27 }, 106 { NULL } 107 }; 108 109 static int 110 smntemp_temp_update(smntemp_t *smn, smntemp_temp_t *stt) 111 { 112 int ret; 113 uint32_t reg; 114 int64_t raw, decimal; 115 116 ASSERT(MUTEX_HELD((&stt->stt_mutex))); 117 118 if ((ret = amdzen_c_smn_read32(stt->stt_dfno, SMN_SMU_THERMAL_CURTEMP, 119 ®)) != 0) { 120 return (ret); 121 } 122 123 stt->stt_last_read = gethrtime(); 124 stt->stt_reg = reg; 125 raw = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) >> 126 SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS; 127 decimal = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) & 128 SMN_SMU_THERMAL_CURTEMP_BITS_MASK; 129 if ((reg & SMN_SMU_THERMAL_CURTEMP_RANGE_SEL) != 0) { 130 raw += SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ; 131 } 132 raw += smn->smn_offset; 133 stt->stt_temp = raw << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS; 134 stt->stt_temp += decimal; 135 136 return (0); 137 } 138 139 static int 140 smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp) 141 { 142 int ret; 143 smntemp_temp_t *stt = arg; 144 smntemp_t *smn = stt->stt_smn; 145 146 mutex_enter(&stt->stt_mutex); 147 if ((ret = smntemp_temp_update(smn, stt)) != 0) { 148 mutex_exit(&stt->stt_mutex); 149 return (ret); 150 } 151 152 temp->sis_unit = SENSOR_UNIT_CELSIUS; 153 temp->sis_value = stt->stt_temp; 154 temp->sis_gran = SMN_THERMAL_GRANULARITY; 155 mutex_exit(&stt->stt_mutex); 156 157 return (0); 158 } 159 160 static const ksensor_ops_t smntemp_temp_ops = { 161 .kso_kind = ksensor_kind_temperature, 162 .kso_scalar = smntemp_temp_read 163 }; 164 165 static void 166 smntemp_cleanup(smntemp_t *smn) 167 { 168 if (smn->smn_temps != NULL) { 169 uint_t i; 170 171 (void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS); 172 for (i = 0; i < smn->smn_ntemps; i++) { 173 if ((smn->smn_temps[i].stt_flags & SMNTEMP_F_MUTEX) != 174 0) { 175 mutex_destroy(&smn->smn_temps[i].stt_mutex); 176 smn->smn_temps[i].stt_flags &= ~SMNTEMP_F_MUTEX; 177 } 178 } 179 kmem_free(smn->smn_temps, sizeof (smntemp_temp_t) * 180 smn->smn_ntemps); 181 smn->smn_temps = NULL; 182 smn->smn_ntemps = 0; 183 } 184 185 if (smn->smn_dip != NULL) { 186 ddi_remove_minor_node(smn->smn_dip, NULL); 187 ddi_set_driver_private(smn->smn_dip, NULL); 188 smn->smn_dip = NULL; 189 } 190 } 191 192 static boolean_t 193 smntemp_find_offset(smntemp_t *smn) 194 { 195 uint_t i, family; 196 char buf[256]; 197 198 if (cpuid_getbrandstr(CPU, buf, sizeof (buf)) >= sizeof (buf)) { 199 dev_err(smn->smn_dip, CE_WARN, "!failed to read processor " 200 "brand string, brand larger than internal buffer"); 201 return (B_FALSE); 202 } 203 204 family = cpuid_getfamily(CPU); 205 206 for (i = 0; i < ARRAY_SIZE(smntemp_offsets); i++) { 207 if (family != smntemp_offsets[i].sto_family) 208 continue; 209 if (strncmp(buf, smntemp_offsets[i].sto_brand, 210 strlen(smntemp_offsets[i].sto_brand)) == 0) { 211 smn->smn_offset = smntemp_offsets[i].sto_off; 212 break; 213 } 214 } 215 216 return (B_TRUE); 217 } 218 219 static int 220 smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 221 { 222 uint_t i; 223 smntemp_t *smntemp = &smntemp_data; 224 225 if (cmd == DDI_RESUME) { 226 return (DDI_SUCCESS); 227 } else if (cmd != DDI_ATTACH) { 228 return (DDI_FAILURE); 229 } 230 231 if (smntemp->smn_dip != NULL) { 232 dev_err(dip, CE_WARN, "!smntemp already attached"); 233 return (DDI_FAILURE); 234 } 235 smntemp->smn_dip = dip; 236 ddi_set_driver_private(dip, smntemp); 237 238 if (!smntemp_find_offset(smntemp)) { 239 goto err; 240 } 241 242 smntemp->smn_ntemps = amdzen_c_df_count(); 243 if (smntemp->smn_ntemps == 0) { 244 dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp"); 245 goto err; 246 } 247 smntemp->smn_temps = kmem_zalloc(sizeof (smntemp_temp_t) * 248 smntemp->smn_ntemps, KM_SLEEP); 249 for (i = 0; i < smntemp->smn_ntemps; i++) { 250 int ret; 251 char buf[128]; 252 253 smntemp->smn_temps[i].stt_smn = smntemp; 254 smntemp->smn_temps[i].stt_dfno = i; 255 mutex_init(&smntemp->smn_temps[i].stt_mutex, NULL, MUTEX_DRIVER, 256 NULL); 257 smntemp->smn_temps[i].stt_flags |= SMNTEMP_F_MUTEX; 258 259 if (snprintf(buf, sizeof (buf), "procnode.%u", i) >= 260 sizeof (buf)) { 261 dev_err(dip, CE_WARN, "!unexpected buffer name overrun " 262 "assembling temperature minor %u", i); 263 goto err; 264 } 265 266 if ((ret = ksensor_create(dip, &smntemp_temp_ops, 267 &smntemp->smn_temps[i], buf, DDI_NT_SENSOR_TEMP_CPU, 268 &smntemp->smn_temps[i].stt_ksensor)) != 0) { 269 dev_err(dip, CE_WARN, "!failed to create sensor %s: %d", 270 buf, ret); 271 goto err; 272 } 273 } 274 275 return (DDI_SUCCESS); 276 277 err: 278 smntemp_cleanup(smntemp); 279 return (DDI_FAILURE); 280 } 281 282 static int 283 smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 284 { 285 smntemp_t *smntemp = &smntemp_data; 286 287 if (cmd == DDI_SUSPEND) { 288 return (DDI_SUCCESS); 289 } else if (cmd != DDI_DETACH) { 290 return (DDI_FAILURE); 291 } 292 293 if (smntemp->smn_dip == NULL) { 294 dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn " 295 "instance %d that was never attached", 296 ddi_get_instance(dip)); 297 return (DDI_FAILURE); 298 } 299 300 smntemp_cleanup(smntemp); 301 return (DDI_SUCCESS); 302 } 303 304 static struct dev_ops smntemp_dev_ops = { 305 .devo_rev = DEVO_REV, 306 .devo_refcnt = 0, 307 .devo_getinfo = nodev, 308 .devo_identify = nulldev, 309 .devo_probe = nulldev, 310 .devo_attach = smntemp_attach, 311 .devo_detach = smntemp_detach, 312 .devo_reset = nodev, 313 .devo_quiesce = ddi_quiesce_not_needed, 314 }; 315 316 static struct modldrv smntemp_modldrv = { 317 .drv_modops = &mod_driverops, 318 .drv_linkinfo = "AMD SMN Temperature Driver", 319 .drv_dev_ops = &smntemp_dev_ops 320 }; 321 322 static struct modlinkage smntemp_modlinkage = { 323 .ml_rev = MODREV_1, 324 .ml_linkage = { &smntemp_modldrv, NULL } 325 }; 326 327 int 328 _init(void) 329 { 330 return (mod_install(&smntemp_modlinkage)); 331 } 332 333 int 334 _info(struct modinfo *modinfop) 335 { 336 return (mod_info(&smntemp_modlinkage, modinfop)); 337 } 338 339 int 340 _fini(void) 341 { 342 return (mod_remove(&smntemp_modlinkage)); 343 } 344