xref: /linux/drivers/hwmon/drivetemp.c (revision a634dda26186cf9a51567020fcce52bcba5e1e59)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Hwmon client for disk and solid state drives with temperature sensors
4  * Copyright (C) 2019 Zodiac Inflight Innovations
5  *
6  * With input from:
7  *    Hwmon client for S.M.A.R.T. hard disk drives with temperature sensors.
8  *    (C) 2018 Linus Walleij
9  *
10  *    hwmon: Driver for SCSI/ATA temperature sensors
11  *    by Constantin Baranov <const@mimas.ru>, submitted September 2009
12  *
 * This driver supports reporting the temperature of SATA drives. It can be
14  * easily extended to report the temperature of SCSI drives.
15  *
16  * The primary means to read drive temperatures and temperature limits
17  * for ATA drives is the SCT Command Transport feature set as specified in
18  * ATA8-ACS.
19  * It can be used to read the current drive temperature, temperature limits,
20  * and historic minimum and maximum temperatures. The SCT Command Transport
21  * feature set is documented in "AT Attachment 8 - ATA/ATAPI Command Set
22  * (ATA8-ACS)".
23  *
24  * If the SCT Command Transport feature set is not available, drive temperatures
25  * may be readable through SMART attributes. Since SMART attributes are not well
26  * defined, this method is only used as fallback mechanism.
27  *
28  * There are three SMART attributes which may report drive temperatures.
29  * Those are defined as follows (from
30  * http://www.cropel.com/library/smart-attribute-list.aspx).
31  *
32  * 190	Temperature	Temperature, monitored by a sensor somewhere inside
 *			the drive. Raw value typically holds the actual
34  *			temperature (hexadecimal) in its rightmost two digits.
35  *
36  * 194	Temperature	Temperature, monitored by a sensor somewhere inside
 *			the drive. Raw value typically holds the actual
38  *			temperature (hexadecimal) in its rightmost two digits.
39  *
40  * 231	Temperature	Temperature, monitored by a sensor somewhere inside
 *			the drive. Raw value typically holds the actual
42  *			temperature (hexadecimal) in its rightmost two digits.
43  *
44  * Wikipedia defines attributes a bit differently.
45  *
46  * 190	Temperature	Value is equal to (100-temp. °C), allowing manufacturer
47  *	Difference or	to set a minimum threshold which corresponds to a
48  *	Airflow		maximum temperature. This also follows the convention of
49  *	Temperature	100 being a best-case value and lower values being
50  *			undesirable. However, some older drives may instead
51  *			report raw Temperature (identical to 0xC2) or
52  *			Temperature minus 50 here.
53  * 194	Temperature or	Indicates the device temperature, if the appropriate
54  *	Temperature	sensor is fitted. Lowest byte of the raw value contains
55  *	Celsius		the exact temperature value (Celsius degrees).
56  * 231	Life Left	Indicates the approximate SSD life left, in terms of
57  *	(SSDs) or	program/erase cycles or available reserved blocks.
58  *	Temperature	A normalized value of 100 represents a new drive, with
59  *			a threshold value at 10 indicating a need for
60  *			replacement. A value of 0 may mean that the drive is
61  *			operating in read-only mode to allow data recovery.
62  *			Previously (pre-2010) occasionally used for Drive
63  *			Temperature (more typically reported at 0xC2).
64  *
65  * Common denominator is that the first raw byte reports the temperature
66  * in degrees C on almost all drives. Some drives may report a fractional
67  * temperature in the second raw byte.
68  *
69  * Known exceptions (from libatasmart):
 * - SAMSUNG SV0412H and SAMSUNG SV1204H report the temperature in 10th
71  *   degrees C in the first two raw bytes.
72  * - A few Maxtor drives report an unknown or bad value in attribute 194.
73  * - Certain Apple SSD drives report an unknown value in attribute 190.
74  *   Only certain firmware versions are affected.
75  *
76  * Those exceptions affect older ATA drives and are currently ignored.
77  * Also, the second raw byte (possibly reporting the fractional temperature)
78  * is currently ignored.
79  *
80  * Many drives also report temperature limits in additional SMART data raw
81  * bytes. The format of those is not well defined and varies widely.
82  * The driver does not currently attempt to report those limits.
83  *
84  * According to data in smartmontools, attribute 231 is rarely used to report
85  * drive temperatures. At the same time, several drives report SSD life left
86  * in attribute 231, but do not support temperature sensors. For this reason,
87  * attribute 231 is currently ignored.
88  *
89  * Following above definitions, temperatures are reported as follows.
 * - If SCT Command Transport is supported, it is used to read the
91  *   temperature and, if available, temperature limits.
92  * - Otherwise, if SMART attribute 194 is supported, it is used to read
93  *   the temperature.
94  * - Otherwise, if SMART attribute 190 is supported, it is used to read
95  *   the temperature.
96  */
97 
98 #include <linux/ata.h>
99 #include <linux/bits.h>
100 #include <linux/device.h>
101 #include <linux/hwmon.h>
102 #include <linux/kernel.h>
103 #include <linux/list.h>
104 #include <linux/module.h>
105 #include <linux/mutex.h>
106 #include <scsi/scsi_cmnd.h>
107 #include <scsi/scsi_device.h>
108 #include <scsi/scsi_driver.h>
109 #include <scsi/scsi_proto.h>
110 
111 struct drivetemp_data {
112 	struct list_head list;		/* list of instantiated devices */
113 	struct mutex lock;		/* protect data buffer accesses */
114 	struct scsi_device *sdev;	/* SCSI device */
115 	struct device *dev;		/* instantiating device */
116 	struct device *hwdev;		/* hardware monitoring device */
117 	u8 smartdata[ATA_SECT_SIZE];	/* local buffer */
118 	int (*get_temp)(struct drivetemp_data *st, u32 attr, long *val);
119 	bool have_temp_lowest;		/* lowest temp in SCT status */
120 	bool have_temp_highest;		/* highest temp in SCT status */
121 	bool have_temp_min;		/* have min temp */
122 	bool have_temp_max;		/* have max temp */
123 	bool have_temp_lcrit;		/* have lower critical limit */
124 	bool have_temp_crit;		/* have critical limit */
125 	int temp_min;			/* min temp */
126 	int temp_max;			/* max temp */
127 	int temp_lcrit;			/* lower critical limit */
128 	int temp_crit;			/* critical limit */
129 };
130 
131 static LIST_HEAD(drivetemp_devlist);
132 
/* SMART attribute table: number of entries scanned and temperature IDs. */
enum {
	ATA_MAX_SMART_ATTRS	= 30,
	SMART_TEMP_PROP_190	= 190,
	SMART_TEMP_PROP_194	= 194,
};

/* SCT log addresses and SMART feature codes used to access them. */
enum {
	SCT_STATUS_REQ_ADDR	= 0xe0,
	SCT_READ_LOG_ADDR	= 0xe1,
	SMART_READ_LOG		= 0xd5,
	SMART_WRITE_LOG		= 0xd6,
};

/* Byte offsets into the SCT status log. */
enum {
	SCT_STATUS_VERSION_LOW	= 0,
	SCT_STATUS_VERSION_HIGH	= 1,
	SCT_STATUS_TEMP		= 200,
	SCT_STATUS_TEMP_LOWEST	= 201,
	SCT_STATUS_TEMP_HIGHEST	= 202,
};

/* Raw byte value marking a temperature field as not valid. */
enum {
	INVALID_TEMP		= 0x80,
};

/* True unless the raw temperature byte equals INVALID_TEMP. */
#define temp_is_valid(temp)	((temp) != INVALID_TEMP)
/* Interpret the raw byte as a signed 8-bit value and scale it by 1000. */
#define temp_from_sct(temp)	(((s8)(temp)) * 1000)
151 
152 static inline bool ata_id_smart_supported(u16 *id)
153 {
154 	return id[ATA_ID_COMMAND_SET_1] & BIT(0);
155 }
156 
157 static inline bool ata_id_smart_enabled(u16 *id)
158 {
159 	return id[ATA_ID_CFS_ENABLE_1] & BIT(0);
160 }
161 
162 static int drivetemp_scsi_command(struct drivetemp_data *st,
163 				 u8 ata_command, u8 feature,
164 				 u8 lba_low, u8 lba_mid, u8 lba_high)
165 {
166 	u8 scsi_cmd[MAX_COMMAND_SIZE];
167 	enum req_op op;
168 	int err;
169 
170 	memset(scsi_cmd, 0, sizeof(scsi_cmd));
171 	scsi_cmd[0] = ATA_16;
172 	if (ata_command == ATA_CMD_SMART && feature == SMART_WRITE_LOG) {
173 		scsi_cmd[1] = (5 << 1);	/* PIO Data-out */
174 		/*
175 		 * No off.line or cc, write to dev, block count in sector count
176 		 * field.
177 		 */
178 		scsi_cmd[2] = 0x06;
179 		op = REQ_OP_DRV_OUT;
180 	} else {
181 		scsi_cmd[1] = (4 << 1);	/* PIO Data-in */
182 		/*
183 		 * No off.line or cc, read from dev, block count in sector count
184 		 * field.
185 		 */
186 		scsi_cmd[2] = 0x0e;
187 		op = REQ_OP_DRV_IN;
188 	}
189 	scsi_cmd[4] = feature;
190 	scsi_cmd[6] = 1;	/* 1 sector */
191 	scsi_cmd[8] = lba_low;
192 	scsi_cmd[10] = lba_mid;
193 	scsi_cmd[12] = lba_high;
194 	scsi_cmd[14] = ata_command;
195 
196 	err = scsi_execute_cmd(st->sdev, scsi_cmd, op, st->smartdata,
197 			       ATA_SECT_SIZE, HZ, 5, NULL);
198 	if (err > 0)
199 		err = -EIO;
200 	return err;
201 }
202 
203 static int drivetemp_ata_command(struct drivetemp_data *st, u8 feature,
204 				 u8 select)
205 {
206 	return drivetemp_scsi_command(st, ATA_CMD_SMART, feature, select,
207 				     ATA_SMART_LBAM_PASS, ATA_SMART_LBAH_PASS);
208 }
209 
210 static int drivetemp_get_smarttemp(struct drivetemp_data *st, u32 attr,
211 				  long *temp)
212 {
213 	u8 *buf = st->smartdata;
214 	bool have_temp = false;
215 	u8 temp_raw;
216 	u8 csum;
217 	int err;
218 	int i;
219 
220 	err = drivetemp_ata_command(st, ATA_SMART_READ_VALUES, 0);
221 	if (err)
222 		return err;
223 
224 	/* Checksum the read value table */
225 	csum = 0;
226 	for (i = 0; i < ATA_SECT_SIZE; i++)
227 		csum += buf[i];
228 	if (csum) {
229 		dev_dbg(&st->sdev->sdev_gendev,
230 			"checksum error reading SMART values\n");
231 		return -EIO;
232 	}
233 
234 	for (i = 0; i < ATA_MAX_SMART_ATTRS; i++) {
235 		u8 *attr = buf + i * 12;
236 		int id = attr[2];
237 
238 		if (!id)
239 			continue;
240 
241 		if (id == SMART_TEMP_PROP_190) {
242 			temp_raw = attr[7];
243 			have_temp = true;
244 		}
245 		if (id == SMART_TEMP_PROP_194) {
246 			temp_raw = attr[7];
247 			have_temp = true;
248 			break;
249 		}
250 	}
251 
252 	if (have_temp) {
253 		*temp = temp_raw * 1000;
254 		return 0;
255 	}
256 
257 	return -ENXIO;
258 }
259 
260 static int drivetemp_get_scttemp(struct drivetemp_data *st, u32 attr, long *val)
261 {
262 	u8 *buf = st->smartdata;
263 	int err;
264 
265 	err = drivetemp_ata_command(st, SMART_READ_LOG, SCT_STATUS_REQ_ADDR);
266 	if (err)
267 		return err;
268 	switch (attr) {
269 	case hwmon_temp_input:
270 		if (!temp_is_valid(buf[SCT_STATUS_TEMP]))
271 			return -ENODATA;
272 		*val = temp_from_sct(buf[SCT_STATUS_TEMP]);
273 		break;
274 	case hwmon_temp_lowest:
275 		if (!temp_is_valid(buf[SCT_STATUS_TEMP_LOWEST]))
276 			return -ENODATA;
277 		*val = temp_from_sct(buf[SCT_STATUS_TEMP_LOWEST]);
278 		break;
279 	case hwmon_temp_highest:
280 		if (!temp_is_valid(buf[SCT_STATUS_TEMP_HIGHEST]))
281 			return -ENODATA;
282 		*val = temp_from_sct(buf[SCT_STATUS_TEMP_HIGHEST]);
283 		break;
284 	default:
285 		err = -EINVAL;
286 		break;
287 	}
288 	return err;
289 }
290 
/*
 * Drive models for which SCT is not used to read the temperature.
 *
 * These drives will have WRITE FPDMA QUEUED command timeouts and sometimes just
 * freeze until power-cycled under heavy write loads when their temperature is
 * getting polled in SCT mode. The SMART mode seems to be fine, though.
 *
 * While only the 3 TB model (DT01ACA3) was actually caught exhibiting the
 * problem let's play safe here to avoid data corruption and ban the whole
 * DT01ACAx family.
 *
 * The models from this array are prefix-matched.
 */
static const char * const sct_avoid_models[] = {
	"TOSHIBA DT01ACA",
};
305 
306 static bool drivetemp_sct_avoid(struct drivetemp_data *st)
307 {
308 	struct scsi_device *sdev = st->sdev;
309 	unsigned int ctr;
310 
311 	if (!sdev->model)
312 		return false;
313 
314 	/*
315 	 * The "model" field contains just the raw SCSI INQUIRY response
316 	 * "product identification" field, which has a width of 16 bytes.
317 	 * This field is space-filled, but is NOT NULL-terminated.
318 	 */
319 	for (ctr = 0; ctr < ARRAY_SIZE(sct_avoid_models); ctr++)
320 		if (!strncmp(sdev->model, sct_avoid_models[ctr],
321 			     strlen(sct_avoid_models[ctr])))
322 			return true;
323 
324 	return false;
325 }
326 
327 static int drivetemp_identify_sata(struct drivetemp_data *st)
328 {
329 	struct scsi_device *sdev = st->sdev;
330 	u8 *buf = st->smartdata;
331 	struct scsi_vpd *vpd;
332 	bool is_ata, is_sata;
333 	bool have_sct_data_table;
334 	bool have_sct_temp;
335 	bool have_smart;
336 	bool have_sct;
337 	u16 *ata_id;
338 	u16 version;
339 	long temp;
340 	int err;
341 
342 	/* SCSI-ATA Translation present? */
343 	rcu_read_lock();
344 	vpd = rcu_dereference(sdev->vpd_pg89);
345 
346 	/*
347 	 * Verify that ATA IDENTIFY DEVICE data is included in ATA Information
348 	 * VPD and that the drive implements the SATA protocol.
349 	 */
350 	if (!vpd || vpd->len < 572 || vpd->data[56] != ATA_CMD_ID_ATA ||
351 	    vpd->data[36] != 0x34) {
352 		rcu_read_unlock();
353 		return -ENODEV;
354 	}
355 	ata_id = (u16 *)&vpd->data[60];
356 	is_ata = ata_id_is_ata(ata_id);
357 	is_sata = ata_id_is_sata(ata_id);
358 	have_sct = ata_id_sct_supported(ata_id);
359 	have_sct_data_table = ata_id_sct_data_tables(ata_id);
360 	have_smart = ata_id_smart_supported(ata_id) &&
361 				ata_id_smart_enabled(ata_id);
362 
363 	rcu_read_unlock();
364 
365 	/* bail out if this is not a SATA device */
366 	if (!is_ata || !is_sata)
367 		return -ENODEV;
368 
369 	if (have_sct && drivetemp_sct_avoid(st)) {
370 		dev_notice(&sdev->sdev_gendev,
371 			   "will avoid using SCT for temperature monitoring\n");
372 		have_sct = false;
373 	}
374 
375 	if (!have_sct)
376 		goto skip_sct;
377 
378 	err = drivetemp_ata_command(st, SMART_READ_LOG, SCT_STATUS_REQ_ADDR);
379 	if (err)
380 		goto skip_sct;
381 
382 	version = (buf[SCT_STATUS_VERSION_HIGH] << 8) |
383 		  buf[SCT_STATUS_VERSION_LOW];
384 	if (version != 2 && version != 3)
385 		goto skip_sct;
386 
387 	have_sct_temp = temp_is_valid(buf[SCT_STATUS_TEMP]);
388 	if (!have_sct_temp)
389 		goto skip_sct;
390 
391 	st->have_temp_lowest = temp_is_valid(buf[SCT_STATUS_TEMP_LOWEST]);
392 	st->have_temp_highest = temp_is_valid(buf[SCT_STATUS_TEMP_HIGHEST]);
393 
394 	if (!have_sct_data_table)
395 		goto skip_sct_data;
396 
397 	/* Request and read temperature history table */
398 	memset(buf, '\0', sizeof(st->smartdata));
399 	buf[0] = 5;	/* data table command */
400 	buf[2] = 1;	/* read table */
401 	buf[4] = 2;	/* temperature history table */
402 
403 	err = drivetemp_ata_command(st, SMART_WRITE_LOG, SCT_STATUS_REQ_ADDR);
404 	if (err)
405 		goto skip_sct_data;
406 
407 	err = drivetemp_ata_command(st, SMART_READ_LOG, SCT_READ_LOG_ADDR);
408 	if (err)
409 		goto skip_sct_data;
410 
411 	/*
412 	 * Temperature limits per AT Attachment 8 -
413 	 * ATA/ATAPI Command Set (ATA8-ACS)
414 	 */
415 	st->have_temp_max = temp_is_valid(buf[6]);
416 	st->have_temp_crit = temp_is_valid(buf[7]);
417 	st->have_temp_min = temp_is_valid(buf[8]);
418 	st->have_temp_lcrit = temp_is_valid(buf[9]);
419 
420 	st->temp_max = temp_from_sct(buf[6]);
421 	st->temp_crit = temp_from_sct(buf[7]);
422 	st->temp_min = temp_from_sct(buf[8]);
423 	st->temp_lcrit = temp_from_sct(buf[9]);
424 
425 skip_sct_data:
426 	if (have_sct_temp) {
427 		st->get_temp = drivetemp_get_scttemp;
428 		return 0;
429 	}
430 skip_sct:
431 	if (!have_smart)
432 		return -ENODEV;
433 	st->get_temp = drivetemp_get_smarttemp;
434 	return drivetemp_get_smarttemp(st, hwmon_temp_input, &temp);
435 }
436 
437 static int drivetemp_identify(struct drivetemp_data *st)
438 {
439 	struct scsi_device *sdev = st->sdev;
440 
441 	/* Bail out immediately if there is no inquiry data */
442 	if (!sdev->inquiry || sdev->inquiry_len < 16)
443 		return -ENODEV;
444 
445 	/* Disk device? */
446 	if (sdev->type != TYPE_DISK && sdev->type != TYPE_ZBC)
447 		return -ENODEV;
448 
449 	return drivetemp_identify_sata(st);
450 }
451 
452 static int drivetemp_read(struct device *dev, enum hwmon_sensor_types type,
453 			 u32 attr, int channel, long *val)
454 {
455 	struct drivetemp_data *st = dev_get_drvdata(dev);
456 	int err = 0;
457 
458 	if (type != hwmon_temp)
459 		return -EINVAL;
460 
461 	switch (attr) {
462 	case hwmon_temp_input:
463 	case hwmon_temp_lowest:
464 	case hwmon_temp_highest:
465 		mutex_lock(&st->lock);
466 		err = st->get_temp(st, attr, val);
467 		mutex_unlock(&st->lock);
468 		break;
469 	case hwmon_temp_lcrit:
470 		*val = st->temp_lcrit;
471 		break;
472 	case hwmon_temp_min:
473 		*val = st->temp_min;
474 		break;
475 	case hwmon_temp_max:
476 		*val = st->temp_max;
477 		break;
478 	case hwmon_temp_crit:
479 		*val = st->temp_crit;
480 		break;
481 	default:
482 		err = -EINVAL;
483 		break;
484 	}
485 	return err;
486 }
487 
488 static umode_t drivetemp_is_visible(const void *data,
489 				   enum hwmon_sensor_types type,
490 				   u32 attr, int channel)
491 {
492 	const struct drivetemp_data *st = data;
493 
494 	switch (type) {
495 	case hwmon_temp:
496 		switch (attr) {
497 		case hwmon_temp_input:
498 			return 0444;
499 		case hwmon_temp_lowest:
500 			if (st->have_temp_lowest)
501 				return 0444;
502 			break;
503 		case hwmon_temp_highest:
504 			if (st->have_temp_highest)
505 				return 0444;
506 			break;
507 		case hwmon_temp_min:
508 			if (st->have_temp_min)
509 				return 0444;
510 			break;
511 		case hwmon_temp_max:
512 			if (st->have_temp_max)
513 				return 0444;
514 			break;
515 		case hwmon_temp_lcrit:
516 			if (st->have_temp_lcrit)
517 				return 0444;
518 			break;
519 		case hwmon_temp_crit:
520 			if (st->have_temp_crit)
521 				return 0444;
522 			break;
523 		default:
524 			break;
525 		}
526 		break;
527 	default:
528 		break;
529 	}
530 	return 0;
531 }
532 
533 static const struct hwmon_channel_info * const drivetemp_info[] = {
534 	HWMON_CHANNEL_INFO(chip,
535 			   HWMON_C_REGISTER_TZ),
536 	HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT |
537 			   HWMON_T_LOWEST | HWMON_T_HIGHEST |
538 			   HWMON_T_MIN | HWMON_T_MAX |
539 			   HWMON_T_LCRIT | HWMON_T_CRIT),
540 	NULL
541 };
542 
543 static const struct hwmon_ops drivetemp_ops = {
544 	.is_visible = drivetemp_is_visible,
545 	.read = drivetemp_read,
546 };
547 
548 static const struct hwmon_chip_info drivetemp_chip_info = {
549 	.ops = &drivetemp_ops,
550 	.info = drivetemp_info,
551 };
552 
553 /*
554  * The device argument points to sdev->sdev_dev. Its parent is
555  * sdev->sdev_gendev, which we can use to get the scsi_device pointer.
556  */
557 static int drivetemp_add(struct device *dev)
558 {
559 	struct scsi_device *sdev = to_scsi_device(dev->parent);
560 	struct drivetemp_data *st;
561 	int err;
562 
563 	st = kzalloc(sizeof(*st), GFP_KERNEL);
564 	if (!st)
565 		return -ENOMEM;
566 
567 	st->sdev = sdev;
568 	st->dev = dev;
569 	mutex_init(&st->lock);
570 
571 	if (drivetemp_identify(st)) {
572 		err = -ENODEV;
573 		goto abort;
574 	}
575 
576 	st->hwdev = hwmon_device_register_with_info(dev->parent, "drivetemp",
577 						    st, &drivetemp_chip_info,
578 						    NULL);
579 	if (IS_ERR(st->hwdev)) {
580 		err = PTR_ERR(st->hwdev);
581 		goto abort;
582 	}
583 
584 	list_add(&st->list, &drivetemp_devlist);
585 	return 0;
586 
587 abort:
588 	kfree(st);
589 	return err;
590 }
591 
592 static void drivetemp_remove(struct device *dev)
593 {
594 	struct drivetemp_data *st, *tmp;
595 
596 	list_for_each_entry_safe(st, tmp, &drivetemp_devlist, list) {
597 		if (st->dev == dev) {
598 			list_del(&st->list);
599 			hwmon_device_unregister(st->hwdev);
600 			kfree(st);
601 			break;
602 		}
603 	}
604 }
605 
606 static struct class_interface drivetemp_interface = {
607 	.add_dev = drivetemp_add,
608 	.remove_dev = drivetemp_remove,
609 };
610 
611 static int __init drivetemp_init(void)
612 {
613 	return scsi_register_interface(&drivetemp_interface);
614 }
615 
616 static void __exit drivetemp_exit(void)
617 {
618 	scsi_unregister_interface(&drivetemp_interface);
619 }
620 
621 module_init(drivetemp_init);
622 module_exit(drivetemp_exit);
623 
624 MODULE_AUTHOR("Guenter Roeck <linus@roeck-us.net>");
625 MODULE_DESCRIPTION("Hard drive temperature monitor");
626 MODULE_LICENSE("GPL");
627 MODULE_ALIAS("platform:drivetemp");
628