xref: /freebsd/sys/contrib/openzfs/cmd/zpool/zpool.d/defect (revision e64fe029e9d3ce476e77a478318e0c3cd201ff08)
1#!/bin/sh
2#
3# Show SMART stats
4#
5
6helpstr="
7smart:		Show SMART temperature and error stats (specific to drive type)
8smartx:		Show SMART extended drive stats (specific to drive type).
9temp:		Show SMART drive temperature in celsius (all drives).
10health:		Show reported SMART status (all drives).
11r_proc:		Show SMART read GBytes processed over drive lifetime (SAS).
12w_proc:		Show SMART write GBytes processed over drive lifetime (SAS).
13r_ucor:		Show SMART read uncorrectable errors (SAS).
14w_ucor:		Show SMART write uncorrectable errors (SAS).
15nonmed:		Show SMART non-medium errors (SAS).
16defect:		Show SMART grown defect list (SAS).
17hours_on:	Show number of hours drive powered on (all drives).
18realloc:	Show SMART reallocated sectors count (ATA).
19rep_ucor:	Show SMART reported uncorrectable count (ATA).
20cmd_to:		Show SMART command timeout count (ATA).
21pend_sec:	Show SMART current pending sector count (ATA).
22off_ucor:	Show SMART offline uncorrectable errors (ATA).
23ata_err:	Show SMART ATA errors (ATA).
24pwr_cyc:	Show SMART power cycle count (ATA).
25serial:		Show disk serial number.
26nvme_err:	Show SMART NVMe errors (NVMe).
27smart_test:	Show SMART self-test results summary.
28test_type:	Show SMART self-test type (short, long... ).
29test_status:	Show SMART self-test status.
30test_progress:	Show SMART self-test percentage done.
31test_ended:	Show when the last SMART self-test ended (if supported).
32"
33
34# Hack for developer testing
35#
36# If you set $samples to a directory containing smartctl output text files,
37# we will use them instead of running smartctl on the vdevs.  This can be
38# useful if you want to test a bunch of different smartctl outputs.  Also, if
39# $samples is set, and additional 'file' column is added to the zpool output
40# showing the filename.
41samples=
42
# get_filename_from_dir DIR
#
# Look in directory DIR and print the name (without the directory part) of
# one regular file that sits directly in it.  The file is chosen
# quasi-sequentially (based off our PID).  This allows us to return a
# different filename every time this script is invoked (which we do for each
# vdev), without having to maintain state.
#
# Only files directly inside DIR are counted and considered, so the printed
# name can always be opened as DIR/name.  If DIR contains no regular files,
# nothing is printed and 1 is returned.
get_filename_from_dir()
{
	dir=$1
	pid="$$"
	num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
	# With no candidates the modulo below would divide by zero.
	if [ "$((num_files))" -eq 0 ] ; then
		return 1
	fi
	mod=$((pid % num_files))
	i=0
	# Use the same -maxdepth as the count above so that $mod always indexes
	# a file we counted.  The directory part is stripped in the shell
	# rather than with find's -printf, which not every find(1) provides.
	find "$dir" -maxdepth 1 -type f | while IFS= read -r file ; do
		if [ "$mod" = "$i" ] ; then
			echo "${file##*/}"
			break
		fi
		i=$((i+1))
	done
}
64
# Name we were invoked as, with any leading directories removed.  It decides
# which statistic(s) get reported further down.
script="${0##*/}"

# "-h": print only the description that belongs to our name in $helpstr.
case "$1" in
-h)
	echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
	exit
	;;
esac

# Use VDEV_UPATH when it names a block device.  Sometimes UPATH ends up as
# /dev/(null); that should be corrected, but for now fall back to VDEV_PATH.
# shellcheck disable=SC2154
if [ -b "$VDEV_UPATH" ]; then
	somepath="${VDEV_UPATH}"
else
	somepath="${VDEV_PATH}"
fi
80
# Gather the raw smartctl report and reduce it to "key=value" lines in $out.
#
# This runs when $somepath is a block device and smartctl can be found (also
# looking in /usr/sbin), or whenever the developer-testing $samples directory
# is set.  Otherwise $out is left untouched and the defaults further down
# apply.
if [ -b "$somepath" ] && PATH="/usr/sbin:$PATH" command -v smartctl > /dev/null || [ -n "$samples" ] ; then
	if [ -n "$samples" ] ; then
		# cat a smartctl output text file instead of running smartctl
		# on a vdev (only used for developer testing).  The chosen
		# filename is printed right away as the extra 'file' column.
		file=$(get_filename_from_dir "$samples")
		echo "file=$file"
		raw_out=$(cat "$samples/$file")
	else
		raw_out=$(sudo smartctl -a "$somepath")
	fi

	# What kind of drive are we?  Look for the right line in smartctl:
	#
	# SAS:
	#	Transport protocol:   SAS
	#
	# SATA:
	#	ATA Version is:   8
	#
	# NVMe:
	#       SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
	#
	# The awk program below prints one "key=value" line for every
	# statistic it recognizes.  Every rule is tried against every input
	# line, so the drive type reported by the final "type=" line is the
	# one set by the last line that mentioned SAS, SATA or NVMe.  The
	# self-test rule ("# 1") relies on hours_on and type having been set
	# by earlier lines of the report; hours_on is only remembered by the
	# SAS and SATA rules.
	out=$(echo "$raw_out" | awk '
# SAS specific
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
/Non-medium error count/{print "nonmed="$4}
/Elements in grown defect list/{print "defect="$6}

# SAS common
/SAS/{type="sas"}
/Drive Temperature:/{print "temp="$4}
# Status can be a long string, substitute spaces for '_'
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
/Serial number:/{print "serial="$3}

# SATA specific
/Reallocated_Sector_Ct/{print "realloc="$10}
/Reported_Uncorrect/{print "rep_ucor="$10}
/Command_Timeout/{print "cmd_to="$10}
/Current_Pending_Sector/{print "pend_sec="$10}
/Offline_Uncorrectable/{print "off_ucor="$10}
/ATA Error Count:/{print "ata_err="$4}
/Power_Cycle_Count/{print "pwr_cyc="$10}

# SATA common
/SATA/{type="sata"}
/Temperature_Celsius/{print "temp="$10}
/Airflow_Temperature_Cel/{print "temp="$10}
/Current Temperature:/{print "temp="$3}
/SMART overall-health self-assessment test result:/{print "health="$6}
/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
/Serial Number:/{print "serial="$3}

# NVMe common
/NVMe/{type="nvme"}
/Temperature:/{print "temp="$2}
/SMART overall-health self-assessment test result:/{print "health="$6}
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
/Serial Number:/{print "serial="$3}
/Power Cycles:/{print "pwr_cyc="$3}

# NVMe specific
/Media and Data Integrity Errors:/{print "nvme_err="$6}

# SMART self-test info
/Self-test execution status:/{progress=tolower($4)} # SAS
/SMART Self-test log/{test_seen=1} # SAS
/SMART Extended Self-test Log/{test_seen=1} # SATA
/# 1/{
	test_type=tolower($3"_"$4);
	# Status could be one word ("Completed") or multiple ("Completed: read
	# failure").  Look for the ":" to see if we need to grab more words.

	if ($5 ~ ":")
		status=tolower($5""$6"_"$7)
	else
		status=tolower($5)
	if (status=="self")
		status="running";

	if (type == "sas") {
		hours=int($(NF-4))
	} else {
		hours=int($(NF-1))
		# SATA reports percent remaining, rather than percent done
		# Convert it to percent done.
		progress=(100-int($(NF-2)))"%"
	}
	# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
	# 0.  In those cases, set it to hours_on, so they will cancel out in
	# the "hours_ago" calculation later on.
	if (hours == 0)
		hours=hours_on

	if (test_seen) {
		print "test="hours_on
		print "test_type="test_type
		print "test_status="status
		print "test_progress="progress
	}
	# Not all drives report hours_on
	if (hours_on && hours) {
		total_hours_ago=(hours_on-hours)
		days_ago=int(total_hours_ago/24)
		hours_ago=(total_hours_ago % 24)
		if (days_ago != 0)
			ago_str=days_ago"d"
		if (hours_ago !=0)
			ago_str=ago_str""hours_ago"h"
		print "test_ended="ago_str
	}
}

END {print "type="type; ORS="\n"; print ""}
');
fi
# Recover the drive type from the "type=" line of the collected output.
type=$(echo "$out" | awk -F '=' '/^type=/ {print $2}')

# No type at this point means we either had no block device or smartctl
# failed.  In both cases discard whatever is in $out and pretend the drive
# is ATA.
if [ -z "$type" ]; then
	out=
	type="sata"
fi
208
# Translate the name we were invoked as into the '|'-separated list of
# columns to report.  The summary names expand to a per-drive-type set; any
# other name is reported as a single column of that name.
case "$script" in
smart)
	# Temperature plus the usual early indicators of a failing drive
	case "$type" in
	sas)
		scripts="temp|health|r_ucor|w_ucor"
		;;
	sata)
		scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
		;;
	nvme)
		scripts="temp|health|nvme_err"
		;;
	esac
	;;
smartx)
	# A few more statistics that are interesting to look at
	case "$type" in
	sas)
		scripts="hours_on|defect|nonmed|r_proc|w_proc"
		;;
	sata)
		scripts="hours_on|pwr_cyc"
		;;
	nvme)
		scripts="hours_on|pwr_cyc"
		;;
	esac
	;;
smart_test)
	scripts="test_type|test_status|test_progress|test_ended"
	;;
*)
	scripts="$script"
	;;
esac
236
# print_columns OUT COLS
#
# OUT is the newline-separated "key=value" text collected from smartctl and
# COLS is a '|'-separated list of column names.  Print every line of OUT
# whose key is one of COLS, followed by an empty "name=" line for each column
# in COLS that has no value, so every requested column is always reported.
#
# Keys are compared exactly (the whole text before the first '='), so a
# column name that merely shows up inside a value or inside a longer key
# neither selects nor suppresses a line.
print_columns()
{
	with_vals=$(echo "$1" | grep -E "^($2)=")
	if [ -n "$with_vals" ]; then
		echo "$with_vals"
		# -x -F: drop a requested column only if its name is exactly
		# one of the keys we just printed.
		without_vals=$(echo "$2" | tr '|' '\n' |
			grep -v -x -F -e "$(echo "$with_vals" |
			awk -F "=" '{print $1}')" | awk '{print $0"="}')
	else
		without_vals=$(echo "$2" | tr '|' '\n' | awk '{print $0"="}')
	fi

	if [ -n "$without_vals" ]; then
		echo "$without_vals"
	fi
}

print_columns "$out" "$scripts"
250