#!/bin/sh
#
# Show SMART stats
#
# This script is invoked under several names (see $helpstr); the name it is
# called by selects which SMART values are printed.  Output is one
# "column=value" line per requested column; columns for which no value was
# found are printed as "column=".
#
# The device to query is taken from $VDEV_UPATH, falling back to $VDEV_PATH
# when $VDEV_UPATH is not a block device.

helpstr="
smart:	Show SMART temperature and error stats (specific to drive type)
smartx:	Show SMART extended drive stats (specific to drive type).
temp:	Show SMART drive temperature in celsius (all drives).
health:	Show reported SMART status (all drives).
r_proc:	Show SMART read GBytes processed over drive lifetime (SAS).
w_proc:	Show SMART write GBytes processed over drive lifetime (SAS).
r_ucor:	Show SMART read uncorrectable errors (SAS).
w_ucor:	Show SMART write uncorrectable errors (SAS).
nonmed:	Show SMART non-medium errors (SAS).
defect:	Show SMART grown defect list (SAS).
hours_on:	Show number of hours drive powered on (all drives).
realloc:	Show SMART reallocated sectors count (ATA).
rep_ucor:	Show SMART reported uncorrectable count (ATA).
cmd_to:	Show SMART command timeout count (ATA).
pend_sec:	Show SMART current pending sector count (ATA).
off_ucor:	Show SMART offline uncorrectable errors (ATA).
ata_err:	Show SMART ATA errors (ATA).
pwr_cyc:	Show SMART power cycle count (ATA).
serial:	Show disk serial number.
nvme_err:	Show SMART NVMe errors (NVMe).
smart_test:	Show SMART self-test results summary.
test_type:	Show SMART self-test type (short, long... ).
test_status:	Show SMART self-test status.
test_progress:	Show SMART self-test percentage done.
test_ended:	Show when the last SMART self-test ended (if supported).
"

# Hack for developer testing
#
# If you set $samples to a directory containing smartctl output text files,
# we will use them instead of running smartctl on the vdevs.  This can be
# useful if you want to test a bunch of different smartctl outputs.  Also, if
# $samples is set, an additional 'file' column is added to the zpool output
# showing the filename.
samples=

# get_filename_from_dir DIR
#
# Look in directory DIR (top level only) and print the name of one regular
# file from it.  The file is chosen quasi-sequentially (based off our PID)
# from the sorted listing.  This allows us to return a different filename
# every time this script is invoked (which we do for each vdev), without
# having to maintain state.
#
# Prints nothing and returns 1 if DIR contains no regular files.
get_filename_from_dir()
{
	gf_dir=$1

	# Use one listing for both the count and the selection so the two can
	# never disagree.
	gf_listing=$(find "$gf_dir" -maxdepth 1 -type f | LC_ALL=C sort)
	if [ -z "$gf_listing" ] ; then
		# Nothing to pick from; also avoids a modulo-by-zero below.
		return 1
	fi

	gf_count=$(printf '%s\n' "$gf_listing" | wc -l)
	gf_index=$(($$ % gf_count + 1))
	gf_path=$(printf '%s\n' "$gf_listing" | sed -n "${gf_index}p")

	# Strip the directory part; callers want the bare filename.
	printf '%s\n' "${gf_path##*/}"
}

# print_help NAME
#
# Print the description from $helpstr for the entry called NAME.  The entry
# is matched on its "NAME:" prefix at the start of a line, and whatever
# blanks separate the name from the description are removed.
print_help()
{
	printf '%s\n' "$helpstr" | awk -v key="$1:" '
		index($0, key) == 1 {
			sub(/^[^:]*:[ \t]*/, "")
			print
		}'
}

# parse_smartctl
#
# Read "smartctl -a" output on stdin and print "key=value" lines for every
# value recognised, followed by a final "type=" line naming the detected
# drive type (sas, sata, nvme, or empty if none was detected).
#
# What kind of drive are we?  Look for the right line in smartctl:
#
# SAS:
#	Transport protocol:   SAS
#
# SATA:
#	ATA Version is:   8
#
# NVMe:
#	SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
#
# NOTE: the awk program is inside shell single quotes, so it must not
# contain any single quote characters (comments included).
parse_smartctl()
{
	awk '
# SAS specific
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
/Non-medium error count/{print "nonmed="$4}
/Elements in grown defect list/{print "defect="$6}

# SAS common
/SAS/{type="sas"}
/Drive Temperature:/{print "temp="$4}
# Status can be a long string; join its words with underscores
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
/Serial number:/{print "serial="$3}

# SATA specific
/Reallocated_Sector_Ct/{print "realloc="$10}
/Reported_Uncorrect/{print "rep_ucor="$10}
/Command_Timeout/{print "cmd_to="$10}
/Current_Pending_Sector/{print "pend_sec="$10}
/Offline_Uncorrectable/{print "off_ucor="$10}
/ATA Error Count:/{print "ata_err="$4}
/Power_Cycle_Count/{print "pwr_cyc="$10}

# SATA common
/SATA/{type="sata"}
/Temperature_Celsius/{print "temp="$10}
/Airflow_Temperature_Cel/{print "temp="$10}
/Current Temperature:/{print "temp="$3}
/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}

# SATA and NVMe common.  Each of these lines is handled by exactly one
# rule so that the value is only printed once.
/SMART overall-health self-assessment test result:/{print "health="$6}
/Serial Number:/{print "serial="$3}

# NVMe common
/NVMe/{type="nvme"}
# Anchored to the start of the line so that it does not also fire on
# "Current Drive Temperature:" (SAS) or "Current Temperature:" (ATA).
/^[ \t]*Temperature:/{print "temp="$2}
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
/Power Cycles:/{print "pwr_cyc="$3}

# NVMe specific
/Media and Data Integrity Errors:/{print "nvme_err="$6}

# SMART self-test info
/Self-test execution status:/{progress=tolower($4)}	# SAS
/SMART Self-test log/{test_seen=1}			# SAS
/SMART Extended Self-test Log/{test_seen=1}		# SATA
/# 1/{
	test_type=tolower($3"_"$4);
	# Status could be one word ("Completed") or multiple ("Completed: read
	# failure").  Look for the ":" to see if we need to grab more words.

	if ($5 ~ ":")
		status=tolower($5""$6"_"$7)
	else
		status=tolower($5)
	if (status=="self")
		status="running";

	if (type == "sas") {
		hours=int($(NF-4))
	} else {
		hours=int($(NF-1))
		# SATA reports percent remaining, rather than percent done
		# Convert it to percent done.
		progress=(100-int($(NF-2)))"%"
	}
	# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
	# 0.  In those cases, set it to hours_on, so they will cancel out in
	# the "hours_ago" calculation later on.
	if (hours == 0)
		hours=hours_on

	if (test_seen) {
		print "test="hours_on
		print "test_type="test_type
		print "test_status="status
		print "test_progress="progress
	}
	# Not all drives report hours_on
	if (hours_on && hours) {
		total_hours_ago=(hours_on-hours)
		days_ago=int(total_hours_ago/24)
		hours_ago=(total_hours_ago % 24)
		if (days_ago != 0)
			ago_str=days_ago"d"
		if (hours_ago !=0)
			ago_str=ago_str""hours_ago"h"
		print "test_ended="ago_str
	}
}

END {print "type="type}
'
}

# print_columns OUT NAMES
#
# OUT is a newline-separated list of "key=value" lines and NAMES is a
# "|"-separated list of column names.  Print every line of OUT whose key is
# one of NAMES, then print "name=" for every name in NAMES (in the order
# given) that has no line in OUT.  Only the key is compared, so a value that
# happens to contain a column name is not mistaken for that column.
print_columns()
{
	pc_out=$1
	pc_names=$2

	pc_found=$(printf '%s\n' "$pc_out" | grep -E "^($pc_names)=")
	if [ -n "$pc_found" ] ; then
		printf '%s\n' "$pc_found"
	fi

	printf '%s\n' "$pc_found" | awk -F '=' -v names="$pc_names" '
		{ seen[$1] = 1 }
		END {
			n = split(names, list, "|")
			for (i = 1; i <= n; i++)
				if (!(list[i] in seen))
					print list[i] "="
		}'
}

script="${0##*/}"

if [ "$1" = "-h" ] ; then
	print_help "$script"
	exit
fi

# Sometimes, UPATH ends up /dev/(null).
# That should be corrected, but for now...
# shellcheck disable=SC2154
if [ -b "$VDEV_UPATH" ] ; then
	somepath="${VDEV_UPATH}"
else
	somepath="${VDEV_PATH}"
fi

# Start from a known-empty value so nothing is inherited from the
# environment when neither branch below runs.
out=
if [ -n "$samples" ] ; then
	# cat a smartctl output text file instead of running smartctl
	# on a vdev (only used for developer testing).
	file=$(get_filename_from_dir "$samples")
	echo "file=$file"
	raw_out=
	if [ -n "$file" ] ; then
		raw_out=$(cat "$samples/$file")
	fi
	out=$(printf '%s\n' "$raw_out" | parse_smartctl)
elif [ -b "$somepath" ] &&
    PATH="/usr/sbin:$PATH" command -v smartctl > /dev/null ; then
	raw_out=$(sudo smartctl -a "$somepath")
	out=$(printf '%s\n' "$raw_out" | parse_smartctl)
fi
type=$(printf '%s\n' "$out" | grep '^type=' | cut -d '=' -f 2)

# If type is not set by now, either we don't have a block device
# or smartctl failed.  Either way, default to ATA and set $out to
# nothing.
if [ -z "$type" ] ; then
	type="sata"
	out=
fi

case $script in
smart)
	# Print temperature plus common predictors of drive failure
	if [ "$type" = "sas" ] ; then
		scripts="temp|health|r_ucor|w_ucor"
	elif [ "$type" = "sata" ] ; then
		scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
	elif [ "$type" = "nvme" ] ; then
		scripts="temp|health|nvme_err"
	fi
	;;
smartx)
	# Print some other interesting stats
	if [ "$type" = "sas" ] ; then
		scripts="hours_on|defect|nonmed|r_proc|w_proc"
	elif [ "$type" = "sata" ] ; then
		scripts="hours_on|pwr_cyc"
	elif [ "$type" = "nvme" ] ; then
		scripts="hours_on|pwr_cyc"
	fi
	;;
smart_test)
	scripts="test_type|test_status|test_progress|test_ended"
	;;
*)
	scripts="$script"
	;;
esac

print_columns "$out" "$scripts"