诺诺博客

  • 概述:
  • 无RAID组:
  • smartctl:
  • Shell script:
  • 有RAID组:
  •  主 页
  •  Linux
  •  微 软
  •  信 创
  •  虚 拟
  •  网 络
  •  生 活
  •  归 档
  •  友 链
  •  关 于

Linux 操作系统下查看硬盘健康状态

  • 诺诺
  • 2024-09-18
  • 0

概述:

这里介绍两种场景下的三种检测方式,这两种场景也比较常见:服务器没做 RAID 组和做了 RAID 组,两者的检测方式是不同的。没做 RAID 时检查比较简单,可以直接使用 smartctl 来检查 /dev/sd* 的健康状态,或者使用我提供的脚本来快速批量检测;如果服务器做了 RAID,上述方式就不管用了,这时可以试试本文推荐的方式。

无RAID组:

smartctl:

# Install the smartmontools package (non-interactive), then ask smartctl for
# the overall health status of /dev/sda.
yum -y install smartmontools
smartctl --health /dev/sda

Shell script:

#!/usr/bin/env bash
# smart-check.sh
#
# Purpose:
#   simple bash script that parses the output provided by smartctl
#   to show relevant information about predictive failures
#   that we actually care about.
#
# Original Author: James Greig
# Licence: GNU GPL v3

VERSION=02/01/2024

# Search /usr/local/sbin before the inherited PATH entries.
PATH=/usr/local/sbin:$PATH
export PATH

#######################################
# Parse the script's command-line flags.
# Globals:
#   DEBUG   - set to 1 by -d/--debug
#   HP      - set to 1 by -hp/--hp
#   VERSION - printed by -v/--version
# Arguments:
#   the command-line arguments of the script
# Outputs:
#   version or help text on stdout; the unknown-argument error on stderr
# Returns:
#   0 after consuming all flags. Exits the shell for -u, -v, -h and for any
#   unrecognised argument (non-zero in the latter case).
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -d|--debug)
        DEBUG=1
        ;;
      -hp|--hp)
        HP=1
        ;;
      -u|--update)
        wget -O /usr/sbin/smart-check.sh https://mirrors.host-it.co.uk/files/smart-check.sh
        # Report a failed download instead of always claiming success.
        exit $?
        ;;
      -v|--version)
        echo "$VERSION"
        exit 0
        ;;
      -h|--help)
        printf "\r\n-d debug\r\n-hp check HP cciss devices\r\n-u update\r\n-h help\r\n-v version\r\n"
        exit 0
        ;;
      *)
        # A bad flag is an error: diagnose on stderr and fail, so callers
        # (cron, monitoring) do not mistake it for a clean run.
        printf "Unknown Argument \r\n" >&2
        exit 1
        ;;
    esac
    shift
  done
}

POSITIONAL=()
parse_args "$@"
set -- "${POSITIONAL[@]}" # restore positional parameters

#What system is this?
# 3ware? smartctl -a -d 3ware,1 /dev/twl0 -T permissive
# for i in `ls /dev/sd*|egrep '^(\/)dev(\/)sd[a-z]$'`; do smartctl -a $i | egrep -i 'serial|Reallocated_Sector|wear_leveling'; done
# lsi megaraid:-  smartctl -a -d megaraid,6 /dev/sda -T permissive

#disks=($(ls /dev/sd*|egrep '^(\/)dev(\/)sd[a-z]$'))

# Serial numbers of drives that have already been examined. smartcheck uses
# this to skip a drive that shows up again under another device name.
SERIALS=()

# Print $1 on a red background (a failed check).
_smart_fail() { echo -e "\e[0;41m$1\e[0m"; }

# Print $1 on a green background (a check that passed; debug output).
_smart_pass() { echo -e "\e[30;42m$1\e[0m"; }

# Print $1 on a yellow background (informational only; debug output).
_smart_note() { echo -e "\e[30;43m$1\e[0m"; }

#######################################
# Report one error counter.
# Globals:   DEBUG (read)
# Arguments: $1 - counter value as extracted from smartctl (may be empty)
#            $2 - message to print
# Returns:   1 when the counter is greater than zero, otherwise 0
#######################################
_smart_counter() {
  local value=$1 message=$2
  if [[ ${value} -gt 0 ]]; then
    _smart_fail "${message}"
    return 1
  elif [[ -n ${DEBUG} && ${value} =~ ^[0-9]+$ ]]; then
    _smart_pass "${message}"
  fi
  return 0
}

#######################################
# Run smartctl against one device and report predictive-failure indicators.
# Globals:
#   SERIALS - read and appended to (de-duplication by serial number)
#   DEBUG   - when non-empty, healthy values are printed as well
#   sas     - set to 1 when a SAS read/write/verify error counter is non-zero
# Arguments:
#   $1 - device path, optionally combined with further smartctl options in
#        the same string, e.g. "-d megaraid,3 /dev/sda -T permissive"
# Outputs:
#   colour-coded findings on stdout
# Returns:
#   the number of failed checks; 0 when nothing was found or when the drive's
#   serial number has been seen before
#######################################
function smartcheck()
{
  local error=0
  local smartoutput serial seen var pair

  # $1 is deliberately left unquoted: callers pass strings such as
  # "-d 3ware,0 /dev/twl0 -T permissive" that must split into separate words.
  # shellcheck disable=SC2086
  smartoutput=$(smartctl -v 1,raw48:54 -v 7,raw48:54 -v 195,raw48:54 -a $1)

  # Obtain the serial (first match only, so the value is always one word).
  serial=$(printf '%s\n' "$smartoutput" \
    | grep -E "Serial Number:|Serial number:" \
    | grep -v "\[No" \
    | awk '{print $3}' \
    | head -n 1)

  if [[ -n ${serial} ]]; then
    # Compare whole serials. A substring test would wrongly skip a drive
    # whose serial happens to be contained in an earlier one.
    for seen in "${SERIALS[@]}"; do
      if [[ ${seen} == "${serial}" ]]; then
        # we've seen this serial before from another device
        return 0
      fi
    done
    SERIALS+=("${serial}")
    if [[ -n ${DEBUG} ]]; then
      echo -e "---"
      echo -e "\e[0;40mScanning $1 Serial: ${serial} \e[0m"
    fi
  fi

  # SSD Wear Level (normalised VALUE column). "+ 0" turns the zero-padded
  # field (e.g. 008) into a plain decimal so bash does not read it as octal.
  var=$(printf '%s\n' "$smartoutput" \
    | grep -E -i "177 Wear_Leveling|231 SSD_Life_Left|^173 Un|233 Media_Wearout_" \
    | awk '{print $4 + 0}' \
    | head -n 1)
  if [[ ${var} -gt 0 && ${var} -lt 20 ]]; then
    _smart_fail "$1 is at ${var}% SSD wear"
    error=$((error + 1))
  elif [[ -n ${DEBUG} && ${var} -gt 0 ]]; then
    _smart_pass "$1 is at ${var}% SSD wear"
  fi

  # Reallocated sectors on SATA drives (raw value)
  var=$(printf '%s\n' "$smartoutput" | grep Reallocated_Sector | awk '{print $10}')
  _smart_counter "$var" "$1 has $var Sector Errors" || error=$((error + 1))

  # Early Warning Offline_Uncorrectable (raw value)
  var=$(printf '%s\n' "$smartoutput" | grep Offline_Uncorrectable | awk '{print $10}')
  _smart_counter "$var" "$1 has $var Offline Uncorrectable Errors" || error=$((error + 1))

  # Early Warning Raw_Read_Error_Rate: only values above 10 count as an error
  var=$(printf '%s\n' "$smartoutput" \
    | grep -E -i "1 Raw_Read_Error_Rate" \
    | awk '{print $10 + 0}')
  if [[ ${var} -gt 10 ]]; then
    _smart_fail "$1 has a Read Error Rate of ${var}"
    error=$((error + 1))
  elif [[ -n ${DEBUG} && ${var} -gt 0 ]]; then
    _smart_pass "$1 has a Read Error Rate of ${var} "
  fi

  # SAS read / write / verify errors (8th column of the matching log line).
  # Each pair is "<line prefix>:<label used in the message>".
  for pair in read:Read write:Write verify:Verify; do
    var=$(printf '%s\n' "$smartoutput" | grep "${pair%%:*}:" | awk '{print $8}')
    if ! _smart_counter "$var" "$1 $var SAS ${pair##*:} Errors"; then
      sas=1
      error=$((error + 1))
    fi
  done

  # SAS post factory defects: shown in debug mode only, never counted
  var=$(printf '%s\n' "$smartoutput" \
    | grep -i "grown defect" \
    | sed 's/Elements in grown defect list: //' \
    | grep -iv "not available")
  if [[ -n ${DEBUG} ]]; then
    if [[ ${var} -gt 0 ]]; then
      _smart_note "$1 $var SAS accumulated defects"
    elif [[ ${var} =~ ^[0-9]+$ ]]; then
      _smart_pass "$1 $var SAS accumulated defects"
    fi
  fi

  # Power On Errors at the bottom of smart output
  var=$(printf '%s\n' "$smartoutput" | grep -i "Error 1 occurred at disk power-on lifetime")
  if [[ ${#var} -gt 0 ]]; then
    _smart_fail "$1 Disk is showing signs of power-on errors"
    error=$((error + 1))
  fi

  # ATA errors: shown in debug mode only, never counted. Keep just the
  # number; smartctl may append explanatory text after the count, which
  # would otherwise make the value non-numeric.
  var=$(printf '%s\n' "$smartoutput" \
    | grep -i "ATA Error Count:" \
    | sed 's/ATA Error Count: *\([0-9][0-9]*\).*/\1/')
  if [[ -n ${DEBUG} ]]; then
    if [[ ${var} -gt 0 ]]; then
      _smart_note "$1 $var ATA Errors"
    elif [[ ${var} =~ ^[0-9]+$ ]]; then
      _smart_pass "$1 $var ATA Errors"
    fi
  fi

  return "$error"
}
# Abort right away unless `command -v smartctl` resolves to an executable file.
test -x "$(command -v smartctl)" || {
        echo "Error: Smartctl command not found"
        exit 1
}

# Running total of the failed checks reported by every smartcheck call.
agerror=0

# Check disks attached to the board directly or in passthrough.
# The name pattern accepts sda..sdz and also sdaa, sdab, ... so hosts with more
# than 26 disks are covered; partitions (sda1) are excluded. An unmatched glob
# stays literal and is rejected by the -b / -c test.
if [[ -n ${DEBUG} ]]
then
  echo 'Checking directly attached drives'
fi
for i in /dev/sd*
do
        [[ -b ${i} && ${i} =~ ^/dev/sd[a-z]+$ ]] || continue
        smartcheck "$i" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
done

# Check disks attached to the board directly or in passthrough (BSD)
if [[ -n ${DEBUG} ]]
then
  echo 'Checking directly attached drives (passthrough, BSD)'
fi
for i in /dev/pass*
do
        [[ -c ${i} && ${i} =~ ^/dev/pass[0-9]+$ ]] || continue
        smartcheck "$i" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
done

# Check disks behind LSISAS2008 LV (SCSI generic nodes)
if [[ -n ${DEBUG} ]]
then
  echo 'Checking LSISAS2008 Drives'
fi
for i in /dev/sg*
do
        [[ -c ${i} && ${i} =~ ^/dev/sg[0-9]+$ ]] || continue
        smartcheck "$i" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
done
# Check disks behind a 3ware card.
# /dev/twl0 is a device node, not a regular file, so a -f test can never
# succeed and this section would be skipped on every system; -e only asks
# whether the node exists.
if [[ -e /dev/twl0 ]]
then
  if [[ -n ${DEBUG} ]]
  then
    echo 'Checking 3ware drives'
  fi
  # Probe controller ports 0..20.
  for i in {0..20}
  do
        smartcheck "-d 3ware,$i /dev/twl0 -T permissive" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
  done
fi
# Check disks behind an LSI card - fixed at /dev/sda at the moment.
# Runs only when lspci is available and lists at least one LSI device.
if command -v lspci >/dev/null 2>&1 && lspci | grep -qi LSI
then
  # Announce the section before any probe runs, so debug output is not
  # attributed to the previous section.
  if [[ -n ${DEBUG} ]]
  then
    echo 'Checking LSI drives'
  fi
  # Probe ids 0..20 with the plain megaraid device type first, then again
  # with sat+megaraid.
  for lsi_type in megaraid sat+megaraid
  do
    for i in {0..20}
    do
        smartcheck "-d ${lsi_type},$i /dev/sda -T permissive" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
    done
  done
fi
# Check scsi disks behind an HP card (cciss) - fixed at /dev/sda at the moment.
# Runs only when HP is set (the -hp/--hp flag). The debug banner is printed
# inside that condition so it never announces a check that does not happen.
# Append 2>/dev/null to the smartcheck call if its output is too noisy.
if [[ -n ${HP} ]]
then
	if [[ -n ${DEBUG} ]]
	then
		echo 'Checking HP drives'
	fi
	# Probe ids 0..20.
	for i in {0..20}
	do
		smartcheck "-d cciss,$i /dev/sda -T permissive" $DEBUG
		rval=$?
		agerror=$((agerror + rval))
	done
fi

# In debug mode, when the sas flag is set, tell the operator how the SAS error
# counters can be reset.
if [[ -n ${DEBUG} && -n ${sas} ]]; then
	printf '%s\n' \
		"---" \
		"NOTICE: SAS error counters can be reset using sg3_utils command" \
		"sg_logs -R /dev/device" \
		"---"
fi

# Print the overall verdict and exit with the number of findings.
if [[ $agerror -gt 0 ]]
then
        echo -e "\e[0;41m$agerror Errors were found\e[0m"
        # An exit status is truncated to 8 bits, so 256 findings would be
        # reported as 0 (success). Cap the status at 255.
        if [[ $agerror -gt 255 ]]
        then
                exit 255
        fi
        exit "$agerror"
else
        echo -e "\e[30;42mNo errors were found\e[0m"
        exit 0
fi

有RAID组:

安装硬盘哨兵

# Download the x64 Linux build of Hard Disk Sentinel.
wget https://www.hdsentinel.com/hdslin/hdsentinel-020c-x64.zip
# The download is a ZIP archive, which `tar -z` cannot unpack; use unzip and
# extract into a dedicated directory.
unzip hdsentinel-020c-x64.zip -d hdsentinel-020c-x64
cd hdsentinel-020c-x64
# NOTE(review): assumes the archive holds the HDSentinel binary at its top
# level - confirm against the downloaded file.
chmod 755 HDSentinel
./HDSentinel
Linux 操作系统下查看硬盘健康状态-诺诺博客

© 2025 诺诺博客 蜀ICP备2024099071号-1 如有侵权请联系删除 | 网站地图 | 百度统计 | 又拍云CDN加速
为了获得更好的浏览效果 建议您使用IE8.0及以上版本浏览器登陆本站点 · 服务器托管于腾讯云
  • {{ item.name }}
  • {{ item.name }}