Linux 操作系统下查看硬盘健康状态

文章目录
  • 概述:
  • 无RAID组:
  • 有RAID组:
  • 概述:

    这里介绍两种场景下的三种检测方式,这两种场景也比较常见:服务器没做RAID组和做了RAID组,两者的检测方式是不同的。没做RAID时检查比较简单,可以直接使用 smartctl 来检查 /dev/sd* 的健康状态,或者使用我提供的脚本来快速批量检测;如果服务器做了RAID,上述方式就不管用了,这时可以试试本文推荐的方式。

    无RAID组:

    smartctl:

    # Install smartmontools, the package that provides smartctl (yum-based distributions)
    yum install smartmontools -y
    # Print the overall SMART health self-assessment of /dev/sda
    smartctl -H /dev/sda  

    Shell script:

    #!/usr/bin/env bash
    # smart-check.sh
    #
    # Purpose:
    #   simple bash script that parses the output provided by smartctl
    #   to show relevant information about predictive failures
    #   that we actually care about.
    #
    # Original Author: James Greig
    # Licence: GNU GPL v3
    
    VERSION=02/01/2024

    # smartctl is commonly installed under /usr/local/sbin, which is not
    # always on the PATH of cron or non-login shells.
    PATH=/usr/local/sbin:$PATH
    export PATH

    #######################################
    # Parse the command-line options of smart-check.sh.
    # Globals:
    #   DEBUG   (written) - set to 1 by -d/--debug
    #   HP      (written) - set to 1 by -hp/--hp
    #   VERSION (read)    - printed by -v/--version
    # Arguments:
    #   the script's command-line arguments ("$@")
    # Outputs:
    #   version or help text on stdout; usage errors on stderr
    # Returns:
    #   0 after consuming all options. Exits the script directly for
    #   -u (0 on success, 1 on failure), -v and -h (0), and for an
    #   unknown argument (2).
    #######################################
    parse_args() {
        local key update_tmp
        while [[ $# -gt 0 ]]; do
            key="$1"
            case "$key" in
                -d|--debug)
                    DEBUG=1
                    shift
                    ;;
                -u|--update)
                    # Download to a temporary file first: "wget -O" truncates
                    # its target up front, so a failed transfer written
                    # straight to the installed path would destroy the script.
                    update_tmp=$(mktemp) || exit 1
                    if wget -O "$update_tmp" https://mirrors.host-it.co.uk/files/smart-check.sh \
                        && [[ -s "$update_tmp" ]] \
                        && cat "$update_tmp" > /usr/sbin/smart-check.sh; then
                        rm -f -- "$update_tmp"
                        exit 0
                    fi
                    rm -f -- "$update_tmp"
                    echo "Error: update failed, installed script left untouched" >&2
                    exit 1
                    ;;
                -v|--version)
                    echo "$VERSION"
                    exit 0
                    ;;
                -h|--help)
                    printf "\r\n-d debug\r\n-hp check HP cciss devices\r\n-u update\r\n-h help\r\n-v version\r\n"
                    exit 0
                    ;;
                -hp|--hp)
                    HP=1
                    shift
                    ;;
                *)
                    # A mistyped flag must not look like a clean bill of
                    # health to whatever is monitoring the exit status.
                    printf "Unknown Argument \r\n" >&2
                    exit 2
                    ;;
            esac
        done
        return 0
    }

    POSITIONAL=()
    parse_args "$@"
    set -- "${POSITIONAL[@]}" # restore positional parameters
    
    #What system is this?
    # 3ware? smartctl -a -d 3ware,1 /dev/twl0 -T permissive
    # for i in `ls /dev/sd*|egrep '^(\/)dev(\/)sd[a-z]$'`; do smartctl -a $i | egrep -i 'serial|Reallocated_Sector|wear_leveling'; done
    # lsi megaraid:-  smartctl -a -d megaraid,6 /dev/sda -T permissive
    
    #disks=($(ls /dev/sd*|egrep '^(\/)dev(\/)sd[a-z]$'))
    
    SERIALS=()
    #######################################
    # Run "smartctl -a" once for a device and report the predictive-failure
    # indicators found in its output.
    # Globals:
    #   SERIALS (read/written) - serial numbers that were already examined
    #   DEBUG   (read)         - when non-empty, healthy values are printed too
    #   sas     (written)      - set to 1 when a SAS error counter is non-zero
    # Arguments:
    #   $1 - a device path, or a whole smartctl argument string such as
    #        "-d megaraid,3 /dev/sda -T permissive"
    # Outputs:
    #   colour-coded findings on stdout
    # Returns:
    #   the number of failing checks (0 when nothing was found or when the
    #   drive's serial number was already examined through another device)
    #######################################
    function smartcheck()
    {
        local error=0
        local smartoutput serial var seen

        # $1 is left unquoted on purpose: callers pass several smartctl
        # arguments inside one string and rely on word splitting here.
        # shellcheck disable=SC2086
        smartoutput=$(smartctl -v 1,raw48:54 -v 7,raw48:54 -v 195,raw48:54 -a $1)
        serial=$(echo "$smartoutput" | grep -E "Serial Number:|Serial number:" | grep -v "\[No" | awk '{print $3}')

        if [[ -n "$serial" ]]; then
            # Exact comparison per element: a substring match against the
            # joined list would skip a drive whose serial happens to be
            # contained in the serial of an earlier drive.
            for seen in "${SERIALS[@]}"; do
                if [[ "$seen" == "$serial" ]]; then
                    # Same physical drive reached through another device node.
                    return 0
                fi
            done
            SERIALS+=("$serial")
            if [[ -n "$DEBUG" ]]; then
                echo -e "---"
                echo -e "\e[0;40mScanning $1 Serial: ${serial} \e[0m"
            fi
        fi

        # SSD wear level (normalised value, lower is worse). The sed call and
        # the ${var#0} expansion strip leading zeros so "099" is not read as
        # an invalid octal number.
        var=$(echo "$smartoutput" | grep -Ei "177 Wear_Leveling|231 SSD_Life_Left|^173 Un|233 Media_Wearout_" | awk '{print $4}' | sed 's/^0\|^00//' | head -n 1)
        if [[ ${var#0} -lt 20 ]] && [[ ${var#0} -gt 0 ]]; then
            echo -e "\e[0;41m$1 is at ${var#0}% SSD wear\e[0m"
            error=$((error + 1))
        elif [[ -n "$DEBUG" ]] && [[ ${var#0} -gt 0 ]]; then
            echo -e "\e[30;42m$1 is at ${var#0}% SSD wear\e[0m"
        fi

        # Reallocated sectors on SATA drives (raw value, column 10)
        var=$(echo "$smartoutput" | grep Reallocated_Sector | awk '{print $10}')
        if [[ $var -gt 0 ]]; then
            echo -e "\e[0;41m$1 has $var Sector Errors\e[0m"
            error=$((error + 1))
        elif [[ -n "$DEBUG" ]] && [[ $var =~ ^[0-9]+$ ]]; then
            echo -e "\e[30;42m$1 has $var Sector Errors\e[0m"
        fi

        # Early warning: Offline_Uncorrectable
        var=$(echo "$smartoutput" | grep Offline_Uncorrectable | awk '{print $10}')
        if [[ $var -gt 0 ]]; then
            echo -e "\e[0;41m$1 has $var Offline Uncorrectable Errors\e[0m"
            error=$((error + 1))
        elif [[ -n "$DEBUG" ]] && [[ $var =~ ^[0-9]+$ ]]; then
            echo -e "\e[30;42m$1 has $var Offline Uncorrectable Errors\e[0m"
        fi

        # Early warning: Raw_Read_Error_Rate (more than 10 counts as a failure)
        var=$(echo "$smartoutput" | grep -Ei "1 Raw_Read_Error_Rate" | awk '{print $10}' | sed 's/^0\|^00//')
        if [[ ${var#0} -gt 10 ]]; then
            echo -e "\e[0;41m$1 has a Read Error Rate of ${var#0}\e[0m"
            error=$((error + 1))
        elif [[ -n "$DEBUG" ]] && [[ ${var#0} -gt 0 ]]; then
            echo -e "\e[30;42m$1 has a Read Error Rate of ${var#0} \e[0m"
        fi

        # SAS read errors (column 8 of the "read:" error counter row)
        var=$(echo "$smartoutput" | grep -E "read:" | awk '{print $8}')
        if [[ $var -gt 0 ]]; then
            echo -e "\e[0;41m$1 $var SAS Read Errors\e[0m"
            sas=1
            error=$((error + 1))
        elif [[ -n "$DEBUG" ]] && [[ $var =~ ^[0-9]+$ ]]; then
            echo -e "\e[30;42m$1 $var SAS Read Errors\e[0m"
        fi

        # SAS write errors
        var=$(echo "$smartoutput" | grep -E "write:" | awk '{print $8}')
        if [[ $var -gt 0 ]]; then
            echo -e "\e[0;41m$1 $var SAS Write Errors\e[0m"
            sas=1
            error=$((error + 1))
        elif [[ -n "$DEBUG" ]] && [[ $var =~ ^[0-9]+$ ]]; then
            echo -e "\e[30;42m$1 $var SAS Write Errors\e[0m"
        fi

        # SAS verify errors
        var=$(echo "$smartoutput" | grep -E "verify:" | awk '{print $8}')
        if [[ $var -gt 0 ]]; then
            echo -e "\e[0;41m$1 $var SAS Verify Errors\e[0m"
            sas=1
            error=$((error + 1))
        elif [[ -n "$DEBUG" ]] && [[ $var =~ ^[0-9]+$ ]]; then
            echo -e "\e[30;42m$1 $var SAS Verify Errors\e[0m"
        fi

        # SAS post-factory (grown) defects: informational only, shown in
        # debug mode and never added to the error count.
        var=$(echo "$smartoutput" | grep -i "grown defect" | sed 's/Elements in grown defect list: //' | grep -iv "not available")
        if [[ $var -gt 0 ]] && [[ -n "$DEBUG" ]]; then
            echo -e "\e[30;43m$1 $var SAS accumulated defects\e[0m"
        elif [[ -n "$DEBUG" ]] && [[ $var =~ ^[0-9]+$ ]]; then
            echo -e "\e[30;42m$1 $var SAS accumulated defects\e[0m"
        fi

        # Entries of the error log printed at the bottom of the smartctl output
        var=$(echo "$smartoutput" | grep -i "Error 1 occurred at disk power-on lifetime")
        if [[ -n "$var" ]]; then
            echo -e "\e[0;41m$1 Disk is showing signs of power-on errors\e[0m"
            error=$((error + 1))
        fi

        # ATA error count: informational only. smartctl may append a remark
        # such as "(device log contains only the most recent five errors)",
        # so keep just the leading number before using it arithmetically.
        var=$(echo "$smartoutput" | grep -i "ATA Error Count:" | sed 's/ATA Error Count: //' | awk '{print $1}')
        if [[ $var -gt 0 ]] && [[ -n "$DEBUG" ]]; then
            echo -e "\e[30;43m$1 $var ATA Errors\e[0m"
        elif [[ -n "$DEBUG" ]] && [[ $var =~ ^[0-9]+$ ]]; then
            echo -e "\e[30;42m$1 $var ATA Errors\e[0m"
        fi

        return $error
    }
    # Abort early when smartctl is missing: every check below depends on it.
    if [ ! -x "$(command -v smartctl)" ]
    then
        echo "Error: Smartctl command not found"
        exit 1
    fi

    # Running total of the failing checks returned by smartcheck.
    agerror=0

    # Check disks attached to the board directly or in passthrough.
    # "sd[a-z]+" also covers /dev/sdaa and later on hosts with more than
    # 26 disks, while still excluding partitions such as /dev/sda1.
    if [[ ${DEBUG} ]]
    then
        echo 'Checking directly attached drives'
    fi
    for i in $(find /dev -type b -name 'sd*' | grep -E '^/dev/sd[a-z]+$')
    do
        smartcheck "$i" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
    done

    # Check disks attached to the board directly or in passthrough (BSD)
    if [[ ${DEBUG} ]]
    then
        echo 'Checking directly attached drives (passthrough, BSD)'
    fi
    for i in $(find /dev -type c -name 'pass*' | grep -E '^/dev/pass[0-9]+$')
    do
        smartcheck "$i" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
    done

    # Check disks behind LSISAS2008 LV
    if [[ ${DEBUG} ]]
    then
        echo 'Checking LSISAS2008 Drives'
    fi
    for i in $(find /dev -type c -name 'sg*' | grep -E '^/dev/sg[0-9]+$')
    do
        smartcheck "$i" $DEBUG
        rval=$?
        agerror=$((agerror + rval))
    done

    # Check disks behind a 3ware card. /dev/twl0 is a character device
    # node, so it has to be tested with -c; -f (regular file) never matches.
    if [ -c "/dev/twl0" ]
    then
        if [[ ${DEBUG} ]]
        then
            echo 'Checking 3ware drives'
        fi
        for i in $(seq 0 20)
        do
            # The whole string is one argument; smartcheck word-splits it.
            smartcheck "-d 3ware,$i /dev/twl0 -T permissive" $DEBUG
            rval=$?
            agerror=$((agerror + rval))
        done
    fi

    # Check disks behind an LSI card - fixed at sda at the moment.
    # SCSI (megaraid) and SATA (sat+megaraid) addressing are both probed.
    if command -v lspci >/dev/null 2>&1 && lspci | grep -qi LSI
    then
        if [[ ${DEBUG} ]]
        then
            echo 'Checking LSI drives'
        fi
        for i in $(seq 0 20)
        do
            smartcheck "-d megaraid,$i /dev/sda -T permissive" $DEBUG
            rval=$?
            agerror=$((agerror + rval))
        done
        for i in $(seq 0 20)
        do
            smartcheck "-d sat+megaraid,$i /dev/sda -T permissive" $DEBUG
            rval=$?
            agerror=$((agerror + rval))
        done
    fi

    # Check scsi disks behind an HP card - fixed at sda at the moment.
    # Only done when -hp/--hp was given.
    # 2> /dev/null <- append this if it's spouting crap
    if [[ ${HP} ]]
    then
        if [[ ${DEBUG} ]]
        then
            echo 'Checking HP drives'
        fi
        for i in $(seq 0 20)
        do
            smartcheck "-d cciss,$i /dev/sda -T permissive" $DEBUG
            rval=$?
            agerror=$((agerror + rval))
        done
    fi

    if [[ ${DEBUG} && ${sas} ]]
    then
        echo -e "---"
        echo "NOTICE: SAS error counters can be reset using sg3_utils command"
        echo "sg_logs -R /dev/device"
        echo -e "---"
    fi

    if [[ $agerror -gt 0 ]]
    then
        echo -e "\e[0;41m$agerror Errors were found\e[0m"
        # Exit statuses are taken modulo 256: without the cap, exactly 256
        # findings would be reported to the caller as success (0).
        exit $(( agerror > 255 ? 255 : agerror ))
    else
        echo -e "\e[30;42mNo errors were found\e[0m"
        exit 0
    fi

    有RAID组:

    安装硬盘哨兵

    # Download the Hard Disk Sentinel Linux (x64) console edition
    wget https://www.hdsentinel.com/hdslin/hdsentinel-020c-x64.zip
    # The download is a zip archive, so it must be extracted with unzip (tar -z cannot read it)
    unzip hdsentinel-020c-x64.zip
    # Enter the extracted directory if the archive created one; the trailing
    # slash keeps the glob from matching the .zip file itself
    cd hdsent*/ 2>/dev/null || true
    chmod 755 HDSentinel
    ./HDSentinel
    
    

    0

    1. This post has no comment yet

    发表回复

    您的邮箱地址不会被公开。 必填项已用 * 标注

    使用DLVM本地部署DeepSeek(补充)
    使用DLVM本地部署DeepSeek(补充)
    Linux 6 张图
    Error response from daemon: Get “https://registry-1.docker.io/v2/”: net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)
    Error response from daemon: Get “https://registry-1.docker.io/v2/”: net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)
    Linux 2 张图
    MySQL 30周年庆 OCP 认证免费
    MySQL 30周年庆 OCP 认证免费
    Linux 5 张图
    使用Kubeadm在Ubuntu 20.04中部署Kubernetes
    使用Kubeadm在Ubuntu 20.04中部署Kubernetes
    Linux 24 张图
    在Linux上安装和配置Squid代理服务器
    在Linux上安装和配置Squid代理服务器
    Linux 10 张图
    Wifipumpkin3 安装记录
    Wifipumpkin3 安装记录
    Linux 1 张图
    © 2025 诺诺博客如有侵权请联系删除 | 网站地图 | 百度统计 | 又拍云CDN加速
    为了获得更好的浏览效果 建议您使用IE8.0及以上版本浏览器登陆本站点 · 服务器托管于腾讯云
    📢 小站正在装修中,如页面异常请包涵!