Linux磁盘I/O性能优化与故障排查:解决系统卡顿、进程阻塞的实战方案

Linux磁盘I/O性能问题是生产环境中最常见的故障之一,直接导致系统卡顿、服务超时。本文针对I/O瓶颈定位、性能调优、进程阻塞等高频问题,通过真实性能故障案例,提供从基础诊断到深度优化的完整解决方案,帮助企业快速恢复业务性能。

图片[1]-Linux磁盘I/O性能优化与故障排查:解决系统卡顿、进程阻塞的实战方案-Vc博客

一、磁盘I/O性能问题快速诊断

1. I/O瓶颈实时定位

(1)系统级I/O性能监控

#!/bin/bash
# io_performance_snapshot.sh
#
# One-shot snapshot of disk I/O health. The script prints extended iostat
# figures, per-device counters, CPU iowait, dirty-page state and the busiest
# processes, in that order.

echo "====== 磁盘I/O性能快照 ======"
echo "检测时间: $(date)"
echo ""

# 1. System-wide I/O pressure: three 1-second extended samples; the first six
#    lines of the iostat output are skipped.
echo "1. 系统I/O统计:"
iostat -x 1 3 | tail -n +7

# 2. Per-device I/O counters
#
# block_dev_io_counts STAT_FILE
#   Prints "<reads_completed> <writes_completed>" from a sysfs block "stat"
#   file. Reads completed is field 1 and writes completed is field 5; fields
#   2-4 are read merges, read sectors and read ticks, so reading the first
#   four fields positionally would report read sectors as the write count.
block_dev_io_counts() {
    awk '{ print $1, $5; exit }' "$1"
}

echo -e "\n2. 块设备负载详情:"
while read -r device; do
    stat_file="/sys/block/$device/stat"
    [ -f "$stat_file" ] || continue
    counts=$(block_dev_io_counts "$stat_file")
    [ -n "$counts" ] || continue
    read -r rio wio <<<"$counts"
    echo "设备 $device: 读IO=$rio, 写IO=$wio"
done < <(lsblk -dn -o NAME)

# 3. CPU time spent waiting on I/O
#
# mpstat_avg_iowait
#   Reads mpstat output on stdin and prints the %iowait value of the "all"
#   row of the Average summary. The Average header row is used only to find
#   the %iowait column (falling back to column 6); it is never printed, which
#   avoids a bogus "0.00%" line for the header.
mpstat_avg_iowait() {
    awk '
        $1 == "Average:" && $2 == "CPU" {
            for (i = 1; i <= NF; i++) if ($i == "%iowait") col = i
            next
        }
        $1 == "Average:" && $2 == "all" {
            printf "I/O等待: %.2f%%\n", $(col ? col : 6)
        }
    '
}

echo -e "\n3. CPU I/O等待:"
# LC_ALL=C keeps the "Average:" label and the decimal point stable.
LC_ALL=C mpstat 1 3 | mpstat_avg_iowait

# 4. Dirty pages and writeback
echo -e "\n4. 内存脏页状态:"
grep -E "Dirty|Writeback" /proc/meminfo

# 5. Per-process I/O ranking (first 15 lines of one 1-second pidstat sample)
echo -e "\n5. 进程I/O使用TOP10:"
pidstat -dl 1 1 | head -15

(2)I/O瓶颈自动化检测

#!/bin/bash
# io_bottleneck_detector.sh
#
# Periodically compares iostat figures with the thresholds below and prints a
# warning for every value that exceeds its limit.

# Thresholds
IO_AWAIT_THRESHOLD=20    # CPU %iowait, percent
UTIL_THRESHOLD=80        # device utilisation, percent
QUEUE_THRESHOLD=8        # average request queue length

#######################################
# Compare current I/O figures with the configured thresholds and print one
# warning line for each value that exceeds its limit.
# Globals:
#   IO_AWAIT_THRESHOLD (read) - CPU %iowait limit, defaults to 20
#   UTIL_THRESHOLD     (read) - device %util limit, defaults to 80
#   QUEUE_THRESHOLD    (read) - average queue length limit, defaults to 8
# Arguments: none
# Outputs:   a progress line followed by any warnings, on stdout
#######################################
check_io_bottleneck() {
    local iowait_limit=${IO_AWAIT_THRESHOLD:-20}
    local util_limit=${UTIL_THRESHOLD:-80}
    local queue_limit=${QUEUE_THRESHOLD:-8}
    local cpu_report dev_report io_wait

    echo "开始I/O瓶颈检测..."

    # iostat's first report covers the time since boot, so two reports are
    # requested and only the last one is evaluated. LC_ALL=C keeps the header
    # names and the decimal point stable.
    cpu_report=$(LC_ALL=C iostat -c 1 2)

    # Locate %iowait through the avg-cpu header instead of a fixed position;
    # the value row has one field fewer than the header (no "avg-cpu:" label).
    io_wait=$(awk '
        $1 == "avg-cpu:" {
            col = 0
            for (i = 2; i <= NF; i++) if ($i == "%iowait") col = i - 1
            want = 1
            next
        }
        want && NF { if (col) value = $col; want = 0 }
        END { print value }
    ' <<<"$cpu_report")

    if [ -n "$io_wait" ] &&
        awk -v v="$io_wait" -v max="$iowait_limit" 'BEGIN { exit !(v + 0 > max + 0) }'; then
        echo "警告: CPU I/O等待过高 - ${io_wait}%"
    fi

    # One device sample serves both the utilisation and the queue check.
    dev_report=$(LC_ALL=C iostat -dx 1 2)

    # Shared awk program: every "Device" header starts a new report (dropping
    # warnings collected from the previous one) and fixes the column whose
    # name is listed in `names`; rows above `max` are formatted with `fmt`.
    local scan='
        $1 ~ /^Device/ {
            out = ""; col = 0
            n = split(names, wanted, ",")
            for (i = 1; i <= NF; i++)
                for (j = 1; j <= n; j++)
                    if ($i == wanted[j]) col = i
            next
        }
        col && NF >= col && ($col + 0 > max + 0) {
            out = out sprintf(fmt, $1, $col) "\n"
        }
        END { printf "%s", out }
    '

    # Device utilisation
    awk -v names="%util" -v max="$util_limit" \
        -v fmt="警告: 设备 %s 使用率过高 - %s%%" "$scan" <<<"$dev_report"

    # Average queue length; the column name differs between sysstat versions.
    awk -v names="aqu-sz,avgqu-sz" -v max="$queue_limit" \
        -v fmt="警告: 设备 %s 队列过长 - %s" "$scan" <<<"$dev_report"
}

# Continuous monitoring: run one detection pass, print a separator line, then
# wait 30 seconds before the next pass. Runs until interrupted.
while :; do
    check_io_bottleneck
    printf '%s\n' "----------------------------------------"
    sleep 30
done

2. 高I/O进程精准定位

(1)进程级I/O分析工具

#!/bin/bash
# high_io_process_finder.sh
#
# Lists the processes doing the most disk I/O, using pidstat, iotop (when
# installed) and the per-process counters under /proc.

echo "查找高I/O占用进程..."
echo ""

# Method 1: pidstat, three 1-second samples, sorted numerically (descending)
# starting at the 4th column.
echo "1. pidstat检测结果:"
pidstat -d 1 3 | grep -v Linux | sort -k4 -nr | head -10

# Method 2: iotop, if available; otherwise print an install hint.
echo -e "\n2. iotop检测结果:"
if command -v iotop >/dev/null 2>&1; then
    iotop -b -n 3 -o | head -15
else
    echo "请安装iotop: yum install iotop 或 apt-get install iotop"
fi

# Method 3: read the per-process counters in /proc directly.
#
# proc_io_report [PROC_ROOT]
#   For every numeric directory under PROC_ROOT (default /proc) whose "io"
#   file shows more than 1 MiB read or written, prints
#   "<read_bytes><TAB><report line>". The leading key lets the caller sort
#   numerically even when the process name contains spaces.
proc_io_report() {
    local proc_root=${1:-/proc}
    local io_file pid comm read_bytes write_bytes

    for io_file in "$proc_root"/[0-9]*/io; do
        [ -f "$io_file" ] || continue

        # Match the keys exactly: a plain "write_bytes" pattern would also
        # select the cancelled_write_bytes line and yield two values.
        read_bytes=$(awk '$1 == "read_bytes:" {print $2; exit}' "$io_file" 2>/dev/null)
        write_bytes=$(awk '$1 == "write_bytes:" {print $2; exit}' "$io_file" 2>/dev/null)

        # io files that cannot be read give empty values; skip those entries
        # instead of feeding them to an integer comparison.
        [[ $read_bytes =~ ^[0-9]+$ && $write_bytes =~ ^[0-9]+$ ]] || continue

        if (( read_bytes > 1048576 || write_bytes > 1048576 )); then
            pid=${io_file%/io}
            pid=${pid##*/}
            comm=$(cat "$proc_root/$pid/comm" 2>/dev/null)
            printf '%s\tPID: %s, 进程: %s, 读: %sMB, 写: %sMB\n' \
                "$read_bytes" "$pid" "$comm" \
                "$((read_bytes / 1024 / 1024))" "$((write_bytes / 1024 / 1024))"
        fi
    done
}

echo -e "\n3. /proc统计结果:"
# Sort on the numeric key, keep the top 10, then drop the key column.
proc_io_report /proc | sort -t $'\t' -k1,1nr | head -10 | cut -f2-

二、常见I/O性能问题解决方案

1. 磁盘满导致的I/O问题

(1)智能磁盘空间清理

#!/bin/bash
# disk_space_cleaner.sh
#
# Frees disk space. Step 1 only reports large files; steps 2-5 DELETE data:
# rotated logs and temporary files older than 7 days, the package-manager
# cache and unused Docker resources. Review before running on production.

echo "开始智能磁盘空间清理..."
echo ""

# 1. Report the 20 largest files above 100 MB (nothing is removed here).
echo "1. 查找大于100MB的文件:"
find / -type f -size +100M -exec ls -lh {} + 2>/dev/null | sort -k5 -hr | head -20

# 2. Rotated log files ("*.log.*") not modified for more than 7 days.
#    -type f restricts both the preview and the deletion to regular files.
echo -e "\n2. 清理日志文件:"
for log_dir in /var/log /opt/*/logs; do
    [ -d "$log_dir" ] || continue
    echo "清理 $log_dir:"
    # Preview at most 10 candidates, then delete all of them.
    find "$log_dir" -type f -name "*.log.*" -mtime +7 -exec ls -lh {} + | head -10
    find "$log_dir" -type f -name "*.log.*" -mtime +7 -delete
done

# 3. Temporary files not accessed for more than 7 days.
echo -e "\n3. 清理临时文件:"
for tmp_dir in /tmp /var/tmp; do
    [ -d "$tmp_dir" ] || continue
    echo "清理 $tmp_dir:"
    find "$tmp_dir" -type f -atime +7 -exec ls -lh {} + 2>/dev/null | head -10
    find "$tmp_dir" -type f -atime +7 -delete 2>/dev/null
done

# 4. Package-manager cache (APT preferred, YUM otherwise).
echo -e "\n4. 清理包管理器缓存:"
if command -v apt-get >/dev/null 2>&1; then
    apt-get clean
    echo "已清理APT缓存"
elif command -v yum >/dev/null 2>&1; then
    yum clean all
    echo "已清理YUM缓存"
fi

# 5. Unused Docker resources (-f skips the confirmation prompt).
echo -e "\n5. 清理Docker资源:"
if command -v docker >/dev/null 2>&1; then
    docker system prune -f
    echo "已清理Docker资源"
fi

2. 内存不足导致的Swap I/O

(1)Swap使用分析与优化

#!/bin/bash
# swap_optimizer.sh
#
# Reports swap usage (overall and per process) and prints tuning advice.
# Nothing is changed on the system; suggested commands are only echoed.

# swap_usage_report [PROC_ROOT]
#   For every numeric directory under PROC_ROOT (default /proc) whose status
#   file has a non-zero VmSwap value, prints "<kB><TAB><report line>". Only
#   numeric directories are scanned, so the self/thread-self aliases are not
#   reported a second time. The leading key lets the caller sort numerically
#   even when the process name contains spaces.
swap_usage_report() {
    local proc_root=${1:-/proc}
    local status_file pid comm vmswap

    for status_file in "$proc_root"/[0-9]*/status; do
        [ -f "$status_file" ] || continue
        vmswap=$(awk '$1 == "VmSwap:" {print $2; exit}' "$status_file" 2>/dev/null)
        # Entries without a VmSwap line yield an empty value; skip them.
        [[ $vmswap =~ ^[0-9]+$ ]] || continue
        (( vmswap > 0 )) || continue

        pid=${status_file%/status}
        pid=${pid##*/}
        comm=$(cat "$proc_root/$pid/comm" 2>/dev/null)
        printf '%s\tPID: %s, 进程: %s, Swap使用: %sKB\n' "$vmswap" "$pid" "$comm" "$vmswap"
    done
}

echo "Swap使用情况分析..."
echo ""

# 1. Current swap state
echo "1. Swap当前状态:"
free -h
swapon -s

# 2. Top 10 swap consumers: sort on the numeric key, then drop the key column.
echo -e "\n2. 使用Swap的进程:"
swap_usage_report /proc | sort -t $'\t' -k1,1nr | head -10 | cut -f2-

# 3. Advice
echo -e "\n3. 优化建议:"
mem_total=$(awk '$1 == "MemTotal:" {print $2; exit}' /proc/meminfo)
swap_total=$(awk '$1 == "SwapTotal:" {print $2; exit}' /proc/meminfo)

if [ "${swap_total:-0}" -gt "${mem_total:-0}" ]; then
    echo "建议: Swap空间过大,考虑减少Swap使用"
fi

# Evaluate the live swappiness value unconditionally; gating this on an
# existing entry in /etc/sysctl.conf would skip hosts that still run with
# whatever value the kernel started with.
current_swappiness=$(sysctl -n vm.swappiness 2>/dev/null)
if [[ $current_swappiness =~ ^[0-9]+$ ]] && (( current_swappiness > 10 )); then
    echo "建议: 降低swappiness值,当前为 $current_swappiness"
    echo "执行: echo 'vm.swappiness=10' >> /etc/sysctl.conf && sysctl -p"
fi

三、磁盘I/O性能深度调优

1. 文件系统参数优化

(1)EXT4文件系统调优

#!/bin/bash
# ext4_optimizer.sh
#
# Read-only advisor: prints the current ext4/xfs mounts, suggested mount
# options per block device, inode usage and read-ahead hints. The script only
# prints recommendations; it does not remount or change any device setting.

echo "EXT4文件系统优化..."
echo ""

# 1. Current mount options of ext4/xfs file systems
echo "1. 当前文件系统挂载参数:"
mount | grep -E "ext4|xfs"

# 2. Suggested mount options, chosen by the device's "rotational" flag
#    (0 is treated as SSD, anything else as HDD).
echo -e "\n2. 优化建议:"

while read -r device; do
    rotational_file="/sys/block/$device/queue/rotational"
    [ -f "$rotational_file" ] || continue
    rotational=$(cat "$rotational_file")
    if [ "$rotational" -eq 0 ]; then
        echo "设备 $device 是SSD,建议挂载参数:"
        # NOTE(review): barrier=0 turns off write barriers; confirm the
        # storage has a power-loss-protected cache before applying it.
        echo "  defaults,noatime,nodiratime,discard,barrier=0"
    else
        echo "设备 $device 是HDD,建议挂载参数:"
        echo "  defaults,noatime,nodiratime,barrier=1"
    fi
done < <(lsblk -dn -o NAME)

# 3. Inode usage (tmpfs entries hidden)
echo -e "\n3. Inode使用情况:"
df -i | grep -v tmpfs

# 4. Read-ahead hint for devices whose current value is below 16384
echo -e "\n4. 调整预读参数:"
while read -r device; do
    current_readahead=$(blockdev --getra "/dev/$device" 2>/dev/null)
    # blockdev prints nothing when the device cannot be queried.
    [ -n "$current_readahead" ] || continue
    if [ "$current_readahead" -lt 16384 ]; then
        echo "设备 $device 预读值较低: $current_readahead"
        echo "建议设置: blockdev --setra 16384 /dev/$device"
    fi
done < <(lsblk -dn -o NAME)

2. I/O调度器优化配置

(1)调度器智能配置

#!/bin/bash
# io_scheduler_tuner.sh
#
# Picks an I/O scheduler for every block device based on its "rotational"
# flag and writes it to the device's sysfs scheduler file. Writing to sysfs
# normally requires root.

# choose_scheduler ROTATIONAL AVAILABLE
#   ROTATIONAL - content of queue/rotational (0 is treated as SSD)
#   AVAILABLE  - content of queue/scheduler, e.g. "[mq-deadline] kyber none"
#   Prints the first available entry of the preference list (SSD: none, then
#   noop; HDD: mq-deadline, then deadline). Returns 1 when none is offered.
choose_scheduler() {
    local rotational=$1 available=$2
    local -a preference tokens
    local wanted token

    if [ "$rotational" -eq 0 ]; then
        preference=(none noop)
    else
        preference=(mq-deadline deadline)
    fi

    # Strip the brackets that mark the active scheduler, then compare whole
    # names so that "deadline" never matches inside "mq-deadline".
    read -r -a tokens <<<"${available//[][]/}"
    for wanted in "${preference[@]}"; do
        for token in "${tokens[@]}"; do
            if [ "$token" = "$wanted" ]; then
                printf '%s\n' "$wanted"
                return 0
            fi
        done
    done
    return 1
}

# scheduler_label NAME
#   Prints the annotation shown next to a recommended scheduler.
scheduler_label() {
    case "$1" in
        none)        echo "SSD优化" ;;
        noop)        echo "SSD次选" ;;
        mq-deadline) echo "HDD优化" ;;
        deadline)    echo "HDD次选" ;;
    esac
}

echo "I/O调度器优化配置..."
echo ""

# Inspect every block device and switch its scheduler when a better one exists.
while read -r device; do
    device_path="/dev/$device"
    scheduler_file="/sys/block/$device/queue/scheduler"
    [ -f "$scheduler_file" ] || continue

    available_schedulers=$(cat "$scheduler_file")
    # The active scheduler is the bracketed entry.
    current_scheduler=$(grep -o '\[.*\]' <<<"$available_schedulers" | tr -d '[]')

    echo "设备: $device_path"
    echo "当前调度器: $current_scheduler"
    echo "可用调度器: $available_schedulers"

    rotational_file="/sys/block/$device/queue/rotational"
    if [ -f "$rotational_file" ]; then
        rotational=$(cat "$rotational_file")
        if recommended=$(choose_scheduler "$rotational" "$available_schedulers"); then
            echo "推荐调度器: $recommended ($(scheduler_label "$recommended"))"
            if [ "$current_scheduler" != "$recommended" ]; then
                echo "执行: echo $recommended > $scheduler_file"
                # Report a failed write instead of silently carrying on.
                if ! echo "$recommended" > "$scheduler_file"; then
                    echo "错误: 无法写入 $scheduler_file" >&2
                fi
            fi
        fi
    fi
    echo ""
done < <(lsblk -dn -o NAME)

# Queue tuning: for each device with a "rotational" flag, write a request
# queue depth and a read-ahead size (KB) into sysfs. Devices reporting 0 get
# 256 / 128, all others get 128 / 512.
echo "优化I/O队列参数..."
for device in $(lsblk -d -o NAME | grep -v NAME); do
    queue_dir="/sys/block/$device/queue"
    [ -f "$queue_dir/rotational" ] || continue

    rotational=$(cat "$queue_dir/rotational")
    if [ "$rotational" -eq 0 ]; then
        # SSD values
        queue_depth=256
        readahead_kb=128
    else
        # HDD values
        queue_depth=128
        readahead_kb=512
    fi

    # Each attribute is written only when its sysfs file exists.
    if [ -f "$queue_dir/nr_requests" ]; then
        echo "$queue_depth" > "$queue_dir/nr_requests"
    fi
    if [ -f "$queue_dir/read_ahead_kb" ]; then
        echo "$readahead_kb" > "$queue_dir/read_ahead_kb"
    fi
done

echo "I/O调度器优化完成"

四、数据库I/O性能专项优化

1. MySQL数据库I/O调优

(1)MySQL I/O配置优化

#!/bin/bash
# mysql_io_optimizer.sh
#
# Prints MySQL's I/O-related settings, suggested values derived from the
# host's memory and disk type, and the most fragmented tables.

echo "MySQL I/O性能优化检查..."
echo ""

#######################################
# Print MySQL's current I/O-related variables and the buffer-pool section of
# the InnoDB status report.
# Arguments: none
# Outputs:   query results on stdout; mysql's own error output is discarded,
#            so a server that cannot be reached only leaves the headings.
#######################################
check_mysql_config() {
    local variable

    echo "1. MySQL当前I/O相关配置:"

    # One query per variable, in a fixed order.
    for variable in \
        innodb_buffer_pool_size \
        innodb_log_file_size \
        innodb_flush_log_at_trx_commit \
        sync_binlog \
        innodb_io_capacity; do
        mysql -e "SHOW VARIABLES LIKE '${variable}';" 2>/dev/null
    done

    echo -e "\n2. MySQL状态信息:"
    # Show the section heading plus the 10 lines that follow it.
    mysql -e "SHOW ENGINE INNODB STATUS\G" 2>/dev/null | grep -A 10 "BUFFER POOL AND MEMORY"
}

#######################################
# Print suggested InnoDB settings derived from the host's total memory and
# from the type of the first block device that exposes a "rotational" flag.
# Arguments:
#   $1 - meminfo file to read MemTotal (kB) from; defaults to /proc/meminfo
#   $2 - sysfs block directory; defaults to /sys/block
# Outputs:   the recommendation list on stdout
#######################################
generate_optimization_suggestions() {
    local meminfo_file=${1:-/proc/meminfo}
    local sysfs_block=${2:-/sys/block}
    # Keep all working variables local so nothing leaks into the caller.
    local total_memory memory_mb recommended_buffer_pool device rotational

    echo -e "\n3. 优化建议:"

    # Total memory in kB, converted to MB
    total_memory=$(awk '$1 == "MemTotal:" {print $2; exit}' "$meminfo_file")
    memory_mb=$(( ${total_memory:-0} / 1024 ))

    # The buffer pool suggestion is 70% of total memory.
    recommended_buffer_pool=$(( memory_mb * 70 / 100 ))

    echo "根据系统内存 ${memory_mb}MB 推荐配置:"
    echo "innodb_buffer_pool_size = ${recommended_buffer_pool}M"
    echo "innodb_log_file_size = 1G"
    echo "innodb_flush_log_at_trx_commit = 2  (性能优先)"
    echo "sync_binlog = 0  (性能优先)"

    # innodb_io_capacity follows the disk type; only the first device with a
    # rotational flag is considered (0 is treated as SSD).
    while read -r device; do
        [ -f "$sysfs_block/$device/queue/rotational" ] || continue
        rotational=$(cat "$sysfs_block/$device/queue/rotational")
        if [ "$rotational" -eq 0 ]; then
            echo "innodb_io_capacity = 2000  (SSD优化)"
            echo "innodb_io_capacity_max = 4000"
        else
            echo "innodb_io_capacity = 200   (HDD优化)"
            echo "innodb_io_capacity_max = 400"
        fi
        break
    done < <(lsblk -dn -o NAME)
}

#######################################
# List the 10 tables with the largest DATA_FREE value, as reported by
# information_schema.TABLES.
# Arguments: none
# Outputs:   the query result on stdout; mysql's error output is discarded.
#######################################
check_table_fragmentation() {
    local query="SELECT TABLE_SCHEMA, TABLE_NAME, DATA_FREE FROM information_schema.TABLES WHERE DATA_FREE > 0 ORDER BY DATA_FREE DESC LIMIT 10;"

    echo -e "\n4. 检查表碎片:"
    mysql -e "$query" 2>/dev/null
}

# Run the three report sections in order: current configuration, suggested
# settings, table fragmentation.
for report_step in \
    check_mysql_config \
    generate_optimization_suggestions \
    check_table_fragmentation; do
    "$report_step"
done

五、I/O性能监控与告警

1. 实时I/O性能看板

#!/bin/bash
# io_performance_dashboard.sh
#
# Terminal dashboard that periodically redraws I/O, process and memory figures.

# ANSI colour sequences, written with a literal "\033" so that `echo -e`
# turns them into the escape character when the text is printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'    # reset to the terminal default

#######################################
# Redraw the dashboard once: iostat figures, a colour-coded utilisation line
# per device, the top CPU consumers and memory/swap usage.
# Globals:   RED, GREEN, YELLOW, BLUE, NC (read) - colour sequences
# Arguments: none
# Outputs:   the dashboard on stdout (the screen is cleared first)
#######################################
monitor_io_performance() {
    local io_report device util level

    clear
    echo -e "${BLUE}====== I/O性能实时监控看板 ======${NC}"
    echo "刷新时间: $(date)"
    echo ""

    # iostat's first report covers the time since boot, so two reports are
    # taken and only the last one is shown and evaluated. The same sample
    # feeds sections 1 and 2. LC_ALL=C keeps header names and the decimal
    # point stable.
    io_report=$(LC_ALL=C iostat -x 1 2)

    # 1. System-level I/O statistics: everything from the last avg-cpu header
    echo -e "${GREEN}1. 系统I/O统计:${NC}"
    awk '
        /^avg-cpu/ { buf = "" }
        { buf = buf $0 "\n" }
        END { printf "%s", buf }
    ' <<<"$io_report"

    # 2. Device utilisation: above 80 is a warning, above 60 a notice. The
    #    %util column is located through the Device header because its
    #    position differs between sysstat versions; blank and header lines
    #    never reach the comparison.
    echo -e "${GREEN}2. 设备使用率监控:${NC}"
    awk '
        /^avg-cpu/ { in_table = 0 }
        $1 ~ /^Device/ {
            n = 0; col = 0; in_table = 1
            for (i = 1; i <= NF; i++) if ($i == "%util") col = i
            next
        }
        NF == 0 { in_table = 0; next }
        in_table && col && NF >= col { name[++n] = $1; value[n] = $col }
        END {
            for (i = 1; i <= n; i++) {
                level = "ok"
                if (value[i] + 0 > 80) level = "high"
                else if (value[i] + 0 > 60) level = "elevated"
                print name[i], value[i], level
            }
        }
    ' <<<"$io_report" | while read -r device util level; do
        case "$level" in
            high)
                echo -e "${RED}警告: 设备 $device 使用率 ${util}%${NC}"
                ;;
            elevated)
                echo -e "${YELLOW}注意: 设备 $device 使用率 ${util}%${NC}"
                ;;
            *)
                echo -e "${GREEN}正常: 设备 $device 使用率 ${util}%${NC}"
                ;;
        esac
    done

    # 3. Process ranking
    # NOTE(review): the heading announces an I/O ranking, but the command
    # below ranks processes by CPU usage; confirm which one is intended.
    echo -e "${GREEN}3. 实时进程I/O排名:${NC}"
    ps aux --sort=-%cpu | head -5 | awk 'NR>1{print "高CPU进程: "$11" (CPU: "$3"%)"}'

    # 4. Memory and swap usage
    echo -e "${GREEN}4. 内存使用情况:${NC}"
    free -h | grep -E "Mem|Swap"
}

# Main loop: redraw the dashboard, pause 2 seconds, repeat until interrupted.
while :; do
    monitor_io_performance
    sleep 2
done

总结

Linux磁盘I/O性能问题需要系统化的诊断和优化方案。通过本文提供的监控脚本、分析工具和调优方法,可以快速定位I/O瓶颈并实施有效的优化措施。建议在生产环境中建立完善的I/O性能监控体系,定期进行性能评估和优化调整。

© 版权声明
THE END
喜欢就支持一下吧
点赞6 分享
评论 抢沙发

请登录后发表评论

    暂无评论内容