Linux磁盘I/O性能问题是生产环境中最常见的故障之一,直接导致系统卡顿、服务超时。本文针对I/O瓶颈定位、性能调优、进程阻塞等高频问题,通过真实性能故障案例,提供从基础诊断到深度优化的完整解决方案,帮助企业快速恢复业务性能。
![图片[1]-Linux磁盘I/O性能优化与故障排查:解决系统卡顿、进程阻塞的实战方案-Vc博客](https://blogimg.vcvcc.cc/2025/11/20251107021111482.png?imageView2/0/format/webp/q/75)
一、磁盘I/O性能问题快速诊断
1. I/O瓶颈实时定位
(1)系统级I/O性能监控
#!/bin/bash
# io_performance_snapshot.sh
#
# Prints a one-shot snapshot of disk I/O health: extended iostat samples,
# cumulative per-device read/write counters, CPU iowait, dirty/writeback
# memory and the processes currently doing I/O.
# Requires: sysstat (iostat, mpstat, pidstat) and util-linux (lsblk).

echo "====== 磁盘I/O性能快照 ======"
echo "检测时间: $(date)"
echo ""

# 1. Overall I/O pressure (three one-second extended samples).
echo "1. 系统I/O统计:"
iostat -x 1 3 | tail -n +7

# 2. Cumulative I/O counters per block device.
echo -e "\n2. 块设备负载详情:"
while IFS= read -r device; do
  stat_file="/sys/block/$device/stat"
  [[ -f "$stat_file" ]] || continue
  # Field order in the stat file: read I/Os, read merges, read sectors,
  # read ticks, write I/Os, ... Only fields 1 and 5 are completed-request
  # counts, so the three fields in between are discarded.
  read -r rio _ _ _ wio _ < "$stat_file"
  echo "设备 $device: 读IO=$rio, 写IO=$wio"
done < <(lsblk -dn -o NAME)

# 3. CPU time spent waiting on I/O. LC_ALL=C keeps the "Average:" label in
#    English so the match also works on localized systems.
echo -e "\n3. CPU I/O等待:"
LC_ALL=C mpstat 1 3 | awk '/^Average/ {printf "I/O等待: %.2f%%\n", $6}'

# 4. Dirty pages and pages currently being written back.
echo -e "\n4. 内存脏页状态:"
grep -E "Dirty|Writeback" /proc/meminfo

# 5. Per-process I/O activity (one sample, full command lines).
echo -e "\n5. 进程I/O使用TOP10:"
pidstat -dl 1 1 | head -15
(2)I/O瓶颈自动化检测
#!/bin/bash
# io_bottleneck_detector.sh
#
# Alert thresholds. Each one can be overridden from the environment, e.g.
#   UTIL_THRESHOLD=90 ./io_bottleneck_detector.sh
# NOTE: IO_AWAIT_THRESHOLD is compared against CPU %iowait (not the
# per-device await latency); the name is kept for compatibility.
IO_AWAIT_THRESHOLD=${IO_AWAIT_THRESHOLD:-20}  # CPU %iowait, percent
UTIL_THRESHOLD=${UTIL_THRESHOLD:-80}          # device %util, percent
QUEUE_THRESHOLD=${QUEUE_THRESHOLD:-8}         # average request queue length
#######################################
# Runs one I/O bottleneck check and prints a warning for every exceeded
# threshold.
# Globals:   IO_AWAIT_THRESHOLD, UTIL_THRESHOLD, QUEUE_THRESHOLD (read)
# Arguments: none
# Outputs:   progress line and warnings on stdout
# Requires:  iostat (sysstat) with -y support, awk
#######################################
check_io_bottleneck() {
  local io_wait

  echo "开始I/O瓶颈检测..."

  # CPU iowait: take the value row of the LAST avg-cpu report. iostat ends
  # its output with a blank line, so "tail -1" would yield nothing. The
  # column is located through the header instead of a fixed position.
  io_wait=$(LC_ALL=C iostat -c 1 2 | awk '
    /^avg-cpu/ {
      col = 0
      for (i = 2; i <= NF; i++) if ($i == "%iowait") col = i - 1
      expect = 1
      next
    }
    expect && NF && col { value = $col; expect = 0 }
    END { print value }
  ')
  if [[ -n "$io_wait" ]] \
    && awk -v v="$io_wait" -v max="$IO_AWAIT_THRESHOLD" 'BEGIN { exit !(v > max) }'; then
    echo "警告: CPU I/O等待过高 - ${io_wait}%"
  fi

  # Device utilisation and queue length from a single interval sample.
  # -y drops the since-boot report so the numbers reflect the last second.
  # Columns are found by name because their position differs between
  # sysstat versions ("avgqu-sz" was renamed to "aqu-sz"). Queue warnings
  # are buffered so that all utilisation warnings are printed first.
  LC_ALL=C iostat -dxy 1 1 | awk \
    -v util_max="$UTIL_THRESHOLD" -v queue_max="$QUEUE_THRESHOLD" '
    /^Device/ {
      util_col = 0; queue_col = 0
      for (i = 1; i <= NF; i++) {
        if ($i == "%util") util_col = i
        else if ($i == "aqu-sz" || $i == "avgqu-sz") queue_col = i
      }
      next
    }
    NF == 0 || util_col == 0 { next }
    {
      if (($util_col + 0) > util_max)
        printf "警告: 设备 %s 使用率过高 - %s%%\n", $1, $util_col
      if (queue_col && ($queue_col + 0) > queue_max)
        queued[++n] = sprintf("警告: 设备 %s 队列过长 - %s", $1, $queue_col)
    }
    END { for (i = 1; i <= n; i++) print queued[i] }
  '
}
# Continuous monitoring: run a check, print a separator, pause 30 seconds,
# and repeat until the script is interrupted.
while :; do
  check_io_bottleneck
  echo "----------------------------------------"
  sleep 30
done
2. 高I/O进程精准定位
(1)进程级I/O分析工具
#!/bin/bash
# high_io_process_finder.sh
#
# Locates the processes responsible for heavy disk I/O using three
# independent sources: pidstat, iotop (if installed) and /proc/<pid>/io.
# Run as root to see the I/O counters of processes owned by other users.

echo "查找高I/O占用进程..."
echo ""

# Method 1: pidstat, sorted on its fourth column.
echo "1. pidstat检测结果:"
pidstat -d 1 3 | grep -v Linux | sort -k4 -nr | head -10

# Method 2: iotop in batch mode, only processes actually doing I/O.
echo -e "\n2. iotop检测结果:"
if command -v iotop >/dev/null 2>&1; then
  iotop -b -n 3 -o | head -15
else
  echo "请安装iotop: yum install iotop 或 apt-get install iotop"
fi

# Method 3: cumulative byte counters straight from /proc.
echo -e "\n3. /proc统计结果:"
for io_file in /proc/[0-9]*/io; do
  [[ -f "$io_file" ]] || continue
  pid=${io_file#/proc/}
  pid=${pid%/io}
  # Match the field names exactly: a plain "grep write_bytes" also hits
  # "cancelled_write_bytes" and yields two numbers. Unreadable files (other
  # users' processes, or processes that just exited) count as 0/0.
  read -r read_bytes write_bytes < <(
    awk '$1 == "read_bytes:"  { r = $2 }
         $1 == "write_bytes:" { w = $2 }
         END { print r + 0, w + 0 }' "$io_file" 2>/dev/null
  )
  read_bytes=${read_bytes:-0}
  write_bytes=${write_bytes:-0}
  if (( read_bytes > 1048576 || write_bytes > 1048576 )); then
    comm=$(cat "/proc/$pid/comm" 2>/dev/null)
    echo "PID: $pid, 进程: $comm, 读: $((read_bytes/1024/1024))MB, 写: $((write_bytes/1024/1024))MB"
  fi
done | sort -k6 -nr | head -10
二、常见I/O性能问题解决方案
1. 磁盘满导致的I/O问题
(1)智能磁盘空间清理
#!/bin/bash
# disk_space_cleaner.sh
#
# Frees disk space: lists the largest files, then removes rotated logs and
# stale temporary files older than 7 days, and finally clears package
# manager and Docker caches.
# WARNING: steps 2, 3 and 5 delete data without asking for confirmation.

echo "开始智能磁盘空间清理..."
echo ""

# 1. The 20 largest files above 100 MB. Pseudo filesystems are pruned so
#    the scan neither wastes time in them nor reports virtual files.
echo "1. 查找大于100MB的文件:"
find / \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune \
  -o -type f -size +100M -exec ls -lh {} + 2>/dev/null \
  | sort -k5 -hr | head -20

# 2. Rotated log files (*.log.*) not modified for more than 7 days.
#    A preview of up to 10 files is shown before the deletion.
echo -e "\n2. 清理日志文件:"
for log_dir in /var/log /opt/*/logs; do
  [[ -d "$log_dir" ]] || continue
  echo "清理 $log_dir:"
  find "$log_dir" -type f -name "*.log.*" -mtime +7 -exec ls -lh {} + | head -10
  find "$log_dir" -type f -name "*.log.*" -mtime +7 -delete
done

# 3. Temporary files not accessed for more than 7 days.
echo -e "\n3. 清理临时文件:"
for tmp_dir in /tmp /var/tmp; do
  [[ -d "$tmp_dir" ]] || continue
  echo "清理 $tmp_dir:"
  find "$tmp_dir" -type f -atime +7 -exec ls -lh {} + 2>/dev/null | head -10
  find "$tmp_dir" -type f -atime +7 -delete 2>/dev/null
done

# 4. Package manager caches.
echo -e "\n4. 清理包管理器缓存:"
if command -v apt-get >/dev/null 2>&1; then
  apt-get clean
  echo "已清理APT缓存"
elif command -v yum >/dev/null 2>&1; then
  yum clean all
  echo "已清理YUM缓存"
fi

# 5. Unused Docker data (stopped containers, dangling images, ...).
echo -e "\n5. 清理Docker资源:"
if command -v docker >/dev/null 2>&1; then
  docker system prune -f
  echo "已清理Docker资源"
fi
2. 内存不足导致的Swap I/O
(1)Swap使用分析与优化
#!/bin/bash
# swap_optimizer.sh
#
# Reports swap usage (system-wide and per process) and prints tuning
# suggestions. The script only prints advice; it changes no settings.

echo "Swap使用情况分析..."
echo ""

# 1. Current swap state.
echo "1. Swap当前状态:"
free -h
swapon -s

# 2. Processes holding swap, largest first. Only numeric /proc entries are
#    scanned so /proc/self and /proc/thread-self are not reported as extra
#    bogus "processes"; a process may exit mid-scan, hence the 2>/dev/null.
echo -e "\n2. 使用Swap的进程:"
for status_file in /proc/[0-9]*/status; do
  vmswap=$(awk '$1 == "VmSwap:" { print $2 }' "$status_file" 2>/dev/null)
  if [[ "$vmswap" =~ ^[0-9]+$ ]] && (( vmswap > 0 )); then
    pid=${status_file#/proc/}
    pid=${pid%/status}
    comm=$(cat "/proc/$pid/comm" 2>/dev/null)
    echo "PID: $pid, 进程: $comm, Swap使用: ${vmswap}KB"
  fi
done | sort -k6 -nr | head -10

# 3. Suggestions.
echo -e "\n3. 优化建议:"
mem_total=$(awk '$1 == "MemTotal:" { print $2 }' /proc/meminfo)
swap_total=$(awk '$1 == "SwapTotal:" { print $2 }' /proc/meminfo)
if (( ${swap_total:-0} > ${mem_total:-0} )); then
  echo "建议: Swap空间过大,考虑减少Swap使用"
fi

# Judge the running kernel value, whether or not /etc/sysctl.conf mentions
# swappiness: the kernel default applies precisely when it is NOT
# configured there, and that is the common case worth flagging.
current_swappiness=$(sysctl -n vm.swappiness 2>/dev/null)
if [[ "$current_swappiness" =~ ^[0-9]+$ ]] && (( current_swappiness > 10 )); then
  echo "建议: 降低swappiness值,当前为 $current_swappiness"
  if grep -qE '^[[:space:]]*vm\.swappiness' /etc/sysctl.conf 2>/dev/null; then
    # An entry already exists: suggest editing it instead of appending a
    # second, conflicting line.
    echo "执行: sed -i 's/^[[:space:]]*vm\.swappiness.*/vm.swappiness=10/' /etc/sysctl.conf && sysctl -p"
  else
    echo "执行: echo 'vm.swappiness=10' >> /etc/sysctl.conf && sysctl -p"
  fi
fi
三、磁盘I/O性能深度调优
1. 文件系统参数优化
(1)EXT4文件系统调优
#!/bin/bash
# ext4_optimizer.sh
#
# Prints the current ext4/xfs mount options, per-device mount option
# recommendations, inode usage and read-ahead suggestions.
# The script only prints advice; it changes no settings.

echo "EXT4文件系统优化..."
echo ""

# Current mount options.
echo "1. 当前文件系统挂载参数:"
mount | grep -E "ext4|xfs"

# Recommendations per device type.
echo -e "\n2. 优化建议:"
while IFS= read -r device; do
  rotational_file="/sys/block/$device/queue/rotational"
  [[ -f "$rotational_file" ]] || continue
  rotational=$(cat "$rotational_file")
  if [[ "$rotational" -eq 0 ]]; then
    echo "设备 $device 是SSD,建议挂载参数:"
    # barrier=0 is deliberately NOT recommended: without write barriers a
    # power loss can corrupt the filesystem. Online "discard" is replaced
    # by periodic fstrim, which avoids per-delete TRIM latency.
    echo "  defaults,noatime,nodiratime"
    echo "  (TRIM建议使用定期fstrim: systemctl enable --now fstrim.timer)"
  else
    echo "设备 $device 是HDD,建议挂载参数:"
    echo "  defaults,noatime,nodiratime,barrier=1"
  fi
done < <(lsblk -dn -o NAME)

# Inode usage.
echo -e "\n3. Inode使用情况:"
df -i | grep -v tmpfs

# Read-ahead. blockdev reports 512-byte sectors; the targets equal the
# read_ahead_kb values used by io_scheduler_tuner.sh (128 KB for SSD,
# 512 KB for HDD) so the two scripts do not contradict each other.
echo -e "\n4. 调整预读参数:"
while IFS= read -r device; do
  current_readahead=$(blockdev --getra "/dev/$device" 2>/dev/null)
  [[ "$current_readahead" =~ ^[0-9]+$ ]] || continue
  target_readahead=1024
  if [[ "$(cat "/sys/block/$device/queue/rotational" 2>/dev/null)" == "0" ]]; then
    target_readahead=256
  fi
  if (( current_readahead < target_readahead )); then
    echo "设备 $device 预读值较低: $current_readahead"
    echo "建议设置: blockdev --setra $target_readahead /dev/$device"
  fi
done < <(lsblk -dn -o NAME)
2. I/O调度器优化配置
(1)调度器智能配置
#!/bin/bash
# io_scheduler_tuner.sh
#
# Picks an I/O scheduler and queue parameters per block device based on
# whether the device is rotational. Settings are written to sysfs only when
# running as root; otherwise the recommended commands are just printed.
# Changes made through sysfs do not persist across reboots.

echo "I/O调度器优化配置..."
echo ""

if (( EUID == 0 )); then
  can_apply=1
else
  can_apply=0
  echo "提示: 非root用户,仅显示建议,不修改配置"
fi

# Returns 0 when scheduler $1 is listed in the scheduler file content $2.
# Names are compared whole, so "deadline" does not match "mq-deadline".
has_scheduler() {
  local wanted=$1 name
  # Strip the [] that marks the active scheduler, then split on whitespace.
  for name in ${2//[\[\]]/}; do
    [[ "$name" == "$wanted" ]] && return 0
  done
  return 1
}

# Writes value $1 to sysfs file $2 when running as root and the file
# exists. A rejected write is reported instead of being silently ignored.
write_sysfs() {
  local value=$1 target=$2
  (( can_apply )) || return 0
  [[ -f "$target" ]] || return 0
  if ! { echo "$value" > "$target"; } 2>/dev/null; then
    echo "警告: 无法写入 $target (值: $value)" >&2
    return 1
  fi
}

# Recommends the first available scheduler among the candidates.
# Arguments: $1 scheduler file, $2 current scheduler, $3 available list,
#            $4 device kind label (SSD/HDD), $5... candidates by preference
recommend_scheduler() {
  local scheduler_file=$1 current=$2 available=$3 kind=$4
  shift 4
  local candidate note="优化"
  for candidate in "$@"; do
    if has_scheduler "$candidate" "$available"; then
      echo "推荐调度器: $candidate (${kind}${note})"
      if [[ "$current" != "$candidate" ]]; then
        echo "执行: echo $candidate > $scheduler_file"
        write_sysfs "$candidate" "$scheduler_file"
      fi
      return 0
    fi
    # Every candidate after the first is a fallback choice.
    note="次选"
  done
  return 1
}

# Detect and tune the scheduler of every top-level block device.
while IFS= read -r device; do
  device_path="/dev/$device"
  scheduler_file="/sys/block/$device/queue/scheduler"
  [[ -f "$scheduler_file" ]] || continue

  available_schedulers=$(cat "$scheduler_file")
  # The active scheduler is the one shown in square brackets.
  current_scheduler=$(grep -o '\[.*\]' "$scheduler_file" | tr -d '[]')
  echo "设备: $device_path"
  echo "当前调度器: $current_scheduler"
  echo "可用调度器: $available_schedulers"

  rotational_file="/sys/block/$device/queue/rotational"
  if [[ -f "$rotational_file" ]]; then
    if [[ "$(cat "$rotational_file")" -eq 0 ]]; then
      # SSD: no reordering needed; "noop" is the legacy name for this.
      recommend_scheduler "$scheduler_file" "$current_scheduler" \
        "$available_schedulers" "SSD" none noop
    else
      # HDD: deadline-style scheduling; plain "deadline" on legacy kernels.
      recommend_scheduler "$scheduler_file" "$current_scheduler" \
        "$available_schedulers" "HDD" mq-deadline deadline
    fi
  fi
  echo ""
done < <(lsblk -dn -o NAME)

# Queue depth and read-ahead per device type.
echo "优化I/O队列参数..."
while IFS= read -r device; do
  queue_dir="/sys/block/$device/queue"
  [[ -f "$queue_dir/rotational" ]] || continue
  if [[ "$(cat "$queue_dir/rotational")" -eq 0 ]]; then
    # SSD
    write_sysfs 256 "$queue_dir/nr_requests"
    write_sysfs 128 "$queue_dir/read_ahead_kb"
  else
    # HDD
    write_sysfs 128 "$queue_dir/nr_requests"
    write_sysfs 512 "$queue_dir/read_ahead_kb"
  fi
done < <(lsblk -dn -o NAME)

echo "I/O调度器优化完成"
四、数据库I/O性能专项优化
1. MySQL数据库I/O调优
(1)MySQL I/O配置优化
#!/bin/bash
# mysql_io_optimizer.sh
#
# Reviews MySQL settings that influence disk I/O and prints tuning
# suggestions. Assumes the mysql client can connect without extra
# arguments (e.g. credentials in ~/.my.cnf).

echo "MySQL I/O性能优化检查..."
echo ""

#######################################
# Prints the current values of the I/O related server variables and the
# "BUFFER POOL AND MEMORY" section of the InnoDB status report.
# Arguments: none
# Outputs:   query results on stdout; mysql client errors are suppressed,
#            so nothing is printed for a query that fails
#######################################
check_mysql_config() {
  local variable

  echo "1. MySQL当前I/O相关配置:"
  for variable in \
    innodb_buffer_pool_size \
    innodb_log_file_size \
    innodb_flush_log_at_trx_commit \
    sync_binlog \
    innodb_io_capacity; do
    mysql -e "SHOW VARIABLES LIKE '${variable}';" 2>/dev/null
  done

  echo -e "\n2. MySQL状态信息:"
  mysql -e "SHOW ENGINE INNODB STATUS\G" 2>/dev/null \
    | grep -A 10 "BUFFER POOL AND MEMORY"
}
#######################################
# Prints suggested InnoDB settings derived from the installed memory and
# from the type (SSD/HDD) of the first real disk reported by lsblk.
# Arguments: $1 - meminfo file to read (optional, default /proc/meminfo)
# Outputs:   suggestions on stdout
#######################################
generate_optimization_suggestions() {
  local meminfo=${1:-/proc/meminfo}
  local total_memory memory_mb recommended_buffer_pool
  local name rota

  echo -e "\n3. 优化建议:"

  # MemTotal is reported in kB.
  total_memory=$(awk '$1 == "MemTotal:" { print $2; exit }' "$meminfo")
  memory_mb=$(( ${total_memory:-0} / 1024 ))
  # Buffer pool recommendation: 70% of physical memory.
  recommended_buffer_pool=$(( memory_mb * 70 / 100 ))

  echo "根据系统内存 ${memory_mb}MB 推荐配置:"
  echo "innodb_buffer_pool_size = ${recommended_buffer_pool}M"
  echo "innodb_log_file_size = 1G"
  echo "innodb_flush_log_at_trx_commit = 2 (性能优先)"
  echo "sync_binlog = 0 (性能优先)"

  # io_capacity depends on the disk type. Loop (major 7) and optical
  # (major 11) devices are excluded so they cannot be mistaken for the data
  # disk; only the first remaining device is considered.
  while read -r name rota; do
    case "$rota" in
      0)
        echo "innodb_io_capacity = 2000 (SSD优化)"
        echo "innodb_io_capacity_max = 4000"
        break
        ;;
      1)
        echo "innodb_io_capacity = 200 (HDD优化)"
        echo "innodb_io_capacity_max = 400"
        break
        ;;
    esac
  done < <(lsblk -dn -e 7,11 -o NAME,ROTA 2>/dev/null)
}
#######################################
# Lists the ten tables with the most reclaimable space (DATA_FREE).
# Arguments: none
# Outputs:   query result on stdout; mysql client errors are suppressed
#######################################
check_table_fragmentation() {
  echo -e "\n4. 检查表碎片:"
  mysql -e "SELECT TABLE_SCHEMA, TABLE_NAME, DATA_FREE FROM information_schema.TABLES WHERE DATA_FREE > 0 ORDER BY DATA_FREE DESC LIMIT 10;" 2>/dev/null
}
# Run the three report stages in order.
for stage in check_mysql_config generate_optimization_suggestions check_table_fragmentation; do
  "$stage"
done
五、I/O性能监控与告警
1. 实时I/O性能看板
#!/bin/bash
# io_performance_dashboard.sh
#
# ANSI color sequences, meant to be printed with "echo -e". Each sequence
# must start with the ESC character (\033); without it the terminal shows
# the raw text "3[0;31m" instead of switching color.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'  # reset to the default color
#######################################
# Renders one frame of the dashboard: iostat report, per-device
# utilisation with color-coded severity, top CPU consumers and memory use.
# Globals:   RED, GREEN, YELLOW, BLUE, NC (read)
# Arguments: none
# Outputs:   the dashboard frame on stdout
# Requires:  iostat (sysstat) with -y support, awk, ps, free
#######################################
monitor_io_performance() {
  local report level device util

  clear
  echo -e "${BLUE}====== I/O性能实时监控看板 ======${NC}"
  echo "刷新时间: $(date)"
  echo ""

  # One sample feeds both sections so they describe the same interval.
  # -y drops the since-boot report: the numbers cover the last second.
  report=$(LC_ALL=C iostat -xy 1 1)

  # 1. System-level I/O statistics (banner line and blank line skipped).
  echo -e "${GREEN}1. 系统I/O统计:${NC}"
  tail -n +3 <<<"$report"

  # 2. Device utilisation. Only rows after the "Device" header are
  #    evaluated, and %util is located by name since its column position
  #    differs between sysstat versions.
  echo -e "${GREEN}2. 设备使用率监控:${NC}"
  awk '
    /^Device/ {
      col = 0
      for (i = 1; i <= NF; i++) if ($i == "%util") col = i
      next
    }
    NF == 0 || col == 0 { next }
    {
      level = "ok"
      if (($col + 0) > 80) level = "high"
      else if (($col + 0) > 60) level = "warn"
      print level, $1, $col
    }
  ' <<<"$report" | while read -r level device util; do
    case "$level" in
      high) echo -e "${RED}警告: 设备 $device 使用率 ${util}%${NC}" ;;
      warn) echo -e "${YELLOW}注意: 设备 $device 使用率 ${util}%${NC}" ;;
      *)    echo -e "${GREEN}正常: 设备 $device 使用率 ${util}%${NC}" ;;
    esac
  done

  # 3. Process ranking.
  # NOTE(review): the heading says "I/O ranking" but the list is sorted by
  # CPU usage; confirm whether a pidstat/iotop based ranking was intended.
  echo -e "${GREEN}3. 实时进程I/O排名:${NC}"
  ps aux --sort=-%cpu | head -5 | awk 'NR>1{print "高CPU进程: "$11" (CPU: "$3"%)"}'

  # 4. Memory and swap usage.
  echo -e "${GREEN}4. 内存使用情况:${NC}"
  free -h | grep -E "Mem|Swap"
}
# Main loop: redraw the dashboard, then wait two seconds, until interrupted.
while :; do
  monitor_io_performance
  sleep 2
done
总结
Linux磁盘I/O性能问题需要系统化的诊断和优化方案。通过本文提供的监控脚本、分析工具和调优方法,可以快速定位I/O瓶颈并实施有效的优化措施。建议在生产环境中建立完善的I/O性能监控体系,定期进行性能评估和优化调整。
© 版权声明
THE END












暂无评论内容