本文全面解析Linux系统中服务异常与进程故障的诊断与修复方法,涵盖systemd服务管理、进程状态监控、核心转储分析、资源泄漏排查等关键技术,提供完整的故障排查流程和实战修复案例。
![图片[1]-Linux服务异常排查完全指南 - 从进程崩溃到服务故障的系统级解决方案](https://blogimg.vcvcc.cc/2025/12/20251202113737415-1024x768.png?imageView2/0/format/webp/q/75)
Linux进程生命周期与异常状态分析
进程状态监控与异常识别
深入理解Linux进程状态对于诊断异常至关重要,这里提供了完整的进程状态监控方案。
#!/bin/bash
# process_monitor.sh - advanced process monitoring and anomaly detection script.
# Monitors process states, resource usage and abnormal behaviour.
set -o pipefail

# Configuration
LOG_FILE="/var/log/process_monitor_$(date +%Y%m%d).log"
# Alert thresholds, stored as KEY=VALUE strings.
# NOTE(review): no function in this script reads this array yet.
ALERT_THRESHOLDS=(
  "CPU_USAGE=90"     # CPU usage threshold
  "MEM_USAGE=80"     # memory usage threshold
  "ZOMBIE_COUNT=5"   # zombie process threshold
  "D_STATE_COUNT=3"  # D-state process threshold
  "FD_COUNT=10240"   # file descriptor threshold
)
ALERT_EMAIL="admin@example.com"

# Colour escape sequences (interpreted by `echo -e` at the call sites)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Append a session-start banner, including the current date, to $LOG_FILE.
init_logging() {
  local banner
  banner="=== 进程监控开始 $(date) ==="
  printf '%s\n' "$banner" >> "$LOG_FILE"
}
# Print a per-state process count, highlight zombie and D-state totals,
# run detect_hung_processes, and append the same summary to $LOG_FILE.
# Globals: LOG_FILE (appended to), BLUE/RED/YELLOW/NC (read)
analyze_process_states() {
  echo -e "${BLUE}=== 进程状态分析 ===${NC}"

  # One line per state: "<count> <state letter>"
  local states
  states=$(ps -eo state --no-headers | sort | uniq -c)
  echo -e "进程状态统计:\n$states"

  # Sum the counts with awk so the result is always a number. The previous
  # grep|awk||echo form only produced "0" when pipefail happened to be on.
  local zombie_count
  zombie_count=$(awk '$2 ~ /^[Zz]/ { n += $1 } END { print n + 0 }' <<< "$states")
  echo -e "僵尸进程数量: ${RED}$zombie_count${NC}"

  # D = uninterruptible sleep
  local d_state_count
  d_state_count=$(awk '$2 ~ /^D/ { n += $1 } END { print n + 0 }' <<< "$states")
  echo -e "D状态进程数量: ${YELLOW}$d_state_count${NC}"

  # Look for processes that may be hung
  detect_hung_processes

  # Persist the summary
  {
    echo "时间: $(date)"
    echo "进程状态统计:"
    echo "$states"
    echo "僵尸进程: $zombie_count"
    echo "D状态进程: $d_state_count"
  } >> "$LOG_FILE"
}
# Scan the 20 longest-running processes (ps sorted by elapsed time) and hand
# any in state D or Z to the matching analysis function.
# The ps header line is read too; its state column never equals D or Z.
detect_hung_processes() {
  echo -e "\n${CYAN}=== 检测挂起进程 ===${NC}"

  local pid state etime _cmd
  # read -r: without it a command line ending in a backslash is joined with
  # the next line and that next process is never examined.
  ps -eo pid,state,etime,cmd --sort=-etime | head -20 \
    | while read -r pid state etime _cmd; do
        case "$state" in
          D)
            # Uninterruptible sleep
            echo -e "${RED}警告: 进程 $pid 处于不可中断睡眠状态 (运行时间: $etime)${NC}"
            analyze_blocked_process "$pid"
            ;;
          Z)
            # Zombie
            echo -e "${RED}警告: 进程 $pid 是僵尸进程${NC}"
            analyze_zombie_process "$pid"
            ;;
        esac
      done
}
# Print what /proc exposes about a blocked process: kernel stack, I/O
# counters and the first few open file descriptors.
# Arguments: $1 - PID to inspect
analyze_blocked_process() {
  local pid=$1
  echo -e "${YELLOW}分析被阻塞进程 $pid...${NC}"

  # Kernel stack trace (the file may exist yet be unreadable)
  if [[ -f "/proc/$pid/stack" ]]; then
    echo "进程堆栈:"
    cat "/proc/$pid/stack" 2>/dev/null || echo "无法读取堆栈信息"
  fi

  # I/O counters
  if [[ -f "/proc/$pid/io" ]]; then
    echo "进程IO统计:"
    head -10 "/proc/$pid/io" 2>/dev/null
  fi

  # Open file descriptors
  echo "进程打开的文件描述符:"
  ls -l "/proc/$pid/fd" 2>/dev/null | head -5
}
# Describe the parent of a zombie process and, unless the parent is itself a
# zombie, send it SIGCHLD as a hint to reap its child.
# Arguments: $1 - PID of the zombie
analyze_zombie_process() {
  local pid=$1
  echo -e "${YELLOW}分析僵尸进程 $pid...${NC}"

  # Parent PID; ps pads the value, so strip whitespace.
  local ppid
  ppid=$(ps -o ppid= -p "$pid" 2>/dev/null)
  ppid=${ppid//[[:space:]]/}
  # Nothing to report if the process has already gone away.
  [[ -n "$ppid" ]] || return 0

  echo "父进程ID: $ppid"
  echo "父进程命令: $(ps -o cmd= -p "$ppid" 2>/dev/null || echo '未知')"

  # Parent state
  local pstate
  pstate=$(ps -o state= -p "$ppid" 2>/dev/null)
  pstate=${pstate//[[:space:]]/}
  echo "父进程状态: $pstate"

  # Try signalling the parent
  if [[ "$pstate" != "Z" ]]; then
    echo "尝试通知父进程回收子进程..."
    if kill -CHLD "$ppid" 2>/dev/null; then
      echo "已发送SIGCHLD信号"
    else
      echo "发送信号失败"
    fi
  fi
}
# Show the top CPU and memory consumers, then compare the number of files
# lsof reports for the current user against this shell's `ulimit -n`.
# NOTE(review): the lsof figure is per user while `ulimit -n` is a per-process
# limit, so the percentage is only a rough indicator.
analyze_resource_usage() {
  echo -e "\n${PURPLE}=== 资源使用分析 ===${NC}"

  # Heaviest consumers (header line + 10 rows)
  echo -e "${CYAN}CPU使用前10:${NC}"
  ps -eo pid,ppid,cmd,%cpu,%mem --sort=-%cpu | head -11
  echo -e "\n${CYAN}内存使用前10:${NC}"
  ps -eo pid,ppid,cmd,%cpu,%mem --sort=-%mem | head -11

  # File descriptor usage
  echo -e "\n${CYAN}文件描述符使用:${NC}"
  local max_fd fd_usage fd_percent=0
  max_fd=$(ulimit -n)
  # NR > 1 skips lsof's column header; n + 0 yields 0 when lsof prints nothing.
  fd_usage=$(lsof -u "$(whoami)" 2>/dev/null \
    | awk 'NR > 1 { n++ } END { print n + 0 }')
  # `ulimit -n` may print "unlimited"; only divide by a positive integer.
  if [[ "$max_fd" =~ ^[0-9]+$ ]] && (( max_fd > 0 )); then
    fd_percent=$(( fd_usage * 100 / max_fd ))
  fi
  echo "文件描述符限制: $max_fd"
  echo "当前使用: $fd_usage"
  echo "使用率: ${fd_percent}%"
  if [[ $fd_percent -gt 80 ]]; then
    echo -e "${RED}警告: 文件描述符使用率过高${NC}"
  fi
}
# List failed systemd units with a status excerpt and their last journal
# lines. Without systemctl, fall back to `service --status-all` and show the
# entries marked "[ - ]".
check_service_status() {
  echo -e "\n${GREEN}=== 服务状态检查 ===${NC}"

  if ! command -v systemctl &> /dev/null; then
    echo "systemctl不可用,检查init.d服务..."
    service --status-all 2>/dev/null | grep -E '\[ - \]' || echo "未发现异常服务"
    return 0
  fi

  echo "检查失败的systemd服务..."
  # --plain drops the leading "●" marker so column 1 is the unit name.
  local failed_services
  failed_services=$(systemctl --failed --no-legend --no-pager --plain \
    | awk '{print $1}')
  if [[ -z "$failed_services" ]]; then
    echo -e "${GREEN}所有systemd服务运行正常${NC}"
    return 0
  fi

  echo -e "${RED}发现失败的服务:${NC}"
  local service
  # Word splitting is intended: one unit name per line.
  for service in $failed_services; do
    echo -e "\n服务: $service"
    echo "状态:"
    systemctl status "$service" --no-pager | head -20
    # Recent journal entries
    echo "最近日志:"
    journalctl -u "$service" -n 5 --no-pager
  done
}
# Write a system snapshot (memory, disk, top processes, listening sockets,
# kernel log tail) to a new file under /tmp and print its path.
# Returns: 1 if the report file cannot be created.
generate_diagnostic_report() {
  # mktemp gives a unique, unguessable name instead of a fixed /tmp path,
  # and two reports generated in the same second no longer overwrite each other.
  local stamp report_file
  stamp=$(date +%Y%m%d_%H%M%S)
  if ! report_file=$(mktemp "/tmp/process_diagnostic_${stamp}.XXXXXX"); then
    echo "错误: 无法创建诊断报告文件" >&2
    return 1
  fi

  {
    echo "=== Linux进程诊断报告 ==="
    echo "生成时间: $(date)"
    echo "主机名: $(hostname)"
    echo "内核版本: $(uname -r)"
    echo "系统负载: $(uptime)"
    echo ""
    echo "=== 系统概览 ==="
    free -h
    echo ""
    df -h
    echo ""
    echo "=== 进程状态 ==="
    ps aux --sort=-%cpu | head -15
    echo ""
    echo "=== 网络连接 ==="
    # netstat is not installed everywhere; ss takes the same -tulpn flags.
    if command -v netstat &> /dev/null; then
      netstat -tulpn | head -20
    else
      ss -tulpn | head -20
    fi
    echo ""
    echo "=== 系统日志最近错误 ==="
    dmesg | tail -20
  } > "$report_file"

  echo -e "${GREEN}诊断报告已生成: $report_file${NC}"
}
# Mail an alert to $ALERT_EMAIL via the `mail` command.
# Does nothing when ALERT_EMAIL is empty.
# Arguments: $1 - subject line, $2 - message body
send_alert() {
  local mail_subject=$1
  local mail_body=$2

  # No recipient configured: skip silently.
  [[ -n "$ALERT_EMAIL" ]] || return 0

  echo "$mail_body" | mail -s "$mail_subject" "$ALERT_EMAIL"
  echo "已发送警报到 $ALERT_EMAIL"
}
# Interactive dashboard: each pass redraws the analysis sections, offers a
# menu, runs the selected action and pauses for 5 seconds.
# The loop ends on choice 5 or when stdin reaches end-of-file.
main_monitor() {
  init_logging

  local choice
  while true; do
    clear
    echo -e "${BLUE}=== Linux进程监控面板 ===${NC}"
    echo "时间: $(date)"
    echo ""
    analyze_process_states
    analyze_resource_usage
    check_service_status

    echo -e "\n${CYAN}=== 操作选项 ===${NC}"
    echo "1. 生成详细诊断报告"
    echo "2. 清理僵尸进程"
    echo "3. 重启失败的服务"
    echo "4. 查看监控日志"
    echo "5. 退出监控"

    # Stop on EOF: otherwise a closed stdin makes read fail on every pass
    # and the loop never ends.
    if ! read -r -p "请选择操作 [1-5]: " choice; then
      echo
      break
    fi

    case $choice in
      1) generate_diagnostic_report ;;
      2) cleanup_zombie_processes ;;
      3) restart_failed_services ;;
      4) view_monitor_logs ;;
      5) break ;;
      *) echo "无效选择" ;;
    esac
    sleep 5
  done
}
# For every zombie listed by `ps aux`, send SIGCHLD to its parent so the
# parent can reap it. If the parent is itself a zombie, send it SIGKILL as the
# original script did.
# NOTE(review): SIGKILL is unlikely to affect a parent that is already a
# zombie - confirm this step is still wanted.
cleanup_zombie_processes() {
  echo "清理僵尸进程..."

  # Column 8 of `ps aux` is STAT; match on the first letter so values such as
  # "Z+" or "Zs" are caught as well as a bare "Z".
  local zombies
  zombies=$(ps aux | awk '$8 ~ /^Z/ { print $2 }')
  if [[ -z "$zombies" ]]; then
    echo "未发现僵尸进程"
    return 0
  fi

  local pid ppid
  for pid in $zombies; do
    ppid=$(ps -o ppid= -p "$pid" 2>/dev/null)
    ppid=${ppid//[[:space:]]/}   # ps pads the value
    # The zombie may have been reaped since the listing was taken.
    [[ -n "$ppid" ]] || continue

    echo "清理僵尸进程 $pid (父进程: $ppid)"
    # Ask the parent to reap its child
    kill -CHLD "$ppid" 2>/dev/null
    # Parent is a zombie too: terminate it
    if ps -o state= -p "$ppid" 2>/dev/null | grep -q "Z"; then
      echo "父进程 $ppid 也是僵尸,终止它"
      kill -9 "$ppid" 2>/dev/null
    fi
  done
}
# Restart every unit that systemctl reports as failed and say whether each
# one is active again 2 seconds later. Does nothing without systemctl.
restart_failed_services() {
  command -v systemctl &> /dev/null || return 0

  # --plain drops the leading "●" marker so column 1 is the unit name.
  local failed_services service
  failed_services=$(systemctl --failed --no-legend --no-pager --plain \
    | awk '{print $1}')

  # Word splitting is intended: one unit name per line.
  for service in $failed_services; do
    echo "重启服务: $service"
    systemctl restart "$service"
    sleep 2
    if systemctl is-active --quiet "$service"; then
      echo "✓ 服务 $service 重启成功"
    else
      echo "✗ 服务 $service 重启失败"
      journalctl -u "$service" -n 10 --no-pager
    fi
  done
}
# Page through the last 50 lines of $LOG_FILE with less, or report that the
# file does not exist.
view_monitor_logs() {
  if [[ ! -f "$LOG_FILE" ]]; then
    echo "日志文件不存在: $LOG_FILE"
    return
  fi
  tail -50 "$LOG_FILE" | less
}
# Script entry point.
# No arguments: start the interactive monitor. Otherwise the first argument
# selects one action; anything unrecognised prints the usage text.
if (( $# > 0 )); then
  case "$1" in
    --analyze)
      analyze_process_states
      analyze_resource_usage
      ;;
    --report) generate_diagnostic_report ;;
    --cleanup) cleanup_zombie_processes ;;
    --service-check) check_service_status ;;
    *)
      printf '%s\n' \
        "用法: $0 [选项]" \
        "选项:" \
        " --analyze 分析进程状态" \
        " --report 生成诊断报告" \
        " --cleanup 清理僵尸进程" \
        " --service-check 检查服务状态" \
        " 无参数 进入交互监控模式"
      ;;
  esac
else
  main_monitor
fi
核心转储分析与调试
配置和使用核心转储进行故障分析
#!/bin/bash
# core_dump_manager.sh - core dump configuration and management tool.
set -euo pipefail

# Configuration
CORE_DIR="/var/coredump"
# core_pattern placeholders: %e executable name, %p PID, %t dump time
CORE_PATTERN="${CORE_DIR}/core.%e.%p.%t"
ULIMIT_CORE="unlimited"

# Colour escape sequences (interpreted by `echo -e` at the call sites)
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Configure core dumps: create $CORE_DIR (mode 1777), write $CORE_PATTERN to
# the running kernel, raise this shell's core size limit, and persist the
# kernel settings in /etc/sysctl.d/99-coredump.conf.
# NOTE(review): `ulimit -c` is a shell builtin, so the limit set here
# presumably covers only this shell and its children - confirm that is enough.
init_core_dump() {
  local sysctl_conf=/etc/sysctl.d/99-coredump.conf

  echo -e "${BLUE}=== 初始化核心转储配置 ===${NC}"

  # Dump directory with the sticky bit set
  mkdir -p "$CORE_DIR"
  chmod 1777 "$CORE_DIR"

  # Apply the pattern to the running kernel
  printf '%s\n' "$CORE_PATTERN" > /proc/sys/kernel/core_pattern

  # Enable core dumps for this shell
  ulimit -c "$ULIMIT_CORE"

  # Persistent configuration
  printf '%s\n' \
    '# 核心转储配置' \
    "kernel.core_pattern = $CORE_PATTERN" \
    'kernel.core_uses_pid = 1' \
    'fs.suid_dumpable = 2' \
    > "$sysctl_conf"
  sysctl -p "$sysctl_conf"

  echo -e "${GREEN}✓ 核心转储配置完成${NC}"
  printf '%s\n' "核心转储目录: $CORE_DIR" "核心转储模式: $CORE_PATTERN"
}
# Analyse one core file against its executable with gdb, objdump and file.
# Arguments: $1 - core file path, $2 - executable path
# Returns:   1 if the core file is missing or the executable is not runnable
analyze_core_dump() {
  # Defaults keep `set -u` from aborting when an argument is omitted.
  local core_file=${1:-}
  local executable=${2:-}

  echo -e "${BLUE}=== 分析核心转储文件 ===${NC}"
  echo "核心文件: $core_file"
  echo "可执行文件: $executable"

  if [[ ! -f "$core_file" ]]; then
    echo -e "${RED}错误: 核心文件不存在${NC}"
    return 1
  fi
  if [[ ! -x "$executable" ]]; then
    echo -e "${RED}错误: 可执行文件不存在或不可执行${NC}"
    return 1
  fi

  # GDB analysis. A gdb failure is reported but must not abort the remaining
  # sections when the script runs under `set -e`.
  echo -e "\n${YELLOW}=== GDB分析 ===${NC}"
  gdb -q "$executable" "$core_file" << 'EOF' || echo -e "${RED}警告: GDB分析未成功完成${NC}" >&2
set pagination off
echo \n=== 线程信息 ===\n
info threads
echo \n=== 崩溃线程堆栈 ===\n
thread apply all bt full
echo \n=== 寄存器信息 ===\n
info registers
echo \n=== 内存映射 ===\n
info proc mappings
echo \n=== 共享库 ===\n
info sharedlibrary
echo \n=== 信号信息 ===\n
info signals
quit
EOF

  # objdump analysis. With pipefail a grep that matches nothing fails the
  # pipeline; `|| true` stops that from ending the script.
  echo -e "\n${YELLOW}=== objdump分析 ===${NC}"
  objdump -x "$executable" | grep -A5 "Program Header" || true

  # Core file type information
  echo -e "\n${YELLOW}=== 核心文件信息 ===${NC}"
  file "$core_file"
}
# Find the newest core.* file in $CORE_DIR, derive the program name from the
# file name, locate the executable and pass both to analyze_core_dump.
auto_analyze_latest_core() {
  echo -e "${BLUE}=== 自动分析最新核心转储 ===${NC}"

  # `|| true`: a missing directory or an early-closing head must not abort
  # the script under `set -e` / pipefail.
  local latest_core
  latest_core=$(find "$CORE_DIR" -name "core.*" -type f -exec ls -t {} + 2>/dev/null \
    | head -1) || true
  if [[ -z "$latest_core" ]]; then
    echo -e "${YELLOW}未找到核心转储文件${NC}"
    return 0
  fi
  echo "发现核心文件: $latest_core"

  # File names follow core.<program>.<pid>.<time>. Strip the prefix and the
  # last two fields so program names containing dots survive intact.
  local prog_name
  prog_name=$(basename "$latest_core")
  prog_name=${prog_name#core.}
  prog_name=${prog_name%.*}
  prog_name=${prog_name%.*}

  # Locate the executable: PATH first, then the standard binary directories.
  local executable
  executable=$(type -P "$prog_name" 2>/dev/null) \
    || executable=$(find /usr/bin /usr/sbin /bin /sbin -name "$prog_name" 2>/dev/null \
      | head -1) \
    || true

  if [[ -n "$executable" ]]; then
    analyze_core_dump "$latest_core" "$executable"
  else
    echo -e "${RED}无法找到可执行文件: $prog_name${NC}"
    echo "尝试在系统中查找..."
    # find usually exits non-zero on unreadable directories; ignore that.
    find / -type f -executable -name "$prog_name" 2>/dev/null | head -5 || true
  fi
}
# Write and compile a small C program that dereferences a null pointer, for
# checking that core dumps are actually produced.
# Outputs: /tmp/test_crash.c (source) and /tmp/test_crash (binary via gcc -g)
create_test_crash() {
  echo -e "${BLUE}=== 创建测试崩溃程序 ===${NC}"

  cat > /tmp/test_crash.c << 'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>   /* getpid() */

void crash_function() {
    char *ptr = NULL;
    *ptr = 'x'; // 空指针解引用
}

void stack_overflow(int depth) {
    char buffer[1024];
    memset(buffer, 0, sizeof(buffer));
    if (depth < 10) {
        stack_overflow(depth + 1);
    } else {
        crash_function();
    }
}

void handle_signal(int sig) {
    printf("收到信号: %d\n", sig);
    fflush(stdout);
    /* Restore the default action and re-raise the signal. Exiting from the
       handler would end the process normally and no core would be written. */
    signal(sig, SIG_DFL);
    raise(sig);
}

int main() {
    signal(SIGSEGV, handle_signal);
    printf("测试崩溃程序启动\n");
    printf("PID: %d\n", getpid());
    stack_overflow(0);
    return 0;
}
EOF

  # Build with debug symbols
  gcc -g -o /tmp/test_crash /tmp/test_crash.c

  echo -e "${GREEN}测试程序已创建: /tmp/test_crash${NC}"
  echo "使用: /tmp/test_crash 来触发崩溃"
}
# Watch $CORE_DIR with inotifywait and, for each new core.* file, run the
# automatic analysis and raise an alert. Runs until inotifywait exits.
monitor_core_dumps() {
  echo -e "${BLUE}=== 监控核心转储生成 ===${NC}"

  local watched_dir events file
  # close_write fires once the writer has closed the file, so the analysis
  # never starts on a dump that is still being written.
  inotifywait -m "$CORE_DIR" -e close_write 2>/dev/null \
    | while read -r watched_dir events file; do
        if [[ "$file" == core.* ]]; then
          echo -e "${RED}检测到新的核心转储: $file${NC}"
          echo "时间: $(date)"
          # Analyse the new dump; one failed analysis must not stop the watcher.
          auto_analyze_latest_core || true
          # Notify
          send_core_alert "$file"
        fi
      done
}
# Announce a new core dump: print a short notice to stdout and record it in
# the system log through logger.
# Arguments: $1 - core file name
send_core_alert() {
  local core_file=$1
  # Prepared for a mail or other notification hook; not used below.
  local alert_subject="[警报] 检测到程序崩溃 - $core_file"

  local alert_body
  printf -v alert_body '检测到新的核心转储文件: %s\n时间: %s\n主机: %s\n' \
    "$core_file" "$(date)" "$(hostname)"

  # Mail delivery or another channel could be wired in here.
  echo -e "${YELLOW}${alert_body}${NC}"

  # System log entry
  logger -t "core_dump" "检测到核心转储: $core_file"
}
# Delete core.* files in $CORE_DIR whose mtime is more than N days old.
# Arguments: $1 - age in days (default 7); must be a non-negative integer
# Returns:   1 if the argument is not a non-negative integer
cleanup_old_cores() {
  local days=${1:-7}

  # The value can come straight from an interactive prompt. Reject anything
  # that is not a plain number so it cannot add extra predicates to find.
  if [[ ! "$days" =~ ^[0-9]+$ ]]; then
    echo -e "${RED}错误: 天数必须为非负整数: $days${NC}" >&2
    return 1
  fi

  echo -e "${BLUE}=== 清理旧的核心转储 ===${NC}"
  echo "清理 $days 天前的核心转储文件"
  find "$CORE_DIR" -name "core.*" -type f -mtime "+$days" -exec rm -v -- {} +
  echo -e "${GREEN}✓ 清理完成${NC}"
}
# Write a report of the core dump configuration, the core files found in
# $CORE_DIR and recent crash-related kernel messages, then print its path.
# Returns: 1 if the report file cannot be created.
generate_core_report() {
  # mktemp gives a unique, unguessable name instead of a fixed /tmp path.
  local report_file
  if ! report_file=$(mktemp "/tmp/core_dump_report_$(date +%Y%m%d).XXXXXX"); then
    echo -e "${RED}错误: 无法创建报告文件${NC}" >&2
    return 1
  fi

  local file prog
  {
    echo "=== 核心转储分析报告 ==="
    echo "生成时间: $(date)"
    echo "主机名: $(hostname)"
    echo ""
    echo "=== 核心转储配置 ==="
    echo "核心转储目录: $CORE_DIR"
    echo "核心转储模式: $(cat /proc/sys/kernel/core_pattern 2>/dev/null)"
    echo "核心转储限制: $(ulimit -c)"
    echo ""
    echo "=== 核心转储文件 ==="
    # Walk NUL-delimited paths from find rather than parsing `ls -l` columns,
    # which breaks on paths containing spaces.
    while IFS= read -r -d '' file; do
      # A file may vanish between find and ls; skip it instead of aborting.
      ls -lh -- "$file" 2>/dev/null || continue
      # core.<program>.<pid>.<time>: drop the prefix and the last two fields.
      prog=$(basename "$file")
      prog=${prog#core.}
      prog=${prog%.*}
      prog=${prog%.*}
      echo " 程序: $prog"
      echo " 生成时间: $(stat -c %y "$file" 2>/dev/null)"
      echo ""
    done < <(find "$CORE_DIR" -name "core.*" -type f -print0 2>/dev/null)
    echo ""
    echo "=== 系统崩溃统计 ==="
    # No matching kernel message is normal; with pipefail + `set -e` the
    # failed grep would otherwise end the script before the report is announced.
    dmesg 2>/dev/null \
      | grep -i "segfault\|panic\|oops\|general protection" \
      | tail -10 || true
  } > "$report_file"

  echo -e "${GREEN}报告已生成: $report_file${NC}"
}
# Interactive menu for the core dump tool. Each pass shows the options, runs
# the selected action and waits for Enter; option 7 exits the script.
main_menu() {
  while :; do
    clear
    echo -e "${BLUE}=== Linux核心转储管理工具 ===${NC}"
    printf '%s\n' \
      "1. 配置核心转储" \
      "2. 分析最新核心转储" \
      "3. 创建测试崩溃程序" \
      "4. 监控核心转储生成" \
      "5. 清理旧的核心转储" \
      "6. 生成核心转储报告" \
      "7. 退出"

    # read is used without -r, as in the original, so input handling is unchanged.
    read -p "请选择操作 [1-7]: " choice
    case "$choice" in
      1) init_core_dump ;;
      2) auto_analyze_latest_core ;;
      3) create_test_crash ;;
      4) monitor_core_dumps ;;
      5)
        # An empty answer falls back to 7 days.
        read -p "清理多少天前的文件? [7]: " days
        cleanup_old_cores "${days:-7}"
        ;;
      6) generate_core_report ;;
      7) exit 0 ;;
      *) echo "无效选择" ;;
    esac

    echo -e "\n按回车键继续..."
    read
  done
}
# Script entry point.
# No arguments: open the interactive menu. Otherwise the first argument
# selects one action; anything unrecognised prints the usage text.
if (( $# > 0 )); then
  case "$1" in
    --init) init_core_dump ;;
    --analyze)
      # With both a core file and an executable given, analyse that pair;
      # otherwise pick the newest core automatically.
      if (( $# < 3 )); then
        auto_analyze_latest_core
      else
        analyze_core_dump "$2" "$3"
      fi
      ;;
    --monitor) monitor_core_dumps ;;
    --cleanup) cleanup_old_cores "${2:-7}" ;;
    --report) generate_core_report ;;
    *)
      printf '%s\n' \
        "用法: $0 [选项]" \
        "选项:" \
        " --init 初始化核心转储配置" \
        " --analyze [core] [exe] 分析核心转储文件" \
        " --monitor 监控核心转储生成" \
        " --cleanup [days] 清理旧的核心转储" \
        " --report 生成核心转储报告" \
        " 无参数 进入交互菜单"
      ;;
  esac
else
  main_menu
fi
systemd服务深度诊断与修复
systemd服务故障排查框架
#!/bin/bash
# systemd_diagnostic.sh - in-depth systemd service diagnostic tool.
set -o nounset
set -o pipefail

# Configuration
DEBUG_MODE="${DEBUG:-false}"
LOG_DIR="/var/log/systemd_diagnostic"
REPORT_DIR="/tmp/systemd_reports"
# NOTE(review): no function in this script reads SERVICE_TIMEOUT or DEBUG_MODE.
SERVICE_TIMEOUT=30

# Create the working directories
mkdir -p "$LOG_DIR" "$REPORT_DIR"

# Colour escape sequences (interpreted by `echo -e` at the call sites)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'
# Print a timestamped log line to stdout (escape sequences interpreted) and
# append the raw line to $LOG_DIR/diagnostic.log.
# Arguments: $1 - level label, $2 - message text
log() {
  local severity=$1
  local text=$2
  local entry
  entry="[$(date '+%Y-%m-%d %H:%M:%S')] [$severity] $text"

  echo -e "$entry"
  echo "$entry" >> "$LOG_DIR/diagnostic.log"
}
# Report whether a systemd service is enabled and active.
# Arguments: $1 - service name
# Returns:   0 active, 1 inactive, 2 not enabled, 3 failed, 4 any other state
check_service_status() {
  local service=$1
  log "INFO" "检查服务状态: $service"

  if ! systemctl is-enabled "$service" &>/dev/null; then
    log "WARN" "服务 $service 未启用"
    return 2
  fi

  # is-active exits non-zero for anything but "active"; only the printed
  # state matters here.
  local status
  status=$(systemctl is-active "$service" 2>/dev/null)
  case "$status" in
    active)
      log "INFO" "服务 $service 运行正常"
      return 0
      ;;
    inactive)
      log "WARN" "服务 $service 未运行"
      return 1
      ;;
    failed)
      log "ERROR" "服务 $service 运行失败"
      return 3
      ;;
    *)
      log "WARN" "服务 $service 状态未知: $status"
      return 4
      ;;
  esac
}
# Collect a detailed report for one service (unit state, unit file,
# dependencies, recent journal, main process, sockets, file descriptors,
# memory and disk) into a file under $REPORT_DIR.
# Arguments: $1 - service name
# Outputs:   log lines, then the report path as the last line of stdout
deep_analyze_service() {
  local service=$1
  local report_file="$REPORT_DIR/${service}_analysis_$(date +%Y%m%d_%H%M%S).txt"
  log "INFO" "深度分析服务: $service"

  # Anything that is not a number is treated as "no main process".
  local main_pid fd_count
  main_pid=$(systemctl show "$service" --property=MainPID --value 2>/dev/null)
  [[ "$main_pid" =~ ^[0-9]+$ ]] || main_pid=0

  {
    echo "=== systemd服务深度分析报告 ==="
    echo "服务: $service"
    echo "分析时间: $(date)"
    echo "主机: $(hostname)"
    echo ""
    echo "=== 1. 服务基本信息 ==="
    systemctl show "$service" --no-pager \
      | grep -E "(LoadState|ActiveState|SubState|MainPID|ControlPID|Result)"
    echo ""
    echo "=== 2. 服务单元文件 ==="
    systemctl cat "$service" 2>/dev/null || echo "无法读取单元文件"
    echo ""
    echo "=== 3. 服务依赖关系 ==="
    systemctl list-dependencies "$service" --plain --no-pager
    echo ""
    echo "=== 4. 服务日志分析 ==="
    journalctl -u "$service" --since "1 hour ago" --no-pager | tail -50
    echo ""
    echo "=== 5. 进程信息 ==="
    if (( main_pid != 0 )); then
      echo "主进程PID: $main_pid"
      echo "进程状态:"
      ps -fp "$main_pid" --no-headers
      echo ""
      echo "进程树:"
      pstree -p "$main_pid"
      echo ""
      echo "进程资源使用:"
      ps -p "$main_pid" \
        -o pid,ppid,pgid,sid,tty,stat,start_time,%cpu,%mem,rss,vsz,cmd --no-headers
    else
      echo "服务无活动进程"
    fi
    echo ""
    echo "=== 6. 网络连接 ==="
    if (( main_pid != 0 )); then
      # The trailing comma stops pid=12 from also matching pid=123.
      ss -tulpn | grep "pid=${main_pid}," || echo "无网络连接"
    fi
    echo ""
    echo "=== 7. 文件描述符 ==="
    if (( main_pid != 0 )); then
      # Count directory entries directly; `ls -l | wc -l` also counted the
      # "total" summary line.
      fd_count=$(ls -1A "/proc/$main_pid/fd" 2>/dev/null | awk 'END { print NR }')
      printf '文件描述符数量:  %s\n' "$fd_count"
      lsof -p "$main_pid" 2>/dev/null | head -20
    fi
    echo ""
    echo "=== 8. 系统资源检查 ==="
    echo "内存使用:"
    free -h
    echo ""
    echo "磁盘使用:"
    df -h
  } > "$report_file"

  log "INFO" "分析报告已生成: $report_file"
  echo "$report_file"
}
# Walk through common reasons a service fails to start: unit file syntax,
# inactive required units, environment, working directory, start command.
# Arguments: $1 - service name, with or without a unit suffix
diagnose_start_failure() {
  local service=$1
  log "ERROR" "诊断服务启动失败: $service"
  echo -e "${RED}尝试诊断服务启动失败原因...${NC}"

  # Unit file syntax. Append ".service" only when the name carries no unit
  # suffix yet, so "foo.service" is not turned into "foo.service.service".
  echo -e "\n${YELLOW}检查单元文件语法:${NC}"
  local unit=$service
  local unit_suffix_re='\.(service|socket|target|timer|mount|automount|path|slice|scope|swap|device)$'
  if [[ ! "$unit" =~ $unit_suffix_re ]]; then
    unit="$service.service"
  fi
  systemd-analyze verify "$unit" 2>&1

  # Required units that are not running
  echo -e "\n${YELLOW}检查依赖服务:${NC}"
  local requires req
  requires=$(systemctl show "$service" --property=Requires --value)
  for req in $requires; do
    if ! systemctl is-active "$req" &>/dev/null; then
      echo -e "${RED}依赖服务未运行: $req${NC}"
    fi
  done

  # Environment variables
  echo -e "\n${YELLOW}检查环境变量:${NC}"
  systemctl show "$service" --property=Environment --value

  # WorkingDirectory
  echo -e "\n${YELLOW}检查工作目录:${NC}"
  local work_dir
  work_dir=$(systemctl show "$service" --property=WorkingDirectory --value)
  if [[ -n "$work_dir" ]]; then
    if [[ -d "$work_dir" ]]; then
      echo "工作目录存在: $work_dir"
      ls -la "$work_dir"
    else
      echo -e "${RED}工作目录不存在: $work_dir${NC}"
    fi
  fi

  # Start command (shown only, never executed)
  echo -e "\n${YELLOW}尝试前台运行:${NC}"
  local exec_start
  exec_start=$(systemctl show "$service" --property=ExecStart --value)
  if [[ -n "$exec_start" ]]; then
    echo "执行命令: $exec_start"
    # The command could actually be run here as a test.
  fi
}
# Apply a fixed sequence of low-risk repairs to a service and restart it.
# Arguments: $1 - service name
# Returns:   0 if the service is active 5 seconds after the restart, else 1
auto_fix_service() {
  local service=$1
  log "INFO" "尝试自动修复服务: $service"

  # 1. Reload the systemd configuration
  echo "重新加载systemd配置..."
  systemctl daemon-reload

  # 2. Clear the failed state
  echo "重置服务状态..."
  systemctl reset-failed "$service" 2>/dev/null

  # 3. Normalise unit file permissions
  local unit_file
  unit_file=$(systemctl show "$service" --property=FragmentPath --value)
  if [[ -f "$unit_file" ]]; then
    echo "检查单元文件权限..."
    chmod 644 "$unit_file"
  fi

  # 4. Log directory mentioned in the journal.
  # The path is scraped from free-form log text, so only create it when it is
  # missing. Permissions of an existing directory (such as /var/log itself, or
  # one deliberately restricted) are left untouched.
  local log_path log_dir
  log_path=$(journalctl -u "$service" --no-pager 2>/dev/null \
    | grep -o "/var/log/[^ ]*" | head -1)
  if [[ -n "$log_path" ]]; then
    log_dir=$(dirname "$log_path")
    echo "检查日志目录权限: $log_dir"
    if [[ ! -d "$log_dir" ]]; then
      mkdir -p "$log_dir" && chmod 755 "$log_dir"
    fi
  fi

  # 5. Restart the service
  echo "尝试重启服务..."
  systemctl restart "$service" --no-block

  # Give it a moment, then check the result
  sleep 5
  if systemctl is-active --quiet "$service"; then
    log "INFO" "服务修复成功: $service"
    echo -e "${GREEN}✓ 服务修复成功${NC}"
    return 0
  fi
  log "ERROR" "服务修复失败: $service"
  echo -e "${RED}✗ 服务修复失败${NC}"
  return 1
}
# Redraw a health panel for one service every N seconds until interrupted:
# state, main process resource usage, file descriptor and connection counts.
# Any state other than "active" triggers a restart.
# Arguments: $1 - service name, $2 - interval in seconds (default 10)
# NOTE(review): transitional states such as "activating" are restarted too;
# confirm that is intended when a service starts slower than the interval.
monitor_service_health() {
  local service=$1
  local check_interval=${2:-10}
  log "INFO" "开始监控服务健康状况: $service"
  echo -e "${BLUE}监控服务: $service (间隔: ${check_interval}s)${NC}"
  echo "按 Ctrl+C 停止监控"

  local status main_pid rss fd_count conn_count
  while true; do
    clear
    echo -e "${CYAN}=== 服务健康监控 ===${NC}"
    echo "服务: $service"
    echo "时间: $(date)"
    echo ""

    # Current state and main PID (a non-numeric value counts as 0)
    status=$(systemctl is-active "$service" 2>/dev/null)
    main_pid=$(systemctl show "$service" --property=MainPID --value 2>/dev/null)
    [[ "$main_pid" =~ ^[0-9]+$ ]] || main_pid=0
    echo -e "状态: ${status^^}"

    if [[ "$status" == "active" ]] && (( main_pid != 0 )); then
      # Process resource usage
      echo -e "\n${YELLOW}进程资源使用:${NC}"
      ps -p "$main_pid" -o pid,%cpu,%mem,rss,vsz,etime,cmd --no-headers

      # Resident memory (ps pads the value)
      rss=$(ps -p "$main_pid" -o rss --no-headers)
      rss=${rss//[[:space:]]/}
      echo "RSS内存: ${rss}KB"

      # Open file descriptors
      fd_count=0
      if [[ -d "/proc/$main_pid/fd" ]]; then
        fd_count=$(ls -1 "/proc/$main_pid/fd" 2>/dev/null | awk 'END { print NR }')
      fi
      echo "文件描述符: $fd_count"

      # Network connections; the trailing comma stops pid=12 matching pid=123.
      conn_count=$(ss -tunp 2>/dev/null | grep -c "pid=${main_pid},")
      echo "网络连接: $conn_count"

      # Warning thresholds
      if [[ "$fd_count" -gt 1000 ]]; then
        echo -e "${RED}警告: 文件描述符数量过高${NC}"
      fi
      if [[ "$rss" -gt 1000000 ]]; then # RSS in KB, i.e. roughly 1 GB
        echo -e "${RED}警告: 内存使用过高${NC}"
      fi
    else
      echo -e "\n${RED}服务未运行或异常${NC}"
      echo "尝试恢复..."
      systemctl restart "$service"
    fi

    sleep "$check_interval"
  done
}
# Run check_service_status on every service unit file systemctl lists, print
# a tick or cross for each, and write `systemctl status` output for all
# non-passing units to a report under $REPORT_DIR.
check_all_services() {
  log "INFO" "批量检查所有服务"

  local unit_names
  unit_names=$(systemctl list-unit-files --type=service --no-legend --no-pager \
    | awk '{print $1}')
  local -a failed_services=()

  echo -e "${BLUE}=== 批量服务检查 ===${NC}"
  echo "检查服务数量: $(wc -l <<< "$unit_names")"
  echo ""

  # Word splitting is intended: one unit name per line.
  local unit
  for unit in $unit_names; do
    printf '%s' "检查 $unit ... "
    if ! check_service_status "$unit"; then
      echo -e "${RED}✗${NC}"
      failed_services+=("$unit")
      continue
    fi
    echo -e "${GREEN}✓${NC}"
  done

  if (( ${#failed_services[@]} == 0 )); then
    echo -e "\n${GREEN}所有服务运行正常 ✓${NC}"
    return
  fi

  echo -e "\n${RED}发现异常服务 (${#failed_services[@]}个):${NC}"
  printf ' - %s\n' "${failed_services[@]}"

  # Detailed report for the non-passing units
  local report_file="$REPORT_DIR/failed_services_$(date +%Y%m%d_%H%M%S).txt"
  {
    echo "异常服务报告"
    echo "生成时间: $(date)"
    echo ""
    for unit in "${failed_services[@]}"; do
      echo "=== $unit ==="
      systemctl status "$unit" --no-pager
      echo ""
    done
  } > "$report_file"
  echo -e "\n${YELLOW}详细报告: $report_file${NC}"
}
# Interactive menu for the systemd diagnostic tool. Each pass shows the
# options, prompts for any required input, runs the action and waits for
# Enter. Option 8 exits the script; end-of-file on stdin returns from the
# function.
main_menu() {
  local choice service interval
  while true; do
    clear
    echo -e "${BLUE}=== systemd服务诊断工具 ===${NC}"
    echo "1. 检查单个服务状态"
    echo "2. 深度分析服务"
    echo "3. 诊断服务启动失败"
    echo "4. 自动修复服务"
    echo "5. 监控服务健康状况"
    echo "6. 批量检查所有服务"
    echo "7. 查看诊断日志"
    echo "8. 退出"

    # Every read bails out on EOF. This script does not use `set -e`, so a
    # closed stdin would otherwise make the loop run forever.
    read -r -p "请选择操作 [1-8]: " choice || { echo; return 0; }
    case $choice in
      1)
        read -r -p "请输入服务名: " service || return 0
        check_service_status "$service"
        ;;
      2)
        read -r -p "请输入服务名: " service || return 0
        deep_analyze_service "$service"
        ;;
      3)
        read -r -p "请输入服务名: " service || return 0
        diagnose_start_failure "$service"
        ;;
      4)
        read -r -p "请输入服务名: " service || return 0
        auto_fix_service "$service"
        ;;
      5)
        read -r -p "请输入服务名: " service || return 0
        read -r -p "监控间隔(秒) [10]: " interval || return 0
        monitor_service_health "$service" "${interval:-10}"
        ;;
      6)
        check_all_services
        ;;
      7)
        view_diagnostic_logs
        ;;
      8)
        exit 0
        ;;
      *)
        echo "无效选择"
        ;;
    esac

    echo -e "\n按回车键继续..."
    read -r || return 0
  done
}
# Page through the last 100 lines of $LOG_DIR/diagnostic.log with less, or
# report that the log does not exist.
view_diagnostic_logs() {
  local log_file="$LOG_DIR/diagnostic.log"
  if [[ ! -f "$log_file" ]]; then
    echo "诊断日志不存在"
    return
  fi
  tail -100 "$log_file" | less
}
# Script entry point.
# No arguments: open the interactive menu. Otherwise the first argument
# selects one action. Actions that need a service name print their own usage
# line when it is missing; anything unrecognised prints the full usage text.
if (( $# > 0 )); then
  # Command-line mode
  case "$1" in
    --check)
      if (( $# < 2 )); then
        echo "用法: $0 --check <服务名>"
      else
        check_service_status "$2"
      fi
      ;;
    --analyze)
      if (( $# < 2 )); then
        echo "用法: $0 --analyze <服务名>"
      else
        deep_analyze_service "$2"
      fi
      ;;
    --diagnose)
      if (( $# < 2 )); then
        echo "用法: $0 --diagnose <服务名>"
      else
        diagnose_start_failure "$2"
      fi
      ;;
    --fix)
      if (( $# < 2 )); then
        echo "用法: $0 --fix <服务名>"
      else
        auto_fix_service "$2"
      fi
      ;;
    --monitor)
      if (( $# < 2 )); then
        echo "用法: $0 --monitor <服务名> [间隔秒数]"
      else
        monitor_service_health "$2" "${3:-10}"
      fi
      ;;
    --batch-check)
      check_all_services
      ;;
    *)
      printf '%s\n' \
        "用法: $0 [选项]" \
        "选项:" \
        " --check <服务名> 检查服务状态" \
        " --analyze <服务名> 深度分析服务" \
        " --diagnose <服务名> 诊断启动失败" \
        " --fix <服务名> 自动修复服务" \
        " --monitor <服务名> [秒] 监控服务健康" \
        " --batch-check 批量检查所有服务" \
        " 无参数 进入交互菜单"
      ;;
  esac
else
  main_menu
fi
总结
本文提供了完整的Linux服务与进程异常诊断解决方案:
- 进程状态监控:实时监控进程状态,识别僵尸进程、D状态进程等异常
- 核心转储分析:配置和分析核心转储文件,定位程序崩溃原因
- systemd服务诊断:深度分析systemd服务状态,自动修复常见问题
- 资源泄漏排查:监控内存、文件描述符等资源使用,预防泄漏
- 自动化修复:提供多种自动修复方案,快速恢复服务
这些工具和脚本涵盖了从监控、诊断到修复的完整流程,可作为生产环境排障的起点;其中包含终止进程、修改文件权限和自动重启服务等操作,建议先在测试环境中验证并按实际情况调整后再用于生产。通过系统化的故障排查方法,可以显著提高Linux系统的稳定性和可维护性。
© 版权声明
THE END













暂无评论内容