whiletrue; do # 检查CPU使用率 cpu_alerts=$(check_cpu_usage) if [ -n "$cpu_alerts" ]; then echo"$cpu_alerts" | whileread alert; do log_message "$alert" done fi
# 检查内存使用率 mem_alerts=$(check_mem_usage) if [ -n "$mem_alerts" ]; then echo"$mem_alerts" | whileread alert; do log_message "$alert" done fi
# Monitor a process's file-descriptor usage.
#!/bin/bash
# fd_monitor.sh
#
# Usage: fd_monitor.sh <PID>
#
# Prints a CSV stream to stdout: the header "Time,FD Count", then one row
# every 5 seconds holding the current time and the number of entries found
# under /proc/<PID>/fd. When the count exceeds 1000 a warning is printed
# together with the first 20 lines of `lsof -p <PID>`. The loop ends as soon
# as `ps -p <PID>` no longer reports the process.

PID=${1:-}

# Reject a missing or non-numeric PID before it is used to build /proc paths.
case "$PID" in
  '' | *[!0-9]*)
    echo "Usage: $0 <PID>" >&2
    exit 1
    ;;
esac

echo "Monitoring file descriptors for PID $PID"
echo "Time,FD Count"

while true; do
  if ps -p "$PID" > /dev/null 2>&1; then
    # Count the entries with a glob instead of parsing `ls` output. Entries
    # are symlinks whose targets may not exist as paths (sockets, pipes), so
    # accept either an existing path or a symlink. An unreadable or missing
    # directory leaves the pattern unexpanded and yields a count of 0.
    fd_count=0
    for fd_entry in "/proc/$PID/fd"/*; do
      if [ -e "$fd_entry" ] || [ -L "$fd_entry" ]; then
        fd_count=$((fd_count + 1))
      fi
    done
    echo "$(date '+%H:%M:%S'),$fd_count"

    if [ "$fd_count" -gt 1000 ]; then
      echo "WARNING: High FD count detected!"
      lsof -p "$PID" | head -20
    fi
  else
    echo "Process $PID not found"
    break
  fi
  sleep 5
done
for process in"${CRITICAL_PROCESSES[@]}"; do pid=$(pgrep $process)
if [ -z "$pid" ]; then echo"CRITICAL: Process $process is not running!" # 发送告警 # send_alert "Process $process is down" else echo"OK: Process $process is running (PID: $pid)"
# Send an alert e-mail through the `mail` command.
# Globals:   ALERT_EMAIL (read) - recipient address
# Arguments: $1 - subject line
#            $2 - message body
# Returns:   1 if ALERT_EMAIL is empty or unset, otherwise the exit status of
#            the `printf | mail` pipeline.
send_email_alert() {
  local subject="$1"
  local message="$2"

  if [[ -z "${ALERT_EMAIL:-}" ]]; then
    printf 'send_email_alert: ALERT_EMAIL is not set\n' >&2
    return 1
  fi

  # The subject and the recipient must be separate words; otherwise mail
  # receives the address as part of the subject and has no recipient.
  printf '%s\n' "$message" | mail -s "$subject" "$ALERT_EMAIL"
}
# Post an alert message to a webhook as a JSON body {"text": "<message>"}.
# Globals:   ALERT_WEBHOOK (read) - webhook URL
# Arguments: $1 - message text
# Returns:   1 if ALERT_WEBHOOK is empty or unset, otherwise curl's exit status.
send_slack_alert() {
  local message="$1"
  local escaped

  if [[ -z "${ALERT_WEBHOOK:-}" ]]; then
    printf 'send_slack_alert: ALERT_WEBHOOK is not set\n' >&2
    return 1
  fi

  # Escape the characters that would otherwise break the JSON string literal.
  # Backslashes go first so the escapes added afterwards are not doubled.
  escaped=${message//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\n'/\\n}
  escaped=${escaped//$'\r'/\\r}
  escaped=${escaped//$'\t'/\\t}

  curl -X POST -H 'Content-type: application/json' \
    --data "{\"text\":\"$escaped\"}" \
    "$ALERT_WEBHOOK"
}
# Check whether a process is running and raise alerts when it is not.
# Arguments: $1 - process name handed to pgrep
# Returns:   0 if pgrep reports at least one PID; 1 otherwise, after calling
#            send_email_alert and send_slack_alert with a CRITICAL message.
check_process_status() {
  local process_name="$1"
  local pid alert_msg

  pid=$(pgrep "$process_name")

  if [[ -z "$pid" ]]; then
    alert_msg="CRITICAL: Process $process_name is not running!"
    # Subject and body are two separate arguments.
    send_email_alert "Process Down Alert" "$alert_msg"
    send_slack_alert "$alert_msg"
    return 1
  fi

  return 0
}
# Check a process's CPU and memory usage and alert when either exceeds 90%.
# Arguments: $1 - PID to inspect
#            $2 - process name used in the alert text
# Outputs:   nothing directly; alerts go through send_email_alert and
#            send_slack_alert.
# NOTE(review): the original body compared cpu_usage/mem_usage without ever
# assigning them (and never used $1). They are read here from
# `ps -o %cpu=` / `ps -o %mem=` - confirm this matches the intended source.
check_resource_usage() {
  local pid="$1"
  local process_name="$2"
  local cpu_usage mem_usage alert_msg

  cpu_usage=$(ps -p "$pid" -o %cpu= 2>/dev/null)
  mem_usage=$(ps -p "$pid" -o %mem= 2>/dev/null)
  # ps pads its columns; drop the whitespace so messages read "95.5%".
  cpu_usage=${cpu_usage//[[:space:]]/}
  mem_usage=${mem_usage//[[:space:]]/}

  # awk does the floating-point comparison; an empty value compares as 0.
  if awk -v v="$cpu_usage" 'BEGIN { exit !(v + 0 > 90) }'; then
    alert_msg="WARNING: Process $process_name CPU usage is ${cpu_usage}%"
    send_email_alert "High CPU Usage" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi

  if awk -v v="$mem_usage" 'BEGIN { exit !(v + 0 > 90) }'; then
    alert_msg="WARNING: Process $process_name memory usage is ${mem_usage}%"
    send_email_alert "High Memory Usage" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi
}
# Main check logic: for every monitored process, verify it is running and,
# if so, check the resource usage of each of its PIDs.
# Calls check_process_status (which alerts on a missing process) and
# check_resource_usage.
main() {
  # Each name must be its own array element.
  local processes=("nginx" "mysql" "redis" "java")
  local process pid

  for process in "${processes[@]}"; do
    if check_process_status "$process"; then
      # pgrep prints one PID per line and may report several instances;
      # the unquoted expansion deliberately splits them so each PID is
      # checked on its own.
      for pid in $(pgrep "$process"); do
        check_resource_usage "$pid" "$process"
      done
    fi
  done
}
# Abort with a usage hint when no process name is available.
if [ -z "$PROCESS_NAME" ]; then
  echo "Usage: $0 <process_name> [max_restarts] [restart_interval]" >&2
  exit 1
fi

# Restart bookkeeping read by the monitoring loop that follows:
# restart_count is compared against MAX_RESTARTS, and last_restart is
# subtracted from the current epoch time and compared against RESTART_INTERVAL.
restart_count=0
last_restart=0
whiletrue; do pid=$(pgrep $PROCESS_NAME)
if [ -z "$pid" ]; then current_time=$(date +%s)
if [ $((current_time - last_restart)) -gt $RESTART_INTERVAL ]; then if [ $restart_count -lt $MAX_RESTARTS ]; then echo"$(date): Process $PROCESS_NAME not running, attempting restart..."
if pgrep $PROCESS_NAME > /dev/null; then echo"$(date): Process $PROCESS_NAME restarted successfully" restart_count=0 # 重置计数器 else echo"$(date): Failed to restart process $PROCESS_NAME" fi else echo"$(date): Maximum restart attempts reached for $PROCESS_NAME" # 发送告警 send_alert "Process $PROCESS_NAME failed to restart after $MAX_RESTARTS attempts" fi fi else restart_count=0 # 进程正常运行,重置计数器 fi
# Run a series of health checks against one process instance and report each
# finding through log_health_status.
# Arguments: $1 - PID to inspect
#            $2 - process name used in the log text
# Returns:   1 if the PID is not reported by ps or its state is zombie (Z);
#            0 otherwise. Warnings alone do not change the status, and the
#            final "OK" line is logged even when warnings were raised.
# NOTE(review): the original body compared cpu_usage/mem_usage without ever
# assigning them. They are read here from `ps -o %cpu=` / `ps -o %mem=` -
# confirm this matches the intended source.
check_process_health() {
  local pid="$1"
  local process_name="$2"
  local status cpu_usage mem_usage fd_entry
  local fd_count=0

  # Check that the process exists.
  if ! ps -p "$pid" > /dev/null 2>&1; then
    log_health_status "CRITICAL: Process $process_name (PID: $pid) is not running"
    return 1
  fi

  # Check the process state; only the first character of STAT is compared.
  status=$(ps -p "$pid" -o stat=)
  status=${status:0:1}
  if [[ "$status" == "Z" ]]; then
    log_health_status "CRITICAL: Process $process_name (PID: $pid) is zombie"
    return 1
  fi

  # Check CPU and memory usage. ps pads its columns, so strip whitespace;
  # awk does the floating-point comparison and treats an empty value as 0.
  cpu_usage=$(ps -p "$pid" -o %cpu= 2>/dev/null)
  mem_usage=$(ps -p "$pid" -o %mem= 2>/dev/null)
  cpu_usage=${cpu_usage//[[:space:]]/}
  mem_usage=${mem_usage//[[:space:]]/}

  if awk -v v="$cpu_usage" 'BEGIN { exit !(v + 0 > 95) }'; then
    log_health_status "WARNING: Process $process_name CPU usage is ${cpu_usage}%"
  fi

  if awk -v v="$mem_usage" 'BEGIN { exit !(v + 0 > 95) }'; then
    log_health_status "WARNING: Process $process_name memory usage is ${mem_usage}%"
  fi

  # Check file descriptors. Count with a glob instead of parsing `ls`;
  # entries are symlinks whose targets may not exist as paths, so accept
  # either. An unreadable or missing directory yields 0.
  for fd_entry in "/proc/$pid/fd"/*; do
    if [[ -e "$fd_entry" || -L "$fd_entry" ]]; then
      fd_count=$((fd_count + 1))
    fi
  done
  if [[ "$fd_count" -gt 10000 ]]; then
    log_health_status "WARNING: Process $process_name has $fd_count file descriptors"
  fi

  # Check that the process can be signalled. `kill -0` sends no signal; it
  # only reports whether one could be delivered.
  if ! kill -0 "$pid" 2>/dev/null; then
    log_health_status "WARNING: Process $process_name is not responding to signals"
  fi

  log_health_status "OK: Process $process_name is healthy (CPU: ${cpu_usage}%, MEM: ${mem_usage}%, FD: $fd_count)"
  return 0
}
main() { local processes=("nginx""mysql""redis""java")
whiletrue; do for process in"${processes[@]}"; do local pids=$(pgrep $process)
if [ -n "$pids" ]; then for pid in$pids; do check_process_health "$pid""$process" done else log_health_status "CRITICAL: No instances of process $process found" fi done
# Connection statistics: print selected MySQL status counters.
# Client errors are discarded, so an unreachable server only leaves the
# section empty.
printf '\nConnection Statistics:\n'
mysql -e "SHOW STATUS LIKE 'Connections';" 2>/dev/null
mysql -e "SHOW STATUS LIKE 'Threads_connected';" 2>/dev/null
mysql -e "SHOW STATUS LIKE 'Threads_running';" 2>/dev/null

# Memory usage: InnoDB buffer pool page counters.
printf '\nMemory Usage:\n'
mysql -e "SHOW STATUS LIKE 'Innodb_buffer_pool_pages_data';" 2>/dev/null
mysql -e "SHOW STATUS LIKE 'Innodb_buffer_pool_pages_total';" 2>/dev/null
# Redis info: first 10 lines of the server, memory and stats sections.
# Client errors are discarded, so an unreachable server only leaves the
# section empty.
printf '\nRedis Info:\n'
redis-cli info server 2>/dev/null | head -10
redis-cli info memory 2>/dev/null | head -10
redis-cli info stats 2>/dev/null | head -10

# Connection statistics: the full clients section.
printf '\nConnection Statistics:\n'
redis-cli info clients 2>/dev/null