#!/bin/bash
# fd_monitor.sh - Monitor the file descriptor usage of a single process.
#
# Usage: fd_monitor.sh <PID>
#
# Every 5 seconds prints one CSV row "HH:MM:SS,<fd count>" for the given PID.
# When the count exceeds 1000, a warning and the first 20 lines of
# `lsof -p <PID>` are printed. The loop ends once the process is gone.

PID=$1

if [ -z "$PID" ]; then
  echo "Usage: $0 <PID>" >&2
  exit 1
fi

echo "Monitoring file descriptors for PID $PID"
echo "Time,FD Count"

while true; do
  if ps -p "$PID" > /dev/null 2>&1; then
    # Each entry under /proc/<PID>/fd is one open descriptor. Errors are
    # discarded so a process that exits mid-check simply reports 0.
    fd_count=$(ls -- "/proc/$PID/fd" 2>/dev/null | wc -l)
    echo "$(date '+%H:%M:%S'),$fd_count"

    if [ "$fd_count" -gt 1000 ]; then
      echo "WARNING: High FD count detected!"
      lsof -p "$PID" | head -20
    fi
  else
    echo "Process $PID not found"
    break
  fi
  sleep 5
done
# Check every process named in CRITICAL_PROCESSES (an array defined outside
# this section): report whether it is running and warn when any instance uses
# more than 80% CPU or memory.
for process in "${CRITICAL_PROCESSES[@]}"; do
  pids=$(pgrep -- "$process")

  if [ -z "$pids" ]; then
    echo "CRITICAL: Process $process is not running!"
    # Send an alert
    # send_alert "Process $process is down"
  else
    # pgrep may report several PIDs; inspect each instance separately so that
    # ps receives exactly one PID and yields exactly one value.
    # Word-splitting of $pids is intentional here.
    for pid in $pids; do
      echo "OK: Process $process is running (PID: $pid)"

      # Check resource usage
      cpu_usage=$(ps -p "$pid" -o %cpu= | tr -d ' ')
      mem_usage=$(ps -p "$pid" -o %mem= | tr -d ' ')

      # awk performs the floating-point comparison; an empty value (process
      # exited between pgrep and ps) is treated as 0 and never warns.
      if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 80) }'; then
        echo "WARNING: Process $process CPU usage is ${cpu_usage}%"
      fi

      if awk -v usage="$mem_usage" 'BEGIN { exit !(usage + 0 > 80) }'; then
        echo "WARNING: Process $process memory usage is ${mem_usage}%"
      fi
    done
  fi
done
#######################################
# Send an alert e-mail through the `mail` command.
# Globals:   ALERT_EMAIL (read) - recipient address
# Arguments: $1 - subject line
#            $2 - message body
# Returns:   1 when ALERT_EMAIL is empty; otherwise the exit status of `mail`.
#######################################
send_email_alert() {
  local subject="$1"
  local message="$2"

  if [[ -z "${ALERT_EMAIL:-}" ]]; then
    echo "send_email_alert: ALERT_EMAIL is not set" >&2
    return 1
  fi

  # Subject and recipient must be separate arguments; the body goes on stdin.
  printf '%s\n' "$message" | mail -s "$subject" "$ALERT_EMAIL"
}
send_slack_alert() { local message="$1" curl -X POST -H 'Content-type: application/json' \ --data "{\"text\":\"$message\"}" \ $ALERT_WEBHOOK }
#######################################
# Check process status: verify that a process matching the given name is
# running, and raise e-mail and Slack alerts when it is not.
# Arguments: $1 - process name (passed to pgrep as the match pattern)
# Returns:   0 when pgrep reports at least one PID, 1 otherwise.
#######################################
check_process_status() {
  local process_name="$1"
  local pid alert_msg

  pid=$(pgrep -- "$process_name")

  if [[ -z "$pid" ]]; then
    alert_msg="CRITICAL: Process $process_name is not running!"
    # Subject and body are two separate arguments.
    send_email_alert "Process Down Alert" "$alert_msg"
    send_slack_alert "$alert_msg"
    return 1
  fi

  return 0
}
#######################################
# Check resource usage: alert by e-mail and Slack when a process uses more
# than 90% CPU or more than 90% memory, as reported by ps.
# Arguments: $1 - PID to inspect
#            $2 - process name (used only in the alert text)
#######################################
check_resource_usage() {
  local pid="$1"
  local process_name="$2"
  local cpu_usage mem_usage alert_msg

  cpu_usage=$(ps -p "$pid" -o %cpu= | tr -d ' ')
  mem_usage=$(ps -p "$pid" -o %mem= | tr -d ' ')

  # awk does the floating-point comparison. "+ 0" forces a numeric compare,
  # so an empty reading (process already gone) counts as 0 and never alerts.
  if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 90) }'; then
    alert_msg="WARNING: Process $process_name CPU usage is ${cpu_usage}%"
    send_email_alert "High CPU Usage" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi

  if awk -v usage="$mem_usage" 'BEGIN { exit !(usage + 0 > 90) }'; then
    alert_msg="WARNING: Process $process_name memory usage is ${mem_usage}%"
    send_email_alert "High Memory Usage" "$alert_msg"
    send_slack_alert "$alert_msg"
  fi
}
#######################################
# Main check logic: for each monitored process, verify that it is running
# and, when it is, check the resource usage of every matching PID.
#######################################
main() {
  # Elements must be whitespace-separated; adjacent quoted strings would be
  # concatenated into a single array element.
  local processes=("nginx" "mysql" "redis" "java")
  local process pid
  local -a pids

  for process in "${processes[@]}"; do
    if check_process_status "$process"; then
      # pgrep can report several PIDs (one per line); handle each one alone.
      mapfile -t pids < <(pgrep -- "$process")
      for pid in "${pids[@]}"; do
        check_resource_usage "$pid" "$process"
      done
    fi
  done
}
# Automatic restart watchdog.
#
# Reads PROCESS_NAME, MAX_RESTARTS and RESTART_INTERVAL, which are set before
# this point (outside this section). Every 60 seconds it checks whether the
# process is running. If not, and more than RESTART_INTERVAL seconds have
# passed since the last attempt, it tries to start it, up to MAX_RESTARTS
# consecutive failed attempts.
if [ -z "$PROCESS_NAME" ]; then
  echo "Usage: $0 <process_name> [max_restarts] [restart_interval]" >&2
  exit 1
fi

restart_count=0
last_restart=0

while true; do
  pid=$(pgrep -- "$PROCESS_NAME")

  if [ -z "$pid" ]; then
    current_time=$(date +%s)

    if [ $((current_time - last_restart)) -gt "$RESTART_INTERVAL" ]; then
      if [ "$restart_count" -lt "$MAX_RESTARTS" ]; then
        echo "$(date): Process $PROCESS_NAME not running, attempting restart..."

        # Run the restart command that matches the process type.
        case "$PROCESS_NAME" in
          "nginx")
            systemctl start nginx
            ;;
          "mysql")
            systemctl start mysql
            ;;
          "redis")
            systemctl start redis
            ;;
          *)
            echo "Unknown process type: $PROCESS_NAME"
            ;;
        esac

        restart_count=$((restart_count + 1))
        last_restart=$current_time

        # Give the service time to come up before verifying it.
        sleep 10

        if pgrep -- "$PROCESS_NAME" > /dev/null; then
          echo "$(date): Process $PROCESS_NAME restarted successfully"
          restart_count=0 # reset the counter
        else
          echo "$(date): Failed to restart process $PROCESS_NAME"
        fi
      else
        echo "$(date): Maximum restart attempts reached for $PROCESS_NAME"
        # Send an alert. send_alert is not defined in this section.
        send_alert "Process $PROCESS_NAME failed to restart after $MAX_RESTARTS attempts"
      fi
    fi
  else
    restart_count=0 # process is running normally; reset the counter
  fi

  sleep 60
done
#######################################
# Run a series of health checks against one process and log the findings
# through log_health_status (defined outside this section).
# Arguments: $1 - PID to inspect
#            $2 - process name (used only in log messages)
# Returns:   1 when the process is not running or is a zombie, 0 otherwise
#            (warnings do not change the return value).
#######################################
check_process_health() {
  local pid="$1"
  local process_name="$2"
  local status cpu_usage mem_usage fd_count

  # Check that the process exists.
  if ! ps -p "$pid" > /dev/null 2>&1; then
    log_health_status "CRITICAL: Process $process_name (PID: $pid) is not running"
    return 1
  fi

  # Check the process state: the first character of ps's STAT column.
  status=$(ps -p "$pid" -o stat= | cut -c1)
  if [[ "$status" == "Z" ]]; then
    log_health_status "CRITICAL: Process $process_name (PID: $pid) is zombie"
    return 1
  fi

  # Check resource usage. awk does the floating-point comparison; "+ 0"
  # makes an empty reading count as 0 instead of causing an error.
  cpu_usage=$(ps -p "$pid" -o %cpu= | tr -d ' ')
  mem_usage=$(ps -p "$pid" -o %mem= | tr -d ' ')

  if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 95) }'; then
    log_health_status "WARNING: Process $process_name CPU usage is ${cpu_usage}%"
  fi

  if awk -v usage="$mem_usage" 'BEGIN { exit !(usage + 0 > 95) }'; then
    log_health_status "WARNING: Process $process_name memory usage is ${mem_usage}%"
  fi

  # Check file descriptors: one entry per open descriptor in /proc/<pid>/fd.
  # An unreadable or missing directory yields a count of 0.
  fd_count=$(ls -- "/proc/$pid/fd" 2>/dev/null | wc -l)
  fd_count=${fd_count//[[:space:]]/}
  if ((fd_count > 10000)); then
    log_health_status "WARNING: Process $process_name has $fd_count file descriptors"
  fi

  # Check process responsiveness.
  # NOTE(review): kill -0 only verifies that a signal could be delivered
  # (the process exists and we have permission); it does not prove the
  # process actually handles signals.
  if ! kill -0 "$pid" 2>/dev/null; then
    log_health_status "WARNING: Process $process_name is not responding to signals"
  fi

  log_health_status "OK: Process $process_name is healthy (CPU: ${cpu_usage}%, MEM: ${mem_usage}%, FD: $fd_count)"
  return 0
}
#######################################
# Health-check loop: repeatedly run check_process_health on every instance
# of each monitored process, sleeping HEALTH_CHECK_INTERVAL seconds between
# passes. Runs until interrupted.
# Globals:   HEALTH_CHECK_INTERVAL (read) - seconds to sleep between passes
# Returns:   1 immediately when HEALTH_CHECK_INTERVAL is empty (an empty
#            interval would make sleep fail and turn this into a busy loop).
#######################################
main() {
  # Elements must be whitespace-separated; adjacent quoted strings would be
  # concatenated into a single array element.
  local processes=("nginx" "mysql" "redis" "java")
  local process pid
  local -a pids

  if [[ -z "${HEALTH_CHECK_INTERVAL:-}" ]]; then
    echo "main: HEALTH_CHECK_INTERVAL is not set" >&2
    return 1
  fi

  while true; do
    for process in "${processes[@]}"; do
      # One PID per line from pgrep; an empty array means no match.
      mapfile -t pids < <(pgrep -- "$process")

      if ((${#pids[@]} > 0)); then
        for pid in "${pids[@]}"; do
          check_process_health "$pid" "$process"
        done
      else
        log_health_status "CRITICAL: No instances of process $process found"
      fi
    done

    sleep "$HEALTH_CHECK_INTERVAL"
  done
}
# Connection statistics
# stderr from the mysql client is discarded, so connection or authentication
# failures produce no output here.
printf '\nConnection Statistics:\n'
mysql -e "SHOW STATUS LIKE 'Connections';" 2>/dev/null
mysql -e "SHOW STATUS LIKE 'Threads_connected';" 2>/dev/null
mysql -e "SHOW STATUS LIKE 'Threads_running';" 2>/dev/null

# Memory usage (InnoDB buffer pool page counters)
printf '\nMemory Usage:\n'
mysql -e "SHOW STATUS LIKE 'Innodb_buffer_pool_pages_data';" 2>/dev/null
mysql -e "SHOW STATUS LIKE 'Innodb_buffer_pool_pages_total';" 2>/dev/null
# Redis information: the first 10 lines of the server, memory and stats
# sections. stderr from redis-cli is discarded, so connection failures
# produce no output here.
printf '\nRedis Info:\n'
redis-cli info server 2>/dev/null | head -10
redis-cli info memory 2>/dev/null | head -10
redis-cli info stats 2>/dev/null | head -10

# Connection statistics
printf '\nConnection Statistics:\n'
redis-cli info clients 2>/dev/null